This commit is contained in:
Grzegorz Michalski
2026-03-02 09:47:35 +01:00
commit 2c225d68ac
715 changed files with 130067 additions and 0 deletions

View File

View File

@@ -0,0 +1,346 @@
from __future__ import annotations
import os
import sys
import logging
import yaml
from datetime import timedelta
from airflow import DAG
from airflow.utils.dates import days_ago
from airflow.utils.trigger_rule import TriggerRule
from airflow.operators.python import PythonOperator
try:
from airflow.exceptions import AirflowFailException
except Exception:
from airflow.exceptions import AirflowException as AirflowFailException
sys.path.append('/opt/airflow/python/mrds_common')
sys.path.append('/opt/airflow/python/devo_replicator/data_replicator')
from mrds.utils import oraconn
from impala_refresher import main as impala_main
from mrds.utils.security_utils import get_verified_run_id, verify_run_id
ENV_CONFIG_PATH = "/opt/airflow/python/devo_replicator/config/env_config.yaml"
# Default task-level arguments applied to every task in this DAG.
# One retry with a short delay; notification emails are disabled.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}
# Core replication DAG: each externally-triggered run replicates one
# OWNER.TABLE from Oracle to the object store, then hands off to a
# store-specific table-generator DAG.
with DAG(
    dag_id='devo_replicator_core',
    default_args=default_args,
    description='Core Devo replicator workflow for single table',
    schedule=None,  # only triggered externally, never scheduled
    catchup=False,
    tags=['DevoReplicator'],
    max_active_runs=10,   # multiple tables may replicate concurrently
    max_active_tasks=16,
) as dag:
    # Init - read config from context
def init_step(**context):
    """Read and validate the run configuration, then publish it via XCom.

    Reads ``MRDS_ENV`` from the environment and ``store`` / ``owner_table``
    from ``dag_run.conf``, validates them, loads the environment YAML config,
    and pushes every derived setting as an individual XCom key.

    Raises:
        ValueError: on a missing/unsupported env, store or owner_table.
    """
    dag_run = context.get("dag_run")
    ti = context["ti"]
    conf = (dag_run.conf or {}) if dag_run else {}
    env = os.getenv("MRDS_ENV")
    if not env:
        raise ValueError("MRDS_ENV environment variable is required")
    env = env.lower()
    store = conf.get("store")
    if not store:
        raise ValueError("store parameter is required")
    store = store.lower()
    owner_table = conf.get("owner_table")
    if not owner_table or '.' not in owner_table:
        raise ValueError("owner_table must be in format 'OWNER.TABLE_NAME'")
    table_owner, table_name = owner_table.split('.', 1)
    if env not in {"dev", "tst", "acc", "prd"}:
        raise ValueError(f"Unsupported env '{env}'. Expected 'dev', 'tst', 'acc' or 'prd'.")
    if store not in {"mopdb", "rar", "rqsd"}:
        raise ValueError(f"Unsupported store '{store}'. Expected 'mopdb', 'rar', 'rqsd'.")
    # The service name is just the upper-cased store name; replaces the
    # previous if/elif chain that spelled out each mapping by hand.
    p_service_name = store.upper()
    with open(ENV_CONFIG_PATH, "r") as f:
        cfg = yaml.safe_load(f)
    env_cfg = cfg[env]
    store_cfg = cfg[store]
    # `store` is already lower-cased above, so no extra .lower() is needed.
    p_objectstore_uri = env_cfg["S3_LOCATION_URI"].replace("{0}", store)
    # Get verified run_id using security utilities
    p_run_id = get_verified_run_id(context)
    logging.info("=== init_step === env=%s store=%s table=%s.%s run_id=%s",
                 env, store, table_owner, table_name, p_run_id)
    xcom = {
        "env": env,
        "store": store,
        "config_path": ENV_CONFIG_PATH,
        "p_run_id": p_run_id,
        "p_service_name": p_service_name,
        "p_table_owner": table_owner,
        "p_table_name": table_name,
        "p_objectstore_uri": p_objectstore_uri,
        "corporate_store": store_cfg["corporate_store"],
        "owner_table": owner_table,
    }
    for k, v in xcom.items():
        ti.xcom_push(key=k, value=v)

init = PythonOperator(
    task_id='init_step',
    python_callable=init_step,
)
# Start log table
def start_log_table_task(**context):
    """Record the start of this replication run in the Oracle log table."""
    ti = context["ti"]
    # Pull the procedure arguments pushed by init_step, in call order.
    proc_args = [
        ti.xcom_pull(task_ids='init_step', key=key)
        for key in ('p_run_id', 'p_service_name', 'p_table_owner', 'p_table_name')
    ]
    connection = None
    try:
        connection = oraconn.connect('MRDS_LOADER')
        oraconn.run_proc(
            connection,
            'MRDS_LOADER.DATA_REPLICATOR.start_log_table',
            proc_args,
        )
        connection.commit()
        logging.info("start_log_table procedure executed successfully")
    except Exception as e:
        logging.error(f"Error in start_log_table: {e}")
        raise
    finally:
        if connection:
            connection.close()

t1 = PythonOperator(
    task_id='start_log_table',
    python_callable=start_log_table_task,
)
# Export table
def export_table_task(**context):
    """Export the configured table to the object store via a PL/SQL procedure."""
    ti = context["ti"]
    # Procedure arguments, pulled from init_step in call order.
    proc_args = [
        ti.xcom_pull(task_ids='init_step', key=key)
        for key in ('p_service_name', 'p_table_owner', 'p_table_name', 'p_objectstore_uri')
    ]
    connection = None
    try:
        connection = oraconn.connect('MRDS_LOADER')
        oraconn.run_proc(
            connection,
            'MRDS_LOADER.DATA_REPLICATOR.export_table',
            proc_args,
        )
        connection.commit()
        logging.info("export_table procedure executed successfully")
    except Exception as e:
        logging.error(f"Error in export_table: {e}")
        raise
    finally:
        if connection:
            connection.close()

t2 = PythonOperator(
    task_id='export_table',
    python_callable=export_table_task,
    trigger_rule=TriggerRule.ALL_DONE,
)
# Check if previous tasks succeeded before triggering child DAG
def check_previous_tasks_success(**context):
    """Fail fast unless both upstream tasks finished in 'success' state."""
    ti = context["ti"]
    dag_run = context['dag_run']
    store = ti.xcom_pull(task_ids='init_step', key='store')
    failed = []
    for tid in ('start_log_table', 'export_table'):
        upstream = dag_run.get_task_instance(tid)
        if upstream and upstream.state != 'success':
            failed.append(f"{tid}:{upstream.state}")
    if failed:
        error_msg = f"Cannot proceed with {store} table generator. Previous tasks not successful: {', '.join(failed)}"
        logging.error(error_msg)
        raise AirflowFailException(error_msg)
    logging.info(f"All previous tasks succeeded. Ready to trigger {store} table generator.")
    return True

t3_check = PythonOperator(
    task_id='check_previous_tasks_success',
    python_callable=check_previous_tasks_success,
    trigger_rule=TriggerRule.ALL_DONE,
)
def drop_table(**context):
    """Clear the S3 objects backing the <TABLE>_COPY staging table.

    Calls the clear_s3_bucket procedure for the copy table so the subsequent
    table-generator run starts from an empty prefix.
    """
    ti = context["ti"]
    p_service_name = ti.xcom_pull(task_ids='init_step', key='p_service_name')
    p_table_owner = ti.xcom_pull(task_ids='init_step', key='p_table_owner')
    p_table_name = ti.xcom_pull(task_ids='init_step', key='p_table_name')
    p_objectstore_uri = ti.xcom_pull(task_ids='init_step', key='p_objectstore_uri')
    # The procedure targets the _COPY staging table, not the source table.
    copy_table_name = p_table_name + '_COPY'
    oracle_conn = None
    try:
        oracle_conn = oraconn.connect('MRDS_LOADER')
        oraconn.run_proc(
            oracle_conn,
            'MRDS_LOADER.DATA_REPLICATOR.clear_s3_bucket',
            [p_service_name, p_table_owner, copy_table_name, p_objectstore_uri]
        )
        oracle_conn.commit()
        # FIX: the old log sliced p_table_name[:-5], chopping five real
        # characters off the source table name (the '_COPY' suffix is only
        # present on the staging table). Log the table actually cleared.
        logging.info("clear_s3_bucket for table %s procedure executed successfully",
                     copy_table_name.lower())
    except Exception as e:
        logging.error(f"Error in clear_s3_bucket: {e}")
        raise
    finally:
        if oracle_conn:
            oracle_conn.close()

t3_drop = PythonOperator(
    task_id='drop_table',
    python_callable=drop_table,
    trigger_rule=TriggerRule.ALL_DONE,
)
# Trigger table generator DAG based on store
def trigger_table_generator(**context):
    """Trigger the store-specific table-generator DAG for the _COPY table."""
    from airflow.api.common.trigger_dag import trigger_dag
    ti = context["ti"]
    store = ti.xcom_pull(task_ids='init_step', key='store')
    table_owner = ti.xcom_pull(task_ids='init_step', key='p_table_owner')
    table_name = ti.xcom_pull(task_ids='init_step', key='p_table_name')
    # Map each store to its dedicated generator DAG.
    generator_dags = {
        "mopdb": 'devo_table_generator_trigger_mopdb',
        "rar": 'devo_table_generator_trigger_rar',
        "rqsd": 'devo_table_generator_trigger_rqsd',
    }
    if store not in generator_dags:
        raise ValueError(f"Unsupported store: {store}")
    target_dag_id = generator_dags[store]
    # The downstream DAG works on the _COPY staging table.
    owner_table_with_copy = f"{table_owner}.{table_name}_COPY"
    trigger_conf = {
        "owner_table": owner_table_with_copy
    }
    logging.info(f"Triggering {target_dag_id} with conf: {trigger_conf}")
    try:
        dag_run = trigger_dag(
            dag_id=target_dag_id,
            conf=trigger_conf,
            execution_date=None,
            replace_microseconds=False
        )
        logging.info(f"Successfully triggered {target_dag_id}, run_id: {dag_run.run_id}")
        ti.xcom_push(key='triggered_dag_run_id', value=dag_run.run_id)
        ti.xcom_push(key='triggered_dag_id', value=target_dag_id)
        return dag_run.run_id
    except Exception as e:
        logging.error(f"Error triggering {target_dag_id}: {e}")
        raise

t3_trigger = PythonOperator(
    task_id='trigger_table_generator',
    python_callable=trigger_table_generator,
    trigger_rule=TriggerRule.ALL_DONE,
)
# End log table
def end_log_table_task(**context):
    """Close out the run's row in the Oracle log table (best effort).

    Errors are logged but swallowed on purpose: this is a cleanup task that
    runs regardless of upstream outcome (trigger_rule ALL_DONE).
    """
    ti = context["ti"]
    proc_args = [
        ti.xcom_pull(task_ids='init_step', key=key)
        for key in ('p_service_name', 'p_table_owner', 'p_table_name')
    ]
    connection = None
    try:
        connection = oraconn.connect('MRDS_LOADER')
        oraconn.run_proc(
            connection,
            'MRDS_LOADER.DATA_REPLICATOR.end_log_table',
            proc_args,
        )
        connection.commit()
        logging.info("end_log_table procedure executed successfully")
    except Exception as e:
        logging.error(f"Error in end_log_table: {e}")
        logging.info("Continuing despite end_log_table error (cleanup task)")
    finally:
        if connection:
            connection.close()

t4 = PythonOperator(
    task_id='end_log_table',
    python_callable=end_log_table_task,
    trigger_rule=TriggerRule.ALL_DONE,
)
# Check status and fail if needed
def fail_if_any_failed(**context):
    """Mark the DAG run failed when any critical task ended in 'failed'."""
    dag_run = context['dag_run']
    check_tasks = ['start_log_table', 'export_table', 'check_previous_tasks_success', 'trigger_table_generator']
    # Collect every critical task whose instance ended in the 'failed' state.
    failed = [
        task_id
        for task_id in check_tasks
        if (instance := dag_run.get_task_instance(task_id)) and instance.state == 'failed'
    ]
    if failed:
        error_msg = f"Critical task(s) failed: {', '.join(failed)}. DAG execution failed."
        logging.error(error_msg)
        raise AirflowFailException(error_msg)
    logging.info("All critical tasks completed successfully: %s", check_tasks)

t5 = PythonOperator(
    task_id='fail_if_any_failed',
    python_callable=fail_if_any_failed,
    trigger_rule=TriggerRule.ALL_DONE,
)
# Dependencies
init >> t1 >> t2 >> t3_check >> t3_drop >> t3_trigger
[t1, t2, t3_trigger] >> t4
t4 >> t5

View File

@@ -0,0 +1,239 @@
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.hooks.S3_hook import S3Hook
from datetime import datetime, timedelta
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import io
import os
import logging
import sys
sys.path.append('/opt/airflow/python/mrds_common')
from mrds.utils import oraconn
# --- Source table configuration ---
SERVICE_NAME = "SERVICE_NAME"
OWNER = "C2D"
TABLE_NAME = "T_CEPH"
# Metadata inventory table used to build the casting SELECT.
METADATA_OWNER = "CT_MOPDB"
METADATA_TABLE = "mopdb_metadata_inventory"
# --- Output configuration: local disk vs S3 ---
USE_LOCAL_STORAGE = True
LOCAL_OUTPUT_DIR = "/tmp/devo_replicator_output"
S3_BUCKET = "bucket-name"  # NOTE(review): placeholder value — confirm before use
S3_PREFIX = "devo/replicator/C2D/T_CEPH/"
AWS_CONN_ID = "aws_default"
DEFAULT_ARGS = {
    "owner": "airflow",
    "depends_on_past": False,
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}
DAG_ID = "devo_replicator_pandas"
SCHEDULE_INTERVAL = None
# Rows fetched per pandas chunk when streaming the partition query.
CHUNK_SIZE = 100000
def query_oracle_template(owner, table_name):
    """Build the casting SELECT for *owner.table_name* from the metadata inventory.

    Returns a tuple ``(output_query, column_metadata)`` where ``output_query``
    is a SELECT with per-column CASTs and ``column_metadata`` maps column name
    to the pandas dtype to coerce each chunk to.

    Raises:
        ValueError: when no metadata rows exist for the table.
    """
    # FIX: initialize before the try so the finally block never hits a
    # NameError when an early statement raises.
    conn = None
    try:
        input_query = """SELECT
COLUMN_NAME,
DATA_TYPE,
CHAR_LENGTH,
DATA_PRECISION,
DATA_SCALE,
COLUMN_ID,
CASE
WHEN DATA_TYPE = 'DATE' OR DATA_TYPE LIKE '%TIMESTAMP%' THEN
'CAST(' || COLUMN_NAME || ' AS VARCHAR2(100)) AS ' || COLUMN_NAME
WHEN DATA_TYPE = 'VARCHAR2' OR DATA_TYPE LIKE '%CHAR%' THEN
'CAST(' || COLUMN_NAME || ' AS VARCHAR2(' || CAST(CHAR_LENGTH AS INT) || ')) AS ' || COLUMN_NAME
WHEN DATA_TYPE IN ('NUMBER', 'DECIMAL') AND DATA_PRECISION IS NOT NULL AND DATA_SCALE IS NOT NULL THEN
'CAST(' || COLUMN_NAME || ' AS ' || DATA_TYPE || '(' || CAST(DATA_PRECISION AS INT) || ',' || CAST(DATA_SCALE AS INT) || ')) AS ' || COLUMN_NAME
WHEN DATA_TYPE IN ('NUMBER', 'DECIMAL') AND DATA_PRECISION IS NOT NULL THEN
'CAST(' || COLUMN_NAME || ' AS ' || DATA_TYPE || '(' || CAST(DATA_PRECISION AS INT) || ')) AS ' || COLUMN_NAME
WHEN DATA_TYPE = 'CLOB' THEN
'TO_CHAR(SUBSTR(' || COLUMN_NAME || ', 1, 32767)) AS ' || COLUMN_NAME
ELSE
COLUMN_NAME
END AS casting
FROM {0}.{1}
WHERE OWNER = '{2}' AND TABLE_NAME = '{3}' AND A_VALID_TO > SYSDATE
ORDER BY COLUMN_ID""".format(METADATA_OWNER, METADATA_TABLE, owner, table_name)
        # FIX: the module is imported as `oraconn`; the bare name `connect`
        # was undefined and raised NameError at runtime.
        conn = oraconn.connect('MRDS_LOADER')
        df = pd.read_sql(input_query, conn)
        if df.empty:
            raise ValueError(f"No metadata found for {owner}.{table_name}")
        output_query = 'SELECT ' + ', \n'.join(df['casting'].tolist()) + ' FROM {0}.{1}'.format(owner, table_name)
        # Derive a target pandas dtype per column from the Oracle metadata:
        # scaled numbers -> float64, small/large integer precision -> Int32/Int64
        # (nullable ints), everything else -> string.
        column_metadata = {}
        for _, row in df.iterrows():
            col_name = row['COLUMN_NAME']
            data_type = row['DATA_TYPE']
            if data_type in ('NUMBER', 'DECIMAL', 'FLOAT', 'BINARY_FLOAT', 'BINARY_DOUBLE'):
                if pd.notna(row['DATA_SCALE']) and row['DATA_SCALE'] > 0:
                    column_metadata[col_name] = 'float64'
                elif pd.notna(row['DATA_PRECISION']) and row['DATA_PRECISION'] <= 9:
                    column_metadata[col_name] = 'Int32'
                elif pd.notna(row['DATA_PRECISION']) and row['DATA_PRECISION'] <= 18:
                    column_metadata[col_name] = 'Int64'
                else:
                    column_metadata[col_name] = 'float64'
            elif data_type == 'DATE' or 'TIMESTAMP' in data_type:
                column_metadata[col_name] = 'string'
            else:
                column_metadata[col_name] = 'string'
        logging.info(f"Generated query template with {len(df)} columns")
        return output_query, column_metadata
    except Exception as e:
        logging.error(f"Error in query_oracle_template: {e}")
        raise
    finally:
        if conn:
            conn.close()
def query_oracle_and_generate_parquet(partition_num, partitions, sql, column_metadata, use_local):
    """Stream one ORA_HASH partition of *sql* into a Parquet file.

    Rows are read in pandas chunks of CHUNK_SIZE, each chunk is coerced to the
    dtypes in *column_metadata*, and the output is written either to a local
    file (use_local=True) or buffered in memory and uploaded to S3.
    """
    logging.info(f"[Pandas-Partition {partition_num}] Starting processing (Mode: {'LOCAL' if use_local else 'S3'})")
    # Partition the source query by ORA_HASH of ROWID so each mapped task
    # reads a disjoint slice of the table.
    partition_sql = f"""SELECT /*+ PARALLEL(t, {partitions}) */ *
FROM (
{sql}
) t
WHERE ORA_HASH(ROWID, {partitions - 1}) = {partition_num}"""
    # FIX: the module is imported as `oraconn`; the bare name `connect` was
    # undefined and raised NameError at runtime.
    conn = oraconn.connect("MRDS_LOADER")
    if use_local:
        os.makedirs(LOCAL_OUTPUT_DIR, exist_ok=True)
        output_path = os.path.join(LOCAL_OUTPUT_DIR, f"partition_{partition_num:04d}.parquet")
        file_handle = open(output_path, 'wb')
        write_target = file_handle
    else:
        s3_hook = S3Hook(aws_conn_id=AWS_CONN_ID)
        s3_client = s3_hook.get_conn()
        s3_key = f"{S3_PREFIX}partition_{partition_num:04d}.parquet"
        buffer = io.BytesIO()
        write_target = buffer
    try:
        chunk_iterator = pd.read_sql(partition_sql, conn, chunksize=CHUNK_SIZE)
        pqwriter = None  # created lazily from the first chunk's schema
        total_rows = 0
        chunk_count = 0
        for chunk in chunk_iterator:
            # Coerce each column to its target dtype; failures are logged and
            # the column is left as-is (best effort optimization).
            for col, dtype in column_metadata.items():
                if col in chunk.columns:
                    try:
                        if dtype == 'string':
                            chunk[col] = chunk[col].astype('string')
                        elif dtype.startswith('Int'):
                            chunk[col] = pd.to_numeric(chunk[col], errors='coerce').astype(dtype)
                        elif dtype == 'float64':
                            chunk[col] = pd.to_numeric(chunk[col], errors='coerce')
                    except Exception as e:
                        logging.warning(f"[Pandas-Partition {partition_num}] Could not optimize column {col}: {e}")
            table = pa.Table.from_pandas(chunk, preserve_index=False)
            if pqwriter is None:
                pqwriter = pq.ParquetWriter(
                    write_target,
                    table.schema,
                    compression='snappy',
                    use_dictionary=True,
                    write_statistics=True,
                    version='2.6',
                    data_page_size=1024*1024,
                )
            pqwriter.write_table(table)
            total_rows += len(chunk)
            chunk_count += 1
            if chunk_count % 10 == 0:
                if use_local:
                    file_size_mb = os.path.getsize(output_path) / 1024 / 1024
                else:
                    file_size_mb = write_target.tell() / 1024 / 1024
                logging.info(f"[Pandas-Partition {partition_num}] Processed {total_rows:,} rows, Size: {file_size_mb:.2f} MB")
        if pqwriter:
            pqwriter.close()
        if use_local:
            file_size_mb = os.path.getsize(output_path) / 1024 / 1024
            logging.info(f"[Pandas-Partition {partition_num}] Completed - {total_rows:,} rows, {file_size_mb:.2f} MB saved to {output_path}")
        else:
            write_target.seek(0)
            buffer_size_mb = write_target.getbuffer().nbytes / 1024 / 1024
            logging.info(f"[Pandas-Partition {partition_num}] Uploading {buffer_size_mb:.2f} MB to s3://{S3_BUCKET}/{s3_key}")
            s3_client.upload_fileobj(write_target, S3_BUCKET, s3_key)
            logging.info(f"[Pandas-Partition {partition_num}] Completed - {total_rows:,} rows, {buffer_size_mb:.2f} MB uploaded to S3")
    except Exception as e:
        logging.error(f"[Pandas-Partition {partition_num}] Error: {e}")
        raise
    finally:
        conn.close()
        if use_local:
            file_handle.close()
        else:
            write_target.close()
def generate_tasks(dag, partitions, sql, column_metadata, use_local):
    """Create one PythonOperator per hash partition and return them as a list."""
    return [
        PythonOperator(
            task_id=f"generate_parquet_partition_{partition_num}",
            python_callable=query_oracle_and_generate_parquet,
            op_kwargs={
                "partition_num": partition_num,
                "partitions": partitions,
                "sql": sql,
                "column_metadata": column_metadata,
                "use_local": use_local
            },
            provide_context=True,
            dag=dag,
        )
        for partition_num in range(partitions)
    ]
with DAG(
    dag_id='devo_replicator_pandas',
    default_args=DEFAULT_ARGS,
    description='Devo replicator using Pandas with dtype optimization',
    schedule_interval=SCHEDULE_INTERVAL,
    start_date=datetime(2024, 1, 1),
    catchup=False,
    tags=['DevoReplicator', 'Pandas'],
    max_active_runs=1,
    max_active_tasks=30,
) as dag:
    # NOTE(review): this runs an Oracle query at DAG-parse time (every
    # scheduler parse loop), which is an Airflow anti-pattern — confirm this
    # is acceptable or move the lookup into a task.
    query, column_metadata = query_oracle_template(OWNER, TABLE_NAME)
    # Number of ORA_HASH partitions / parallel export tasks.
    PARTITIONS = 16
    partition_tasks = generate_tasks(dag, PARTITIONS, query, column_metadata, USE_LOCAL_STORAGE)

View File

@@ -0,0 +1,244 @@
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.hooks.S3_hook import S3Hook
from datetime import datetime, timedelta
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import io
import logging
import sys
sys.path.append('/opt/airflow/python/mrds_common')
from mrds.utils import oraconn
# --- Source table configuration ---
SERVICE_NAME = "SERVICE_NAME"
OWNER = "C2D"
TABLE_NAME = "T_CEPH"
# Metadata inventory table used to build the casting SELECT.
METADATA_OWNER = "CT_MOPDB"
METADATA_TABLE = "mopdb_metadata_inventory"
## need to be changed
S3_BUCKET = "bucket-name"
S3_PREFIX = "devo/replicator/C2D/T_CEPH/"
AWS_CONN_ID = "aws_default"
DEFAULT_ARGS = {
    "owner": "airflow",
    "depends_on_past": False,
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}
DAG_ID = "devo_replicator_pyarrow"
SCHEDULE_INTERVAL = None
# Rows fetched per cursor.fetchmany() call while streaming the partition.
BATCH_SIZE = 100000
def query_oracle_template(owner, table_name):
    """Build the casting SELECT for *owner.table_name* from the metadata inventory.

    Returns the SELECT statement with per-column CASTs applied.

    Raises:
        ValueError: when no metadata rows exist for the table.
    """
    # FIX: initialize before the try so the finally block never hits a
    # NameError when an early statement raises.
    conn = None
    try:
        input_query = """SELECT
CASE
WHEN DATA_TYPE = 'DATE' OR DATA_TYPE LIKE '%TIMESTAMP%' THEN
'CAST(' || COLUMN_NAME || ' AS VARCHAR2(100)) AS ' || COLUMN_NAME
WHEN DATA_TYPE = 'VARCHAR2' OR DATA_TYPE LIKE '%CHAR%' THEN
'CAST(' || COLUMN_NAME || ' AS VARCHAR2(' || CAST(CHAR_LENGTH AS INT) || ')) AS ' || COLUMN_NAME
WHEN DATA_TYPE IN ('NUMBER', 'DECIMAL') AND DATA_PRECISION IS NOT NULL AND DATA_SCALE IS NOT NULL THEN
'CAST(' || COLUMN_NAME || ' AS ' || DATA_TYPE || '(' || CAST(DATA_PRECISION AS INT) || ',' || CAST(DATA_SCALE AS INT) || ')) AS ' || COLUMN_NAME
WHEN DATA_TYPE IN ('NUMBER', 'DECIMAL') AND DATA_PRECISION IS NOT NULL THEN
'CAST(' || COLUMN_NAME || ' AS ' || DATA_TYPE || '(' || CAST(DATA_PRECISION AS INT) || ')) AS ' || COLUMN_NAME
WHEN DATA_TYPE = 'CLOB' THEN
'TO_CHAR(SUBSTR(' || COLUMN_NAME || ', 1, 32767)) AS ' || COLUMN_NAME
ELSE
COLUMN_NAME
END AS casting
FROM {0}.{1}
WHERE OWNER = '{2}' AND TABLE_NAME = '{3}' AND A_VALID_TO > SYSDATE
ORDER BY COLUMN_ID""".format(METADATA_OWNER, METADATA_TABLE, owner, table_name)
        # FIX: the module is imported as `oraconn`; the bare name `connect`
        # was undefined and raised NameError at runtime.
        conn = oraconn.connect('MRDS_LOADER')
        df = pd.read_sql(input_query, conn)
        if df.empty:
            raise ValueError(f"No metadata found for {owner}.{table_name}")
        output_query = 'SELECT ' + ', \n'.join(df['casting'].tolist()) + ' FROM {0}.{1}'.format(owner, table_name)
        logging.info(f"Generated query template with {len(df)} columns")
        return output_query
    except Exception as e:
        logging.error(f"Error in query_oracle_template: {e}")
        raise
    finally:
        if conn:
            conn.close()
def query_oracle_and_generate_parquet(partition_num, partitions, sql):
    """Stream one ORA_HASH partition of *sql* into a Parquet file on S3.

    The Arrow schema is inferred from the first fetched batch (string is the
    fallback for all-NULL or unrecognized values); rows are written batch by
    batch into an in-memory buffer which is uploaded to S3 at the end.
    """
    logging.info(f"[PyArrow-Partition {partition_num}] Starting processing")
    # Partition the source query by ORA_HASH of ROWID so each mapped task
    # reads a disjoint slice of the table.
    partition_sql = f"""SELECT /*+ PARALLEL(t, {partitions}) */ *
FROM (
{sql}
) t
WHERE ORA_HASH(ROWID, {partitions - 1}) = {partition_num}"""
    # FIX: the module is imported as `oraconn`; the bare name `connect` was
    # undefined and raised NameError at runtime.
    conn = oraconn.connect("MRDS_LOADER")
    cursor = conn.cursor()
    cursor.arraysize = BATCH_SIZE
    cursor.prefetchrows = BATCH_SIZE
    s3_hook = S3Hook(aws_conn_id=AWS_CONN_ID)
    s3_client = s3_hook.get_conn()
    s3_key = f"{S3_PREFIX}partition_{partition_num:04d}.parquet"
    buffer = io.BytesIO()
    try:
        cursor.execute(partition_sql)
        column_names = [desc[0] for desc in cursor.description]
        first_batch_rows = cursor.fetchmany(BATCH_SIZE)
        if not first_batch_rows:
            logging.warning(f"[PyArrow-Partition {partition_num}] No data found")
            return
        # Infer an Arrow type per column from the first non-NULL sample value.
        arrow_fields = []
        sample_row = first_batch_rows[0]
        for i, col_name in enumerate(column_names):
            sample_val = sample_row[i]
            if sample_val is None:
                # Scan the rest of the batch for a non-NULL value.
                for row in first_batch_rows[1:]:
                    if row[i] is not None:
                        sample_val = row[i]
                        break
            if isinstance(sample_val, str):
                arrow_type = pa.string()
            elif isinstance(sample_val, int):
                arrow_type = pa.int64()
            elif isinstance(sample_val, float):
                arrow_type = pa.float64()
            elif isinstance(sample_val, (datetime, pd.Timestamp)):
                arrow_type = pa.timestamp('ns')
            elif isinstance(sample_val, bytes):
                arrow_type = pa.binary()
            else:
                # Unknown or still-NULL column: store as string.
                arrow_type = pa.string()
            arrow_fields.append(pa.field(col_name, arrow_type))
        schema = pa.schema(arrow_fields)
        writer = pq.ParquetWriter(
            buffer,
            schema,
            compression='snappy',
            use_dictionary=True,
            write_statistics=True,
            data_page_size=2*1024*1024,
            version='2.6',
        )
        def process_batch(rows):
            # Convert a list of row tuples into a RecordBatch matching `schema`;
            # columns that fail conversion fall back to stringified values.
            if not rows:
                return None
            columns_data = list(zip(*rows))
            arrays = []
            for i, col_data in enumerate(columns_data):
                try:
                    arrays.append(pa.array(col_data, type=schema.field(i).type))
                except Exception as e:
                    logging.warning(f"[PyArrow-Partition {partition_num}] Column {column_names[i]} conversion failed: {e}")
                    converted = [str(val) if val is not None else None for val in col_data]
                    arrays.append(pa.array(converted, type=pa.string()))
            return pa.RecordBatch.from_arrays(arrays, schema=schema)
        batch = process_batch(first_batch_rows)
        if batch:
            writer.write_batch(batch)
        total_rows = len(first_batch_rows)
        batch_count = 1
        while True:
            rows = cursor.fetchmany(BATCH_SIZE)
            if not rows:
                break
            batch = process_batch(rows)
            if batch:
                writer.write_batch(batch)
            total_rows += len(rows)
            batch_count += 1
            if batch_count % 10 == 0:
                buffer_size_mb = buffer.tell() / 1024 / 1024
                logging.info(f"[PyArrow-Partition {partition_num}] Processed {total_rows:,} rows, Buffer size: {buffer_size_mb:.2f} MB")
        writer.close()
        buffer.seek(0)
        buffer_size_mb = buffer.getbuffer().nbytes / 1024 / 1024
        logging.info(f"[PyArrow-Partition {partition_num}] Uploading {buffer_size_mb:.2f} MB to s3://{S3_BUCKET}/{s3_key}")
        s3_client.upload_fileobj(buffer, S3_BUCKET, s3_key)
        logging.info(f"[PyArrow-Partition {partition_num}] Completed - {total_rows:,} rows, {buffer_size_mb:.2f} MB uploaded to S3")
    except Exception as e:
        logging.error(f"[PyArrow-Partition {partition_num}] Error: {e}")
        raise
    finally:
        cursor.close()
        conn.close()
        buffer.close()
def generate_tasks(dag, partitions, sql):
    """Create one PythonOperator per hash partition and return them as a list."""
    return [
        PythonOperator(
            task_id=f"generate_parquet_partition_{partition_num}",
            python_callable=query_oracle_and_generate_parquet,
            op_kwargs={
                "partition_num": partition_num,
                "partitions": partitions,
                "sql": sql
            },
            provide_context=True,
            dag=dag,
        )
        for partition_num in range(partitions)
    ]
with DAG(
    dag_id='devo_replicator_pyarrow',
    default_args=DEFAULT_ARGS,
    description='Devo replicator using PyArrow native processing',
    schedule_interval=SCHEDULE_INTERVAL,
    start_date=datetime(2024, 1, 1),
    catchup=False,
    tags=['DevoReplicator', 'PyArrow'],
    max_active_runs=1,
    max_active_tasks=16,
) as dag:
    # NOTE(review): this runs an Oracle query at DAG-parse time (every
    # scheduler parse loop), which is an Airflow anti-pattern — confirm this
    # is acceptable or move the lookup into a task.
    query = query_oracle_template(OWNER, TABLE_NAME)
    # Number of ORA_HASH partitions / parallel export tasks.
    PARTITIONS = 16
    partition_tasks = generate_tasks(dag, PARTITIONS, query)

View File

@@ -0,0 +1,181 @@
# dags/dev_replicator_scheduler_rar.py
from __future__ import annotations
import sys
import logging
from datetime import datetime, timedelta
from airflow import DAG
from airflow.decorators import task
from airflow.operators.python import BranchPythonOperator
from airflow.operators.empty import EmptyOperator
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from datetime import datetime, timedelta
from mrds.utils import oraconn # your Oracle connection helper
sys.path.append('/opt/airflow/python/connectors/devo')
sys.path.append('/opt/airflow/python/mrds_common')
DAG_NAME = "devo_replicator_scheduler_rar"
TARGET_DAG_ID = "devo_replicator_trigger"
ORACLE_CONN_NAME = "MRDS_LOADER"
PRECONDITION_SQL = """
WITH LAST_UPDATE_ORACLE AS (
SELECT MAX(process_end) AS process_end
FROM CT_RAR.A_RAR_FOR_DISC_MONITORING
WHERE UPPER(owner || '.' || target_table_name) = UPPER(:table_name)
AND process_end IS NOT NULL
AND process_successful = 'Y'
),
LAST_UPDATE_DEVO AS (
SELECT CASE
WHEN last_status = 'FINISHED' THEN last_end_time
ELSE TO_DATE('01-JAN-1999', 'DD-MON-YYYY')
END AS process_end
FROM CT_MRDS.a_devo_replica_mgmt_rar
WHERE owner || '.' || table_name = :table_name
)
SELECT CASE
WHEN (SELECT process_end FROM LAST_UPDATE_ORACLE) >
(SELECT process_end FROM LAST_UPDATE_DEVO)
THEN 'Y' ELSE 'N'
END AS trigger_devo_replicator
FROM dual
"""
def _get_conn():
    """Open an Oracle connection using the configured connection name."""
    return oraconn.connect(ORACLE_CONN_NAME)

def get_devo_replica_table_options() -> list[str]:
    """Return every OWNER.TABLE_NAME registered in the RAR replica mgmt table.

    Returns an empty list (and logs the exception) on any error.
    """
    conn = None
    cur = None
    try:
        conn = _get_conn()
        cur = conn.cursor()
        cur.execute("""
SELECT OWNER || '.' || TABLE_NAME
FROM CT_MRDS.a_devo_replica_mgmt_rar
ORDER BY OWNER, TABLE_NAME
""")
        tables = [record[0] for record in (cur.fetchall() or [])]
        logging.info("Fetched %d table(s) from replica mgmt.", len(tables))
        return tables
    except Exception:
        logging.exception("Error getting DEVO replica table options")
        return []
    finally:
        try:
            if cur is not None:
                cur.close()
        except Exception:
            pass
        if conn is not None:
            conn.close()
def check_table_precondition(table_full_name: str) -> dict:
    """Returns {"table": <OWNER.TABLE>, "trigger": "Y"|"N"}."""
    conn = None
    cur = None
    try:
        conn = _get_conn()
        cur = conn.cursor()
        cur.execute(PRECONDITION_SQL, {"table_name": table_full_name})
        row = cur.fetchone()
        # Treat "no row" and a NULL flag both as 'N'.
        status = (row[0] if row else 'N') or 'N'
        logging.info("Precondition for %s: %s", table_full_name, status)
        return {"table": table_full_name, "trigger": status}
    except Exception:
        logging.exception("Error checking precondition for %s", table_full_name)
        return {"table": table_full_name, "trigger": "N"}  # fail closed
    finally:
        try:
            if cur is not None:
                cur.close()
        except Exception:
            pass
        if conn is not None:
            conn.close()
# Default task-level arguments shared by every task in this DAG.
default_args = {
    'owner': 'devo',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}
with DAG(
    dag_id=DAG_NAME,
    description="Checks DEVO replica preconditions; triggers devo_replicator_trigger_rar once if any table is stale",
    default_args=default_args,
    # NOTE(review): dynamic start_date (datetime.now()) is discouraged by
    # Airflow — consider a fixed date. Kept as-is to avoid a behavior change.
    start_date=datetime.now() - timedelta(days=2),
    catchup=False,
    # FIX: both `schedule=None` and `schedule_interval=...` were passed; the
    # two parameters are mutually exclusive and Airflow rejects the DAG.
    # Keep only the cron schedule the comment documents.
    schedule_interval='*/10 * * * *',  # every 10 minutes
    max_active_runs=1,
    tags=["DevoScheduler", "DevoReplicatorTrigger"],
) as dag:
    @task
    def fetch_tables() -> list[str]:
        """Enumerate all OWNER.TABLE candidates from the replica mgmt table."""
        tables = get_devo_replica_table_options()
        if not tables:
            logging.warning("No tables returned from enumeration.")
        return tables

    @task
    def check_one(table_name: str) -> dict:
        """Evaluate the staleness precondition for a single table."""
        return check_table_precondition(table_name)

    @task
    def summarize(results: list[dict]) -> dict:
        """Aggregate the mapped precondition results into a single summary."""
        y_tables = [r["table"] for r in results if r and r.get("trigger") == "Y"]
        n_tables = [r["table"] for r in results if r and r.get("trigger") == "N"]
        logging.info("Precondition summary -> Y: %d, N: %d", len(y_tables), len(n_tables))
        if y_tables:
            logging.info("Tables needing replication: %s", ", ".join(y_tables))
        else:
            logging.info("No tables are updated/stale; nothing to trigger.")
        return {"any_true": bool(y_tables), "y_tables": y_tables}

    def decide_branch(summary: dict) -> str:
        """Return the EXACT downstream task_id to follow."""
        return "prepare_trigger_conf" if summary.get("any_true") else "no_updates"

    @task
    def prepare_trigger_conf(summary: dict) -> dict:
        """Single conf payload for the downstream DAG."""
        return {"tables_to_replicate": summary.get("y_tables", [])}

    no_updates = EmptyOperator(task_id="no_updates")
    # Graph
    tables = fetch_tables()
    results = check_one.expand(table_name=tables)  # dynamic mapping across tables
    summary = summarize(results)
    branch = BranchPythonOperator(
        task_id="branch_on_any",
        python_callable=decide_branch,
        op_args=[summary],  # XComArg from summarize
    )
    conf_payload = prepare_trigger_conf(summary)
    trigger_devo = TriggerDagRunOperator(
        task_id="trigger_devo_replicator_rar",
        trigger_dag_id=TARGET_DAG_ID,
        wait_for_completion=True,
        reset_dag_run=True,
        conf=conf_payload,
    )
    # Wire branching — only ONE instance of prepare_trigger_conf is referenced
    summary >> branch
    branch >> no_updates
    branch >> conf_payload >> trigger_devo

View File

@@ -0,0 +1,255 @@
from __future__ import annotations
import os
import sys
import logging
import time
from datetime import timedelta
from airflow import DAG
from airflow.utils.dates import days_ago
from airflow.operators.python import PythonOperator
from airflow.models import Param
from airflow.decorators import task
from airflow.providers.oracle.hooks.oracle import OracleHook
from mrds.utils import oraconn
sys.path.append('/opt/airflow/python/connectors/devo')
sys.path.append('/opt/airflow/python/mrds_common')
ORACLE_CONN_ID = "MRDS_LOADER"
# TARGET_DAG_ID = "devo_replicator_trigger_rar"
def get_rar_table_options():
    """Return all OWNER.TABLE_NAME options from the RAR replica mgmt table.

    Returns an empty list (and logs the error) when the lookup fails, so the
    caller can still render the DAG params without a database.
    """
    oracle_conn = None
    cursor = None
    try:
        oracle_conn = oraconn.connect('MRDS_LOADER')
        cursor = oracle_conn.cursor()
        cursor.execute("""
SELECT OWNER || '.' || TABLE_NAME
FROM CT_MRDS.a_devo_replica_mgmt_rar
ORDER BY OWNER, TABLE_NAME
""")
        options = [row[0] for row in cursor.fetchall()]
        return options
    except Exception as e:
        logging.error(f"Error getting RAR table options: {e}")
        return []
    finally:
        # FIX: the cursor was only closed on the success path and leaked on
        # error; close it on every path (best effort) before the connection.
        if cursor:
            try:
                cursor.close()
            except Exception:
                pass
        if oracle_conn:
            oracle_conn.close()
# Default task-level arguments shared by every task in this DAG.
default_args = {
    'owner': 'devo',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}
# Entry-point DAG for RAR replication: accepts either a single
# `owner_table`, a `tables_to_replicate` list (from the scheduler DAG), or
# no conf at all (falls back to every registered table).
with DAG(
    dag_id='devo_replicator_trigger',
    default_args=default_args,
    description='External trigger DAG for RAR tables',
    schedule=None,  # only triggered externally
    catchup=False,
    tags=['DevoReplicator', 'DevoReplicatorTrigger'],
    max_active_runs=1,
    params={
        # still allow manual runs from the UI
        "owner_table": Param(
            default=None,
            type=["string", "null"],
            description="Select table in format OWNER.TABLE_NAME",
            # enum=get_rar_table_options()
        )
    }
) as dag:
    # --- Init: read conf ---
# --- Init: read conf ---
def init_step(**context):
    """Read the run configuration, validate the environment, push it to XCom."""
    dag_run = context.get("dag_run")
    ti = context["ti"]
    conf = (dag_run.conf or {}) if dag_run else {}
    env = os.getenv("MRDS_ENV")
    if not env:
        raise ValueError("MRDS_ENV environment variable is required")
    env = env.lower()
    store = "rar"
    owner_table = conf.get("owner_table")  # optional single table
    tables_to_replicate = conf.get("tables_to_replicate")  # optional list of OWNER.TABLE
    # Report which configuration source this run is using.
    if tables_to_replicate:
        logging.info("Received tables_to_replicate from upstream: %d table(s).", len(tables_to_replicate))
    elif owner_table:
        logging.info("Received single owner_table from conf: %s", owner_table)
    else:
        logging.info("No conf provided; manual UI param may be used or fallback to full list in get_table_list.")
    if env not in {"dev", "tst", "acc", "prd"}:
        raise ValueError(f"Unsupported env '{env}'. Expected 'dev', 'tst', 'acc' or 'prd'.")
    # Publish the (possibly None) settings for the downstream tasks.
    for key, value in {
        "env": env,
        "store": store,
        "owner_table": owner_table,
        "tables_to_replicate": tables_to_replicate,
    }.items():
        ti.xcom_push(key=key, value=value)

init = PythonOperator(
    task_id='init_step',
    python_callable=init_step,
)
# --- Build the processing list ---
def get_table_list(**context):
ti = context["ti"]
store = ti.xcom_pull(task_ids='init_step', key='store')
owner_table = ti.xcom_pull(task_ids='init_step', key='owner_table')
tables_to_replicate = ti.xcom_pull(task_ids='init_step', key='tables_to_replicate')
# 1) If upstream provided a list, use it
if tables_to_replicate:
logging.info("Using tables_to_replicate list from conf: %d items", len(tables_to_replicate))
tables = []
for ot in tables_to_replicate:
if '.' not in ot:
logging.warning("Skipping malformed owner_table (no dot): %s", ot)
continue
table_owner, table_name = ot.split('.', 1)
tables.append((table_owner, table_name))
ti.xcom_push(key='tables_to_process', value=tables)
return tables
# 2) Else if a single owner_table provided (manual/programmatic)
if owner_table:
table_owner, table_name = owner_table.split('.', 1)
tables = [(table_owner, table_name)]
logging.info("Processing single table from conf/params: %s", owner_table)
ti.xcom_push(key='tables_to_process', value=tables)
return tables
# 3) Else fallback to full list in DB (manual run without conf)
oracle_conn = None
try:
oracle_conn = oraconn.connect('MRDS_LOADER')
cursor = oracle_conn.cursor()
cursor.execute("""
SELECT OWNER, TABLE_NAME
FROM CT_MRDS.a_devo_replica_mgmt_rar
ORDER BY OWNER, TABLE_NAME
""")
tables = cursor.fetchall()
cursor.close()
logging.info("Fallback: Found %d tables for RAR", len(tables))
ti.xcom_push(key='tables_to_process', value=tables)
return tables
except Exception as e:
logging.error(f"Error in get_table_list: {e}")
raise
finally:
if oracle_conn:
oracle_conn.close()
t1 = PythonOperator(
task_id='get_table_list',
python_callable=get_table_list,
)
# --- Keep your existing throttled triggering logic unchanged ---
def check_and_trigger(**context):
ti = context["ti"]
env = ti.xcom_pull(task_ids='init_step', key='env')
store = ti.xcom_pull(task_ids='init_step', key='store')
threshold = 30 # you were pushing 30; keep it here or push from init
tables = ti.xcom_pull(task_ids='get_table_list', key='tables_to_process')
oracle_conn = None
triggered_count = 0
try:
oracle_conn = oraconn.connect('MRDS_LOADER')
for table_owner, table_name in tables:
logging.info("Processing table: %s.%s", table_owner, table_name)
while True:
cursor = oracle_conn.cursor()
service_name = store.upper()
sql_query = f"""
SELECT
(SELECT NVL(SUM(MAX_THREADS),0) FROM CT_MRDS.A_DEVO_REPLICA_MGMT_MOPDB WHERE LAST_STATUS = 'RUNNING') +
(SELECT NVL(SUM(MAX_THREADS),0) FROM CT_MRDS.A_DEVO_REPLICA_MGMT_RAR WHERE LAST_STATUS = 'RUNNING')
AS TOTAL_RUNNING_THREADS_NOW,
(SELECT COUNT(*)
FROM CT_MRDS.A_DEVO_REPLICA_MGMT_{service_name}
WHERE OWNER = '{table_owner}' AND TABLE_NAME = '{table_name}' AND LAST_STATUS = 'RUNNING') AS TABLE_IS_ALREADY_RUNNING
FROM DUAL
"""
cursor.execute(sql_query)
total_running_val, table_running_val = cursor.fetchone()
cursor.close()
logging.info(
"Total running: %d, threshold: %d, table running: %d",
total_running_val or 0, threshold, table_running_val or 0
)
if (total_running_val or 0) > threshold:
logging.info("Threshold exceeded. Waiting 5 minutes...")
time.sleep(300)
continue
if (table_running_val or 0) >= 1:
logging.info("Table %s.%s already running. Skipping.", table_owner, table_name)
break
# Trigger the core DAG for this specific table
from airflow.api.common.trigger_dag import trigger_dag
conf = {"store": store, "owner_table": f"{table_owner}.{table_name}"}
trigger_dag(
dag_id='devo_replicator_core',
conf=conf,
execution_date=None,
replace_microseconds=False
)
triggered_count += 1
logging.info("Triggered core DAG for table %s.%s", table_owner, table_name)
break
logging.info("Total core DAGs triggered: %d", triggered_count)
ti.xcom_push(key='triggered_count', value=triggered_count)
except Exception as e:
logging.error(f"Error in check_and_trigger: {e}")
raise
finally:
if oracle_conn:
oracle_conn.close()
t2 = PythonOperator(
task_id='check_and_trigger',
python_callable=check_and_trigger,
)
init >> t1 >> t2
"""
Reads tables_to_replicate from dag_run.conf in init_step,
pushes it to XCom (so get_table_list can use it), and
makes get_table_list prioritize the provided list.
init_step reads tables_to_replicate from dag_run.conf and puts it into XCom.
get_table_list prioritizes that list; falls back to owner_table or full table list only if needed.
check_and_trigger loops over those tables and triggers your core DAG (devo_replicator_core) per table, respecting your concurrency threshold.
"""

View File

@@ -0,0 +1,257 @@
from __future__ import annotations
import os
import sys
import logging
import time
from datetime import timedelta
from airflow import DAG
from airflow.utils.dates import days_ago
from airflow.operators.python import PythonOperator
from airflow.models import Param
sys.path.append('/opt/airflow/python/connectors/devo')
sys.path.append('/opt/airflow/python/mrds_common')
from mrds.utils import oraconn
# Get MOPDB table options for dropdown
def get_mopdb_table_options():
    """Return 'OWNER.TABLE_NAME' strings for the UI dropdown.

    Queries CT_MRDS.a_devo_replica_mgmt_mopdb; on any failure the error is
    logged and an empty list is returned so DAG parsing never breaks.
    """
    connection = None
    try:
        connection = oraconn.connect('MRDS_LOADER')
        cursor = connection.cursor()
        cursor.execute(
            "SELECT OWNER || '.' || TABLE_NAME "
            "FROM CT_MRDS.a_devo_replica_mgmt_mopdb "
            "ORDER BY OWNER, TABLE_NAME"
        )
        rows = cursor.fetchall()
        cursor.close()
        return [owner_table for (owner_table,) in rows]
    except Exception as e:
        logging.error(f"Error getting MOPDB table options: {e}")
        return []
    finally:
        if connection:
            connection.close()
# Default task arguments shared by every task in this DAG.
default_args = {
    'owner': 'devo',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,                          # retry each failed task once
    'retry_delay': timedelta(minutes=1),   # ...after a one-minute pause
}
# Externally-triggered orchestrator for MOPDB tables.
with DAG(
    dag_id='devo_replicator_trigger_mopdb',
    default_args=default_args,
    description='External trigger DAG for MOPDB tables',
    schedule=None,  # triggered externally or manually only
    catchup=False,
    tags=['DevoReplicator', 'DevoReplicatorTrigger'],
    params={
        # NOTE(review): default=None combined with type="string" (not
        # nullable) may be rejected by Airflow's param schema validation --
        # confirm, or use type=["string", "null"] as devo_replicator_trigger does.
        "owner_table": Param(
            default=None,
            type="string",
            description="Select table in format OWNER.TABLE_NAME",
            enum=get_mopdb_table_options()
        )
    }
) as dag:
# Init
def init_step(**context):
dag_run = context.get("dag_run")
ti = context["ti"]
conf = (dag_run.conf or {}) if dag_run else {}
env = os.getenv("MRDS_ENV")
if not env:
raise ValueError("MRDS_ENV environment variable is required")
env = env.lower()
store = "mopdb"
owner_table = conf.get("owner_table")
if not owner_table:
raise ValueError("owner_table parameter is required")
if '.' not in owner_table:
raise ValueError("owner_table must be in format 'OWNER.TABLE_NAME'")
table_owner, table_name = owner_table.split('.', 1)
if env not in {"dev", "tst", "acc", "prd"}:
raise ValueError(f"Unsupported env '{env}'. Expected 'dev', 'tst', 'acc' or 'prd'.")
logging.info("=== init_step === env=%s store=%s owner_table=%s",
env, store, owner_table)
xcom = {
"env": env,
"store": store,
"table_owner": table_owner,
"table_name": table_name,
"owner_table": owner_table,
"threshold": 30,
}
for k, v in xcom.items():
ti.xcom_push(key=k, value=v)
init = PythonOperator(
task_id='init_step',
python_callable=init_step,
)
# Get table list
def get_table_list(**context):
ti = context["ti"]
store = ti.xcom_pull(task_ids='init_step', key='store')
owner_table = ti.xcom_pull(task_ids='init_step', key='owner_table')
oracle_conn = None
try:
oracle_conn = oraconn.connect('MRDS_LOADER')
if owner_table:
table_owner, table_name = owner_table.split('.', 1)
tables = [(table_owner, table_name)]
logging.info("Processing specific table: %s", owner_table)
else:
cursor = oracle_conn.cursor()
cursor.execute("SELECT OWNER, TABLE_NAME FROM CT_MRDS.a_devo_replica_mgmt_mopdb ORDER BY OWNER, TABLE_NAME")
tables = cursor.fetchall()
cursor.close()
logging.info("Found %d tables for MOPDB", len(tables))
ti.xcom_push(key='tables_to_process', value=tables)
return tables
except Exception as e:
logging.error(f"Error in get_table_list: {e}")
raise
finally:
if oracle_conn:
oracle_conn.close()
t1 = PythonOperator(
task_id='get_table_list',
python_callable=get_table_list,
)
# Check and trigger core DAG
def check_and_trigger(**context):
ti = context["ti"]
env = ti.xcom_pull(task_ids='init_step', key='env')
store = ti.xcom_pull(task_ids='init_step', key='store')
threshold = ti.xcom_pull(task_ids='init_step', key='threshold')
tables = ti.xcom_pull(task_ids='get_table_list', key='tables_to_process')
oracle_conn = None
triggered_count = 0
try:
oracle_conn = oraconn.connect('MRDS_LOADER')
for table_owner, table_name in tables:
logging.info("Processing table: %s.%s", table_owner, table_name)
while True:
cursor = oracle_conn.cursor()
# Execute SQL query with variable substitution
service_name = store.upper()
sql_query = f"""
SELECT (SELECT CASE WHEN SUM(MAX_THREADS) IS NULL THEN 0 ELSE SUM(MAX_THREADS) END AS RUNNING_THREADS
FROM CT_MRDS.A_DEVO_REPLICA_MGMT_MOPDB
WHERE LAST_STATUS = 'RUNNING') +
(SELECT CASE WHEN SUM(MAX_THREADS) IS NULL THEN 0 ELSE SUM(MAX_THREADS) END AS RUNNING_THREADS
FROM CT_MRDS.A_DEVO_REPLICA_MGMT_RAR
WHERE LAST_STATUS = 'RUNNING')
AS TOTAL_RUNNING_THREADS_NOW,
(SELECT COUNT(*) FROM CT_MRDS.A_DEVO_REPLICA_MGMT_{service_name}
WHERE OWNER = '{table_owner}' AND TABLE_NAME = '{table_name}' AND LAST_STATUS = 'RUNNING') AS TABLE_IS_ALREADY_RUNNING
FROM DUAL
"""
cursor.execute(sql_query)
result = cursor.fetchone()
total_running_val = result[0] or 0
table_running_val = result[1] or 0
cursor.close()
logging.info("Total running: %d, threshold: %d, table running: %d",
total_running_val, threshold, table_running_val)
if total_running_val > threshold:
logging.info("Threshold exceeded. Waiting 5 minutes...")
time.sleep(300)
continue
if table_running_val >= 1:
logging.info("Table %s.%s is already running. Skipping.", table_owner, table_name)
break
# Trigger core DAG
from airflow.api.common.trigger_dag import trigger_dag
conf = {
"store": store,
"owner_table": f"{table_owner}.{table_name}"
}
trigger_dag(
dag_id='devo_replicator_core',
conf=conf,
execution_date=None,
replace_microseconds=False
)
triggered_count += 1
logging.info("Triggered core DAG for table %s.%s", table_owner, table_name)
break
logging.info("Total DAGs triggered: %d", triggered_count)
ti.xcom_push(key='triggered_count', value=triggered_count)
except Exception as e:
logging.error(f"Error in check_and_trigger: {e}")
raise
finally:
if oracle_conn:
oracle_conn.close()
t2 = PythonOperator(
task_id='check_and_trigger',
python_callable=check_and_trigger,
)
# Dependencies
init >> t1 >> t2
"""
MOPDB Trigger DAG
1) init_step
- Gets environment from MRDS_ENV environment variable
- Reads owner_table parameter from DAG configuration
- Validates owner_table format (must be OWNER.TABLE_NAME)
- Sets store to "mopdb" (fixed for this DAG)
- Sets threshold to 30 (max concurrent running threads)
- Pushes parameters to XCom
2) get_table_list
- Connects to Oracle database (MRDS_LOADER)
- If specific owner_table provided: creates single table list
- If no owner_table: queries all tables from CT_MRDS.a_devo_replica_mgmt_mopdb
- Returns list of (owner, table_name) tuples to process
- Pushes table list to XCom
3) check_and_trigger
- Loops through each table from the table list
- For each table, enters monitoring loop:
- Executes SQL query to check total running threads across MOPDB+RAR
- Checks if current table is already running
- If total threads > threshold (30): waits 5 minutes and rechecks
  - If table already running: skips to next table
- If conditions met: triggers core DAG with table parameters
- Counts and logs total number of DAGs triggered
- Ensures system doesn't exceed concurrent processing limits
"""

View File

@@ -0,0 +1,257 @@
from __future__ import annotations
import os
import sys
import logging
import time
from datetime import datetime, timedelta
from airflow import DAG
from airflow.utils.dates import days_ago
from airflow.operators.python import PythonOperator
from airflow.models import Param
sys.path.append('/opt/airflow/python/connectors/devo')
sys.path.append('/opt/airflow/python/mrds_common')
from mrds.utils import oraconn
# Get RAR table options for dropdown
def get_rar_table_options():
    """Return 'OWNER.TABLE_NAME' strings for the UI dropdown.

    Queries CT_MRDS.a_devo_replica_mgmt_rar; on any failure the error is
    logged and an empty list is returned so DAG parsing never breaks.
    """
    connection = None
    try:
        connection = oraconn.connect('MRDS_LOADER')
        cursor = connection.cursor()
        cursor.execute(
            "SELECT OWNER || '.' || TABLE_NAME "
            "FROM CT_MRDS.a_devo_replica_mgmt_rar "
            "ORDER BY OWNER, TABLE_NAME"
        )
        rows = cursor.fetchall()
        cursor.close()
        return [owner_table for (owner_table,) in rows]
    except Exception as e:
        logging.error(f"Error getting RAR table options: {e}")
        return []
    finally:
        if connection:
            connection.close()
# Default task arguments shared by every task in this DAG.
default_args = {
    'owner': 'devo',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,                          # retry each failed task once
    'retry_delay': timedelta(minutes=1),   # ...after a one-minute pause
}
# Externally-triggered orchestrator for RAR tables (single-table variant).
with DAG(
    dag_id='devo_replicator_trigger_rar',
    default_args=default_args,
    description='External trigger DAG for RAR tables',
    schedule=None,  # triggered externally or manually only
    catchup=False,
    tags=['DevoReplicator', 'DevoReplicatorTrigger'],
    params={
        # NOTE(review): default=None combined with type="string" (not
        # nullable) may be rejected by Airflow's param schema validation --
        # confirm, or use type=["string", "null"] as devo_replicator_trigger does.
        "owner_table": Param(
            default=None,
            type="string",
            description="Select table in format OWNER.TABLE_NAME",
            enum=get_rar_table_options()
        )
    }
) as dag:
# Init
def init_step(**context):
dag_run = context.get("dag_run")
ti = context["ti"]
conf = (dag_run.conf or {}) if dag_run else {}
env = os.getenv("MRDS_ENV")
if not env:
raise ValueError("MRDS_ENV environment variable is required")
env = env.lower()
store = "rar"
owner_table = conf.get("owner_table")
if not owner_table:
raise ValueError("owner_table parameter is required")
if '.' not in owner_table:
raise ValueError("owner_table must be in format 'OWNER.TABLE_NAME'")
table_owner, table_name = owner_table.split('.', 1)
if env not in {"dev", "tst", "acc", "prd"}:
raise ValueError(f"Unsupported env '{env}'. Expected 'dev', 'tst', 'acc' or 'prd'.")
logging.info("=== init_step === env=%s store=%s owner_table=%s",
env, store, owner_table)
xcom = {
"env": env,
"store": store,
"table_owner": table_owner,
"table_name": table_name,
"owner_table": owner_table,
"threshold": 30,
}
for k, v in xcom.items():
ti.xcom_push(key=k, value=v)
init = PythonOperator(
task_id='init_step',
python_callable=init_step,
)
# Get table list
def get_table_list(**context):
ti = context["ti"]
store = ti.xcom_pull(task_ids='init_step', key='store')
owner_table = ti.xcom_pull(task_ids='init_step', key='owner_table')
oracle_conn = None
try:
oracle_conn = oraconn.connect('MRDS_LOADER')
if owner_table:
table_owner, table_name = owner_table.split('.', 1)
tables = [(table_owner, table_name)]
logging.info("Processing specific table: %s", owner_table)
else:
cursor = oracle_conn.cursor()
cursor.execute("SELECT OWNER, TABLE_NAME FROM CT_MRDS.a_devo_replica_mgmt_rar ORDER BY OWNER, TABLE_NAME")
tables = cursor.fetchall()
cursor.close()
logging.info("Found %d tables for RAR", len(tables))
ti.xcom_push(key='tables_to_process', value=tables)
return tables
except Exception as e:
logging.error(f"Error in get_table_list: {e}")
raise
finally:
if oracle_conn:
oracle_conn.close()
t1 = PythonOperator(
task_id='get_table_list',
python_callable=get_table_list,
)
# Check and trigger core DAG
def check_and_trigger(**context):
ti = context["ti"]
env = ti.xcom_pull(task_ids='init_step', key='env')
store = ti.xcom_pull(task_ids='init_step', key='store')
threshold = ti.xcom_pull(task_ids='init_step', key='threshold')
tables = ti.xcom_pull(task_ids='get_table_list', key='tables_to_process')
oracle_conn = None
triggered_count = 0
try:
oracle_conn = oraconn.connect('MRDS_LOADER')
for table_owner, table_name in tables:
logging.info("Processing table: %s.%s", table_owner, table_name)
while True:
cursor = oracle_conn.cursor()
# Execute SQL query with variable substitution
service_name = store.upper()
sql_query = f"""
SELECT (SELECT CASE WHEN SUM(MAX_THREADS) IS NULL THEN 0 ELSE SUM(MAX_THREADS) END AS RUNNING_THREADS
FROM CT_MRDS.A_DEVO_REPLICA_MGMT_MOPDB
WHERE LAST_STATUS = 'RUNNING') +
(SELECT CASE WHEN SUM(MAX_THREADS) IS NULL THEN 0 ELSE SUM(MAX_THREADS) END AS RUNNING_THREADS
FROM CT_MRDS.A_DEVO_REPLICA_MGMT_RAR
WHERE LAST_STATUS = 'RUNNING')
AS TOTAL_RUNNING_THREADS_NOW,
(SELECT COUNT(*) FROM CT_MRDS.A_DEVO_REPLICA_MGMT_{service_name}
WHERE OWNER = '{table_owner}' AND TABLE_NAME = '{table_name}' AND LAST_STATUS = 'RUNNING') AS TABLE_IS_ALREADY_RUNNING
FROM DUAL
"""
cursor.execute(sql_query)
result = cursor.fetchone()
total_running_val = result[0] or 0
table_running_val = result[1] or 0
cursor.close()
logging.info("Total running: %d, threshold: %d, table running: %d",
total_running_val, threshold, table_running_val)
if total_running_val > threshold:
logging.info("Threshold exceeded. Waiting 5 minutes...")
time.sleep(300)
continue
if table_running_val >= 1:
logging.info("Table %s.%s is already running. Skipping.", table_owner, table_name)
break
# Trigger core DAG
from airflow.api.common.trigger_dag import trigger_dag
conf = {
"store": store,
"owner_table": f"{table_owner}.{table_name}"
}
trigger_dag(
dag_id='devo_replicator_core',
conf=conf,
execution_date=None,
replace_microseconds=False
)
triggered_count += 1
logging.info("Triggered core DAG for table %s.%s", table_owner, table_name)
break
logging.info("Total DAGs triggered: %d", triggered_count)
ti.xcom_push(key='triggered_count', value=triggered_count)
except Exception as e:
logging.error(f"Error in check_and_trigger: {e}")
raise
finally:
if oracle_conn:
oracle_conn.close()
t2 = PythonOperator(
task_id='check_and_trigger',
python_callable=check_and_trigger,
)
# Dependencies
init >> t1 >> t2
"""
RAR Trigger DAG
1) init_step
- Gets environment from MRDS_ENV environment variable
- Reads owner_table parameter from DAG configuration
- Validates owner_table format (must be OWNER.TABLE_NAME)
- Sets store to "rar" (fixed for this DAG)
- Sets threshold to 30 (max concurrent running threads)
- Pushes parameters to XCom
2) get_table_list
- Connects to Oracle database (MRDS_LOADER)
- If specific owner_table provided: creates single table list
- If no owner_table: queries all tables from CT_MRDS.a_devo_replica_mgmt_rar
- Returns list of (owner, table_name) tuples to process
- Pushes table list to XCom
3) check_and_trigger
- Loops through each table from the table list
- For each table, enters monitoring loop:
- Executes SQL query to check total running threads across MOPDB+RAR
- Checks if current table is already running
- If total threads > threshold (30): waits 5 minutes and rechecks
  - If table already running: skips to next table
- If conditions met: triggers core DAG with table parameters
- Counts and logs total number of DAGs triggered
- Ensures system doesn't exceed concurrent processing limits
"""

View File

@@ -0,0 +1,257 @@
from __future__ import annotations
import os
import sys
import logging
import time
from datetime import timedelta
from airflow import DAG
from airflow.utils.dates import days_ago
from airflow.operators.python import PythonOperator
from airflow.models import Param
sys.path.append('/opt/airflow/python/connectors/devo')
sys.path.append('/opt/airflow/python/mrds_common')
from mrds.utils import oraconn
# Get RQSD table options for dropdown
def get_rqsd_table_options():
    """Return 'OWNER.TABLE_NAME' strings for the UI dropdown.

    Queries CT_MRDS.a_devo_replica_mgmt_rqsd; on any failure the error is
    logged and an empty list is returned so DAG parsing never breaks.
    """
    connection = None
    try:
        connection = oraconn.connect('MRDS_LOADER')
        cursor = connection.cursor()
        cursor.execute(
            "SELECT OWNER || '.' || TABLE_NAME "
            "FROM CT_MRDS.a_devo_replica_mgmt_rqsd "
            "ORDER BY OWNER, TABLE_NAME"
        )
        rows = cursor.fetchall()
        cursor.close()
        return [owner_table for (owner_table,) in rows]
    except Exception as e:
        logging.error(f"Error getting RQSD table options: {e}")
        return []
    finally:
        if connection:
            connection.close()
# Default task arguments shared by every task in this DAG.
default_args = {
    'owner': 'devo',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,                          # retry each failed task once
    'retry_delay': timedelta(minutes=1),   # ...after a one-minute pause
}
# Externally-triggered orchestrator for RQSD tables.
with DAG(
    dag_id='devo_replicator_trigger_rqsd',
    default_args=default_args,
    description='External trigger DAG for RQSD tables',
    schedule=None,  # triggered externally or manually only
    catchup=False,
    tags=['DevoReplicator', 'DevoReplicatorTrigger'],
    params={
        # NOTE(review): default=None combined with type="string" (not
        # nullable) may be rejected by Airflow's param schema validation --
        # confirm, or use type=["string", "null"] as devo_replicator_trigger does.
        "owner_table": Param(
            default=None,
            type="string",
            description="Select table in format OWNER.TABLE_NAME",
            enum=get_rqsd_table_options()
        )
    }
) as dag:
# Init
def init_step(**context):
dag_run = context.get("dag_run")
ti = context["ti"]
conf = (dag_run.conf or {}) if dag_run else {}
env = os.getenv("MRDS_ENV")
if not env:
raise ValueError("MRDS_ENV environment variable is required")
env = env.lower()
store = "rqsd"
owner_table = conf.get("owner_table")
if not owner_table:
raise ValueError("owner_table parameter is required")
if '.' not in owner_table:
raise ValueError("owner_table must be in format 'OWNER.TABLE_NAME'")
table_owner, table_name = owner_table.split('.', 1)
if env not in {"dev", "tst", "acc", "prd"}:
raise ValueError(f"Unsupported env '{env}'. Expected 'dev', 'tst', 'acc' or 'prd'.")
logging.info("=== init_step === env=%s store=%s owner_table=%s",
env, store, owner_table)
xcom = {
"env": env,
"store": store,
"table_owner": table_owner,
"table_name": table_name,
"owner_table": owner_table,
"threshold": 30,
}
for k, v in xcom.items():
ti.xcom_push(key=k, value=v)
init = PythonOperator(
task_id='init_step',
python_callable=init_step,
)
# Get table list
def get_table_list(**context):
ti = context["ti"]
store = ti.xcom_pull(task_ids='init_step', key='store')
owner_table = ti.xcom_pull(task_ids='init_step', key='owner_table')
oracle_conn = None
try:
oracle_conn = oraconn.connect('MRDS_LOADER')
if owner_table:
table_owner, table_name = owner_table.split('.', 1)
tables = [(table_owner, table_name)]
logging.info("Processing specific table: %s", owner_table)
else:
cursor = oracle_conn.cursor()
cursor.execute("SELECT OWNER, TABLE_NAME FROM CT_MRDS.a_devo_replica_mgmt_rqsd ORDER BY OWNER, TABLE_NAME")
tables = cursor.fetchall()
cursor.close()
logging.info("Found %d tables for RQSD", len(tables))
ti.xcom_push(key='tables_to_process', value=tables)
return tables
except Exception as e:
logging.error(f"Error in get_table_list: {e}")
raise
finally:
if oracle_conn:
oracle_conn.close()
t1 = PythonOperator(
task_id='get_table_list',
python_callable=get_table_list,
)
# Check and trigger core DAG
def check_and_trigger(**context):
ti = context["ti"]
env = ti.xcom_pull(task_ids='init_step', key='env')
store = ti.xcom_pull(task_ids='init_step', key='store')
threshold = ti.xcom_pull(task_ids='init_step', key='threshold')
tables = ti.xcom_pull(task_ids='get_table_list', key='tables_to_process')
oracle_conn = None
triggered_count = 0
try:
oracle_conn = oraconn.connect('MRDS_LOADER')
for table_owner, table_name in tables:
logging.info("Processing table: %s.%s", table_owner, table_name)
while True:
cursor = oracle_conn.cursor()
# Execute SQL query with variable substitution
service_name = store.upper()
sql_query = f"""
SELECT (SELECT CASE WHEN SUM(MAX_THREADS) IS NULL THEN 0 ELSE SUM(MAX_THREADS) END AS RUNNING_THREADS
FROM CT_MRDS.A_DEVO_REPLICA_MGMT_MOPDB
WHERE LAST_STATUS = 'RUNNING') +
(SELECT CASE WHEN SUM(MAX_THREADS) IS NULL THEN 0 ELSE SUM(MAX_THREADS) END AS RUNNING_THREADS
FROM CT_MRDS.A_DEVO_REPLICA_MGMT_RAR
WHERE LAST_STATUS = 'RUNNING')
AS TOTAL_RUNNING_THREADS_NOW,
(SELECT COUNT(*) FROM CT_MRDS.A_DEVO_REPLICA_MGMT_{service_name}
WHERE OWNER = '{table_owner}' AND TABLE_NAME = '{table_name}' AND LAST_STATUS = 'RUNNING') AS TABLE_IS_ALREADY_RUNNING
FROM DUAL
"""
cursor.execute(sql_query)
result = cursor.fetchone()
total_running_val = result[0] or 0
table_running_val = result[1] or 0
cursor.close()
logging.info("Total running: %d, threshold: %d, table running: %d",
total_running_val, threshold, table_running_val)
if total_running_val > threshold:
logging.info("Threshold exceeded. Waiting 5 minutes...")
time.sleep(300)
continue
if table_running_val >= 1:
logging.info("Table %s.%s is already running. Skipping.", table_owner, table_name)
break
# Trigger core DAG
from airflow.api.common.trigger_dag import trigger_dag
conf = {
"store": store,
"owner_table": f"{table_owner}.{table_name}"
}
trigger_dag(
dag_id='devo_replicator_core',
conf=conf,
execution_date=None,
replace_microseconds=False
)
triggered_count += 1
logging.info("Triggered core DAG for table %s.%s", table_owner, table_name)
break
logging.info("Total DAGs triggered: %d", triggered_count)
ti.xcom_push(key='triggered_count', value=triggered_count)
except Exception as e:
logging.error(f"Error in check_and_trigger: {e}")
raise
finally:
if oracle_conn:
oracle_conn.close()
t2 = PythonOperator(
task_id='check_and_trigger',
python_callable=check_and_trigger,
)
# Dependencies
init >> t1 >> t2
"""
RQSD Trigger DAG
1) init_step
- Gets environment from MRDS_ENV environment variable
- Reads owner_table parameter from DAG configuration
- Validates owner_table format (must be OWNER.TABLE_NAME)
- Sets store to "rqsd" (fixed for this DAG)
- Sets threshold to 30 (max concurrent running threads)
- Pushes parameters to XCom
2) get_table_list
- Connects to Oracle database (MRDS_LOADER)
- If specific owner_table provided: creates single table list
- If no owner_table: queries all tables from CT_MRDS.a_devo_replica_mgmt_rqsd
- Returns list of (owner, table_name) tuples to process
- Pushes table list to XCom
3) check_and_trigger
- Loops through each table from the table list
- For each table, enters monitoring loop:
- Executes SQL query to check total running threads across MOPDB+RAR
- Checks if current table is already running
- If total threads > threshold (30): waits 5 minutes and rechecks
  - If table already running: skips to next table
- If conditions met: triggers core DAG with table parameters
- Counts and logs total number of DAGs triggered
- Ensures system doesn't exceed concurrent processing limits
"""

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,255 @@
from __future__ import annotations
import os
import sys
import logging
from datetime import timedelta
from airflow import DAG
from airflow.utils.dates import days_ago
from airflow.operators.python import PythonOperator
from airflow.models import Param
sys.path.append('/opt/airflow/python/connectors/devo')
sys.path.append('/opt/airflow/python/mrds_common')
from mrds.utils import oraconn
# Get MOPDB table options for dropdown
def get_mopdb_table_options():
    """Return the dropdown options for the MOPDB generator trigger.

    The '__ALL_EXCLUDE_COPY__' sentinel always comes first; table names are
    read from CT_MRDS.a_devo_replica_mgmt_mopdb. On any failure only the
    sentinel is returned so DAG parsing never breaks.
    """
    connection = None
    try:
        connection = oraconn.connect('MRDS_LOADER')
        cursor = connection.cursor()
        cursor.execute(
            "SELECT OWNER || '.' || TABLE_NAME "
            "FROM CT_MRDS.a_devo_replica_mgmt_mopdb "
            "ORDER BY OWNER, TABLE_NAME"
        )
        rows = cursor.fetchall()
        cursor.close()
        return ["__ALL_EXCLUDE_COPY__"] + [owner_table for (owner_table,) in rows]
    except Exception as e:
        logging.error(f"Error getting MOPDB table options: {e}")
        return ["__ALL_EXCLUDE_COPY__"]
    finally:
        if connection:
            connection.close()
# Default task arguments shared by every task in this DAG.
# NOTE(review): unlike the replicator trigger DAGs this one configures no
# retries -- confirm that a single attempt per task is intentional.
default_args = {
    'owner': 'devo',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
}
# Externally-triggered generator orchestrator for MOPDB tables; can fan out
# over all registered tables (excluding *_COPY copies) or a single table.
with DAG(
    dag_id='devo_table_generator_trigger_mopdb',
    default_args=default_args,
    description='External trigger DAG for MOPDB tables',
    schedule=None,  # triggered externally or manually only
    catchup=False,
    tags=['DevoTableGenerator', 'DevoTableGeneratorTrigger'],
    params={
        # Default sentinel selects the "all tables, no _COPY" mode.
        "owner_table": Param(
            default="__ALL_EXCLUDE_COPY__",
            type="string",
            description="Select '__ALL_EXCLUDE_COPY__' to run all tables without _COPY, or select specific table in format OWNER.TABLE_NAME",
            enum=get_mopdb_table_options()
        )
    }
) as dag:
# Init
def init_step(**context):
dag_run = context.get("dag_run")
ti = context["ti"]
conf = (dag_run.conf or {}) if dag_run else {}
env = os.getenv("MRDS_ENV")
if not env:
raise ValueError("MRDS_ENV environment variable is required")
env = env.lower()
store = "mopdb"
owner_table = conf.get("owner_table")
if not owner_table:
raise ValueError("owner_table parameter is required")
# Handle special "run all" case
run_all_exclude_copy = (owner_table == "__ALL_EXCLUDE_COPY__")
if not run_all_exclude_copy and '.' not in owner_table:
raise ValueError("owner_table must be in format 'OWNER.TABLE_NAME' or '__ALL_EXCLUDE_COPY__'")
if env not in {"dev", "tst", "acc", "prd"}:
raise ValueError(f"Unsupported env '{env}'. Expected 'dev', 'tst', 'acc' or 'prd'.")
logging.info("=== init_step === env=%s store=%s owner_table=%s run_all_exclude_copy=%s",
env, store, owner_table, run_all_exclude_copy)
xcom = {
"env": env,
"store": store,
"owner_table": owner_table,
"run_all_exclude_copy": run_all_exclude_copy,
}
for k, v in xcom.items():
ti.xcom_push(key=k, value=v)
init = PythonOperator(
task_id='init_step',
python_callable=init_step,
)
# Get table list
def get_table_list(**context):
ti = context["ti"]
store = ti.xcom_pull(task_ids='init_step', key='store')
owner_table = ti.xcom_pull(task_ids='init_step', key='owner_table')
run_all_exclude_copy = ti.xcom_pull(task_ids='init_step', key='run_all_exclude_copy')
oracle_conn = None
try:
oracle_conn = oraconn.connect('MRDS_LOADER')
if run_all_exclude_copy:
# Get all tables excluding those with _COPY in the name
cursor = oracle_conn.cursor()
cursor.execute("""
SELECT OWNER, TABLE_NAME
FROM CT_MRDS.a_devo_replica_mgmt_mopdb
WHERE TABLE_NAME NOT LIKE '%_COPY%'
ORDER BY OWNER, TABLE_NAME
""")
tables = cursor.fetchall()
cursor.close()
logging.info("Processing ALL tables excluding _COPY: %d tables found", len(tables))
elif owner_table:
# Process specific table
table_owner, table_name = owner_table.split('.', 1)
tables = [(table_owner, table_name)]
logging.info("Processing specific table: %s", owner_table)
else:
# Fallback: get all tables
cursor = oracle_conn.cursor()
cursor.execute("SELECT OWNER, TABLE_NAME FROM CT_MRDS.a_devo_replica_mgmt_mopdb ORDER BY OWNER, TABLE_NAME")
tables = cursor.fetchall()
cursor.close()
logging.info("Found %d tables for MOPDB", len(tables))
ti.xcom_push(key='tables_to_process', value=tables)
return tables
except Exception as e:
logging.error(f"Error in get_table_list: {e}")
raise
finally:
if oracle_conn:
oracle_conn.close()
t1 = PythonOperator(
task_id='get_table_list',
python_callable=get_table_list,
)
# Trigger core DAG for each table
def trigger_tables(**context):
ti = context["ti"]
env = ti.xcom_pull(task_ids='init_step', key='env')
store = ti.xcom_pull(task_ids='init_step', key='store')
tables = ti.xcom_pull(task_ids='get_table_list', key='tables_to_process')
oracle_conn = None
triggered_count = 0
skipped_count = 0
try:
oracle_conn = oraconn.connect('MRDS_LOADER')
for table_owner, table_name in tables:
logging.info("Processing table: %s.%s", table_owner, table_name)
cursor = oracle_conn.cursor()
# Check if table is already running
service_name = store.upper()
sql_query = f"""
SELECT COUNT(*)
FROM CT_MRDS.A_DEVO_REPLICA_MGMT_{service_name}
WHERE OWNER = '{table_owner}'
AND TABLE_NAME = '{table_name}'
AND LAST_STATUS = 'RUNNING'
"""
cursor.execute(sql_query)
result = cursor.fetchone()
table_running_val = result[0] or 0
cursor.close()
if table_running_val >= 1:
logging.info("Table %s.%s is already running. Skipping.", table_owner, table_name)
skipped_count += 1
continue
# Trigger core DAG
from airflow.api.common.trigger_dag import trigger_dag
conf = {
"store": store,
"owner_table": f"{table_owner}.{table_name}"
}
trigger_dag(
dag_id='devo_table_generator_core',
conf=conf,
execution_date=None,
replace_microseconds=False
)
triggered_count += 1
logging.info("Triggered core DAG for table %s.%s", table_owner, table_name)
logging.info("Summary: Total DAGs triggered: %d, Skipped (already running): %d",
triggered_count, skipped_count)
ti.xcom_push(key='triggered_count', value=triggered_count)
ti.xcom_push(key='skipped_count', value=skipped_count)
except Exception as e:
logging.error(f"Error in trigger_tables: {e}")
raise
finally:
if oracle_conn:
oracle_conn.close()
t2 = PythonOperator(
task_id='trigger_tables',
python_callable=trigger_tables,
)
# Dependencies
init >> t1 >> t2
"""
MOPDB Trigger DAG
1) init_step
- Gets environment from MRDS_ENV environment variable
- Reads owner_table parameter from DAG configuration
- Validates owner_table format (must be OWNER.TABLE_NAME or __ALL_EXCLUDE_COPY__)
- Sets store to "mopdb" (fixed for this DAG)
- Determines if running all tables excluding _COPY
- Pushes parameters to XCom
2) get_table_list
- Connects to Oracle database (MRDS_LOADER)
- If __ALL_EXCLUDE_COPY__: queries all tables from CT_MRDS.a_devo_replica_mgmt_mopdb excluding _COPY tables
- If specific owner_table provided: creates single table list
- If no owner_table: queries all tables from CT_MRDS.a_devo_replica_mgmt_mopdb
- Returns list of (owner, table_name) tuples to process
- Pushes table list to XCom
3) trigger_tables
- Loops through each table from the table list
- For each table:
- Checks if table is already running
- If table already running: skips to next table
- If not running: triggers core DAG with table parameters
- Counts and logs total number of DAGs triggered and skipped
- No threshold checking or waiting logic
"""

View File

@@ -0,0 +1,257 @@
from __future__ import annotations
import os
import sys
import logging
from datetime import timedelta
from airflow import DAG
from airflow.utils.dates import days_ago
from airflow.operators.python import PythonOperator
from airflow.models import Param
sys.path.append('/opt/airflow/python/connectors/devo')
sys.path.append('/opt/airflow/python/mrds_common')
from mrds.utils import oraconn
# Get RAR table options for dropdown
def get_rar_table_options():
    """Build the dropdown options for the RAR ``owner_table`` Param.

    Returns the '__ALL_EXCLUDE_COPY__' sentinel followed by every
    OWNER.TABLE_NAME row from CT_MRDS.a_devo_replica_mgmt_rar, or the
    sentinel alone if the lookup fails. Runs at DAG parse time.
    """
    connection = None
    try:
        connection = oraconn.connect('MRDS_LOADER')
        cur = connection.cursor()
        cur.execute("SELECT OWNER || '.' || TABLE_NAME FROM CT_MRDS.a_devo_replica_mgmt_rar ORDER BY OWNER, TABLE_NAME")
        names = [row[0] for row in cur.fetchall()]
        cur.close()
        return ["__ALL_EXCLUDE_COPY__"] + names
    except Exception as exc:
        # Best effort: a broken DB connection must not break DAG parsing.
        logging.error(f"Error getting RAR table options: {exc}")
        return ["__ALL_EXCLUDE_COPY__"]
    finally:
        if connection:
            connection.close()
# Default task-level arguments shared by every task in this DAG.
default_args = {
    'owner': 'devo',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,  # one automatic retry per failed task
    'retry_delay': timedelta(minutes=1),
}
with DAG(
    dag_id='devo_table_generator_trigger_rar',
    default_args=default_args,
    description='External trigger DAG for RAR tables',
    schedule=None,  # manual / externally triggered only
    catchup=False,
    tags=['DevoTableGenerator', 'DevoTableGeneratorTrigger'],
    params={
        # Dropdown is populated from Oracle at DAG parse time (see
        # get_rar_table_options); the sentinel default means "run every
        # table whose name does not contain _COPY".
        "owner_table": Param(
            default="__ALL_EXCLUDE_COPY__",
            type="string",
            description="Select '__ALL_EXCLUDE_COPY__' to run all tables without _COPY, or select specific table in format OWNER.TABLE_NAME",
            enum=get_rar_table_options()
        )
    }
) as dag:
# Init
def init_step(**context):
dag_run = context.get("dag_run")
ti = context["ti"]
conf = (dag_run.conf or {}) if dag_run else {}
env = os.getenv("MRDS_ENV")
if not env:
raise ValueError("MRDS_ENV environment variable is required")
env = env.lower()
store = "rar"
owner_table = conf.get("owner_table")
if not owner_table:
raise ValueError("owner_table parameter is required")
# Handle special "run all" case
run_all_exclude_copy = (owner_table == "__ALL_EXCLUDE_COPY__")
if not run_all_exclude_copy and '.' not in owner_table:
raise ValueError("owner_table must be in format 'OWNER.TABLE_NAME' or '__ALL_EXCLUDE_COPY__'")
if env not in {"dev", "tst", "acc", "prd"}:
raise ValueError(f"Unsupported env '{env}'. Expected 'dev', 'tst', 'acc' or 'prd'.")
logging.info("=== init_step === env=%s store=%s owner_table=%s run_all_exclude_copy=%s",
env, store, owner_table, run_all_exclude_copy)
xcom = {
"env": env,
"store": store,
"owner_table": owner_table,
"run_all_exclude_copy": run_all_exclude_copy,
}
for k, v in xcom.items():
ti.xcom_push(key=k, value=v)
init = PythonOperator(
task_id='init_step',
python_callable=init_step,
)
# Get table list
def get_table_list(**context):
ti = context["ti"]
store = ti.xcom_pull(task_ids='init_step', key='store')
owner_table = ti.xcom_pull(task_ids='init_step', key='owner_table')
run_all_exclude_copy = ti.xcom_pull(task_ids='init_step', key='run_all_exclude_copy')
oracle_conn = None
try:
oracle_conn = oraconn.connect('MRDS_LOADER')
if run_all_exclude_copy:
# Get all tables excluding those with _COPY in the name
cursor = oracle_conn.cursor()
cursor.execute("""
SELECT OWNER, TABLE_NAME
FROM CT_MRDS.a_devo_replica_mgmt_rar
WHERE TABLE_NAME NOT LIKE '%_COPY%'
ORDER BY OWNER, TABLE_NAME
""")
tables = cursor.fetchall()
cursor.close()
logging.info("Processing ALL tables excluding _COPY: %d tables found", len(tables))
elif owner_table:
# Process specific table
table_owner, table_name = owner_table.split('.', 1)
tables = [(table_owner, table_name)]
logging.info("Processing specific table: %s", owner_table)
else:
# Fallback: get all tables
cursor = oracle_conn.cursor()
cursor.execute("SELECT OWNER, TABLE_NAME FROM CT_MRDS.a_devo_replica_mgmt_rar ORDER BY OWNER, TABLE_NAME")
tables = cursor.fetchall()
cursor.close()
logging.info("Found %d tables for RAR", len(tables))
ti.xcom_push(key='tables_to_process', value=tables)
return tables
except Exception as e:
logging.error(f"Error in get_table_list: {e}")
raise
finally:
if oracle_conn:
oracle_conn.close()
t1 = PythonOperator(
task_id='get_table_list',
python_callable=get_table_list,
)
# Trigger core DAG for each table
def trigger_tables(**context):
ti = context["ti"]
env = ti.xcom_pull(task_ids='init_step', key='env')
store = ti.xcom_pull(task_ids='init_step', key='store')
tables = ti.xcom_pull(task_ids='get_table_list', key='tables_to_process')
oracle_conn = None
triggered_count = 0
skipped_count = 0
try:
oracle_conn = oraconn.connect('MRDS_LOADER')
for table_owner, table_name in tables:
logging.info("Processing table: %s.%s", table_owner, table_name)
cursor = oracle_conn.cursor()
# Check if table is already running
service_name = store.upper()
sql_query = f"""
SELECT COUNT(*)
FROM CT_MRDS.A_DEVO_REPLICA_MGMT_{service_name}
WHERE OWNER = '{table_owner}'
AND TABLE_NAME = '{table_name}'
AND LAST_STATUS = 'RUNNING'
"""
cursor.execute(sql_query)
result = cursor.fetchone()
table_running_val = result[0] or 0
cursor.close()
if table_running_val >= 1:
logging.info("Table %s.%s is already running. Skipping.", table_owner, table_name)
skipped_count += 1
continue
# Trigger core DAG
from airflow.api.common.trigger_dag import trigger_dag
conf = {
"store": store,
"owner_table": f"{table_owner}.{table_name}"
}
trigger_dag(
dag_id='devo_table_generator_core',
conf=conf,
execution_date=None,
replace_microseconds=False
)
triggered_count += 1
logging.info("Triggered core DAG for table %s.%s", table_owner, table_name)
logging.info("Summary: Total DAGs triggered: %d, Skipped (already running): %d",
triggered_count, skipped_count)
ti.xcom_push(key='triggered_count', value=triggered_count)
ti.xcom_push(key='skipped_count', value=skipped_count)
except Exception as e:
logging.error(f"Error in trigger_tables: {e}")
raise
finally:
if oracle_conn:
oracle_conn.close()
t2 = PythonOperator(
task_id='trigger_tables',
python_callable=trigger_tables,
)
# Dependencies
init >> t1 >> t2
"""
RAR Trigger DAG
1) init_step
- Gets environment from MRDS_ENV environment variable
- Reads owner_table parameter from DAG configuration
- Validates owner_table format (must be OWNER.TABLE_NAME or __ALL_EXCLUDE_COPY__)
- Sets store to "rar" (fixed for this DAG)
- Determines if running all tables excluding _COPY
- Pushes parameters to XCom
2) get_table_list
- Connects to Oracle database (MRDS_LOADER)
- If __ALL_EXCLUDE_COPY__: queries all tables from CT_MRDS.a_devo_replica_mgmt_rar excluding _COPY tables
- If specific owner_table provided: creates single table list
- If no owner_table: queries all tables from CT_MRDS.a_devo_replica_mgmt_rar
- Returns list of (owner, table_name) tuples to process
- Pushes table list to XCom
3) trigger_tables
- Loops through each table from the table list
- For each table:
- Checks if table is already running
- If table already running: skips to next table
- If not running: triggers core DAG with table parameters
- Counts and logs total number of DAGs triggered and skipped
- No threshold checking or waiting logic
"""

View File

@@ -0,0 +1,239 @@
from __future__ import annotations
import os
import sys
import logging
from datetime import timedelta
from airflow import DAG
from airflow.utils.dates import days_ago
from airflow.operators.python import PythonOperator
from airflow.models import Param
sys.path.append('/opt/airflow/python/connectors/devo')
sys.path.append('/opt/airflow/python/mrds_common')
from mrds.utils import oraconn
# Get RQSD table options for dropdown
def get_rqsd_table_options():
    """Build the dropdown options for the RQSD ``owner_table`` Param.

    Returns every OWNER.TABLE_NAME row from
    CT_MRDS.a_devo_replica_mgmt_rqsd, or an empty list if the lookup
    fails. Runs at DAG parse time.
    """
    connection = None
    try:
        connection = oraconn.connect('MRDS_LOADER')
        cur = connection.cursor()
        cur.execute("SELECT OWNER || '.' || TABLE_NAME FROM CT_MRDS.a_devo_replica_mgmt_rqsd ORDER BY OWNER, TABLE_NAME")
        names = [row[0] for row in cur.fetchall()]
        cur.close()
        return names
    except Exception as exc:
        # Best effort: a broken DB connection must not break DAG parsing.
        logging.error(f"Error getting RQSD table options: {exc}")
        return []
    finally:
        if connection:
            connection.close()
# Default task-level arguments shared by every task in this DAG.
default_args = {
    'owner': 'devo',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,  # one automatic retry per failed task
    'retry_delay': timedelta(minutes=1),
}
with DAG(
    dag_id='devo_table_generator_trigger_rqsd',
    default_args=default_args,
    description='External trigger DAG for RQSD tables',
    schedule=None,  # manual / externally triggered only
    catchup=False,
    tags=['DevoTableGenerator', 'DevoTableGeneratorTrigger'],
    params={
        # NOTE(review): Param(default=None, type="string") can fail
        # JSON-schema validation on newer Airflow versions (None is not a
        # string, and None is not in the enum), which breaks DAG import —
        # consider type=["null", "string"]. TODO confirm against the
        # deployed Airflow version.
        "owner_table": Param(
            default=None,
            type="string",
            description="Select table in format OWNER.TABLE_NAME",
            enum=get_rqsd_table_options()
        )
    }
) as dag:
# Init
def init_step(**context):
dag_run = context.get("dag_run")
ti = context["ti"]
conf = (dag_run.conf or {}) if dag_run else {}
env = os.getenv("MRDS_ENV")
if not env:
raise ValueError("MRDS_ENV environment variable is required")
env = env.lower()
store = "rqsd"
owner_table = conf.get("owner_table")
if not owner_table:
raise ValueError("owner_table parameter is required")
if '.' not in owner_table:
raise ValueError("owner_table must be in format 'OWNER.TABLE_NAME'")
table_owner, table_name = owner_table.split('.', 1)
if env not in {"dev", "tst", "acc", "prd"}:
raise ValueError(f"Unsupported env '{env}'. Expected 'dev', 'tst', 'acc' or 'prd'.")
logging.info("=== init_step === env=%s store=%s owner_table=%s",
env, store, owner_table)
xcom = {
"env": env,
"store": store,
"table_owner": table_owner,
"table_name": table_name,
"owner_table": owner_table,
}
for k, v in xcom.items():
ti.xcom_push(key=k, value=v)
init = PythonOperator(
task_id='init_step',
python_callable=init_step,
)
# Get table list
def get_table_list(**context):
ti = context["ti"]
store = ti.xcom_pull(task_ids='init_step', key='store')
owner_table = ti.xcom_pull(task_ids='init_step', key='owner_table')
oracle_conn = None
try:
oracle_conn = oraconn.connect('MRDS_LOADER')
if owner_table:
table_owner, table_name = owner_table.split('.', 1)
tables = [(table_owner, table_name)]
logging.info("Processing specific table: %s", owner_table)
else:
cursor = oracle_conn.cursor()
cursor.execute("SELECT OWNER, TABLE_NAME FROM CT_MRDS.a_devo_replica_mgmt_rqsd ORDER BY OWNER, TABLE_NAME")
tables = cursor.fetchall()
cursor.close()
logging.info("Found %d tables for RQSD", len(tables))
ti.xcom_push(key='tables_to_process', value=tables)
return tables
except Exception as e:
logging.error(f"Error in get_table_list: {e}")
raise
finally:
if oracle_conn:
oracle_conn.close()
t1 = PythonOperator(
task_id='get_table_list',
python_callable=get_table_list,
)
# Trigger core DAG for each table
def trigger_tables(**context):
ti = context["ti"]
env = ti.xcom_pull(task_ids='init_step', key='env')
store = ti.xcom_pull(task_ids='init_step', key='store')
tables = ti.xcom_pull(task_ids='get_table_list', key='tables_to_process')
oracle_conn = None
triggered_count = 0
skipped_count = 0
try:
oracle_conn = oraconn.connect('MRDS_LOADER')
for table_owner, table_name in tables:
logging.info("Processing table: %s.%s", table_owner, table_name)
cursor = oracle_conn.cursor()
# Check if table is already running
service_name = store.upper()
sql_query = f"""
SELECT COUNT(*)
FROM CT_MRDS.A_DEVO_REPLICA_MGMT_{service_name}
WHERE OWNER = '{table_owner}'
AND TABLE_NAME = '{table_name}'
AND LAST_STATUS = 'RUNNING'
"""
cursor.execute(sql_query)
result = cursor.fetchone()
table_running_val = result[0] or 0
cursor.close()
if table_running_val >= 1:
logging.info("Table %s.%s is already running. Skipping.", table_owner, table_name)
skipped_count += 1
continue
# Trigger core DAG
from airflow.api.common.trigger_dag import trigger_dag
conf = {
"store": store,
"owner_table": f"{table_owner}.{table_name}"
}
trigger_dag(
dag_id='devo_table_generator_core',
conf=conf,
execution_date=None,
replace_microseconds=False
)
triggered_count += 1
logging.info("Triggered core DAG for table %s.%s", table_owner, table_name)
logging.info("Summary: Total DAGs triggered: %d, Skipped (already running): %d",
triggered_count, skipped_count)
ti.xcom_push(key='triggered_count', value=triggered_count)
ti.xcom_push(key='skipped_count', value=skipped_count)
except Exception as e:
logging.error(f"Error in trigger_tables: {e}")
raise
finally:
if oracle_conn:
oracle_conn.close()
t2 = PythonOperator(
task_id='trigger_tables',
python_callable=trigger_tables,
)
# Dependencies
init >> t1 >> t2
"""
RQSD Trigger DAG
1) init_step
- Gets environment from MRDS_ENV environment variable
- Reads owner_table parameter from DAG configuration
- Validates owner_table format (must be OWNER.TABLE_NAME)
- Sets store to "rqsd" (fixed for this DAG)
- Pushes parameters to XCom
2) get_table_list
- Connects to Oracle database (MRDS_LOADER)
- If specific owner_table provided: creates single table list
- If no owner_table: queries all tables from CT_MRDS.a_devo_replica_mgmt_rqsd
- Returns list of (owner, table_name) tuples to process
- Pushes table list to XCom
3) trigger_tables
- Loops through each table from the table list
- For each table:
- Checks if table is already running
- If table already running: skips to next table
- If not running: triggers core DAG with table parameters
- Counts and logs total number of DAGs triggered and skipped
- No threshold checking or waiting logic
"""