This commit is contained in:
Grzegorz Michalski
2026-03-02 09:47:35 +01:00
commit 2c225d68ac
715 changed files with 130067 additions and 0 deletions

View File

@@ -0,0 +1,75 @@
import sys
import os
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago
from datetime import datetime, timedelta
import logging
# Importing custom modules
sys.path.append('/opt/airflow/python/connectors/devo')
sys.path.append('/opt/airflow/python/mrds_common')
sys.path.append('/opt/airflow/src/airflow/dags/ods/rqsd')
sys.path.append('/opt/airflow/python/connectors/devo')
# Import the main function from your script
from devo_connector import main as devo_main
### DEVO CONNECTOR WITH STATIC workflow (task 3)
# Default DAG arguments
# Shared task-level defaults for this DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
}

with DAG(
    dag_id='devo_connector_test',
    default_args=default_args,
    description='Run devo RQSD data ingestion workflow',
    schedule_interval=None,  # manual trigger only; a schedule can be added later
    catchup=False,
    tags=['Devo', 'RQSD', 'Connector'],
) as dag:

    def run_devo_connector_rqsd(**context):
        """Validate loader-DB credentials from the environment, then run the Devo RQSD connector."""
        try:
            # Environment and credentials are injected by the deployment.
            env = os.getenv("MRDS_ENV")
            username = os.getenv("MRDS_LOADER_DB_USER")
            password = os.getenv("MRDS_LOADER_DB_PASS")
            tnsalias = os.getenv("MRDS_LOADER_DB_TNS")
            if not (username and password and tnsalias):
                raise ValueError(
                    "Missing one or more required environment variables: "
                    "MRDS_LOADER_DB_USER, MRDS_LOADER_DB_PASS, MRDS_LOADER_DB_TNS"
                )
            logging.info(
                f"Starting Casper RQSD workflow from Airflow DAG for env '{env}'"
            )
            # Hard-coded workflow context for this static test variant (task 3).
            workflow_context = {"run_id": 34, "a_workflow_history_key": 6}
            flow_config_path = "/opt/airflow/src/airflow/dags/ods/rqsd/rqsd_process/config/yaml/flow_config_rqsd_observations.yaml"
            env_config_path = "/opt/airflow/python/connectors/devo/config/env_config_rqsd.yaml"
            logging.info("Starting Devo RQSD workflow from Airflow DAG")
            devo_main(workflow_context, flow_config_path, env_config_path, env)
            logging.info("Devo RQSD workflow completed successfully")
        except Exception as e:
            logging.error(f"Error running Devo RQSD workflow: {e}", exc_info=True)
            raise

    run_devo = PythonOperator(
        task_id='run_devo_connector_rqsd',
        python_callable=run_devo_connector_rqsd,
    )

View File

@@ -0,0 +1,158 @@
import os
import sys
import logging
from airflow.decorators import dag
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago
from airflow.utils.trigger_rule import TriggerRule
from airflow import DAG
from airflow.decorators import task
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from datetime import datetime, timedelta
from airflow.operators.python import BranchPythonOperator
from airflow.operators.empty import EmptyOperator
from mrds.utils import oraconn
sys.path.append('/opt/airflow/python/connectors/devo')
sys.path.append('/opt/airflow/python/mrds_common')
DAG_NAME = "dev_replicator_scheduler_rar"
TARGET_DAG_ID = "devo_replicator_trigger_rar"
def get_devo_replica_table_options():
    """Return the 'OWNER.TABLE_NAME' entries registered for Devo RAR replication.

    Queries CT_MRDS.a_devo_replica_mgmt_rar; returns [] on any error so the
    scheduler DAG still parses/runs without a working database connection.
    """
    oracle_conn = None
    cursor = None
    try:
        oracle_conn = oraconn.connect('MRDS_LOADER')
        cursor = oracle_conn.cursor()
        cursor.execute("SELECT OWNER || '.' || TABLE_NAME FROM CT_MRDS.a_devo_replica_mgmt_rar ORDER BY OWNER, TABLE_NAME")
        return [row[0] for row in cursor.fetchall()]
    except Exception as e:
        # Fixed message: this reads the Devo replica management table, not MOPDB.
        logging.error(f"Error getting Devo replica table options: {e}")
        return []
    finally:
        # Always release the cursor and connection, even when the query fails
        # (the original closed the cursor only on the success path).
        if cursor is not None:
            cursor.close()
        if oracle_conn:
            oracle_conn.close()
def check_table_precondition(table_full_name):
    """Decide whether a table needs a Devo replication run.

    Compares the last successful Oracle-side load (CT_RAR monitoring) against
    the last finished Devo replication (CT_MRDS management table) and returns
    {"table": ..., "trigger": 'Y'|'N'|'ERROR'} — 'ERROR' on any DB failure so
    the caller can treat the table as not triggerable.
    """
    oracle_conn = None
    cursor = None
    try:
        oracle_conn = oraconn.connect('MRDS_LOADER')
        cursor = oracle_conn.cursor()
        sql = """
WITH LAST_UPDATE_ORACLE AS (
SELECT max(process_end) as process_end
FROM CT_RAR.A_RAR_FOR_DISC_MONITORING
WHERE upper(owner||'.'||TARGET_TABLE_NAME) = upper(:table_name)
AND PROCESS_END is not null AND PROCESS_SUCCESSFUL='Y'
),
LAST_UPDATE_DEVO AS (
SELECT CASE WHEN last_status = 'FINISHED' THEN LAST_END_TIME ELSE TO_DATE('01-JAN-1999', 'DD-MON-YYYY') END as process_end
FROM CT_MRDS.a_devo_replica_mgmt_rar
WHERE OWNER || '.' || TABLE_NAME = :table_name
)
SELECT CASE WHEN (SELECT process_end FROM LAST_UPDATE_ORACLE) > (SELECT process_end FROM LAST_UPDATE_DEVO)
THEN 'Y' ELSE 'N' END AS TRIGGER_DEVO_REPLICATOR FROM dual
"""
        cursor.execute(sql, table_name=table_full_name)
        result = cursor.fetchone()
        # No row means no monitoring record; do not trigger in that case.
        status = result[0] if result else 'N'
        logging.info(f"Precondition for {table_full_name}: {status}")
        return {"table": table_full_name, "trigger": status}
    except Exception as e:
        logging.error(f"Error checking precondition for {table_full_name}: {e}")
        return {"table": table_full_name, "trigger": 'ERROR'}
    finally:
        # Always release the cursor and connection, even when the query fails
        # (the original closed the cursor only on the success path).
        if cursor is not None:
            cursor.close()
        if oracle_conn:
            oracle_conn.close()
def get_tables_to_trigger(precondition_results):
    """Build the per-table conf payloads for every result whose trigger flag is 'Y'."""
    triggered_tables = [
        entry["table"]
        for entry in precondition_results
        if entry["trigger"] == "Y"
    ]
    logging.info(f"Tables meeting precondition: {triggered_tables}")
    return [{"owner_table": name} for name in triggered_tables]
def branch_on_tables(ti):
    """Choose the downstream branch: trigger replicators when any table qualifies."""
    results = ti.xcom_pull(task_ids='check_all_tables')
    any_to_trigger = any(entry["trigger"] == "Y" for entry in results)
    return "trigger_devo_replicators" if any_to_trigger else "no_table_updated"
# Task-level defaults shared across this scheduler DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=2),
}

with DAG(
    dag_id=DAG_NAME,
    default_args=default_args,
    schedule_interval=None,
    catchup=False,
    tags=['DevoScheduler', 'DevoReplicatorTrigger']
) as dag:

    @task()
    def fetch_tables():
        """List all OWNER.TABLE candidates from the replica management table."""
        return get_devo_replica_table_options()

    @task()
    def check_all_tables(table_list):
        """Evaluate the replication precondition for every candidate table."""
        results = [check_table_precondition(tbl) for tbl in table_list]
        flags = [r["trigger"] for r in results]
        logging.info(f"Precondition results: {results}")
        logging.info(f"Tables with trigger = 'Y': {flags.count('Y')}")
        logging.info(f"Tables with trigger = 'N': {flags.count('N')}")
        return results

    @task()
    def output_tables_to_trigger(precondition_results):
        """Build the mapped-operator conf list for tables that passed the check."""
        return get_tables_to_trigger(precondition_results)

    branch_task = BranchPythonOperator(
        task_id="branch_trigger_check",
        python_callable=branch_on_tables,
        provide_context=True,
    )

    no_table_updated = EmptyOperator(task_id="no_table_updated")

    # TaskFlow pipeline: candidates -> precondition check -> trigger payloads.
    tables = fetch_tables()
    precondition_results = check_all_tables(tables)
    tables_to_trigger = output_tables_to_trigger(precondition_results)

    # One mapped trigger per qualifying table.
    trigger_dag = TriggerDagRunOperator.partial(
        task_id="trigger_devo_replicators",
        trigger_dag_id=TARGET_DAG_ID,
        execution_date="{{ ds }}"
    ).expand(conf=tables_to_trigger)

    # Branch only after the trigger list is known.
    tables >> precondition_results >> tables_to_trigger >> branch_task
    branch_task >> [trigger_dag, no_table_updated]

"""
1. fetch_tables gets the list of tables.
2. check_all_tables checks each tables trigger status and logs counts.
3. output_tables_to_trigger prepares the mapped parameter list for triggering downstream DAGs.
4. branch_on_tables decides the path:
"trigger_devo_replicators" if any table triggers.
"no_table_updated" otherwise.
5. BranchPythonOperator implements the conditional branching.
6. TriggerDagRunOperator dynamically triggers a run of devo_replicator_trigger_rar per qualifying table.
7. EmptyOperator represents the "no tables to trigger" branch.
"""

View File

@@ -0,0 +1,112 @@
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.oracle.hooks.oracle import OracleHook
from airflow.utils.dates import days_ago
from datetime import timedelta
import logging
# Static replication parameters for this test DAG.
p_run_id = 1234
p_service_name = 'MyService'
p_table_owner = 'MY_SCHEMA'
p_table_name = 'MY_TABLE'
p_objectstore_uri = 's3://bucket/uri' # subject to change appropriate for RAR/MOPDB


def _run_replicator_proc(proc_name, proc_call, params):
    """Execute one DATA_REPLICATOR PL/SQL block and commit on success.

    Shared by the start/export/end task callables so connection handling stays
    consistent. The cursor and connection are always closed — the original
    per-task copies leaked both whenever the call raised.
    """
    conn = None
    cursor = None
    try:
        oracle_hook = OracleHook(oracle_conn_id='oracle_default')
        conn = oracle_hook.get_conn()
        cursor = conn.cursor()
        cursor.execute(proc_call, params)
        conn.commit()
        logging.info("%s executed successfully.", proc_name)
    except Exception as e:
        logging.error("Failed to execute %s: %s", proc_name, e, exc_info=True)
        raise
    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()


def start_log_table_task(**context):
    """Register the start of a replication run in the Oracle log table."""
    _run_replicator_proc(
        'start_log_table',
        "BEGIN MRDS_LOADER.DATA_REPLICATOR.start_log_table(:1, :2, :3, :4); END;",
        [p_run_id, p_service_name, p_table_owner, p_table_name],
    )


def export_table_task(**context):
    """Export the configured table to the object store via PL/SQL."""
    _run_replicator_proc(
        'export_table',
        "BEGIN MRDS_LOADER.DATA_REPLICATOR.export_table(:1, :2, :3, :4); END;",
        [p_service_name, p_table_owner, p_table_name, p_objectstore_uri],
    )


def devo_impyla_task(**context):
    """Placeholder for the Impyla (Devo) refresh step — not implemented yet."""
    # Example for future:
    # from impala.dbapi import connect
    # conn = connect(host="...", port=21050)
    # cursor = conn.cursor()
    # cursor.execute("...")
    logging.info("Impyla (Devo) task placeholder ran. Please implement.")


def end_log_table_task(**context):
    """Mark the replication run as finished in the Oracle log table."""
    _run_replicator_proc(
        'end_log_table',
        "BEGIN MRDS_LOADER.DATA_REPLICATOR.end_log_table(:1, :2, :3); END;",
        [p_service_name, p_table_owner, p_table_name],
    )
# Shared task defaults for the old replicator test DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
}

with DAG(
    dag_id='rqsd_devo_replicator_test_old',
    default_args=default_args,
    description='Run Devo replicator workflow',
    schedule_interval=None,
    catchup=False,
    tags=['Devo', 'RQSD', 'Replicator'],
) as dag:
    # One PythonOperator per replication phase, executed strictly in order.
    t1 = PythonOperator(task_id='start_log_table', python_callable=start_log_table_task)
    t2 = PythonOperator(task_id='export_table', python_callable=export_table_task)
    t3 = PythonOperator(task_id='devo_impyla', python_callable=devo_impyla_task)
    t4 = PythonOperator(task_id='end_log_table', python_callable=end_log_table_task)

    t1 >> t2 >> t3 >> t4

View File

@@ -0,0 +1,132 @@
import logging
import os
import sys
from datetime import timedelta

from airflow import DAG
from airflow.operators.python import PythonOperator
#from airflow.providers.oracle.operators.oracle import SQLExecuteQueryOperator
from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator
from airflow.utils.dates import days_ago

# Importing custom modules
sys.path.append('/opt/airflow/python/connectors/devo')
sys.path.append('/opt/airflow/python/mrds_common')
sys.path.append('/opt/airflow/src/airflow/dags/ods/rqsd')

# Import your functions
from mrds.utils.manage_runs import init_workflow, finalise_workflow
from devo_replicator.data_replicator.impala_refresher import main as impala_main
#step 5) Devo replication
## DEVO REPLICATOR WITH SQLOperator
### check the oracle connection, fixed params --> test cnx
### pick it from a file,
# TASK :
# - retrive directly from config file the param {0} and {1} based dev/test
# need to be passed from infromatic (WLA call) to dags
# wla to airflow, cnx done
# Task-level defaults.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
}

# Static replication parameters. NOTE(review): these were previously locals of
# init_step, leaving the operator definitions below referencing undefined names
# at DAG-parse time (NameError). They stay module-level constants until the
# env/store selection is parsed from the config YAML (see TODOs above).
ENV_CONFIG_PATH = "/opt/airflow/python/devo_replicator/config/env_config.yaml"
P_SERVICE_NAME = 'MOPDB'
P_TABLE_OWNER = 'MPEC'
P_TABLE_NAME = 'T_MPEC'
# corporate store is 'crp_mopdb' for mopdb and 'crp_rar' for rar
CORPORATE_STORE = 'crp_mopdb'
P_OBJECTSTORE_URI = 'https://devo-crp-ffppyd8q.bucket.vpce-040b28f5818b670c1-owicl3ow.s3.eu-central-1.vpce.amazonaws.com/mopdb/db' # subject to change as appropriate

with DAG(
    dag_id='rqsd_devo_replicator_2',
    default_args=default_args,
    description='Run Devo replicator workflow',
    schedule_interval=None,
    catchup=False,
    tags=['Devo', 'RQSD', 'Replicator'],
) as dag:

    def init_step(**context):
        """Publish the Airflow run id over XCom for the PL/SQL logging task."""
        p_run_id = str(context['ti'].run_id)
        print(f"=== DEBUG INFO : {p_run_id} ===")
        context['ti'].xcom_push(key='p_run_id', value=p_run_id)

    init_task = PythonOperator(
        task_id='init_step',
        python_callable=init_step,
    )

    t1 = SQLExecuteQueryOperator(
        task_id='start_log_table',
        # Fixed: SQLExecuteQueryOperator takes conn_id, not oracle_conn_id.
        conn_id='oracle_default',
        sql="BEGIN MRDS_LOADER.DATA_REPLICATOR.start_log_table(:p_run_id, :p_service_name, :p_table_owner, :p_table_name); END;",
        # 'parameters' is a templated field, so the run id published by
        # init_step is resolved from XCom at execution time.
        parameters={
            'p_run_id': "{{ ti.xcom_pull(task_ids='init_step', key='p_run_id') }}",
            'p_service_name': P_SERVICE_NAME,
            'p_table_owner': P_TABLE_OWNER,
            'p_table_name': P_TABLE_NAME,
        },
    )

    t2 = SQLExecuteQueryOperator(
        task_id='export_table',
        conn_id='oracle_default',
        sql="BEGIN MRDS_LOADER.DATA_REPLICATOR.export_table(:p_service_name, :p_table_owner, :p_table_name, :p_objectstore_uri); END;",
        parameters={
            'p_service_name': P_SERVICE_NAME,
            'p_table_owner': P_TABLE_OWNER,
            'p_table_name': P_TABLE_NAME,
            'p_objectstore_uri': P_OBJECTSTORE_URI,
        },
    )

    def devo_impyla_task(**context):
        """Refresh Impala metadata for the replicated table (details-data retrieval pending)."""
        # Fixed: the original referenced undefined names (env_config_path, env,
        # table, corporate_store); resolve them from env + module constants.
        env = os.getenv("MRDS_ENV", "dev")
        status = impala_main(ENV_CONFIG_PATH, env, P_TABLE_NAME, CORPORATE_STORE)
        logging.info("Impyla (Devo) refresh finished with status: %s", status)

    t3 = PythonOperator(
        task_id='devo_impyla',
        python_callable=devo_impyla_task,
    )

    # push to s3, we need to call the proc
    t4 = SQLExecuteQueryOperator(
        task_id='end_log_table',
        conn_id='oracle_default',
        sql="BEGIN MRDS_LOADER.DATA_REPLICATOR.end_log_table(:p_service_name, :p_table_owner, :p_table_name); END;",
        parameters={
            'p_service_name': P_SERVICE_NAME,
            'p_table_owner': P_TABLE_OWNER,
            'p_table_name': P_TABLE_NAME,
        },
    )

    # TODO(review): t4 should eventually run with trigger_rule=ALL_DONE so the
    # log row is closed even when an upstream task fails, plus a final task that
    # fails the DAG when any upstream failed; kept sequential to match intent.
    init_task >> t1 >> t2 >> t3 >> t4

View File

@@ -0,0 +1,65 @@
from airflow import DAG
from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator
from airflow.operators.python import PythonOperator
from datetime import datetime, timedelta
import logging
logger = logging.getLogger(__name__)
## OLD ( Package repliction)
def test_oracle_connection(**context):
    """Test Oracle connection and log the result.

    Opens a connection via OracleHook, runs a trivial SELECT against dual and
    logs the outcome. The cursor and connection are always closed, even when
    the query raises (the original leaked both on failure).
    """
    conn_id = "marsdb_loader"
    from airflow.providers.oracle.hooks.oracle import OracleHook
    conn = None
    cursor = None
    try:
        logger.debug("Attempting to connect to Oracle database...")
        hook = OracleHook(oracle_conn_id=conn_id)
        conn = hook.get_conn()
        cursor = conn.cursor()
        cursor.execute("SELECT 1 FROM dual")
        result = cursor.fetchone()
        logger.info(f"Connection test successful. Result: {result}")
    except Exception as e:
        logger.error(f"Connection test failed: {str(e)}")
        raise
    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
# Minimal defaults for this connectivity test DAG.
default_args = {
    'depends_on_past': False,
    'start_date': datetime(2025, 6, 25),
    'retries': 1,
    'retry_delay': timedelta(seconds=15),
}

with DAG(
    'oracle_plsql_test_dag',
    default_args=default_args,
    schedule_interval=None,
    catchup=False,
) as dag:
    # Verify connectivity first, then exercise the PL/SQL procedure.
    test_connection = PythonOperator(
        task_id='test_oracle_connection',
        python_callable=test_oracle_connection,
    )

    # With named parameter
    run_plsql = SQLExecuteQueryOperator(
        task_id='run_plsql_procedure',
        conn_id="marsdb_loader",
        sql="""
BEGIN
DATA_REPLICATOR.export_table(
p_table_owner => 'c2d',
p_table_name => 't_all_assets_servicer',
p_objectstore_uri => 'https://oci-test-sani.bucket.vpce-0b3a5f000733397b0-kxlyoh5z.s3.eu-central-1.vpce.amazonaws.com/',
p_date_column => 'SNAPSHOT_DATE'
);
END;
""",
    )

    test_connection >> run_plsql

View File

@@ -0,0 +1,171 @@
import sys
import os
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago
from datetime import datetime, timedelta
import logging
### DEVO CONNECTOR WITH DYNAMIC WORKFLOW CONTEXT & HISTORY KEY
# Importing custom modules
sys.path.append('/opt/airflow/python/connectors/devo')
sys.path.append('/opt/airflow/python/mrds_common')
sys.path.append('/opt/airflow/src/airflow/dags/ods/rqsd')
# Import your functions
from mrds.utils.manage_runs import init_workflow, finalise_workflow
from devo_connector import main as devo_main
from devo_connector_v2 import run as devo_main2
from mrds.core import main as mrds_main
# Default arguments
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': days_ago(1),
'email_on_failure': False,
'email_on_retry': False,
'retries': 3,
'retry_delay': timedelta(minutes=5),
}
# Dynamic name extraction from basename
dag_id = os.path.splitext(os.path.basename(__file__))[0]
with DAG(
dag_id=dag_id,
default_args=default_args,
description='Run devo RQSD data ingestion workflow with MRDS processing',
schedule_interval=None,
catchup=False,
tags=["Devo", "RQSD", "MRDS", "Connector"],
params={
"source_filename": "",
"config_file": "",
},
) as dag:
def run_devo_connector_rqsd(**context):
    """Run the Devo RQSD connector and register the run in workflow history.

    Validates loader-DB credentials from the environment, opens a workflow
    history record, runs the connector, then publishes the history key and
    workflow context over XCom for the downstream MRDS / finalise tasks.
    """
    a_workflow_history_key = None  # set once init_workflow has succeeded
    try:
        env = os.getenv("MRDS_ENV")
        username = os.getenv("MRDS_LOADER_DB_USER")
        password = os.getenv("MRDS_LOADER_DB_PASS")
        tnsalias = os.getenv("MRDS_LOADER_DB_TNS")
        if not (username and password and tnsalias):
            raise ValueError(
                "Missing one or more required environment variables: "
                "MRDS_LOADER_DB_USER, MRDS_LOADER_DB_PASS, MRDS_LOADER_DB_TNS"
            )
        logging.info(f"Starting Devo RQSD workflow from Airflow DAG for env '{env}'")
        database_name = 'MOPDB'
        workflow_name = 'w_MOPDB_RQSD_PROCESS'
        workflow_run_id = str(context['ti'].run_id)
        a_workflow_history_key = init_workflow(database_name, workflow_name, workflow_run_id)
        logging.info(f"Initialized workflow with history key: {a_workflow_history_key}")
        workflow_context = {
            "run_id": workflow_run_id,
            "a_workflow_history_key": a_workflow_history_key
        }
        flow_config_path = "/opt/airflow/src/airflow/dags/ods/rqsd/rqsd_process/config/yaml/flow_config_rqsd_observations.yaml"
        env_config_path = "/opt/airflow/python/connectors/devo/config/env_config_rqsd.yaml"
        logging.info("Starting Devo RQSD workflow from Airflow DAG")
        count = devo_main2(workflow_context, flow_config_path, env_config_path, env)
        print("=================================================================")
        print(f"Devo RQSD workflow completed successfully with count : {count}")
        logging.info(f"Devo RQSD workflow completed successfully with count : {count}")
        # Hand the run identifiers to the downstream tasks.
        context['ti'].xcom_push(key='workflow_history_key', value=a_workflow_history_key)
        context['ti'].xcom_push(key='workflow_context', value=workflow_context)
    except Exception as e:
        logging.error(f"Error running Devo RQSD workflow: {e}", exc_info=True)
        # Close the history record as FAILED when it was actually opened.
        if a_workflow_history_key is not None:
            try:
                finalise_workflow(a_workflow_history_key, "FAILED")
            except Exception as finalise_error:
                logging.error(f"Failed to finalise workflow after error: {finalise_error}")
        raise
def run_mrds_task(**context):
    """Run MRDS processing on the output of the Devo connector task.

    source_filename / config_file come from the DAG run params declared on the
    DAG (previously they were hard-coded and the params were ignored, making
    the subsequent emptiness checks dead code); the original hard-coded values
    remain the defaults, so behavior is unchanged when no params are passed.
    """
    try:
        ti = context.get('ti')
        workflow_context = ti.xcom_pull(key='workflow_context', task_ids='run_devo_connector_rqsd')
        if not workflow_context:
            raise ValueError("No workflow_context from Task 1")
        print("=== workflow_context ====:", workflow_context)
        params = context.get('params') or {}
        source_filename = params.get('source_filename') or "RQSD_OBSERVATIONS.csv"
        config_file = params.get('config_file') or "/opt/airflow/src/airflow/dags/ods/rqsd/rqsd_process/config/yaml/flow_config_devo_process.yaml"
        print("---- run_mrds_task ----")
        print("source_filename :", source_filename)
        print("config_file = ", config_file)
        print("------------------------")
        logging.info(f"Starting MRDS task with source_filename: {source_filename}, config_file: {config_file}")
        # Run MRDS with the workflow context from the previous task.
        mrds_main(workflow_context, source_filename, config_file, generate_workflow_context=True)
        logging.info("MRDS task completed successfully")
    except Exception as e:
        logging.error(f"Error running MRDS task: {e}", exc_info=True)
        raise
def finalise_workflow_task(**context):
    """Close the workflow history record opened by the connector task with SUCCESS."""
    ti = context['ti']
    history_key = ti.xcom_pull(key='workflow_history_key', task_ids='run_devo_connector_rqsd')
    if history_key is None:
        # Without the key we cannot tell which record to close — fail loudly.
        raise ValueError("No workflow history key found in XCom; cannot finalise workflow")
    finalise_workflow(history_key, "SUCCESS")
    logging.info(f"Finalised workflow with history key {history_key} as SUCCESS")
def _py_task(task_id, fn):
    # Small factory keeping the three task definitions uniform.
    return PythonOperator(task_id=task_id, python_callable=fn, provide_context=True)

run_devo = _py_task('run_devo_connector_rqsd', run_devo_connector_rqsd)
run_mrds = _py_task('run_mrds_task', run_mrds_task)
finalize = _py_task('finalise_workflow', finalise_workflow_task)

# connector -> MRDS processing -> close history record
run_devo >> run_mrds >> finalize

View File

@@ -0,0 +1,320 @@
from __future__ import annotations
import os
import sys
import logging
import yaml
from datetime import timedelta
from airflow import DAG
from airflow.utils.dates import days_ago
from airflow.utils.trigger_rule import TriggerRule
from airflow.operators.python import PythonOperator
from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator
try:
from airflow.exceptions import AirflowFailException
except Exception: # fallback for older Airflow
from airflow.exceptions import AirflowException as AirflowFailException
# --- Custom module paths (as in snippet) ---
sys.path.append('/opt/airflow/python/connectors/devo')
sys.path.append('/opt/airflow/python/mrds_common')
sys.path.append('/opt/airflow/src/airflow/dags/ods/rqsd')
sys.path.append('/opt/airflow/python/devo_replicator/data_replicator')
# --- custom imports ---
from mrds.utils import oraconn
from impala_refresher import main as impala_main
# --- Config path ---
ENV_CONFIG_PATH = "/opt/airflow/python/devo_replicator/config/env_config.yaml"
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': days_ago(1),
'email_on_failure': False,
'email_on_retry': False,
'retries': 2,
'retry_delay': timedelta(minutes=5),
}
with DAG(
dag_id='rqsd_devo_replicator_2',
default_args=default_args,
description='Run Devo replicator workflow',
schedule=None,
catchup=False,
tags=['Devo', 'RQSD', 'Replicator'],
) as dag:
# -------------------------------
# 1) Init: read config + set XCom
# -------------------------------
def init_step(**context):
    """Resolve run parameters and publish them over XCom.

    Reads the env/store selection, loads the env config YAML, and pushes
    every derived parameter (run id, service/table identifiers, object-store
    URI, corporate store) for the downstream Oracle/Impala tasks.
    """
    dag_run = context.get("dag_run")
    ti = context["ti"]
    conf = (dag_run.conf or {}) if dag_run else {}
    # Bug fix: lowercase whichever source supplied the value — previously only
    # the MRDS_ENV fallback was lowercased, so conf={"env": "DEV"} was rejected.
    env = (conf.get("env") or os.getenv("MRDS_ENV", "dev")).lower()
    if env not in {"dev", "tst"}:
        raise ValueError(f"Unsupported env '{env}'. Expected 'dev' or 'tst'.")
    # hardcoded the mopdb
    # ====================================
    store = "mopdb"
    # ====================================
    if store not in {"mopdb", "rar"}:
        raise ValueError(f"Unsupported store '{store}'. Expected 'mopdb' or 'rar'.")
    p_service_name = "MOPDB" if store == "mopdb" else "RAR"
    p_table_owner = "MPEC"
    p_table_name = "T_MPEC"
    with open(ENV_CONFIG_PATH, "r") as f:
        cfg = yaml.safe_load(f)
    env_cfg = cfg[env]
    store_cfg = cfg[store]
    # The URI template carries a '{0}' placeholder for the store name.
    p_objectstore_uri = env_cfg["S3_LOCATION_URI"].replace("{0}", store.lower())
    p_run_id = str(ti.run_id)
    logging.info("=== init_step === env=%s store=%s run_id=%s", env, store, p_run_id)
    logging.info("objectstore_uri=%s", p_objectstore_uri)
    xcom = {
        "env": env,
        "store": store,
        "config_path": ENV_CONFIG_PATH,
        "p_run_id": p_run_id,
        "p_service_name": p_service_name,
        "p_table_owner": p_table_owner,
        "p_table_name": p_table_name,
        "p_objectstore_uri": p_objectstore_uri,
        "corporate_store": store_cfg["corporate_store"],  # "crp_mopdb" or "crp_rar"
    }
    print(" ============= DEBUG PARAMS ============= ")
    print(xcom)
    for k, v in xcom.items():
        ti.xcom_push(key=k, value=v)

init = PythonOperator(
    task_id='init_step',
    python_callable=init_step,
)
# ------------------------------------
# 2) log table (Oracle procedure)
# ------------------------------------
def start_log_table_task(**context):
    """Call DATA_REPLICATOR.start_log_table with the parameters published by init_step."""
    ti = context["ti"]
    pull = lambda key: ti.xcom_pull(task_ids='init_step', key=key)
    proc_args = [pull('p_run_id'), pull('p_service_name'),
                 pull('p_table_owner'), pull('p_table_name')]
    conn = None
    try:
        conn = oraconn.connect('MRDS_LOADER')
        logging.info("Oracle connection established successfully for start_log_table")
        oraconn.run_proc(
            conn,
            'MRDS_LOADER.DATA_REPLICATOR.start_log_table',
            proc_args
        )
        conn.commit()
        logging.info("start_log_table procedure executed successfully")
    except Exception as e:
        logging.error(f"Error in start_log_table: {e}")
        raise
    finally:
        if conn:
            try:
                conn.close()
                logging.info("Oracle connection closed for start_log_table")
            except Exception as e:
                logging.error(f"Error closing connection in start_log_table: {e}")

t1 = PythonOperator(
    task_id='start_log_table',
    python_callable=start_log_table_task,
)
# ---------------------------------------------------------
# 3) Export table (Oracle procedure writes to object store)
# ---------------------------------------------------------
def export_table_task(**context):
    """Call DATA_REPLICATOR.export_table (writes the table to the object store)."""
    ti = context["ti"]
    pull = lambda key: ti.xcom_pull(task_ids='init_step', key=key)
    proc_args = [pull('p_service_name'), pull('p_table_owner'),
                 pull('p_table_name'), pull('p_objectstore_uri')]
    conn = None
    try:
        conn = oraconn.connect('MRDS_LOADER')
        logging.info("Oracle connection established successfully for export_table")
        oraconn.run_proc(
            conn,
            'MRDS_LOADER.DATA_REPLICATOR.export_table',
            proc_args
        )
        conn.commit()
        logging.info("export_table procedure executed successfully")
    except Exception as e:
        logging.error(f"Error in export_table: {e}")
        raise
    finally:
        if conn:
            try:
                conn.close()
                logging.info("Oracle connection closed for export_table")
            except Exception as e:
                logging.error(f"Error closing connection in export_table: {e}")

t2 = PythonOperator(
    task_id='export_table',
    python_callable=export_table_task,
    trigger_rule=TriggerRule.ALL_DONE,  # Continue even if t1 failed
)
# ---------------------------------------------
# 4) Devo / Impyla refresh (Python Package)
# ---------------------------------------------
def devo_impyla_task(**context):
    """Run the Impyla refresh (INVALIDATE METADATA + COMPUTE STATS) for the replicated table."""
    ti = context["ti"]
    pull = lambda key: ti.xcom_pull(task_ids='init_step', key=key)
    env = pull('env')
    store = pull('store')
    corporate_store = pull('corporate_store')
    config_path = pull('config_path')
    owner = pull('p_table_owner')  # not part of the Impala name; kept for parity with init_step
    # Impala identifier is corporate_store.table_name (e.g. crp_mopdb.T_MPEC),
    # NOT corporate_store.owner.table_name (crp_mopdb.MPEC.T_MPEC is invalid).
    table_name = pull('p_table_name')
    logging.info(
        "Starting Impyla refresh with env=%s store=%s corporate_store=%s table=%s",
        env, store, corporate_store, table_name
    )
    logging.info("Will execute: INVALIDATE METADATA %s.%s", corporate_store, table_name)
    logging.info("Will execute: COMPUTE STATS %s.%s", corporate_store, table_name)
    try:
        status = impala_main(config_path, env, table_name, corporate_store)
        logging.info("Impyla (Devo) task finished successfully. Status: %s", status)
        return status
    except Exception as e:
        logging.error(f"Error in devo_impyla_task: {e}")
        raise

t3 = PythonOperator(
    task_id='devo_impyla',
    python_callable=devo_impyla_task,
    trigger_rule=TriggerRule.ALL_DONE,  # Continue even if t2 failed
)
# -------------------------------------
# 5) End log table (always executes after t1, t2, t3 complete - regardless of success/failure)
# -------------------------------------
def end_log_table_task(**context):
    """Close the replication log row; best-effort cleanup that never fails the run."""
    ti = context["ti"]
    pull = lambda key: ti.xcom_pull(task_ids='init_step', key=key)
    proc_args = [pull('p_service_name'), pull('p_table_owner'), pull('p_table_name')]
    conn = None
    try:
        conn = oraconn.connect('MRDS_LOADER')
        logging.info("Oracle connection established successfully for end_log_table")
        oraconn.run_proc(
            conn,
            'MRDS_LOADER.DATA_REPLICATOR.end_log_table',
            proc_args
        )
        conn.commit()
        logging.info("end_log_table procedure executed successfully")
    except Exception as e:
        # Swallowed on purpose: this is a cleanup task and must not fail the DAG.
        logging.error(f"Error in end_log_table: {e}")
        logging.info("Continuing despite end_log_table error (cleanup task)")
    finally:
        if conn:
            try:
                conn.close()
                logging.info("Oracle connection closed for end_log_table")
            except Exception as e:
                logging.error(f"Error closing connection in end_log_table: {e}")

t4 = PythonOperator(
    task_id='end_log_table',
    python_callable=end_log_table_task,
    trigger_rule=TriggerRule.ALL_DONE,  # Run after t1, t2, t3 complete (success or failure)
)
# -----------------------------------------------------
# 6) Check and fail the DAG if any of t1..t3 actually failed
# This task always runs after t4, but will fail the DAG if needed
# -----------------------------------------------------
def fail_if_any_failed(**context):
    """Fail the DAG run when any critical upstream task actually failed."""
    dag_run = context['dag_run']
    check_tasks = ['start_log_table', 'export_table', 'devo_impyla']
    failed = [
        tid for tid in check_tasks
        if (inst := dag_run.get_task_instance(tid)) and inst.state == 'failed'
    ]
    if failed:
        error_msg = f"Critical task(s) failed: {', '.join(failed)}. DAG execution failed."
        logging.error(error_msg)
        raise AirflowFailException(error_msg)
    logging.info("All critical tasks completed successfully: %s", check_tasks)

t5 = PythonOperator(
    task_id='fail_if_any_failed',
    python_callable=fail_if_any_failed,
    trigger_rule=TriggerRule.ALL_DONE,  # Always run after t4
)

# init -> t1 -> t2 -> t3 run sequentially; t4 (end log) runs once all of
# t1..t3 are done in any state; t5 then decides the overall DAG outcome.
init >> t1 >> t2 >> t3
[t1, t2, t3] >> t4
t4 >> t5

View File

@@ -0,0 +1,18 @@
from airflow import DAG
#from airflow.providers.oracle.operators.oracle import OracleOperator
from airflow.operators.bash import BashOperator
from datetime import datetime
from airflow import DAG
from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator
with DAG(
    'test_oracle_connection',
    start_date=datetime(2025, 6, 13),
    schedule_interval=None,
) as dag:
    # Smoke test: a trivial SELECT proves the Oracle connection works.
    test_query = SQLExecuteQueryOperator(
        task_id='test_oracle_query',
        conn_id='oracle_default',
        sql='SELECT 1 FROM DUAL',
    )