from __future__ import annotations import os import sys import logging import yaml from datetime import timedelta from airflow import DAG from airflow.utils.dates import days_ago from airflow.utils.trigger_rule import TriggerRule from airflow.operators.python import PythonOperator from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator try: from airflow.exceptions import AirflowFailException except Exception: # fallback for older Airflow from airflow.exceptions import AirflowException as AirflowFailException # --- Custom module paths (as in snippet) --- sys.path.append('/opt/airflow/python/connectors/devo') sys.path.append('/opt/airflow/python/mrds_common') sys.path.append('/opt/airflow/src/airflow/dags/ods/rqsd') sys.path.append('/opt/airflow/python/devo_replicator/data_replicator') # --- custom imports --- from mrds.utils import oraconn from impala_refresher import main as impala_main # --- Config path --- ENV_CONFIG_PATH = "/opt/airflow/python/devo_replicator/config/env_config.yaml" default_args = { 'owner': 'airflow', 'depends_on_past': False, 'start_date': days_ago(1), 'email_on_failure': False, 'email_on_retry': False, 'retries': 2, 'retry_delay': timedelta(minutes=5), } with DAG( dag_id='rqsd_devo_replicator_2', default_args=default_args, description='Run Devo replicator workflow', schedule=None, catchup=False, tags=['Devo', 'RQSD', 'Replicator'], ) as dag: # ------------------------------- # 1) Init: read config + set XCom # ------------------------------- def init_step(**context): dag_run = context.get("dag_run") ti = context["ti"] conf = (dag_run.conf or {}) if dag_run else {} env = conf.get("env") or os.getenv("MRDS_ENV", "dev").lower() if env not in {"dev", "tst"}: raise ValueError(f"Unsupported env '{env}'. Expected 'dev' or 'tst'.") # hardcoded the mopdb # ==================================== store = "mopdb" # ==================================== if store not in {"mopdb", "rar"}: raise ValueError(f"Unsupported store '{store}'. Expected 'mopdb' or 'rar'.") p_service_name = "MOPDB" if store == "mopdb" else "RAR" p_table_owner = "MPEC" p_table_name = "T_MPEC" with open(ENV_CONFIG_PATH, "r") as f: cfg = yaml.safe_load(f) env_cfg = cfg[env] store_cfg = cfg[store] p_objectstore_uri = env_cfg["S3_LOCATION_URI"].replace("{0}",store.lower()) p_run_id = str(ti.run_id) logging.info("=== init_step === env=%s store=%s run_id=%s", env, store, p_run_id) logging.info("objectstore_uri=%s", p_objectstore_uri) xcom = { "env": env, "store": store, "config_path": ENV_CONFIG_PATH, "p_run_id": p_run_id, "p_service_name": p_service_name, "p_table_owner": p_table_owner, "p_table_name": p_table_name, "p_objectstore_uri": p_objectstore_uri, "corporate_store": store_cfg["corporate_store"], # "crp_mopdb" or "crp_rar" } print(" ============= DEBUG PARAMS ============= ") print(xcom) for k, v in xcom.items(): ti.xcom_push(key=k, value=v) init = PythonOperator( task_id='init_step', python_callable=init_step, ) # ------------------------------------ # 2) log table (Oracle procedure) # ------------------------------------ def start_log_table_task(**context): ti = context["ti"] # Get parameters from XCom p_run_id = ti.xcom_pull(task_ids='init_step', key='p_run_id') p_service_name = ti.xcom_pull(task_ids='init_step', key='p_service_name') p_table_owner = ti.xcom_pull(task_ids='init_step', key='p_table_owner') p_table_name = ti.xcom_pull(task_ids='init_step', key='p_table_name') # Create Oracle connection for this task oracle_conn = None try: oracle_conn = oraconn.connect('MRDS_LOADER') logging.info("Oracle connection established successfully for start_log_table") # Execute Oracle procedure using oraconn.run_proc() directly oraconn.run_proc( oracle_conn, 'MRDS_LOADER.DATA_REPLICATOR.start_log_table', [p_run_id, p_service_name, p_table_owner, p_table_name] ) oracle_conn.commit() logging.info("start_log_table procedure executed successfully") except Exception as e: logging.error(f"Error in start_log_table: {e}") raise finally: if oracle_conn: try: oracle_conn.close() logging.info("Oracle connection closed for start_log_table") except Exception as e: logging.error(f"Error closing connection in start_log_table: {e}") t1 = PythonOperator( task_id='start_log_table', python_callable=start_log_table_task, ) # --------------------------------------------------------- # 3) Export table (Oracle procedure writes to object store) # --------------------------------------------------------- def export_table_task(**context): ti = context["ti"] # Get parameters from XCom p_service_name = ti.xcom_pull(task_ids='init_step', key='p_service_name') p_table_owner = ti.xcom_pull(task_ids='init_step', key='p_table_owner') p_table_name = ti.xcom_pull(task_ids='init_step', key='p_table_name') p_objectstore_uri = ti.xcom_pull(task_ids='init_step', key='p_objectstore_uri') # Create Oracle connection for this task oracle_conn = None try: oracle_conn = oraconn.connect('MRDS_LOADER') logging.info("Oracle connection established successfully for export_table") # Execute Oracle procedure using oraconn.run_proc() directly oraconn.run_proc( oracle_conn, 'MRDS_LOADER.DATA_REPLICATOR.export_table', [p_service_name, p_table_owner, p_table_name, p_objectstore_uri] ) oracle_conn.commit() logging.info("export_table procedure executed successfully") except Exception as e: logging.error(f"Error in export_table: {e}") raise finally: if oracle_conn: try: oracle_conn.close() logging.info("Oracle connection closed for export_table") except Exception as e: logging.error(f"Error closing connection in export_table: {e}") t2 = PythonOperator( task_id='export_table', python_callable=export_table_task, trigger_rule=TriggerRule.ALL_DONE, # Continue even if t1 failed ) # --------------------------------------------- # 4) Devo / Impyla refresh (Python Package) # --------------------------------------------- def devo_impyla_task(**context): ti = context["ti"] env = ti.xcom_pull(task_ids='init_step', key='env') store = ti.xcom_pull(task_ids='init_step', key='store') corporate_store = ti.xcom_pull(task_ids='init_step', key='corporate_store') config_path = ti.xcom_pull(task_ids='init_step', key='config_path') owner = ti.xcom_pull(task_ids='init_step', key='p_table_owner') table = ti.xcom_pull(task_ids='init_step', key='p_table_name') # For Impala: corporate_store.table_name (e.g., crp_mopdb.T_MPEC) # NOT corporate_store.owner.table_name (which would be crp_mopdb.MPEC.T_MPEC - INVALID) table_name = table # Just "T_MPEC" logging.info( "Starting Impyla refresh with env=%s store=%s corporate_store=%s table=%s", env, store, corporate_store, table_name ) logging.info("Will execute: INVALIDATE METADATA %s.%s", corporate_store, table_name) logging.info("Will execute: COMPUTE STATS %s.%s", corporate_store, table_name) try: # This should result in queries like: # INVALIDATE METADATA crp_mopdb.T_MPEC # COMPUTE STATS crp_mopdb.T_MPEC status = impala_main(config_path, env, table_name, corporate_store) logging.info("Impyla (Devo) task finished successfully. Status: %s", status) return status except Exception as e: logging.error(f"Error in devo_impyla_task: {e}") raise t3 = PythonOperator( task_id='devo_impyla', python_callable=devo_impyla_task, trigger_rule=TriggerRule.ALL_DONE, # Continue even if t2 failed ) # ------------------------------------- # 5) End log table (always executes after t1, t2, t3 complete - regardless of success/failure) # ------------------------------------- def end_log_table_task(**context): ti = context["ti"] # Get parameters from XCom p_service_name = ti.xcom_pull(task_ids='init_step', key='p_service_name') p_table_owner = ti.xcom_pull(task_ids='init_step', key='p_table_owner') p_table_name = ti.xcom_pull(task_ids='init_step', key='p_table_name') # Create Oracle connection for this task oracle_conn = None try: oracle_conn = oraconn.connect('MRDS_LOADER') logging.info("Oracle connection established successfully for end_log_table") # Execute Oracle procedure using oraconn.run_proc() directly oraconn.run_proc( oracle_conn, 'MRDS_LOADER.DATA_REPLICATOR.end_log_table', [p_service_name, p_table_owner, p_table_name] ) oracle_conn.commit() logging.info("end_log_table procedure executed successfully") except Exception as e: logging.error(f"Error in end_log_table: {e}") # Don't raise the exception since this is a cleanup task logging.info("Continuing despite end_log_table error (cleanup task)") finally: if oracle_conn: try: oracle_conn.close() logging.info("Oracle connection closed for end_log_table") except Exception as e: logging.error(f"Error closing connection in end_log_table: {e}") t4 = PythonOperator( task_id='end_log_table', python_callable=end_log_table_task, trigger_rule=TriggerRule.ALL_DONE, # Run after t1, t2, t3 complete (success or failure) ) # ----------------------------------------------------- # 6) Check and fail the DAG if any of t1..t3 actually failed # This task always runs after t4, but will fail the DAG if needed # ----------------------------------------------------- def fail_if_any_failed(**context): dag_run = context['dag_run'] check_tasks = ['start_log_table', 'export_table', 'devo_impyla'] failed = [] for tid in check_tasks: ti_up = dag_run.get_task_instance(tid) if ti_up and ti_up.state == 'failed': failed.append(tid) if failed: error_msg = f"Critical task(s) failed: {', '.join(failed)}. DAG execution failed." logging.error(error_msg) raise AirflowFailException(error_msg) logging.info("All critical tasks completed successfully: %s", check_tasks) t5 = PythonOperator( task_id='fail_if_any_failed', python_callable=fail_if_any_failed, trigger_rule=TriggerRule.ALL_DONE, # Always run after t4 ) # --------- # Task Dependencies - SEQUENTIAL # --------- # Sequential flow: init -> t1 -> t2 -> t3 init >> t1 >> t2 >> t3 # t4 runs after t1, t2, t3 are all done (regardless of success/failure) [t1, t2, t3] >> t4 # t5 always runs after t4 to check for failures and fail the DAG if needed t4 >> t5