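"""Airflow DAG ``rqsd_devo_replicator_2``: run the Devo replicator workflow.

Tasks: init_step -> start_log_table -> export_table -> devo_impyla,
followed by end_log_table and fail_if_any_failed.
"""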
from __future__ import annotations
|
|
|
|
import os
|
|
import sys
|
|
import logging
|
|
import yaml
|
|
from datetime import timedelta
|
|
|
|
from airflow import DAG
|
|
from airflow.utils.dates import days_ago
|
|
from airflow.utils.trigger_rule import TriggerRule
|
|
from airflow.operators.python import PythonOperator
|
|
from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator
|
|
|
|
# Bind AirflowFailException, aliasing the generic AirflowException when the
# dedicated class cannot be imported. Only ImportError is handled so that any
# other problem raised while importing airflow.exceptions is not masked.
try:
    from airflow.exceptions import AirflowFailException
except ImportError:  # fallback for older Airflow
    from airflow.exceptions import AirflowException as AirflowFailException
|
|
|
|
# --- Custom module paths (as in snippet) ---
# Extend the import search path (order preserved) with the directories that
# hold the custom modules imported further down in this file.
sys.path.extend(
    [
        '/opt/airflow/python/connectors/devo',
        '/opt/airflow/python/mrds_common',
        '/opt/airflow/src/airflow/dags/ods/rqsd',
        '/opt/airflow/python/devo_replicator/data_replicator',
    ]
)
|
|
|
|
# --- custom imports ---
|
|
from mrds.utils import oraconn
|
|
from impala_refresher import main as impala_main
|
|
|
|
# --- Config path ---
# YAML file that init_step loads and indexes by environment name and by store name.
ENV_CONFIG_PATH = "/opt/airflow/python/devo_replicator/config/env_config.yaml"
|
|
|
|
# Task-level defaults; handed to the DAG constructor as ``default_args``.
default_args = dict(
    owner='airflow',
    depends_on_past=False,
    start_date=days_ago(1),
    email_on_failure=False,
    email_on_retry=False,
    retries=2,
    retry_delay=timedelta(minutes=5),
)
|
|
|
|
with DAG(
|
|
dag_id='rqsd_devo_replicator_2',
|
|
default_args=default_args,
|
|
description='Run Devo replicator workflow',
|
|
schedule=None,
|
|
catchup=False,
|
|
tags=['Devo', 'RQSD', 'Replicator'],
|
|
) as dag:
|
|
|
|
# -------------------------------
|
|
# 1) Init: read config + set XCom
|
|
# -------------------------------
|
|
def init_step(**context):
    """Resolve the run parameters and publish them as XComs.

    The environment is taken from ``dag_run.conf["env"]``, falling back to the
    ``MRDS_ENV`` environment variable and finally to ``"dev"``. The YAML file
    at ``ENV_CONFIG_PATH`` is then loaded and one XCom per parameter is pushed
    so that the downstream tasks can pull each value by key.

    Args:
        **context: Airflow task context. ``ti`` is required; ``dag_run`` is
            optional (an absent run or empty conf means "use the fallbacks").

    Raises:
        ValueError: If the resolved environment or store is not supported.
        KeyError: If the config file lacks the section for the environment or
            the store, or a section lacks a required key.
    """
    ti = context["ti"]
    dag_run = context.get("dag_run")
    conf = (dag_run.conf or {}) if dag_run else {}

    # Normalise the value regardless of where it came from. Previously only
    # the MRDS_ENV fallback was lower-cased, so conf={"env": "DEV"} was
    # rejected while MRDS_ENV=DEV was accepted.
    raw_env = conf.get("env") or os.getenv("MRDS_ENV", "dev")
    env = str(raw_env).strip().lower()
    if env not in {"dev", "tst"}:
        raise ValueError(f"Unsupported env '{env}'. Expected 'dev' or 'tst'.")

    # hardcoded the mopdb
    # ====================================
    store = "mopdb"
    # ====================================

    # Always true while the store is hard-coded; kept as a guard for the day
    # the value becomes configurable.
    if store not in {"mopdb", "rar"}:
        raise ValueError(f"Unsupported store '{store}'. Expected 'mopdb' or 'rar'.")

    p_service_name = "MOPDB" if store == "mopdb" else "RAR"
    p_table_owner = "MPEC"
    p_table_name = "T_MPEC"

    with open(ENV_CONFIG_PATH, "r", encoding="utf-8") as f:
        # safe_load returns None for an empty document; treat that as "no sections".
        cfg = yaml.safe_load(f) or {}

    missing_sections = [name for name in (env, store) if name not in cfg]
    if missing_sections:
        raise KeyError(
            f"Missing section(s) {missing_sections} in config file {ENV_CONFIG_PATH}"
        )
    env_cfg = cfg[env]
    store_cfg = cfg[store]

    # The configured URI carries a "{0}" placeholder for the store name.
    p_objectstore_uri = env_cfg["S3_LOCATION_URI"].replace("{0}", store.lower())

    p_run_id = str(ti.run_id)
    logging.info("=== init_step === env=%s store=%s run_id=%s", env, store, p_run_id)
    logging.info("objectstore_uri=%s", p_objectstore_uri)

    xcom = {
        "env": env,
        "store": store,
        "config_path": ENV_CONFIG_PATH,
        "p_run_id": p_run_id,
        "p_service_name": p_service_name,
        "p_table_owner": p_table_owner,
        "p_table_name": p_table_name,
        "p_objectstore_uri": p_objectstore_uri,
        "corporate_store": store_cfg["corporate_store"],  # "crp_mopdb" or "crp_rar"
    }

    # Emit through logging (not print) so the dump carries level and timestamp
    # like every other message of this task.
    logging.info("DEBUG PARAMS: %s", xcom)

    for k, v in xcom.items():
        ti.xcom_push(key=k, value=v)
|
|
|
|
# Step 1 operator: wraps init_step; no trigger-rule override.
init = PythonOperator(python_callable=init_step, task_id='init_step')
|
|
|
|
# ------------------------------------
|
|
# 2) log table (Oracle procedure)
|
|
# ------------------------------------
|
|
def start_log_table_task(**context):
    """Run the Oracle ``start_log_table`` procedure for the current run.

    Pulls the run id, service name, table owner and table name pushed by
    ``init_step``, calls ``MRDS_LOADER.DATA_REPLICATOR.start_log_table`` with
    them on a new ``MRDS_LOADER`` connection and commits. Any error is logged
    and re-raised; the connection is closed in every case.

    Args:
        **context: Airflow task context; ``ti`` is required.
    """
    task_instance = context["ti"]

    # Procedure arguments, pulled from XCom in the order the call expects them.
    proc_args = [
        task_instance.xcom_pull(task_ids='init_step', key=xcom_key)
        for xcom_key in ('p_run_id', 'p_service_name', 'p_table_owner', 'p_table_name')
    ]

    connection = None
    try:
        connection = oraconn.connect('MRDS_LOADER')
        logging.info("Oracle connection established successfully for start_log_table")

        oraconn.run_proc(
            connection,
            'MRDS_LOADER.DATA_REPLICATOR.start_log_table',
            proc_args,
        )
        connection.commit()
        logging.info("start_log_table procedure executed successfully")
    except Exception as err:
        logging.error(f"Error in start_log_table: {err}")
        raise
    finally:
        # A failed close is only logged so it cannot mask the outcome above.
        if connection:
            try:
                connection.close()
                logging.info("Oracle connection closed for start_log_table")
            except Exception as close_err:
                logging.error(f"Error closing connection in start_log_table: {close_err}")
|
|
|
|
# Step 2 operator: wraps start_log_table_task; no trigger-rule override.
t1 = PythonOperator(python_callable=start_log_table_task, task_id='start_log_table')
|
|
|
|
# ---------------------------------------------------------
|
|
# 3) Export table (Oracle procedure writes to object store)
|
|
# ---------------------------------------------------------
|
|
def export_table_task(**context):
    """Run the Oracle ``export_table`` procedure that writes to the object store.

    Pulls the service name, table owner, table name and object-store URI
    pushed by ``init_step`` and calls
    ``MRDS_LOADER.DATA_REPLICATOR.export_table`` with them on a new
    ``MRDS_LOADER`` connection, then commits.

    Args:
        **context: Airflow task context; ``ti`` is required.

    Raises:
        ValueError: If any required XCom value from ``init_step`` is missing.
        Exception: Any error from the Oracle call is logged and re-raised.
    """
    ti = context["ti"]

    # Get parameters from XCom (dict order == procedure argument order).
    xcom_keys = ('p_service_name', 'p_table_owner', 'p_table_name', 'p_objectstore_uri')
    params = {key: ti.xcom_pull(task_ids='init_step', key=key) for key in xcom_keys}

    # The operator for this task uses TriggerRule.ALL_DONE, so it also runs
    # when init_step never published its XComs. Fail fast with a clear message
    # instead of calling the export procedure with NULL arguments.
    missing = [key for key, value in params.items() if value is None]
    if missing:
        raise ValueError(
            f"export_table: missing XCom value(s) from init_step: {', '.join(missing)}"
        )

    # Create Oracle connection for this task
    oracle_conn = None
    try:
        oracle_conn = oraconn.connect('MRDS_LOADER')
        logging.info("Oracle connection established successfully for export_table")

        # Execute Oracle procedure using oraconn.run_proc() directly
        oraconn.run_proc(
            oracle_conn,
            'MRDS_LOADER.DATA_REPLICATOR.export_table',
            list(params.values()),
        )
        oracle_conn.commit()
        logging.info("export_table procedure executed successfully")
    except Exception as e:
        logging.error("Error in export_table: %s", e)
        raise
    finally:
        # A failed close is only logged so it cannot mask the outcome above.
        if oracle_conn:
            try:
                oracle_conn.close()
                logging.info("Oracle connection closed for export_table")
            except Exception as e:
                logging.error("Error closing connection in export_table: %s", e)
|
|
|
|
# Step 3 operator: wraps export_table_task.
t2 = PythonOperator(
    python_callable=export_table_task,
    task_id='export_table',
    trigger_rule=TriggerRule.ALL_DONE,  # Continue even if t1 failed
)
|
|
|
|
# ---------------------------------------------
|
|
# 4) Devo / Impyla refresh (Python Package)
|
|
# ---------------------------------------------
|
|
def devo_impyla_task(**context):
    """Run the Impala refresh for the replicated table via ``impala_main``.

    Pulls ``env``, ``store``, ``corporate_store``, ``config_path`` and the
    table name pushed by ``init_step`` and passes them to ``impala_main``.
    The table is addressed as ``<corporate_store>.<table>`` (e.g.
    ``crp_mopdb.T_MPEC``); the Oracle owner is not part of the Impala name,
    so it is not pulled here.

    Args:
        **context: Airflow task context; ``ti`` is required.

    Returns:
        The value returned by ``impala_main``.

    Raises:
        ValueError: If a required XCom value from ``init_step`` is missing.
        Exception: Any error raised by ``impala_main`` is logged and re-raised.
    """
    ti = context["ti"]
    env = ti.xcom_pull(task_ids='init_step', key='env')
    store = ti.xcom_pull(task_ids='init_step', key='store')  # used for logging only
    corporate_store = ti.xcom_pull(task_ids='init_step', key='corporate_store')
    config_path = ti.xcom_pull(task_ids='init_step', key='config_path')
    # For Impala: corporate_store.table_name (e.g., crp_mopdb.T_MPEC)
    # NOT corporate_store.owner.table_name (which would be crp_mopdb.MPEC.T_MPEC - INVALID)
    table_name = ti.xcom_pull(task_ids='init_step', key='p_table_name')  # Just "T_MPEC"

    # The operator for this task uses TriggerRule.ALL_DONE, so it also runs
    # when init_step never published its XComs. Fail fast rather than handing
    # None values to the refresher.
    required = {
        'env': env,
        'corporate_store': corporate_store,
        'config_path': config_path,
        'p_table_name': table_name,
    }
    missing = [key for key, value in required.items() if value is None]
    if missing:
        raise ValueError(
            f"devo_impyla: missing XCom value(s) from init_step: {', '.join(missing)}"
        )

    logging.info(
        "Starting Impyla refresh with env=%s store=%s corporate_store=%s table=%s",
        env, store, corporate_store, table_name
    )
    logging.info("Will execute: INVALIDATE METADATA %s.%s", corporate_store, table_name)
    logging.info("Will execute: COMPUTE STATS %s.%s", corporate_store, table_name)

    try:
        # This should result in queries like:
        #   INVALIDATE METADATA crp_mopdb.T_MPEC
        #   COMPUTE STATS crp_mopdb.T_MPEC
        status = impala_main(config_path, env, table_name, corporate_store)
    except Exception as e:
        logging.error("Error in devo_impyla_task: %s", e)
        raise

    logging.info("Impyla (Devo) task finished successfully. Status: %s", status)
    return status
|
|
|
|
# Step 4 operator: wraps devo_impyla_task.
t3 = PythonOperator(
    python_callable=devo_impyla_task,
    task_id='devo_impyla',
    trigger_rule=TriggerRule.ALL_DONE,  # Continue even if t2 failed
)
|
|
|
|
# -------------------------------------
|
|
# 5) End log table (always executes after t1, t2, t3 complete - regardless of success/failure)
|
|
# -------------------------------------
|
|
def end_log_table_task(**context):
    """Best-effort call of the Oracle ``end_log_table`` procedure.

    Pulls the service name, table owner and table name pushed by
    ``init_step`` and calls ``MRDS_LOADER.DATA_REPLICATOR.end_log_table`` with
    them on a new ``MRDS_LOADER`` connection, then commits. This is a cleanup
    step: every error is logged and swallowed, so the function never raises.

    Args:
        **context: Airflow task context; ``ti`` is required.
    """
    task_instance = context["ti"]

    # Procedure arguments, pulled from XCom in the order the call expects them.
    proc_args = [
        task_instance.xcom_pull(task_ids='init_step', key=xcom_key)
        for xcom_key in ('p_service_name', 'p_table_owner', 'p_table_name')
    ]

    connection = None
    try:
        connection = oraconn.connect('MRDS_LOADER')
        logging.info("Oracle connection established successfully for end_log_table")

        oraconn.run_proc(
            connection,
            'MRDS_LOADER.DATA_REPLICATOR.end_log_table',
            proc_args,
        )
        connection.commit()
        logging.info("end_log_table procedure executed successfully")
    except Exception as err:
        logging.error(f"Error in end_log_table: {err}")
        # Deliberately not re-raised: this is a cleanup task.
        logging.info("Continuing despite end_log_table error (cleanup task)")
    finally:
        if connection:
            try:
                connection.close()
                logging.info("Oracle connection closed for end_log_table")
            except Exception as close_err:
                logging.error(f"Error closing connection in end_log_table: {close_err}")
|
|
|
|
# Step 5 operator: wraps end_log_table_task.
t4 = PythonOperator(
    python_callable=end_log_table_task,
    task_id='end_log_table',
    trigger_rule=TriggerRule.ALL_DONE,  # Run after t1, t2, t3 complete (success or failure)
)
|
|
|
|
# -----------------------------------------------------
|
|
# 6) Check and fail the DAG if any of t1..t3 actually failed
|
|
# This task always runs after t4, but will fail the DAG if needed
|
|
# -----------------------------------------------------
|
|
def fail_if_any_failed(**context):
    """Fail this task if any critical task of the run did not succeed.

    Looks up the task instances of ``start_log_table``, ``export_table`` and
    ``devo_impyla`` on the current DAG run and raises when at least one of
    them is in state ``failed`` or ``upstream_failed``. Task instances that
    cannot be found are ignored.

    Args:
        **context: Airflow task context; ``dag_run`` is required.

    Raises:
        AirflowFailException: If one or more of the checked tasks failed.
    """
    dag_run = context['dag_run']
    check_tasks = ['start_log_table', 'export_table', 'devo_impyla']

    # 'upstream_failed' must count as a failure too: when init_step fails,
    # start_log_table never runs and ends up in that state. Checking only
    # 'failed' would let this task pass in that situation. A tuple (not a set)
    # keeps the membership test a plain equality comparison.
    failure_states = ('failed', 'upstream_failed')

    failed = []
    for tid in check_tasks:
        ti_up = dag_run.get_task_instance(tid)
        if ti_up and ti_up.state in failure_states:
            failed.append(tid)

    if failed:
        error_msg = f"Critical task(s) failed: {', '.join(failed)}. DAG execution failed."
        logging.error(error_msg)
        raise AirflowFailException(error_msg)

    logging.info("All critical tasks completed successfully: %s", check_tasks)
|
|
|
|
# Step 6 operator: wraps fail_if_any_failed.
t5 = PythonOperator(
    python_callable=fail_if_any_failed,
    task_id='fail_if_any_failed',
    trigger_rule=TriggerRule.ALL_DONE,  # Always run after t4
)
|
|
|
|
# ---------
# Task dependencies
# ---------
# Main chain: init -> t1 -> t2 -> t3
init.set_downstream(t1)
t1.set_downstream(t2)
t2.set_downstream(t3)

# t4 waits for t1, t2 and t3 (its ALL_DONE rule lets it run whatever their outcome)
t4.set_upstream([t1, t2, t3])

# t5 follows t4 and decides whether the run must be reported as failed
t5.set_upstream(t4)
|