Files
mars-elt/airflow/ods/lm/balancesheet/dags/w_ODS_LM_BALANCESHEET.py
Grzegorz Michalski 2c225d68ac init
2026-03-02 09:47:35 +01:00

521 lines
20 KiB
Python

# dags/w_ODS_LM_BALANCESHEET.py  (NOTE(review): header previously said m_ODS_...; actual file is w_ODS_... per repo path)
# Idempotent, per-object mtime tracking
import sys
import os
import json
import logging
from pathlib import Path
from datetime import timedelta, datetime, timezone
from email.utils import parsedate_to_datetime
from airflow import DAG
from airflow.models import Variable
from airflow.decorators import task as af_task
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago
from airflow.utils.trigger_rule import TriggerRule
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.operators.empty import EmptyOperator
try:
from airflow.exceptions import AirflowFailException, AirflowSkipException
except Exception:
from airflow.exceptions import AirflowException as AirflowFailException
from airflow.exceptions import AirflowSkipException
# Import libs
sys.path.append('/opt/airflow/python/mrds_common')
sys.path.append('/opt/airflow/src/airflow/dags/ods/exdi')
from mrds.utils.manage_runs import init_workflow as mrds_init_workflow, finalise_workflow as mrds_finalise_workflow
from mrds.core import main as mrds_main
# Derive the DAG id from this file's name (e.g. "w_ODS_LM_BALANCESHEET").
dag_id = Path(__file__).stem

# Default task arguments applied to every task in the DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# MRDS workflow bookkeeping identifiers.
WORKFLOW_CONFIG = {
    "database_name": "ODS",
    "workflow_name": dag_id,
}

# OCI Object Storage coordinates, resolved from the environment.
OCI_NAMESPACE = os.getenv("BUCKET_NAMESPACE")
OCI_BUCKET = os.getenv("INBOX_BUCKET")

# One parse-config YAML shared by every XML file this DAG processes.
CONFIG_YAML = os.getenv(
    "EXDI_SINGLE_CONFIG_YAML",
    "/opt/airflow/src/airflow/dags/ods/lm/balancesheet/config/m_ODS_LM_BALANCESHEET_PARSE.yaml",
)
logging.info("Using EXDI_SINGLE_CONFIG_YAML=%s", CONFIG_YAML)

# Idempotency controls.
REPROCESS = os.getenv("EXDI_REPROCESS", "false").lower() in {"1", "true", "yes"}
LAST_TS_VAR = f"{dag_id}__last_seen_ts"             # legacy watermark (kept for observability)
PROCESSED_SET_VAR = f"{dag_id}__processed_objects"  # legacy: plain list of keys (back-compat only)
PROCESSED_TS_VAR = f"{dag_id}__processed_objects_ts"  # map: object key -> last processed mtime (epoch float)
# Helpers
def _oci_client():
    """
    Create an OCI Object Storage client.
    Order: Resource Principals -> Instance Principals.
    Raises RuntimeError when neither auth mechanism is available.
    """
    import oci

    region = os.getenv("OCI_REGION") or os.getenv("OCI_RESOURCE_PRINCIPAL_REGION") or "eu-frankfurt-1"
    cfg = {"region": region} if region else {}

    # First choice: Resource Principals (functions / container instances).
    try:
        signer = oci.auth.signers.get_resource_principals_signer()
        logging.info("Using OCI Resource Principals signer (region=%s).", cfg.get("region"))
        return oci.object_storage.ObjectStorageClient(cfg, signer=signer)
    except Exception as err:
        logging.info("RP not available: %s", err)

    # Fallback: Instance Principals (compute instances).
    try:
        signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
        logging.info("Using OCI Instance Principals signer (region=%s).", cfg.get("region"))
        return oci.object_storage.ObjectStorageClient(cfg, signer=signer)
    except Exception as err:
        logging.info("IP not available: %s", err)

    logging.error("Neither Resource Principals nor Instance Principals authentication found.")
    raise RuntimeError("Failed to create OCI client")
def _load_yaml(cfg_path: str) -> dict:
    """Parse *cfg_path* as YAML; return {} for an empty file.

    Raises FileNotFoundError when the path does not exist.
    """
    import yaml

    config_file = Path(cfg_path)
    if not config_file.exists():
        raise FileNotFoundError(f"Config YAML not found: {cfg_path}")
    parsed = yaml.safe_load(config_file.read_text())
    return parsed if parsed else {}
# Build config-derived constants directly from YAML.
# On any failure OBJECT_PREFIX ends up None and the poll task fails fast later.
try:
    CONFIG_DATA = _load_yaml(CONFIG_YAML)
    raw_prefix = CONFIG_DATA.get("inbox_prefix")
    if not isinstance(raw_prefix, str) or not raw_prefix.strip():
        raise AirflowFailException("YAML must define 'inbox_prefix' for OBJECT_PREFIX.")
    OBJECT_PREFIX = raw_prefix.strip()
    logging.info("YAML inbox_prefix -> OBJECT_PREFIX: %s", OBJECT_PREFIX)
except Exception as exc:
    logging.error("Failed to resolve OBJECT_PREFIX from YAML %s: %s", CONFIG_YAML, exc)
    OBJECT_PREFIX = None
# New idempotency map (key -> last_processed_ts)
def _load_processed_map() -> dict[str, float]:
    """
    Returns {object_key: last_processed_ts}.
    Back-compat: if old set variable exists (list), treat those keys as ts=0.
    """
    # Preferred storage: JSON object mapping key -> epoch float.
    try:
        stored = json.loads(Variable.get(PROCESSED_TS_VAR, default_var="{}")) or {}
        if isinstance(stored, dict):
            return {key: float(ts) for key, ts in stored.items()}
    except Exception:
        pass
    # Legacy storage: JSON list of keys; migrate as "processed at epoch 0".
    try:
        legacy = json.loads(Variable.get(PROCESSED_SET_VAR, default_var="[]"))
        if isinstance(legacy, list):
            return {key: 0.0 for key in legacy}
    except Exception:
        pass
    return {}
def _save_processed_map(m: dict[str, float]) -> None:
    """Persist the key -> last-processed-mtime map to the Airflow Variable."""
    serialized = json.dumps(m)
    Variable.set(PROCESSED_TS_VAR, serialized)
def _mark_processed_ts(objs: list[tuple[str, float]]):
    """
    Update processed map with list of (object_key, mtime).
    No-op in REPROCESS mode (so reprocessed files stay eligible) or for an empty list.
    """
    if REPROCESS or not objs:
        return
    state = _load_processed_map()
    for key, raw_ts in objs:
        try:
            seen_ts = float(raw_ts)
        except Exception:
            # Unparseable timestamp: leave this key's recorded state untouched.
            continue
        if seen_ts > float(state.get(key, 0.0)):
            state[key] = seen_ts
    _save_processed_map(state)
    logging.info("Processed map updated; size=%d", len(state))
# Object listing (per-key mtime)
def _resolve_object_mtime(client, obj, name: str) -> float:
    """Best-effort mtime (epoch seconds) for one object summary.

    Order: summary time_created -> HEAD Last-Modified -> now() (logged fallback).
    """
    t = getattr(obj, "time_created", None)
    if t:
        try:
            # SDK returns a datetime; tolerate epoch-millis just in case.
            return t.timestamp() if hasattr(t, "timestamp") else float(t) / 1000.0
        except Exception:
            pass
    try:
        head = client.head_object(OCI_NAMESPACE, OCI_BUCKET, name)
        lm = head.headers.get("last-modified") or head.headers.get("Last-Modified")
        if lm:
            dt = parsedate_to_datetime(lm)
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=timezone.utc)
            ts = dt.timestamp()
            logging.debug("Resolved ts via HEAD Last-Modified for %s: %s", name, ts)
            return ts
    except Exception as e:
        logging.warning("head_object failed for %s: %s", name, e)
    # Last resort: treat it as just-arrived so it is picked up at least once.
    ts = datetime.now(timezone.utc).timestamp()
    logging.warning("Object %s missing timestamp; falling back to now=%s", name, ts)
    return ts


def _list_new_xml_objects(prefix: str) -> list[dict]:
    """
    List .xml objects and decide inclusion per-object:
    include if REPROCESS or object_mtime > processed_map.get(object_key, 0.0)
    Returns: [{"name": "<full-key>", "base": "<file.xml>", "mtime": <epoch float>}]
    Raises: AirflowFailException when bucket env vars are not set.
    """
    if not OCI_NAMESPACE or not OCI_BUCKET:
        raise AirflowFailException("BUCKET_NAMESPACE and INBOX_BUCKET must be set")
    client = _oci_client()
    processed_map = _load_processed_map()
    try:
        last_seen = float(Variable.get(LAST_TS_VAR, default_var="0"))
    except Exception:
        last_seen = 0.0
    logging.info("Watermark last_seen=%s; processed_map_count=%d; prefix=%s",
                 last_seen, len(processed_map), prefix)
    # FIX: list_objects returns at most one page (previously objects past the
    # first page were silently ignored); paginate via next_start_with.
    # Requesting timeCreated up front avoids a HEAD round-trip per object.
    all_objects = []
    start = None
    while True:
        resp = client.list_objects(
            OCI_NAMESPACE, OCI_BUCKET,
            prefix=prefix, start=start, fields="name,timeCreated",
        )
        all_objects.extend(resp.data.objects or [])
        start = getattr(resp.data, "next_start_with", None)
        if not start:
            break
    new_items: list[dict] = []
    newest_ts = last_seen
    for o in all_objects:
        name = (o.name or "").strip()
        base = name.rsplit("/", 1)[-1] if name else ""
        logging.info("Processing object: %s", base)
        # Skip folder markers / empty keys
        if not name or name.endswith('/') or not base:
            logging.debug("Skip: folder marker or empty key: %r", name)
            continue
        if not base.lower().endswith(".xml"):
            logging.debug("Skip: not .xml: %r", name)
            continue
        ts = _resolve_object_mtime(client, o, name)
        last_proc_ts = float(processed_map.get(name, 0.0))
        include = REPROCESS or (ts > last_proc_ts)
        logging.info(
            "Decision for %s: obj_ts=%s, last_proc_ts=%s, REPROCESS=%s -> include=%s",
            name, ts, last_proc_ts, REPROCESS, include
        )
        if not include:
            continue
        new_items.append({"name": name, "base": base, "mtime": ts})
        if ts > newest_ts:
            newest_ts = ts
    # Watermark advanced for visibility (optional)
    if not REPROCESS and new_items and newest_ts > last_seen:
        Variable.set(LAST_TS_VAR, str(newest_ts))
        logging.info("Advanced watermark from %s to %s", last_seen, newest_ts)
    new_items.sort(key=lambda x: x["mtime"])  # ascending: oldest processed first
    logging.info("Found %d candidate .xml object(s) under prefix %s", len(new_items), prefix)
    return new_items
# DAG definition
with DAG(
    dag_id=dag_id,
    default_args=default_args,
    description='EXDI workflow (polling): single YAML config for all XML files in OCI',
    schedule_interval=None,  # no cron schedule: triggered manually / externally
    catchup=False,
    max_active_runs=1,
    render_template_as_native_obj=True,
    tags=["EXDI", "MRDS", "ODS", "OCI", "BALANCESHEET"],
) as dag:

    @af_task(task_id="poll_oci_for_xml")
    def poll_oci_for_xml():
        """
        Lists new .xml objects and prepares a workload list.
        Returns {"workload": [{"object": "<key>", "base": "<file.xml>", "mtime": <float>} ...]}
        """
        if not OBJECT_PREFIX:
            raise AirflowFailException("No OCI object prefix configured. Check YAML 'inbox_prefix'.")
        new_objs = _list_new_xml_objects(OBJECT_PREFIX)
        objs_pretty = json.dumps(new_objs, indent=2)
        logging.info("New .xml objects found: %s", objs_pretty)
        print("New .xml objects found:", objs_pretty)
        # Items already carry base + mtime; reshape keys for downstream tasks.
        workload = [
            {"object": item["name"], "base": item["base"], "mtime": item["mtime"]}
            for item in new_objs
        ]
        logging.info("Prepared workload items: %d", len(workload))
        print("Prepared workload:", json.dumps(workload, indent=2))
        return {"workload": workload}
@af_task(task_id="init_workflow")
def init_workflow(polled: dict):
    """Initialize workflow; start MRDS workflow; build per-file task configs."""
    database_name = WORKFLOW_CONFIG["database_name"]
    workflow_name = WORKFLOW_CONFIG["workflow_name"]
    env = os.getenv("MRDS_ENV", "dev")
    # All three DB credentials are mandatory; report every missing one at once.
    creds = {
        "MRDS_LOADER_DB_USER": os.getenv("MRDS_LOADER_DB_USER"),
        "MRDS_LOADER_DB_PASS": os.getenv("MRDS_LOADER_DB_PASS"),
        "MRDS_LOADER_DB_TNS": os.getenv("MRDS_LOADER_DB_TNS"),
    }
    missing = [var_name for var_name, value in creds.items() if not value]
    if missing:
        raise AirflowFailException(f"Missing required env vars: {', '.join(missing)}")
    workload = (polled or {}).get("workload") or []
    # Airflow context gives us the run_id used as the MRDS workflow handle.
    from airflow.operators.python import get_current_context
    ctx = get_current_context()
    run_id = str(ctx['ti'].run_id)
    a_workflow_history_key = mrds_init_workflow(database_name, workflow_name, run_id)
    workflow_context = {
        "run_id": run_id,
        "a_workflow_history_key": a_workflow_history_key
    }
    # One task config per file, numbered sequentially from 1.
    task_configs = [
        {
            "task_name": f"m_ODS_LM_BALANCESHEET_{position}",
            "source_filename": item["base"],  # basename passed to MRDS (adjust if full key needed)
            "config_file": CONFIG_YAML,
        }
        for position, item in enumerate(workload, start=1)
    ]
    bundle = {
        "workflow_history_key": a_workflow_history_key,
        "workflow_context": workflow_context,
        "workload": workload,          # includes object + mtime
        "task_configs": task_configs,  # list-of-dicts for dynamic mapping
        "env": env,
    }
    logging.info("Init complete; workload=%d, tasks=%d", len(workload), len(task_configs))
    return bundle
@af_task(task_id="get_task_configs")
def get_task_configs(init_bundle: dict):
    """Expose the per-file task configs from the init bundle for task mapping."""
    configs = init_bundle["task_configs"]
    return configs
def run_mrds_task(task_name: str, source_filename: str, config_file: str, **context):
    """Run MRDS for a single file (sequential via mapped task with max_active_tis_per_dag=1).

    Pulls workflow_context/workload from init_workflow's XCom, runs MRDS on the
    given file, then records the object's mtime in the processed map so the file
    is not picked up again.
    Raises FileNotFoundError for a missing config and AirflowFailException when
    no workflow_context is available.
    """
    ti = context['ti']
    if not os.path.exists(config_file):
        raise FileNotFoundError(f"Config file not found: {config_file}")
    init_bundle = ti.xcom_pull(task_ids='init_workflow') or {}
    workflow_context = init_bundle.get('workflow_context')
    workload = init_bundle.get('workload') or []
    if not workflow_context:
        raise AirflowFailException("No workflow_context from init_workflow")
    # Resolve full object key + mtime by matching base name from workload.
    full_object_key, object_mtime = None, None
    for w in workload:
        if w.get('base') == source_filename:
            full_object_key = w.get('object')
            object_mtime = w.get('mtime')
            break
    # Print/log the file being processed
    logging.info("%s: picking file %s (object=%s, mtime=%s)",
                 task_name, source_filename, full_object_key or source_filename, object_mtime)
    print(f"{task_name}: picking file {source_filename} (object={full_object_key or source_filename}, mtime={object_mtime})")
    try:
        # NOTE: if MRDS expects full URI, change 'source_filename' to 'full_object_key'
        mrds_main(
            workflow_context,
            source_filename,  # or full_object_key if required in your env
            config_file,
            generate_workflow_context=False
        )
    except Exception:
        logging.exception("%s: MRDS failed on %s", task_name, source_filename)
        raise
    # Mark processed with the mtime we saw during poll.
    # FIX: compare against None rather than truthiness so a legitimate mtime of
    # 0.0 (e.g. keys migrated from the legacy processed set) is still recorded.
    if full_object_key and object_mtime is not None:
        _mark_processed_ts([(full_object_key, object_mtime)])
    ti.xcom_push(key='task_status', value='SUCCESS')
    logging.info("%s: success", task_name)
    return "SUCCESS"
def finalise_workflow_task(**context):
    """Finalize workflow across all per-file tasks (mapped)."""
    from airflow.utils.state import State
    ti = context['ti']
    dag_run = context['dag_run']
    init_bundle = ti.xcom_pull(task_ids='init_workflow') or {}
    history_key = init_bundle.get('workflow_history_key')
    if history_key is None:
        raise AirflowFailException("No workflow history key; cannot finalise workflow")
    bad_states = {State.FAILED, State.UPSTREAM_FAILED}
    mapped = [inst for inst in dag_run.get_task_instances()
              if inst.task_id == "m_ODS_LM_BALANCESHEET"]
    # No mapped instances at all means there were no files: still a success.
    if not mapped:
        mrds_finalise_workflow(history_key, "Y")
        logging.info("Finalised workflow %s as SUCCESS (no files)", history_key)
        return
    failed_idxs = [getattr(inst, "map_index", None) for inst in mapped if inst.state in bad_states]
    if not failed_idxs:
        mrds_finalise_workflow(history_key, "Y")
        logging.info("Finalised workflow %s as SUCCESS", history_key)
        return
    mrds_finalise_workflow(history_key, "N")
    logging.error("Finalised workflow %s as FAILED (failed map indexes=%s)", history_key, failed_idxs)
    raise AirflowFailException(f"Workflow failed for mapped indexes: {failed_idxs}")
def check_success_for_mopdb(**context):
    """Check if all processing tasks succeeded before triggering MOPDB."""
    from airflow.utils.state import State
    try:
        ti = context['ti']
        dag_run = context['dag_run']
        reasons = []
        # The finalize step must not have failed.
        finalize_ti = dag_run.get_task_instance('finalize_workflow')
        if finalize_ti.state == State.FAILED:
            reasons.append("finalize_workflow failed")
        # Every mapped per-file task must be free of (upstream) failures.
        mapped = [inst for inst in dag_run.get_task_instances()
                  if inst.task_id == "m_ODS_LM_BALANCESHEET"]
        for inst in mapped:
            if inst.state in {State.FAILED, State.UPSTREAM_FAILED}:
                idx = getattr(inst, 'map_index', 'unknown')
                reasons.append(f"Processing task failed at index {idx}")
        if reasons:
            msg = f"Tasks failed - skipping MOPDB trigger: {', '.join(reasons)}"
            logging.info(msg)
            raise AirflowSkipException(msg)
        # Nothing to trigger when no files were processed at all.
        if not mapped or all(inst.state == State.SKIPPED for inst in mapped):
            msg = "All processing tasks were skipped (no files to process) - skipping MOPDB trigger"
            logging.info(msg)
            raise AirflowSkipException(msg)
        logging.info("All tasks completed successfully - proceeding to trigger MOPDB")
        return "SUCCESS"
    except AirflowSkipException:
        raise
    except Exception as e:
        # Any unexpected error downgrades to a skip so MOPDB is never triggered blindly.
        logging.error(f"Error checking success for MOPDB: {e}", exc_info=True)
        raise AirflowSkipException(f"Error checking success - skipping MOPDB trigger: {e}")
# Operators & dependencies (defined inside the `with DAG(...)` context)
poll_task = poll_oci_for_xml()
init_out = init_workflow(poll_task)
task_cfgs = get_task_configs(init_out)

# One mapped instance per file; max_active_tis_per_dag=1 forces sequential runs.
@af_task(task_id="m_ODS_LM_BALANCESHEET", max_active_tis_per_dag=1)
def mapped_run(task_name: str, source_filename: str, config_file: str, **context):
    """Mapped wrapper delegating a single file to run_mrds_task."""
    return run_mrds_task(task_name=task_name, source_filename=source_filename, config_file=config_file, **context)

per_file = mapped_run.expand_kwargs(task_cfgs)

finalize_workflow = PythonOperator(
    task_id='finalize_workflow',
    python_callable=finalise_workflow_task,
    provide_context=True,
    trigger_rule=TriggerRule.ALL_DONE,  # runs whether the mapped tasks passed or failed
    retries=0,
)

check_mopdb = PythonOperator(
    task_id='check_success_for_mopdb',
    python_callable=check_success_for_mopdb,
    provide_context=True,
    trigger_rule=TriggerRule.ALL_DONE,
    retries=0,
)

trigger_mopdb = TriggerDagRunOperator(
    task_id="Trigger_w_MOPDB_LM_BALANCESHEET",
    trigger_dag_id="w_MOPDB_LM_BALANCESHEET",
    conf={
        "source_dag": dag_id,
        "upstream_run_id": "{{ run_id }}",
        "objects": "{{ (ti.xcom_pull(task_ids='poll_oci_for_xml')['workload'] | map(attribute='object') | list) if ti.xcom_pull(task_ids='poll_oci_for_xml') else [] }}",
        "workflow_history_key": "{{ (ti.xcom_pull(task_ids='init_workflow')['workflow_history_key']) if ti.xcom_pull(task_ids='init_workflow') else None }}"
    },
    wait_for_completion=False,  # fire-and-forget: do not block on the downstream DAG
    trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS,  # only fires when check_mopdb succeeds
    retries=0,
)

all_good = EmptyOperator(
    task_id="All_went_well",
    trigger_rule=TriggerRule.ALL_DONE,  # terminal marker; always runs
)

# Linear chain with the success check gating the MOPDB trigger.
poll_task >> init_out >> task_cfgs >> per_file >> finalize_workflow >> check_mopdb >> trigger_mopdb >> all_good

logging.info(
    "EXDI DAG ready: inbox_prefix=%s; using per-object processed ts map %s.",
    OBJECT_PREFIX, PROCESSED_TS_VAR
)