This commit is contained in:
Grzegorz Michalski
2026-03-02 09:47:35 +01:00
commit 2c225d68ac
715 changed files with 130067 additions and 0 deletions

View File

View File

@@ -0,0 +1,103 @@
encoding_type: latin1
tmpdir: /tmp
inbox_prefix: INBOX/CSDB/FullRatingsDissemination
archive_prefix: ARCHIVE/CSDB/FullRatingsDissemination
workflow_name: w_ODS_CSDB_RATINGS_FULL
# File format settings.
# NOTE(review): validation_schema_path below is the YAML string "None",
# not a YAML null — confirm the config loader expects this literal.
validation_schema_path: None
file_type: csv
tasks:
- task_name: m_ODS_CSDB_INSTR_DESC_FULL_PARSE
ods_prefix: INBOX/CSDB/FullRatingsDissemination/CSDB_INSTR_DESC_FULL
output_table: CSDB_INSTR_DESC_FULL
output_columns:
- type: 'a_key'
column_header: 'A_KEY'
- type: 'workflow_key'
column_header: 'A_WORKFLOW_HISTORY_KEY'
- type: 'csv_header'
value: 'RDB_INSTR_ID'
column_header: 'IDIRINSTRUMENT'
- type: 'csv_header'
value: 'ISIN'
column_header: 'ISIN'
- type: 'csv_header'
value: 'MOO_INSTR_ID'
column_header: 'MOO_INSTR_ID'
- type: 'csv_header'
value: 'SNP_INSTR_ID'
column_header: 'SNP_INSTR_ID'
- type: 'csv_header'
value: 'FTC_INSTR_ID'
column_header: 'FITCH_IDENTIFIER'
- type: 'csv_header'
value: 'DBR_INSTR_ID'
column_header: 'DBRS_IDENTIFIER'
- type: 'csv_header'
value: 'EA_STATUS'
column_header: 'EA_STATUS'
- type: 'csv_header'
value: 'IS_TMS'
column_header: 'IS_TMS'
- type: 'csv_header'
value: 'DBRS_COVERED_BOND_PROGRAM_ID'
column_header: 'DBRS_COVERED_BOND_PROGRAM'
- type: 'csv_header'
value: 'FITCH_PROGRAM_ID'
column_header: 'FITCH_PRG_IDENTIFIER'
- type: 'csv_header'
value: 'MOO_DEAL_NUMBER'
column_header: 'MOO_DEAL_NUMBER'
- type: 'csv_header'
value: 'SNP_PROGRAM_ID'
column_header: 'SNP_PROGRAM_ID'
- type: 'csv_header'
value: 'DBRS_DEBT_TYPE'
column_header: 'IDIRDEBTTYPE'
- type: 'csv_header'
value: 'SNP_DEBT_TYPE'
column_header: 'SNP_DEBT_TYPE'
- type: 'csv_header'
value: 'MOODY_SENIORITY'
column_header: 'MOODY_SENIORITY'
- type: 'csv_header'
value: 'FITCH_DEBT_LEVEL_CODE'
column_header: 'FITCH_DEBT_LEVEL_CODE'
- type: 'csv_header'
value: 'DBRS_RANK_TYPE'
column_header: 'DBRS_RANK_TYPE'
- type: 'csv_header'
value: 'DBRS_SECURITY_TYPE'
column_header: 'DBRS_SECURITY_TYPE'
- type: 'csv_header'
value: 'SCO_DEBT_TYPE'
column_header: 'SCO_DEBT_TYPE'
- type: 'csv_header'
value: 'SCO_INSTR_ID'
column_header: 'SCO_INSTR_ID'
- type: 'csv_header'
value: 'SCO_COVERED_BOND_PROGRAM'
column_header: 'SCO_COVERED_BOND_PROGRAM'
- type: 'csv_header'
value: 'SCO_CATEGORY'
column_header: 'SCO_CATEGORY'
- type: 'csv_header'
value: 'PLACEHOLDER15'
column_header: 'PLACEHOLDER15'
- type: 'csv_header'
value: 'PLACEHOLDER16'
column_header: 'PLACEHOLDER16'
- type: 'csv_header'
value: 'PLACEHOLDER17'
column_header: 'PLACEHOLDER17'
- type: 'csv_header'
value: 'PLACEHOLDER18'
column_header: 'PLACEHOLDER18'
- type: 'csv_header'
value: 'PLACEHOLDER19'
column_header: 'PLACEHOLDER19'
- type: 'csv_header'
value: 'PLACEHOLDER20'
column_header: 'PLACEHOLDER20'

View File

@@ -0,0 +1,130 @@
encoding_type: latin1
# Global configurations
tmpdir: /tmp
inbox_prefix: INBOX/CSDB/FullRatingsDissemination
archive_prefix: ARCHIVE/CSDB/FullRatingsDissemination
workflow_name: w_ODS_CSDB_RATINGS_FULL
validation_schema_path: None
file_type: csv
tasks:
- task_name: m_ODS_CSDB_INSTR_RAT_FULL_PARSE
ods_prefix: INBOX/CSDB/FullRatingsDissemination/CSDB_INSTR_RAT_FULL
output_table: CSDB_INSTR_RAT_FULL
output_columns:
- type: 'a_key'
column_header: 'A_KEY'
- type: 'workflow_key'
column_header: 'A_WORKFLOW_HISTORY_KEY'
- type: 'csv_header'
value: 'RDB_INSTR_ID'
column_header: 'RDB_INSTR_ID'
- type: 'csv_header'
value: 'SOURCE'
column_header: 'SOURCE'
- type: 'csv_header'
value: 'RATING_SCHEME'
column_header: 'RATING_SCHEME'
- type: 'csv_header'
value: 'RATING'
column_header: 'RATING'
- type: 'csv_header'
value: 'RATING_DATE'
column_header: 'RATING_DATE'
- type: 'csv_header'
value: 'TIME_HORIZON'
column_header: 'TIME_HORIZON'
- type: 'csv_header'
value: 'CURRENCY_TYPE'
column_header: 'CURRENCY_TYPE'
- type: 'csv_header'
value: 'NOTES'
column_header: 'NOTES'
- type: 'csv_header'
value: 'VALID_FROM'
column_header: 'VALID_FROM'
- type: 'csv_header'
value: 'VALID_UNTIL'
column_header: 'VALID_UNTIL'
- type: 'csv_header'
value: 'RDB_RATINGS_ID'
column_header: 'RDB_RATINGS_ID'
# NOTE(review): OUTLOOK/WATCHLIST are cross-mapped below (source OUTLOOK ->
# WATCHLIST column and source WATCHLIST -> OUTLOOK column), unlike the
# issuer ratings config which maps them straight through — confirm the swap
# is intentional for the instrument ratings feed.
- type: 'csv_header'
value: 'OUTLOOK'
column_header: 'WATCHLIST'
- type: 'csv_header'
value: 'OUTLOOK_DATE'
column_header: 'WATCHLIST_DATE'
- type: 'csv_header'
value: 'WATCHLIST'
column_header: 'OUTLOOK'
- type: 'csv_header'
value: 'WATCHLIST_DATE'
column_header: 'OUTLOOK_DATE'
- type: 'csv_header'
value: 'RATING_ACTION'
column_header: 'RATING_ACTION'
- type: 'csv_header'
value: 'RATING_ACTION_DATE'
column_header: 'RATING_ACTION_DATE'
- type: 'csv_header'
value: 'IS_PRELIMINARY'
column_header: 'IS_PRELIMINARY'
- type: 'csv_header'
value: 'RATING_RAW'
column_header: 'RATING_RAW'
- type: 'csv_header'
value: 'RATING_TYPE'
column_header: 'RATING_TYPE'
- type: 'csv_header'
value: 'ENDORSEMENT_INDICATOR'
column_header: 'ENDORSEMENT_INDICATOR'
- type: 'csv_header'
value: 'LAST_REVIEW_DATE'
column_header: 'LAST_REVIEW_DATE'
- type: 'csv_header'
value: 'PLACEHOLDER6'
column_header: 'PLACEHOLDER6'
- type: 'csv_header'
value: 'PLACEHOLDER7'
column_header: 'PLACEHOLDER7'
- type: 'csv_header'
value: 'PLACEHOLDER8'
column_header: 'PLACEHOLDER8'
- type: 'csv_header'
value: 'PLACEHOLDER9'
column_header: 'PLACEHOLDER9'
- type: 'csv_header'
value: 'PLACEHOLDER10'
column_header: 'PLACEHOLDER10'
- type: 'csv_header'
value: 'PLACEHOLDER11'
column_header: 'PLACEHOLDER11'
- type: 'csv_header'
value: 'PLACEHOLDER12'
column_header: 'PLACEHOLDER12'
- type: 'csv_header'
value: 'PLACEHOLDER13'
column_header: 'PLACEHOLDER13'
- type: 'csv_header'
value: 'PLACEHOLDER14'
column_header: 'PLACEHOLDER14'
- type: 'csv_header'
value: 'PLACEHOLDER15'
column_header: 'PLACEHOLDER15'
- type: 'csv_header'
value: 'PLACEHOLDER16'
column_header: 'PLACEHOLDER16'
- type: 'csv_header'
value: 'PLACEHOLDER17'
column_header: 'PLACEHOLDER17'
- type: 'csv_header'
value: 'PLACEHOLDER18'
column_header: 'PLACEHOLDER18'
- type: 'csv_header'
value: 'PLACEHOLDER19'
column_header: 'PLACEHOLDER19'
- type: 'csv_header'
value: 'PLACEHOLDER20'
column_header: 'PLACEHOLDER20'

View File

@@ -0,0 +1,106 @@
encoding_type: latin1
# Global configurations
tmpdir: /tmp
inbox_prefix: INBOX/CSDB/FullRatingsDissemination
archive_prefix: ARCHIVE/CSDB/FullRatingsDissemination
workflow_name: w_ODS_CSDB_RATINGS_FULL
validation_schema_path: None
file_type: csv
tasks:
- task_name: m_ODS_CSDB_ISSUER_DESC_FULL_PARSE
ods_prefix: INBOX/CSDB/FullRatingsDissemination/CSDB_ISSUER_DESC_FULL
output_table: CSDB_ISSUER_DESC_FULL
output_columns:
- type: 'a_key'
column_header: 'A_KEY'
- type: 'workflow_key'
column_header: 'A_WORKFLOW_HISTORY_KEY'
- type: 'csv_header'
value: 'RDB_ISSUER_ID'
column_header: 'RDB_ISSUER_ID'
- type: 'csv_header'
value: 'ISSUER_NAME'
column_header: 'ISSUERNAME'
- type: 'csv_header'
value: 'COUNTRY_DOMICILE'
column_header: 'COUNTRY_DOMICILE'
- type: 'csv_header'
value: 'IS_SOVEREIGN'
column_header: 'IS_SOVEREIGN'
- type: 'csv_header'
value: 'MOO_ISSUER_ID'
column_header: 'MOODY_IDENTIFIER'
- type: 'csv_header'
value: 'SNP_ISSUER_ID'
column_header: 'SNP_ISSUER_ID'
- type: 'csv_header'
value: 'FTC_ISSUER_ID'
column_header: 'FITCH_IDENTIFIER'
- type: 'csv_header'
value: 'DBR_ISSUER_ID'
column_header: 'DBRS_IDENTIFIER'
- type: 'csv_header'
value: 'LEI_ISSUER_ID'
column_header: 'LEI_ISSUER_ID'
- type: 'csv_header'
value: 'RIAD_CODE'
column_header: 'RIAD_CODE'
- type: 'csv_header'
value: 'RIAD_OUID'
column_header: 'RIAD_OUID'
- type: 'csv_header'
value: 'CLASH_GROUP_STATUS'
column_header: 'CLASH_GROUP_STATUS'
- type: 'csv_header'
value: 'SCO_ISSUER_ID'
column_header: 'SCO_ISSUER_ID'
- type: 'csv_header'
value: 'PLACEHOLDER5'
column_header: 'PLACEHOLDER5'
- type: 'csv_header'
value: 'PLACEHOLDER6'
column_header: 'PLACEHOLDER6'
- type: 'csv_header'
value: 'PLACEHOLDER7'
column_header: 'PLACEHOLDER7'
- type: 'csv_header'
value: 'PLACEHOLDER8'
column_header: 'PLACEHOLDER8'
- type: 'csv_header'
value: 'PLACEHOLDER9'
column_header: 'PLACEHOLDER9'
- type: 'csv_header'
value: 'PLACEHOLDER10'
column_header: 'PLACEHOLDER10'
- type: 'csv_header'
value: 'PLACEHOLDER11'
column_header: 'PLACEHOLDER11'
- type: 'csv_header'
value: 'PLACEHOLDER12'
column_header: 'PLACEHOLDER12'
- type: 'csv_header'
value: 'PLACEHOLDER13'
column_header: 'PLACEHOLDER13'
- type: 'csv_header'
value: 'PLACEHOLDER14'
column_header: 'PLACEHOLDER14'
- type: 'csv_header'
value: 'PLACEHOLDER15'
column_header: 'PLACEHOLDER15'
- type: 'csv_header'
value: 'PLACEHOLDER16'
column_header: 'PLACEHOLDER16'
- type: 'csv_header'
value: 'PLACEHOLDER17'
column_header: 'PLACEHOLDER17'
- type: 'csv_header'
value: 'PLACEHOLDER18'
column_header: 'PLACEHOLDER18'
- type: 'csv_header'
value: 'PLACEHOLDER19'
column_header: 'PLACEHOLDER19'
- type: 'csv_header'
value: 'PLACEHOLDER20'
column_header: 'PLACEHOLDER20'

View File

@@ -0,0 +1,131 @@
encoding_type: latin1
# Global configurations
tmpdir: /tmp
inbox_prefix: INBOX/CSDB/FullRatingsDissemination
archive_prefix: ARCHIVE/CSDB/FullRatingsDissemination
workflow_name: w_ODS_CSDB_RATINGS_FULL
validation_schema_path: None
file_type: csv
tasks:
- task_name: m_ODS_CSDB_ISSUER_RAT_FULL_PARSE
ods_prefix: INBOX/CSDB/FullRatingsDissemination/CSDB_ISSUER_RAT_FULL
output_table: CSDB_ISSUER_RAT_FULL
output_columns:
- type: 'a_key'
column_header: 'A_KEY'
- type: 'workflow_key'
column_header: 'A_WORKFLOW_HISTORY_KEY'
- type: 'csv_header'
value: 'RDB_ISSUER_ID'
column_header: 'RDB_ISSUER_ID'
- type: 'csv_header'
value: 'SOURCE'
column_header: 'SOURCE'
- type: 'csv_header'
value: 'RATING_SCHEME'
column_header: 'RATING_SCHEME'
- type: 'csv_header'
value: 'RATING'
column_header: 'RATING'
- type: 'csv_header'
value: 'RATING_DATE'
column_header: 'RATING_DATE'
- type: 'csv_header'
value: 'TIME_HORIZON'
column_header: 'TIME_HORIZON'
- type: 'csv_header'
value: 'CURRENCY_TYPE'
column_header: 'CURRENCY_TYPE'
- type: 'csv_header'
value: 'NOTES'
column_header: 'NOTES'
- type: 'csv_header'
value: 'VALID_FROM'
column_header: 'VALID_FROM'
- type: 'csv_header'
value: 'VALID_UNTIL'
column_header: 'VALID_UNTIL'
- type: 'csv_header'
value: 'RDB_RATINGS_ID'
column_header: 'RDB_RATINGS_ID'
- type: 'csv_header'
value: 'OUTLOOK'
column_header: 'OUTLOOK'
- type: 'csv_header'
value: 'OUTLOOK_DATE'
column_header: 'OUTLOOK_DATE'
- type: 'csv_header'
value: 'WATCHLIST'
column_header: 'WATCHLIST'
- type: 'csv_header'
value: 'WATCHLIST_DATE'
column_header: 'WATCHLIST_DATE'
- type: 'csv_header'
value: 'RATING_ACTION'
column_header: 'RATING_ACTION'
- type: 'csv_header'
value: 'RATING_ACTION_DATE'
column_header: 'RATING_ACTION_DATE'
- type: 'csv_header'
value: 'IS_PRELIMINARY'
column_header: 'IS_PRELIMINARY'
- type: 'csv_header'
value: 'RATING_RAW'
column_header: 'RATING_RAW'
- type: 'csv_header'
value: 'RATING_TYPE'
column_header: 'RATING_TYPE'
- type: 'csv_header'
value: 'ENDORSEMENT_INDICATOR'
column_header: 'ENDORSEMENT_INDICATOR'
- type: 'csv_header'
value: 'LAST_REVIEW_DATE'
column_header: 'LAST_REVIEW_DATE'
- type: 'csv_header'
value: 'PLACEHOLDER6'
column_header: 'PLACEHOLDER6'
- type: 'csv_header'
value: 'PLACEHOLDER7'
column_header: 'PLACEHOLDER7'
- type: 'csv_header'
value: 'PLACEHOLDER8'
column_header: 'PLACEHOLDER8'
- type: 'csv_header'
value: 'PLACEHOLDER9'
column_header: 'PLACEHOLDER9'
- type: 'csv_header'
value: 'PLACEHOLDER10'
column_header: 'PLACEHOLDER10'
- type: 'csv_header'
value: 'PLACEHOLDER11'
column_header: 'PLACEHOLDER11'
- type: 'csv_header'
value: 'PLACEHOLDER12'
column_header: 'PLACEHOLDER12'
- type: 'csv_header'
value: 'PLACEHOLDER13'
column_header: 'PLACEHOLDER13'
- type: 'csv_header'
value: 'PLACEHOLDER14'
column_header: 'PLACEHOLDER14'
- type: 'csv_header'
value: 'PLACEHOLDER15'
column_header: 'PLACEHOLDER15'
- type: 'csv_header'
value: 'PLACEHOLDER16'
column_header: 'PLACEHOLDER16'
- type: 'csv_header'
value: 'PLACEHOLDER17'
column_header: 'PLACEHOLDER17'
- type: 'csv_header'
value: 'PLACEHOLDER18'
column_header: 'PLACEHOLDER18'
- type: 'csv_header'
value: 'PLACEHOLDER19'
column_header: 'PLACEHOLDER19'
- type: 'csv_header'
value: 'PLACEHOLDER20'
column_header: 'PLACEHOLDER20'

View File

@@ -0,0 +1,420 @@
import sys
import os
import json
import logging
from pathlib import Path
from datetime import timedelta, datetime, timezone
from email.utils import parsedate_to_datetime
from airflow import DAG
from airflow.models import Variable
from airflow.decorators import task as af_task
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.utils.dates import days_ago
from airflow.utils.trigger_rule import TriggerRule
from airflow.operators.python import get_current_context
try:
from airflow.exceptions import AirflowFailException, AirflowSkipException
except Exception:
from airflow.exceptions import AirflowException as AirflowFailException
from airflow.exceptions import AirflowSkipException
dag_id = "w_ODS_CSDB_RATINGS_FULL_COORDINATOR"

# Standard Airflow task defaults for this DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Object Storage location of the inbound zip files.
OCI_NAMESPACE = os.getenv("BUCKET_NAMESPACE")
OCI_BUCKET = os.getenv("INBOX_BUCKET")
OBJECT_PREFIX = os.getenv("OBJECT_PREFIX", "csdb/ratings/full/")

# Truthy values of CSDB_REPROCESS enable reprocessing behaviour.
# NOTE(review): REPROCESS is defined but not referenced anywhere in this
# DAG — confirm whether it is still needed.
REPROCESS = (os.getenv("CSDB_REPROCESS", "false").lower() in ("1", "true", "yes"))

# Airflow Variable names used to persist listing/processing state across runs.
LAST_TS_VAR = f"{dag_id}__last_seen_ts"
PROCESSED_TS_VAR = f"{dag_id}__processed_objects_ts"
def _oci_client():
    """Build an OCI ObjectStorageClient.

    Tries Resource Principals authentication first, then falls back to
    Instance Principals; raises RuntimeError when neither signer can be
    created. Client construction is kept inside each try so a constructor
    failure also falls through to the next auth method.
    """
    import oci  # imported lazily so the module parses without the OCI SDK
    region = os.getenv("OCI_REGION") or os.getenv("OCI_RESOURCE_PRINCIPAL_REGION") or "eu-frankfurt-1"
    try:
        # Resource Principals: workload identity inside OCI (e.g. OKE/Functions).
        rp_signer = oci.auth.signers.get_resource_principals_signer()
        cfg = {"region": region} if region else {}
        logging.info("Using OCI Resource Principals signer (region=%s).", cfg.get("region"))
        return oci.object_storage.ObjectStorageClient(cfg, signer=rp_signer)
    except Exception as e:
        logging.info("RP not available: %s", e)
    try:
        # Fall back to Instance Principals (compute-instance identity).
        ip_signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
        cfg = {"region": region} if region else {}
        logging.info("Using OCI Instance Principals signer (region=%s).", cfg.get("region"))
        return oci.object_storage.ObjectStorageClient(cfg, signer=ip_signer)
    except Exception as e:
        logging.info("IP not available: %s", e)
    logging.error("Neither Resource Principals nor Instance Principals authentication found.")
    raise RuntimeError("Failed to create OCI client")
def _load_processed_map() -> dict[str, float]:
    """Load the processed-objects map from the Airflow Variable.

    Returns {object_name: mtime_of_last_processed_version}. Any error
    (missing/corrupt Variable, non-numeric values) deliberately degrades to
    an empty map so a broken state Variable never blocks listing.
    """
    try:
        raw = Variable.get(PROCESSED_TS_VAR, default_var="{}")
        m = json.loads(raw) or {}
        if isinstance(m, dict):
            return {k: float(v) for k, v in m.items()}
    except Exception:
        # Best-effort: fall through to the empty map.
        pass
    return {}
def _list_all_zip_objects(include_processed: bool = False) -> list[dict]:
    """List all STC-FullRatingsDissemination zip files under OBJECT_PREFIX.

    Returns dicts of {name, base, mtime, is_processed} sorted by mtime
    ascending (oldest first).

    Fixes vs. the previous version:
    - paginates through list_objects (the API returns at most one page per
      call; objects beyond the first page were silently dropped),
    - requests the timeCreated field explicitly so the per-object
      head_object fallback is rarely needed,
    - always loads the processed map so is_processed is labelled correctly
      even when include_processed=True (previously every item was labelled
      unprocessed in that case, breaking list_only status output).

    ``include_processed`` is kept for interface compatibility; callers
    filter on is_processed themselves.
    """
    if not OCI_NAMESPACE or not OCI_BUCKET:
        raise AirflowFailException("BUCKET_NAMESPACE and INBOX_BUCKET must be set")
    client = _oci_client()
    processed_map = _load_processed_map()
    all_items: list[dict] = []
    next_start = None
    while True:
        resp = client.list_objects(
            OCI_NAMESPACE,
            OCI_BUCKET,
            prefix=OBJECT_PREFIX,
            fields="name,timeCreated",
            start=next_start,
        )
        for o in (resp.data.objects or []):
            name = (o.name or "").strip()
            base = name.rsplit("/", 1)[-1] if name else ""
            if not name or name.endswith('/') or not base:
                continue  # skip folder placeholders / empty names
            if not ("STC-FullRatingsDissemination" in base and base.lower().endswith(".zip")):
                continue  # only the ratings dissemination zips are relevant
            # Resolve the object's timestamp: time_created first, then a
            # head_object Last-Modified fallback, then "now" as last resort.
            ts = None
            t = getattr(o, "time_created", None)
            if t:
                try:
                    ts = t.timestamp() if hasattr(t, "timestamp") else float(t) / 1000.0
                except Exception:
                    ts = None
            if ts is None:
                try:
                    head = client.head_object(OCI_NAMESPACE, OCI_BUCKET, name)
                    lm = head.headers.get("last-modified") or head.headers.get("Last-Modified")
                    if lm:
                        dt = parsedate_to_datetime(lm)
                        if dt.tzinfo is None:
                            dt = dt.replace(tzinfo=timezone.utc)
                        ts = dt.timestamp()
                except Exception as e:
                    logging.warning("head_object failed for %s: %s", name, e)
            if ts is None:
                ts = datetime.now(timezone.utc).timestamp()
            # An object counts as processed when its mtime is not newer than
            # the recorded processing timestamp.
            last_proc_ts = float(processed_map.get(name, 0.0))
            is_processed = (ts <= last_proc_ts) if processed_map else False
            all_items.append({
                "name": name,
                "base": base,
                "mtime": ts,
                "is_processed": is_processed,
            })
        next_start = getattr(resp.data, "next_start_with", None)
        if not next_start:
            break
    # Sort by timestamp (oldest first)
    all_items.sort(key=lambda x: x["mtime"])
    return all_items
def _list_new_zip_objects() -> list[dict]:
    """Return only the zip objects that have not been processed yet."""
    candidates = _list_all_zip_objects(include_processed=False)
    pending = [entry for entry in candidates if not entry.get("is_processed", False)]
    logging.info("Found %d new STC-FullRatingsDissemination zip file(s) (sorted oldest to newest)", len(pending))
    return pending
def _find_specific_zip(filename_pattern: str) -> dict:
    """Locate one zip object whose name matches ``filename_pattern``.

    Exact matches (on the base name or the full object name) take priority
    over case-insensitive substring matches. Raises AirflowFailException
    when nothing matches.
    """
    candidates = _list_all_zip_objects(include_processed=True)
    exact = next(
        (c for c in candidates if filename_pattern in (c["base"], c["name"])),
        None,
    )
    if exact is not None:
        logging.info("Found exact match: %s", exact["base"])
        return exact
    needle = filename_pattern.lower()
    partial = next((c for c in candidates if needle in c["base"].lower()), None)
    if partial is not None:
        logging.info("Found partial match: %s", partial["base"])
        return partial
    raise AirflowFailException(f"No zip file found matching pattern: {filename_pattern}")
# Coordinator DAG: scans the inbox bucket for ratings zip files and triggers
# one w_ODS_CSDB_RATINGS_FULL_CORE run per file.
with DAG(
    dag_id=dag_id,
    default_args=default_args,
    description='CSDB Ratings Full Coordinator: Lists and triggers processing for zip files',
    schedule_interval="0 */6 * * *",  # Every 6 hours, adjust as needed
    catchup=False,
    max_active_runs=1,  # one coordinator scan at a time
    render_template_as_native_obj=True,
    tags=["CSDB", "COORDINATOR", "ODS", "OCI", "RATINGS"],
) as dag:
@af_task(task_id="determine_processing_mode")
def determine_processing_mode(**context):
"""
Determine what to process based on dag_run configuration.
Configuration options:
1. No config or mode='all': Process all new zip files
2. mode='specific' + filename='xxx': Process specific zip file
3. mode='reprocess_all': Reprocess all zip files (including already processed)
4. mode='list_only': Just list available files without processing
5. filenames=['file1.zip', 'file2.zip']: Process specific list of files
"""
conf = context.get('dag_run').conf or {}
mode = conf.get('mode', 'all')
filename = conf.get('filename')
filenames = conf.get('filenames', [])
force_reprocess = conf.get('force_reprocess', False)
limit = conf.get('limit') # Limit number of files to process
logging.info("Processing mode: %s", mode)
logging.info("Configuration: %s", json.dumps(conf, indent=2))
result = {
"mode": mode,
"filename": filename,
"filenames": filenames,
"force_reprocess": force_reprocess,
"limit": limit
}
return result
    @af_task(task_id="list_zip_files")
    def list_zip_files(mode_config: dict):
        """List zip files based on the processing mode.

        Supported modes: list_only (log inventory, then skip), specific,
        specific_list, reprocess_all, date_range, and the default 'all'
        (new files only). Raises AirflowSkipException to short-circuit the
        downstream trigger task when there is nothing to process.
        """
        mode = mode_config.get("mode", "all")
        filename = mode_config.get("filename")
        filenames = mode_config.get("filenames", [])
        force_reprocess = mode_config.get("force_reprocess", False)
        limit = mode_config.get("limit")
        zip_files = []
        if mode == "list_only":
            # Just list all files for information
            all_files = _list_all_zip_objects(include_processed=True)
            logging.info("=== Available ZIP Files ===")
            for idx, f in enumerate(all_files, 1):
                status = "PROCESSED" if f.get("is_processed") else "NEW"
                logging.info("%d. [%s] %s (mtime: %s)",
                             idx, status, f["base"],
                             datetime.fromtimestamp(f["mtime"]).isoformat())
            # Skip (not fail) so the run reads as intentionally short-circuited.
            raise AirflowSkipException("List only mode - no processing triggered")
        elif mode == "specific":
            # Process a specific file
            if not filename:
                raise AirflowFailException("mode='specific' requires 'filename' parameter")
            zip_file = _find_specific_zip(filename)
            zip_files = [zip_file]
            logging.info("Processing specific file: %s", zip_file["base"])
        elif mode == "specific_list":
            # Process a list of specific files; unresolvable names are logged
            # and skipped, and we only fail if none resolved.
            if not filenames:
                raise AirflowFailException("mode='specific_list' requires 'filenames' parameter")
            for fn in filenames:
                try:
                    zip_file = _find_specific_zip(fn)
                    zip_files.append(zip_file)
                except Exception as e:
                    logging.warning("Could not find file %s: %s", fn, e)
            if not zip_files:
                raise AirflowFailException("None of the specified files were found")
            logging.info("Processing %d specific files", len(zip_files))
        elif mode == "reprocess_all":
            # Reprocess all files (including already processed)
            all_files = _list_all_zip_objects(include_processed=True)
            zip_files = all_files
            logging.info("Reprocessing all %d files", len(zip_files))
        elif mode == "date_range":
            # Process files within a date range.
            # NOTE(review): fromisoformat on a naive ISO string produces a
            # local-time timestamp while object mtimes are UTC-derived —
            # confirm callers pass timezone-aware strings.
            start_date = mode_config.get("start_date")
            end_date = mode_config.get("end_date")
            if not start_date or not end_date:
                raise AirflowFailException("mode='date_range' requires 'start_date' and 'end_date'")
            start_ts = datetime.fromisoformat(start_date).timestamp()
            end_ts = datetime.fromisoformat(end_date).timestamp()
            all_files = _list_all_zip_objects(include_processed=True)
            zip_files = [f for f in all_files if start_ts <= f["mtime"] <= end_ts]
            logging.info("Found %d files in date range %s to %s",
                         len(zip_files), start_date, end_date)
        else:  # mode == "all" or default
            # Process all new files
            zip_files = _list_new_zip_objects()
        if not zip_files:
            logging.info("No new zip files to process")
            raise AirflowSkipException("No new zip files found")
        # Apply limit if specified
        if limit and isinstance(limit, int) and limit > 0:
            original_count = len(zip_files)
            zip_files = zip_files[:limit]
            logging.info("Limited processing from %d to %d files", original_count, len(zip_files))
        # Sort by timestamp (oldest first)
        zip_files.sort(key=lambda x: x["mtime"])
        logging.info("Selected %d zip file(s) for processing:", len(zip_files))
        for idx, f in enumerate(zip_files, 1):
            logging.info("%d. %s (mtime: %s)",
                         idx, f["base"],
                         datetime.fromtimestamp(f["mtime"]).isoformat())
        return {
            "zip_files": zip_files,
            "mode": mode,
            "force_reprocess": force_reprocess
        }
    @af_task(task_id="trigger_processing_dags")
    def trigger_processing_dags(list_result: dict):
        """Trigger the processing DAG for each zip file sequentially.

        Returns one status dict per file ('triggered' or 'failed'); a
        trigger failure is recorded but does not abort the remaining files.
        """
        from airflow.api.common.trigger_dag import trigger_dag
        from time import sleep
        zip_files = list_result.get("zip_files", [])
        mode = list_result.get("mode", "all")
        force_reprocess = list_result.get("force_reprocess", False)
        if not zip_files:
            logging.info("No zip files to process")
            return []
        triggered_runs = []
        for idx, zip_file in enumerate(zip_files):
            # conf payload consumed by w_ODS_CSDB_RATINGS_FULL_CORE's
            # get_zip_config task.
            conf = {
                "zip_object_name": zip_file["name"],
                "zip_base_name": zip_file["base"],
                "zip_mtime": zip_file["mtime"],
                "sequence_number": idx + 1,
                "total_files": len(zip_files),
                "processing_mode": mode,
                "force_reprocess": force_reprocess,
                "is_processed": zip_file.get("is_processed", False)
            }
            logging.info(f"Triggering processing DAG for file {idx + 1}/{len(zip_files)}: {zip_file['base']}")
            try:
                # NOTE(review): run_id has seconds resolution plus idx; two
                # coordinator runs within the same second could collide on
                # run_id — confirm acceptable with max_active_runs=1.
                run_id = trigger_dag(
                    dag_id="w_ODS_CSDB_RATINGS_FULL_CORE",
                    run_id=f"coordinator__{datetime.now().strftime('%Y%m%d_%H%M%S')}__{idx}",
                    conf=conf,
                    execution_date=None,
                    replace_microseconds=False,
                )
                triggered_runs.append({
                    "run_id": str(run_id),
                    "zip_file": zip_file["base"],
                    "sequence": idx + 1,
                    "status": "triggered"
                })
                logging.info(f"Successfully triggered run: {run_id}")
            except Exception as e:
                logging.error(f"Failed to trigger processing for {zip_file['base']}: {e}")
                triggered_runs.append({
                    "zip_file": zip_file["base"],
                    "sequence": idx + 1,
                    "status": "failed",
                    "error": str(e)
                })
            # Small delay between triggers to avoid overwhelming the system
            sleep(2)
        logging.info(f"Triggered {len([r for r in triggered_runs if r.get('status') == 'triggered'])} processing DAG runs")
        logging.info(f"Failed to trigger {len([r for r in triggered_runs if r.get('status') == 'failed'])} runs")
        return triggered_runs
@af_task(task_id="summary_report")
def summary_report(trigger_result: list):
"""Generate a summary report of triggered runs"""
if not trigger_result:
logging.info("No runs were triggered")
return
successful = [r for r in trigger_result if r.get("status") == "triggered"]
failed = [r for r in trigger_result if r.get("status") == "failed"]
logging.info("=" * 80)
logging.info("PROCESSING SUMMARY")
logging.info("=" * 80)
logging.info(f"Total files: {len(trigger_result)}")
logging.info(f"Successfully triggered: {len(successful)}")
logging.info(f"Failed to trigger: {len(failed)}")
if successful:
logging.info("\nSuccessfully triggered:")
for r in successful:
logging.info(f" - {r['zip_file']} (run_id: {r['run_id']})")
if failed:
logging.info("\nFailed to trigger:")
for r in failed:
logging.info(f" - {r['zip_file']} (error: {r.get('error', 'unknown')})")
logging.info("=" * 80)
return {
"total": len(trigger_result),
"successful": len(successful),
"failed": len(failed)
}
# Build DAG structure
mode_task = determine_processing_mode()
list_task = list_zip_files(mode_task)
trigger_task = trigger_processing_dags(list_task)
summary_task = summary_report(trigger_task)
mode_task >> list_task >> trigger_task >> summary_task
logging.info("CSDB Ratings Full Coordinator DAG ready")

View File

@@ -0,0 +1,388 @@
import sys
import os
import json
import logging
import zipfile
from pathlib import Path
from datetime import timedelta, datetime, timezone
from airflow import DAG
from airflow.models import Variable
from airflow.decorators import task as af_task
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago
from airflow.utils.trigger_rule import TriggerRule
from airflow.operators.empty import EmptyOperator
from airflow.operators.python import get_current_context
try:
from airflow.exceptions import AirflowFailException, AirflowSkipException
except Exception:
from airflow.exceptions import AirflowException as AirflowFailException
from airflow.exceptions import AirflowSkipException
sys.path.append('/opt/airflow/python/mrds_common')
sys.path.append('/opt/airflow/src/airflow/dags/ods/csdb')
from mrds.utils.manage_runs import init_workflow as mrds_init_workflow, finalise_workflow as mrds_finalise_workflow
from mrds.core import main as mrds_main
dag_id = "w_ODS_CSDB_RATINGS_FULL_CORE"

# Standard Airflow task defaults for this DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# MRDS workflow registration parameters (used by init/finalise workflow).
WORKFLOW_CONFIG = {
    "database_name": "ODS",
    "workflow_name": dag_id,
}

# Object Storage location of the inbound zip files.
OCI_NAMESPACE = os.getenv("BUCKET_NAMESPACE")
OCI_BUCKET = os.getenv("INBOX_BUCKET")
OBJECT_PREFIX = os.getenv("OBJECT_PREFIX", "csdb/ratings/full/")

# Local working directory for downloads/extraction.
TEMP_DIR = "/tmp/csdb_ratings"

# Shared with the coordinator DAG: maps object name -> mtime of the last
# successfully processed version of that object.
PROCESSED_TS_VAR = "w_ODS_CSDB_RATINGS_FULL_COORDINATOR__processed_objects_ts"

# CSV configurations: one MRDS parse task per CSV expected inside each zip.
CSV_CONFIGS = [
    {
        "source_filename": "FULL_INSTRUMENT_DESCRIPTION.csv",
        "config_yaml": "/opt/airflow/src/airflow/dags/ods/csdb/full_ratings/config/m_ODS_CSDB_INSTR_DESC_FULL_PARSE.yaml",
        "task_name": "m_ODS_CSDB_RATINGS_FULL_INSTRUMENT_DESCRIPTION"
    },
    {
        "source_filename": "FULL_INSTRUMENT_RATINGS.csv",
        "config_yaml": "/opt/airflow/src/airflow/dags/ods/csdb/full_ratings/config/m_ODS_CSDB_INSTR_RAT_FULL_PARSE.yaml",
        "task_name": "m_ODS_CSDB_RATINGS_FULL_INSTRUMENT_RATINGS"
    },
    {
        "source_filename": "FULL_ISSUER_DESCRIPTION.csv",
        "config_yaml": "/opt/airflow/src/airflow/dags/ods/csdb/full_ratings/config/m_ODS_CSDB_ISSUER_DESC_FULL_PARSE.yaml",
        "task_name": "m_ODS_CSDB_RATINGS_FULL_ISSUER_DESCRIPTION"
    },
    {
        "source_filename": "FULL_ISSUER_RATINGS.csv",
        "config_yaml": "/opt/airflow/src/airflow/dags/ods/csdb/full_ratings/config/m_ODS_CSDB_ISSUER_RAT_FULL_PARSE.yaml",
        "task_name": "m_ODS_CSDB_RATINGS_FULL_ISSUER_RATINGS"
    }
]
def _oci_client():
    """Build an OCI ObjectStorageClient.

    Tries Resource Principals authentication first, then falls back to
    Instance Principals; raises RuntimeError when neither signer can be
    created.
    NOTE(review): duplicated from the coordinator DAG — consider extracting
    into a shared helper module.
    """
    import oci  # imported lazily so the module parses without the OCI SDK
    region = os.getenv("OCI_REGION") or os.getenv("OCI_RESOURCE_PRINCIPAL_REGION") or "eu-frankfurt-1"
    try:
        # Resource Principals: workload identity inside OCI (e.g. OKE/Functions).
        rp_signer = oci.auth.signers.get_resource_principals_signer()
        cfg = {"region": region} if region else {}
        logging.info("Using OCI Resource Principals signer (region=%s).", cfg.get("region"))
        return oci.object_storage.ObjectStorageClient(cfg, signer=rp_signer)
    except Exception as e:
        logging.info("RP not available: %s", e)
    try:
        # Fall back to Instance Principals (compute-instance identity).
        ip_signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
        cfg = {"region": region} if region else {}
        logging.info("Using OCI Instance Principals signer (region=%s).", cfg.get("region"))
        return oci.object_storage.ObjectStorageClient(cfg, signer=ip_signer)
    except Exception as e:
        logging.info("IP not available: %s", e)
    logging.error("Neither Resource Principals nor Instance Principals authentication found.")
    raise RuntimeError("Failed to create OCI client")
def _load_processed_map() -> dict[str, float]:
    """Load the processed-objects map from the coordinator's Airflow Variable.

    Returns {object_name: mtime_of_last_processed_version}. Any error
    deliberately degrades to an empty map so a broken state Variable never
    blocks processing.
    """
    try:
        raw = Variable.get(PROCESSED_TS_VAR, default_var="{}")
        m = json.loads(raw) or {}
        if isinstance(m, dict):
            return {k: float(v) for k, v in m.items()}
    except Exception:
        # Best-effort: fall through to the empty map.
        pass
    return {}
def _save_processed_map(m: dict[str, float]) -> None:
    """Persist the processed-objects map as JSON in the shared Airflow Variable."""
    Variable.set(PROCESSED_TS_VAR, json.dumps(m))
def _mark_processed(zip_key: str, zip_mtime: float):
    """Record zip_key as processed at zip_mtime in the shared state Variable."""
    processed = _load_processed_map()
    processed[zip_key] = float(zip_mtime)
    _save_processed_map(processed)
    logging.info("Marked as processed: %s (mtime=%s)", zip_key, zip_mtime)
# Core processing DAG: handles one zip file per run, parsing its CSVs
# through MRDS. Triggered by the coordinator DAG with a conf payload.
with DAG(
    dag_id=dag_id,
    default_args=default_args,
    description='CSDB Ratings Full Processor: Processes one zip file with 4 CSV files in parallel',
    schedule_interval=None,  # Triggered by coordinator
    catchup=False,
    max_active_runs=3,  # Allow some parallelism but controlled
    render_template_as_native_obj=True,
    tags=["CSDB", "PROCESSOR", "MRDS", "ODS", "OCI", "RATINGS"],
) as dag:
@af_task(task_id="get_zip_config")
def get_zip_config(**context):
"""Get the zip file configuration from dag_run conf"""
conf = context['dag_run'].conf or {}
zip_object_name = conf.get('zip_object_name')
zip_base_name = conf.get('zip_base_name')
zip_mtime = conf.get('zip_mtime')
sequence_number = conf.get('sequence_number', 0)
total_files = conf.get('total_files', 0)
if not all([zip_object_name, zip_base_name, zip_mtime]):
raise AirflowFailException("Missing required configuration: zip_object_name, zip_base_name, or zip_mtime")
logging.info(f"Processing zip file {sequence_number}/{total_files}: {zip_base_name}")
return {
"zip_object_name": zip_object_name,
"zip_base_name": zip_base_name,
"zip_mtime": zip_mtime,
"sequence_number": sequence_number,
"total_files": total_files
}
@af_task(task_id="download_and_unzip")
def download_and_unzip(config: dict):
"""Download and unzip the specific zip file"""
zip_key = config["zip_object_name"]
zip_base = config["zip_base_name"]
client = _oci_client()
os.makedirs(TEMP_DIR, exist_ok=True)
# Create unique temp directory for this run
run_temp_dir = os.path.join(TEMP_DIR, f"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}")
os.makedirs(run_temp_dir, exist_ok=True)
local_zip = os.path.join(run_temp_dir, zip_base)
logging.info("Downloading %s to %s", zip_key, local_zip)
get_obj = client.get_object(OCI_NAMESPACE, OCI_BUCKET, zip_key)
with open(local_zip, 'wb') as f:
for chunk in get_obj.data.raw.stream(1024 * 1024, decode_content=False):
f.write(chunk)
logging.info("Unzipping %s", local_zip)
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
zip_ref.extractall(run_temp_dir)
extracted_files = []
for root, dirs, files in os.walk(run_temp_dir):
for file in files:
if file.endswith('.csv'):
full_path = os.path.join(root, file)
extracted_files.append({"filename": file, "path": full_path})
logging.info("Extracted CSV: %s", file)
logging.info("Total CSV files extracted: %d", len(extracted_files))
return {
"extracted_files": extracted_files,
"zip_config": config,
"temp_dir": run_temp_dir
}
@af_task(task_id="init_workflow")
def init_workflow(unzipped: dict):
"""Initialize MRDS workflow"""
database_name = WORKFLOW_CONFIG["database_name"]
workflow_name = WORKFLOW_CONFIG["workflow_name"]
ctx = get_current_context()
run_id = str(ctx['ti'].run_id)
a_workflow_history_key = mrds_init_workflow(database_name, workflow_name, run_id)
extracted_files = unzipped.get("extracted_files", [])
zip_config = unzipped.get("zip_config", {})
temp_dir = unzipped.get("temp_dir")
task_configs = []
for csv_config in CSV_CONFIGS:
matching_file = next(
(ef for ef in extracted_files if ef["filename"] == csv_config["source_filename"]),
None
)
if matching_file:
task_configs.append({
"task_name": csv_config["task_name"],
"source_filename": csv_config["source_filename"],
"source_path": matching_file["path"],
"config_file": csv_config["config_yaml"],
})
logging.info("Prepared task config for %s", csv_config["source_filename"])
else:
logging.warning("CSV file %s not found in extracted files", csv_config["source_filename"])
return {
"workflow_history_key": a_workflow_history_key,
"task_configs": task_configs,
"zip_config": zip_config,
"temp_dir": temp_dir
}
def run_mrds_task(task_config: dict, **context):
    """Run MRDS processing for a single CSV file.

    Args:
        task_config: dict with ``task_name``, ``source_path`` and
            ``config_file`` keys (assembled in the DAG wiring / Jinja
            template from ``init_workflow``'s XCom).
        context: Airflow task context; ``ti`` is used to pull the
            workflow history key from the ``init_workflow`` task.

    Returns:
        The literal string ``"SUCCESS"`` on completion.

    Raises:
        FileNotFoundError: if the YAML config or the source CSV is missing.
        AirflowFailException: if ``init_workflow`` produced no history key.
    """
    ti = context['ti']
    task_name = task_config["task_name"]
    source_path = task_config["source_path"]
    config_file = task_config["config_file"]
    if not os.path.exists(config_file):
        raise FileNotFoundError(f"Config file not found: {config_file}")
    if not os.path.exists(source_path):
        raise FileNotFoundError(f"Source CSV file not found: {source_path}")
    init_bundle = ti.xcom_pull(task_ids='init_workflow') or {}
    workflow_history_key = init_bundle.get('workflow_history_key')
    if not workflow_history_key:
        raise AirflowFailException("No workflow_history_key from init_workflow")
    try:
        # Lazy %-style args: formatting only happens if the record is emitted.
        logging.info("%s: Starting MRDS processing for %s", task_name, source_path)
        mrds_main(workflow_history_key, source_path, config_file, generate_workflow_context=False)
        logging.info("%s: MRDS processing completed successfully", task_name)
    except Exception:
        # Was `except Exception as e` with `e` unused; logging.exception
        # already records the traceback before the re-raise.
        logging.exception("%s: MRDS failed on %s", task_name, source_path)
        raise
    return "SUCCESS"
def finalise_workflow_task(**context):
    """Finalise the MRDS workflow run; on success also mark the zip processed.

    Raises AirflowFailException when the history key is missing or any
    CSV-processing task ended FAILED / UPSTREAM_FAILED.
    """
    bundle = context['ti'].xcom_pull(task_ids='init_workflow') or {}
    history_key = bundle.get('workflow_history_key')
    if history_key is None:
        raise AirflowFailException("No workflow history key; cannot finalise workflow")
    # Inspect the terminal state of every CSV-processing task in this run.
    from airflow.utils.state import State
    wanted_ids = {cfg["task_name"] for cfg in CSV_CONFIGS}
    bad_states = {State.FAILED, State.UPSTREAM_FAILED}
    run_tis = [t for t in context['dag_run'].get_task_instances()
               if t.task_id in wanted_ids]
    failed_tasks = [t.task_id for t in run_tis if t.state in bad_states]
    if failed_tasks:
        mrds_finalise_workflow(history_key, "N")
        logging.error("Finalised workflow %s as FAILED (failed tasks=%s)",
                      history_key, failed_tasks)
        raise AirflowFailException(f"Workflow failed for tasks: {failed_tasks}")
    # All CSV tasks succeeded: record the zip as processed, then close out.
    zip_config = bundle.get('zip_config', {})
    zip_key = zip_config.get("zip_object_name")
    zip_mtime = zip_config.get("zip_mtime")
    if zip_key and zip_mtime:
        _mark_processed(zip_key, zip_mtime)
    mrds_finalise_workflow(history_key, "Y")
    logging.info("Finalised workflow %s as SUCCESS", history_key)
@af_task(task_id="cleanup_temp_files")
def cleanup_temp_files(**context):
    """Remove this run's temporary extraction directory, if present."""
    import shutil
    bundle = context['ti'].xcom_pull(task_ids='init_workflow') or {}
    tmp = bundle.get('temp_dir')
    if not (tmp and os.path.exists(tmp)):
        return
    shutil.rmtree(tmp)
    logging.info("Cleaned up temp directory: %s", tmp)
@af_task(task_id="move_zip_to_archive")
def move_zip_to_archive(**context):
    """Copy the processed zip to the archive prefix, then delete the inbox copy."""
    bundle = context['ti'].xcom_pull(task_ids='init_workflow') or {}
    zip_key = bundle.get('zip_config', {}).get("zip_object_name")
    if not zip_key:
        logging.warning("No zip key found, skipping archive")
        return
    # Archive objects live under "<prefix>archive/" beside the inbox objects.
    archive_key = zip_key.replace(OBJECT_PREFIX, f"{OBJECT_PREFIX}archive/", 1)
    copy_details = {
        "sourceObjectName": zip_key,
        "destinationRegion": os.getenv("OCI_REGION", "eu-frankfurt-1"),
        "destinationNamespace": OCI_NAMESPACE,
        "destinationBucket": OCI_BUCKET,
        "destinationObjectName": archive_key,
    }
    client = _oci_client()
    try:
        client.copy_object(OCI_NAMESPACE, OCI_BUCKET, copy_details)
        logging.info("Copied to archive: %s -> %s", zip_key, archive_key)
        # Delete only after the copy request succeeded.
        client.delete_object(OCI_NAMESPACE, OCI_BUCKET, zip_key)
        logging.info("Deleted from inbox: %s", zip_key)
    except Exception as e:
        logging.error("Failed to archive zip file %s: %s", zip_key, e)
        raise
# Build the DAG structure
# (module-level wiring; these statements execute at DAG-parse time)
config_task = get_zip_config()
unzip_task = download_and_unzip(config_task)
init_task = init_workflow(unzip_task)
# Create CSV processing tasks dynamically
# One PythonOperator per CSV_CONFIGS entry; source_path is resolved at run
# time via a Jinja template over init_workflow's XCom.
csv_tasks = []
for csv_config in CSV_CONFIGS:
    task = PythonOperator(
        task_id=csv_config["task_name"],
        python_callable=run_mrds_task,
        op_kwargs={
            "task_config": {
                "task_name": csv_config["task_name"],
                "source_filename": csv_config["source_filename"],
                # NOTE(review): if init_workflow skipped this CSV (file not
                # extracted), `| first` renders empty and run_mrds_task fails
                # with FileNotFoundError — confirm that is the intended
                # failure mode rather than skipping the task.
                "source_path": "{{ ti.xcom_pull(task_ids='init_workflow')['task_configs'] | selectattr('task_name', 'equalto', '" + csv_config["task_name"] + "') | map(attribute='source_path') | first }}",
                "config_file": csv_config["config_yaml"],
            }
        },
        # NOTE(review): provide_context was removed/ignored in Airflow 2.x —
        # verify against the deployed Airflow version.
        provide_context=True,
    )
    csv_tasks.append(task)
finalize_task = PythonOperator(
    task_id='finalize_workflow',
    python_callable=finalise_workflow_task,
    # ALL_DONE so finalisation still runs (and records "N") when any CSV
    # task fails; the callable then re-raises to fail the run.
    provide_context=True,
    trigger_rule=TriggerRule.ALL_DONE,
    retries=0,
)
cleanup_task = cleanup_temp_files()
archive_task = move_zip_to_archive()
# Joins only when both cleanup and archive succeed.
all_good = EmptyOperator(
    task_id="All_went_well",
    trigger_rule=TriggerRule.ALL_SUCCESS,
)
# Define task dependencies
config_task >> unzip_task >> init_task >> csv_tasks >> finalize_task >> [cleanup_task, archive_task] >> all_good
logging.info("CSDB Ratings Full Processor DAG ready")