init
New files:
  airflow/ods/csdb/full_ratings/.gitkeep
  airflow/ods/csdb/full_ratings/config/.gitkeep
@@ -0,0 +1,103 @@
encoding_type: latin1
tmpdir: /tmp
inbox_prefix: INBOX/CSDB/FullRatingsDissemination
archive_prefix: ARCHIVE/CSDB/FullRatingsDissemination
workflow_name: w_ODS_CSDB_RATINGS_FULL
# file format
validation_schema_path: None
file_type: csv

tasks:
  - task_name: m_ODS_CSDB_INSTR_DESC_FULL_PARSE
    ods_prefix: INBOX/CSDB/FullRatingsDissemination/CSDB_INSTR_DESC_FULL
    output_table: CSDB_INSTR_DESC_FULL
    output_columns:
      - type: 'a_key'
        column_header: 'A_KEY'
      - type: 'workflow_key'
        column_header: 'A_WORKFLOW_HISTORY_KEY'
      - type: 'csv_header'
        value: 'RDB_INSTR_ID'
        column_header: 'IDIRINSTRUMENT'
      - type: 'csv_header'
        value: 'ISIN'
        column_header: 'ISIN'
      - type: 'csv_header'
        value: 'MOO_INSTR_ID'
        column_header: 'MOO_INSTR_ID'
      - type: 'csv_header'
        value: 'SNP_INSTR_ID'
        column_header: 'SNP_INSTR_ID'
      - type: 'csv_header'
        value: 'FTC_INSTR_ID'
        column_header: 'FITCH_IDENTIFIER'
      - type: 'csv_header'
        value: 'DBR_INSTR_ID'
        column_header: 'DBRS_IDENTIFIER'
      - type: 'csv_header'
        value: 'EA_STATUS'
        column_header: 'EA_STATUS'
      - type: 'csv_header'
        value: 'IS_TMS'
        column_header: 'IS_TMS'
      - type: 'csv_header'
        value: 'DBRS_COVERED_BOND_PROGRAM_ID'
        column_header: 'DBRS_COVERED_BOND_PROGRAM'
      - type: 'csv_header'
        value: 'FITCH_PROGRAM_ID'
        column_header: 'FITCH_PRG_IDENTIFIER'
      - type: 'csv_header'
        value: 'MOO_DEAL_NUMBER'
        column_header: 'MOO_DEAL_NUMBER'
      - type: 'csv_header'
        value: 'SNP_PROGRAM_ID'
        column_header: 'SNP_PROGRAM_ID'
      - type: 'csv_header'
        value: 'DBRS_DEBT_TYPE'
        column_header: 'IDIRDEBTTYPE'
      - type: 'csv_header'
        value: 'SNP_DEBT_TYPE'
        column_header: 'SNP_DEBT_TYPE'
      - type: 'csv_header'
        value: 'MOODY_SENIORITY'
        column_header: 'MOODY_SENIORITY'
      - type: 'csv_header'
        value: 'FITCH_DEBT_LEVEL_CODE'
        column_header: 'FITCH_DEBT_LEVEL_CODE'
      - type: 'csv_header'
        value: 'DBRS_RANK_TYPE'
        column_header: 'DBRS_RANK_TYPE'
      - type: 'csv_header'
        value: 'DBRS_SECURITY_TYPE'
        column_header: 'DBRS_SECURITY_TYPE'
      - type: 'csv_header'
        value: 'SCO_DEBT_TYPE'
        column_header: 'SCO_DEBT_TYPE'
      - type: 'csv_header'
        value: 'SCO_INSTR_ID'
        column_header: 'SCO_INSTR_ID'
      - type: 'csv_header'
        value: 'SCO_COVERED_BOND_PROGRAM'
        column_header: 'SCO_COVERED_BOND_PROGRAM'
      - type: 'csv_header'
        value: 'SCO_CATEGORY'
        column_header: 'SCO_CATEGORY'
      - type: 'csv_header'
        value: 'PLACEHOLDER15'
        column_header: 'PLACEHOLDER15'
      - type: 'csv_header'
        value: 'PLACEHOLDER16'
        column_header: 'PLACEHOLDER16'
      - type: 'csv_header'
        value: 'PLACEHOLDER17'
        column_header: 'PLACEHOLDER17'
      - type: 'csv_header'
        value: 'PLACEHOLDER18'
        column_header: 'PLACEHOLDER18'
      - type: 'csv_header'
        value: 'PLACEHOLDER19'
        column_header: 'PLACEHOLDER19'
      - type: 'csv_header'
        value: 'PLACEHOLDER20'
        column_header: 'PLACEHOLDER20'
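This config (like the three that follow) maps source CSV headers onto output table columns; the special types 'a_key' and 'workflow_key' are generated keys rather than fields read from the file. As a minimal sketch only, not the mrds implementation (the helper name below is invented), applying the 'csv_header' mappings of such a config to a CSV could look like:

import csv

def apply_output_columns(csv_path, output_columns):
    """Illustrative only: rename CSV fields according to an output_columns list."""
    with open(csv_path, newline='', encoding='latin1') as fh:
        for row in csv.DictReader(fh):
            out = {}
            for col in output_columns:
                if col['type'] == 'csv_header':
                    # Copy the source field ('value') under the target 'column_header'
                    out[col['column_header']] = row.get(col['value'])
                # 'a_key' / 'workflow_key' columns would be filled in by the loader
            yield out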
@@ -0,0 +1,130 @@
encoding_type: latin1
# Global configurations
tmpdir: /tmp
inbox_prefix: INBOX/CSDB/FullRatingsDissemination
archive_prefix: ARCHIVE/CSDB/FullRatingsDissemination
workflow_name: w_ODS_CSDB_RATINGS_FULL
validation_schema_path: None
file_type: csv

tasks:
  - task_name: m_ODS_CSDB_INSTR_RAT_FULL_PARSE
    ods_prefix: INBOX/CSDB/FullRatingsDissemination/CSDB_INSTR_RAT_FULL
    output_table: CSDB_INSTR_RAT_FULL
    output_columns:
      - type: 'a_key'
        column_header: 'A_KEY'
      - type: 'workflow_key'
        column_header: 'A_WORKFLOW_HISTORY_KEY'
      - type: 'csv_header'
        value: 'RDB_INSTR_ID'
        column_header: 'RDB_INSTR_ID'
      - type: 'csv_header'
        value: 'SOURCE'
        column_header: 'SOURCE'
      - type: 'csv_header'
        value: 'RATING_SCHEME'
        column_header: 'RATING_SCHEME'
      - type: 'csv_header'
        value: 'RATING'
        column_header: 'RATING'
      - type: 'csv_header'
        value: 'RATING_DATE'
        column_header: 'RATING_DATE'
      - type: 'csv_header'
        value: 'TIME_HORIZON'
        column_header: 'TIME_HORIZON'
      - type: 'csv_header'
        value: 'CURRENCY_TYPE'
        column_header: 'CURRENCY_TYPE'
      - type: 'csv_header'
        value: 'NOTES'
        column_header: 'NOTES'
      - type: 'csv_header'
        value: 'VALID_FROM'
        column_header: 'VALID_FROM'
      - type: 'csv_header'
        value: 'VALID_UNTIL'
        column_header: 'VALID_UNTIL'
      - type: 'csv_header'
        value: 'RDB_RATINGS_ID'
        column_header: 'RDB_RATINGS_ID'
      - type: 'csv_header'
        value: 'OUTLOOK'
        column_header: 'WATCHLIST'
      - type: 'csv_header'
        value: 'OUTLOOK_DATE'
        column_header: 'WATCHLIST_DATE'
      - type: 'csv_header'
        value: 'WATCHLIST'
        column_header: 'OUTLOOK'
      - type: 'csv_header'
        value: 'WATCHLIST_DATE'
        column_header: 'OUTLOOK_DATE'
      - type: 'csv_header'
        value: 'RATING_ACTION'
        column_header: 'RATING_ACTION'
      - type: 'csv_header'
        value: 'RATING_ACTION_DATE'
        column_header: 'RATING_ACTION_DATE'
      - type: 'csv_header'
        value: 'IS_PRELIMINARY'
        column_header: 'IS_PRELIMINARY'
      - type: 'csv_header'
        value: 'RATING_RAW'
        column_header: 'RATING_RAW'
      - type: 'csv_header'
        value: 'RATING_TYPE'
        column_header: 'RATING_TYPE'
      - type: 'csv_header'
        value: 'ENDORSEMENT_INDICATOR'
        column_header: 'ENDORSEMENT_INDICATOR'
      - type: 'csv_header'
        value: 'LAST_REVIEW_DATE'
        column_header: 'LAST_REVIEW_DATE'
      - type: 'csv_header'
        value: 'PLACEHOLDER6'
        column_header: 'PLACEHOLDER6'
      - type: 'csv_header'
        value: 'PLACEHOLDER7'
        column_header: 'PLACEHOLDER7'
      - type: 'csv_header'
        value: 'PLACEHOLDER8'
        column_header: 'PLACEHOLDER8'
      - type: 'csv_header'
        value: 'PLACEHOLDER9'
        column_header: 'PLACEHOLDER9'
      - type: 'csv_header'
        value: 'PLACEHOLDER10'
        column_header: 'PLACEHOLDER10'
      - type: 'csv_header'
        value: 'PLACEHOLDER11'
        column_header: 'PLACEHOLDER11'
      - type: 'csv_header'
        value: 'PLACEHOLDER12'
        column_header: 'PLACEHOLDER12'
      - type: 'csv_header'
        value: 'PLACEHOLDER13'
        column_header: 'PLACEHOLDER13'
      - type: 'csv_header'
        value: 'PLACEHOLDER14'
        column_header: 'PLACEHOLDER14'
      - type: 'csv_header'
        value: 'PLACEHOLDER15'
        column_header: 'PLACEHOLDER15'
      - type: 'csv_header'
        value: 'PLACEHOLDER16'
        column_header: 'PLACEHOLDER16'
      - type: 'csv_header'
        value: 'PLACEHOLDER17'
        column_header: 'PLACEHOLDER17'
      - type: 'csv_header'
        value: 'PLACEHOLDER18'
        column_header: 'PLACEHOLDER18'
      - type: 'csv_header'
        value: 'PLACEHOLDER19'
        column_header: 'PLACEHOLDER19'
      - type: 'csv_header'
        value: 'PLACEHOLDER20'
        column_header: 'PLACEHOLDER20'
@@ -0,0 +1,106 @@
encoding_type: latin1
# Global configurations
tmpdir: /tmp
inbox_prefix: INBOX/CSDB/FullRatingsDissemination
archive_prefix: ARCHIVE/CSDB/FullRatingsDissemination
workflow_name: w_ODS_CSDB_RATINGS_FULL
validation_schema_path: None
file_type: csv

tasks:
  - task_name: m_ODS_CSDB_ISSUER_DESC_FULL_PARSE
    ods_prefix: INBOX/CSDB/FullRatingsDissemination/CSDB_ISSUER_DESC_FULL
    output_table: CSDB_ISSUER_DESC_FULL
    output_columns:
      - type: 'a_key'
        column_header: 'A_KEY'
      - type: 'workflow_key'
        column_header: 'A_WORKFLOW_HISTORY_KEY'
      - type: 'csv_header'
        value: 'RDB_ISSUER_ID'
        column_header: 'RDB_ISSUER_ID'
      - type: 'csv_header'
        value: 'ISSUER_NAME'
        column_header: 'ISSUERNAME'
      - type: 'csv_header'
        value: 'COUNTRY_DOMICILE'
        column_header: 'COUNTRY_DOMICILE'
      - type: 'csv_header'
        value: 'IS_SOVEREIGN'
        column_header: 'IS_SOVEREIGN'
      - type: 'csv_header'
        value: 'MOO_ISSUER_ID'
        column_header: 'MOODY_IDENTIFIER'
      - type: 'csv_header'
        value: 'SNP_ISSUER_ID'
        column_header: 'SNP_ISSUER_ID'
      - type: 'csv_header'
        value: 'FTC_ISSUER_ID'
        column_header: 'FITCH_IDENTIFIER'
      - type: 'csv_header'
        value: 'DBR_ISSUER_ID'
        column_header: 'DBRS_IDENTIFIER'
      - type: 'csv_header'
        value: 'LEI_ISSUER_ID'
        column_header: 'LEI_ISSUER_ID'
      - type: 'csv_header'
        value: 'RIAD_CODE'
        column_header: 'RIAD_CODE'
      - type: 'csv_header'
        value: 'RIAD_OUID'
        column_header: 'RIAD_OUID'
      - type: 'csv_header'
        value: 'CLASH_GROUP_STATUS'
        column_header: 'CLASH_GROUP_STATUS'
      - type: 'csv_header'
        value: 'SCO_ISSUER_ID'
        column_header: 'SCO_ISSUER_ID'
      - type: 'csv_header'
        value: 'PLACEHOLDER5'
        column_header: 'PLACEHOLDER5'
      - type: 'csv_header'
        value: 'PLACEHOLDER6'
        column_header: 'PLACEHOLDER6'
      - type: 'csv_header'
        value: 'PLACEHOLDER7'
        column_header: 'PLACEHOLDER7'
      - type: 'csv_header'
        value: 'PLACEHOLDER8'
        column_header: 'PLACEHOLDER8'
      - type: 'csv_header'
        value: 'PLACEHOLDER9'
        column_header: 'PLACEHOLDER9'
      - type: 'csv_header'
        value: 'PLACEHOLDER10'
        column_header: 'PLACEHOLDER10'
      - type: 'csv_header'
        value: 'PLACEHOLDER11'
        column_header: 'PLACEHOLDER11'
      - type: 'csv_header'
        value: 'PLACEHOLDER12'
        column_header: 'PLACEHOLDER12'
      - type: 'csv_header'
        value: 'PLACEHOLDER13'
        column_header: 'PLACEHOLDER13'
      - type: 'csv_header'
        value: 'PLACEHOLDER14'
        column_header: 'PLACEHOLDER14'
      - type: 'csv_header'
        value: 'PLACEHOLDER15'
        column_header: 'PLACEHOLDER15'
      - type: 'csv_header'
        value: 'PLACEHOLDER16'
        column_header: 'PLACEHOLDER16'
      - type: 'csv_header'
        value: 'PLACEHOLDER17'
        column_header: 'PLACEHOLDER17'
      - type: 'csv_header'
        value: 'PLACEHOLDER18'
        column_header: 'PLACEHOLDER18'
      - type: 'csv_header'
        value: 'PLACEHOLDER19'
        column_header: 'PLACEHOLDER19'
      - type: 'csv_header'
        value: 'PLACEHOLDER20'
        column_header: 'PLACEHOLDER20'
@@ -0,0 +1,131 @@
encoding_type: latin1
# Global configurations
tmpdir: /tmp
inbox_prefix: INBOX/CSDB/FullRatingsDissemination
archive_prefix: ARCHIVE/CSDB/FullRatingsDissemination
workflow_name: w_ODS_CSDB_RATINGS_FULL
validation_schema_path: None
file_type: csv

tasks:
  - task_name: m_ODS_CSDB_ISSUER_RAT_FULL_PARSE
    ods_prefix: INBOX/CSDB/FullRatingsDissemination/CSDB_ISSUER_RAT_FULL
    output_table: CSDB_ISSUER_RAT_FULL
    output_columns:
      - type: 'a_key'
        column_header: 'A_KEY'
      - type: 'workflow_key'
        column_header: 'A_WORKFLOW_HISTORY_KEY'
      - type: 'csv_header'
        value: 'RDB_ISSUER_ID'
        column_header: 'RDB_ISSUER_ID'
      - type: 'csv_header'
        value: 'SOURCE'
        column_header: 'SOURCE'
      - type: 'csv_header'
        value: 'RATING_SCHEME'
        column_header: 'RATING_SCHEME'
      - type: 'csv_header'
        value: 'RATING'
        column_header: 'RATING'
      - type: 'csv_header'
        value: 'RATING_DATE'
        column_header: 'RATING_DATE'
      - type: 'csv_header'
        value: 'TIME_HORIZON'
        column_header: 'TIME_HORIZON'
      - type: 'csv_header'
        value: 'CURRENCY_TYPE'
        column_header: 'CURRENCY_TYPE'
      - type: 'csv_header'
        value: 'NOTES'
        column_header: 'NOTES'
      - type: 'csv_header'
        value: 'VALID_FROM'
        column_header: 'VALID_FROM'
      - type: 'csv_header'
        value: 'VALID_UNTIL'
        column_header: 'VALID_UNTIL'
      - type: 'csv_header'
        value: 'RDB_RATINGS_ID'
        column_header: 'RDB_RATINGS_ID'
      - type: 'csv_header'
        value: 'OUTLOOK'
        column_header: 'OUTLOOK'
      - type: 'csv_header'
        value: 'OUTLOOK_DATE'
        column_header: 'OUTLOOK_DATE'
      - type: 'csv_header'
        value: 'WATCHLIST'
        column_header: 'WATCHLIST'
      - type: 'csv_header'
        value: 'WATCHLIST_DATE'
        column_header: 'WATCHLIST_DATE'
      - type: 'csv_header'
        value: 'RATING_ACTION'
        column_header: 'RATING_ACTION'
      - type: 'csv_header'
        value: 'RATING_ACTION_DATE'
        column_header: 'RATING_ACTION_DATE'
      - type: 'csv_header'
        value: 'IS_PRELIMINARY'
        column_header: 'IS_PRELIMINARY'
      - type: 'csv_header'
        value: 'RATING_RAW'
        column_header: 'RATING_RAW'
      - type: 'csv_header'
        value: 'RATING_TYPE'
        column_header: 'RATING_TYPE'
      - type: 'csv_header'
        value: 'ENDORSEMENT_INDICATOR'
        column_header: 'ENDORSEMENT_INDICATOR'
      - type: 'csv_header'
        value: 'LAST_REVIEW_DATE'
        column_header: 'LAST_REVIEW_DATE'
      - type: 'csv_header'
        value: 'PLACEHOLDER6'
        column_header: 'PLACEHOLDER6'
      - type: 'csv_header'
        value: 'PLACEHOLDER7'
        column_header: 'PLACEHOLDER7'
      - type: 'csv_header'
        value: 'PLACEHOLDER8'
        column_header: 'PLACEHOLDER8'
      - type: 'csv_header'
        value: 'PLACEHOLDER9'
        column_header: 'PLACEHOLDER9'
      - type: 'csv_header'
        value: 'PLACEHOLDER10'
        column_header: 'PLACEHOLDER10'
      - type: 'csv_header'
        value: 'PLACEHOLDER11'
        column_header: 'PLACEHOLDER11'
      - type: 'csv_header'
        value: 'PLACEHOLDER12'
        column_header: 'PLACEHOLDER12'
      - type: 'csv_header'
        value: 'PLACEHOLDER13'
        column_header: 'PLACEHOLDER13'
      - type: 'csv_header'
        value: 'PLACEHOLDER14'
        column_header: 'PLACEHOLDER14'
      - type: 'csv_header'
        value: 'PLACEHOLDER15'
        column_header: 'PLACEHOLDER15'
      - type: 'csv_header'
        value: 'PLACEHOLDER16'
        column_header: 'PLACEHOLDER16'
      - type: 'csv_header'
        value: 'PLACEHOLDER17'
        column_header: 'PLACEHOLDER17'
      - type: 'csv_header'
        value: 'PLACEHOLDER18'
        column_header: 'PLACEHOLDER18'
      - type: 'csv_header'
        value: 'PLACEHOLDER19'
        column_header: 'PLACEHOLDER19'
      - type: 'csv_header'
        value: 'PLACEHOLDER20'
        column_header: 'PLACEHOLDER20'
@@ -0,0 +1,420 @@
import sys
import os
import json
import logging
from pathlib import Path
from datetime import timedelta, datetime, timezone
from email.utils import parsedate_to_datetime

from airflow import DAG
from airflow.models import Variable
from airflow.decorators import task as af_task
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.utils.dates import days_ago
from airflow.utils.trigger_rule import TriggerRule
from airflow.operators.python import get_current_context

try:
    from airflow.exceptions import AirflowFailException, AirflowSkipException
except Exception:
    from airflow.exceptions import AirflowException as AirflowFailException
    from airflow.exceptions import AirflowSkipException

dag_id = "w_ODS_CSDB_RATINGS_FULL_COORDINATOR"

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

OCI_NAMESPACE = os.getenv("BUCKET_NAMESPACE")
OCI_BUCKET = os.getenv("INBOX_BUCKET")
OBJECT_PREFIX = os.getenv("OBJECT_PREFIX", "csdb/ratings/full/")
REPROCESS = (os.getenv("CSDB_REPROCESS", "false").lower() in ("1", "true", "yes"))
LAST_TS_VAR = f"{dag_id}__last_seen_ts"
PROCESSED_TS_VAR = f"{dag_id}__processed_objects_ts"


def _oci_client():
    import oci
    region = os.getenv("OCI_REGION") or os.getenv("OCI_RESOURCE_PRINCIPAL_REGION") or "eu-frankfurt-1"
    try:
        rp_signer = oci.auth.signers.get_resource_principals_signer()
        cfg = {"region": region} if region else {}
        logging.info("Using OCI Resource Principals signer (region=%s).", cfg.get("region"))
        return oci.object_storage.ObjectStorageClient(cfg, signer=rp_signer)
    except Exception as e:
        logging.info("RP not available: %s", e)
    try:
        ip_signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
        cfg = {"region": region} if region else {}
        logging.info("Using OCI Instance Principals signer (region=%s).", cfg.get("region"))
        return oci.object_storage.ObjectStorageClient(cfg, signer=ip_signer)
    except Exception as e:
        logging.info("IP not available: %s", e)
    logging.error("Neither Resource Principals nor Instance Principals authentication found.")
    raise RuntimeError("Failed to create OCI client")


def _load_processed_map() -> dict[str, float]:
    try:
        raw = Variable.get(PROCESSED_TS_VAR, default_var="{}")
        m = json.loads(raw) or {}
        if isinstance(m, dict):
            return {k: float(v) for k, v in m.items()}
    except Exception:
        pass
    return {}
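# The processed-objects Variable holds a JSON map of object name -> last processed
# timestamp (epoch seconds). Illustrative example only (the object name is made up):
#
#   {
#     "csdb/ratings/full/STC-FullRatingsDissemination-example.zip": 1700000000.0
#   }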

def _list_all_zip_objects(include_processed: bool = False) -> list[dict]:
    """List all zip files in the bucket"""
    if not OCI_NAMESPACE or not OCI_BUCKET:
        raise AirflowFailException("BUCKET_NAMESPACE and INBOX_BUCKET must be set")

    client = _oci_client()
    processed_map = _load_processed_map() if not include_processed else {}

    resp = client.list_objects(OCI_NAMESPACE, OCI_BUCKET, prefix=OBJECT_PREFIX)
    all_items: list[dict] = []

    for o in (resp.data.objects or []):
        name = (o.name or "").strip()
        base = name.rsplit("/", 1)[-1] if name else ""

        if not name or name.endswith('/') or not base:
            continue

        if not ("STC-FullRatingsDissemination" in base and base.lower().endswith(".zip")):
            continue

        # Get timestamp
        ts = None
        t = getattr(o, "time_created", None)
        if t:
            try:
                ts = t.timestamp() if hasattr(t, "timestamp") else float(t) / 1000.0
            except Exception:
                ts = None

        if ts is None:
            try:
                head = client.head_object(OCI_NAMESPACE, OCI_BUCKET, name)
                lm = head.headers.get("last-modified") or head.headers.get("Last-Modified")
                if lm:
                    dt = parsedate_to_datetime(lm)
                    if dt.tzinfo is None:
                        dt = dt.replace(tzinfo=timezone.utc)
                    ts = dt.timestamp()
            except Exception as e:
                logging.warning("head_object failed for %s: %s", name, e)

        if ts is None:
            ts = datetime.now(timezone.utc).timestamp()

        # Check if already processed
        last_proc_ts = float(processed_map.get(name, 0.0))
        is_processed = (ts <= last_proc_ts) if processed_map else False

        item = {
            "name": name,
            "base": base,
            "mtime": ts,
            "is_processed": is_processed
        }
        all_items.append(item)

    # Sort by timestamp (oldest first)
    all_items.sort(key=lambda x: x["mtime"])

    return all_items


def _list_new_zip_objects() -> list[dict]:
    """List only new/unprocessed zip files"""
    all_items = _list_all_zip_objects(include_processed=False)

    # Filter out processed items
    new_items = [item for item in all_items if not item.get("is_processed", False)]

    logging.info("Found %d new STC-FullRatingsDissemination zip file(s) (sorted oldest to newest)", len(new_items))
    return new_items


def _find_specific_zip(filename_pattern: str) -> dict:
    """Find a specific zip file by name pattern"""
    all_items = _list_all_zip_objects(include_processed=True)

    # Try exact match first
    for item in all_items:
        if item["base"] == filename_pattern or item["name"] == filename_pattern:
            logging.info("Found exact match: %s", item["base"])
            return item

    # Try partial match
    for item in all_items:
        if filename_pattern.lower() in item["base"].lower():
            logging.info("Found partial match: %s", item["base"])
            return item

    raise AirflowFailException(f"No zip file found matching pattern: {filename_pattern}")

with DAG(
    dag_id=dag_id,
    default_args=default_args,
    description='CSDB Ratings Full Coordinator: lists zip files and triggers processing for each',
    schedule_interval="0 */6 * * *",  # Every 6 hours; adjust as needed
    catchup=False,
    max_active_runs=1,
    render_template_as_native_obj=True,
    tags=["CSDB", "COORDINATOR", "ODS", "OCI", "RATINGS"],
) as dag:

    @af_task(task_id="determine_processing_mode")
    def determine_processing_mode(**context):
        """
        Determine what to process based on the dag_run configuration.

        Configuration options:
        1. No config or mode='all': process all new zip files
        2. mode='specific' + filename='xxx': process a specific zip file
        3. mode='specific_list' + filenames=['file1.zip', 'file2.zip']: process a specific list of files
        4. mode='reprocess_all': reprocess all zip files (including already processed ones)
        5. mode='date_range' + start_date/end_date (ISO dates): process files modified in that range
        6. mode='list_only': just list available files without processing
        """
        conf = context.get('dag_run').conf or {}

        mode = conf.get('mode', 'all')
        filename = conf.get('filename')
        filenames = conf.get('filenames', [])
        force_reprocess = conf.get('force_reprocess', False)
        limit = conf.get('limit')  # Limit number of files to process

        logging.info("Processing mode: %s", mode)
        logging.info("Configuration: %s", json.dumps(conf, indent=2))

        result = {
            "mode": mode,
            "filename": filename,
            "filenames": filenames,
            "force_reprocess": force_reprocess,
            "limit": limit,
            # Pass the optional date_range bounds through; list_zip_files reads them
            # from this dict when mode='date_range'.
            "start_date": conf.get('start_date'),
            "end_date": conf.get('end_date'),
        }

        return result
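    # Illustrative ways to trigger this coordinator with a configuration (file names below
    # are made up); the same JSON can also be supplied via "Trigger DAG w/ config" in the UI:
    #
    #   airflow dags trigger w_ODS_CSDB_RATINGS_FULL_COORDINATOR \
    #       --conf '{"mode": "specific", "filename": "STC-FullRatingsDissemination-example.zip"}'
    #
    #   airflow dags trigger w_ODS_CSDB_RATINGS_FULL_COORDINATOR --conf '{"mode": "all", "limit": 2}'
    #
    #   airflow dags trigger w_ODS_CSDB_RATINGS_FULL_COORDINATOR --conf '{"mode": "list_only"}'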
@af_task(task_id="list_zip_files")
|
||||
def list_zip_files(mode_config: dict):
|
||||
"""List zip files based on the processing mode"""
|
||||
mode = mode_config.get("mode", "all")
|
||||
filename = mode_config.get("filename")
|
||||
filenames = mode_config.get("filenames", [])
|
||||
force_reprocess = mode_config.get("force_reprocess", False)
|
||||
limit = mode_config.get("limit")
|
||||
|
||||
zip_files = []
|
||||
|
||||
if mode == "list_only":
|
||||
# Just list all files for information
|
||||
all_files = _list_all_zip_objects(include_processed=True)
|
||||
logging.info("=== Available ZIP Files ===")
|
||||
for idx, f in enumerate(all_files, 1):
|
||||
status = "PROCESSED" if f.get("is_processed") else "NEW"
|
||||
logging.info("%d. [%s] %s (mtime: %s)",
|
||||
idx, status, f["base"],
|
||||
datetime.fromtimestamp(f["mtime"]).isoformat())
|
||||
raise AirflowSkipException("List only mode - no processing triggered")
|
||||
|
||||
elif mode == "specific":
|
||||
# Process a specific file
|
||||
if not filename:
|
||||
raise AirflowFailException("mode='specific' requires 'filename' parameter")
|
||||
|
||||
zip_file = _find_specific_zip(filename)
|
||||
zip_files = [zip_file]
|
||||
logging.info("Processing specific file: %s", zip_file["base"])
|
||||
|
||||
elif mode == "specific_list":
|
||||
# Process a list of specific files
|
||||
if not filenames:
|
||||
raise AirflowFailException("mode='specific_list' requires 'filenames' parameter")
|
||||
|
||||
for fn in filenames:
|
||||
try:
|
||||
zip_file = _find_specific_zip(fn)
|
||||
zip_files.append(zip_file)
|
||||
except Exception as e:
|
||||
logging.warning("Could not find file %s: %s", fn, e)
|
||||
|
||||
if not zip_files:
|
||||
raise AirflowFailException("None of the specified files were found")
|
||||
|
||||
logging.info("Processing %d specific files", len(zip_files))
|
||||
|
||||
elif mode == "reprocess_all":
|
||||
# Reprocess all files (including already processed)
|
||||
all_files = _list_all_zip_objects(include_processed=True)
|
||||
zip_files = all_files
|
||||
logging.info("Reprocessing all %d files", len(zip_files))
|
||||
|
||||
elif mode == "date_range":
|
||||
# Process files within a date range
|
||||
start_date = mode_config.get("start_date")
|
||||
end_date = mode_config.get("end_date")
|
||||
|
||||
if not start_date or not end_date:
|
||||
raise AirflowFailException("mode='date_range' requires 'start_date' and 'end_date'")
|
||||
|
||||
start_ts = datetime.fromisoformat(start_date).timestamp()
|
||||
end_ts = datetime.fromisoformat(end_date).timestamp()
|
||||
|
||||
all_files = _list_all_zip_objects(include_processed=True)
|
||||
zip_files = [f for f in all_files if start_ts <= f["mtime"] <= end_ts]
|
||||
|
||||
logging.info("Found %d files in date range %s to %s",
|
||||
len(zip_files), start_date, end_date)
|
||||
|
||||
else: # mode == "all" or default
|
||||
# Process all new files
|
||||
zip_files = _list_new_zip_objects()
|
||||
|
||||
if not zip_files:
|
||||
logging.info("No new zip files to process")
|
||||
raise AirflowSkipException("No new zip files found")
|
||||
|
||||
# Apply limit if specified
|
||||
if limit and isinstance(limit, int) and limit > 0:
|
||||
original_count = len(zip_files)
|
||||
zip_files = zip_files[:limit]
|
||||
logging.info("Limited processing from %d to %d files", original_count, len(zip_files))
|
||||
|
||||
# Sort by timestamp (oldest first)
|
||||
zip_files.sort(key=lambda x: x["mtime"])
|
||||
|
||||
logging.info("Selected %d zip file(s) for processing:", len(zip_files))
|
||||
for idx, f in enumerate(zip_files, 1):
|
||||
logging.info("%d. %s (mtime: %s)",
|
||||
idx, f["base"],
|
||||
datetime.fromtimestamp(f["mtime"]).isoformat())
|
||||
|
||||
return {
|
||||
"zip_files": zip_files,
|
||||
"mode": mode,
|
||||
"force_reprocess": force_reprocess
|
||||
}
|
||||
|
||||
@af_task(task_id="trigger_processing_dags")
|
||||
def trigger_processing_dags(list_result: dict):
|
||||
"""Trigger the processing DAG for each zip file sequentially"""
|
||||
from airflow.api.common.trigger_dag import trigger_dag
|
||||
from time import sleep
|
||||
|
||||
zip_files = list_result.get("zip_files", [])
|
||||
mode = list_result.get("mode", "all")
|
||||
force_reprocess = list_result.get("force_reprocess", False)
|
||||
|
||||
if not zip_files:
|
||||
logging.info("No zip files to process")
|
||||
return []
|
||||
|
||||
triggered_runs = []
|
||||
|
||||
for idx, zip_file in enumerate(zip_files):
|
||||
conf = {
|
||||
"zip_object_name": zip_file["name"],
|
||||
"zip_base_name": zip_file["base"],
|
||||
"zip_mtime": zip_file["mtime"],
|
||||
"sequence_number": idx + 1,
|
||||
"total_files": len(zip_files),
|
||||
"processing_mode": mode,
|
||||
"force_reprocess": force_reprocess,
|
||||
"is_processed": zip_file.get("is_processed", False)
|
||||
}
|
||||
|
||||
logging.info(f"Triggering processing DAG for file {idx + 1}/{len(zip_files)}: {zip_file['base']}")
|
||||
|
||||
try:
|
||||
run_id = trigger_dag(
|
||||
dag_id="w_ODS_CSDB_RATINGS_FULL_CORE",
|
||||
run_id=f"coordinator__{datetime.now().strftime('%Y%m%d_%H%M%S')}__{idx}",
|
||||
conf=conf,
|
||||
execution_date=None,
|
||||
replace_microseconds=False,
|
||||
)
|
||||
|
||||
triggered_runs.append({
|
||||
"run_id": str(run_id),
|
||||
"zip_file": zip_file["base"],
|
||||
"sequence": idx + 1,
|
||||
"status": "triggered"
|
||||
})
|
||||
|
||||
logging.info(f"Successfully triggered run: {run_id}")
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to trigger processing for {zip_file['base']}: {e}")
|
||||
triggered_runs.append({
|
||||
"zip_file": zip_file["base"],
|
||||
"sequence": idx + 1,
|
||||
"status": "failed",
|
||||
"error": str(e)
|
||||
})
|
||||
|
||||
# Small delay between triggers to avoid overwhelming the system
|
||||
sleep(2)
|
||||
|
||||
logging.info(f"Triggered {len([r for r in triggered_runs if r.get('status') == 'triggered'])} processing DAG runs")
|
||||
logging.info(f"Failed to trigger {len([r for r in triggered_runs if r.get('status') == 'failed'])} runs")
|
||||
|
||||
return triggered_runs
|
||||
|
||||
@af_task(task_id="summary_report")
|
||||
def summary_report(trigger_result: list):
|
||||
"""Generate a summary report of triggered runs"""
|
||||
if not trigger_result:
|
||||
logging.info("No runs were triggered")
|
||||
return
|
||||
|
||||
successful = [r for r in trigger_result if r.get("status") == "triggered"]
|
||||
failed = [r for r in trigger_result if r.get("status") == "failed"]
|
||||
|
||||
logging.info("=" * 80)
|
||||
logging.info("PROCESSING SUMMARY")
|
||||
logging.info("=" * 80)
|
||||
logging.info(f"Total files: {len(trigger_result)}")
|
||||
logging.info(f"Successfully triggered: {len(successful)}")
|
||||
logging.info(f"Failed to trigger: {len(failed)}")
|
||||
|
||||
if successful:
|
||||
logging.info("\nSuccessfully triggered:")
|
||||
for r in successful:
|
||||
logging.info(f" - {r['zip_file']} (run_id: {r['run_id']})")
|
||||
|
||||
if failed:
|
||||
logging.info("\nFailed to trigger:")
|
||||
for r in failed:
|
||||
logging.info(f" - {r['zip_file']} (error: {r.get('error', 'unknown')})")
|
||||
|
||||
logging.info("=" * 80)
|
||||
|
||||
return {
|
||||
"total": len(trigger_result),
|
||||
"successful": len(successful),
|
||||
"failed": len(failed)
|
||||
}
|
||||
|
||||
# Build DAG structure
|
||||
mode_task = determine_processing_mode()
|
||||
list_task = list_zip_files(mode_task)
|
||||
trigger_task = trigger_processing_dags(list_task)
|
||||
summary_task = summary_report(trigger_task)
|
||||
|
||||
mode_task >> list_task >> trigger_task >> summary_task
|
||||
|
||||
logging.info("CSDB Ratings Full Coordinator DAG ready")
|
||||
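Each triggered run of w_ODS_CSDB_RATINGS_FULL_CORE receives its work item through dag_run.conf, built in trigger_processing_dags above. An illustrative payload (all values made up) looks like:

{
  "zip_object_name": "csdb/ratings/full/STC-FullRatingsDissemination-example.zip",
  "zip_base_name": "STC-FullRatingsDissemination-example.zip",
  "zip_mtime": 1700000000.0,
  "sequence_number": 1,
  "total_files": 3,
  "processing_mode": "all",
  "force_reprocess": false,
  "is_processed": false
}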
@@ -0,0 +1,388 @@
import sys
import os
import json
import logging
import zipfile
from pathlib import Path
from datetime import timedelta, datetime, timezone

from airflow import DAG
from airflow.models import Variable
from airflow.decorators import task as af_task
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago
from airflow.utils.trigger_rule import TriggerRule
from airflow.operators.empty import EmptyOperator
from airflow.operators.python import get_current_context

try:
    from airflow.exceptions import AirflowFailException, AirflowSkipException
except Exception:
    from airflow.exceptions import AirflowException as AirflowFailException
    from airflow.exceptions import AirflowSkipException

sys.path.append('/opt/airflow/python/mrds_common')
sys.path.append('/opt/airflow/src/airflow/dags/ods/csdb')
from mrds.utils.manage_runs import init_workflow as mrds_init_workflow, finalise_workflow as mrds_finalise_workflow
from mrds.core import main as mrds_main

dag_id = "w_ODS_CSDB_RATINGS_FULL_CORE"

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

WORKFLOW_CONFIG = {
    "database_name": "ODS",
    "workflow_name": dag_id,
}

OCI_NAMESPACE = os.getenv("BUCKET_NAMESPACE")
OCI_BUCKET = os.getenv("INBOX_BUCKET")
OBJECT_PREFIX = os.getenv("OBJECT_PREFIX", "csdb/ratings/full/")
TEMP_DIR = "/tmp/csdb_ratings"
PROCESSED_TS_VAR = "w_ODS_CSDB_RATINGS_FULL_COORDINATOR__processed_objects_ts"

# CSV configurations
CSV_CONFIGS = [
    {
        "source_filename": "FULL_INSTRUMENT_DESCRIPTION.csv",
        "config_yaml": "/opt/airflow/src/airflow/dags/ods/csdb/full_ratings/config/m_ODS_CSDB_INSTR_DESC_FULL_PARSE.yaml",
        "task_name": "m_ODS_CSDB_RATINGS_FULL_INSTRUMENT_DESCRIPTION"
    },
    {
        "source_filename": "FULL_INSTRUMENT_RATINGS.csv",
        "config_yaml": "/opt/airflow/src/airflow/dags/ods/csdb/full_ratings/config/m_ODS_CSDB_INSTR_RAT_FULL_PARSE.yaml",
        "task_name": "m_ODS_CSDB_RATINGS_FULL_INSTRUMENT_RATINGS"
    },
    {
        "source_filename": "FULL_ISSUER_DESCRIPTION.csv",
        "config_yaml": "/opt/airflow/src/airflow/dags/ods/csdb/full_ratings/config/m_ODS_CSDB_ISSUER_DESC_FULL_PARSE.yaml",
        "task_name": "m_ODS_CSDB_RATINGS_FULL_ISSUER_DESCRIPTION"
    },
    {
        "source_filename": "FULL_ISSUER_RATINGS.csv",
        "config_yaml": "/opt/airflow/src/airflow/dags/ods/csdb/full_ratings/config/m_ODS_CSDB_ISSUER_RAT_FULL_PARSE.yaml",
        "task_name": "m_ODS_CSDB_RATINGS_FULL_ISSUER_RATINGS"
    }
]
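# Expected contents of each incoming archive (the archive name below is illustrative; the
# coordinator only requires "STC-FullRatingsDissemination" in the name and a .zip suffix):
#
#   STC-FullRatingsDissemination-<date>.zip
#     FULL_INSTRUMENT_DESCRIPTION.csv
#     FULL_INSTRUMENT_RATINGS.csv
#     FULL_ISSUER_DESCRIPTION.csv
#     FULL_ISSUER_RATINGS.csv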

def _oci_client():
    import oci
    region = os.getenv("OCI_REGION") or os.getenv("OCI_RESOURCE_PRINCIPAL_REGION") or "eu-frankfurt-1"
    try:
        rp_signer = oci.auth.signers.get_resource_principals_signer()
        cfg = {"region": region} if region else {}
        logging.info("Using OCI Resource Principals signer (region=%s).", cfg.get("region"))
        return oci.object_storage.ObjectStorageClient(cfg, signer=rp_signer)
    except Exception as e:
        logging.info("RP not available: %s", e)
    try:
        ip_signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
        cfg = {"region": region} if region else {}
        logging.info("Using OCI Instance Principals signer (region=%s).", cfg.get("region"))
        return oci.object_storage.ObjectStorageClient(cfg, signer=ip_signer)
    except Exception as e:
        logging.info("IP not available: %s", e)
    logging.error("Neither Resource Principals nor Instance Principals authentication found.")
    raise RuntimeError("Failed to create OCI client")


def _load_processed_map() -> dict[str, float]:
    try:
        raw = Variable.get(PROCESSED_TS_VAR, default_var="{}")
        m = json.loads(raw) or {}
        if isinstance(m, dict):
            return {k: float(v) for k, v in m.items()}
    except Exception:
        pass
    return {}


def _save_processed_map(m: dict[str, float]) -> None:
    Variable.set(PROCESSED_TS_VAR, json.dumps(m))


def _mark_processed(zip_key: str, zip_mtime: float):
    m = _load_processed_map()
    m[zip_key] = float(zip_mtime)
    _save_processed_map(m)
    logging.info("Marked as processed: %s (mtime=%s)", zip_key, zip_mtime)

with DAG(
    dag_id=dag_id,
    default_args=default_args,
    description='CSDB Ratings Full Processor: processes one zip file with 4 CSV files in parallel',
    schedule_interval=None,  # Triggered by the coordinator
    catchup=False,
    max_active_runs=3,  # Allow some parallelism but keep it controlled
    render_template_as_native_obj=True,
    tags=["CSDB", "PROCESSOR", "MRDS", "ODS", "OCI", "RATINGS"],
) as dag:

    @af_task(task_id="get_zip_config")
    def get_zip_config(**context):
        """Get the zip file configuration from dag_run conf"""
        conf = context['dag_run'].conf or {}

        zip_object_name = conf.get('zip_object_name')
        zip_base_name = conf.get('zip_base_name')
        zip_mtime = conf.get('zip_mtime')
        sequence_number = conf.get('sequence_number', 0)
        total_files = conf.get('total_files', 0)

        if not all([zip_object_name, zip_base_name, zip_mtime]):
            raise AirflowFailException("Missing required configuration: zip_object_name, zip_base_name, or zip_mtime")

        logging.info(f"Processing zip file {sequence_number}/{total_files}: {zip_base_name}")

        return {
            "zip_object_name": zip_object_name,
            "zip_base_name": zip_base_name,
            "zip_mtime": zip_mtime,
            "sequence_number": sequence_number,
            "total_files": total_files
        }
@af_task(task_id="download_and_unzip")
|
||||
def download_and_unzip(config: dict):
|
||||
"""Download and unzip the specific zip file"""
|
||||
zip_key = config["zip_object_name"]
|
||||
zip_base = config["zip_base_name"]
|
||||
|
||||
client = _oci_client()
|
||||
os.makedirs(TEMP_DIR, exist_ok=True)
|
||||
|
||||
# Create unique temp directory for this run
|
||||
run_temp_dir = os.path.join(TEMP_DIR, f"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}")
|
||||
os.makedirs(run_temp_dir, exist_ok=True)
|
||||
|
||||
local_zip = os.path.join(run_temp_dir, zip_base)
|
||||
|
||||
logging.info("Downloading %s to %s", zip_key, local_zip)
|
||||
get_obj = client.get_object(OCI_NAMESPACE, OCI_BUCKET, zip_key)
|
||||
with open(local_zip, 'wb') as f:
|
||||
for chunk in get_obj.data.raw.stream(1024 * 1024, decode_content=False):
|
||||
f.write(chunk)
|
||||
|
||||
logging.info("Unzipping %s", local_zip)
|
||||
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
|
||||
zip_ref.extractall(run_temp_dir)
|
||||
|
||||
extracted_files = []
|
||||
for root, dirs, files in os.walk(run_temp_dir):
|
||||
for file in files:
|
||||
if file.endswith('.csv'):
|
||||
full_path = os.path.join(root, file)
|
||||
extracted_files.append({"filename": file, "path": full_path})
|
||||
logging.info("Extracted CSV: %s", file)
|
||||
|
||||
logging.info("Total CSV files extracted: %d", len(extracted_files))
|
||||
|
||||
return {
|
||||
"extracted_files": extracted_files,
|
||||
"zip_config": config,
|
||||
"temp_dir": run_temp_dir
|
||||
}
|
||||
|
||||
@af_task(task_id="init_workflow")
|
||||
def init_workflow(unzipped: dict):
|
||||
"""Initialize MRDS workflow"""
|
||||
database_name = WORKFLOW_CONFIG["database_name"]
|
||||
workflow_name = WORKFLOW_CONFIG["workflow_name"]
|
||||
|
||||
ctx = get_current_context()
|
||||
run_id = str(ctx['ti'].run_id)
|
||||
|
||||
a_workflow_history_key = mrds_init_workflow(database_name, workflow_name, run_id)
|
||||
|
||||
extracted_files = unzipped.get("extracted_files", [])
|
||||
zip_config = unzipped.get("zip_config", {})
|
||||
temp_dir = unzipped.get("temp_dir")
|
||||
|
||||
task_configs = []
|
||||
for csv_config in CSV_CONFIGS:
|
||||
matching_file = next(
|
||||
(ef for ef in extracted_files if ef["filename"] == csv_config["source_filename"]),
|
||||
None
|
||||
)
|
||||
if matching_file:
|
||||
task_configs.append({
|
||||
"task_name": csv_config["task_name"],
|
||||
"source_filename": csv_config["source_filename"],
|
||||
"source_path": matching_file["path"],
|
||||
"config_file": csv_config["config_yaml"],
|
||||
})
|
||||
logging.info("Prepared task config for %s", csv_config["source_filename"])
|
||||
else:
|
||||
logging.warning("CSV file %s not found in extracted files", csv_config["source_filename"])
|
||||
|
||||
return {
|
||||
"workflow_history_key": a_workflow_history_key,
|
||||
"task_configs": task_configs,
|
||||
"zip_config": zip_config,
|
||||
"temp_dir": temp_dir
|
||||
}
|
||||
|
||||
    def run_mrds_task(task_config: dict, **context):
        """Run MRDS processing for a single CSV file"""
        ti = context['ti']

        task_name = task_config["task_name"]
        source_path = task_config["source_path"]
        config_file = task_config["config_file"]

        if not os.path.exists(config_file):
            raise FileNotFoundError(f"Config file not found: {config_file}")
        if not os.path.exists(source_path):
            raise FileNotFoundError(f"Source CSV file not found: {source_path}")

        init_bundle = ti.xcom_pull(task_ids='init_workflow') or {}
        workflow_history_key = init_bundle.get('workflow_history_key')

        if not workflow_history_key:
            raise AirflowFailException("No workflow_history_key from init_workflow")

        try:
            logging.info(f"{task_name}: Starting MRDS processing for {source_path}")
            mrds_main(workflow_history_key, source_path, config_file, generate_workflow_context=False)
            logging.info(f"{task_name}: MRDS processing completed successfully")
        except Exception:
            logging.exception(f"{task_name}: MRDS failed on {source_path}")
            raise

        return "SUCCESS"
    def finalise_workflow_task(**context):
        """Finalize the workflow and mark zip as processed"""
        ti = context['ti']
        init_bundle = ti.xcom_pull(task_ids='init_workflow') or {}

        a_workflow_history_key = init_bundle.get('workflow_history_key')
        zip_config = init_bundle.get('zip_config', {})

        if a_workflow_history_key is None:
            raise AirflowFailException("No workflow history key; cannot finalise workflow")

        # Check if any CSV task failed
        csv_task_ids = [cfg["task_name"] for cfg in CSV_CONFIGS]
        dag_run = context['dag_run']
        tis = [t for t in dag_run.get_task_instances() if t.task_id in csv_task_ids]

        from airflow.utils.state import State
        any_failed = any(ti_i.state in {State.FAILED, State.UPSTREAM_FAILED} for ti_i in tis)

        if not any_failed:
            # Mark zip as processed
            zip_key = zip_config.get("zip_object_name")
            zip_mtime = zip_config.get("zip_mtime")
            if zip_key and zip_mtime:
                _mark_processed(zip_key, zip_mtime)

            mrds_finalise_workflow(a_workflow_history_key, "Y")
            logging.info("Finalised workflow %s as SUCCESS", a_workflow_history_key)
        else:
            failed_tasks = [ti_i.task_id for ti_i in tis if ti_i.state in {State.FAILED, State.UPSTREAM_FAILED}]
            mrds_finalise_workflow(a_workflow_history_key, "N")
            logging.error("Finalised workflow %s as FAILED (failed tasks=%s)",
                          a_workflow_history_key, failed_tasks)
            raise AirflowFailException(f"Workflow failed for tasks: {failed_tasks}")
@af_task(task_id="cleanup_temp_files")
|
||||
def cleanup_temp_files(**context):
|
||||
"""Clean up temporary files for this run"""
|
||||
import shutil
|
||||
ti = context['ti']
|
||||
init_bundle = ti.xcom_pull(task_ids='init_workflow') or {}
|
||||
temp_dir = init_bundle.get('temp_dir')
|
||||
|
||||
if temp_dir and os.path.exists(temp_dir):
|
||||
shutil.rmtree(temp_dir)
|
||||
logging.info("Cleaned up temp directory: %s", temp_dir)
|
||||
|
||||
@af_task(task_id="move_zip_to_archive")
|
||||
def move_zip_to_archive(**context):
|
||||
"""Move processed zip file to archive"""
|
||||
ti = context['ti']
|
||||
init_bundle = ti.xcom_pull(task_ids='init_workflow') or {}
|
||||
zip_config = init_bundle.get('zip_config', {})
|
||||
|
||||
zip_key = zip_config.get("zip_object_name")
|
||||
if not zip_key:
|
||||
logging.warning("No zip key found, skipping archive")
|
||||
return
|
||||
|
||||
client = _oci_client()
|
||||
archive_key = zip_key.replace(OBJECT_PREFIX, f"{OBJECT_PREFIX}archive/", 1)
|
||||
|
||||
try:
|
||||
client.copy_object(
|
||||
OCI_NAMESPACE,
|
||||
OCI_BUCKET,
|
||||
{
|
||||
"sourceObjectName": zip_key,
|
||||
"destinationRegion": os.getenv("OCI_REGION", "eu-frankfurt-1"),
|
||||
"destinationNamespace": OCI_NAMESPACE,
|
||||
"destinationBucket": OCI_BUCKET,
|
||||
"destinationObjectName": archive_key
|
||||
}
|
||||
)
|
||||
logging.info("Copied to archive: %s -> %s", zip_key, archive_key)
|
||||
|
||||
client.delete_object(OCI_NAMESPACE, OCI_BUCKET, zip_key)
|
||||
logging.info("Deleted from inbox: %s", zip_key)
|
||||
except Exception as e:
|
||||
logging.error("Failed to archive zip file %s: %s", zip_key, e)
|
||||
raise
|
||||
|
||||
    # Build the DAG structure
    config_task = get_zip_config()
    unzip_task = download_and_unzip(config_task)
    init_task = init_workflow(unzip_task)

    # Create CSV processing tasks dynamically
    csv_tasks = []
    for csv_config in CSV_CONFIGS:
        task = PythonOperator(
            task_id=csv_config["task_name"],
            python_callable=run_mrds_task,
            op_kwargs={
                "task_config": {
                    "task_name": csv_config["task_name"],
                    "source_filename": csv_config["source_filename"],
                    "source_path": "{{ ti.xcom_pull(task_ids='init_workflow')['task_configs'] | selectattr('task_name', 'equalto', '" + csv_config["task_name"] + "') | map(attribute='source_path') | first }}",
                    "config_file": csv_config["config_yaml"],
                }
            },
            provide_context=True,
        )
        csv_tasks.append(task)
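    # Note on the templated op_kwargs above: 'source_path' is a Jinja expression rendered
    # at run time from the init_workflow XCom. It selects the entry in 'task_configs' whose
    # task_name matches this operator and takes its 'source_path', i.e. the path of the CSV
    # extracted for that task by download_and_unzip.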

    finalize_task = PythonOperator(
        task_id='finalize_workflow',
        python_callable=finalise_workflow_task,
        provide_context=True,
        trigger_rule=TriggerRule.ALL_DONE,
        retries=0,
    )

    cleanup_task = cleanup_temp_files()
    archive_task = move_zip_to_archive()

    all_good = EmptyOperator(
        task_id="All_went_well",
        trigger_rule=TriggerRule.ALL_SUCCESS,
    )

    # Define task dependencies
    config_task >> unzip_task >> init_task >> csv_tasks >> finalize_task >> [cleanup_task, archive_task] >> all_good

logging.info("CSDB Ratings Full Processor DAG ready")