init
This commit is contained in:
201
python/connectors/casper/casper_rqsd.py
Normal file
201
python/connectors/casper/casper_rqsd.py
Normal file
@@ -0,0 +1,201 @@
|
||||
import requests
|
||||
import io
|
||||
import zipfile
|
||||
import pandas as pd
|
||||
import os
|
||||
from datetime import datetime
|
||||
import oci
|
||||
from mrds.utils.secrets import get_secret
|
||||
import mrds.utils.manage_runs as runManager
|
||||
import mrds.utils.manage_files as fileManager
|
||||
import mrds.utils.sql_statements as sqls
|
||||
import sys
|
||||
import yaml
|
||||
TASK_HISTORY_MULTIPLIER = 1_000_000_000
|
||||
|
||||
def initialize_task(workflow_context, task_name):
    """Register a new task run and return its task-history key.

    Delegates to runManager.init_task using the run id and workflow
    history key carried in the workflow context dict.
    """
    return runManager.init_task(
        task_name,
        workflow_context["run_id"],
        workflow_context["a_workflow_history_key"],
    )
|
||||
|
||||
def rqsd_parser(fileName, bucket_path, file, bucket_name):
    """Dispatch a downloaded workbook to the matching annex parser.

    SCOPA/SCOPF files feed both annex 1.1 and 1.2; RQSDC files feed
    annex 2 (whose result, if any, is returned). Other names are ignored.
    """
    is_scop = ("SCOPA" in fileName) or ("SCOPF" in fileName)
    if is_scop:
        print("SCOP")
        annex_1_1(fileName, bucket_path, file, bucket_name)
        annex_1_2(fileName, bucket_path, file, bucket_name)
    elif "RQSDC" in fileName:
        print("RQSDC")
        return annex_2(fileName, bucket_path, file, bucket_name)
|
||||
|
||||
def annex_1_1(fileName, bucket_path, file, bucket_name):
    """Parse the 'Counterparties in scope' sheet of a SCOP workbook and
    upload it as CSV under the '1_1/' prefix of the target bucket.

    fileName    -- original workbook name; metadata parsed from its tokens
    bucket_path -- object-name prefix inside the bucket
    file        -- file-like object with the workbook contents
    bucket_name -- target OCI bucket
    """
    # File-name convention: '_'-separated tokens, positions 2/4/6 assumed to
    # carry ref exercise, NCB and version — TODO confirm against the CASPER
    # naming specification.
    fileData = fileName.split("_")
    version_number = fileData[6]
    ref_exercise = fileData[2]
    ncb = fileData[4]

    # Bug fix: fileName[:-4] assumed a 3-char extension; splitext strips the
    # real extension regardless of length (.xls, .xlsx, ...).
    csv_file_path = os.path.splitext(fileName)[0] + ".csv"

    df = pd.read_excel(file, sheet_name="Counterparties in scope", skiprows=3)
    # Drop fully-empty rows and columns.
    df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')

    # Traceability metadata appended to every row.
    df['file_name'] = os.path.basename(fileName)
    df['ingestion_timestamp'] = datetime.now().isoformat()
    df['version_number'] = version_number
    df['ref_exercise'] = ref_exercise
    df['ncb'] = ncb

    # Instance-principal auth; the first argument is an empty config dict.
    signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
    client = oci.object_storage.ObjectStorageClient({}, signer=signer)
    client.put_object(
        "frcnomajoc7v",  # namespace — consider reading from env/config
        bucket_name,
        bucket_path + "1_1/" + csv_file_path,
        df.to_csv(index=False).encode('utf-8'),
    )
    print("Finished uploading {}".format(csv_file_path))
    print(f"CSV saved to {csv_file_path}")
|
||||
|
||||
def annex_1_2(fileName, bucket_path, file, bucket_name):
    """Parse the 'Entities to which data relates' sheet of a SCOP workbook
    and upload it as CSV under the '1_2/' prefix of the target bucket.

    fileName    -- original workbook name; metadata parsed from its tokens
    bucket_path -- object-name prefix inside the bucket
    file        -- file-like object with the workbook contents
    bucket_name -- target OCI bucket
    """
    # File-name convention: '_'-separated tokens, positions 2/4/6 assumed to
    # carry ref exercise, NCB and version — TODO confirm against the CASPER
    # naming specification.
    fileData = fileName.split("_")
    version_number = fileData[6]
    ref_exercise = fileData[2]
    ncb = fileData[4]

    # Bug fix: fileName[:-4] assumed a 3-char extension; splitext strips the
    # real extension regardless of length (.xls, .xlsx, ...).
    csv_file_path = os.path.splitext(fileName)[0] + ".csv"

    df = pd.read_excel(file, sheet_name="Entities to which data relates", skiprows=3)
    # Drop fully-empty rows and columns.
    df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')

    # Traceability metadata appended to every row.
    df['file_name'] = os.path.basename(fileName)
    df['ingestion_timestamp'] = datetime.now().isoformat()
    df['version_number'] = version_number
    df['ref_exercise'] = ref_exercise
    df['ncb'] = ncb

    # Instance-principal auth; the first argument is an empty config dict.
    signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
    client = oci.object_storage.ObjectStorageClient({}, signer=signer)
    client.put_object(
        "frcnomajoc7v",  # namespace — consider reading from env/config
        bucket_name,
        bucket_path + "1_2/" + csv_file_path,
        df.to_csv(index=False).encode('utf-8'),
    )
    print("Finished uploading {}".format(csv_file_path))
    print(f"CSV saved to {csv_file_path}")
|
||||
|
||||
def annex_2(fileName, bucket_path, file, bucket_name):
    """Parse the 'Data collection template' sheet of an RQSDC workbook and
    upload it as CSV under the '2/' prefix of the target bucket.

    fileName    -- original workbook name; metadata parsed from its tokens
    bucket_path -- object-name prefix inside the bucket
    file        -- io.BytesIO with the workbook contents
    bucket_name -- target OCI bucket
    """
    # File-name convention: '_'-separated tokens, positions 2/4/6 assumed to
    # carry ref exercise, NCB and version — TODO confirm against the CASPER
    # naming specification.
    fileData = fileName.split("_")
    version_number = fileData[6]
    ref_exercise = fileData[2]
    ncb = fileData[4]

    # Read the data sheet, skipping the metadata rows at the top.
    df = pd.read_excel(file.getvalue(), sheet_name="Data collection template", skiprows=6)

    # Drop fully-empty rows and columns.
    df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')

    # Traceability metadata appended to every row.
    df['file_name'] = os.path.basename(fileName)
    df['ingestion_timestamp'] = datetime.now().isoformat()
    df['version_number'] = version_number
    df['ref_exercise'] = ref_exercise
    df['ncb'] = ncb

    # Bug fix: the name was previously built as fileName[:-4] + "csv"
    # (missing dot), yielding names like "reportcsv" and diverging from
    # annex_1_1/annex_1_2.
    csvName = os.path.splitext(fileName)[0] + ".csv"

    # Instance-principal auth; the first argument is an empty config dict.
    signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
    client = oci.object_storage.ObjectStorageClient({}, signer=signer)
    client.put_object(
        "frcnomajoc7v",  # namespace — consider reading from env/config
        bucket_name,
        bucket_path + "2/" + csvName,
        df.to_csv(index=False).encode('utf-8'),
    )
    print("Finished uploading {}".format(csvName))
    print(f"CSV saved to {csvName}")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def rqsd_preflow(secret_crt_id, secret_key_id, casper_api_url, collection_id):
    """Fetch the list of completed CASPER filevault files for a collection.

    Writes the client certificate and key (retrieved from the vault) to the
    working directory; rqsd_process is expected to remove them afterwards.
    Returns the endpoint's JSON response (a list of file descriptors).
    """
    crt_path = os.getcwd() + "/rqsd_tst.crt"
    key_path = os.getcwd() + "/rqsd_tst.key.pem"
    try:
        with open(key_path, "w") as keyfile:
            keyfile.write(get_secret(secret_key_id))
        with open(crt_path, "w") as crtfile:
            crtfile.write(get_secret(secret_crt_id))
    except Exception:
        print("Failed to retrieve certificates from secrets")
        # Best-effort cleanup of whatever was written before the failure
        # (previously an unconditional os.remove could itself raise and
        # mask the original error; raise(Exception) discarded the cause).
        for path in (crt_path, key_path):
            if os.path.exists(path):
                os.remove(path)
        raise

    protected_resource_url = casper_api_url + "/casper-api/filevault/"
    try:
        max_date = fileManager.execute_query(
            "SELECT to_char(max(processing_end_time),'YYYY-MM-DD HH24:mi:ss') as MAX_PROCESSING_END_TIME FROM ct_ods.a_casper_filevault"
        )
        # Bug fix: the previous `max_date is not []` identity test was always
        # True (a fresh literal is never the same object); use truthiness to
        # detect an empty result set.
        if max_date:
            # Only the date part of the timestamp is used in the filter.
            filterString = (
                'isTest eq False and processingStatus eq "PS_COMPLETED" and processingEndTime gt '
                + max_date[0].split(' ')[0]
            )
        else:
            filterString = 'isTest eq False and processingStatus eq "PS_COMPLETED"'
        # NOTE(review): verify=False disables TLS verification — confirm this
        # is intended for this internal endpoint.
        response = requests.get(
            protected_resource_url + "files/" + collection_id,
            headers={"accept": "application/json"},
            cert=(crt_path, key_path),
            verify=False,
            params={"filter": filterString},
        )
        print(response.text)
        files = response.json()
    except Exception:
        print("Failed to retrieve ACC metadata, error during connection or request")
        raise
    return files
|
||||
|
||||
|
||||
def rqsd_process(files, casper_api_url, bucket_path, bucket_name):
    """Download every file listed in `files` from the CASPER filevault and
    hand each one to rqsd_parser for parsing/upload.

    Bug fix: the certificate/key written by rqsd_preflow were previously
    deleted only on failure and leaked on disk after a successful run;
    they are now removed in all cases via try/finally.
    """
    crt_path = os.getcwd() + "/rqsd_tst.crt"
    key_path = os.getcwd() + "/rqsd_tst.key.pem"
    try:
        for downloadable in files:
            try:
                print("\n\n")
                # NOTE(review): verify=False disables TLS verification —
                # confirm this is intended.
                response = requests.get(
                    casper_api_url + "/casper-api/filevault/download/"
                    + str(downloadable["dcId"]) + '/' + str(downloadable["fileID"]),
                    headers={"accept": "application/json"},
                    cert=(crt_path, key_path),
                    verify=False,
                )
                rqsd_parser(downloadable["fileName"], bucket_path, io.BytesIO(response.content), bucket_name)
            except Exception:
                print(f"Failed to upload file into target bucket, files saved locally in {os.getcwd()}")
                # Re-raise the original error instead of a bare Exception.
                raise
    finally:
        for path in (crt_path, key_path):
            if os.path.exists(path):
                os.remove(path)
|
||||
|
||||
|
||||
def add_a_key_column(headers, data_rows, task_history_key):
    """Prepend an A_KEY column in place.

    Each row gets key task_history_key * TASK_HISTORY_MULTIPLIER + ordinal
    (1-based), stringified, inserted at position 0; 'A_KEY' is prepended
    to the headers list.
    """
    headers.insert(0, 'A_KEY')
    base = int(task_history_key) * TASK_HISTORY_MULTIPLIER
    for ordinal, row in enumerate(data_rows, start=1):
        row.insert(0, str(base + ordinal))
|
||||
|
||||
def add_workflow_key_column(headers, data_rows, workflow_key):
    """Insert an A_WORKFLOW_HISTORY_KEY column in place.

    Bug fix: the header was inserted at index 1 while the row values were
    inserted at index 0, shifting every data column by one relative to the
    header. Header and values now share the same position — right after
    A_KEY when present, otherwise at the front (matching the corrected
    implementation in DevoConnector).
    """
    insert_idx = 1 if headers and headers[0] == 'A_KEY' else 0
    headers.insert(insert_idx, 'A_WORKFLOW_HISTORY_KEY')
    for row in data_rows:
        row.insert(insert_idx, workflow_key)
|
||||
|
||||
def initialize_config(config_file_path):
    """Load a YAML configuration file and return its parsed content.

    Raises FileNotFoundError when the file does not exist.
    """
    if not os.path.exists(config_file_path):
        raise FileNotFoundError(f"Configuration file {config_file_path} not found.")
    with open(config_file_path, "r") as f:
        return yaml.safe_load(f)
|
||||
|
||||
|
||||
def main(workflow_context, flow_config_path, env_config_path, env):
    """Entry point: load configs, fetch the CASPER file list, process it,
    and record the task outcome.

    workflow_context -- dict with 'run_id' and 'a_workflow_history_key'
    flow_config_path -- YAML file with TASK_NAME, COLLECTION_ID, ODS_PREFIX
    env_config_path  -- YAML file keyed by environment name
    env              -- environment key (dev/tst/acc/prd)
    """
    # init setup
    flow_info = initialize_config(flow_config_path)
    envs_info = initialize_config(env_config_path)
    environment_info = envs_info[env]

    a_task_history_key = initialize_task(workflow_context, flow_info['TASK_NAME'])

    try:
        files = rqsd_preflow(
            environment_info["CERTIFICATE_FILE"],
            environment_info["CERTIFICATE_KEY"],
            environment_info["CASPER_URL"],
            flow_info["COLLECTION_ID"],
        )
        rqsd_process(files, environment_info["CASPER_URL"], flow_info["ODS_PREFIX"], environment_info["BUCKET"])
    except Exception:
        # Fixed copy-pasted message (said "DEVO" in the CASPER flow) and
        # mark the task as failed before re-raising the original error
        # (previously the task was left open and a bare Exception was raised).
        print("Failed to retrieve CASPER data, error during connection or request")
        runManager.finalise_task(a_task_history_key, 'N')
        raise

    # Finalize task
    runManager.finalise_task(a_task_history_key, 'Y')
|
||||
27
python/connectors/casper/config/env_config.yaml
Normal file
27
python/connectors/casper/config/env_config.yaml
Normal file
@@ -0,0 +1,27 @@
|
||||
# Environment Configuration
|
||||
dev:
|
||||
BUCKET: "mrds_inbox_dev"
|
||||
CERTIFICATE_KEY: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya7r33ocatalf6jn6kg2xjhnya6kazlqd3e5gw6yghpd5q"
|
||||
CERTIFICATE_FILE: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyaeva4zvj6xdihljookamhse7jlyassfjb4p45xp46bwba"
|
||||
CASPER_URL: "https://internet.api.casper.tst.aws.tadnet.net"
|
||||
RQSD_COLLECTION_ID: "1537"
|
||||
tst:
|
||||
BUCKET: "mrds_inbox_tst"
|
||||
CERTIFICATE_KEY: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya7r33ocatalf6jn6kg2xjhnya6kazlqd3e5gw6yghpd5q"
|
||||
CERTIFICATE_FILE: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyaeva4zvj6xdihljookamhse7jlyassfjb4p45xp46bwba"
|
||||
CASPER_URL: "https://internet.api.casper.tst.aws.tadnet.net"
|
||||
RQSD_COLLECTION_ID: "1537"
|
||||
acc:
|
||||
BUCKET: "mrds_inbox_acc"
|
||||
CERTIFICATE_KEY: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya5snmftggydoszwchjra3ifa4pyiilgc26uqlhejnhcca"
|
||||
CERTIFICATE_FILE: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyaho5t4qgmlqctew6g6mcnwpz2p7z4nhxooyl6hc5sonfa"
|
||||
CASPER_URL: "https://internet.api.casper.stg.aws.ecb.de"
|
||||
RQSD_COLLECTION_ID: "1116"
|
||||
prd:
|
||||
BUCKET: "mrds_inbox_prd"
|
||||
CERTIFICATE_KEY: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyahmv5sopfsv7nytxdyycehoyl5pd7sz5t2drn27qaneta"
|
||||
CERTIFICATE_FILE: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyame3chyqs6cdl2igeyrvzpj3s4vrndhbgeayt533uhgqa"
|
||||
CASPER_URL: "https://internet.api.casper.prd.aws.ecb.de"
|
||||
RQSD_COLLECTION_ID: "1030"
|
||||
|
||||
|
||||
25
python/connectors/devo/config/env_config.yaml
Normal file
25
python/connectors/devo/config/env_config.yaml
Normal file
@@ -0,0 +1,25 @@
|
||||
# Environment Configuration
|
||||
dev:
|
||||
BUCKET_NAMESPACE: "frcnomajoc7v"
|
||||
BUCKET: "mrds_inbox_dev"
|
||||
DEVO_USERNAME: "ap-informatica-ipcwt"
|
||||
DEVO_HOSTNAME: "impala-proxy-devo-lab21-impala01.dw-devo-lab21.om2y56.b0.cloudera.site"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyavrevwxke46wjgj5nz3cc5kwwsybmngbji4zepones55q"
|
||||
tst:
|
||||
BUCKET_NAMESPACE: "frcnomajoc7v"
|
||||
BUCKET: "mrds_inbox_tst"
|
||||
DEVO_USERNAME: "ap-informatica-ipcwt"
|
||||
DEVO_HOSTNAME: "t-impala.devo.escb.eu"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyaxxx7yfifpgpdnxuj6dcowpoktwa6745kwwpezysd44oa"
|
||||
acc:
|
||||
BUCKET_NAMESPACE: "frcnomajoc7v"
|
||||
BUCKET: "mrds_inbox_acc"
|
||||
DEVO_USERNAME: "ap-informatica-ipcwa"
|
||||
DEVO_HOSTNAME: "impala-proxy-devo-acc21-impala01.dw-devo-acc21.inym23.b0.cloudera.site"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya4uttfadlzreloouw2e5bifgl2dvihffym5xoq3b3jmva"
|
||||
prd:
|
||||
BUCKET_NAMESPACE: "frcnomajoc7v"
|
||||
BUCKET: "mrds_inbox_prd"
|
||||
DEVO_USERNAME: "ap-informatica-ipcwp"
|
||||
DEVO_HOSTNAME: "impala-proxy-devo-prd21-impala01.dw-devo-prd21.inym23.b0.cloudera.site"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyanbahqlucid7qtzvoohsf4xrlul7cvhlsqttmbro4n66a"
|
||||
25
python/connectors/devo/config/env_config_rqsd.yaml
Normal file
25
python/connectors/devo/config/env_config_rqsd.yaml
Normal file
@@ -0,0 +1,25 @@
|
||||
# Environment Configuration
|
||||
dev:
|
||||
BUCKET_NAMESPACE: "frcnomajoc7v"
|
||||
BUCKET: "mrds_inbox_dev"
|
||||
DEVO_USERNAME: "ap-devo-rqsd-tst"
|
||||
DEVO_HOSTNAME: "t-impala.devo.escb.eu"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyap6wtzobzob7qizvk4nocszlcaxhwijgzejbvryt3uzbq"
|
||||
tst:
|
||||
BUCKET_NAMESPACE: "frcnomajoc7v"
|
||||
BUCKET: "mrds_inbox_tst"
|
||||
DEVO_USERNAME: "ap-devo-rqsd-tst"
|
||||
DEVO_HOSTNAME: "t-impala.devo.escb.eu"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyap6wtzobzob7qizvk4nocszlcaxhwijgzejbvryt3uzbq"
|
||||
acc:
|
||||
BUCKET_NAMESPACE: "frcnomajoc7v"
|
||||
BUCKET: "mrds_inbox_acc"
|
||||
DEVO_USERNAME: "ap-devo-rqsd-acc"
|
||||
DEVO_HOSTNAME: "impala-proxy-devo-acc21-impala01.dw-devo-acc21.inym23.b0.cloudera.site"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyamzhgatnso57mubvg3c6k4ens3orcx4dieo6efukuvm4a"
|
||||
prd:
|
||||
BUCKET_NAMESPACE: "frcnomajoc7v"
|
||||
BUCKET: "mrds_inbox_prd"
|
||||
DEVO_USERNAME: "ap-devo-rqsd-prd"
|
||||
DEVO_HOSTNAME: "impala-proxy-devo-prd21-impala01.dw-devo-prd21.inym23.b0.cloudera.site"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyawpahgevgxv6csqnwil3p37vi6pthl466onnkg6k7undq"
|
||||
259
python/connectors/devo/devo_connector.py
Normal file
259
python/connectors/devo/devo_connector.py
Normal file
@@ -0,0 +1,259 @@
|
||||
# devo_connector.py
|
||||
|
||||
import os
|
||||
import io
|
||||
import yaml
|
||||
import datetime
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import pandas as pd
|
||||
from mrds.utils.secrets import get_secret
|
||||
import mrds.utils.manage_runs as runManager
|
||||
import mrds.utils.objectstore as objectstore
|
||||
|
||||
import oci
|
||||
|
||||
from impala.dbapi import (
|
||||
connect,
|
||||
ProgrammingError,
|
||||
DatabaseError,
|
||||
IntegrityError,
|
||||
OperationalError,
|
||||
)
|
||||
from impala.error import HiveServer2Error
|
||||
|
||||
TASK_HISTORY_MULTIPLIER = 1_000_000_000
|
||||
|
||||
|
||||
class DevoConnector:
    """
    Export the result of an Impala (DEVO) query to OCI Object Storage as CSV,
    while recording task run metadata via mrds runManager.

    Usage:
        exporter = DevoConnector(
            flow_config_path="/path/to/flow.yaml",
            env_config_path="/path/to/env.yaml",
            env="dev",
            logger=my_logger,              # optional
            oci_client=my_object_storage,  # optional ObjectStorageClient
            oci_signer=my_signer,          # optional signer (used if client not provided)
        )
        exporter.run({"run_id": 34, "a_workflow_history_key": 6})
    """

    def __init__(
        self,
        flow_config_path: str,
        env_config_path: str,
        env: str,
        logger: Optional[logging.Logger] = None,
        oci_client: Optional[oci.object_storage.ObjectStorageClient] = None,
        oci_signer: Optional[Any] = None,
    ) -> None:
        """Load flow/env configuration and set up logging and OCI handles.

        Raises FileNotFoundError for a missing config file and KeyError when
        `env` is not present in the environment config.
        """
        self.flow_info = self._initialize_config(flow_config_path)
        envs_info = self._initialize_config(env_config_path)
        # Tenancy namespace may be overridden from the process environment.
        bucket_namespace = os.getenv("BUCKET_NAMESPACE", "frcnomajoc7v")

        if env not in envs_info:
            raise KeyError(f"Environment '{env}' not found in {env_config_path}")
        self.environment_info = envs_info[env]
        self.environment_info["BUCKET_NAMESPACE"] = bucket_namespace
        self.env = env

        # logging: fall back to a stream logger named after the task
        self.logger = logger or self._default_logger(self.flow_info.get("TASK_NAME", "devo_task"))

        # OCI client/signer. NOTE(review): currently unused —
        # _upload_dataframe_to_oci obtains its own client via
        # mrds.utils.objectstore.get_client(); kept for API compatibility.
        self.oci_client = oci_client
        self.oci_signer = oci_signer

    # -------------------------
    # Public API
    # -------------------------

    def run(self, workflow_context: Dict[str, Any]) -> int:
        """Execute the configured query, upload the CSV, and finalize the task.

        Returns the exported row count (0 when the query yielded no rows).
        On failure the task is finalised with 'N' and the error re-raised.
        """
        task_name = self.flow_info["TASK_NAME"]
        a_task_history_key = self._initialize_task(workflow_context, task_name)

        try:
            # credentials
            devo_secret_name = self.environment_info["DEVO_SECRET"]
            password = get_secret(devo_secret_name)
            self.logger.info("Retrieved secret for DEVO connection.")

            # query
            query = self.flow_info["DEVO_QUERY"]
            user = self.environment_info["DEVO_USERNAME"]
            host = self.environment_info["DEVO_HOSTNAME"]

            columns, data, rowcount = self._execute_query(query=query, user=user, hostname=host, password=password)
            df = self._tuple_to_dataframe((columns, data))
            self.logger.info("Query executed and DataFrame created with %d rows.", len(df))

            if rowcount > 0:
                csv_name = f"{self.flow_info['OUTPUT_TABLE']}.csv"
                file_path = self._compose_object_path(self.flow_info["ODS_PREFIX"], csv_name)
                self._upload_dataframe_to_oci(df, csv_name, file_path)
                self.logger.info("Finished uploading %s to %s.", csv_name, file_path)
            else:
                # Bug fix: the empty-result path previously returned without
                # finalising the task, leaving the run open in the history.
                self.logger.info("Query returned no rows; nothing to upload.")
                runManager.finalise_task(a_task_history_key, "Y")
                return 0

            # success
            runManager.finalise_task(a_task_history_key, "Y")
            self.logger.info("Task %s finalized successfully.", task_name)
            return rowcount

        except Exception as e:
            # failure
            self.logger.exception("Run failed: %s", e)
            try:
                runManager.finalise_task(a_task_history_key, "N")
            finally:
                # re-raise for upstream handling if used as a library
                raise

    # -------------------------
    # Impala / DEVO
    # -------------------------

    @staticmethod
    def _get_impala_connection(hostname: str, user: str, secret: str):
        """Open an Impala connection over HTTPS (HiveServer2 HTTP transport)."""
        return connect(
            host=hostname,
            port=443,
            auth_mechanism="PLAIN",
            user=user,
            password=secret,
            use_http_transport=True,
            http_path="cliservice",
            use_ssl=True,
        )

    def _execute_query(
        self, query: str, user: str, hostname: str, password: str
    ) -> Tuple[List[str], List[List[Any]], int]:
        """Run `query` and return (columns, rows, rowcount).

        For non-SELECT statements, `columns` is empty and `rows` wraps the
        affected rowcount. Driver exceptions are translated into builtin
        exception types.
        """
        conn = self._get_impala_connection(hostname, user, password)
        cursor = None
        self.logger.info("Executing Impala query against host '%s' as user '%s'.", hostname, user)
        try:
            cursor = conn.cursor()
            cursor.execute(query)

            # NOTE(review): the `"select" in query` test is overbroad — any
            # statement containing the word 'select' (e.g. INSERT ... SELECT)
            # will attempt a fetch. Kept for compatibility; confirm intent.
            if query.strip().lower().startswith("select") or "select" in query.strip().lower():
                rows = cursor.fetchall()
                columns = [col[0] for col in cursor.description]
                return columns, rows, cursor.rowcount
            else:
                # Bug fix: this branch previously returned a 2-tuple while
                # run() unpacks three values; include rowcount explicitly.
                return [], [[cursor.rowcount]], cursor.rowcount

        except OperationalError as oe:
            raise RuntimeError("Failed to connect to Impala: " + str(oe)) from oe
        except ProgrammingError as pe:
            raise ValueError("Query syntax error: " + str(pe)) from pe
        except IntegrityError as ie:
            raise PermissionError("Insufficient permissions: " + str(ie)) from ie
        except DatabaseError as db_err:
            raise RuntimeError("Database error: " + str(db_err)) from db_err
        except HiveServer2Error as au_err:
            raise PermissionError("HiveServer2Error error: " + str(au_err)) from au_err
        except Exception as e:
            raise RuntimeError("An unexpected error occurred: " + str(e)) from e
        finally:
            try:
                if cursor:
                    cursor.close()
            finally:
                try:
                    conn.close()
                except Exception:
                    # log but don't mask the original exception
                    self.logger.warning("Failed to close Impala connection cleanly.", exc_info=True)

    # -------------------------
    # OCI Upload
    # -------------------------

    def _upload_dataframe_to_oci(self, df: pd.DataFrame, csv_name: str, object_path: str) -> None:
        """Serialize `df` as CSV (no index) and upload it to the configured bucket."""
        namespace = self.environment_info["BUCKET_NAMESPACE"]
        bucket = self.environment_info["BUCKET"]
        # convert DataFrame to CSV bytes without index
        csv_bytes = df.to_csv(index=False).encode("utf-8")
        client = objectstore.get_client()
        client.put_object(namespace, bucket, object_path, csv_bytes)
        self.logger.info("CSV '%s' uploaded to bucket '%s' (ns: '%s', key: '%s').", csv_name, bucket, namespace, object_path)

    # -------------------------
    # Utilities
    # -------------------------

    @staticmethod
    def _tuple_to_dataframe(data_tuple: Tuple[List[str], List[List[Any]]]) -> pd.DataFrame:
        """Build a DataFrame from (columns, rows); non-SELECT results become a
        single 'rowcount' column."""
        columns, data = data_tuple
        if not columns:
            # for non-SELECT queries we returned rowcount; represent it in a DataFrame
            return pd.DataFrame(data, columns=["rowcount"])
        return pd.DataFrame(data, columns=columns)

    @staticmethod
    def _initialize_config(config_file_path: str) -> Dict[str, Any]:
        """Load a YAML config file; raise FileNotFoundError when missing."""
        if not os.path.exists(config_file_path):
            raise FileNotFoundError(f"Configuration file {config_file_path} not found.")
        with open(config_file_path, "r") as f:
            return yaml.safe_load(f)

    @staticmethod
    def _initialize_task(workflow_context: Dict[str, Any], task_name: str) -> int:
        """Register the task run and return its task-history key."""
        return runManager.init_task(
            task_name,
            workflow_context["run_id"],
            workflow_context["a_workflow_history_key"],
        )

    @staticmethod
    def add_a_key_column(headers: List[str], data_rows: List[List[Any]], task_history_key: int) -> None:
        """Optionally add an A_KEY column (kept for parity with original script)."""
        headers.insert(0, "A_KEY")
        for i, row in enumerate(data_rows, start=1):
            a_key_value = int(task_history_key) * TASK_HISTORY_MULTIPLIER + i
            row.insert(0, str(a_key_value))

    @staticmethod
    def add_workflow_key_column(headers: List[str], data_rows: List[List[Any]], workflow_key: int) -> None:
        """Optionally add the workflow key column right after A_KEY if present, otherwise at position 0."""
        insert_idx = 1 if headers and headers[0] == "A_KEY" else 0
        headers.insert(insert_idx, "A_WORKFLOW_HISTORY_KEY")
        for row in data_rows:
            row.insert(insert_idx, workflow_key)

    @staticmethod
    def _compose_object_path(prefix: str, filename: str) -> str:
        """Join an object-name prefix and a file name with exactly one slash.

        Bug fix: the previous implementation ignored `filename` entirely and
        appended the literal string "(unknown)" to the prefix.
        """
        if prefix.endswith("/"):
            return f"{prefix}{filename}"
        return f"{prefix}/{filename}"

    @staticmethod
    def _default_logger(task_name: str) -> logging.Logger:
        """Create (once) a stderr INFO logger tagged with the task name."""
        logger = logging.getLogger(f"{task_name}_logger")
        if not logger.handlers:
            logger.setLevel(logging.INFO)
            handler = logging.StreamHandler()
            # (removed an unused timestamp variable that was computed here)
            fmt = logging.Formatter(f"%(asctime)s [{task_name}] %(levelname)s: %(message)s")
            handler.setFormatter(fmt)
            logger.addHandler(handler)
        return logger
||||
|
||||
|
||||
# Optional: quick-run convenience if you ever want to execute this module directly.
|
||||
# Optional: quick-run convenience if you ever want to execute this module directly.
if __name__ == "__main__":
    # Example only — adjust paths/env/context as needed or remove this block.
    # NOTE(review): these are developer-machine absolute paths; they will not
    # resolve elsewhere.
    exporter = DevoConnector(
        flow_config_path="/home/dbt/Marco/mrds_elt/airflow/ods/rqsd/rqsd_process/config/yaml/m_ODS_RQSD_OBSERVATIONS.yaml",
        env_config_path="/home/dbt/Marco/mrds_elt/python/connectors/devo/config/env_config_rqsd.yaml",
        env="dev",
    )
    exporter.run({"run_id": 34, "a_workflow_history_key": 6})
|
||||
294
python/connectors/tms/TMSDBT.py
Normal file
294
python/connectors/tms/TMSDBT.py
Normal file
@@ -0,0 +1,294 @@
|
||||
|
||||
|
||||
import argparse
|
||||
from TMSQuery import XMLQuery
|
||||
|
||||
import mrds.utils.objectstore
|
||||
import tempfile
|
||||
import re
|
||||
import csv
|
||||
from io import StringIO
|
||||
|
||||
import os.path
|
||||
import os, psutil
|
||||
import sys
|
||||
|
||||
|
||||
namespace = os.getenv("BUCKET_NAMESPACE", "frcnomajoc7v")
|
||||
|
||||
def memory_usage():
    """Return this process's resident set size (RSS) in GiB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / (1024 * 1024 * 1024)
|
||||
|
||||
|
||||
def protect_keyword(s):
    """Normalize a TMS column name into a safe identifier.

    Lowercases the name, replaces spaces with underscores, and appends an
    underscore to names that collide with the reserved words handled here
    ('comment', 'date', 'number').

    Fix: the original called .lower() a second time on an already-lowercased
    string before matching; the redundant call is removed (behavior unchanged).
    """
    s = s.lower().replace(' ', '_')
    if s in ('comment', 'date', 'number'):
        return s + '_'
    return s
|
||||
|
||||
|
||||
# Directory where generated dbt models are written, relative to this script.
cModelsDir = sys.path[0] + '/../dbt/mrds/models/ods/'
# Multiplier used to derive unique A_KEY values from the dataset id.
cDatasetMultiplier = 10000000


parser = argparse.ArgumentParser()
# Bug fix: 'create-ods-model' is handled below but was missing from the
# accepted choices, making that command unreachable from the CLI.
parser.add_argument(
    "command",
    choices=['create-model', 'create-ods-model', 'create-oracle-table', 'retrieve'],
    help="create-model retrieve",
)
parser.add_argument("-n", "--name", help="Name")
parser.add_argument("-u", "--url", required=True, help="URL of TMS service")
parser.add_argument("-U", "--user", required=True, help="TMS user")
parser.add_argument("-P", "--password", required=True, help="TMS password")
parser.add_argument("-x", "--xmlfile", help="XML file")
parser.add_argument("-l", "--layoutfile", help="layout file")
parser.add_argument("-f", "--format", help="output format")
parser.add_argument("-p", "--parameter", action="append", help="Parameter")
parser.add_argument("-c", "--column", action="append", help="Additional column")
parser.add_argument("-d", "--destination", help="destination")
parser.add_argument("-s", "--dataset", help="data set ID", type=int)
parser.add_argument("-v", "--version", help="data model version", type=int, default=1)
args = parser.parse_args()


query = XMLQuery()

if args.xmlfile:
    with open(args.xmlfile) as f:
        query.xml = f.read()

if args.layoutfile:
    with open(args.layoutfile) as f:
        query.layout = f.read()

if args.format:
    query.format = args.format

# Parameters arrive as NAME=VALUE strings; split on the first '=' only.
if args.parameter:
    for p in args.parameter:
        name, value = p.split('=', 1)
        query.parameter[name] = value

# Additional constant columns: "-c name[|type]=value" (separator may be
# '|', '/' or ':'); the type defaults to varchar2(255).
additional_columns = []
if args.column:
    for p in args.column:
        name, value = p.split('=', 1)
        parts = re.split(r'(?:\|)|(?:/)|(?::)', name, maxsplit=2)
        name = parts[0]
        col_type = parts[1] if len(parts) == 2 else None
        if not col_type:
            col_type = 'varchar2(255)'
        additional_columns.append((name, col_type, value))


query.normalize_output()

# Removed leftover debug code that dumped the serialized query to
# /tmp/kurt.xml (and its pathlib/pprint imports).
|
||||
|
||||
|
||||
if args.command == 'create-oracle-table':
    # Describe the query's result columns and emit CREATE TABLE DDL mapping
    # TMS column types to Oracle column types.

    d = query.describe(args.url, args.user, args. password)

    # Leading audit columns, then any user-supplied additional columns.
    columns = [" a_key number(38, 0)", "a_workflow_history_key number(38, 0)"]
    for c in additional_columns:
        columns.append("%s %s"%(c[0], c[1]))

    for col in d:
        name = protect_keyword(col[0])
        # TMS type -> Oracle type; types not listed here are silently skipped.
        match col[1]:
            case 'text':
                columns.append(name + " varchar2(512 char)")
            case 'int':
                columns.append(name + " number(38,0)")
            case 'money':
                columns.append(name + " number(19,4)")
            case 'floating':
                columns.append(name + " binary_double")
            case 'datetime':
                columns.append(name + " date")
            case 'integer':
                columns.append(name + " number(12, 0)")

    sql = "create table ct_et_templates." + args.name + " (\n"
    sql = sql + ",\n ".join(columns)
    sql = sql + "\n)\n"

    # '-' or no destination means stdout; otherwise write the DDL to a file.
    if not args.destination or args.destination == '-':
        print(sql)
    else:
        with open(args.destination, 'w') as f:
            f.write(sql)


elif args.command == 'create-ods-model':
    # Generate a dbt model for the query: a .yml schema file and a .sql
    # model with a post-hook creating the matching external table.
    # NOTE(review): this branch is unreachable while 'create-ods-model' is
    # missing from the argparse choices above — confirm the intended name.

    d = query.describe(args.url, args.user, args. password)

    file_name = cModelsDir + args.name + '.yml'
    f = open(file_name, 'w') # open file in append mode

    f.write('version: %d\n' % args.version)

    f.write('models:' + '\n')
    f.write(' - name: ' + args.name + '_dbt\n')
    f.write(' description: "A starter dbt model"' + '\n')
    f.write(' columns:' + '\n')
    for col in d:
        f.write(' - name: ' + col[0] + '\n')
        f.write(' data_type: ' + col[1] + '\n')
    f.close()

    # Emit the .sql model next.
    file_name = cModelsDir + args.name + '.sql'
    f = open(file_name, 'w') # open file in append mode

    # Derive the object-storage prefix from the destination ("bucket:path"),
    # defaulting to INBOX/TMS/<NAME>/.
    if args.destination and args.destination != '-':
        if ':' in args.destination:
            dest = args.destination.split(':', 2)
            path = dest[1]
        else:
            path = args.destination
        prefix = os.path.dirname(path)
    else:
        prefix = 'INBOX/TMS/' + args.name.upper() + '/'

    pars = "ptablename => '%s', ptemplatetablename => 'ou_tms.%s', pprefix => '%s'" % (args.name, args.name, prefix)
    print(f"creating table {args.name}")
    f.write('{{\n config(\n post_hook = "call ct_mrds.file_manager.create_external_table(%s)"\n )\n}}\n\n' % pars)
    f.write("{{ config(materialized='table') }}" + "\n")
    f.write('with source_data as (' + "\n")
    columns = []
    # Dummy casts fix the column types for dbt; the values are placeholders.
    columns.append("cast (1 as number(38,0)) as a_key")
    columns.append("cast (1 as number(38,0)) as a_workflow_history_key")
    for col in d:
        name = protect_keyword(col[0])
        match col[1]:
            case 'text':
                columns.append("cast ('x' as varchar2(255 char)) as " + name)
            case 'int':
                columns.append("cast (1 as number(38, 0)) as " + name)
            case 'money':
                columns.append("cast (1.0 as number(19,4)) as " + name)
            case 'floating':
                columns.append("cast (1.0 as binary_double) as " + name)
            case 'datetime':
                columns.append("cast (sysdate as date) as " + name)
            case 'integer':
                columns.append("cast (1 as number(12, 0)) as " + name)
    f.write(' select\n ' + ',\n '.join(columns) + '\n')
    f.write(')\nselect * from source_data\n ')
    f.close()


elif args.command == 'retrieve':
    # Execute the query and deliver the result to stdout, a local file,
    # or an object-storage destination of the form "bucket:path".
    ret = query.execute(args.url, args.user, args. password)

    if query.format in ('scsv', 'standard_csv') and args.dataset:

        # Save result to a temporary spooled file for further processing;
        # avoids holding two full copies of the data in memory at once.
        f = tempfile.SpooledTemporaryFile(mode = 'w+', max_size = 200*1024*1024)
        f.write(ret)
        del ret
        f.seek(0)

        # Replace newlines embedded inside CSV fields with '<br/>' so each
        # record occupies exactly one physical line.
        reader = csv.reader(f)

        sio = StringIO()

        writer = csv.writer(sio)
        for l in reader:
            l_tmp = [s.replace('\n', '<br/>') for s in l]
            writer.writerow(l_tmp)
        f.close()

        # Read the cleaned data back as a list of lines for further processing.
        sio.seek(0)
        lines_tmp = sio.readlines()
        del sio

        if not lines_tmp:
            ret = ""
        else:
            # Prepend artificial columns A_KEY and A_WORKFLOW_HISTORY_KEY and
            # any user-supplied constant columns.
            additional_headers = [t[0] for t in additional_columns]
            additional_values = [t[2] for t in additional_columns]
            headers = ['A_KEY','A_WORKFLOW_HISTORY_KEY'] + additional_headers + [protect_keyword(h) for h in lines_tmp[0].split(',')]
            lines = [','.join(headers) ]

            # A_KEY = dataset id * multiplier + row ordinal. The data line `l`
            # still carries its trailing newline, so none is appended here.
            i = 0
            for l in lines_tmp[1:]:
                lines.append(str(args.dataset*cDatasetMultiplier + i) + ',' + str(args.dataset) + ',' + ','.join(additional_values + [l]) )
                i += 1

            del lines_tmp

            # Spooling again to temporary file to avoid duplicating memory.
            f = tempfile.SpooledTemporaryFile(mode = 'w+', max_size = 200*1024*1024)
            f.writelines(lines)
            del lines
            f.seek(0)
            ret = f.read()
            f.close()

    if not args.destination or args.destination == '-':
        print(ret, end='')
    elif ':' not in args.destination:
        with open(args.destination, 'w') as f:
            f.write(ret)
    else:
        # "bucket:path" destination: stage to a local temp file, then upload.
        f = tempfile.NamedTemporaryFile(delete = False, mode = 'w', prefix = 'TMSDBT-', suffix = '.csv')
        f.write(ret)
        f.close()

        dest = args.destination.split(':', 2)
        bucket = dest[0]
        dirname = os.path.dirname(dest[1])
        filename = os.path.basename(dest[1])
        client = mrds.utils.objectstore.get_client()
        # NOTE(review): this prints the entire staged file to stdout — looks
        # like leftover debug output; consider removing.
        with open(f.name, "r") as file:
            print(file.read())
        mrds.utils.objectstore.upload_file(client, f.name,namespace, bucket, dirname, filename)

        os.remove(f.name)

# NOTE(review): `ret` is only assigned in the 'retrieve' branch; for the
# create-* commands this raises NameError — confirm the intended exit logic.
if ret:
    sys.exit(0)
else:
    sys.exit(1)
|
||||
|
||||
|
||||
197
python/connectors/tms/TMSQuery.py
Normal file
197
python/connectors/tms/TMSQuery.py
Normal file
@@ -0,0 +1,197 @@
|
||||
|
||||
import xml.etree.ElementTree as ET
|
||||
import re
|
||||
import base64
|
||||
import sys
|
||||
|
||||
|
||||
class XMLQuery:
    """A TMS 'report-generator' query: format + layout + named parameters.

    The query serialises to the XML document the TMS report endpoint expects
    (see __str__) and can be reconstructed from such a document via
    ``XMLQuery(xml)``.  Attribute access is virtualised: public names
    (``format``, ``layout``, ``parameter``, ``xml``) are mapped onto
    underscore-prefixed storage by __setattr__/__getattr__, with validation
    (format whitelist) and normalisation (layout is always newline-terminated)
    applied on assignment.
    """

    def __init__(self, xml=None):
        """Create an empty query, or parse one from an XML string."""
        self._format = 'xml'
        self._layout = ''
        self._parameter = {}

        if xml:
            self._parse_xml(xml)

    def _parse_xml(self, xml):
        """Populate format/layout/parameters from a report-generator XML doc."""
        self._tree = ET.fromstring(xml)

        # The layout travels base64-encoded inside the <layout> element.
        layout_b64 = self._tree.find('layout').text
        self._layout = base64.b64decode(layout_b64).decode('utf-8')

        self._format = self._tree.find('format').get('type')

        self._parameter = {}
        for p in self._tree.findall('parameters/parameter'):
            self._parameter[p.get('name')] = p.text

    def execute(self, url, user, password):
        """POST the query to *url* with HTTP basic auth.

        Returns the response body as utf-8 text on HTTP 200, else None.
        """
        # Imported lazily so the class remains usable (parse/serialise)
        # without the 'requests' package installed.
        import requests
        from requests.auth import HTTPBasicAuth

        data = str(self)
        basic = HTTPBasicAuth(user, password)

        # NOTE(review): verify=False disables TLS certificate validation;
        # acceptable only for trusted internal endpoints — prefer a CA bundle.
        response = requests.post(url, data=data, auth=basic, verify=False)

        if response.status_code == 200:
            response.encoding = "utf-8"
            return response.text
        return None

    @staticmethod
    def _columns_of(tree):
        """Extract (name, type) pairs from the first <PlainRow> element.

        Columns typed 'unknown' are reported as 'integer'.
        """
        cols = []
        row = tree.find('PlainRow')
        for c in row.findall('Column'):
            name = c.text
            ctype = c.get('type')
            if ctype == 'unknown':
                ctype = 'integer'
            cols.append((name, ctype))
        return cols

    def describe(self, url, user, password):
        """Run the query in 'xml' format and return its column descriptions.

        Returns a list of (name, type) tuples.
        """
        orig_format = self.format
        self.format = 'xml'
        try:
            ret = self.execute(url, user, password)
        finally:
            # Bug fix: the original saved orig_format but never restored it,
            # leaving the query stuck in 'xml' format after describe().
            self.format = orig_format

        # Keep only the prologue up to the first PlainRow and close the
        # document, so a large result body need not be fully parsed.
        # (Raw string: the original used non-raw '\<' escapes, which are
        # invalid escape sequences in modern Python.)
        m = re.match(r'^.*?<PlainRow>.*?</PlainRow>', ret, re.DOTALL)
        s = m[0] + '\n</report-generator>'

        tree = ET.fromstring(s)
        return self._columns_of(tree)

    @staticmethod
    def describe_simple(url, user, password, xml):
        """Build a query from *xml*, run it, and return column descriptions."""
        query = XMLQuery(xml)

        query.format = 'xml'

        ret = query.execute(url=url, user=user, password=password)

        tree = ET.fromstring(ret)
        return XMLQuery._columns_of(tree)

    def normalize_output(self, date_format='dd/MM/yyyy', time_format='HH:mm:ss'):
        """Force deterministic date/time/number formatting in the layout."""
        lines = self.layout.splitlines()

        lines = [re.sub(r'^date_format\s*=.*', 'date_format=' + date_format, l) for l in lines]
        lines = [re.sub(r'^time_format\s*=.*', 'time_format=' + time_format, l) for l in lines]
        lines = [re.sub(r'^NoNumberFormatting\s*=.*', 'NoNumberFormatting=1', l) for l in lines]

        self.layout = '\n'.join(lines)

    def __setattr__(self, name, value):
        # Validate the public 'format' attribute against the known set.
        if name == 'format' and value not in ('bin', 'xml', 'xml3', 'html', 'txt', 'csv', 'standard_csv', 'scsv', 'pdf'):
            raise Exception("Invalid report format '" + value + "'")

        # Public names are stored under a leading underscore.
        if not name.startswith('_'):
            name = '_' + name

        # Layouts are newline-terminated by construction.
        if name == '_layout' and not value.endswith('\n'):
            value = value + '\n'

        # Assigning to 'xml' re-parses the whole query instead of storing it.
        if name == '_xml':
            self._parse_xml(value)
            return

        # (The original wrapped this in try/except KeyError -> AttributeError;
        # dict item assignment never raises KeyError, so that was dead code.)
        self.__dict__[name] = value

    def __getattr__(self, name):
        # Mirror of __setattr__: public names resolve to their
        # underscore-prefixed storage slot.
        if not name.startswith('_'):
            name = '_' + name

        try:
            return self.__dict__[name]
        except KeyError:
            raise AttributeError(name)

    def __str__(self):
        """Serialise to the report-generator XML document TMS expects."""
        parameters = ''
        for k in self._parameter:
            parameters = parameters + "\n<parameter name='%s'>%s</parameter>" % (k, self._parameter[k])

        layout_b64 = base64.b64encode(self.layout.encode('utf-8')).decode('utf-8')
        return ('<?xml version="1.0" encoding="utf-8"?>\n' +
                '<report-generator>\n' +
                ' <format type="%s"/>\n' +
                ' <layout>\n%s</layout>\n' +
                ' <parameters>%s\n</parameters>' +
                '</report-generator>') % (self._format, layout_b64, parameters)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Demo driver: read a query definition from the path given on the command
    # line, show the layout before and after normalization, then describe the
    # result columns against a live TMS endpoint.
    layout_path = sys.argv[1]
    print(layout_path)

    with open(layout_path) as handle:
        xml_text = handle.read()

    query = XMLQuery(xml_text)

    print(query.layout)
    query.normalize_output()
    print(query.layout)

    # NOTE(review): hard-coded endpoint and credentials below — these should
    # come from configuration or the environment, not source code.
    desc = XMLQuery.describe_simple(url = 'https://tmsxd104.ecbt1.tadnet.net:9443/report/', user = 'schilli', password = 'chili03', xml = xml_text)

    print(str(desc))
|
||||
355
python/connectors/tms/sample_DAG.py
Normal file
355
python/connectors/tms/sample_DAG.py
Normal file
@@ -0,0 +1,355 @@
|
||||
"""
|
||||
DAG: w_ODS_TMS_TRANSACTION (expanded example)
|
||||
Purpose:
|
||||
- Load layout+parameter metadata from TMS-layouts/w_ODS_TMS_TRANSACTION.yml
|
||||
- Call connectors/tms/TMSDBT.py to retrieve data into CSV in object storage
|
||||
- On first run, generate Oracle DDL and create an external table
|
||||
- Process file and record status in MRDS workflow tables
|
||||
Notes:
|
||||
- This is an expanded, readable version of the factory-generated DAG.
|
||||
- Replace paths/usernames/password references as appropriate.
|
||||
"""
|
||||
|
||||
import copy
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import traceback
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
from airflow import DAG
|
||||
from airflow.operators.python import PythonOperator
|
||||
from pytz import timezone
|
||||
|
||||
# --- Project-specific deps (must exist in your Airflow image) ---
|
||||
from mrds.core import main # noqa: F401 # imported to mirror the factory env
|
||||
import mrds.utils.manage_files as mf
|
||||
import mrds.utils.manage_runs as mr
|
||||
|
||||
# ---------- Paths & constants ----------
# Directory containing this DAG file; the "./_" fallback covers contexts
# where __file__ is not defined (e.g. exec'd code).
gScriptDir = Path(globals().get("__file__", "./_")).absolute().parent
gDataDir = str(gScriptDir / "TMS-layouts") + "/"    # layout YAML / .fkr files
gConfigDir = str(gScriptDir / "config")             # TMS.yml connection config
gConnDir = "/opt/airflow/python/connectors/tms/"    # connector scripts (TMSDBT.py)
gTableDir = str(gScriptDir / "TMS-tables") + "/"    # generated Oracle DDL output

DAG_NAME = "w_ODS_TMS_TRANSACTION"
ODS_TABLE = DAG_NAME     # target ODS table shares the DAG's name
DATABASE_NAME = "ODS"
WF_NAME = DAG_NAME       # workflow name shares the DAG's name

# Airflow task defaults. retries=0: a failed load is finalised 'N' in the
# workflow tables and must be re-triggered manually.
default_args = {
    "owner": "ecb",
    "depends_on_past": False,
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 0,
    "execution_timeout": timedelta(minutes=60),
    "retry_delay": timedelta(minutes=5),
}

# ---------- Load YAML configs once on parse ----------
# NOTE: this runs at DAG-parse time, so a missing or malformed YAML file
# breaks DAG import, not just task execution.
with open(gDataDir + DAG_NAME + ".yml", "r") as f:
    report_desc = yaml.safe_load(f) or {}

with open(gConfigDir + "/TMS.yml", "r") as f:
    tms_config = yaml.safe_load(f)

# TMS + storage config
tms_url = tms_config["TMS-URL"]
tms_user = tms_config["TMS-user"]
tms_pwd = tms_config["TMS-password"]
# Object-store landing prefix: <dest-prefix>/<dag>/<dag>/
prefix = tms_config["dest-prefix"] + DAG_NAME + "/" + DAG_NAME + "/"
data_prefix = tms_config["data-prefix"] + DAG_NAME + "/"
# "bucket:path" destination string understood by the connector's -d switch.
dest = tms_config["dest-bucket"] + ":" + prefix

# Visible vs hidden params (from layout YAML). Visible params are exposed on
# the DAG trigger form; hidden ones are merged back in inside the task.
params_visible = {}
params_hidden = {}
params_dict = report_desc.get("parameters") or {}
for p, meta in params_dict.items():
    val = meta.get("value", None)
    if not meta.get("hidden", False):
        params_visible[p] = val
    else:
        params_hidden[p] = val
|
||||
# ---------- Helpers (parameter handling) ----------
|
||||
def _enum_param_combinations_recursive(params, keys):
    """
    Build all combinations of params (cartesian product), supporting
    'column(<name>)' derived lists aligned by index.

    Returns a list of rows, each row a list of (key, value) tuples.
    Falsy values (None, [], {}) are skipped when choosing the current key,
    except the empty string "" which counts as a real value.

    NOTE(review): reads the module-level ``params_dict`` for derived
    'column(...)' definitions, so this is only callable in DAG scope.
    """
    k = None
    result = []
    keys = list(keys)  # safe copy

    # Advance to the first key with a usable value ("" counts as usable).
    while keys:
        k = keys.pop(0)
        v = params[k]
        if v or v == "":
            break

    # No keys at all -> nothing to enumerate.
    # NOTE(review): if the loop exhausts without finding a usable value,
    # k is the last key and its falsy v is still expanded below — confirm
    # this is intended.
    if not k:
        return []

    v = v if isinstance(v, list) else [v]

    # derived columns aligned with v (same length)
    derived_columns = []
    # params_dict[k] holds the definition, not just the value
    pdef = params_dict.get(k, {})
    for c in list(pdef):
        if re.match(r"column\(.*\)$", c):
            vtmp = pdef[c]
            vtmp = vtmp if isinstance(vtmp, list) else [vtmp]
            derived_columns.append((c, vtmp))

    # Base case: this was the last usable key -> one row per value, with
    # index-aligned derived columns attached.
    if not keys:
        for i, value in enumerate(v):
            row = [(k, value)]
            for col_key, aligned_values in derived_columns:
                row.append((col_key, aligned_values[i]))
            result.append(row)
        return result

    # Recursive case: extend each combination of the remaining keys with
    # every value of the current key.
    # NOTE(review): if the recursion returns [] (remaining keys all falsy),
    # the current key's values are silently dropped — confirm intended.
    combinations = _enum_param_combinations_recursive(params, keys)
    for row in combinations:
        for i, vtmp in enumerate(v):
            new_row = copy.deepcopy(row)
            new_row.append((k, vtmp))
            for col_key, aligned_values in derived_columns:
                new_row.append((col_key, aligned_values[i]))
            result.append(new_row)

    return result
|
||||
|
||||
|
||||
def _enum_param_combinations(params, sequential=False):
    """Enumerate all parameter combinations (cartesian product over *params*).

    The ``sequential`` flag is accepted for interface compatibility only:
    the sequential path was omitted (buggy in factory; not used there either).
    """
    all_keys = list(params)
    return _enum_param_combinations_recursive(params, all_keys)
|
||||
|
||||
|
||||
def _allowed_select(table, expression, condition="1 = 1"):
    """
    Guarded select used by eval_params(select(...)).
    Whitelist tables to avoid arbitrary reads.
    """
    permitted = {
        ODS_TABLE.upper(),
        "DUAL",
        "CT_MRDS.A_WORKFLOW_HISTORY",
    }
    if table.upper() not in permitted:
        raise Exception(f"Not allowed to select from {table}")
    rows = mr.select_ods_tab(table, expression, condition)
    return rows[0]
|
||||
|
||||
|
||||
def _eval_param(v):
|
||||
"""
|
||||
Evaluate special functional values:
|
||||
- select(...) => guarded DB helper above
|
||||
- eval(...) => strongly discouraged; keep disabled or restricted
|
||||
"""
|
||||
s = str(v) if v is not None else ""
|
||||
if re.match(r"\s*select\(.*\)", s):
|
||||
# Expose only 'select' symbol to eval
|
||||
return eval(s, {"select": _allowed_select}, {})
|
||||
if re.match(r"\s*eval\(.*\)\s*$", s):
|
||||
# If you really must support eval, strictly sandbox or remove this path.
|
||||
raise ValueError("eval(...) not allowed in this hardened DAG.")
|
||||
return v
|
||||
|
||||
|
||||
def _finalize_param_list(param_list):
    """
    Apply replacements and drop virtual params according to YAML definitions.

    *param_list* is a list of (name, value) tuples; returns the same shape.
    """
    merged = dict(param_list)

    # Token substitution: a parameter's value may be spliced into another
    # parameter's string value, as configured via 'replace_parameter'.
    for pname, meta in params_dict.items():
        target = meta.get("replace_parameter")
        if not target:
            continue
        if target in merged and pname in merged and isinstance(merged[target], str):
            merged[target] = merged[target].replace(pname, str(merged[pname]))

    # Keep only non-virtual parameters.
    return [
        (key, val)
        for key, val in merged.items()
        if not params_dict.get(key, {}).get("virtual", False)
    ]
|
||||
|
||||
|
||||
# ---------- Core work ----------
|
||||
def execute_report(**context):
    """
    For each parameter combination:
      - create workflow key
      - call TMSDBT.py retrieve to land CSV
      - if first time, create Oracle table from generated DDL
      - process file, finalize workflow Y/N

    Airflow PythonOperator entry point; *context* must contain "dag_run".
    Connector exit codes: 0 = data landed, 1 = no data (workflow finalised
    'Y' and skipped), anything else = failure.
    """
    logger = logging.getLogger("airflow.task")
    logger.setLevel(logging.DEBUG)

    run_id = context["dag_run"].run_id
    # Hidden params are merged back in here; the DAG only exposes the
    # visible ones on the trigger form.
    all_params = {**params_visible, **params_hidden}

    # 1) Compute combinations
    combos = _enum_param_combinations(all_params)

    # 2) Evaluate select(...) etc and finalize
    evaluated = []
    for combo in combos or [[]]:
        # first pass: special evaluations
        pair_list = []
        for k, v in combo:
            pair_list.append((k, _eval_param(v)))
        # second pass: replacements + pruning
        evaluated.append(_finalize_param_list(pair_list))

    # if no combos at all, ensure we run once
    if not evaluated:
        evaluated = [[]]

    # Timing + workflow. One timestamp shared by all combinations of this run.
    ts = "{:%Y%m%d_%H%M%S}".format(datetime.now(timezone("Europe/Berlin")))

    for idx, param_list in enumerate(evaluated, start=1):
        # One workflow record per parameter combination.
        wf_key = mr.init_workflow(DATABASE_NAME, WF_NAME, run_id)
        file_name = f"{WF_NAME}.{wf_key}.{ts}.csv"

        try:
            # Build connector command safely (no shell quoting games)
            cmd = [
                sys.executable,  # 'python'
                os.path.join(gConnDir, "TMSDBT.py"),
                "retrieve",
                "--name", WF_NAME,
                "--url", tms_url,
                "-U", tms_user,
                "--password", tms_pwd,
                "--layoutfile", gDataDir + DAG_NAME + ".fkr",
                "-f", "scsv",
                "--dataset", str(wf_key),
                "-d", dest + file_name,
            ]

            # Map params to -p or -c switches
            for k, v in param_list:
                sval = "" if v is None else str(v).rstrip()
                m = re.match(r"column\((.*)\)$", k)
                if m:
                    # 'column(NAME)' entries become added output columns (-c).
                    cmd.extend(["-c", f'{m.group(1)}={sval}'])
                else:
                    cmd.extend(["-p", f"{k}={sval}"])
                    # NOTE(review): properties are recorded only for plain -p
                    # parameters, not column(...) switches — confirm intended.
                    mr.set_workflow_property(wf_key, DATABASE_NAME, k, sval)

            logger.debug("Running connector: %s", json.dumps(cmd))
            res = subprocess.run(cmd, capture_output=True, check=False)
            logger.debug("stdout: %s", res.stdout.decode(errors="ignore"))
            logger.debug("stderr: %s", res.stderr.decode(errors="ignore"))

            if res.returncode is None:
                raise RuntimeError("Connector returned no status")
            if res.returncode == 1:
                # rc=1 is the connector's "no data" signal, not an error.
                logger.info("No data returned for wf_key=%s (continuing)", wf_key)
                mr.finalise_workflow(wf_key, "Y")
                continue
            if res.returncode != 0:
                raise RuntimeError(f"Connector failed (rc={res.returncode})")

            # Data landed -> ensure source config exists, bootstrap table if needed
            # (ODS_TABLE is a module constant, so this f-string interpolation
            # does not take untrusted input).
            cfg = mf.execute_query(
                "select * from CT_MRDS.A_SOURCE_FILE_CONFIG "
                f"where a_source_key = 'TMS' and table_id = '{ODS_TABLE}'"
            )

            if not cfg:
                # First run for this table: generate DDL file via the connector.
                ddl_cmd = [
                    sys.executable,
                    os.path.join(gConnDir, "TMSDBT.py"),
                    "create-oracle-table",
                    "--name", WF_NAME,
                    "--url", tms_url,
                    "-U", tms_user,
                    "--password", tms_pwd,
                    "--layoutfile", gDataDir + DAG_NAME + ".fkr",
                    "-d", gTableDir + WF_NAME + ".sql",
                ]
                # Same -p/-c mapping as the retrieve command above.
                for k, v in param_list:
                    sval = "" if v is None else str(v).rstrip()
                    m = re.match(r"column\((.*)\)$", k)
                    if m:
                        ddl_cmd.extend(["-c", f'{m.group(1)}={sval}'])
                    else:
                        ddl_cmd.extend(["-p", f"{k}={sval}"])

                logger.debug("Generating DDL: %s", json.dumps(ddl_cmd))
                # check=True: a DDL-generation failure aborts this combination.
                ddl_res = subprocess.run(ddl_cmd, capture_output=True, check=True)
                logger.debug("DDL stdout: %s", ddl_res.stdout.decode(errors="ignore"))
                logger.debug("DDL stderr: %s", ddl_res.stderr.decode(errors="ignore"))

                # Execute DDL and create external table
                sql = Path(gTableDir + WF_NAME + ".sql").read_text()
                mf.execute(sql)
                mf.add_column_date_format(
                    f"CT_ET_TEMPLATES.{ODS_TABLE}", "DEFAULT", "DD/MM/YYYY HH24:MI:SS"
                )
                mf.create_external_table(ODS_TABLE, f"CT_ET_TEMPLATES.{ODS_TABLE}", data_prefix)
                mf.add_source_file_config(
                    "TMS",
                    "INPUT",
                    DAG_NAME,
                    DAG_NAME,
                    r".*\.csv",
                    ODS_TABLE,
                    f"CT_ET_TEMPLATES.{ODS_TABLE}",
                )

            # Process landed file (register, move, etc. as per your mf impl)
            mf.process_source_file(prefix, file_name)
            mr.finalise_workflow(wf_key, "Y")

        except BaseException as ex:
            # rich error logging, then mark workflow failed and re-raise
            # (BaseException so even task kills/timeouts finalise the
            # workflow record as 'N' before propagating).
            ex_type, ex_value, ex_tb = sys.exc_info()
            tb = traceback.extract_tb(ex_tb)
            stack = [
                f"File: {t[0]}, Line: {t[1]}, Func: {t[2]}, Code: {t[3]}"
                for t in tb
            ]
            logging.error("Exception type: %s", ex_type.__name__)
            logging.error("Exception message: %s", ex_value)
            logging.error("Stack trace: %s", stack)
            mr.finalise_workflow(wf_key, "N")
            raise
|
||||
|
||||
|
||||
# ---------- DAG definition ----------
with DAG(
    dag_id=DAG_NAME,
    default_args=default_args,
    description=DAG_NAME,
    schedule_interval=None,  # manual trigger
    params=params_visible,  # visible-only; hidden merged inside task
    start_date=datetime(2025, 1, 1),
    catchup=False,
    tags=[DAG_NAME],
) as dag:

    # Single task: retrieve + bootstrap + process (see execute_report).
    # NOTE(review): this 30-min task timeout is tighter than the 60-min
    # execution_timeout in default_args; the task-level value wins — confirm
    # that is intended.
    retrieve_report = PythonOperator(
        task_id="retrieve_report",
        python_callable=execute_report,
        execution_timeout=timedelta(minutes=30),
    )
|
||||
Reference in New Issue
Block a user