Grzegorz Michalski
2026-03-02 09:47:35 +01:00
commit 2c225d68ac
715 changed files with 130067 additions and 0 deletions


@@ -0,0 +1,201 @@
import requests
import io
import zipfile
import pandas as pd
import os
from datetime import datetime
import oci
from mrds.utils.secrets import get_secret
import mrds.utils.manage_runs as runManager
import mrds.utils.manage_files as fileManager
import mrds.utils.sql_statements as sqls
import sys
import yaml
TASK_HISTORY_MULTIPLIER = 1_000_000_000
def initialize_task(workflow_context, task_name):
# Initialize task
a_task_history_key = runManager.init_task(
task_name,
workflow_context["run_id"],
workflow_context["a_workflow_history_key"],
)
return a_task_history_key
def rqsd_parser(fileName, bucket_path, file, bucket_name):
    if "SCOPA" in fileName or "SCOPF" in fileName:
        print("SCOP")
        annex_1_1(fileName, bucket_path, file, bucket_name)
        annex_1_2(fileName, bucket_path, file, bucket_name)
    elif "RQSDC" in fileName:
        print("RQSDC")
        return annex_2(fileName, bucket_path, file, bucket_name)
def annex_1_1(fileName, bucket_path, file, bucket_name):
    fileData = fileName.split("_")
    csv_file_path = fileName[:-4] + ".csv"
    version_number = fileData[6]
    ref_exercise = fileData[2]
    ncb = fileData[4]
df = pd.read_excel(file, sheet_name="Counterparties in scope", skiprows=3)
df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')
df['file_name'] = os.path.basename(fileName)
df['ingestion_timestamp'] = datetime.now().isoformat()
df['version_number'] = version_number
df['ref_exercise'] = ref_exercise
df['ncb'] = ncb
    # Upload via instance-principals auth (the empty dict is an intentionally empty config)
    signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
    client = oci.object_storage.ObjectStorageClient({}, signer=signer)
    client.put_object("frcnomajoc7v", bucket_name, bucket_path + "1_1/" + csv_file_path, bytes(df.to_csv(index=False), encoding='utf-8'))
    print(f"Uploaded {csv_file_path} to {bucket_path}1_1/")
def annex_1_2(fileName, bucket_path, file, bucket_name):
    fileData = fileName.split("_")
    csv_file_path = fileName[:-4] + ".csv"
    version_number = fileData[6]
    ref_exercise = fileData[2]
    ncb = fileData[4]
df = pd.read_excel(file, sheet_name="Entities to which data relates", skiprows=3)
df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')
df['file_name'] = os.path.basename(fileName)
df['ingestion_timestamp'] = datetime.now().isoformat()
df['version_number'] = version_number
df['ref_exercise'] = ref_exercise
df['ncb'] = ncb
    # Upload via instance-principals auth (the empty dict is an intentionally empty config)
    signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
    client = oci.object_storage.ObjectStorageClient({}, signer=signer)
    client.put_object("frcnomajoc7v", bucket_name, bucket_path + "1_2/" + csv_file_path, bytes(df.to_csv(index=False), encoding='utf-8'))
    print(f"Uploaded {csv_file_path} to {bucket_path}1_2/")
def annex_2(fileName, bucket_path, file, bucket_name):
    fileData = fileName.split("_")
    # Parameters
    version_number = fileData[6]
    ref_exercise = fileData[2]
    ncb = fileData[4]
    # Read the data-collection sheet, skipping the metadata rows (pass the
    # BytesIO directly, as the other parsers do, rather than raw bytes)
    df = pd.read_excel(file, sheet_name="Data collection template", skiprows=6)
# Clean empty rows/columns
df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')
# Add metadata columns
df['file_name'] = os.path.basename(fileName)
df['ingestion_timestamp'] = datetime.now().isoformat()
df['version_number'] = version_number
df['ref_exercise'] = ref_exercise
df['ncb'] = ncb
    # The original built "<name>csv" without the dot; fixed to ".csv"
    csv_name = fileName[:-4] + ".csv"
    # Save to CSV in the bucket (instance-principals auth; empty dict = empty config)
    signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
    client = oci.object_storage.ObjectStorageClient({}, signer=signer)
    client.put_object("frcnomajoc7v", bucket_name, bucket_path + "2/" + csv_name, bytes(df.to_csv(index=False), encoding='utf-8'))
    print(f"Uploaded {csv_name} to {bucket_path}2/")
def rqsd_preflow(secret_crt_id, secret_key_id, casper_api_url, collection_id):
    """Download the list of files available in the Casper file vault."""
    crt_path = os.getcwd() + "/rqsd_tst.crt"
    key_path = os.getcwd() + "/rqsd_tst.key.pem"
    try:
        with open(key_path, "w") as keyfile:
            keyfile.write(get_secret(secret_key_id))
        with open(crt_path, "w") as crtfile:
            crtfile.write(get_secret(secret_crt_id))
    except Exception:
        print("Failed to retrieve certificates from secrets")
        # Remove only what was actually written; a bare os.remove() here could
        # itself raise FileNotFoundError and mask the original error.
        for path in (crt_path, key_path):
            if os.path.exists(path):
                os.remove(path)
        raise
    protected_resource_url = casper_api_url + "/casper-api/filevault/"
    try:
        max_date = fileManager.execute_query("SELECT to_char(max(processing_end_time),'YYYY-MM-DD HH24:mi:ss') as MAX_PROCESSING_END_TIME FROM ct_ods.a_casper_filevault")
        # The original `max_date is not []` compared identity against a fresh
        # list and was therefore always True; test truthiness instead.
        if max_date and max_date[0]:
            filterString = 'isTest eq False and processingStatus eq "PS_COMPLETED" and processingEndTime gt ' + max_date[0].split(' ')[0]
        else:
            filterString = 'isTest eq False and processingStatus eq "PS_COMPLETED"'
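        # For illustration, with a last processed date of 2026-02-28 the filter
        # sent to Casper reads (grammar assumed from the strings above):
        #   isTest eq False and processingStatus eq "PS_COMPLETED" and processingEndTime gt 2026-02-28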
        response = requests.get(protected_resource_url + "files/" + collection_id, headers={"accept": "application/json"}, cert=(crt_path, key_path), verify=False, params={"filter": filterString})
        print(response.text)
        files = response.json()
    except Exception:
        print("Failed to retrieve ACC metadata, error during connection or request")
        raise
    return files
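# Each entry in `files` is expected to carry at least the keys consumed in
# rqsd_process below (a plausible sketch, not the full Casper schema):
#   {"dcId": 17, "fileID": 12345, "fileName": "RQSD_..._V1.xlsx"}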
def rqsd_process(files, casper_api_url, bucket_path, bucket_name):
    crt_path = os.getcwd() + "/rqsd_tst.crt"
    key_path = os.getcwd() + "/rqsd_tst.key.pem"
    # GET request to the protected download endpoint for each listed file
    for downloadable in files:
        try:
            print("\n\n")
            response = requests.get(casper_api_url + "/casper-api/filevault/download/" + str(downloadable["dcId"]) + '/' + str(downloadable["fileID"]), headers={"accept": "application/json"}, cert=(crt_path, key_path), verify=False)
            rqsd_parser(downloadable["fileName"], bucket_path, io.BytesIO(response.content), bucket_name)
        except Exception:
            print("Failed to download or upload a file into the target bucket")
            os.remove(crt_path)
            os.remove(key_path)
            raise
def add_a_key_column(headers, data_rows, task_history_key):
headers.insert(0, 'A_KEY')
for i, row in enumerate(data_rows, start=1):
a_key_value = int(task_history_key) * TASK_HISTORY_MULTIPLIER + i
row.insert(0, str(a_key_value))
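# Worked example of the key scheme (hypothetical values): with
# task_history_key = 7, rows receive A_KEY 7000000001, 7000000002, ...,
# so keys from different task runs cannot collide as long as a run stays
# under TASK_HISTORY_MULTIPLIER rows.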
def add_workflow_key_column(headers, data_rows, workflow_key):
    # The original inserted the header at index 1 but the row values at
    # index 0, misaligning every column; both now go to index 1 (after A_KEY).
    headers.insert(1, 'A_WORKFLOW_HISTORY_KEY')
    for row in data_rows:
        row.insert(1, workflow_key)
def initialize_config(config_file_path):
# Ensure the file exists
if not os.path.exists(config_file_path):
raise FileNotFoundError(f"Configuration file {config_file_path} not found.")
# Load the configuration
with open(config_file_path, "r") as f:
config_data = yaml.safe_load(f)
return config_data
def main(workflow_context, flow_config_path, env_config_path, env):
    # init setup
    flow_info = initialize_config(flow_config_path)
    envs_info = initialize_config(env_config_path)
    environment_info = envs_info[env]
    a_task_history_key = initialize_task(workflow_context, flow_info['TASK_NAME'])
    # get the list of files and push each one to object storage
    try:
        files = rqsd_preflow(environment_info["CERTIFICATE_FILE"], environment_info["CERTIFICATE_KEY"], environment_info["CASPER_URL"], flow_info["COLLECTION_ID"])
        rqsd_process(files, environment_info["CASPER_URL"], flow_info["ODS_PREFIX"], environment_info["BUCKET"])
    except Exception:
        print("Failed to retrieve RQSD data, error during connection or request")
        raise
    # Finalize task
    runManager.finalise_task(a_task_history_key, 'Y')


@@ -0,0 +1,27 @@
# Environment Configuration
dev:
BUCKET: "mrds_inbox_dev"
CERTIFICATE_KEY: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya7r33ocatalf6jn6kg2xjhnya6kazlqd3e5gw6yghpd5q"
CERTIFICATE_FILE: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyaeva4zvj6xdihljookamhse7jlyassfjb4p45xp46bwba"
CASPER_URL: "https://internet.api.casper.tst.aws.tadnet.net"
RQSD_COLLECTION_ID: "1537"
tst:
BUCKET: "mrds_inbox_tst"
CERTIFICATE_KEY: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya7r33ocatalf6jn6kg2xjhnya6kazlqd3e5gw6yghpd5q"
CERTIFICATE_FILE: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyaeva4zvj6xdihljookamhse7jlyassfjb4p45xp46bwba"
CASPER_URL: "https://internet.api.casper.tst.aws.tadnet.net"
RQSD_COLLECTION_ID: "1537"
acc:
BUCKET: "mrds_inbox_acc"
CERTIFICATE_KEY: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya5snmftggydoszwchjra3ifa4pyiilgc26uqlhejnhcca"
CERTIFICATE_FILE: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyaho5t4qgmlqctew6g6mcnwpz2p7z4nhxooyl6hc5sonfa"
CASPER_URL: "https://internet.api.casper.stg.aws.ecb.de"
RQSD_COLLECTION_ID: "1116"
prd:
BUCKET: "mrds_inbox_prd"
CERTIFICATE_KEY: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyahmv5sopfsv7nytxdyycehoyl5pd7sz5t2drn27qaneta"
CERTIFICATE_FILE: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyame3chyqs6cdl2igeyrvzpj3s4vrndhbgeayt533uhgqa"
CASPER_URL: "https://internet.api.casper.prd.aws.ecb.de"
RQSD_COLLECTION_ID: "1030"


@@ -0,0 +1,25 @@
# Environment Configuration
dev:
BUCKET_NAMESPACE: "frcnomajoc7v"
BUCKET: "mrds_inbox_dev"
DEVO_USERNAME: "ap-informatica-ipcwt"
DEVO_HOSTNAME: "impala-proxy-devo-lab21-impala01.dw-devo-lab21.om2y56.b0.cloudera.site"
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyavrevwxke46wjgj5nz3cc5kwwsybmngbji4zepones55q"
tst:
BUCKET_NAMESPACE: "frcnomajoc7v"
BUCKET: "mrds_inbox_tst"
DEVO_USERNAME: "ap-informatica-ipcwt"
DEVO_HOSTNAME: "t-impala.devo.escb.eu"
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyaxxx7yfifpgpdnxuj6dcowpoktwa6745kwwpezysd44oa"
acc:
BUCKET_NAMESPACE: "frcnomajoc7v"
BUCKET: "mrds_inbox_acc"
DEVO_USERNAME: "ap-informatica-ipcwa"
DEVO_HOSTNAME: "impala-proxy-devo-acc21-impala01.dw-devo-acc21.inym23.b0.cloudera.site"
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya4uttfadlzreloouw2e5bifgl2dvihffym5xoq3b3jmva"
prd:
BUCKET_NAMESPACE: "frcnomajoc7v"
BUCKET: "mrds_inbox_prd"
DEVO_USERNAME: "ap-informatica-ipcwp"
DEVO_HOSTNAME: "impala-proxy-devo-prd21-impala01.dw-devo-prd21.inym23.b0.cloudera.site"
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyanbahqlucid7qtzvoohsf4xrlul7cvhlsqttmbro4n66a"


@@ -0,0 +1,25 @@
# Environment Configuration
dev:
BUCKET_NAMESPACE: "frcnomajoc7v"
BUCKET: "mrds_inbox_dev"
DEVO_USERNAME: "ap-devo-rqsd-tst"
DEVO_HOSTNAME: "t-impala.devo.escb.eu"
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyap6wtzobzob7qizvk4nocszlcaxhwijgzejbvryt3uzbq"
tst:
BUCKET_NAMESPACE: "frcnomajoc7v"
BUCKET: "mrds_inbox_tst"
DEVO_USERNAME: "ap-devo-rqsd-tst"
DEVO_HOSTNAME: "t-impala.devo.escb.eu"
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyap6wtzobzob7qizvk4nocszlcaxhwijgzejbvryt3uzbq"
acc:
BUCKET_NAMESPACE: "frcnomajoc7v"
BUCKET: "mrds_inbox_acc"
DEVO_USERNAME: "ap-devo-rqsd-acc"
DEVO_HOSTNAME: "impala-proxy-devo-acc21-impala01.dw-devo-acc21.inym23.b0.cloudera.site"
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyamzhgatnso57mubvg3c6k4ens3orcx4dieo6efukuvm4a"
prd:
BUCKET_NAMESPACE: "frcnomajoc7v"
BUCKET: "mrds_inbox_prd"
DEVO_USERNAME: "ap-devo-rqsd-prd"
DEVO_HOSTNAME: "impala-proxy-devo-prd21-impala01.dw-devo-prd21.inym23.b0.cloudera.site"
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyawpahgevgxv6csqnwil3p37vi6pthl466onnkg6k7undq"


@@ -0,0 +1,259 @@
# devo_impala_exporter.py
import os
import io
import yaml
import datetime
import logging
from typing import Any, Dict, List, Optional, Tuple
import pandas as pd
from mrds.utils.secrets import get_secret
import mrds.utils.manage_runs as runManager
import mrds.utils.objectstore as objectstore
import oci
from impala.dbapi import (
connect,
ProgrammingError,
DatabaseError,
IntegrityError,
OperationalError,
)
from impala.error import HiveServer2Error
TASK_HISTORY_MULTIPLIER = 1_000_000_000
class DevoConnector:
    """
    Export the result of an Impala (DEVO) query to OCI Object Storage as CSV,
    while recording task run metadata via mrds.utils.manage_runs.
    Usage:
        exporter = DevoConnector(
            flow_config_path="/path/to/flow.yaml",
            env_config_path="/path/to/env.yaml",
            env="dev",
            logger=my_logger,             # optional
            oci_client=my_object_storage, # optional ObjectStorageClient
            oci_signer=my_signer,         # optional signer (used if client not provided)
        )
        exporter.run({"run_id": 34, "a_workflow_history_key": 6})
    """
def __init__(
self,
flow_config_path: str,
env_config_path: str,
env: str,
logger: Optional[logging.Logger] = None,
oci_client: Optional[oci.object_storage.ObjectStorageClient] = None,
oci_signer: Optional[Any] = None,
) -> None:
self.flow_info = self._initialize_config(flow_config_path)
envs_info = self._initialize_config(env_config_path)
BUCKET_NAMESPACE = os.getenv("BUCKET_NAMESPACE", "frcnomajoc7v")
if env not in envs_info:
raise KeyError(f"Environment '{env}' not found in {env_config_path}")
self.environment_info = envs_info[env]
self.environment_info["BUCKET_NAMESPACE"]=BUCKET_NAMESPACE
self.env = env
# logging
self.logger = logger or self._default_logger(self.flow_info.get("TASK_NAME", "devo_task"))
# OCI client/signer
self.oci_client = oci_client
self.oci_signer = oci_signer
# -------------------------
# Public API
# -------------------------
    def run(self, workflow_context: Dict[str, Any]) -> int:
        """Main entry point: executes the query, uploads the CSV, and finalizes
        the task. Returns the number of rows exported (0 when there is no data)."""
        task_name = self.flow_info["TASK_NAME"]
        a_task_history_key = self._initialize_task(workflow_context, task_name)
        try:
            # credentials
            devo_secret_name = self.environment_info["DEVO_SECRET"]
            password = get_secret(devo_secret_name)
            self.logger.info("Retrieved secret for DEVO connection.")
            # query
            query = self.flow_info["DEVO_QUERY"]
            user = self.environment_info["DEVO_USERNAME"]
            host = self.environment_info["DEVO_HOSTNAME"]
            columns, data, rowcount = self._execute_query(query=query, user=user, hostname=host, password=password)
            df = self._tuple_to_dataframe((columns, data))
            self.logger.info("Query executed and DataFrame created with %d rows.", len(df))
            # upload (skipped when the query returned no rows; the original
            # early `return 0` left the task unfinalised in that case)
            if rowcount > 0:
                csv_name = f"{self.flow_info['OUTPUT_TABLE']}.csv"
                file_path = self._compose_object_path(self.flow_info["ODS_PREFIX"], csv_name)
                self._upload_dataframe_to_oci(df, csv_name, file_path)
                self.logger.info("Finished uploading %s to %s.", csv_name, file_path)
            else:
                self.logger.info("Query returned no rows; nothing to upload.")
            # success
            runManager.finalise_task(a_task_history_key, "Y")
            self.logger.info("Task %s finalized successfully.", task_name)
            return rowcount
except Exception as e:
# failure
self.logger.exception("Run failed: %s", e)
try:
runManager.finalise_task(a_task_history_key, "N")
finally:
# re-raise for upstream handling if used as a library
raise
# -------------------------
# Impala / DEVO
# -------------------------
@staticmethod
def _get_impala_connection(hostname: str, user: str, secret: str):
return connect(
host=hostname,
port=443,
auth_mechanism="PLAIN",
user=user,
password=secret,
use_http_transport=True,
http_path="cliservice",
use_ssl=True,
)
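    # Minimal standalone sketch (hypothetical host and credentials; PLAIN
    # auth over HTTPS on port 443, as configured above):
    #   conn = DevoConnector._get_impala_connection("impala.example.net", "svc_user", "***")
    #   cur = conn.cursor()
    #   cur.execute("select 1")
    #   print(cur.fetchall())
    #   cur.close(); conn.close()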
    def _execute_query(self, query: str, user: str, hostname: str, password: str) -> Tuple[List[str], List[List[Any]], int]:
conn = self._get_impala_connection(hostname, user, password)
cursor = None
self.logger.info("Executing Impala query against host '%s' as user '%s'.", hostname, user)
try:
cursor = conn.cursor()
cursor.execute(query)
            # Treat SELECT (and WITH ... SELECT) as result-set queries; the
            # original "or 'select' in query" clause also matched statements
            # such as INSERT ... SELECT, which carry no result set to fetch.
            if query.strip().lower().startswith(("select", "with")):
                rows = cursor.fetchall()
                columns = [col[0] for col in cursor.description]
                return columns, rows, cursor.rowcount
            else:
                # Non-SELECT: no result set; return the rowcount plus an empty
                # columns list so callers always get a consistent 3-tuple.
                return [], [[cursor.rowcount]], cursor.rowcount
except OperationalError as oe:
raise RuntimeError("Failed to connect to Impala: " + str(oe)) from oe
except ProgrammingError as pe:
raise ValueError("Query syntax error: " + str(pe)) from pe
except IntegrityError as ie:
raise PermissionError("Insufficient permissions: " + str(ie)) from ie
except DatabaseError as db_err:
raise RuntimeError("Database error: " + str(db_err)) from db_err
        except HiveServer2Error as au_err:
            raise PermissionError("HiveServer2 error: " + str(au_err)) from au_err
except Exception as e:
raise RuntimeError("An unexpected error occurred: " + str(e)) from e
finally:
try:
if cursor:
cursor.close()
finally:
try:
conn.close()
except Exception:
# log but don't mask the original exception
self.logger.warning("Failed to close Impala connection cleanly.", exc_info=True)
# -------------------------
# OCI Upload
# -------------------------
def _upload_dataframe_to_oci(self, df: pd.DataFrame, csv_name: str, object_path: str) -> None:
namespace = self.environment_info["BUCKET_NAMESPACE"]
bucket = self.environment_info["BUCKET"]
# convert DataFrame to CSV bytes without index
csv_bytes = df.to_csv(index=False).encode("utf-8")
        client = objectstore.get_client()
client.put_object(namespace, bucket, object_path, csv_bytes)
self.logger.info("CSV '%s' uploaded to bucket '%s' (ns: '%s', key: '%s').", csv_name, bucket, namespace, object_path)
# -------------------------
# Utilities
# -------------------------
@staticmethod
def _tuple_to_dataframe(data_tuple: Tuple[List[str], List[List[Any]]]) -> pd.DataFrame:
columns, data = data_tuple
if not columns:
# for non-SELECT queries we returned rowcount; represent it in a DataFrame
return pd.DataFrame(data, columns=["rowcount"])
return pd.DataFrame(data, columns=columns)
@staticmethod
def _initialize_config(config_file_path: str) -> Dict[str, Any]:
if not os.path.exists(config_file_path):
raise FileNotFoundError(f"Configuration file {config_file_path} not found.")
with open(config_file_path, "r") as f:
return yaml.safe_load(f)
@staticmethod
def _initialize_task(workflow_context: Dict[str, Any], task_name: str) -> int:
return runManager.init_task(
task_name,
workflow_context["run_id"],
workflow_context["a_workflow_history_key"],
)
@staticmethod
def add_a_key_column(headers: List[str], data_rows: List[List[Any]], task_history_key: int) -> None:
"""Optionally add an A_KEY column (kept for parity with original script)."""
headers.insert(0, "A_KEY")
for i, row in enumerate(data_rows, start=1):
a_key_value = int(task_history_key) * TASK_HISTORY_MULTIPLIER + i
row.insert(0, str(a_key_value))
@staticmethod
def add_workflow_key_column(headers: List[str], data_rows: List[List[Any]], workflow_key: int) -> None:
"""Optionally add the workflow key column right after A_KEY if present, otherwise at position 0."""
insert_idx = 1 if headers and headers[0] == "A_KEY" else 0
headers.insert(insert_idx, "A_WORKFLOW_HISTORY_KEY")
for row in data_rows:
row.insert(insert_idx, workflow_key)
@staticmethod
def _compose_object_path(prefix: str, filename: str) -> str:
if prefix.endswith("/"):
return f"{prefix}{filename}"
return f"{prefix}/{filename}"
@staticmethod
def _default_logger(task_name: str) -> logging.Logger:
logger = logging.getLogger(f"{task_name}_logger")
if not logger.handlers:
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
fmt = logging.Formatter(f"%(asctime)s [{task_name}] %(levelname)s: %(message)s")
handler.setFormatter(fmt)
logger.addHandler(handler)
return logger
# Optional: quick-run convenience if you ever want to execute this module directly.
if __name__ == "__main__":
    # Example only; adjust paths/env/context as needed, or remove this block.
exporter = DevoConnector(
flow_config_path="/home/dbt/Marco/mrds_elt/airflow/ods/rqsd/rqsd_process/config/yaml/m_ODS_RQSD_OBSERVATIONS.yaml",
env_config_path="/home/dbt/Marco/mrds_elt/python/connectors/devo/config/env_config_rqsd.yaml",
env="dev",
)
exporter.run({"run_id": 34, "a_workflow_history_key": 6})


@@ -0,0 +1,294 @@
import argparse
from TMSQuery import XMLQuery
import mrds.utils.objectstore
import tempfile
import re
import csv
from io import StringIO
import os
import psutil
import sys
namespace = os.getenv("BUCKET_NAMESPACE", "frcnomajoc7v")
def memory_usage():
    # Return this process's resident set size in GiB (not a percentage,
    # as the original comment suggested)
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / (1024 * 1024 * 1024)
def protect_keyword(s):
    """Lower-case a TMS column name and suffix Oracle reserved words with '_'."""
    s = s.lower().replace(' ', '_')
    match s:
        case 'comment':
            return 'comment_'
        case 'date':
            return 'date_'
        case 'number':
            return 'number_'
        case _:
            return s
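# For example: protect_keyword('Comment') -> 'comment_', and
# protect_keyword('Trade Date') -> 'trade_date'.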
cModelsDir = sys.path[0] + '/../dbt/mrds/models/ods/'
cDatasetMultiplier = 10000000
parser = argparse.ArgumentParser()
parser.add_argument("command", choices=['create-model', 'create-oracle-table', 'retrieve'], help="create-model retrieve")
parser.add_argument("-n", "--name", help="Name")
parser.add_argument("-u", "--url", required=True, help="URL of TMS service")
parser.add_argument("-U", "--user", required=True, help="TMS user")
parser.add_argument("-P", "--password", required=True, help="TMS password")
parser.add_argument("-x", "--xmlfile", help="XML file")
parser.add_argument("-l", "--layoutfile", help="layout file")
parser.add_argument("-f", "--format", help="output format")
parser.add_argument("-p", "--parameter", action="append", help="Parameter")
parser.add_argument("-c", "--column", action="append", help="Additional column")
parser.add_argument("-d", "--destination", help="destination")
parser.add_argument("-s", "--dataset", help="data set ID", type=int)
parser.add_argument("-v", "--version", help="data model version", type=int, default=1)
args = parser.parse_args()
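# Typical invocation (all values hypothetical):
#   python TMSDBT.py retrieve -n my_report -u https://tms.example:9443/report/ \
#       -U svc_user -P '***' -x query.xml -l layout.fkr -f scsv -s 42 \
#       -d mybucket:INBOX/TMS/MY_REPORT/out.csv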
query = XMLQuery()
if args.xmlfile:
with open(args.xmlfile) as f:
xml = f.read()
query.xml = xml
if args.layoutfile:
with open(args.layoutfile) as f:
layout = f.read()
query.layout = layout
if args.format:
query.format = args.format
if args.parameter:
for p in args.parameter:
[name, value] = p.split('=', 1)
query.parameter[name] = value
additional_columns = []
if args.column:
for p in args.column:
[name, value] = p.split('=', 1)
        t = re.split(r'(?:\|)|(?:/)|(?::)', name, maxsplit=2)
name = t[0]
type = None
if len(t) == 2:
type = t[1]
if not type:
type = 'varchar2(255)'
additional_columns.append((name, type, value))
query.normalize_output()
if args.command == 'create-oracle-table':
    d = query.describe(args.url, args.user, args.password)
    columns = ["a_key number(38, 0)", "a_workflow_history_key number(38, 0)"]
    for c in additional_columns:
        columns.append("%s %s" % (c[0], c[1]))
for col in d:
name = protect_keyword(col[0])
match col[1]:
case 'text':
columns.append(name + " varchar2(512 char)")
case 'int':
columns.append(name + " number(38,0)")
case 'money':
columns.append(name + " number(19,4)")
case 'floating':
columns.append(name + " binary_double")
case 'datetime':
columns.append(name + " date")
case 'integer':
columns.append(name + " number(12, 0)")
sql = "create table ct_et_templates." + args.name + " (\n"
sql = sql + ",\n ".join(columns)
sql = sql + "\n)\n"
if not args.destination or args.destination == '-':
print(sql)
else:
with open(args.destination, 'w') as f:
f.write(sql)
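# For a hypothetical layout with one text and one datetime column, the
# generated DDL looks like:
#   create table ct_et_templates.my_report (
#     a_key number(38, 0),
#     a_workflow_history_key number(38, 0),
#     trade_date date,
#     comment_ varchar2(512 char)
#   )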
elif args.command == 'create-ods-model':
    d = query.describe(args.url, args.user, args.password)
    file_name = cModelsDir + args.name + '.yml'
    f = open(file_name, 'w')  # overwrite the model YAML (mode 'w', not append)
f.write('version: %d\n' % args.version)
f.write('models:' + '\n')
f.write(' - name: ' + args.name + '_dbt\n')
f.write(' description: "A starter dbt model"' + '\n')
f.write(' columns:' + '\n')
for col in d:
f.write(' - name: ' + col[0] + '\n')
f.write(' data_type: ' + col[1] + '\n')
f.close()
    file_name = cModelsDir + args.name + '.sql'
    f = open(file_name, 'w')  # overwrite the model SQL (mode 'w', not append)
if args.destination and args.destination != '-':
if ':' in args.destination:
dest = args.destination.split(':', 2)
path = dest[1]
else:
path = args.destination
prefix = os.path.dirname(path)
else:
prefix = 'INBOX/TMS/' + args.name.upper() + '/'
pars = "ptablename => '%s', ptemplatetablename => 'ou_tms.%s', pprefix => '%s'" % (args.name, args.name, prefix)
print(f"creating table {args.name}")
f.write('{{\n config(\n post_hook = "call ct_mrds.file_manager.create_external_table(%s)"\n )\n}}\n\n' % pars)
f.write("{{ config(materialized='table') }}" + "\n")
f.write('with source_data as (' + "\n")
columns = []
columns.append("cast (1 as number(38,0)) as a_key")
columns.append("cast (1 as number(38,0)) as a_workflow_history_key")
for col in d:
name = protect_keyword(col[0])
match col[1]:
case 'text':
columns.append("cast ('x' as varchar2(255 char)) as " + name)
case 'int':
columns.append("cast (1 as number(38, 0)) as " + name)
case 'money':
columns.append("cast (1.0 as number(19,4)) as " + name)
case 'floating':
columns.append("cast (1.0 as binary_double) as " + name)
case 'datetime':
columns.append("cast (sysdate as date) as " + name)
case 'integer':
columns.append("cast (1 as number(12, 0)) as " + name)
f.write(' select\n ' + ',\n '.join(columns) + '\n')
f.write(')\nselect * from source_data\n ')
f.close()
elif args.command == 'retrieve':
    ret = query.execute(args.url, args.user, args.password)
if query.format in ('scsv', 'standard_csv') and args.dataset:
        # Save the result to a temporary spooled file for further processing;
        # we avoid holding everything in memory so that large extracts cannot
        # exhaust the worker's memory.
        f = tempfile.SpooledTemporaryFile(mode='w+', max_size=200*1024*1024)
f.write(ret)
del ret
f.seek(0)
        # Replace embedded newlines with '<br/>'
reader = csv.reader(f)
sio = StringIO()
writer = csv.writer(sio)
for l in reader:
l_tmp = [s.replace('\n', '<br/>') for s in l]
writer.writerow(l_tmp)
f.close()
# Necessary to read the data into an array of lines for further processing
sio.seek(0)
lines_tmp = sio.readlines()
del sio
if not lines_tmp:
ret = ""
else:
# Adding artificial columns A_KEY and A_WORKFLOW_HISTORY_KEY and added columns
additional_headers = [t[0] for t in additional_columns]
additional_values = [t[2] for t in additional_columns]
headers = ['A_KEY','A_WORKFLOW_HISTORY_KEY'] + additional_headers + [protect_keyword(h) for h in lines_tmp[0].split(',')]
lines = [','.join(headers) ]
i = 0
for l in lines_tmp[1:]:
lines.append(str(args.dataset*cDatasetMultiplier + i) + ',' + str(args.dataset) + ',' + ','.join(additional_values + [l]) )
i += 1
del lines_tmp
# Spooling again to temporary file to avoid duplication memory needs
f = tempfile.SpooledTemporaryFile(mode = 'w+', max_size = 200*1024*1024)
f.writelines(lines)
del lines
f.seek(0)
ret = f.read()
f.close()
if not args.destination or args.destination == '-':
print(ret, end='')
elif ':' not in args.destination:
with open(args.destination, 'w') as f:
f.write(ret)
else:
f = tempfile.NamedTemporaryFile(delete = False, mode = 'w', prefix = 'TMSDBT-', suffix = '.csv')
f.write(ret)
f.close()
dest = args.destination.split(':', 2)
bucket = dest[0]
dirname = os.path.dirname(dest[1])
filename = os.path.basename(dest[1])
client = mrds.utils.objectstore.get_client()
with open(f.name, "r") as file:
print(file.read())
        mrds.utils.objectstore.upload_file(client, f.name, namespace, bucket, dirname, filename)
os.remove(f.name)
if ret:
sys.exit(0)
else:
sys.exit(1)


@@ -0,0 +1,197 @@
import xml.etree.ElementTree as ET
import re
import base64
import sys
class XMLQuery:
def __init__(self, xml = None):
self._format = 'xml'
self._layout = ''
self._parameter = {}
if xml:
self._parse_xml(xml)
def _parse_xml(self, xml):
self._tree = ET.fromstring(xml)
layout_b64 = self._tree.find('layout').text
self._layout = base64.b64decode(layout_b64).decode('utf-8')
self._format = self._tree.find('format').get('type')
self._parameter = {}
for p in self._tree.findall('parameters/parameter'):
self._parameter[p.get('name')] = p.text
def execute(self, url, user, password):
        # Equivalent: curl -X POST --basic -u <user>:<password> --data @tms_activity_interval.xml https://tmsxd104.ecbt1.tadnet.net:9443/report/
import requests
from requests.auth import HTTPBasicAuth
data = str(self)
basic = HTTPBasicAuth(user, password)
response = requests.post(url, data=data, auth=basic, verify=False)
if response.status_code == 200:
response.encoding = "utf-8"
return response.text
else:
return None
    def describe(self, url, user, password):
        orig_format = self.format
        self.format = 'xml'
        ret = self.execute(url, user, password)
        m = re.match(r'^.*?<PlainRow>.*?</PlainRow>', ret, re.DOTALL)
        s = m[0] + '\n</report-generator>'
        tree = ET.fromstring(s)
        self.format = orig_format  # restore the caller's format
        ret = []
        row = tree.find('PlainRow')
        for c in row.findall('Column'):
            name = c.text
            type = c.get('type')
            if type == 'unknown':
                type = 'integer'
            ret.append((name, type))
        return ret
    @staticmethod
    def describe_simple(url, user, password, xml):
        query = XMLQuery(xml)
        query.format = 'xml'
        ret = query.execute(url=url, user=user, password=password)
        tree = ET.fromstring(ret)
        ret = []
        row = tree.find('PlainRow')
        for c in row.findall('Column'):
            name = c.text
            type = c.get('type')
            if type == 'unknown':
                type = 'integer'
            ret.append((name, type))
        return ret
def normalize_output(self, date_format = 'dd/MM/yyyy', time_format = 'HH:mm:ss'):
lines = self.layout.splitlines()
lines = [re.sub(r'^date_format\s*=.*', 'date_format=' + date_format, l) for l in lines]
lines = [re.sub(r'^time_format\s*=.*', 'time_format=' + time_format, l) for l in lines]
lines = [re.sub(r'^NoNumberFormatting\s*=.*', 'NoNumberFormatting=1', l) for l in lines]
self.layout = '\n'.join(lines)
def __setattr__(self, name, value):
if name == 'format' and value not in ('bin','xml','xml3','html','txt','csv','standard_csv', 'scsv', 'pdf'):
raise Exception("Invalid report format '" + value + "'")
if not name.startswith('_'):
name = '_' + name
if name == '_layout' and not value.endswith('\n'):
value = value + '\n'
if name == '_xml':
self._parse_xml(value)
return
        # Plain dict assignment cannot raise KeyError, so no try/except is needed.
        self.__dict__[name] = value
def __getattr__(self, name):
if not name.startswith('_'):
name = '_' + name
try:
return self.__dict__[name]
except KeyError:
raise AttributeError(name)
def __str__(self):
parameters = ''
for k in self._parameter:
parameters = parameters + "\n<parameter name='%s'>%s</parameter>" % (k, self._parameter[k])
layout_b64 = base64.b64encode(self.layout.encode('utf-8')).decode('utf-8')
return ('<?xml version="1.0" encoding="utf-8"?>\n' + \
'<report-generator>\n' + \
' <format type="%s"/>\n' + \
' <layout>\n%s</layout>\n' + \
' <parameters>%s\n</parameters>' + \
'</report-generator>') % (self._format, layout_b64, parameters)
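    # For reference, str(query) yields an envelope of this shape (layout is
    # base64-encoded on the wire; shown decoded here for readability):
    #   <?xml version="1.0" encoding="utf-8"?>
    #   <report-generator>
    #     <format type="scsv"/>
    #     <layout>...base64 layout...</layout>
    #     <parameters>
    #       <parameter name='from'>01/01/2026</parameter>
    #     </parameters>
    #   </report-generator>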
if __name__ == "__main__":
file = sys.argv[1]
print(file)
with open(file) as f:
xml = f.read()
query = XMLQuery(xml)
print(query.layout)
query.normalize_output()
print(query.layout)
    # query.format = 'xml'
    # ret = query.execute(url='https://tmsxd104.ecbt1.tadnet.net:9443/report/', user='<user>', password='<password>')
    # print(ret)
    desc = XMLQuery.describe_simple(url='https://tmsxd104.ecbt1.tadnet.net:9443/report/', user='<user>', password='<password>', xml=xml)
    print(str(desc))


@@ -0,0 +1,355 @@
"""
DAG: w_ODS_TMS_TRANSACTION (expanded example)
Purpose:
- Load layout+parameter metadata from TMS-layouts/w_ODS_TMS_TRANSACTION.yml
- Call connectors/tms/TMSDBT.py to retrieve data into CSV in object storage
- On first run, generate Oracle DDL and create an external table
- Process file and record status in MRDS workflow tables
Notes:
- This is an expanded, readable version of the factory-generated DAG.
- Replace paths/usernames/password references as appropriate.
"""
import copy
import itertools
import json
import logging
import os
import re
import subprocess
import sys
import traceback
from datetime import datetime, timedelta
from pathlib import Path
import yaml
from airflow import DAG
from airflow.operators.python import PythonOperator
from pytz import timezone
# --- Project-specific deps (must exist in your Airflow image) ---
from mrds.core import main # noqa: F401 # imported to mirror the factory env
import mrds.utils.manage_files as mf
import mrds.utils.manage_runs as mr
# ---------- Paths & constants ----------
gScriptDir = Path(globals().get("__file__", "./_")).absolute().parent
gDataDir = str(gScriptDir / "TMS-layouts") + "/"
gConfigDir = str(gScriptDir / "config")
gConnDir = "/opt/airflow/python/connectors/tms/"
gTableDir = str(gScriptDir / "TMS-tables") + "/"
DAG_NAME = "w_ODS_TMS_TRANSACTION"
ODS_TABLE = DAG_NAME
DATABASE_NAME = "ODS"
WF_NAME = DAG_NAME
default_args = {
"owner": "ecb",
"depends_on_past": False,
"email_on_failure": False,
"email_on_retry": False,
"retries": 0,
"execution_timeout": timedelta(minutes=60),
"retry_delay": timedelta(minutes=5),
}
# ---------- Load YAML configs once on parse ----------
with open(gDataDir + DAG_NAME + ".yml", "r") as f:
report_desc = yaml.safe_load(f) or {}
with open(gConfigDir + "/TMS.yml", "r") as f:
tms_config = yaml.safe_load(f)
# TMS + storage config
tms_url = tms_config["TMS-URL"]
tms_user = tms_config["TMS-user"]
tms_pwd = tms_config["TMS-password"]
prefix = tms_config["dest-prefix"] + DAG_NAME + "/" + DAG_NAME + "/"
data_prefix = tms_config["data-prefix"] + DAG_NAME + "/"
dest = tms_config["dest-bucket"] + ":" + prefix
# Visible vs hidden params (from layout YAML)
params_visible = {}
params_hidden = {}
params_dict = report_desc.get("parameters") or {}
for p, meta in params_dict.items():
val = meta.get("value", None)
if not meta.get("hidden", False):
params_visible[p] = val
else:
params_hidden[p] = val
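# For illustration, a layout-YAML "parameters" section of this shape
# (names hypothetical) splits as follows:
#   parameters:
#     valuta_from: {value: "01/01/2026"}        # -> params_visible
#     scope: {value: "ALL", hidden: true}       # -> params_hidden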
# ---------- Helpers (parameter handling) ----------
def _enum_param_combinations_recursive(params, keys):
"""
Build all combinations of params (cartesian product), supporting
'column(<name>)' derived lists aligned by index.
"""
k = None
result = []
keys = list(keys) # safe copy
while keys:
k = keys.pop(0)
v = params[k]
if v or v == "":
break
if not k:
return []
v = v if isinstance(v, list) else [v]
# derived columns aligned with v (same length)
derived_columns = []
# params_dict[k] holds the definition, not just the value
pdef = params_dict.get(k, {})
for c in list(pdef):
if re.match(r"column\(.*\)$", c):
vtmp = pdef[c]
vtmp = vtmp if isinstance(vtmp, list) else [vtmp]
derived_columns.append((c, vtmp))
if not keys:
for i, value in enumerate(v):
row = [(k, value)]
for col_key, aligned_values in derived_columns:
row.append((col_key, aligned_values[i]))
result.append(row)
return result
combinations = _enum_param_combinations_recursive(params, keys)
for row in combinations:
for i, vtmp in enumerate(v):
new_row = copy.deepcopy(row)
new_row.append((k, vtmp))
for col_key, aligned_values in derived_columns:
new_row.append((col_key, aligned_values[i]))
result.append(new_row)
return result
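# Worked example (hypothetical): params = {"ccy": ["EUR", "USD"], "scope": "ALL"}
# yields [[("scope", "ALL"), ("ccy", "EUR")], [("scope", "ALL"), ("ccy", "USD")]],
# i.e. the cartesian product with list-valued parameters fanned out.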
def _enum_param_combinations(params, sequential=False):
# Sequential path omitted (buggy in factory; not used there either)
return _enum_param_combinations_recursive(params, list(params))
def _allowed_select(table, expression, condition="1 = 1"):
"""
Guarded select used by eval_params(select(...)).
Whitelist tables to avoid arbitrary reads.
"""
if table.upper() not in (
ODS_TABLE.upper(),
"DUAL",
"CT_MRDS.A_WORKFLOW_HISTORY",
):
raise Exception(f"Not allowed to select from {table}")
res = mr.select_ods_tab(table, expression, condition)
return res[0]
def _eval_param(v):
"""
Evaluate special functional values:
- select(...) => guarded DB helper above
- eval(...) => strongly discouraged; keep disabled or restricted
"""
s = str(v) if v is not None else ""
if re.match(r"\s*select\(.*\)", s):
# Expose only 'select' symbol to eval
return eval(s, {"select": _allowed_select}, {})
if re.match(r"\s*eval\(.*\)\s*$", s):
# If you really must support eval, strictly sandbox or remove this path.
raise ValueError("eval(...) not allowed in this hardened DAG.")
return v
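# Example: a YAML parameter value such as
#   select('CT_MRDS.A_WORKFLOW_HISTORY', 'max(a_workflow_history_key)')
# is evaluated with only the `select` symbol in scope and lands in
# _allowed_select; eval(...) values are rejected outright.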
def _finalize_param_list(param_list):
"""
Apply replacements and drop virtual params according to YAML definitions.
"""
d = dict(param_list)
# Replace parameter tokens inside another parameter (string replace)
for p, meta in params_dict.items():
if meta.get("replace_parameter"):
target = meta["replace_parameter"]
if target in d and p in d and isinstance(d[target], str):
d[target] = d[target].replace(p, str(d[p]))
# Drop 'virtual' params
cleaned = []
for k, v in d.items():
meta = params_dict.get(k, {})
if not meta.get("virtual", False):
cleaned.append((k, v))
return cleaned
# ---------- Core work ----------
def execute_report(**context):
"""
For each parameter combination:
- create workflow key
- call TMSDBT.py retrieve to land CSV
- if first time, create Oracle table from generated DDL
- process file, finalize workflow Y/N
"""
logger = logging.getLogger("airflow.task")
logger.setLevel(logging.DEBUG)
run_id = context["dag_run"].run_id
all_params = {**params_visible, **params_hidden}
# 1) Compute combinations
combos = _enum_param_combinations(all_params)
# 2) Evaluate select(...) etc and finalize
evaluated = []
for combo in combos or [[]]:
# first pass: special evaluations
pair_list = []
for k, v in combo:
pair_list.append((k, _eval_param(v)))
# second pass: replacements + pruning
evaluated.append(_finalize_param_list(pair_list))
# if no combos at all, ensure we run once
if not evaluated:
evaluated = [[]]
# Timing + workflow
ts = "{:%Y%m%d_%H%M%S}".format(datetime.now(timezone("Europe/Berlin")))
for idx, param_list in enumerate(evaluated, start=1):
wf_key = mr.init_workflow(DATABASE_NAME, WF_NAME, run_id)
file_name = f"{WF_NAME}.{wf_key}.{ts}.csv"
try:
# Build connector command safely (no shell quoting games)
cmd = [
sys.executable, # 'python'
os.path.join(gConnDir, "TMSDBT.py"),
"retrieve",
"--name", WF_NAME,
"--url", tms_url,
"-U", tms_user,
"--password", tms_pwd,
"--layoutfile", gDataDir + DAG_NAME + ".fkr",
"-f", "scsv",
"--dataset", str(wf_key),
"-d", dest + file_name,
]
# Map params to -p or -c switches
for k, v in param_list:
sval = "" if v is None else str(v).rstrip()
m = re.match(r"column\((.*)\)$", k)
if m:
cmd.extend(["-c", f'{m.group(1)}={sval}'])
else:
cmd.extend(["-p", f"{k}={sval}"])
mr.set_workflow_property(wf_key, DATABASE_NAME, k, sval)
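            # A fully assembled call might look like (values hypothetical):
            #   python TMSDBT.py retrieve --name w_ODS_TMS_TRANSACTION \
            #     --url https://tms.example:9443/report/ -U svc_user --password '***' \
            #     --layoutfile .../w_ODS_TMS_TRANSACTION.fkr -f scsv --dataset 1234 \
            #     -d mrds_inbox_dev:INBOX/TMS/.../file.csv -p "from=01/01/2026" -c "ccy=EUR"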
logger.debug("Running connector: %s", json.dumps(cmd))
res = subprocess.run(cmd, capture_output=True, check=False)
logger.debug("stdout: %s", res.stdout.decode(errors="ignore"))
logger.debug("stderr: %s", res.stderr.decode(errors="ignore"))
if res.returncode is None:
raise RuntimeError("Connector returned no status")
if res.returncode == 1:
logger.info("No data returned for wf_key=%s (continuing)", wf_key)
mr.finalise_workflow(wf_key, "Y")
continue
if res.returncode != 0:
raise RuntimeError(f"Connector failed (rc={res.returncode})")
# Data landed -> ensure source config exists, bootstrap table if needed
cfg = mf.execute_query(
"select * from CT_MRDS.A_SOURCE_FILE_CONFIG "
f"where a_source_key = 'TMS' and table_id = '{ODS_TABLE}'"
)
if not cfg:
# Generate DDL file
ddl_cmd = [
sys.executable,
os.path.join(gConnDir, "TMSDBT.py"),
"create-oracle-table",
"--name", WF_NAME,
"--url", tms_url,
"-U", tms_user,
"--password", tms_pwd,
"--layoutfile", gDataDir + DAG_NAME + ".fkr",
"-d", gTableDir + WF_NAME + ".sql",
]
for k, v in param_list:
sval = "" if v is None else str(v).rstrip()
m = re.match(r"column\((.*)\)$", k)
if m:
ddl_cmd.extend(["-c", f'{m.group(1)}={sval}'])
else:
ddl_cmd.extend(["-p", f"{k}={sval}"])
logger.debug("Generating DDL: %s", json.dumps(ddl_cmd))
ddl_res = subprocess.run(ddl_cmd, capture_output=True, check=True)
logger.debug("DDL stdout: %s", ddl_res.stdout.decode(errors="ignore"))
logger.debug("DDL stderr: %s", ddl_res.stderr.decode(errors="ignore"))
# Execute DDL and create external table
sql = Path(gTableDir + WF_NAME + ".sql").read_text()
mf.execute(sql)
mf.add_column_date_format(
f"CT_ET_TEMPLATES.{ODS_TABLE}", "DEFAULT", "DD/MM/YYYY HH24:MI:SS"
)
mf.create_external_table(ODS_TABLE, f"CT_ET_TEMPLATES.{ODS_TABLE}", data_prefix)
mf.add_source_file_config(
"TMS",
"INPUT",
DAG_NAME,
DAG_NAME,
r".*\.csv",
ODS_TABLE,
f"CT_ET_TEMPLATES.{ODS_TABLE}",
)
# Process landed file (register, move, etc. as per your mf impl)
mf.process_source_file(prefix, file_name)
mr.finalise_workflow(wf_key, "Y")
except BaseException as ex:
# rich error logging, then mark workflow failed and re-raise
ex_type, ex_value, ex_tb = sys.exc_info()
tb = traceback.extract_tb(ex_tb)
stack = [
f"File: {t[0]}, Line: {t[1]}, Func: {t[2]}, Code: {t[3]}"
for t in tb
]
logging.error("Exception type: %s", ex_type.__name__)
logging.error("Exception message: %s", ex_value)
logging.error("Stack trace: %s", stack)
mr.finalise_workflow(wf_key, "N")
raise
# ---------- DAG definition ----------
with DAG(
dag_id=DAG_NAME,
default_args=default_args,
description=DAG_NAME,
schedule_interval=None, # manual trigger
params=params_visible, # visible-only; hidden merged inside task
start_date=datetime(2025, 1, 1),
catchup=False,
tags=[DAG_NAME],
) as dag:
retrieve_report = PythonOperator(
task_id="retrieve_report",
python_callable=execute_report,
execution_timeout=timedelta(minutes=30),
)