This commit is contained in:
Grzegorz Michalski
2026-03-02 09:47:35 +01:00
commit 2c225d68ac
715 changed files with 130067 additions and 0 deletions

View File

@@ -0,0 +1,294 @@
import argparse
from TMSQuery import XMLQuery
import mrds.utils.objectstore
import tempfile
import re
import csv
from io import StringIO
import os.path
import os, psutil
import sys
namespace = os.getenv("BUCKET_NAMESPACE", "frcnomajoc7v")
def memory_usage():
    """Return the resident set size (RSS) of the current process, in GiB.

    NOTE(review): the original comment said "percentage like top", but the
    value returned is RSS in GiB, not a percentage.
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / (1024 ** 3)
def protect_keyword(s):
    """Normalize a TMS column name into a safe Oracle column name.

    Lower-cases the name, replaces spaces with underscores, and appends a
    trailing underscore to names that collide with Oracle reserved words
    (COMMENT, DATE, NUMBER) so they can be used unquoted in DDL.

    :param s: raw column name from the TMS report description
    :return: sanitized column name
    """
    s = s.lower().replace(' ', '_')
    # Oracle reserved words that cannot be used as bare column identifiers.
    _RESERVED = {
        'comment': 'comment_',
        'date': 'date_',
        'number': 'number_',
    }
    return _RESERVED.get(s, s)
# Directory where generated dbt ODS model files (.yml/.sql) are written,
# resolved relative to the location of this script.
cModelsDir = sys.path[0] + '/../dbt/mrds/models/ods/'
# Multiplier used to derive globally-unique synthetic A_KEY values per dataset:
# a_key = dataset_id * cDatasetMultiplier + row_index.
cDatasetMultiplier = 10000000

# Command-line interface for the TMS connector.
# NOTE(review): choices include 'create-model', but the handler below tests for
# 'create-ods-model' -- confirm which spelling is intended.
parser = argparse.ArgumentParser()
parser.add_argument("command", choices=['create-model', 'create-oracle-table', 'retrieve'], help="create-model retrieve")
parser.add_argument("-n", "--name", help="Name")
parser.add_argument("-u", "--url", required=True, help="URL of TMS service")
parser.add_argument("-U", "--user", required=True, help="TMS user")
parser.add_argument("-P", "--password", required=True, help="TMS password")
parser.add_argument("-x", "--xmlfile", help="XML file")
parser.add_argument("-l", "--layoutfile", help="layout file")
parser.add_argument("-f", "--format", help="output format")
# -p/-c may be repeated; values are NAME=VALUE pairs.
parser.add_argument("-p", "--parameter", action="append", help="Parameter")
parser.add_argument("-c", "--column", action="append", help="Additional column")
# destination: '-' or empty = stdout; 'path' = local file; 'bucket:path' = object store.
parser.add_argument("-d", "--destination", help="destination")
parser.add_argument("-s", "--dataset", help="data set ID", type=int)
parser.add_argument("-v", "--version", help="data model version", type=int, default=1)
args = parser.parse_args()
# Build the XMLQuery from command-line inputs.
query = XMLQuery()
if args.xmlfile:
    with open(args.xmlfile) as f:
        query.xml = f.read()
if args.layoutfile:
    with open(args.layoutfile) as f:
        query.layout = f.read()
if args.format:
    query.format = args.format
if args.parameter:
    for p in args.parameter:
        name, value = p.split('=', 1)
        query.parameter[name] = value

# Additional columns are given as "name=value" where name may carry an Oracle
# type after '|', '/' or ':' (e.g. "region|varchar2(64):EU").
additional_columns = []
if args.column:
    for p in args.column:
        name, value = p.split('=', 1)
        # maxsplit=2 allows up to three parts; only exactly two parts yield a
        # type -- "a|b|c" silently falls back to the default. TODO confirm
        # whether maxsplit=1 was intended.
        parts = re.split(r'[|/:]', name, maxsplit=2)
        name = parts[0]
        col_type = parts[1] if len(parts) == 2 and parts[1] else 'varchar2(255)'
        additional_columns.append((name, col_type, value))

query.normalize_output()

# NOTE(review): debug leftover -- dumps the outgoing request to /tmp/kurt.xml.
# (Unused 'import pprint' removed.)
from pathlib import Path
debug_dump = Path('/tmp/kurt.xml')
debug_dump.write_text(str(query))
if args.command == 'create-oracle-table':
    # Ask TMS for the report's column layout, then render Oracle DDL.
    described = query.describe(args.url, args.user, args.password)
    # Synthetic key columns first, then any user-supplied extra columns.
    columns = [" a_key number(38, 0)", "a_workflow_history_key number(38, 0)"]
    for extra in additional_columns:
        columns.append("%s %s" % (extra[0], extra[1]))
    # Map TMS types onto Oracle column types; unknown types are skipped.
    oracle_types = {
        'text': " varchar2(512 char)",
        'int': " number(38,0)",
        'money': " number(19,4)",
        'floating': " binary_double",
        'datetime': " date",
        'integer': " number(12, 0)",
    }
    for col in described:
        col_name = protect_keyword(col[0])
        suffix = oracle_types.get(col[1])
        if suffix is not None:
            columns.append(col_name + suffix)
    sql = "create table ct_et_templates." + args.name + " (\n"
    sql = sql + ",\n ".join(columns)
    sql = sql + "\n)\n"
    # '-' or no destination writes to stdout, otherwise to a local file.
    if not args.destination or args.destination == '-':
        print(sql)
    else:
        with open(args.destination, 'w') as f:
            f.write(sql)
elif args.command == 'create-ods-model':
d = query.describe(args.url, args.user, args. password)
file_name = cModelsDir + args.name + '.yml'
f = open(file_name, 'w') # open file in append mode
f.write('version: %d\n' % args.version)
f.write('models:' + '\n')
f.write(' - name: ' + args.name + '_dbt\n')
f.write(' description: "A starter dbt model"' + '\n')
f.write(' columns:' + '\n')
for col in d:
f.write(' - name: ' + col[0] + '\n')
f.write(' data_type: ' + col[1] + '\n')
f.close()
file_name = cModelsDir + args.name + '.sql'
f = open(file_name, 'w') # open file in append mode
if args.destination and args.destination != '-':
if ':' in args.destination:
dest = args.destination.split(':', 2)
path = dest[1]
else:
path = args.destination
prefix = os.path.dirname(path)
else:
prefix = 'INBOX/TMS/' + args.name.upper() + '/'
pars = "ptablename => '%s', ptemplatetablename => 'ou_tms.%s', pprefix => '%s'" % (args.name, args.name, prefix)
print(f"creating table {args.name}")
f.write('{{\n config(\n post_hook = "call ct_mrds.file_manager.create_external_table(%s)"\n )\n}}\n\n' % pars)
f.write("{{ config(materialized='table') }}" + "\n")
f.write('with source_data as (' + "\n")
columns = []
columns.append("cast (1 as number(38,0)) as a_key")
columns.append("cast (1 as number(38,0)) as a_workflow_history_key")
for col in d:
name = protect_keyword(col[0])
match col[1]:
case 'text':
columns.append("cast ('x' as varchar2(255 char)) as " + name)
case 'int':
columns.append("cast (1 as number(38, 0)) as " + name)
case 'money':
columns.append("cast (1.0 as number(19,4)) as " + name)
case 'floating':
columns.append("cast (1.0 as binary_double) as " + name)
case 'datetime':
columns.append("cast (sysdate as date) as " + name)
case 'integer':
columns.append("cast (1 as number(12, 0)) as " + name)
f.write(' select\n ' + ',\n '.join(columns) + '\n')
f.write(')\nselect * from source_data\n ')
f.close()
elif args.command == 'retrieve':
ret = query.execute(args.url, args.user, args. password)
if query.format in ('scsv', 'standard_csv') and args.dataset:
# Save result to temporary spooled file for further processing
# We avoid doing this in memory to prevent issues with flow EffectivePermissions
f = tempfile.SpooledTemporaryFile(mode = 'w+', max_size = 200*1024*1024)
f.write(ret)
del ret
f.seek(0)
# Replace embedded newlines for '<br/>'
reader = csv.reader(f)
sio = StringIO()
writer = csv.writer(sio)
for l in reader:
l_tmp = [s.replace('\n', '<br/>') for s in l]
writer.writerow(l_tmp)
f.close()
# Necessary to read the data into an array of lines for further processing
sio.seek(0)
lines_tmp = sio.readlines()
del sio
if not lines_tmp:
ret = ""
else:
# Adding artificial columns A_KEY and A_WORKFLOW_HISTORY_KEY and added columns
additional_headers = [t[0] for t in additional_columns]
additional_values = [t[2] for t in additional_columns]
headers = ['A_KEY','A_WORKFLOW_HISTORY_KEY'] + additional_headers + [protect_keyword(h) for h in lines_tmp[0].split(',')]
lines = [','.join(headers) ]
i = 0
for l in lines_tmp[1:]:
lines.append(str(args.dataset*cDatasetMultiplier + i) + ',' + str(args.dataset) + ',' + ','.join(additional_values + [l]) )
i += 1
del lines_tmp
# Spooling again to temporary file to avoid duplication memory needs
f = tempfile.SpooledTemporaryFile(mode = 'w+', max_size = 200*1024*1024)
f.writelines(lines)
del lines
f.seek(0)
ret = f.read()
f.close()
if not args.destination or args.destination == '-':
print(ret, end='')
elif ':' not in args.destination:
with open(args.destination, 'w') as f:
f.write(ret)
else:
f = tempfile.NamedTemporaryFile(delete = False, mode = 'w', prefix = 'TMSDBT-', suffix = '.csv')
f.write(ret)
f.close()
dest = args.destination.split(':', 2)
bucket = dest[0]
dirname = os.path.dirname(dest[1])
filename = os.path.basename(dest[1])
client = mrds.utils.objectstore.get_client()
with open(f.name, "r") as file:
print(file.read())
mrds.utils.objectstore.upload_file(client, f.name,namespace, bucket, dirname, filename)
os.remove(f.name)
if ret:
sys.exit(0)
else:
sys.exit(1)

View File

@@ -0,0 +1,197 @@
import xml.etree.ElementTree as ET
import re
import base64
import sys
class XMLQuery:
    """Builder/runner for TMS report-generator XML requests.

    Public attribute access ('format', 'layout', 'parameter', 'xml') is routed
    through __setattr__/__getattr__ and stored under a leading underscore;
    assigning to 'xml' parses a full request document instead of storing it.
    """

    def __init__(self, xml=None):
        self._format = 'xml'
        self._layout = ''
        self._parameter = {}
        if xml:
            self._parse_xml(xml)

    def _parse_xml(self, xml):
        # Populate format/layout/parameters from a full request document.
        # The layout element carries the layout base64-encoded.
        self._tree = ET.fromstring(xml)
        layout_b64 = self._tree.find('layout').text
        self._layout = base64.b64decode(layout_b64).decode('utf-8')
        self._format = self._tree.find('format').get('type')
        self._parameter = {}
        for p in self._tree.findall('parameters/parameter'):
            self._parameter[p.get('name')] = p.text

    def execute(self, url, user, password):
        """POST this query to the TMS report service.

        :return: response body text on HTTP 200, otherwise None.
        """
        # curl -X POST --basic -u schilli:chili03 --data @tms_activity_interval.xml https://tmsxd104.ecbt1.tadnet.net:9443/report/
        import requests
        from requests.auth import HTTPBasicAuth
        data = str(self)
        basic = HTTPBasicAuth(user, password)
        # SECURITY: verify=False disables TLS certificate validation --
        # acceptable only for trusted internal endpoints.
        response = requests.post(url, data=data, auth=basic, verify=False)
        if response.status_code == 200:
            response.encoding = "utf-8"
            return response.text
        else:
            return None

    def describe(self, url, user, password):
        """Run the query in xml format and return [(column_name, type), ...].

        'unknown' column types are reported as 'integer'.
        """
        orig_format = self.format
        self.format = 'xml'
        try:
            ret = self.execute(url, user, password)
            # Truncate after the first PlainRow and close the root element so
            # only the column description needs to be parsed.
            # (Raw string fixes the invalid '\<' escape sequences.)
            m = re.match(r'^.*?<PlainRow>.*?</PlainRow>', ret, re.DOTALL)
            s = m[0] + '\n</report-generator>'
            tree = ET.fromstring(s)
            result = []
            row = tree.find('PlainRow')
            for c in row.findall('Column'):
                name = c.text
                col_type = c.get('type')
                if col_type == 'unknown':
                    col_type = 'integer'
                result.append((name, col_type))
            return result
        finally:
            # BUG FIX: orig_format was saved but never restored.
            self.format = orig_format

    @staticmethod
    def describe_simple(url, user, password, xml):
        """One-shot describe for a raw request document (no instance needed).

        BUG FIX: originally defined without 'self' or @staticmethod; it only
        worked when called on the class. @staticmethod makes the intent
        explicit and keeps XMLQuery.describe_simple(...) working.
        """
        query = XMLQuery(xml)
        query.format = 'xml'
        ret = query.execute(url=url, user=user, password=password)
        tree = ET.fromstring(ret)
        result = []
        row = tree.find('PlainRow')
        for c in row.findall('Column'):
            name = c.text
            col_type = c.get('type')
            if col_type == 'unknown':
                col_type = 'integer'
            result.append((name, col_type))
        return result

    def normalize_output(self, date_format='dd/MM/yyyy', time_format='HH:mm:ss'):
        """Force consistent date/time/number formatting in the layout."""
        lines = self.layout.splitlines()
        lines = [re.sub(r'^date_format\s*=.*', 'date_format=' + date_format, l) for l in lines]
        lines = [re.sub(r'^time_format\s*=.*', 'time_format=' + time_format, l) for l in lines]
        lines = [re.sub(r'^NoNumberFormatting\s*=.*', 'NoNumberFormatting=1', l) for l in lines]
        self.layout = '\n'.join(lines)

    def __setattr__(self, name, value):
        # Validate 'format' before storing; reject unsupported report formats.
        if name == 'format' and value not in ('bin', 'xml', 'xml3', 'html', 'txt', 'csv', 'standard_csv', 'scsv', 'pdf'):
            raise Exception("Invalid report format '" + value + "'")
        if not name.startswith('_'):
            name = '_' + name
        # Layouts always end with a newline.
        if name == '_layout' and not value.endswith('\n'):
            value = value + '\n'
        # Assigning 'xml' parses the document instead of storing it.
        if name == '_xml':
            self._parse_xml(value)
            return
        try:
            self.__dict__[name] = value
        except KeyError:
            raise AttributeError

    def __getattr__(self, name):
        # Called only for missing attributes: map public name -> private slot.
        if not name.startswith('_'):
            name = '_' + name
        try:
            return self.__dict__[name]
        except KeyError:
            raise AttributeError(name)

    def __str__(self):
        """Render the full report-generator request document (layout base64)."""
        parameters = ''
        for k in self._parameter:
            parameters = parameters + "\n<parameter name='%s'>%s</parameter>" % (k, self._parameter[k])
        layout_b64 = base64.b64encode(self.layout.encode('utf-8')).decode('utf-8')
        return ('<?xml version="1.0" encoding="utf-8"?>\n' + \
                '<report-generator>\n' + \
                ' <format type="%s"/>\n' + \
                ' <layout>\n%s</layout>\n' + \
                ' <parameters>%s\n</parameters>' + \
                '</report-generator>') % (self._format, layout_b64, parameters)
if __name__ == "__main__":
    # Ad-hoc manual test: load a request document, show the layout before and
    # after normalization, then describe the report's columns.
    request_path = sys.argv[1]
    print(request_path)
    with open(request_path) as f:
        xml = f.read()
    query = XMLQuery(xml)
    print(query.layout)
    query.normalize_output()
    print(query.layout)
    # NOTE(review): hard-coded internal endpoint and credentials below.
    desc = XMLQuery.describe_simple(url='https://tmsxd104.ecbt1.tadnet.net:9443/report/', user='schilli', password='chili03', xml=xml)
    print(str(desc))

View File

@@ -0,0 +1,355 @@
"""
DAG: w_ODS_TMS_TRANSACTION (expanded example)
Purpose:
- Load layout+parameter metadata from TMS-layouts/w_ODS_TMS_TRANSACTION.yml
- Call connectors/tms/TMSDBT.py to retrieve data into CSV in object storage
- On first run, generate Oracle DDL and create an external table
- Process file and record status in MRDS workflow tables
Notes:
- This is an expanded, readable version of the factory-generated DAG.
- Replace paths/usernames/password references as appropriate.
"""
import copy
import itertools
import json
import logging
import os
import re
import subprocess
import sys
import traceback
from datetime import datetime, timedelta
from pathlib import Path
import yaml
from airflow import DAG
from airflow.operators.python import PythonOperator
from pytz import timezone
# --- Project-specific deps (must exist in your Airflow image) ---
from mrds.core import main # noqa: F401 # imported to mirror the factory env
import mrds.utils.manage_files as mf
import mrds.utils.manage_runs as mr
# ---------- Paths & constants ----------
# Resolve all paths relative to this DAG file's own directory.
gScriptDir = Path(globals().get("__file__", "./_")).absolute().parent
gDataDir = str(gScriptDir / "TMS-layouts") + "/"
gConfigDir = str(gScriptDir / "config")
gConnDir = "/opt/airflow/python/connectors/tms/"
gTableDir = str(gScriptDir / "TMS-tables") + "/"

DAG_NAME = "w_ODS_TMS_TRANSACTION"
ODS_TABLE = DAG_NAME
DATABASE_NAME = "ODS"
WF_NAME = DAG_NAME

default_args = {
    "owner": "ecb",
    "depends_on_past": False,
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 0,
    "execution_timeout": timedelta(minutes=60),
    "retry_delay": timedelta(minutes=5),
}

# ---------- Load YAML configs once on parse ----------
with open(gDataDir + DAG_NAME + ".yml", "r") as f:
    report_desc = yaml.safe_load(f) or {}
with open(gConfigDir + "/TMS.yml", "r") as f:
    tms_config = yaml.safe_load(f)

# TMS + storage config
tms_url = tms_config["TMS-URL"]
tms_user = tms_config["TMS-user"]
tms_pwd = tms_config["TMS-password"]
prefix = tms_config["dest-prefix"] + DAG_NAME + "/" + DAG_NAME + "/"
data_prefix = tms_config["data-prefix"] + DAG_NAME + "/"
dest = tms_config["dest-bucket"] + ":" + prefix

# Split layout parameters into visible (exposed as DAG params) and hidden ones.
params_visible = {}
params_hidden = {}
params_dict = report_desc.get("parameters") or {}
for pname, meta in params_dict.items():
    val = meta.get("value", None)
    if meta.get("hidden", False):
        params_hidden[pname] = val
    else:
        params_visible[pname] = val
# ---------- Helpers (parameter handling) ----------
def _enum_param_combinations_recursive(params, keys):
    """
    Build all combinations of params (cartesian product), supporting
    'column(<name>)' derived lists aligned by index.

    Returns a list of rows; each row is a list of (key, value) pairs.
    """
    k = None
    result = []
    keys = list(keys)  # safe copy
    # Advance to the first key whose value is usable (truthy, or exactly "").
    while keys:
        k = keys.pop(0)
        v = params[k]
        if v or v == "":
            break
    if not k:
        # Empty key list: nothing to combine.
        return []
    # Normalize scalars to single-element lists.
    v = v if isinstance(v, list) else [v]
    # derived columns aligned with v (same length)
    derived_columns = []
    # params_dict[k] holds the definition, not just the value
    pdef = params_dict.get(k, {})
    for c in list(pdef):
        if re.match(r"column\(.*\)$", c):
            vtmp = pdef[c]
            vtmp = vtmp if isinstance(vtmp, list) else [vtmp]
            derived_columns.append((c, vtmp))
    if not keys:
        # Base case: one row per value of the last key, with its aligned
        # derived-column values attached.
        for i, value in enumerate(v):
            row = [(k, value)]
            for col_key, aligned_values in derived_columns:
                row.append((col_key, aligned_values[i]))
            result.append(row)
        return result
    # Recursive case: cross the remaining keys' combinations with this key's
    # values (deep-copying so rows stay independent).
    combinations = _enum_param_combinations_recursive(params, keys)
    for row in combinations:
        for i, vtmp in enumerate(v):
            new_row = copy.deepcopy(row)
            new_row.append((k, vtmp))
            for col_key, aligned_values in derived_columns:
                new_row.append((col_key, aligned_values[i]))
            result.append(new_row)
    return result
def _enum_param_combinations(params, sequential=False):
    """Enumerate all parameter combinations for *params* (cartesian product)."""
    # Sequential path omitted (buggy in factory; not used there either);
    # 'sequential' is accepted for interface compatibility but ignored.
    return _enum_param_combinations_recursive(params, list(params))
def _allowed_select(table, expression, condition="1 = 1"):
    """
    Guarded select used by eval_params(select(...)).
    Whitelist tables to avoid arbitrary reads.
    """
    allowed_tables = (
        ODS_TABLE.upper(),
        "DUAL",
        "CT_MRDS.A_WORKFLOW_HISTORY",
    )
    if table.upper() not in allowed_tables:
        raise Exception(f"Not allowed to select from {table}")
    # Delegate to the MRDS helper and return the first row only.
    return mr.select_ods_tab(table, expression, condition)[0]
def _eval_param(v):
"""
Evaluate special functional values:
- select(...) => guarded DB helper above
- eval(...) => strongly discouraged; keep disabled or restricted
"""
s = str(v) if v is not None else ""
if re.match(r"\s*select\(.*\)", s):
# Expose only 'select' symbol to eval
return eval(s, {"select": _allowed_select}, {})
if re.match(r"\s*eval\(.*\)\s*$", s):
# If you really must support eval, strictly sandbox or remove this path.
raise ValueError("eval(...) not allowed in this hardened DAG.")
return v
def _finalize_param_list(param_list):
    """
    Apply replacements and drop virtual params according to YAML definitions.

    Returns a list of (name, value) pairs ready for the connector CLI.
    """
    d = dict(param_list)
    # Replace parameter tokens inside another parameter (string replace).
    for pname, meta in params_dict.items():
        target = meta.get("replace_parameter")
        if target and target in d and pname in d and isinstance(d[target], str):
            d[target] = d[target].replace(pname, str(d[pname]))
    # Keep only non-virtual parameters.
    return [
        (k, v)
        for k, v in d.items()
        if not params_dict.get(k, {}).get("virtual", False)
    ]
# ---------- Core work ----------
def execute_report(**context):
    """
    For each parameter combination:
      - create workflow key
      - call TMSDBT.py retrieve to land CSV
      - if first time, create Oracle table from generated DDL
      - process file, record status in MRDS workflow tables (Y on success,
        N on failure; failures re-raise so the Airflow task fails too)
    """
    logger = logging.getLogger("airflow.task")
    logger.setLevel(logging.DEBUG)
    run_id = context["dag_run"].run_id
    all_params = {**params_visible, **params_hidden}
    # 1) Compute combinations
    combos = _enum_param_combinations(all_params)
    # 2) Evaluate select(...) etc and finalize
    evaluated = []
    for combo in combos or [[]]:
        # first pass: special evaluations
        pair_list = []
        for k, v in combo:
            pair_list.append((k, _eval_param(v)))
        # second pass: replacements + pruning
        evaluated.append(_finalize_param_list(pair_list))
    # if no combos at all, ensure we run once
    if not evaluated:
        evaluated = [[]]
    # Timing + workflow: one timestamp shared by every file in this run.
    ts = "{:%Y%m%d_%H%M%S}".format(datetime.now(timezone("Europe/Berlin")))
    for idx, param_list in enumerate(evaluated, start=1):
        wf_key = mr.init_workflow(DATABASE_NAME, WF_NAME, run_id)
        file_name = f"{WF_NAME}.{wf_key}.{ts}.csv"
        try:
            # Build connector command safely (no shell quoting games)
            cmd = [
                sys.executable,  # 'python'
                os.path.join(gConnDir, "TMSDBT.py"),
                "retrieve",
                "--name", WF_NAME,
                "--url", tms_url,
                "-U", tms_user,
                "--password", tms_pwd,
                "--layoutfile", gDataDir + DAG_NAME + ".fkr",
                "-f", "scsv",
                "--dataset", str(wf_key),
                "-d", dest + file_name,
            ]
            # Map params to -p or -c switches; 'column(<name>)' entries become
            # additional columns (-c), everything else a report parameter (-p).
            for k, v in param_list:
                sval = "" if v is None else str(v).rstrip()
                m = re.match(r"column\((.*)\)$", k)
                if m:
                    cmd.extend(["-c", f'{m.group(1)}={sval}'])
                else:
                    cmd.extend(["-p", f"{k}={sval}"])
                # Record each parameter against the workflow for traceability.
                mr.set_workflow_property(wf_key, DATABASE_NAME, k, sval)
            logger.debug("Running connector: %s", json.dumps(cmd))
            res = subprocess.run(cmd, capture_output=True, check=False)
            logger.debug("stdout: %s", res.stdout.decode(errors="ignore"))
            logger.debug("stderr: %s", res.stderr.decode(errors="ignore"))
            if res.returncode is None:
                raise RuntimeError("Connector returned no status")
            # rc 1 = connector ran fine but produced no data (see TMSDBT.py).
            if res.returncode == 1:
                logger.info("No data returned for wf_key=%s (continuing)", wf_key)
                mr.finalise_workflow(wf_key, "Y")
                continue
            if res.returncode != 0:
                raise RuntimeError(f"Connector failed (rc={res.returncode})")
            # Data landed -> ensure source config exists, bootstrap table if needed
            # (ODS_TABLE is a module constant, not untrusted input).
            cfg = mf.execute_query(
                "select * from CT_MRDS.A_SOURCE_FILE_CONFIG "
                f"where a_source_key = 'TMS' and table_id = '{ODS_TABLE}'"
            )
            if not cfg:
                # Generate DDL file via the connector's create-oracle-table mode.
                ddl_cmd = [
                    sys.executable,
                    os.path.join(gConnDir, "TMSDBT.py"),
                    "create-oracle-table",
                    "--name", WF_NAME,
                    "--url", tms_url,
                    "-U", tms_user,
                    "--password", tms_pwd,
                    "--layoutfile", gDataDir + DAG_NAME + ".fkr",
                    "-d", gTableDir + WF_NAME + ".sql",
                ]
                for k, v in param_list:
                    sval = "" if v is None else str(v).rstrip()
                    m = re.match(r"column\((.*)\)$", k)
                    if m:
                        ddl_cmd.extend(["-c", f'{m.group(1)}={sval}'])
                    else:
                        ddl_cmd.extend(["-p", f"{k}={sval}"])
                logger.debug("Generating DDL: %s", json.dumps(ddl_cmd))
                ddl_res = subprocess.run(ddl_cmd, capture_output=True, check=True)
                logger.debug("DDL stdout: %s", ddl_res.stdout.decode(errors="ignore"))
                logger.debug("DDL stderr: %s", ddl_res.stderr.decode(errors="ignore"))
                # Execute DDL and create external table
                sql = Path(gTableDir + WF_NAME + ".sql").read_text()
                mf.execute(sql)
                mf.add_column_date_format(
                    f"CT_ET_TEMPLATES.{ODS_TABLE}", "DEFAULT", "DD/MM/YYYY HH24:MI:SS"
                )
                mf.create_external_table(ODS_TABLE, f"CT_ET_TEMPLATES.{ODS_TABLE}", data_prefix)
                mf.add_source_file_config(
                    "TMS",
                    "INPUT",
                    DAG_NAME,
                    DAG_NAME,
                    r".*\.csv",
                    ODS_TABLE,
                    f"CT_ET_TEMPLATES.{ODS_TABLE}",
                )
            # Process landed file (register, move, etc. as per your mf impl)
            mf.process_source_file(prefix, file_name)
            mr.finalise_workflow(wf_key, "Y")
        except BaseException as ex:
            # rich error logging, then mark workflow failed and re-raise
            # (BaseException so even KeyboardInterrupt/SystemExit finalise 'N').
            ex_type, ex_value, ex_tb = sys.exc_info()
            tb = traceback.extract_tb(ex_tb)
            stack = [
                f"File: {t[0]}, Line: {t[1]}, Func: {t[2]}, Code: {t[3]}"
                for t in tb
            ]
            logging.error("Exception type: %s", ex_type.__name__)
            logging.error("Exception message: %s", ex_value)
            logging.error("Stack trace: %s", stack)
            mr.finalise_workflow(wf_key, "N")
            raise
# ---------- DAG definition ----------
with DAG(
    dag_id=DAG_NAME,
    default_args=default_args,
    description=DAG_NAME,
    schedule_interval=None,  # manual trigger
    params=params_visible,  # visible-only; hidden merged inside task
    start_date=datetime(2025, 1, 1),
    catchup=False,
    tags=[DAG_NAME],
) as dag:
    # Single task: retrieve the TMS report and load it into ODS.
    retrieve_report = PythonOperator(
        task_id="retrieve_report",
        python_callable=execute_report,
        execution_timeout=timedelta(minutes=30),
    )