This commit is contained in:
Grzegorz Michalski
2026-03-02 09:47:35 +01:00
commit 2c225d68ac
715 changed files with 130067 additions and 0 deletions

View File

@@ -0,0 +1,73 @@
from jinja2 import Environment, FileSystemLoader
import csv

# Generate per-table ELT artifacts from Jinja templates, one set per row of the
# SCHEMA,TABLE list in ods_mopdb_plain.txt:
#   source_qualifiers/<mapping>_SQ.sql  - dbt source-qualifier model
#   targets/<mapping>.sql               - dbt target model
#   dags/<workflow>.py                  - Airflow DAG
#   db/<schema>_<table>.sql             - per-table migration SQL (also appended
#                                         to the cumulative db/table_changes.sql)
environment = Environment(loader=FileSystemLoader("templates/"))
template_sq = environment.get_template("m_Template_sq.sql")
template_tgt = environment.get_template("m_Template_target.sql")
template_db = environment.get_template("db_Template.sql")
template_dag = environment.get_template("dag_Template.py")

# Use a context manager so the input handle is closed (the original leaked it).
with open("ods_mopdb_plain.txt", "r", newline="") as table_list:
    csvFile = csv.reader(table_list)
    header = next(csvFile)  # skip the SCHEMA,TABLE header row
    for row in csvFile:
        (schema, table) = row
        # e.g. TMS,ACTIVITYLOGDUE expands to
        # m_MOPDB_TMS_T_ACTIVITYLOGDUE_OU_TMS_ACTIVITYLOGDUE / w_MOPDB_TMS_T_ACTIVITYLOGDUE
        target_table = f"T_{table}"
        source_schema = f"OU_{schema}"
        source_schema_lower = source_schema.lower()
        source_table = table
        mapping_name = f"m_MOPDB_{schema}_{target_table}_{source_schema}_{source_table}"
        workflow_name = f"w_MOPDB_{schema}_{target_table}"

        # 1) dbt source-qualifier model
        content = template_sq.render(
            source_schema=source_schema_lower,
            source_table=table,
            target_schema=schema,
            target_table=target_table,
            mapping_name=mapping_name,
            workflow_name=workflow_name,
        )
        filename = f"source_qualifiers/{mapping_name}_SQ.sql"
        with open(filename, mode="w", encoding="utf-8") as message:
            message.write(content)
        # BUG FIX: the original f-strings contained a literal placeholder and
        # printed "... wrote (unknown)" instead of the generated file name.
        print(f"... wrote {filename}")

        # 2) dbt target model
        content = template_tgt.render(
            source_schema=source_schema_lower,
            source_table=table,
            target_schema=schema,
            target_table=target_table,
            mapping_name=mapping_name,
            workflow_name=workflow_name,
        )
        filename = f"targets/{mapping_name}.sql"
        with open(filename, mode="w", encoding="utf-8") as message:
            message.write(content)
        print(f"... wrote {filename}")

        # 3) Airflow DAG
        content = template_dag.render(table=table)
        filename = f"dags/{workflow_name}.py"
        with open(filename, mode="w", encoding="utf-8") as message:
            message.write(content)
        print(f"... wrote {filename}")

        # 4) per-table migration SQL ...
        content = template_db.render(table=table, schema=schema)
        filename = f"db/{source_schema}_{source_table}.sql"
        with open(filename, mode="w", encoding="utf-8") as message:
            message.write(content)
        print(f"... wrote {filename}")

        # ... plus the cumulative changes file (append mode)
        filename = "db/table_changes.sql"
        with open(filename, mode="a", encoding="utf-8") as message:
            message.write(f"{content}\n")
        print(f"... wrote {filename}")

View File

@@ -0,0 +1,30 @@
SCHEMA,TABLE
TMS,ACMENTRYSTATELEDGERGROUP
TMS,ACTIVITYLOGDUE
TMS,ACTIVITY_LOG
TMS,BALANCE
TMS,BLACKOUT_LOG
TMS,BRANCH
TMS,CALENDAR
TMS,CASHFLOW
TMS,CLIENT
TMS,CUSTODYBALANCE
TMS,ECBINSTRUMENTBONDCASHFLOW
TMS,EFFECTIVEROLEPROFILE
TMS,FINMESSAGELOG
TMS,HISTORY_LOG
TMS,INSTRUMENTBONDCASHFLOW
TMS,INSTRUMENT_REPORT
TMS,MARKETINFO
TMS,PARAMETER
TMS,PORTFOLIOTREE
TMS,PRICES
TMS,PROPERTY
TMS,RECONCILIATION
TMS,ROLEPORTFOLIOPROFILE
TMS,RULES
TMS,SDM_ENTITY_STATE
TMS,SECURITYPOSITION
TMS,SETTLEMENTCASHFLOW
TMS,SETTLEMENTLOG
TMS,USERINFORMATION

View File

@@ -0,0 +1,95 @@
from airflow.decorators import dag
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago
from airflow.utils.trigger_rule import TriggerRule
from cosmos import DbtTaskGroup, ProfileConfig, ProjectConfig, RenderConfig
# Define paths to your dbt files
dbt_root_path = "/home/dbt/DBT/mrds"
dbt_profiles_dir = "/home/dbt/.dbt/profiles.yml"
# {{table}} is substituted by the generator script when this template is rendered.
ODS_TABLE = "{{table}}"
DATABASE_NAME = "MOPDB"
# Workflow (DAG) and mapping (dbt tag) names follow the project's w_/m_ convention.
DAG_NAME = f"w_{DATABASE_NAME}_TMS_T_{ODS_TABLE}_OU_TMS_{ODS_TABLE}"
MAPPING_NAME = f"m_{DATABASE_NAME}_TMS_T_{ODS_TABLE}_OU_TMS_{ODS_TABLE}"
def retrieve_run_id(**kwargs):
    """Read the current run_id from the Airflow context, publish it to XCom
    under the key 'run_id', and return it."""
    context_run_id = kwargs['run_id']
    kwargs['ti'].xcom_push(key='run_id', value=context_run_id)
    return context_run_id
def check_dag_status(**kwargs):
    """Raise if any other task instance in this DAG run has failed, so the
    final status task marks the whole run as failed."""
    own_task_id = kwargs['task_instance'].task_id
    for instance in kwargs['dag_run'].get_task_instances():
        if instance.task_id == own_task_id:
            continue
        if instance.state == 'failed':
            raise Exception("Task {} failed. Failing this DAG run".format(instance.task_id))
# Determine the overall workflow status from the monitored upstream tasks
def determine_workflow_status(**kwargs):
    """Return 'Y' when every monitored task reported 'success', else 'N'."""
    statuses = kwargs['ti'].xcom_pull(
        task_ids=['retrieve_run_id', 'control_external_run_start', 'mapping_mopdb'])
    if all(status == 'success' for status in statuses):
        return 'Y'
    return 'N'
@dag(
    dag_id=DAG_NAME,
    schedule_interval=None,  # triggered externally, never on a schedule
    start_date=days_ago(2),
    catchup=False
)
def run_dag():
    """Orchestrate one MOPDB table load: capture the run_id, open the external
    run in dbt, execute the tagged dbt task group, close the run, then gate the
    overall DAG status on the other tasks' outcomes."""
    # Retrieve run_id
    retrieve_run_id_task = PythonOperator(
        task_id='retrieve_run_id',
        python_callable=retrieve_run_id,
        provide_context=True,
        # pool='my_custom_pool', # Create pool in Airflow Web UI with one slot to ensure that only one dag can run it at a time.
    )
    # Run dbt macro control_external_run_start
    # The {% raw %} markers survive template rendering as literal Airflow-Jinja
    # braces, so the xcom_pull is evaluated at task runtime, not at generation time.
    control_external_run_start = BashOperator(
        task_id='control_external_run_start',
        bash_command=(
            'cd /home/dbt/DBT/mrds && '
            'dbt run-operation control_external_run_start --vars \'{"orchestration_run_id": "{% raw %}{{{% endraw %} task_instance.xcom_pull(task_ids="retrieve_run_id", key="run_id") {% raw %}}}{% endraw %}", "input_service_name": "' + DATABASE_NAME + '", "workflow_name": "' + DAG_NAME + '"}\' '
            '--profiles-dir /home/dbt/.dbt/ --target dev'
        )
    )
    # run dbt taskGroup with tag of the mapping name
    dbtTaskGroup = DbtTaskGroup(
        group_id=MAPPING_NAME,
        project_config=ProjectConfig(
            dbt_project_path = dbt_root_path),
        profile_config=ProfileConfig(
            profiles_yml_filepath = dbt_profiles_dir,
            profile_name="mrds",
            target_name="dev"),
        # Only dbt models tagged with this mapping name are selected.
        render_config=RenderConfig(select=[f"tag:{MAPPING_NAME}"],),
        operator_args={'vars': {'orchestration_run_id': '{% raw %}{{{% endraw %} task_instance.xcom_pull(task_ids="retrieve_run_id", key="run_id") {% raw %}}}{% endraw %}', "input_service_name": DATABASE_NAME, "workflow_name": DAG_NAME }}
    )
    # Close the external run record even when upstream failed.
    control_external_run_end = BashOperator(
        task_id='control_external_run_end',
        bash_command=(
            'cd /home/dbt/DBT/mrds && '
            'dbt run-operation control_external_run_end --vars \'{"orchestration_run_id": "{% raw %}{{{% endraw %} task_instance.xcom_pull(task_ids="retrieve_run_id", key="run_id") {% raw %}}}{% endraw %}", "input_service_name": "' + DATABASE_NAME + '", "workflow_name": "' + DAG_NAME + '"}\' '
            '--profiles-dir /home/dbt/.dbt/ --target dev'
        ),
        trigger_rule=TriggerRule.ALL_DONE # Run regardless of previous task outcomes
    )
    dag_status = PythonOperator(
        task_id='dag_status',
        provide_context=True,
        python_callable=check_dag_status,
        trigger_rule=TriggerRule.ALL_DONE, # Ensures this task runs even if upstream fails
    )
    # Set task dependencies
    retrieve_run_id_task >> control_external_run_start >> [dbtTaskGroup] >> control_external_run_end >> dag_status
# Register the DAG object under its rendered name so Airflow discovers it.
globals()[DAG_NAME] = run_dag()

View File

@@ -0,0 +1,2 @@
{#- Rendered once per table by the generator script: renames the legacy audit
    FK on the source (OU_) table and drops the old target table so the dbt
    models can recreate it. NOTE(review): DROP TABLE is destructive — confirm
    each rendered T_<table> is safe to drop before executing the output. -#}
ALTER TABLE OU_{{schema}}.{{table}} RENAME COLUMN A_ETL_LOAD_SET_FK TO A_WORKFLOW_HISTORY_KEY;
DROP TABLE {{schema}}.T_{{table}};

View File

@@ -0,0 +1,11 @@
{#- Jinja template (rendered by the generator) for a dbt source-qualifier (SQ)
    model. The {% raw %}/{% endraw %} markers emit literal dbt-Jinja braces
    into the generated model file; the other placeholders are filled at
    generation time. This comment is stripped during rendering. -#}
{% raw %}{{{% endraw %}
config(
materialized="table",
tags=["{{mapping_name}}", "{{source_schema}}", "{{target_schema}}"],
alias="{{target_table}}_SQ",
schema="{{target_schema}}"
)
{% raw %}}}{% endraw %}
{% raw %}{{{% endraw %}
create_table_from_source("{{source_schema}}","{{source_table}}","{{workflow_name}}",get_main_task_name(model.name),['A_KEY','A_WORKFLOW_HISTORY_KEY'])
{% raw %}}}{% endraw %}

View File

@@ -0,0 +1,13 @@
{#- Jinja template (rendered by the generator) for a dbt incremental target
    model. The first rendered line is a dbt "-- depends_on" hint that forces a
    dependency on the corresponding _SQ model; {% raw %} markers emit literal
    dbt-Jinja braces. This comment is stripped during rendering. -#}
{% raw %}-- depends_on: {{{% endraw %} ref('{{mapping_name}}_SQ') {% raw %}}} {% endraw %}
{% raw %}{{{% endraw %}
config(
tags=["{{mapping_name}}","MOPDB","{{target_schema}}"],
alias="{{target_table}}",
schema="{{target_schema}}",
materialized="incremental" ,
incremental_strategy="merge"
)
{% raw %}}}{% endraw %}
{% raw %}{{{% endraw %}
create_table_target('{{mapping_name}}_SQ')
{% raw %}}}{% endraw %}

View File

@@ -0,0 +1,201 @@
import requests
import io
import zipfile
import pandas as pd
import os
from datetime import datetime
import oci
from mrds.utils.secrets import get_secret
import mrds.utils.manage_runs as runManager
import mrds.utils.manage_files as fileManager
import mrds.utils.sql_statements as sqls
import sys
import yaml
TASK_HISTORY_MULTIPLIER = 1_000_000_000
def initialize_task(workflow_context, task_name):
    """Register this task with the run manager and return its task-history key."""
    run_id = workflow_context["run_id"]
    workflow_key = workflow_context["a_workflow_history_key"]
    return runManager.init_task(task_name, run_id, workflow_key)
def rqsd_parser(fileName, bucket_path, file, bucket_name):
    """Dispatch a downloaded workbook to the matching annex parser based on
    markers in its file name; returns annex_2's result for RQSDC files."""
    is_scop = ("SCOPA" in fileName) or ("SCOPF" in fileName)
    if is_scop:
        print("SCOP")
        annex_1_1(fileName, bucket_path, file, bucket_name)
        annex_1_2(fileName, bucket_path, file, bucket_name)
        return None
    if "RQSDC" in fileName:
        print("RQSDC")
        return annex_2(fileName, bucket_path, file, bucket_name)
    return None
def annex_1_1(fileName, bucket_path, file, bucket_name):
    """Parse the 'Counterparties in scope' sheet of a SCOP workbook and upload
    it as CSV to <bucket_path>1_1/ in the given bucket.

    The CSV is enriched with the source file name, an ingestion timestamp, and
    metadata parsed from the underscore-separated file name (version number,
    reference exercise, NCB).
    """
    fileData = fileName.split("_")
    # BUG FIX: the original sliced the last four characters (fileName[:-4]),
    # which yields "name..csv" for .xlsx inputs; splitext handles any extension.
    csv_file_path = os.path.splitext(fileName)[0] + ".csv"
    version_number = fileData[6]
    ref_exercise = fileData[2]
    ncb = fileData[4]
    df = pd.read_excel(file, sheet_name="Counterparties in scope", skiprows=3)
    df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')
    df['file_name'] = os.path.basename(fileName)
    df['ingestion_timestamp'] = datetime.now().isoformat()
    df['version_number'] = version_number
    df['ref_exercise'] = ref_exercise
    df['ncb'] = ncb
    # Instance-principal auth: the empty dict is an intentionally empty config.
    # NOTE(review): the namespace is hard-coded here but configurable elsewhere
    # (BUCKET_NAMESPACE) — consider unifying.
    signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
    client = oci.object_storage.ObjectStorageClient({}, signer=signer)
    client.put_object("frcnomajoc7v", bucket_name, bucket_path + "1_1/" + csv_file_path,
                      bytes(df.to_csv(index=False), encoding='utf-8'))
    print("Finished uploading {}".format(csv_file_path))
    print(f"CSV saved to {csv_file_path}")
def annex_1_2(fileName, bucket_path, file, bucket_name):
    """Parse the 'Entities to which data relates' sheet of a SCOP workbook and
    upload it as CSV to <bucket_path>1_2/ in the given bucket.

    Enrichment columns match annex_1_1: file name, ingestion timestamp, and
    version/exercise/NCB parsed from the underscore-separated file name.
    """
    fileData = fileName.split("_")
    # BUG FIX: fileName[:-4] mangles names for .xlsx inputs; use splitext.
    csv_file_path = os.path.splitext(fileName)[0] + ".csv"
    version_number = fileData[6]
    ref_exercise = fileData[2]
    ncb = fileData[4]
    df = pd.read_excel(file, sheet_name="Entities to which data relates", skiprows=3)
    df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')
    df['file_name'] = os.path.basename(fileName)
    df['ingestion_timestamp'] = datetime.now().isoformat()
    df['version_number'] = version_number
    df['ref_exercise'] = ref_exercise
    df['ncb'] = ncb
    # Instance-principal auth: the empty dict is an intentionally empty config.
    signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
    client = oci.object_storage.ObjectStorageClient({}, signer=signer)
    client.put_object("frcnomajoc7v", bucket_name, bucket_path + "1_2/" + csv_file_path,
                      bytes(df.to_csv(index=False), encoding='utf-8'))
    print("Finished uploading {}".format(csv_file_path))
    print(f"CSV saved to {csv_file_path}")
def annex_2(fileName, bucket_path, file, bucket_name):
    """Parse the 'Data collection template' sheet of an RQSDC workbook and
    upload it as CSV to <bucket_path>2/ in the given bucket.
    """
    fileData = fileName.split("_")
    # Parameters parsed from the underscore-separated file name
    version_number = fileData[6]
    ref_exercise = fileData[2]
    ncb = fileData[4]
    # Read the template sheet, skipping the metadata rows
    df = pd.read_excel(file.getvalue(), sheet_name="Data collection template", skiprows=6)
    # Clean empty rows/columns
    df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')
    # Add metadata columns
    df['file_name'] = os.path.basename(fileName)
    df['ingestion_timestamp'] = datetime.now().isoformat()
    df['version_number'] = version_number
    df['ref_exercise'] = ref_exercise
    df['ncb'] = ncb
    # BUG FIX: the original built the name as fileName[:-4] + "csv" (missing
    # the dot), producing e.g. "reportcsv" for .xls inputs; use splitext.
    csvName = os.path.splitext(fileName)[0] + ".csv"
    # Upload via instance-principal auth (empty dict is an empty config).
    signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
    client = oci.object_storage.ObjectStorageClient({}, signer=signer)
    client.put_object("frcnomajoc7v", bucket_name, bucket_path + "2/" + csvName,
                      bytes(df.to_csv(index=False), encoding='utf-8'))
    print("Finished uploading {}".format(csvName))
    print(f"CSV saved to {csvName}")
def rqsd_preflow(secret_crt_id, secret_key_id, casper_api_url, collection_id):
    """Write the client certificate/key from the vault to disk and return the
    CASPER filevault metadata for completed, not-yet-processed files.

    Returns the parsed JSON list of downloadable file descriptors.
    Raises RuntimeError when the secrets or the metadata cannot be retrieved.
    """
    crt_path = os.getcwd() + "/rqsd_tst.crt"
    key_path = os.getcwd() + "/rqsd_tst.key.pem"
    try:
        with open(key_path, "w") as keyfile:
            keyfile.write(get_secret(secret_key_id))
        with open(crt_path, "w") as crtfile:
            crtfile.write(get_secret(secret_crt_id))
    except Exception as exc:
        print("Failed to retrieve certificates from secrets")
        # Best-effort cleanup: either file may not exist yet, and the original
        # unconditional os.remove could itself raise and mask the real error.
        for leftover in (crt_path, key_path):
            if os.path.exists(leftover):
                os.remove(leftover)
        raise RuntimeError("Failed to retrieve certificates from secrets") from exc
    protected_resource_url = casper_api_url + "/casper-api/filevault/"
    try:
        max_date = fileManager.execute_query("SELECT to_char(max(processing_end_time),'YYYY-MM-DD HH24:mi:ss') as MAX_PROCESSING_END_TIME FROM ct_ods.a_casper_filevault")
        # BUG FIX: the original tested `max_date is not []`, which is always
        # true (identity against a fresh list object); test for a non-empty
        # result instead so an empty table falls back to the unfiltered query.
        if max_date:
            filterString = ('isTest eq False and processingStatus eq "PS_COMPLETED" and processingEndTime gt '
                            + max_date[0].split(' ')[0])
        else:
            filterString = 'isTest eq False and processingStatus eq "PS_COMPLETED"'
        # NOTE(review): verify=False disables TLS verification — confirm this
        # is intentional for the internal CASPER endpoint.
        response = requests.get(protected_resource_url + "files/" + collection_id,
                                headers={"accept": "application/json"},
                                cert=(crt_path, key_path), verify=False,
                                params={"filter": filterString})
        print(response.text)
        files = response.json()
    except Exception as exc:
        print("Failed to retrieve ACC metadata, error during connection or request")
        raise RuntimeError("Failed to retrieve ACC metadata") from exc
    return files
def rqsd_process(files, casper_api_url, bucket_path, bucket_name):
    """Download each file listed in *files* from the CASPER filevault and feed
    it to rqsd_parser for parsing/upload.

    On any failure the client certificate/key written by rqsd_preflow are
    removed and a RuntimeError carrying the original cause is raised.
    """
    crt_path = os.getcwd() + "/rqsd_tst.crt"
    key_path = os.getcwd() + "/rqsd_tst.key.pem"
    for downloadable in files:
        try:
            print("\n\n")
            # NOTE(review): verify=False disables TLS verification — confirm intentional.
            response = requests.get(
                casper_api_url + "/casper-api/filevault/download/"
                + str(downloadable["dcId"]) + '/' + str(downloadable["fileID"]),
                headers={"accept": "application/json"},
                cert=(crt_path, key_path), verify=False)
            rqsd_parser(downloadable["fileName"], bucket_path, io.BytesIO(response.content), bucket_name)
        except Exception as exc:
            print(f"Failed to upload file into target bucket, files saved locally in {os.getcwd()}")
            os.remove(crt_path)
            os.remove(key_path)
            # Preserve the original cause instead of raising a bare Exception.
            raise RuntimeError("Failed to download or parse CASPER file") from exc
def add_a_key_column(headers, data_rows, task_history_key):
    """Prepend an A_KEY column: task_history_key * TASK_HISTORY_MULTIPLIER plus
    the 1-based row number, stored as a string on every row."""
    headers.insert(0, 'A_KEY')
    base = int(task_history_key) * TASK_HISTORY_MULTIPLIER
    row_number = 0
    for row in data_rows:
        row_number += 1
        row.insert(0, str(base + row_number))
def add_workflow_key_column(headers, data_rows, workflow_key):
    """Insert an A_WORKFLOW_HISTORY_KEY column directly after A_KEY.

    BUG FIX: the original inserted the header at index 1 but the row values at
    index 0, misaligning every data row by one column relative to the header
    (the DevoConnector class version inserts both at the same index).
    """
    headers.insert(1, 'A_WORKFLOW_HISTORY_KEY')
    for row in data_rows:
        row.insert(1, workflow_key)
def initialize_config(config_file_path):
    """Load a YAML configuration file and return its parsed content.

    Raises FileNotFoundError when the file does not exist.
    """
    if not os.path.exists(config_file_path):
        raise FileNotFoundError(f"Configuration file {config_file_path} not found.")
    with open(config_file_path, "r") as config_file:
        return yaml.safe_load(config_file)
def main(workflow_context, flow_config_path, env_config_path, env):
    """Run the RQSD ingestion flow end to end.

    Args:
        workflow_context: dict with 'run_id' and 'a_workflow_history_key'.
        flow_config_path: YAML file providing TASK_NAME, COLLECTION_ID, ODS_PREFIX.
        env_config_path: YAML file keyed by environment name.
        env: environment key (dev/tst/acc/prd).

    Raises RuntimeError when metadata retrieval or file processing fails.
    """
    # init setup (the unused current_time of the original was dropped)
    flow_info = initialize_config(flow_config_path)
    envs_info = initialize_config(env_config_path)
    environment_info = envs_info[env]
    a_task_history_key = initialize_task(workflow_context, flow_info['TASK_NAME'])
    try:
        # get list of files, then download/parse/upload each of them
        files = rqsd_preflow(environment_info["CERTIFICATE_FILE"],
                             environment_info["CERTIFICATE_KEY"],
                             environment_info["CASPER_URL"],
                             flow_info["COLLECTION_ID"])
        rqsd_process(files, environment_info["CASPER_URL"],
                     flow_info["ODS_PREFIX"], environment_info["BUCKET"])
    except Exception as exc:
        print("Failed to retrieve DEVO data, error during connection or request")
        # Record the failure in run metadata before propagating; the original
        # bare except dropped the cause and left the task unfinalised.
        runManager.finalise_task(a_task_history_key, 'N')
        raise RuntimeError("RQSD ingestion failed") from exc
    # Finalize task
    runManager.finalise_task(a_task_history_key, 'Y')

View File

@@ -0,0 +1,27 @@
# Environment Configuration
# Per-environment settings for the RQSD CASPER ingestion flow:
#   BUCKET             - OCI Object Storage bucket that receives the CSV output
#   CERTIFICATE_KEY    - OCID of the vault secret holding the client key (PEM)
#   CERTIFICATE_FILE   - OCID of the vault secret holding the client certificate
#   CASPER_URL         - base URL of the CASPER API
#   RQSD_COLLECTION_ID - CASPER filevault collection identifier
# NOTE(review): dev and tst share the same tst endpoint and secrets — confirm intended.
dev:
  BUCKET: "mrds_inbox_dev"
  CERTIFICATE_KEY: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya7r33ocatalf6jn6kg2xjhnya6kazlqd3e5gw6yghpd5q"
  CERTIFICATE_FILE: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyaeva4zvj6xdihljookamhse7jlyassfjb4p45xp46bwba"
  CASPER_URL: "https://internet.api.casper.tst.aws.tadnet.net"
  RQSD_COLLECTION_ID: "1537"
tst:
  BUCKET: "mrds_inbox_tst"
  CERTIFICATE_KEY: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya7r33ocatalf6jn6kg2xjhnya6kazlqd3e5gw6yghpd5q"
  CERTIFICATE_FILE: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyaeva4zvj6xdihljookamhse7jlyassfjb4p45xp46bwba"
  CASPER_URL: "https://internet.api.casper.tst.aws.tadnet.net"
  RQSD_COLLECTION_ID: "1537"
acc:
  BUCKET: "mrds_inbox_acc"
  CERTIFICATE_KEY: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya5snmftggydoszwchjra3ifa4pyiilgc26uqlhejnhcca"
  CERTIFICATE_FILE: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyaho5t4qgmlqctew6g6mcnwpz2p7z4nhxooyl6hc5sonfa"
  CASPER_URL: "https://internet.api.casper.stg.aws.ecb.de"
  RQSD_COLLECTION_ID: "1116"
prd:
  BUCKET: "mrds_inbox_prd"
  CERTIFICATE_KEY: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyahmv5sopfsv7nytxdyycehoyl5pd7sz5t2drn27qaneta"
  CERTIFICATE_FILE: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyame3chyqs6cdl2igeyrvzpj3s4vrndhbgeayt533uhgqa"
  CASPER_URL: "https://internet.api.casper.prd.aws.ecb.de"
  RQSD_COLLECTION_ID: "1030"

View File

@@ -0,0 +1,25 @@
# Environment Configuration
# Per-environment settings for the DEVO Impala exporter (Informatica accounts):
#   BUCKET_NAMESPACE - OCI Object Storage namespace
#   BUCKET           - target bucket for exported CSVs
#   DEVO_USERNAME    - Impala user
#   DEVO_HOSTNAME    - Impala proxy host
#   DEVO_SECRET      - OCID of the vault secret holding the Impala password
dev:
  BUCKET_NAMESPACE: "frcnomajoc7v"
  BUCKET: "mrds_inbox_dev"
  DEVO_USERNAME: "ap-informatica-ipcwt"
  DEVO_HOSTNAME: "impala-proxy-devo-lab21-impala01.dw-devo-lab21.om2y56.b0.cloudera.site"
  DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyavrevwxke46wjgj5nz3cc5kwwsybmngbji4zepones55q"
tst:
  BUCKET_NAMESPACE: "frcnomajoc7v"
  BUCKET: "mrds_inbox_tst"
  DEVO_USERNAME: "ap-informatica-ipcwt"
  DEVO_HOSTNAME: "t-impala.devo.escb.eu"
  DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyaxxx7yfifpgpdnxuj6dcowpoktwa6745kwwpezysd44oa"
acc:
  BUCKET_NAMESPACE: "frcnomajoc7v"
  BUCKET: "mrds_inbox_acc"
  DEVO_USERNAME: "ap-informatica-ipcwa"
  DEVO_HOSTNAME: "impala-proxy-devo-acc21-impala01.dw-devo-acc21.inym23.b0.cloudera.site"
  DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya4uttfadlzreloouw2e5bifgl2dvihffym5xoq3b3jmva"
prd:
  BUCKET_NAMESPACE: "frcnomajoc7v"
  BUCKET: "mrds_inbox_prd"
  DEVO_USERNAME: "ap-informatica-ipcwp"
  DEVO_HOSTNAME: "impala-proxy-devo-prd21-impala01.dw-devo-prd21.inym23.b0.cloudera.site"
  DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyanbahqlucid7qtzvoohsf4xrlul7cvhlsqttmbro4n66a"

View File

@@ -0,0 +1,25 @@
# Environment Configuration
# Per-environment settings for the DEVO Impala exporter (RQSD accounts):
#   BUCKET_NAMESPACE - OCI Object Storage namespace
#   BUCKET           - target bucket for exported CSVs
#   DEVO_USERNAME    - Impala user
#   DEVO_HOSTNAME    - Impala proxy host
#   DEVO_SECRET      - OCID of the vault secret holding the Impala password
# NOTE(review): dev reuses the tst user/host/secret — confirm intended.
dev:
  BUCKET_NAMESPACE: "frcnomajoc7v"
  BUCKET: "mrds_inbox_dev"
  DEVO_USERNAME: "ap-devo-rqsd-tst"
  DEVO_HOSTNAME: "t-impala.devo.escb.eu"
  DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyap6wtzobzob7qizvk4nocszlcaxhwijgzejbvryt3uzbq"
tst:
  BUCKET_NAMESPACE: "frcnomajoc7v"
  BUCKET: "mrds_inbox_tst"
  DEVO_USERNAME: "ap-devo-rqsd-tst"
  DEVO_HOSTNAME: "t-impala.devo.escb.eu"
  DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyap6wtzobzob7qizvk4nocszlcaxhwijgzejbvryt3uzbq"
acc:
  BUCKET_NAMESPACE: "frcnomajoc7v"
  BUCKET: "mrds_inbox_acc"
  DEVO_USERNAME: "ap-devo-rqsd-acc"
  DEVO_HOSTNAME: "impala-proxy-devo-acc21-impala01.dw-devo-acc21.inym23.b0.cloudera.site"
  DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyamzhgatnso57mubvg3c6k4ens3orcx4dieo6efukuvm4a"
prd:
  BUCKET_NAMESPACE: "frcnomajoc7v"
  BUCKET: "mrds_inbox_prd"
  DEVO_USERNAME: "ap-devo-rqsd-prd"
  DEVO_HOSTNAME: "impala-proxy-devo-prd21-impala01.dw-devo-prd21.inym23.b0.cloudera.site"
  DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyawpahgevgxv6csqnwil3p37vi6pthl466onnkg6k7undq"

View File

@@ -0,0 +1,259 @@
# devo_impala_exporter.py
import os
import io
import yaml
import datetime
import logging
from typing import Any, Dict, List, Optional, Tuple
import pandas as pd
from mrds.utils.secrets import get_secret
import mrds.utils.manage_runs as runManager
import mrds.utils.objectstore as objectstore
import oci
from impala.dbapi import (
connect,
ProgrammingError,
DatabaseError,
IntegrityError,
OperationalError,
)
from impala.error import HiveServer2Error
TASK_HISTORY_MULTIPLIER = 1_000_000_000
class DevoConnector:
    """
    Export the result of an Impala (DEVO) query to OCI Object Storage as CSV,
    while recording task run metadata via mrds runManager.

    Usage:
        exporter = DevoConnector(
            flow_config_path="/path/to/flow.yaml",
            env_config_path="/path/to/env.yaml",
            env="dev",
            logger=my_logger,              # optional
            oci_client=my_object_storage,  # optional ObjectStorageClient
            oci_signer=my_signer,          # optional signer (used if client not provided)
        )
        exporter.run({"run_id": 34, "a_workflow_history_key": 6})
    """

    def __init__(
        self,
        flow_config_path: str,
        env_config_path: str,
        env: str,
        logger: Optional[logging.Logger] = None,
        oci_client: Optional["oci.object_storage.ObjectStorageClient"] = None,
        oci_signer: Optional[Any] = None,
    ) -> None:
        """Load flow and environment configuration and prepare logging/OCI handles.

        Raises:
            FileNotFoundError: if either YAML file is missing.
            KeyError: if *env* is not a key of the environment config.
        """
        self.flow_info = self._initialize_config(flow_config_path)
        envs_info = self._initialize_config(env_config_path)
        # The namespace may be overridden through the process environment.
        BUCKET_NAMESPACE = os.getenv("BUCKET_NAMESPACE", "frcnomajoc7v")
        if env not in envs_info:
            raise KeyError(f"Environment '{env}' not found in {env_config_path}")
        self.environment_info = envs_info[env]
        self.environment_info["BUCKET_NAMESPACE"] = BUCKET_NAMESPACE
        self.env = env
        # logging
        self.logger = logger or self._default_logger(self.flow_info.get("TASK_NAME", "devo_task"))
        # OCI client/signer (kept for injection; upload goes via mrds objectstore)
        self.oci_client = oci_client
        self.oci_signer = oci_signer

    # -------------------------
    # Public API
    # -------------------------
    def run(self, workflow_context: Dict[str, Any]) -> int:
        """Execute the configured query, upload the CSV, and finalize the task.

        Returns the exported row count (0 when the query produced no rows).
        On failure the task is finalised with 'N' and the exception re-raised.
        """
        task_name = self.flow_info["TASK_NAME"]
        a_task_history_key = self._initialize_task(workflow_context, task_name)
        try:
            # credentials
            devo_secret_name = self.environment_info["DEVO_SECRET"]
            password = get_secret(devo_secret_name)
            self.logger.info("Retrieved secret for DEVO connection.")
            # query
            query = self.flow_info["DEVO_QUERY"]
            user = self.environment_info["DEVO_USERNAME"]
            host = self.environment_info["DEVO_HOSTNAME"]
            columns, data, rowcount = self._execute_query(query=query, user=user, hostname=host, password=password)
            df = self._tuple_to_dataframe((columns, data))
            self.logger.info("Query executed and DataFrame created with %d rows.", len(df))
            # upload only when there is something to upload
            if rowcount > 0:
                csv_name = f"{self.flow_info['OUTPUT_TABLE']}.csv"
                file_path = self._compose_object_path(self.flow_info["ODS_PREFIX"], csv_name)
                self._upload_dataframe_to_oci(df, csv_name, file_path)
                self.logger.info("Finished uploading %s to %s.", csv_name, file_path)
            else:
                # BUG FIX: the original returned here without finalising the
                # task, leaving it open in the run metadata.
                runManager.finalise_task(a_task_history_key, "Y")
                self.logger.info("No rows returned; task %s finalized with empty result.", task_name)
                return 0
            # success
            runManager.finalise_task(a_task_history_key, "Y")
            self.logger.info("Task %s finalized successfully.", task_name)
            return rowcount
        except Exception as e:
            # failure
            self.logger.exception("Run failed: %s", e)
            try:
                runManager.finalise_task(a_task_history_key, "N")
            finally:
                # re-raise for upstream handling if used as a library
                raise

    # -------------------------
    # Impala / DEVO
    # -------------------------
    @staticmethod
    def _get_impala_connection(hostname: str, user: str, secret: str):
        """Open a HiveServer2-over-HTTPS connection to the DEVO Impala proxy."""
        return connect(
            host=hostname,
            port=443,
            auth_mechanism="PLAIN",
            user=user,
            password=secret,
            use_http_transport=True,
            http_path="cliservice",
            use_ssl=True,
        )

    def _execute_query(self, query: str, user: str, hostname: str, password: str) -> Tuple[List[str], List[List[Any]], int]:
        """Run *query* and return (column_names, rows, rowcount).

        For non-SELECT statements the column list is empty and the single row
        carries the affected-row count. NOTE(review): impyla may report -1 as
        rowcount for SELECTs depending on driver version — confirm against the
        deployed release.
        """
        conn = self._get_impala_connection(hostname, user, password)
        cursor = None
        self.logger.info("Executing Impala query against host '%s' as user '%s'.", hostname, user)
        try:
            cursor = conn.cursor()
            cursor.execute(query)
            # The original tested startswith("select") OR substring containment;
            # the substring test subsumes the former, so only it is kept.
            if "select" in query.strip().lower():
                rows = cursor.fetchall()
                columns = [col[0] for col in cursor.description]
                return columns, rows, cursor.rowcount
            # BUG FIX: the non-SELECT branch returned a 2-tuple while every
            # caller unpacks three values; return a consistent 3-tuple.
            return [], [[cursor.rowcount]], cursor.rowcount
        except OperationalError as oe:
            raise RuntimeError("Failed to connect to Impala: " + str(oe)) from oe
        except ProgrammingError as pe:
            raise ValueError("Query syntax error: " + str(pe)) from pe
        except IntegrityError as ie:
            raise PermissionError("Insufficient permissions: " + str(ie)) from ie
        except DatabaseError as db_err:
            raise RuntimeError("Database error: " + str(db_err)) from db_err
        except HiveServer2Error as au_err:
            raise PermissionError("HiveServer2Error error: " + str(au_err)) from au_err
        except Exception as e:
            raise RuntimeError("An unexpected error occurred: " + str(e)) from e
        finally:
            try:
                if cursor:
                    cursor.close()
            finally:
                try:
                    conn.close()
                except Exception:
                    # log but don't mask the original exception
                    self.logger.warning("Failed to close Impala connection cleanly.", exc_info=True)

    # -------------------------
    # OCI Upload
    # -------------------------
    def _upload_dataframe_to_oci(self, df: pd.DataFrame, csv_name: str, object_path: str) -> None:
        """Serialize *df* as CSV (no index) and put it at *object_path* in the environment's bucket."""
        namespace = self.environment_info["BUCKET_NAMESPACE"]
        bucket = self.environment_info["BUCKET"]
        # convert DataFrame to CSV bytes without index
        csv_bytes = df.to_csv(index=False).encode("utf-8")
        client = objectstore.get_client()
        client.put_object(namespace, bucket, object_path, csv_bytes)
        self.logger.info("CSV '%s' uploaded to bucket '%s' (ns: '%s', key: '%s').", csv_name, bucket, namespace, object_path)

    # -------------------------
    # Utilities
    # -------------------------
    @staticmethod
    def _tuple_to_dataframe(data_tuple: Tuple[List[str], List[List[Any]]]) -> pd.DataFrame:
        """Build a DataFrame from (columns, rows); an empty column list means
        the rows carry a rowcount from a non-SELECT statement."""
        columns, data = data_tuple
        if not columns:
            return pd.DataFrame(data, columns=["rowcount"])
        return pd.DataFrame(data, columns=columns)

    @staticmethod
    def _initialize_config(config_file_path: str) -> Dict[str, Any]:
        """Load a YAML config file; raise FileNotFoundError when absent."""
        if not os.path.exists(config_file_path):
            raise FileNotFoundError(f"Configuration file {config_file_path} not found.")
        with open(config_file_path, "r") as f:
            return yaml.safe_load(f)

    @staticmethod
    def _initialize_task(workflow_context: Dict[str, Any], task_name: str) -> int:
        """Register the task with the run manager and return its history key."""
        return runManager.init_task(
            task_name,
            workflow_context["run_id"],
            workflow_context["a_workflow_history_key"],
        )

    @staticmethod
    def add_a_key_column(headers: List[str], data_rows: List[List[Any]], task_history_key: int) -> None:
        """Optionally add an A_KEY column (kept for parity with original script)."""
        headers.insert(0, "A_KEY")
        for i, row in enumerate(data_rows, start=1):
            a_key_value = int(task_history_key) * TASK_HISTORY_MULTIPLIER + i
            row.insert(0, str(a_key_value))

    @staticmethod
    def add_workflow_key_column(headers: List[str], data_rows: List[List[Any]], workflow_key: int) -> None:
        """Optionally add the workflow key column right after A_KEY if present, otherwise at position 0."""
        insert_idx = 1 if headers and headers[0] == "A_KEY" else 0
        headers.insert(insert_idx, "A_WORKFLOW_HISTORY_KEY")
        for row in data_rows:
            row.insert(insert_idx, workflow_key)

    @staticmethod
    def _compose_object_path(prefix: str, filename: str) -> str:
        """Join an object-storage prefix and file name with exactly one '/'.

        BUG FIX: the original interpolated a literal placeholder instead of
        *filename*, so the composed object path never contained the file name.
        """
        if prefix.endswith("/"):
            return f"{prefix}{filename}"
        return f"{prefix}/{filename}"

    @staticmethod
    def _default_logger(task_name: str) -> logging.Logger:
        """Create (at most once) a stream logger named after the task."""
        logger = logging.getLogger(f"{task_name}_logger")
        if not logger.handlers:
            logger.setLevel(logging.INFO)
            handler = logging.StreamHandler()
            fmt = logging.Formatter(f"%(asctime)s [{task_name}] %(levelname)s: %(message)s")
            handler.setFormatter(fmt)
            logger.addHandler(handler)
        return logger
# Optional: quick-run convenience if you ever want to execute this module directly.
if __name__ == "__main__":
    # Example only—adjust paths/env/context as needed or remove this block.
    # NOTE(review): hard-coded workstation paths and a fixed workflow context;
    # not suitable for scheduled/automated runs.
    exporter = DevoConnector(
        flow_config_path="/home/dbt/Marco/mrds_elt/airflow/ods/rqsd/rqsd_process/config/yaml/m_ODS_RQSD_OBSERVATIONS.yaml",
        env_config_path="/home/dbt/Marco/mrds_elt/python/connectors/devo/config/env_config_rqsd.yaml",
        env="dev",
    )
    exporter.run({"run_id": 34, "a_workflow_history_key": 6})

View File

@@ -0,0 +1,294 @@
import argparse
from TMSQuery import XMLQuery
import mrds.utils.objectstore
import tempfile
import re
import csv
from io import StringIO
import os.path
import os, psutil
import sys
namespace = os.getenv("BUCKET_NAMESPACE", "frcnomajoc7v")
def memory_usage():
    # Return the resident set size (RSS) of the current process in GiB.
    # (The original comment claimed a percentage "like top", but
    # rss / 1024^3 is an absolute size in gibibytes, not a percentage.)
    process = psutil.Process(os.getpid())
    mem = process.memory_info().rss/(1024*1024*1024)
    return mem
def protect_keyword(s):
    """Normalise a column name for Oracle DDL: lower-case, spaces to
    underscores, and a trailing underscore appended to names that collide
    with reserved words (comment, date, number)."""
    normalised = s.lower().replace(' ', '_')
    reserved = {'comment': 'comment_', 'date': 'date_', 'number': 'number_'}
    return reserved.get(normalised, normalised)
# --- CLI setup -------------------------------------------------------------
cModelsDir = sys.path[0] + '/../dbt/mrds/models/ods/'  # where generated dbt model files land
cDatasetMultiplier = 10000000

parser = argparse.ArgumentParser()
# BUG FIX: 'create-ods-model' is handled by a branch below but was missing from
# choices, so argparse rejected it before that branch could ever run.
parser.add_argument("command", choices=['create-model', 'create-ods-model', 'create-oracle-table', 'retrieve'], help="create-model retrieve")
parser.add_argument("-n", "--name", help="Name")
parser.add_argument("-u", "--url", required=True, help="URL of TMS service")
parser.add_argument("-U", "--user", required=True, help="TMS user")
parser.add_argument("-P", "--password", required=True, help="TMS password")
parser.add_argument("-x", "--xmlfile", help="XML file")
parser.add_argument("-l", "--layoutfile", help="layout file")
parser.add_argument("-f", "--format", help="output format")
parser.add_argument("-p", "--parameter", action="append", help="Parameter")
parser.add_argument("-c", "--column", action="append", help="Additional column")
parser.add_argument("-d", "--destination", help="destination")
parser.add_argument("-s", "--dataset", help="data set ID", type=int)
parser.add_argument("-v", "--version", help="data model version", type=int, default=1)
args = parser.parse_args()

# Build the TMS query from the optional XML/layout files and CLI parameters.
query = XMLQuery()
if args.xmlfile:
    with open(args.xmlfile) as f:
        xml = f.read()
    query.xml = xml
if args.layoutfile:
    with open(args.layoutfile) as f:
        layout = f.read()
    query.layout = layout
if args.format:
    query.format = args.format
if args.parameter:
    for p in args.parameter:
        [name, value] = p.split('=', 1)
        query.parameter[name] = value

# Additional columns are given as "name=value" where name may carry a type
# after '|', '/' or ':' (e.g. "col:number(12,0)"); default type varchar2(255).
additional_columns = []
if args.column:
    for p in args.column:
        [name, value] = p.split('=', 1)
        t = re.split(r'(?:\|)|(?:/)|(?::)', name, maxsplit=2)
        name = t[0]
        # Renamed from 'type' to avoid shadowing the builtin.
        col_type = None
        if len(t) == 2:
            col_type = t[1]
        if not col_type:
            col_type = 'varchar2(255)'
        additional_columns.append((name, col_type, value))

query.normalize_output()

from pathlib import Path
import pprint
# NOTE(review): leftover debug dump of the composed query — remove once stable.
p = Path('/tmp/kurt.xml')
p.write_text(str(query))
if args.command == 'create-oracle-table':
    # Describe the report's columns and emit a CREATE TABLE DDL for Oracle.
    d = query.describe(args.url, args.user, args. password)
    # Artificial key columns come first, then any -c extra columns.
    columns = [" a_key number(38, 0)", "a_workflow_history_key number(38, 0)"]
    for c in additional_columns:
        columns.append("%s %s"%(c[0], c[1]))
    for col in d:
        name = protect_keyword(col[0])
        # Map TMS report column types to Oracle column types.
        match col[1]:
            case 'text':
                columns.append(name + " varchar2(512 char)")
            case 'int':
                columns.append(name + " number(38,0)")
            case 'money':
                columns.append(name + " number(19,4)")
            case 'floating':
                columns.append(name + " binary_double")
            case 'datetime':
                columns.append(name + " date")
            case 'integer':
                columns.append(name + " number(12, 0)")
    sql = "create table ct_et_templates." + args.name + " (\n"
    sql = sql + ",\n ".join(columns)
    sql = sql + "\n)\n"
    # '-' or no destination means stdout.
    if not args.destination or args.destination == '-':
        print(sql)
    else:
        with open(args.destination, 'w') as f:
            f.write(sql)
elif args.command == 'create-ods-model':
    # Generate a dbt model (.yml schema + .sql model) for the report.
    # NOTE(review): 'create-ods-model' must be listed in the argparse
    # 'command' choices for this branch to be reachable.
    d = query.describe(args.url, args.user, args. password)
    file_name = cModelsDir + args.name + '.yml'
    f = open(file_name, 'w')  # mode 'w' truncates (old comment wrongly said append)
    f.write('version: %d\n' % args.version)
    f.write('models:' + '\n')
    f.write(' - name: ' + args.name + '_dbt\n')
    f.write(' description: "A starter dbt model"' + '\n')
    f.write(' columns:' + '\n')
    for col in d:
        f.write(' - name: ' + col[0] + '\n')
        f.write(' data_type: ' + col[1] + '\n')
    f.close()
    file_name = cModelsDir + args.name + '.sql'
    f = open(file_name, 'w')  # mode 'w' truncates (old comment wrongly said append)
    # Derive the external-table file prefix from the destination, if given.
    if args.destination and args.destination != '-':
        if ':' in args.destination:
            dest = args.destination.split(':', 2)
            path = dest[1]
        else:
            path = args.destination
        prefix = os.path.dirname(path)
    else:
        prefix = 'INBOX/TMS/' + args.name.upper() + '/'
    pars = "ptablename => '%s', ptemplatetablename => 'ou_tms.%s', pprefix => '%s'" % (args.name, args.name, prefix)
    print(f"creating table {args.name}")
    # The dbt post_hook creates the matching external table when the model builds.
    f.write('{{\n config(\n post_hook = "call ct_mrds.file_manager.create_external_table(%s)"\n )\n}}\n\n' % pars)
    f.write("{{ config(materialized='table') }}" + "\n")
    f.write('with source_data as (' + "\n")
    # Dummy casts give dbt the right column names and types.
    columns = []
    columns.append("cast (1 as number(38,0)) as a_key")
    columns.append("cast (1 as number(38,0)) as a_workflow_history_key")
    for col in d:
        name = protect_keyword(col[0])
        match col[1]:
            case 'text':
                columns.append("cast ('x' as varchar2(255 char)) as " + name)
            case 'int':
                columns.append("cast (1 as number(38, 0)) as " + name)
            case 'money':
                columns.append("cast (1.0 as number(19,4)) as " + name)
            case 'floating':
                columns.append("cast (1.0 as binary_double) as " + name)
            case 'datetime':
                columns.append("cast (sysdate as date) as " + name)
            case 'integer':
                columns.append("cast (1 as number(12, 0)) as " + name)
    f.write(' select\n ' + ',\n '.join(columns) + '\n')
    f.write(')\nselect * from source_data\n ')
    f.close()
elif args.command == 'retrieve':
    # Run the report and deliver the result to stdout, a file, or object storage.
    ret = query.execute(args.url, args.user, args. password)
    if query.format in ('scsv', 'standard_csv') and args.dataset:
        # Save result to temporary spooled file for further processing
        # We avoid doing this in memory to prevent issues with flow EffectivePermissions
        f = tempfile.SpooledTemporaryFile(mode = 'w+', max_size = 200*1024*1024)
        f.write(ret)
        del ret
        f.seek(0)
        # Replace embedded newlines for '<br/>'
        reader = csv.reader(f)
        sio = StringIO()
        writer = csv.writer(sio)
        for l in reader:
            l_tmp = [s.replace('\n', '<br/>') for s in l]
            writer.writerow(l_tmp)
        f.close()
        # Necessary to read the data into an array of lines for further processing
        sio.seek(0)
        lines_tmp = sio.readlines()
        del sio
        if not lines_tmp:
            ret = ""
        else:
            # Adding artificial columns A_KEY and A_WORKFLOW_HISTORY_KEY and added columns
            additional_headers = [t[0] for t in additional_columns]
            additional_values = [t[2] for t in additional_columns]
            headers = ['A_KEY','A_WORKFLOW_HISTORY_KEY'] + additional_headers + [protect_keyword(h) for h in lines_tmp[0].split(',')]
            lines = [','.join(headers) ]
            i = 0
            for l in lines_tmp[1:]:
                # A_KEY = dataset * cDatasetMultiplier + running row index.
                lines.append(str(args.dataset*cDatasetMultiplier + i) + ',' + str(args.dataset) + ',' + ','.join(additional_values + [l]) )
                i += 1
            del lines_tmp
            # Spooling again to temporary file to avoid duplication memory needs
            f = tempfile.SpooledTemporaryFile(mode = 'w+', max_size = 200*1024*1024)
            f.writelines(lines)
            del lines
            f.seek(0)
            ret = f.read()
            f.close()
    if not args.destination or args.destination == '-':
        print(ret, end='')
    elif ':' not in args.destination:
        with open(args.destination, 'w') as f:
            f.write(ret)
    else:
        # Destination of the form 'bucket:path' -> upload via the object store client.
        f = tempfile.NamedTemporaryFile(delete = False, mode = 'w', prefix = 'TMSDBT-', suffix = '.csv')
        f.write(ret)
        f.close()
        dest = args.destination.split(':', 2)
        bucket = dest[0]
        dirname = os.path.dirname(dest[1])
        filename = os.path.basename(dest[1])
        client = mrds.utils.objectstore.get_client()
        # NOTE(review): debug residue — echoes the whole uploaded file to stdout.
        with open(f.name, "r") as file:
            print(file.read())
        # NOTE(review): 'namespace' is not defined in this excerpt — presumably a
        # module-level value defined earlier in the file; verify.
        mrds.utils.objectstore.upload_file(client, f.name,namespace, bucket, dirname, filename)
        os.remove(f.name)
    # Exit 0 when data was returned, 1 when the report came back empty.
    if ret:
        sys.exit(0)
    else:
        sys.exit(1)

View File

@@ -0,0 +1,197 @@
import xml.etree.ElementTree as ET
import re
import base64
import sys
class XMLQuery:
    """Wrapper around a TMS report-generator XML request.

    Internal state is stored with leading underscores; the public attributes
    ``format``, ``layout``, ``parameter`` and ``xml`` are exposed through
    ``__getattr__``/``__setattr__`` (``format`` is validated, ``layout`` is
    newline-terminated, assigning ``xml`` re-parses the document).
    """

    def __init__(self, xml = None):
        self._format = 'xml'
        self._layout = ''
        self._parameter = {}
        if xml:
            self._parse_xml(xml)

    def _parse_xml(self, xml):
        # Populate format/layout/parameters from a serialized request document.
        self._tree = ET.fromstring(xml)
        layout_b64 = self._tree.find('layout').text
        self._layout = base64.b64decode(layout_b64).decode('utf-8')
        self._format = self._tree.find('format').get('type')
        self._parameter = {}
        for p in self._tree.findall('parameters/parameter'):
            self._parameter[p.get('name')] = p.text

    def execute(self, url, user, password):
        """POST the serialized request to the report service.

        Returns the response body on HTTP 200, None otherwise.
        """
        # curl -X POST --basic -u user:pwd --data @query.xml https://host:9443/report/
        import requests
        from requests.auth import HTTPBasicAuth
        data = str(self)
        basic = HTTPBasicAuth(user, password)
        response = requests.post(url, data=data, auth=basic, verify=False)
        if response.status_code == 200:
            response.encoding = "utf-8"
            return response.text
        return None

    @staticmethod
    def _columns_from_tree(tree):
        # Extract (name, type) pairs from the <PlainRow> column description;
        # 'unknown' is mapped to 'integer'.
        ret = []
        row = tree.find('PlainRow')
        for c in row.findall('Column'):
            name = c.text
            type = c.get('type')
            if type == 'unknown':
                type = 'integer'
            ret.append((name, type))
        return ret

    def describe(self, url, user, password):
        """Return [(column_name, column_type), ...] for this query.

        Temporarily switches the request format to 'xml' and restores the
        caller's format afterwards (the original implementation saved
        orig_format but never restored it, leaking format='xml').
        """
        orig_format = self.format
        self.format = 'xml'
        try:
            ret = self.execute(url, user, password)
        finally:
            self.format = orig_format
        # Truncate the (possibly huge) response after the first PlainRow and
        # close the root element so it parses as a standalone document.
        m = re.match(r'^.*?<PlainRow>.*?</PlainRow>', ret, re.DOTALL)
        s = m[0] + '\n</report-generator>'
        tree = ET.fromstring(s)
        return self._columns_from_tree(tree)

    @staticmethod
    def describe_simple(url, user, password, xml):
        """One-shot describe for a raw request document (no instance needed)."""
        query = XMLQuery(xml)
        query.format = 'xml'
        ret = query.execute(url = url, user = user, password = password)
        tree = ET.fromstring(ret)
        return XMLQuery._columns_from_tree(tree)

    def normalize_output(self, date_format = 'dd/MM/yyyy', time_format = 'HH:mm:ss'):
        """Force deterministic date/time formats and disable number formatting
        in the layout, so downstream CSV parsing is stable."""
        lines = self.layout.splitlines()
        lines = [re.sub(r'^date_format\s*=.*', 'date_format=' + date_format, l) for l in lines]
        lines = [re.sub(r'^time_format\s*=.*', 'time_format=' + time_format, l) for l in lines]
        lines = [re.sub(r'^NoNumberFormatting\s*=.*', 'NoNumberFormatting=1', l) for l in lines]
        self.layout = '\n'.join(lines)

    def __setattr__(self, name, value):
        # Validate the public 'format' attribute against supported values.
        if name == 'format' and value not in ('bin','xml','xml3','html','txt','csv','standard_csv', 'scsv', 'pdf'):
            raise Exception("Invalid report format '" + value + "'")
        if not name.startswith('_'):
            name = '_' + name
        # Layouts must be newline-terminated for the service.
        if name == '_layout' and not value.endswith('\n'):
            value = value + '\n'
        # Assigning 'xml' re-parses the whole document instead of storing it.
        if name == '_xml':
            self._parse_xml(value)
            return
        self.__dict__[name] = value

    def __getattr__(self, name):
        # Public names map onto their underscore-prefixed storage slots.
        if not name.startswith('_'):
            name = '_' + name
        try:
            return self.__dict__[name]
        except KeyError:
            raise AttributeError(name)

    def __str__(self):
        """Serialize the request as the report-generator XML document."""
        parameters = ''
        for k in self._parameter:
            parameters = parameters + "\n<parameter name='%s'>%s</parameter>" % (k, self._parameter[k])
        layout_b64 = base64.b64encode(self.layout.encode('utf-8')).decode('utf-8')
        return ('<?xml version="1.0" encoding="utf-8"?>\n' + \
                '<report-generator>\n' + \
                ' <format type="%s"/>\n' + \
                ' <layout>\n%s</layout>\n' + \
                ' <parameters>%s\n</parameters>' + \
                '</report-generator>') % (self._format, layout_b64, parameters)
if __name__ == "__main__":
    # Ad-hoc manual test driver: pass a request XML file on the command line,
    # print the layout before and after normalization, then describe columns.
    file = sys.argv[1]
    print(file)
    with open(file) as f:
        xml = f.read()
    query = XMLQuery(xml)
    print(query.layout)
    query.normalize_output()
    print(query.layout)
    #query.format='xml'
    #ret = query.execute(url = 'https://tmsxd104.ecbt1.tadnet.net:9443/report/', user = 'schilli', password = 'chili03')
    #print(ret)
    # NOTE(review): hard-coded test credentials below — consider removing.
    desc = XMLQuery.describe_simple(url = 'https://tmsxd104.ecbt1.tadnet.net:9443/report/', user = 'schilli', password = 'chili03', xml = xml)
    print(str(desc))

View File

@@ -0,0 +1,355 @@
"""
DAG: w_ODS_TMS_TRANSACTION (expanded example)
Purpose:
- Load layout+parameter metadata from TMS-layouts/w_ODS_TMS_TRANSACTION.yml
- Call connectors/tms/TMSDBT.py to retrieve data into CSV in object storage
- On first run, generate Oracle DDL and create an external table
- Process file and record status in MRDS workflow tables
Notes:
- This is an expanded, readable version of the factory-generated DAG.
- Replace paths/usernames/password references as appropriate.
"""
import copy
import itertools
import json
import logging
import os
import re
import subprocess
import sys
import traceback
from datetime import datetime, timedelta
from pathlib import Path
import yaml
from airflow import DAG
from airflow.operators.python import PythonOperator
from pytz import timezone
# --- Project-specific deps (must exist in your Airflow image) ---
from mrds.core import main # noqa: F401 # imported to mirror the factory env
import mrds.utils.manage_files as mf
import mrds.utils.manage_runs as mr
# ---------- Paths & constants ----------
# Resolve directories relative to this DAG file ('./_' fallback for exec-style loads).
gScriptDir = Path(globals().get("__file__", "./_")).absolute().parent
gDataDir = str(gScriptDir / "TMS-layouts") + "/"
gConfigDir = str(gScriptDir / "config")
gConnDir = "/opt/airflow/python/connectors/tms/"
gTableDir = str(gScriptDir / "TMS-tables") + "/"
DAG_NAME = "w_ODS_TMS_TRANSACTION"
ODS_TABLE = DAG_NAME
DATABASE_NAME = "ODS"
WF_NAME = DAG_NAME
default_args = {
    "owner": "ecb",
    "depends_on_past": False,
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 0,
    "execution_timeout": timedelta(minutes=60),
    "retry_delay": timedelta(minutes=5),
}
# ---------- Load YAML configs once on parse ----------
# Layout metadata for this report; missing/empty file yields {}.
with open(gDataDir + DAG_NAME + ".yml", "r") as f:
    report_desc = yaml.safe_load(f) or {}
with open(gConfigDir + "/TMS.yml", "r") as f:
    tms_config = yaml.safe_load(f)
# TMS + storage config
tms_url = tms_config["TMS-URL"]
tms_user = tms_config["TMS-user"]
tms_pwd = tms_config["TMS-password"]
prefix = tms_config["dest-prefix"] + DAG_NAME + "/" + DAG_NAME + "/"
data_prefix = tms_config["data-prefix"] + DAG_NAME + "/"
dest = tms_config["dest-bucket"] + ":" + prefix
# Visible vs hidden params (from layout YAML): visible ones are exposed as
# Airflow DAG params; hidden ones are merged back in inside the task.
params_visible = {}
params_hidden = {}
params_dict = report_desc.get("parameters") or {}
for p, meta in params_dict.items():
    val = meta.get("value", None)
    if not meta.get("hidden", False):
        params_visible[p] = val
    else:
        params_hidden[p] = val
# ---------- Helpers (parameter handling) ----------
def _enum_param_combinations_recursive(params, keys):
    """
    Build all combinations of params (cartesian product), supporting
    'column(<name>)' derived lists aligned by index.

    Returns a list of rows; each row is a list of (key, value) tuples.
    """
    k = None
    result = []
    keys = list(keys)  # safe copy
    # Advance to the first key with a usable value (truthy, or explicitly "").
    while keys:
        k = keys.pop(0)
        v = params[k]
        if v or v == "":
            break
    if not k:
        return []
    # NOTE(review): if the *last* key has a falsy value (None/False/0) the loop
    # exits without break and that falsy value is still expanded below — confirm
    # this is intended.
    v = v if isinstance(v, list) else [v]
    # derived columns aligned with v (same length)
    derived_columns = []
    # params_dict[k] holds the definition, not just the value
    pdef = params_dict.get(k, {})
    for c in list(pdef):
        if re.match(r"column\(.*\)$", c):
            vtmp = pdef[c]
            vtmp = vtmp if isinstance(vtmp, list) else [vtmp]
            derived_columns.append((c, vtmp))
    if not keys:
        # Base case: one row per value of this key (plus aligned derived columns).
        for i, value in enumerate(v):
            row = [(k, value)]
            for col_key, aligned_values in derived_columns:
                row.append((col_key, aligned_values[i]))
            result.append(row)
        return result
    # Recursive case: cross every deeper combination with every value of this key.
    combinations = _enum_param_combinations_recursive(params, keys)
    for row in combinations:
        for i, vtmp in enumerate(v):
            new_row = copy.deepcopy(row)
            new_row.append((k, vtmp))
            for col_key, aligned_values in derived_columns:
                new_row.append((col_key, aligned_values[i]))
            result.append(new_row)
    return result
def _enum_param_combinations(params, sequential=False):
    """Enumerate every parameter combination (cartesian product).

    The factory's 'sequential' mode was buggy and unused, so it is
    deliberately not implemented here either; the flag is kept only for
    signature compatibility.
    """
    all_keys = list(params)
    return _enum_param_combinations_recursive(params, all_keys)
def _allowed_select(table, expression, condition="1 = 1"):
    """
    Guarded select used by eval_params(select(...)).
    Only a small whitelist of tables may be read, to avoid arbitrary queries.
    """
    whitelist = {
        ODS_TABLE.upper(),
        "DUAL",
        "CT_MRDS.A_WORKFLOW_HISTORY",
    }
    if table.upper() not in whitelist:
        raise Exception(f"Not allowed to select from {table}")
    rows = mr.select_ods_tab(table, expression, condition)
    return rows[0]
def _eval_param(v):
"""
Evaluate special functional values:
- select(...) => guarded DB helper above
- eval(...) => strongly discouraged; keep disabled or restricted
"""
s = str(v) if v is not None else ""
if re.match(r"\s*select\(.*\)", s):
# Expose only 'select' symbol to eval
return eval(s, {"select": _allowed_select}, {})
if re.match(r"\s*eval\(.*\)\s*$", s):
# If you really must support eval, strictly sandbox or remove this path.
raise ValueError("eval(...) not allowed in this hardened DAG.")
return v
def _finalize_param_list(param_list):
    """
    Apply replacements and drop virtual params according to YAML definitions.

    Returns the finalized parameters as a list of (key, value) tuples.
    """
    resolved = dict(param_list)
    # Substitute parameter tokens inside other parameters' string values.
    for pname, meta in params_dict.items():
        target = meta.get("replace_parameter")
        if not target:
            continue
        if target in resolved and pname in resolved and isinstance(resolved[target], str):
            resolved[target] = resolved[target].replace(pname, str(resolved[pname]))
    # Keep only the parameters not marked 'virtual' in the YAML.
    return [
        (key, value)
        for key, value in resolved.items()
        if not params_dict.get(key, {}).get("virtual", False)
    ]
# ---------- Core work ----------
def execute_report(**context):
    """
    For each parameter combination:
      - create workflow key
      - call TMSDBT.py retrieve to land CSV
      - if first time, create Oracle table from generated DDL
      - process file, record status in MRDS workflow tables (Y/N)
    """
    logger = logging.getLogger("airflow.task")
    logger.setLevel(logging.DEBUG)
    run_id = context["dag_run"].run_id
    # Hidden params are merged back in here; only visible ones are DAG params.
    all_params = {**params_visible, **params_hidden}
    # 1) Compute combinations
    combos = _enum_param_combinations(all_params)
    # 2) Evaluate select(...) etc and finalize
    evaluated = []
    for combo in combos or [[]]:
        # first pass: special evaluations
        pair_list = []
        for k, v in combo:
            pair_list.append((k, _eval_param(v)))
        # second pass: replacements + pruning
        evaluated.append(_finalize_param_list(pair_list))
    # if no combos at all, ensure we run once
    if not evaluated:
        evaluated = [[]]
    # Timestamp (Berlin time) shared by all files of this run.
    ts = "{:%Y%m%d_%H%M%S}".format(datetime.now(timezone("Europe/Berlin")))
    for idx, param_list in enumerate(evaluated, start=1):
        wf_key = mr.init_workflow(DATABASE_NAME, WF_NAME, run_id)
        file_name = f"{WF_NAME}.{wf_key}.{ts}.csv"
        try:
            # Build connector command safely (no shell quoting games)
            cmd = [
                sys.executable,  # 'python'
                os.path.join(gConnDir, "TMSDBT.py"),
                "retrieve",
                "--name", WF_NAME,
                "--url", tms_url,
                "-U", tms_user,
                "--password", tms_pwd,
                "--layoutfile", gDataDir + DAG_NAME + ".fkr",
                "-f", "scsv",
                "--dataset", str(wf_key),
                "-d", dest + file_name,
            ]
            # Map params to -p (parameter) or -c (additional column) switches.
            for k, v in param_list:
                sval = "" if v is None else str(v).rstrip()
                m = re.match(r"column\((.*)\)$", k)
                if m:
                    cmd.extend(["-c", f'{m.group(1)}={sval}'])
                else:
                    cmd.extend(["-p", f"{k}={sval}"])
                mr.set_workflow_property(wf_key, DATABASE_NAME, k, sval)
            logger.debug("Running connector: %s", json.dumps(cmd))
            res = subprocess.run(cmd, capture_output=True, check=False)
            logger.debug("stdout: %s", res.stdout.decode(errors="ignore"))
            logger.debug("stderr: %s", res.stderr.decode(errors="ignore"))
            if res.returncode is None:
                raise RuntimeError("Connector returned no status")
            # rc=1 is the connector's "no data" signal, not an error.
            if res.returncode == 1:
                logger.info("No data returned for wf_key=%s (continuing)", wf_key)
                mr.finalise_workflow(wf_key, "Y")
                continue
            if res.returncode != 0:
                raise RuntimeError(f"Connector failed (rc={res.returncode})")
            # Data landed -> ensure source config exists, bootstrap table if needed
            cfg = mf.execute_query(
                "select * from CT_MRDS.A_SOURCE_FILE_CONFIG "
                f"where a_source_key = 'TMS' and table_id = '{ODS_TABLE}'"
            )
            if not cfg:
                # First run for this table: generate the DDL file via the connector.
                ddl_cmd = [
                    sys.executable,
                    os.path.join(gConnDir, "TMSDBT.py"),
                    "create-oracle-table",
                    "--name", WF_NAME,
                    "--url", tms_url,
                    "-U", tms_user,
                    "--password", tms_pwd,
                    "--layoutfile", gDataDir + DAG_NAME + ".fkr",
                    "-d", gTableDir + WF_NAME + ".sql",
                ]
                for k, v in param_list:
                    sval = "" if v is None else str(v).rstrip()
                    m = re.match(r"column\((.*)\)$", k)
                    if m:
                        ddl_cmd.extend(["-c", f'{m.group(1)}={sval}'])
                    else:
                        ddl_cmd.extend(["-p", f"{k}={sval}"])
                logger.debug("Generating DDL: %s", json.dumps(ddl_cmd))
                ddl_res = subprocess.run(ddl_cmd, capture_output=True, check=True)
                logger.debug("DDL stdout: %s", ddl_res.stdout.decode(errors="ignore"))
                logger.debug("DDL stderr: %s", ddl_res.stderr.decode(errors="ignore"))
                # Execute DDL and create external table + source file config.
                sql = Path(gTableDir + WF_NAME + ".sql").read_text()
                mf.execute(sql)
                mf.add_column_date_format(
                    f"CT_ET_TEMPLATES.{ODS_TABLE}", "DEFAULT", "DD/MM/YYYY HH24:MI:SS"
                )
                mf.create_external_table(ODS_TABLE, f"CT_ET_TEMPLATES.{ODS_TABLE}", data_prefix)
                mf.add_source_file_config(
                    "TMS",
                    "INPUT",
                    DAG_NAME,
                    DAG_NAME,
                    r".*\.csv",
                    ODS_TABLE,
                    f"CT_ET_TEMPLATES.{ODS_TABLE}",
                )
            # Process landed file (register, move, etc. as per your mf impl)
            mf.process_source_file(prefix, file_name)
            mr.finalise_workflow(wf_key, "Y")
        except BaseException as ex:
            # rich error logging, then mark workflow failed and re-raise
            ex_type, ex_value, ex_tb = sys.exc_info()
            tb = traceback.extract_tb(ex_tb)
            stack = [
                f"File: {t[0]}, Line: {t[1]}, Func: {t[2]}, Code: {t[3]}"
                for t in tb
            ]
            logging.error("Exception type: %s", ex_type.__name__)
            logging.error("Exception message: %s", ex_value)
            logging.error("Stack trace: %s", stack)
            mr.finalise_workflow(wf_key, "N")
            raise
# ---------- DAG definition ----------
# Single-task DAG: all the work happens inside execute_report.
with DAG(
    dag_id=DAG_NAME,
    default_args=default_args,
    description=DAG_NAME,
    schedule_interval=None,  # manual trigger
    params=params_visible,  # visible-only; hidden merged inside task
    start_date=datetime(2025, 1, 1),
    catchup=False,
    tags=[DAG_NAME],
) as dag:
    retrieve_report = PythonOperator(
        task_id="retrieve_report",
        python_callable=execute_report,
        execution_timeout=timedelta(minutes=30),
    )

View File

View File

@@ -0,0 +1,86 @@
dev:
DEVO_USERNAME: "ap-devo_lab-mrds"
IMPALA_HOSTNAME: "impala-proxy-devo-lab21-impala01.dw-devo-lab21.om2y56.b0.cloudera.site"
HIVE_HOSTNAME: 'hs2-devo-lab21-hive01.dw-devo-lab21.om2y56.b0.cloudera.site'
RANGER_HOSTNAME: "https://devo-lab21-dl-gateway.devo-lab.om2y56.b0.cloudera.site:443/devo-lab21-dl/cdp-proxy-api/ranger"
BUCKET_PREFIX: "s3a://devo-crp-ffppyd8q/"
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya3tsglrzfgiyfisxchref774l5y4nrler2vn54lr3li7q"
S3_LOCATION_URI: "https://devo-crp-ffppyd8q.bucket.vpce-040b28f5818b670c1-owicl3ow.s3.eu-central-1.vpce.amazonaws.com/{0}/db/"
DEVO_USERNAME_RQSD: "ap-devo-rqsd-lab"
DEVO_SECRET_RQSD: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyap6wtzobzob7qizvk4nocszlcaxhwijgzejbvryt3uzbq"
FULL_ACCESS_LIST_RAR: "DISC-DC-RAR-R"
FULL_ACCESS_LIST_MOPDB: ""
FULL_ACCESS_LIST_RQSD: ""
tst:
DEVO_USERNAME: "ap-devo_tst-mrds"
IMPALA_HOSTNAME: "t-impala.devo.escb.eu"
HIVE_HOSTNAME: "hs2-devo-tst21-hive01.dw-devo-tst21.om2y56.b0.cloudera.site"
RANGER_HOSTNAME: "https://devo-tst21-dl-gateway.devo-tst.om2y56.b0.cloudera.site:443/devo-tst21-dl/cdp-proxy-api/ranger"
BUCKET_PREFIX: "s3a://devo-crp-sbul3ju3/"
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyayqqotyowhpoml3v5szkwhmtu4rq6bplpkvdruzupz3ma"
S3_LOCATION_URI: "https://devo-crp-sbul3ju3.bucket.vpce-040b28f5818b670c1-owicl3ow.s3.eu-central-1.vpce.amazonaws.com/{0}/db/"
DEVO_USERNAME_RQSD: "ap-devo-rqsd-tst"
DEVO_SECRET_RQSD: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyap6wtzobzob7qizvk4nocszlcaxhwijgzejbvryt3uzbq"
FULL_ACCESS_LIST_RAR: "DISC-TC-RAR-R"
FULL_ACCESS_LIST_MOPDB: ""
FULL_ACCESS_LIST_RQSD: ""
acc:
DEVO_USERNAME: "ap-devo_acc-mrds"
IMPALA_HOSTNAME: "impala-proxy-devo-acc21-impala01.dw-devo-acc21.inym23.b0.cloudera.site"
HIVE_HOSTNAME: "hs2-devo-acc21-hive01.dw-devo-acc21.inym23.b0.cloudera.site"
RANGER_HOSTNAME: "https://devo-acc21-dl-gateway.devo-acc.inym23.b0.cloudera.site/devo-acc21-dl/cdp-proxy-api/ranger/"
BUCKET_PREFIX: "s3a://devo-crp-sbc9vbsu/"
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya3x3nic3vxsnpzlfshz2ubj6kekny5tvaqsnwkuh2hw2a"
S3_LOCATION_URI: "https://devo-crp-sbc9vbsu.bucket.vpce-0bf4fa440fb60935d-6m9iqoo9.s3.eu-central-1.vpce.amazonaws.com/{0}/db/"
DEVO_USERNAME_RQSD: "ap-devo-rqsd-acc"
DEVO_SECRET_RQSD: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyamzhgatnso57mubvg3c6k4ens3orcx4dieo6efukuvm4a"
FULL_ACCESS_LIST_RAR: "DISC-AC-RAR-R"
FULL_ACCESS_LIST_MOPDB: ""
FULL_ACCESS_LIST_RQSD: ""
prd:
DEVO_USERNAME: "ap-devo_prd-mrds"
IMPALA_HOSTNAME: "impala-proxy-devo-prd21-impala01.dw-devo-prd21.inym23.b0.cloudera.site"
HIVE_HOSTNAME: "hs2-devo-prd21-hive01.dw-devo-prd21.inym23.b0.cloudera.site"
RANGER_HOSTNAME: "https://devo-prd21-dl-gateway.devo-prd.inym23.b0.cloudera.site/devo-prd21-dl/cdp-proxy-api/ranger/"
BUCKET_PREFIX: "s3a://devo-crp-2gn5maj9/"
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyace73o3xowa3f3jkw4diqzoiyc6skt34sqnnx4yrbykmq"
S3_LOCATION_URI: "https://devo-crp-2gn5maj9.bucket.vpce-0aa6cf4490536dfd5-qgy4w5sz.s3.eu-central-1.vpce.amazonaws.com/{0}/db/"
DEVO_USERNAME_RQSD: "ap-devo-rqsd-prd"
DEVO_SECRET_RQSD: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyacodc43tfgumkw4qyzw4s3j4jp42vp2elakkpwwrmivqa"
FULL_ACCESS_LIST_RAR: "DISC-PC-RAR-R"
FULL_ACCESS_LIST_MOPDB: ""
FULL_ACCESS_LIST_RQSD: ""
rar:
corporate_store: "crp_rar"
oracle_metadata_table: "CORR_RAR.NH_METADATA_INVENTORY"
oracle_igam_table: "CT_MRDS.A_DEVO_SOURCES_IGAM"
oracle_mgmt_table: "CT_MRDS.A_DEVO_REPLICA_MGMT_RAR"
target_s3_bucket: "rar/db"
tech_meta_data_fields: "tec_ingestion_date String, tec_execution_date String, tec_run_id String"
mopdb:
corporate_store: "crp_mopdb"
oracle_metadata_table: "CT_MOPDB.MOPDB_METADATA_INVENTORY"
oracle_igam_table: "CT_MRDS.A_DEVO_SOURCES_IGAM"
oracle_mgmt_table: "CT_MRDS.A_DEVO_REPLICA_MGMT_MOPDB"
target_s3_bucket: "mopdb/db"
tech_meta_data_fields: "tec_ingestion_date String, tec_execution_date String, tec_run_id String"
rqsd:
corporate_store: "crp_rqsd"
oracle_metadata_table: "CT_MRDS.A_DEVO_METADATA_INVENTORY"
oracle_igam_table: "CT_MRDS.A_DEVO_SOURCES_IGAM"
oracle_mgmt_table: "CT_MRDS.A_DEVO_REPLICA_MGMT_RQSD"
target_s3_bucket: "rqsd/db"
tech_meta_data_fields: "tec_ingestion_date String, tec_execution_date String, tec_run_id String"
# -- Look up the replica's target table name (fill in OWNER and TABLE_NAME):
# SELECT DISTINCT TABLE_ALIAS FROM {oracle_mgmt_table}
#   WHERE OWNER = '<owner>'
#     AND TABLE_NAME = '<table_name>';
# -- Look up the type of access for a table (fill in OWNER and TABLE_NAME):
# SELECT DISTINCT RAR3_TYPE_OF_ACCESS FROM {oracle_metadata_table}
#   WHERE A_VALID_TO > SYSDATE
#     AND OWNER = '<owner>'
#     AND TABLE_NAME = '<table_name>';

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env python3
import sys, json
import boto3
from botocore.exceptions import ClientError
from botocore.config import Config
BUCKET = "devo-crp-sbc9vbsu"  # bucket targeted by all connectivity probes below
PREFIX = "mopdb/db/"  # adjust if needed
def show(e):
    """Dump a botocore error's structured fields (code/message/request id) to stderr.

    Works with any exception: missing fields simply print as None.
    """
    resp = getattr(e, "response", {})
    err = resp.get("Error", {})
    code = err.get("Code")
    msg = err.get("Message")
    rid = resp.get("ResponseMetadata", {}).get("RequestId")
    print(f"{type(e).__name__}: {code} {msg} (RequestId={rid})", file=sys.stderr)
def main(endpoint_url=None, region=None, force_path=False):
    """Probe STS identity and S3 bucket/prefix permissions step by step.

    Returns 0 on full success, or 1-4 identifying the first failing probe.
    """
    session = boto3.Session()
    addressing = "path" if force_path else "auto"
    cfg = Config(s3={"addressing_style": addressing})
    s3 = session.client("s3", region_name=region, endpoint_url=endpoint_url, config=cfg)
    sts = session.client("sts", region_name=region)

    # Probe 1: are the credentials valid for STS at all?
    try:
        ident = sts.get_caller_identity()
        print(f"Caller: {ident['Arn']} (acct {ident['Account']})")
    except Exception as e:
        print("Could not call STS get-caller-identity — credentials not valid for STS.", file=sys.stderr)
        show(e)
        return 1

    # Probe 2: is the bucket reachable at all?
    try:
        s3.head_bucket(Bucket=BUCKET)
        print(f"head_bucket OK on s3://{BUCKET}")
    except ClientError as e:
        print("head_bucket failed:", file=sys.stderr)
        show(e)
        return 2

    # Probe 3: test just the ListBucket permission (zero keys requested).
    try:
        s3.list_objects_v2(Bucket=BUCKET, Prefix=PREFIX, MaxKeys=0)
        print(f"list_objects_v2 OK on prefix '{PREFIX}' (permission exists)")
    except ClientError as e:
        print("list_objects_v2 failed:", file=sys.stderr)
        show(e)
        return 3

    # Probe 4: fetch one key to confirm the data path works.
    try:
        resp = s3.list_objects_v2(Bucket=BUCKET, Prefix=PREFIX, MaxKeys=1)
        print("First key:", resp.get("Contents", [{}])[0].get("Key"))
    except ClientError as e:
        print("list_objects_v2 (MaxKeys=1) failed:", file=sys.stderr)
        show(e)
        return 4

    return 0
if __name__ == "__main__":
    # Allow optional args: --endpoint-url URL --region eu-central-1 --force-path
    # NOTE(review): a value-taking flag given as the LAST argv entry would raise
    # IndexError on sys.argv[i+1]; acceptable for an ad-hoc probe, but verify.
    url = None; reg = None; force = False
    for i,a in enumerate(sys.argv):
        if a == "--endpoint-url": url = sys.argv[i+1]
        if a == "--region": reg = sys.argv[i+1]
        if a == "--force-path": force = True
    sys.exit(main(endpoint_url=url, region=reg, force_path=force))

View File

@@ -0,0 +1,129 @@
import os
import yaml
import datetime
import pandas as pd
from mrds.utils.secrets import get_secret
import mrds.utils.manage_runs as runManager
import mrds.utils.manage_files as fileManager
import mrds.utils.sql_statements as sqls
import oci
from impala.dbapi import (
connect,
ProgrammingError,
DatabaseError,
IntegrityError,
OperationalError,
)
from impala.error import HiveServer2Error
def get_impala_connection(hostname: str, user: str, secret: str):
    """Open an Impala connection over HTTPS (HiveServer2 HTTP transport, PLAIN auth)."""
    connection_settings = dict(
        host=hostname,
        port=443,
        auth_mechanism="PLAIN",
        user=user,
        password=secret,
        use_http_transport=True,
        http_path="cliservice",
        use_ssl=True,
    )
    return connect(**connection_settings)
def execute_query(query: str, user, hostname, password):
    """Open an Impala connection and run *query* on it.

    Returns the (columns, result) pair from execute_devo_query.
    """
    connection = get_impala_connection(hostname, user, password)
    print(connection)
    return execute_devo_query(query, connection)
def execute_devo_query(query: str, conn):
    """Execute *query* on an open Impala connection.

    Returns (None, rowcount); the cursor and connection are always closed.

    Bug fixed: the original raised ``Exception(status_code=..., detail=...)``,
    but ``Exception`` accepts no keyword arguments, so every error path died
    with a ``TypeError`` instead of the intended message. The messages are
    preserved and the root cause is now chained with ``from``.
    """
    cursor = conn.cursor()
    print("executing query")
    try:
        cursor.execute(query)
        # rowcount is the number of rows affected by the statement.
        return None, cursor.rowcount
    except OperationalError as oe:
        raise Exception("Failed to connect to Impala: " + str(oe)) from oe
    except ProgrammingError as pe:
        raise Exception("Query syntax error: " + str(pe)) from pe
    except IntegrityError as ie:
        raise Exception("Insufficient permissions: " + str(ie)) from ie
    except DatabaseError as db_err:
        raise Exception("Database error: " + str(db_err)) from db_err
    except HiveServer2Error as au_err:
        raise Exception("HiveServer2Error error: " + str(au_err)) from au_err
    except Exception as e:
        raise Exception("An unexpected error occurred: " + str(e)) from e
    finally:
        # Best-effort cleanup; a close failure is surfaced explicitly.
        try:
            if cursor:
                cursor.close()
            if conn:
                conn.close()
        except Exception as e:
            raise Exception("Failed to close the connection: " + str(e)) from e
def initialize_task(workflow_context, task_name):
    """Register a task run for this workflow and return its history key."""
    return runManager.init_task(
        task_name,
        workflow_context["run_id"],
        workflow_context["a_workflow_history_key"],
    )
def initialize_config(config_file_path):
    """Load and return a YAML configuration file.

    Raises FileNotFoundError when the file does not exist.
    """
    if not os.path.exists(config_file_path):
        raise FileNotFoundError(f"Configuration file {config_file_path} not found.")
    with open(config_file_path, "r") as f:
        return yaml.safe_load(f)
def main(env_config_path, env, table, corporate_store):
    """Refresh Impala metadata and statistics for one replicated table.

    Loads per-environment settings, fetches the DEVO password from the
    secret store, then runs INVALIDATE METADATA and COMPUTE STATS.
    Returns True on success.

    Bug fixed: both error paths used a bare ``except:`` followed by
    ``raise(Exception)``, which swallowed the root cause entirely. The
    raised type stays ``Exception`` for caller compatibility, but the
    original error is now chained with ``from``.
    """
    # Init setup: load the environment section from the config file.
    envs_info = initialize_config(env_config_path)
    environment_info = envs_info[env]
    try:
        devo_secret_name = environment_info["DEVO_SECRET"]
        password = get_secret(devo_secret_name)
    except Exception as exc:
        print("Failed to retrieve credentials from secrets")
        raise Exception("Failed to retrieve credentials from secrets") from exc
    # Refresh metadata, then recompute table statistics on the DEVO side.
    try:
        execute_query(f"INVALIDATE METADATA {corporate_store}.{table}", environment_info['DEVO_USERNAME'], environment_info['IMPALA_HOSTNAME'], password)
        execute_query(f"COMPUTE STATS {corporate_store}.{table}", environment_info['DEVO_USERNAME'], environment_info['IMPALA_HOSTNAME'], password)
    except Exception as exc:
        print("Failed to retrieve DEVO data, error during connection or request")
        raise Exception("Failed to refresh DEVO metadata/stats for "
                        f"{corporate_store}.{table}") from exc
    return True

View File

@@ -0,0 +1,128 @@
#!/usr/bin/env python3
import argparse, sys
from urllib.parse import urlparse
import boto3
from botocore.config import Config
from botocore.exceptions import ClientError, EndpointConnectionError, NoCredentialsError, ReadTimeoutError, ConnectTimeoutError
def parse_s3_uri(s3_uri: str):
    """Split 's3://bucket/key...' into (bucket, key_prefix).

    Raises ValueError for non-s3 schemes or a missing bucket.
    """
    if not s3_uri.startswith("s3://"):
        raise ValueError("S3 URI must start with 's3://'")
    parsed = urlparse(s3_uri)
    bucket = parsed.netloc
    if not bucket:
        raise ValueError("Missing bucket in S3 URI")
    return bucket, parsed.path.lstrip("/")
def parse_location(location: str):
    """Accept s3://... OR https://...amazonaws.com/... and return (bucket, prefix)."""
    if location.startswith("s3://"):
        return parse_s3_uri(location)
    if location.startswith(("http://", "https://")):
        parsed = urlparse(location)
        host = parsed.netloc
        key_path = parsed.path.lstrip("/")
        if ".bucket." in host:
            # Bucket-scoped VPC endpoint: <bucket>.bucket.vpce-xxxx.s3.<region>...
            return host.split(".bucket.", 1)[0], key_path
        if ".s3." in host and not host.startswith("s3."):
            # Virtual-hosted style: <bucket>.s3.<region>...
            return host.split(".s3.", 1)[0], key_path
        if host.startswith("s3."):
            # Path-style: s3.<region>.../<bucket>/<prefix>
            bucket, _, key_prefix = key_path.partition("/")
            return bucket, key_prefix
    raise ValueError(f"Unsupported location: {location}")
def iter_keys(s3, bucket: str, prefix: str, page_size: int, max_items: int, verbose: bool):
    """Yield object keys under bucket/prefix via paginated list_objects_v2.

    max_items == 0 means no limit; progress goes to stderr when verbose.
    Fix: removed a stray debug ``print('here')`` that wrote to stdout, which
    is where the caller prints the actual key listing.
    """
    paginator = s3.get_paginator("list_objects_v2")
    kwargs = {"Bucket": bucket, "Prefix": prefix}
    pagination = {"PageSize": page_size}
    if max_items > 0:
        pagination["MaxItems"] = max_items
    total = 0
    page_num = 0
    for page in paginator.paginate(**kwargs, PaginationConfig=pagination):
        page_num += 1
        contents = page.get("Contents", []) or []
        if verbose:
            print(f"[page {page_num}] fetched {len(contents)} keys (running total={total + len(contents)})",
                  file=sys.stderr, flush=True)
        for obj in contents:
            yield obj["Key"]
            total += 1
def main():
    """CLI entry point: parse args, build an S3 client, and print keys to stdout.

    Exit codes: 0 on success, 1 on AWS/credential/network errors, 130 on Ctrl-C.
    """
    ap = argparse.ArgumentParser(description="List files under an S3 location quickly and safely.")
    ap.add_argument("location", help="s3://bucket/prefix/ OR https://<vpc-endpoint-host>/<prefix>")
    ap.add_argument("--region", default=None, help="AWS region (e.g., eu-central-1)")
    ap.add_argument("--profile", default=None, help="AWS profile to use")
    ap.add_argument("--endpoint-url", default=None,
                    help="Custom S3 endpoint (e.g., https://s3.eu-central-1.vpce.amazonaws.com)")
    ap.add_argument("--force-path-addressing", action="store_true",
                    help="Force path-style addressing (useful with bucket-scoped VPCe hostnames)")
    ap.add_argument("--page-size", type=int, default=1000, help="S3 page size (default 1000)")
    ap.add_argument("--max-items", type=int, default=0, help="Stop after N keys (0 = no limit)")
    ap.add_argument("--connect-timeout", type=float, default=10.0, help="Seconds (default 10)")
    ap.add_argument("--read-timeout", type=float, default=30.0, help="Seconds (default 30)")
    ap.add_argument("--retries", type=int, default=3, help="Max retry attempts (default 3)")
    ap.add_argument("--relative", action="store_true", help="Print keys relative to the prefix")
    ap.add_argument("--verbose", "-v", action="store_true", help="Print progress to stderr")
    args = ap.parse_args()
    bucket, prefix = parse_location(args.location)
    # Session & client with explicit timeouts and optional path addressing
    sess_kwargs = {}
    if args.profile:
        sess_kwargs["profile_name"] = args.profile
    session = boto3.Session(**sess_kwargs)
    cfg = Config(
        connect_timeout=args.connect_timeout,
        read_timeout=args.read_timeout,
        retries={"max_attempts": args.retries, "mode": "standard"},
        s3={"addressing_style": "path" if args.force_path_addressing else "auto"},
    )
    s3 = session.client("s3", region_name=args.region, endpoint_url=args.endpoint_url, config=cfg)
    # Quick preflight: try a 0-key list to surface auth/endpoint issues fast
    try:
        _ = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=0)
    except ClientError as e:
        print(f"Preflight failed (auth/permissions/endpoint): {e}", file=sys.stderr)
        sys.exit(1)
    except (EndpointConnectionError, ReadTimeoutError, ConnectTimeoutError) as e:
        print(f"Network/endpoint error: {e}", file=sys.stderr)
        sys.exit(1)
    # Main listing loop: keys go to stdout, diagnostics to stderr.
    try:
        for key in iter_keys(s3, bucket, prefix, args.page_size, args.max_items, args.verbose):
            # --relative strips the prefix; otherwise print the full s3:// URI.
            if args.relative and prefix and key.startswith(prefix):
                print(key[len(prefix):].lstrip("/"))
            else:
                print(f"s3://{bucket}/{key}")
    except KeyboardInterrupt:
        print("\nInterrupted.", file=sys.stderr)
        sys.exit(130)
    except NoCredentialsError:
        print("No AWS credentials found. Set env vars or use --profile.", file=sys.stderr)
        sys.exit(1)
    except (EndpointConnectionError, ReadTimeoutError, ConnectTimeoutError) as e:
        print(f"Network/timeout listing objects: {e}", file=sys.stderr)
        sys.exit(1)
    except ClientError as e:
        print(f"AWS error: {e}", file=sys.stderr)
        sys.exit(1)
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,145 @@
class Options:
    """Flat key/value holder for pipeline flow options.

    The supported option names are fixed in ``_OPTION_KEYS``; each one is
    also readable as an instance attribute (e.g. ``opts.corporate_store``)
    via ``__getattr__``, which replaces the 26 hand-written, byte-identical
    ``@property`` definitions of the original class. Unknown keys in the
    input mapping are silently ignored; ``get_option_value`` returns ``""``
    for unknown keys (options that were never supplied stay ``None``).
    """

    # Canonical set of supported option names; every instance starts with
    # each of these mapped to None until initialize_options fills it in.
    _OPTION_KEYS = (
        "corporate_store",
        "service_name",
        "source_schema",
        "source_table",
        "access_type",
        "oracle_metadata_table",
        "oracle_igam_table",
        "query_metadata_access_type1",
        "query_metadata_access_type2a",
        "query_igam_roles",
        "ora_jdbc_url_dwh",
        "ora_jdbc_url_ods",
        "sql_file_path",
        "sql_filename_grants",
        "sentry_role_environment",
        "ranger_script",
        "type3_access_table",
        "type3_access_table_key_column",
        "type3_source_table_key_column",
        "target_s3_bucket",
        "ranger_s3_bucket",
        "ranger_s3_path",
        "rar_full_access_entitlement_list",
        "target_table",
        "tech_meta_data_fields",
        "full_access_entitlement_list",
    )

    def __init__(self, args):
        """Build the option dict and fill it from the ``args`` mapping."""
        self.options = {key: None for key in self._OPTION_KEYS}
        # Initialize options from arguments
        self.initialize_options(args)

    def initialize_options(self, args):
        """Copy recognised key/value pairs from the ``args`` mapping."""
        for key, value in args.items():
            if key in self.options:
                self.options[key] = value

    def get_option_value(self, key):
        """Return the stored value for ``key``, or ``""`` when it is unknown."""
        return self.options.get(key, "")

    def __getattr__(self, name):
        # Invoked only when normal attribute lookup fails: expose every key
        # in self.options as a read-only attribute, mirroring the original
        # per-key properties. Guarded via __dict__ to avoid recursion if
        # called before __init__ has set self.options.
        options = self.__dict__.get("options")
        if options is not None and name in options:
            return options[name]
        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")

View File

@@ -0,0 +1,73 @@
from impala.dbapi import (
connect,
ProgrammingError,
DatabaseError,
IntegrityError,
OperationalError,
)
from impala.error import HiveServer2Error
def get_DEVO_connection(hostname: str, user: str, secret: str):
    """Open an Impala connection over HTTPS (HiveServer2-over-HTTP, PLAIN auth)."""
    connection_settings = {
        "host": hostname,
        "port": 443,
        "auth_mechanism": "PLAIN",
        "user": user,
        "password": secret,
        "use_http_transport": True,
        "http_path": "cliservice",
        "use_ssl": True,
    }
    return connect(**connection_settings)
def execute_devo_query(query: str, conn):
    """Run ``query`` on an open Impala connection.

    Returns (columns, rows) for SELECT statements, or (None, rowcount) for
    any other statement. The cursor and the connection are always closed in
    the ``finally`` block, so the caller must not reuse ``conn`` afterwards.

    Raises Exception with a descriptive message for each driver error class.
    """
    cursor = None
    try:
        cursor = conn.cursor()
        cursor.execute(query)
        # SELECTs return data; everything else only reports affected rows.
        if query.strip().lower().startswith("select"):
            rows = cursor.fetchall()
            columns = [col[0] for col in cursor.description]
            return columns, rows
        else:
            # For non-SELECT queries (e.g., INSERT, UPDATE, DELETE), just return affected rows
            return None, cursor.rowcount  # rowcount returns the number of rows affected
    except OperationalError as oe:
        raise Exception("Failed to connect to DEVO: " + str(oe))
    except ProgrammingError as pe:
        raise Exception("Query syntax error: " + str(pe))
    except IntegrityError as ie:
        raise Exception("Insufficient permissions: " + str(ie))
    except DatabaseError as db_err:
        raise Exception("Database error: " + str(db_err))
    except HiveServer2Error as au_err:
        raise Exception("HiveServer2Error error: " + str(au_err))
    finally:
        try:
            if cursor:
                cursor.close()
            # BUG FIX: the original tested `if not conn:` and therefore never
            # closed a live connection (and would have crashed calling
            # close() on None), leaking the connection on every call.
            if conn is not None:
                conn.close()
        except Exception as e:
            # BUG FIX: Exception() takes no status_code/detail keyword
            # arguments (that raised TypeError); raise a plain message.
            raise Exception(f"Failed to close the cursor or impala connection: {str(e)}") from e
def execute_query(query: str, user: str, hostname: str, password):
    """Convenience wrapper: connect to DEVO, run one query, return (columns, rows)."""
    connection = get_DEVO_connection(hostname, user, password)
    return execute_devo_query(query, connection)
#sql="CREATE EXTERNAL TABLE IF NOT EXISTS crp_rar.testInternalTable ( iid STRING,RANDOM_DATE DATE, number int) ;"
#sql_drop="DROP TABLE IF EXISTS crp_rar.NH_PRICE"
#NOTE(review): credentials redacted — plaintext passwords must never be committed, even in comments.
#print( execute_query("SELECT 1","ap-informatica-ipcwt","t-impala.devo.escb.eu","<REDACTED>"))
#print( execute_query("SELECT 1","ap-devo_tst-mrds","t-impala.devo.escb.eu","<REDACTED>"))
#print( execute_query("SELECT 1","ap-devo_lab-mrds","impala-proxy-devo-lab21-impala01.dw-devo-lab21.om2y56.b0.cloudera.site","<REDACTED>"))

View File

@@ -0,0 +1,69 @@
# One-off exploratory script: connects to the lab Ranger admin, builds (but
# does not create) a sample Hive policy, then dumps every cm_hive policy to
# output.txt. Most policy-CRUD calls are left commented out from testing.
## Step 3: Let's create a policy
from apache_ranger.model.ranger_service import *
from apache_ranger.client.ranger_client import *
from apache_ranger.model.ranger_policy import *
from mrds.utils.secrets import get_secret
## Step 1: create a client to connect to Apache Ranger admin
# Hard-coded lab gateway; the admin password is pulled from an OCI vault secret.
ranger_url ="https://devo-lab21-dl-gateway.devo-lab.om2y56.b0.cloudera.site:443/devo-lab21-dl/cdp-proxy-api/ranger"
password= get_secret("ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya3tsglrzfgiyfisxchref774l5y4nrler2vn54lr3li7q")
ranger_auth = ('ap-devo_lab-mrds', password)
# For Kerberos authentication
#
# from requests_kerberos import HTTPKerberosAuth
#
# ranger_auth = HTTPKerberosAuth()
ranger = RangerClient(ranger_url, ranger_auth)
# NOTE(review): disables TLS certificate validation — acceptable for the lab
# only, never for production.
ranger.session.verify = False
# to disable SSL certificate validation (not recommended for production use!)
#
# ranger.session.verify = False
## Step 2: Let's create a service
policy = RangerPolicy()
policy.service = "cm_hive" # to be hardcoded
policy.name = 'cpo_crp_mopdb_sgroi_1' #corporatestore_table_accessType
policy.resources = { 'database': RangerPolicyResource({ 'values': ['crp_RQSD'] }),
                     'table': RangerPolicyResource({ 'values': ['ANNEX_1_1_ALL'] }),
                     'column': RangerPolicyResource({ 'values': ['*'] }) } #change with correct values
allowItem1 = RangerPolicyItem() #to try allowItem1.groups
allowItem1.groups = ["d_mopdb_mpec"]
#allowItem1.users = [] #to try for single users
allowItem1.accesses = [ RangerPolicyItemAccess({ 'type': 'create' }),
                        RangerPolicyItemAccess({ 'type': 'alter' }),
                        RangerPolicyItemAccess({ 'type': 'select' }),
                        RangerPolicyItemAccess({ 'type': 'drop' }) ]
"""denyItem1 = RangerPolicyItem()
denyItem1.users = [ 'admin' ] #does it make sense to deny and not allow?
denyItem1.accesses = [ RangerPolicyItemAccess({ 'type': 'drop' }) ]"""
policy.policyItems = [ allowItem1 ]
#policy.denyPolicyItems = [ denyItem1 ]
#policy2=ranger.get_policy_by_id(policyId=5086)
#print(ranger.get_policy(serviceName="cm_hive",policyName='crp_rar_testinternalTable_alcesso1'))
#print(ranger.find_policies({"service": "cm_hive", "resources": {"database": {"values": ["crp_rar"], "isExcludes": False , "isRecursive": False}, "column": {"values": ["*"], "isExcludes": False, "isRecursive": False}, "table": {"values": ["testInternalTable"], "isExcludes": False, "isRecursive": False}}}))
#print(ranger.delete_policy(serviceName="cm_hive",policyName="crp_rar_testinternalTable_alcesso1"))
#print(policy2)
#print('Creating policy: name=' + policy.name)
#created_policy = ranger.create_policy(policy)
#print(' created policy: name=' + created_policy.name + ', id=' + str(created_policy.id))
## Step 4: Delete policy and service created above
#print('Deleting policy: id=' + str(created_policy.id))
#ranger.delete_policy_by_id(created_policy.id)
# Dump every policy of the cm_hive service to a local file.
data=ranger.get_policies_in_service(serviceName="cm_hive")
with open("output.txt", "w") as file:
    for string in data:
        file.write(str(string))
# NOTE(review): redundant — the `with` block above already closed the file.
file.close()

View File

@@ -0,0 +1,250 @@
from typing import List, Optional
from apache_ranger.model.ranger_service import *
from apache_ranger.client.ranger_client import *
from apache_ranger.model.ranger_policy import *
import re
def add_table_permission_groups(corporate_store: str, target_table: str, access_type: str, source_table: str, igam_entitlement_list: List[str], columns_list: Optional[List[str]] = None, row_list: Optional[List[str]] = None):
    """Normalise the inputs for a Ranger table policy into a single dict.

    The "public" group is appended for the rar_sources_igam_sentry table;
    entitlements are lower-cased and empty entries dropped. Missing column
    and row selections default to ["*"]; rows are rendered as a quoted,
    comma-separated SQL list.
    """
    entitlements = list(igam_entitlement_list)
    if source_table.lower() == "rar_sources_igam_sentry":
        entitlements.append("public")
    selected_columns = ["*"] if columns_list is None else columns_list
    selected_rows = ["*"] if row_list is None else row_list
    filter_condition = ",".join(f"'{row}'" for row in selected_rows)
    roles = [entry.lower() for entry in entitlements if entry != ""]
    return {
        'corporate_store': corporate_store,
        'target_table': target_table,
        'access_type': access_type,
        'columns': selected_columns,
        'rows': filter_condition,
        'igam_roles': roles
    }
from typing import List, Optional
# --- helpers ---------------------------------------------------------------
def _policy_name_from_params(config, policy_id: Optional[str] = None) -> Optional[str]:
"""
Build the exact policy name used by your create functions.
Returns None for types where we need to match multiple (e.g., 2a without id).
"""
cs = config['corporate_store'].lower()
tbl = config['target_table'].lower()
at = config['access_type'].lower()
base = f"cpo_{cs}_{tbl}_{at}"
if at == "1":
# yaml_format_1
return base
elif at == "2a":
# yaml_format_2a -> requires policy_id to be exact
if policy_id:
return f"{base}_policy_{policy_id}"
# without policy_id, well delete all that start with this prefix
return None
elif at == "2b":
# yaml_format_2b
return f"{base}_row_level_policy"
elif at == "3":
# yaml_format_3 uses same name pattern as 2b in your script
return f"{base}_row_level_policy"
else:
raise ValueError(f"Invalid access type '{config['access_type']}'. Expected one of: 1, 2a, 2b, 3.")
def _ranger_client(env_config) -> RangerClient:
    """Build a RangerClient from env_config credentials (TLS checks disabled)."""
    client = RangerClient(env_config['RANGER_HOSTNAME'],
                          (env_config['DEVO_USERNAME'], env_config['DEVO_SECRET']))
    # NOTE: certificate validation is off, as in the rest of this module.
    client.session.verify = False
    return client
# --- main deletion API -----------------------------------------------------
def delete_policy(config,env_config, policy_id: Optional[str] = None) -> List[str]:
    """
    Delete Ranger policy/policies whose names start with the prefix derived
    from config['corporate_store'] and config['target_table'].

    All cm_hive policies are paged in and matched client-side against a
    fuzzy pattern covering the naming schemes of the yaml_format_* builders
    (access type, "_policy_<id>", "_row_level_policy", "full_access").
    policy_id is accepted for API symmetry but not used by the match here.
    Returns the list of deleted policy names; raises RuntimeError when
    nothing matched.
    """
    ranger = _ranger_client(env_config)
    service_name = "cm_hive"
    # Try build exact name
    deleted: List[str] = []
    # If we dont have an exact name (e.g. type 2a without policy_id),
    # delete *all* that match the expected prefix.
    cs = config['corporate_store'].lower()
    tbl = config['target_table'].lower()
    at = config['access_type'].lower()  # NOTE(review): computed but never used below
    prefix = f"cpo_{cs}_{tbl}_"
    print(prefix)  # DEBUG: left-over debug output of the computed prefix
    # Fetch all policies for the table and filter client-side to reduce calls.
    start = 0
    candidates = []
    page_size=1000
    service_name="cm_hive"
    while True:
        # Page through the whole service; a short page signals the last one.
        params = {"pageSize": page_size, "startIndex": start}
        page = ranger.get_policies_in_service(service_name, params=params) or []
        candidates.extend(page)
        if len(page) < page_size:
            break
        start += len(page)
    for p in candidates:
        name = p["name"]
        print(f"analizing policy:{name}")
        # Fuzzy suffix match after the prefix: optional access-type token,
        # optional "_policy_<digits>", "_row_level_policy" or "full_access".
        if re.fullmatch(f"{prefix}([0-9]?[a-z]?)(_policy_)?([0-9]*)?(_row_level_policy)?(full_access)?$",name) != None:
            try:
                ranger.delete_policy_by_id(p["id"])
                deleted.append(name)
            except Exception:
                # continue attempting others
                pass
    if not deleted:
        raise RuntimeError(
            f"No matching policies found for deletion with prefix '{prefix}'. "
        )
    return deleted
def generate_policy(params,env_config, policy_id: Optional[str] = None):
    """Dispatch to the yaml_format_* builder matching params['access_type'].

    NOTE(review): access type "2b" currently dispatches to yaml_format_1
    (column policy), not yaml_format_2b — confirm whether intentional.
    NOTE(review): yaml_format_3 is called with params only, but its
    definition takes (params, env_config, filterString, full_access_list),
    so the "3" branch will raise TypeError — confirm the intended call.
    """
    access_type = params['access_type'].lower()
    if access_type == "1":
        return yaml_format_1(params,env_config)
    elif access_type == "2a":
        return yaml_format_2a(params, env_config, policy_id)
    elif access_type == "2b":
        return yaml_format_1(params,env_config)
    elif access_type == "3":
        return yaml_format_3(params)
    else:
        raise Exception(f"Invalid access type {params['access_type']}. Please check the input param")
def yaml_format_1(params,env_config) -> str:
    """Create a column-level select policy (access type 1) in Ranger and return it."""
    ranger = _ranger_client(env_config)
    database = params['corporate_store'].lower()
    policy = RangerPolicy()
    policy.service = "cm_hive"  # service name is fixed for this deployment
    # Naming scheme: cpo_<corporate_store>_<table>_<access_type>
    policy.name = f"cpo_{database}_{params['target_table'].lower()}_{params['access_type'].lower()}"
    policy.resources = {
        'database': RangerPolicyResource({'values': [database]}),
        'table': RangerPolicyResource({'values': [params['target_table']]}),
        'column': RangerPolicyResource({'values': params['columns']}),
    }
    allow_select = RangerPolicyItem()
    allow_select.groups = params['igam_roles']
    allow_select.accesses = [RangerPolicyItemAccess({'type': 'select'})]
    policy.policyItems = [allow_select]
    created_policy = ranger.create_policy(policy)
    print('Created policy: name=' + created_policy.name + ', id=' + str(created_policy.id))
    return policy
def yaml_format_2a(params, env_config, policy_id: Optional[str]) -> str:
    """Create a column-level select policy for access type 2a in Ranger.

    The policy name embeds policy_id (defaulting to "0") so that several
    2a policies can coexist for the same table.
    """
    effective_id = policy_id if policy_id is not None else "0"
    ranger = _ranger_client(env_config)
    database = params['corporate_store'].lower()
    policy = RangerPolicy()
    policy.service = "cm_hive"  # fixed Hive service name
    policy.name = f"cpo_{database}_{params['target_table'].lower()}_{params['access_type'].lower()}_policy_{effective_id}"
    policy.resources = {
        'database': RangerPolicyResource({'values': [database]}),
        'table': RangerPolicyResource({'values': [params['target_table']]}),
        'column': RangerPolicyResource({'values': params['columns']}),
    }
    allow_select = RangerPolicyItem()
    allow_select.groups = params['igam_roles']
    allow_select.accesses = [RangerPolicyItemAccess({'type': 'select'})]
    policy.policyItems = [allow_select]
    created_policy = ranger.create_policy(policy)
    print(' created policy: name=' + created_policy.name + ', id=' + str(created_policy.id))
    return policy
def yaml_format_2b(params,env_config, full_access_list: Optional[List]) -> str:
    """Create a row-level-filter policy (access type 2b) in Ranger.

    Two row-filter items are attached: entitled groups see only rows whose
    source maps to one of the current user's AD groups (via the
    t_ref_rar_sources_igam_sentry lookup), while groups in full_access_list
    get an always-true filter (all rows). Returns the submitted policy.
    """
    # For Kerberos authentication
    #
    # from requests_kerberos import HTTPKerberosAuth
    #
    # ranger_auth = HTTPKerberosAuth()
    ranger = _ranger_client(env_config)
    policy = RangerPolicy()
    policy.service = "cm_hive" #harcoded
    policy.name = f"cpo_{params['corporate_store'].lower()}_{params['target_table'].lower()}_{params['access_type'].lower()}_row_level_policy" #corporatestore_table_accessType
    policy.isEnabled = True
    policy.resources ={ 'database': RangerPolicyResource({ 'values': [params['corporate_store'].lower()] }),
                        'table': RangerPolicyResource({ 'values': [params['target_table']] })}
    rowFilterAllowItem1= RangerRowFilterPolicyItem()
    rowFilterAllowItem1.groups = params['igam_roles']
    rowFilterAllowItem1.accesses = [RangerPolicyItemAccess({ 'type': 'select' })]
    # Restrict rows to sources whose IGAM entitlement matches one of the
    # current user's AD groups.
    rowFilterAllowItem1.rowFilterInfo = RangerPolicyItemRowFilterInfo({ 'filterExpr': f"lower(source) IN (select lower(rar_subsource_id) from {params['corporate_store'].lower()}.t_ref_rar_sources_igam_sentry where lower(rar_igam_entitlement) IN (select ad_group from {params['corporate_store'].lower()}.active_directory_user_groups where username = lower(regexp_extract(current_user(),'[^@]*',0))))" })
    rowFilterAllowItem2= RangerRowFilterPolicyItem()
    rowFilterAllowItem2.groups = [x.lower() for x in full_access_list]
    rowFilterAllowItem2.accesses = [RangerPolicyItemAccess({ 'type': 'select' })]
    # Full-access groups get an always-true filter (no row restriction).
    rowFilterAllowItem2.rowFilterInfo = RangerPolicyItemRowFilterInfo({ 'filterExpr': f"1=1" })
    policy.rowFilterPolicyItems= [rowFilterAllowItem1, rowFilterAllowItem2]
    created_policy = ranger.create_policy(policy)
    print(' created policy: name=' + created_policy.name + ', id=' + str(created_policy.id))
    return policy
def yaml_format_3(params, env_config,filterString, full_access_list: Optional[List]) -> str:
    """Create a row-level-filter policy (access type 3) in Ranger.

    Like yaml_format_2b but with a caller-supplied filter expression
    (filterString) for the entitled groups; groups in full_access_list get
    an always-true filter. Returns the policy that was submitted.
    """
    ranger = _ranger_client(env_config)
    policy = RangerPolicy()
    policy.service = "cm_hive" # hardcoded
    # Same row-level naming scheme as yaml_format_2b.
    policy.name = (
        f"cpo_{params['corporate_store'].lower()}_"
        f"{params['target_table'].lower()}_"
        f"{params['access_type'].lower()}_row_level_policy"
    )
    policy.isEnabled = True
    policy.resources = {
        "database": RangerPolicyResource({"values": [params["corporate_store"].lower()]}),
        "table": RangerPolicyResource({"values": [params["target_table"]]}),
    }
    # Row filter item
    rowFilterAllowItem = RangerRowFilterPolicyItem()
    rowFilterAllowItem.groups = params["igam_roles"]
    rowFilterAllowItem.accesses = [RangerPolicyItemAccess({"type": "select"})]
    rowFilterAllowItem.rowFilterInfo = RangerPolicyItemRowFilterInfo(
        {
            "filterExpr": filterString
        }
    )
    # Full-access groups bypass the filter via an always-true expression.
    rowFilterAllowItem2= RangerRowFilterPolicyItem()
    rowFilterAllowItem2.groups = [x.lower() for x in full_access_list]
    rowFilterAllowItem2.accesses = [RangerPolicyItemAccess({ 'type': 'select' })]
    rowFilterAllowItem2.rowFilterInfo = RangerPolicyItemRowFilterInfo({ 'filterExpr': f"1=1" })
    policy.rowFilterPolicyItems = [rowFilterAllowItem,rowFilterAllowItem2]
    # Create policy in Ranger
    created_policy = ranger.create_policy(policy)
    print(f" created policy: name={created_policy.name}, id={created_policy.id}")
    return policy

View File

@@ -0,0 +1,345 @@
from typing import List, Optional
from apache_ranger.model.ranger_service import *
from apache_ranger.client.ranger_client import *
from apache_ranger.model.ranger_policy import *
import re
def add_table_permission_groups(corporate_store: str, target_table: str, access_type: str, source_table: str, igam_entitlement_list: List[str], columns_list: Optional[List[str]] = None, row_list: Optional[List[str]] = None):
    """Collect and normalise policy inputs into a single parameter dict.

    Appends "public" for the rar_sources_igam_sentry table, lower-cases the
    entitlements (dropping empties), defaults columns/rows to ["*"], and
    renders the row list as a quoted comma-separated SQL fragment.
    """
    if source_table.lower() == "rar_sources_igam_sentry":
        entitlements = igam_entitlement_list + ["public"]
    else:
        entitlements = igam_entitlement_list
    chosen_columns = columns_list if columns_list is not None else ["*"]
    chosen_rows = row_list if row_list is not None else ["*"]
    quoted_rows = ",".join("'" + row + "'" for row in chosen_rows)
    normalised_roles = [role.lower() for role in entitlements if role != ""]
    return {
        'corporate_store': corporate_store,
        'target_table': target_table,
        'access_type': access_type,
        'columns': chosen_columns,
        'rows': quoted_rows,
        'igam_roles': normalised_roles
    }
from typing import List, Optional
# --- helpers ---------------------------------------------------------------
def _policy_name_from_params(config, policy_id: Optional[str] = None) -> Optional[str]:
"""
Build the exact policy name used by your create functions.
Returns None for types where we need to match multiple (e.g., 2a without id).
"""
cs = config.corporate_store.lower()
tbl = config.target_table.lower()
at = config.access_type.lower()
base = f"cpo_{cs}_{tbl}_{at}"
if at == "1":
# yaml_format_1
return base
elif at == "2a":
# yaml_format_2a -> requires policy_id to be exact
if policy_id:
return f"{base}_policy_{policy_id}"
# without policy_id, well delete all that start with this prefix
return None
elif at == "2b":
# yaml_format_2b
return f"{base}_row_level_policy"
elif at == "3":
# yaml_format_3 uses same name pattern as 2b in your script
return f"{base}_row_level_policy"
else:
raise ValueError(f"Invalid access type '{config.access_type}'. Expected one of: 1, 2a, 2b, 3.")
def _ranger_client(env_config) -> RangerClient:
    """Construct a RangerClient from env_config (TLS verification disabled)."""
    hostname = env_config['RANGER_HOSTNAME']
    credentials = (env_config['DEVO_USERNAME'], env_config['DEVO_SECRET'])
    client = RangerClient(hostname, credentials)
    # NOTE: certificate validation is off, matching the rest of this module.
    client.session.verify = False
    return client
# --- main deletion API -----------------------------------------------------
def delete_policy(config,env_config, policy_id: Optional[str] = None) -> List[str]:
    """
    Delete Ranger policy/policies whose names start with the prefix derived
    from config.corporate_store and config.target_table.

    All cm_hive policies are paged in and matched client-side against a
    fuzzy pattern covering the naming schemes of the yaml_format_* builders
    (access type, "_policy_<id>", "_row_level_policy", "full_access").
    policy_id is accepted for API symmetry but not used by the match here.
    Returns the list of deleted policy names; raises RuntimeError when
    nothing matched.
    """
    ranger = _ranger_client(env_config)
    service_name = "cm_hive"
    # Try build exact name
    deleted: List[str] = []
    # If we dont have an exact name (e.g. type 2a without policy_id),
    # delete *all* that match the expected prefix.
    cs = config.corporate_store.lower()
    tbl = config.target_table.lower()
    at = config.access_type.lower()  # NOTE(review): computed but never used below
    prefix = f"cpo_{cs}_{tbl}_"
    # Fetch all policies for the table and filter client-side to reduce calls.
    start = 0
    candidates = []
    page_size=1000
    service_name="cm_hive"
    while True:
        # Page through the whole service; a short page signals the last one.
        params = {"pageSize": page_size, "startIndex": start}
        page = ranger.get_policies_in_service(service_name, params=params) or []
        candidates.extend(page)
        if len(page) < page_size:
            break
        start += len(page)
    for p in candidates:
        name = p["name"]
        print(f"analizing policy:{name}")
        # Fuzzy suffix match after the prefix: optional access-type token,
        # optional "_policy_<digits>", "_row_level_policy" or "full_access".
        if re.fullmatch(f"{prefix}([0-9]?[a-z]?)(_policy_)?([0-9]*)?(_row_level_policy)?(full_access)?$",name) != None:
            try:
                ranger.delete_policy_by_id(p["id"])
                deleted.append(name)
            except Exception:
                # continue attempting others
                pass
    if not deleted:
        raise RuntimeError(
            f"No matching policies found for deletion with prefix '{prefix}'. "
            f"Provide 'policy_id' to delete a specific 2a policy."
        )
    return deleted
def generate_policy(params,env_config, policy_id: Optional[str] = None):
    """Dispatch to the yaml_format_* builder matching params['access_type'].

    NOTE(review): access type "2b" currently dispatches to yaml_format_1
    (column policy), not yaml_format_2b — confirm whether intentional.
    NOTE(review): yaml_format_3 is called with params only, but its
    definition takes (params, env_config, filterString, full_access_list),
    so the "3" branch will raise TypeError — confirm the intended call.
    """
    access_type = params['access_type'].lower()
    if access_type == "1":
        return yaml_format_1(params,env_config)
    elif access_type == "2a":
        return yaml_format_2a(params, env_config, policy_id)
    elif access_type == "2b":
        return yaml_format_1(params,env_config)
    elif access_type == "3":
        return yaml_format_3(params)
    else:
        raise Exception(f"Invalid access type {params['access_type']}. Please check the input param")
def yaml_format_1(params,env_config) -> str:
    """Create a column-level select policy (access type 1) in Ranger.

    Returns the RangerPolicy that was submitted. Creation errors are
    reported but not raised, preserving the original best-effort behaviour.
    """
    ranger=_ranger_client(env_config)
    # For Kerberos authentication
    #
    # from requests_kerberos import HTTPKerberosAuth
    #
    # ranger_auth = HTTPKerberosAuth()
    policy = RangerPolicy()
    policy.service = "cm_hive" #harcoded
    policy.name = f"cpo_{params['corporate_store'].lower()}_{params['target_table'].lower()}_{params['access_type'].lower()}" #corporatestore_table_accessType
    policy.resources = { 'database': RangerPolicyResource({ 'values': [params['corporate_store'].lower()] }),
                         'table': RangerPolicyResource({ 'values': [params['target_table']] }),
                         'column': RangerPolicyResource({ 'values': params['columns'] }) }
    allowItem1 = RangerPolicyItem()
    allowItem1.groups = params['igam_roles']
    allowItem1.accesses = [RangerPolicyItemAccess({ 'type': 'select' })]
    policy.policyItems = [ allowItem1 ]
    print(policy)
    try:
        created_policy = ranger.create_policy(policy)
        print('Created policy: name=' + created_policy.name + ', id=' + str(created_policy.id))
    except Exception as exc:
        # BUG FIX: this was a bare `except: pass`, which silently swallowed
        # every failure (including KeyboardInterrupt/SystemExit). Keep the
        # best-effort semantics but catch only Exception and report it.
        print(f"Failed to create policy {policy.name}: {exc}")
    # Dead commented-out YAML template removed; return the built policy for
    # parity with the other yaml_format_* builders in this codebase.
    return policy
def yaml_format_2a(params, env_config,policy_id: Optional[str]) -> str:
    """Build (but do not create) a column policy for access type 2a.

    NOTE(review): the ranger.create_policy call is commented out here, so
    this variant only prints the assembled policy and returns None —
    confirm whether this dry-run behaviour is intentional.
    """
    policy_ID = policy_id if policy_id is not None else "0"
    # For Kerberos authentication
    #
    # from requests_kerberos import HTTPKerberosAuth
    #
    # ranger_auth = HTTPKerberosAuth()
    ranger = _ranger_client(env_config)
    policy = RangerPolicy()
    policy.service = "cm_hive" #harcoded
    policy.name = f"cpo_{params['corporate_store'].lower()}_{params['target_table'].lower()}_{params['access_type'].lower()}_policy_{policy_ID}" #corporatestore_table_accessType
    policy.resources = { 'database': RangerPolicyResource({ 'values': [params['corporate_store'].lower()] }),
                         'table': RangerPolicyResource({ 'values': [params['target_table']] }),
                         'column': RangerPolicyResource({ 'values': params['columns'] }) }
    allowItem1 = RangerPolicyItem()
    allowItem1.groups = params['igam_roles']
    allowItem1.accesses = [RangerPolicyItemAccess({ 'type': 'select' })]
    policy.policyItems = [ allowItem1 ]
    print(policy)
    print("\n\n")
    #created_policy = ranger.create_policy(policy)
    #print(' created policy: name=' + created_policy.name + ', id=' + str(created_policy.id))
    '''
    yaml_format = f"""- name: "{{{{ ecb_env }}}} : {{{{ cdp_env_name }}}}: allow CRP RAR users to select core tables"
      devo_ranger_client:
        name: "cpo_{params['corporate_store'].lower()}_{params['target_table'].lower()}_{params['access_type'].lower()}_policy_{policy_ID}"
        policy:
          service: cm_hive
          resources:
            database:
              values:
                - {params['corporate_store'].lower()}
            table:
              values:
                - {params['target_table']}
            column:
              values:
              {params['columns']}
          policyItems:
            - groups:
              {params['igam_roles'].lower()}
              accesses:
                - select
    """
    return yaml_format'
    '''
def yaml_format_2b(params,env_config, full_access_list: Optional[List]) -> str:
    """Create a row-level-filter policy (access type 2b) in Ranger.

    Entitled groups see only rows whose source maps to one of the current
    user's AD groups; groups in full_access_list get an always-true filter.
    NOTE(review): the function ends with an unreachable dead string and
    implicitly returns None (the file-5 sibling returns the policy).
    """
    # For Kerberos authentication
    #
    # from requests_kerberos import HTTPKerberosAuth
    #
    # ranger_auth = HTTPKerberosAuth()
    ranger = _ranger_client(env_config)
    policy = RangerPolicy()
    policy.service = "cm_hive" #harcoded
    policy.name = f"cpo_{params['corporate_store'].lower()}_{params['target_table'].lower()}_{params['access_type'].lower()}_row_level_policy" #corporatestore_table_accessType
    policy.isEnabled = True
    policy.resources ={ 'database': RangerPolicyResource({ 'values': [params['corporate_store'].lower()] }),
                        'table': RangerPolicyResource({ 'values': [params['target_table']] })}
    rowFilterAllowItem1= RangerRowFilterPolicyItem()
    rowFilterAllowItem1.groups = params['igam_roles']
    rowFilterAllowItem1.accesses = [RangerPolicyItemAccess({ 'type': 'select' })]
    # Restrict rows to sources whose IGAM entitlement matches one of the
    # current user's AD groups.
    rowFilterAllowItem1.rowFilterInfo = RangerPolicyItemRowFilterInfo({ 'filterExpr': f"lower(source) IN (select lower(rar_subsource_id) from {params['corporate_store'].lower()}.t_ref_rar_sources_igam_sentry where lower(rar_igam_entitlement) IN (select ad_group from {params['corporate_store'].lower()}.active_directory_user_groups where username = lower(regexp_extract(current_user(),'[^@]*',0))))" })
    rowFilterAllowItem2= RangerRowFilterPolicyItem()
    rowFilterAllowItem2.groups = [x.lower() for x in full_access_list]
    rowFilterAllowItem2.accesses = [RangerPolicyItemAccess({ 'type': 'select' })]
    # Full-access groups get an always-true filter (no row restriction).
    rowFilterAllowItem2.rowFilterInfo = RangerPolicyItemRowFilterInfo({ 'filterExpr': f"1=1" })
    policy.rowFilterPolicyItems= [rowFilterAllowItem1, rowFilterAllowItem2]
    print(policy)
    created_policy = ranger.create_policy(policy)
    print(' created policy: name=' + created_policy.name + ', id=' + str(created_policy.id))
    '''
    yaml_format = f"""- name: "{{{{ ecb_env }}}} : {{{{ cdp_env_name }}}}: filter by confidentiality level"
      devo_ranger_client:
        name: "cpo_{params['corporate_store'].lower()}_{params['target_table'].lower()}_{params['access_type'].lower()}_row_level_policy"
        policy:
          isEnabled: "true"
          service: cm_hive
          resources:
            database:
              values:
                - {params['corporate_store'].lower()}
            table:
              values:
                - {params['target_table']}
          rowFilterPolicyItems:
    """
    return yaml_format
    '''
def yaml_format_3(params, env_config,filterString, full_access_list: Optional[List]) -> str:
    """Create a row-level-filter policy (access type 3) in Ranger.

    Like yaml_format_2b but with a caller-supplied filter expression
    (filterString) for the entitled groups; full_access_list groups get an
    always-true filter. Returns the created policy object.
    """
    ranger = _ranger_client(env_config)
    policy = RangerPolicy()
    policy.service = "cm_hive" # hardcoded
    # Same row-level naming scheme as yaml_format_2b.
    policy.name = (
        f"cpo_{params['corporate_store'].lower()}_"
        f"{params['target_table'].lower()}_"
        f"{params['access_type'].lower()}_row_level_policy"
    )
    policy.isEnabled = True
    policy.resources = {
        "database": RangerPolicyResource({"values": [params["corporate_store"].lower()]}),
        "table": RangerPolicyResource({"values": [params["target_table"]]}),
    }
    # Row filter item
    rowFilterAllowItem = RangerRowFilterPolicyItem()
    rowFilterAllowItem.groups = params["igam_roles"]
    rowFilterAllowItem.accesses = [RangerPolicyItemAccess({"type": "select"})]
    rowFilterAllowItem.rowFilterInfo = RangerPolicyItemRowFilterInfo(
        {
            "filterExpr": filterString
        }
    )
    # Full-access groups bypass the filter via an always-true expression.
    rowFilterAllowItem2= RangerRowFilterPolicyItem()
    rowFilterAllowItem2.groups = [x.lower() for x in full_access_list]
    rowFilterAllowItem2.accesses = [RangerPolicyItemAccess({ 'type': 'select' })]
    rowFilterAllowItem2.rowFilterInfo = RangerPolicyItemRowFilterInfo({ 'filterExpr': f"1=1" })
    policy.rowFilterPolicyItems = [rowFilterAllowItem,rowFilterAllowItem2]
    print(policy)
    # Create policy in Ranger
    created_policy = ranger.create_policy(policy)
    print(f" created policy: name={created_policy.name}, id={created_policy.id}")
    return created_policy
    # Unreachable dead string below (kept verbatim).
    """
    yaml_format = f"- name: "{{{{ ecb_env }}}} : {{{{ cdp_env_name }}}}: filter by confidentiality level"
      devo_ranger_client:
        name: "cpo_{params['corporate_store'].lower()}_{params['target_table'].lower()}_{params['access_type'].lower()}_row_level_policy"
        policy:
          isEnabled: "true"
          service: cm_hive
          resources:
            database:
              values:
                - {params['corporate_store'].lower()}
            table:
              values:
                - {params['target_table']}
          rowFilterPolicyItems:
    return yaml_format
    """

View File

@@ -0,0 +1,793 @@
import pandasql as ps
import pandas as pd
import mrds.utils.manage_files as fileManager
import logging
import tableBuilderQueries as tbq
from devo_query import execute_query
import ranger_updater_old as ranger
import os
import yaml
import FlowOptions as fo
import numpy as np
from mrds.utils.secrets import get_secret
import traceback
from mrds.utils import oraconn
# Set up basic configuration for logging
logging.basicConfig(level=logging.INFO)
# Create a logger object
logger = logging.getLogger(__name__)
import re
#0 utilities
def initialize_config(config_file_path):
    """Load a YAML configuration file and return its parsed contents.

    Args:
        config_file_path: path to the YAML file.

    Returns:
        The deserialized YAML document (typically a dict).

    Raises:
        FileNotFoundError: if ``config_file_path`` does not exist.
    """
    if not os.path.exists(config_file_path):
        raise FileNotFoundError(f"Configuration file {config_file_path} not found.")
    with open(config_file_path, "r") as handle:
        return yaml.safe_load(handle)
def fix_impala_sql(sql: str) -> str:
    """Quote reserved-word column names in a CREATE TABLE statement with backticks.

    Scans only the column-definition section — the text between the first '('
    and the literal 'ROW FORMAT SERDE' — and backticks any column whose name is
    an Impala reserved keyword or contains non-identifier characters.

    Args:
        sql: a full CREATE [EXTERNAL] TABLE statement, expected to contain a
            parenthesized column list followed by 'ROW FORMAT SERDE'.

    Returns:
        The statement with offending column names backtick-quoted, and with
        escaped quotes (``\\'``) removed and double backslashes collapsed
        (see final replace calls).

    Raises:
        ValueError: when the '(' or 'ROW FORMAT SERDE' markers are missing.
    """
    # List of reserved keywords in Impala that need backticks if used as column names
    impala_reserved_keywords = {
        'date', 'value', 'source', 'comment', 'partition', 'row', 'select', 'insert',
        'table', 'external', 'format', 'location', 'stored', 'inputformat', 'outputformat',
        'scenario', 'string', 'int', 'decimal', 'timestamp', 'float', 'double','procedure', 'floor'
    }
    # Regex pattern to find column definitions: <name> <type[(args)]> [comment '...']
    pattern = re.compile(
        r'(?P<col>`?\w+`?)\s+(?P<type>[A-Za-z]+\s*(?:\([^)]+\))?)\s*(?P<comment>comment\s*\'[^\']*\'|)?',
        re.IGNORECASE
    )
    def replace(match):
        # Rebuild one "<col> <type> [comment]" fragment, quoting col if needed.
        col = match.group('col').strip('`')
        dtype = match.group('type')
        comment = match.group('comment') or ''
        # Add backticks only if column name is a reserved keyword or contains special chars
        if col.lower() in impala_reserved_keywords or not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', col):
            col = f'`{col}`'
        return f"{col} {dtype} {comment}".strip()
    # Only replace column list part between parentheses
    table_def_start = sql.find('(')
    table_def_end = sql.find('ROW FORMAT SERDE', table_def_start)
    if table_def_start == -1 or table_def_end == -1:
        raise ValueError("Invalid SQL format: Missing column definition parentheses.")
    before = sql[:table_def_start + 1]
    columns = sql[table_def_start + 1:table_def_end]
    after = sql[table_def_end:]
    # Replace all columns inside definition
    fixed_columns = pattern.sub(replace, columns)
    # Combine and return.
    final= before + fixed_columns + after
    # NOTE(review): this strips escaped single quotes produced upstream by
    # format_column_definition and un-doubles backslashes across the WHOLE
    # statement, not just comments — presumably intentional; confirm.
    final=final.replace("\\'", "").replace('\\\\', '\\')
    return final
def applyQueryParameters(query: str, parameters: str) -> str:
    """Fill $$$N placeholders in *query* with positional parameter values.

    Args:
        query: query text containing placeholders $$$1, $$$2, ...
        parameters: semicolon-separated parameter values; the i-th value
            replaces placeholder $$$i. Falsy input leaves the query untouched.

    Returns:
        The query with every placeholder substituted.
    """
    if not parameters:
        return query
    values = parameters.split(';')
    result = query
    # Substitute the highest-numbered placeholder first so that $$$1 never
    # clobbers the prefix of $$$10, $$$11, ...
    for position in range(len(values), 0, -1):
        result = result.replace(f"$$${position}", values[position - 1])
    return result
def format_column_definition(row):
    """Render one metadata row as a Hive/Impala column-definition fragment.

    Args:
        row: mapping with 'column_name', 'data_type_string' and
            'data_description' entries (e.g. a DataFrame row).

    Returns:
        "<name> <type>" when the description is null, otherwise
        "<name> <type> comment '<description>'" with single quotes escaped.
    """
    base = f"{row['column_name']} {row['data_type_string']}"
    description = row['data_description']
    if pd.isnull(description):
        return base
    # Escape single quotes so the description survives inside the SQL literal.
    escaped = str(description).replace("'", "\\'")
    return f"{base} comment '{escaped}'"
#1 receive table name and check for target table and access type
def execute_oracle_query(sql):
    """Run *sql* against the MRDS_LOADER_MOPDB Oracle connection.

    Args:
        sql: the statement to execute.

    Returns:
        pandas.DataFrame with the full result set; column names are the
        lower-cased names from the cursor description.

    Fix: the cursor and connection are now closed in ``finally`` blocks, so a
    failing query no longer leaks them (the original closed only on success).
    """
    oracle_conn = oraconn.connect('MRDS_LOADER_MOPDB')
    try:
        cursor = oracle_conn.cursor()
        try:
            options = cursor.execute(sql).fetchall()
            # commit is a no-op for SELECTs but kept for callers passing DML
            oracle_conn.commit()
            df = pd.DataFrame(options, columns=[row[0].lower() for row in cursor.description])
        finally:
            cursor.close()
    finally:
        oracle_conn.close()
    return df
def get_target_table(oracle_mgmt_table,source_schema,source_table, env):
    """Look up the replica table alias for (source_schema, source_table).

    Args:
        oracle_mgmt_table: fully-qualified management table name.
        source_schema: Oracle owner of the source table.
        source_table: source table name.
        env: unused here; kept for signature parity with siblings.

    Returns:
        DataFrame with a single 'table_alias' column of distinct aliases.
    """
    lookup_sql = (
        f"SELECT DISTINCT TABLE_ALIAS FROM {oracle_mgmt_table} "
        f"WHERE OWNER = '{source_schema}' AND TABLE_NAME = '{source_table}'"
    )
    return execute_oracle_query(lookup_sql)
def get_type_ofAccess(oracle_metadata_table,source_schema,source_table,env):
    """Return the currently-valid RAR3 access type(s) for a source table.

    Args:
        oracle_metadata_table: fully-qualified metadata inventory table.
        source_schema: Oracle owner of the source table.
        source_table: source table name.
        env: unused here; kept for signature parity with siblings.

    Returns:
        DataFrame with a single 'rar3_type_of_access' column.

    Fix: the original concatenated "...'{source_schema}'AND TABLE_NAME..."
    with no space between the closing quote and AND; a space is added so the
    statement does not rely on the parser tolerating fused tokens.
    """
    sql = (
        f"SELECT DISTINCT RAR3_TYPE_OF_ACCESS FROM {oracle_metadata_table} "
        f"WHERE A_VALID_TO > SYSDATE AND OWNER = '{source_schema}' "
        f"AND TABLE_NAME = '{source_table}'"
    )
    return execute_oracle_query(sql)
#2 load metadata
def readIGAMRoles(config, env):
    """Fetch IGAM role/entitlement rows for the configured sentry environment.

    Args:
        config: options object exposing sentry_role_environment,
            oracle_igam_table and service_name.
        env: unused here; kept for signature parity with siblings.

    Returns:
        DataFrame of IGAM roles returned by the parameterized query.
    """
    base_query = tbq.get_query_igam_roles(config.oracle_igam_table, config.service_name)
    logger.info("Querying the IGAM Table")
    # The query template expects a single quoted parameter ($$$1).
    quoted_env = "'" + config.sentry_role_environment + "'"
    final_query = applyQueryParameters(base_query, quoted_env)
    logger.info("Replaced params to IGAM Table:")
    return execute_oracle_query(final_query)
def loadMetadataTable(config, env):
    """Build the comma-separated Hive column-definition list for the source table.

    Queries the Oracle metadata inventory for the configured
    (source_schema, source_table), formats each row through
    format_column_definition, and joins the fragments with ','.

    Args:
        config: options object exposing oracle_metadata_table, source_schema
            and source_table.
        env: unused here; kept for signature parity with siblings.

    Returns:
        A single string of column definitions ready for a CREATE TABLE body.

    Fix: removed the stray chained assignment ('jdbcMetaDataDF = df = ...')
    that created an unused local.
    """
    metadataQuery = tbq.get_query_metadata(config.oracle_metadata_table, config.source_schema, config.source_table)
    logger.info("Map Oracle metadata (data types) to Hive query: ")
    jdbcMetaDataDF = execute_oracle_query(metadataQuery)
    logger.info("Fetch all fields for table and concatenate them separated by ','")
    tableDataList = jdbcMetaDataDF.apply(format_column_definition, axis=1).tolist()
    return ",".join(tableDataList)
#3 drop table and policies
def deleteExternalTable(config,env_config):
    """Best-effort teardown: delete Ranger policies, then drop the target table.

    Policy deletion stays best-effort (the policy may not exist on a first
    run), but the swallowed exception is now logged instead of silently
    discarded — the original `except Exception as e: pass` hid real failures.

    Args:
        config: options object exposing corporate_store and target_table.
        env_config: environment dict with DEVO_USERNAME / IMPALA_HOSTNAME /
            DEVO_SECRET entries.
    """
    try:
        ranger.delete_policy(config,env_config)
    except Exception:
        logger.warning("Ranger delete_policy failed (continuing):\n%s", traceback.format_exc())
    sql_drop = f"DROP TABLE IF EXISTS {config.corporate_store}.{config.target_table}"
    execute_query(
        sql_drop,
        env_config['DEVO_USERNAME'], env_config['IMPALA_HOSTNAME'], env_config['DEVO_SECRET'],
    )
#4 create external table and policies
def createExternalTables( config, tableFields,env_config ):
    """Create the parquet-backed external table for the replicated data.

    Assembles a CREATE EXTERNAL TABLE DDL from the generated field list plus
    the configured technical metadata columns, runs it through
    fix_impala_sql to backtick reserved column names, and executes it on Hive.

    Args:
        config: options object with corporate_store, target_table,
            tech_meta_data_fields and target_s3_bucket.
        tableFields: comma-separated column definitions (from loadMetadataTable).
        env_config: environment dict with DEVO_USERNAME / HIVE_HOSTNAME /
            DEVO_SECRET entries.
    """
    ddl_parts = [
        f"CREATE EXTERNAL TABLE {config.corporate_store}.{config.target_table} ",
        f"({tableFields}, {config.tech_meta_data_fields}) ",
        "ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' ",
        "STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' ",
        "OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' ",
        f"LOCATION '{config.target_s3_bucket}/{config.target_table}' ",
        "TBLPROPERTIES (",
        "'external.table.purge'='true', ",
        "'parquet.compression'='snappy')",
    ]
    sql_create = fix_impala_sql("".join(ddl_parts))
    execute_query(sql_create, env_config['DEVO_USERNAME'], env_config['HIVE_HOSTNAME'], env_config['DEVO_SECRET'])
def createTableFromExternal( config, tableFields,env_config ):
    """Materialize <target_table> via CTAS from its _EXT counterpart.

    Args:
        config: options object with corporate_store and target_table.
        tableFields: unused here; kept for signature parity with
            createExternalTables.
        env_config: environment dict with DEVO_USERNAME / HIVE_HOSTNAME /
            DEVO_SECRET entries.
    """
    qualified = f"{config.corporate_store}.{config.target_table}"
    sql_create = f"CREATE EXTERNAL TABLE {qualified} AS SELECT * FROM {qualified}_EXT"
    execute_query(sql_create, env_config['DEVO_USERNAME'], env_config['HIVE_HOSTNAME'], env_config['DEVO_SECRET'])
def accessTypeMapper(config, env_config, igamRoleDF):
    """Dispatch to the policy builder matching config.access_type.

    Recognized (case-insensitive) access types: '1', '2a', '2b', '3'.
    Anything else is logged and ignored.
    """
    handlers = {
        '1': accessType_1,
        '2a': accessType_2A,
        '2b': accessType_2B,
        '3': accessType_3,
    }
    handler = handlers.get(config.access_type.lower())
    if handler is None:
        logger.info(f"Invalid access type {config.access_type}. Please check the input param")
    else:
        handler(config, env_config, igamRoleDF)
def accessType_1(config, env_config, igamRoleDF):
    """Grant table-level access for access type 1.

    Pulls type-1 metadata rows from Oracle, joins them against the IGAM role
    table (inner join on source/datasource, or a cross join when the source
    equals config.service_name), collects the matching IGAM entitlements,
    optionally appends the configured full-access entitlements, and emits a
    single table-level Ranger policy.

    NOTE(review): this function mutates the caller's igamRoleDF in place
    (the "datasource" normalization below) — confirm that is intended.
    """
    logger.info("Grant privileges for access type 1")
    logger.info("Fetch metadata from Oracle for access type 1")
    # ---- Construct query and fetch from Oracle ----
    queryParams = f"'{config.source_schema}.{config.source_table}'"
    queryMetadataAccessType1 = tbq.get_query_metadata_access_type1(config.oracle_metadata_table)
    queryWithParamsAccessType1 = applyQueryParameters(queryMetadataAccessType1, queryParams)
    logger.info("Metadata table query: " )
    # NOTE(review): the chained 'df=' creates an unused local overwritten below.
    jdbcMetaDataAccessType1DF = df=execute_oracle_query(queryWithParamsAccessType1)
    # ---- Normalize columns (string-strip; upper-case the source keys) ----
    df = jdbcMetaDataAccessType1DF.copy()
    df["rar3_type_of_access"] = df["rar3_type_of_access"].astype(str).str.strip()
    df["source"] = df["source"].astype(str).str.strip().str.upper()
    igamRoleDF["datasource"] = igamRoleDF["datasource"].astype(str).str.strip().str.upper()
    # ---- Branch A: source != service_name -> inner join on datasource ----
    left_a = (
        df.loc[
            (df["rar3_type_of_access"] == "1") & (df["source"] != config.service_name),
            ["table_name", "source"]
        ]
        .drop_duplicates()
    )
    branch_a = (
        left_a.merge(
            igamRoleDF,
            left_on="source",
            right_on="datasource",
            how="inner"
        )
        [["table_name", "source", "subsource_id", "igam_entitlement", "environment"]]
        .drop_duplicates()
    )
    # ---- Branch B: source == service_name (CROSS JOIN with igamRoleDF) ----
    left_b = (
        df.loc[
            (df["rar3_type_of_access"] == "1") & (df["source"] == config.service_name),
            ["table_name", "source"]
        ]
        .drop_duplicates()
    )
    if not left_b.empty:
        branch_b = (
            left_b.merge(igamRoleDF, how="cross")
            [["table_name", "source", "subsource_id", "igam_entitlement", "environment"]]
            .drop_duplicates()
        )
    else:
        branch_b = pd.DataFrame(columns=["table_name", "source", "subsource_id", "igam_entitlement", "environment"])
    # ---- UNION (distinct) of both branches ----
    typeOneDF = (
        pd.concat([branch_a, branch_b], ignore_index=True)
        .drop_duplicates()
        .reset_index(drop=True)
    )
    # ---- Collect IGAM entitlements (stripped strings, NaNs dropped) ----
    igam_entitlements = (
        typeOneDF["igam_entitlement"]
        .dropna()
        .astype(str)
        .str.strip()
        .tolist()
    )
    # Merge with optional full access list
    if config.full_access_entitlement_list is None:
        combined_entitlements = igam_entitlements
    else:
        full_access_list_clean = config.full_access_entitlement_list
        combined_entitlements = igam_entitlements + full_access_list_clean
    # Build the policy parameter dict for the Ranger helper
    params = ranger.add_table_permission_groups(
        config.corporate_store,
        config.target_table,
        config.access_type,
        config.source_table,
        combined_entitlements
    )
    # Generate the final table-level policy (no policy-id suffix)
    formattedYaml = ranger.generate_policy(params,env_config, None)
    logger.info(f"Final YAML format")
    return formattedYaml
def accessType_2A(config, env_config, igamRoleDF):
    """Grant column-level access for access type 2a.

    Joins type-2a metadata rows (table/column pairs) against the IGAM role
    table, groups entitlements by the set of columns they may see, and emits
    one column-restricted Ranger policy per distinct column list (numbered
    policy_1, policy_2, ...). If a full-access entitlement list is configured,
    a final '*'-column policy named 'full_access' is emitted for it.
    """
    logger.info("Grant privileges for access type 2a")
    logger.info("Fetch the metadata in Oracle for access type 2a")
    # ---- Construct query and fetch from Oracle ----
    queryParams = f"'{config.source_schema}.{config.source_table}'"
    queryMetadataAccessType2a = tbq.get_query_metadata_access_type2a(config.oracle_metadata_table)
    queryWithParamsAccessType2a = applyQueryParameters(queryMetadataAccessType2a, queryParams)
    logger.info(f"Meta data table query: {queryWithParamsAccessType2a} ")
    jdbcMetaDataAccessType2aDF = execute_oracle_query(queryWithParamsAccessType2a)
    # ---- Normalize columns (lower-case the access type, upper-case source) ----
    df = jdbcMetaDataAccessType2aDF.copy()
    df["rar3_type_of_access"] = df["rar3_type_of_access"].astype(str).str.strip().str.lower()
    df["source"] = df["source"].astype(str).str.strip().str.upper()
    print(df)
    roles = igamRoleDF.copy()
    # expected columns in igamRoleDF: subsource_id, igam_entitlement, environment (plus anything else kept)
    roles["subsource_id"] = roles["subsource_id"].astype(str).str.strip().str.upper()
    roles["igam_entitlement"] = roles["igam_entitlement"].astype(str).str.strip()
    # ---- Branch A: source != service_name -> INNER JOIN on source == subsource_id ----
    left_a = (
        df.loc[
            (df["rar3_type_of_access"] == "2a")
            & (df["source"] != config.service_name.upper()),
            ["table_name", "column_name", "source"]
        ]
    )
    branch_a = (
        left_a.merge(
            roles,
            left_on="source",
            right_on="subsource_id",
            how="inner"
        )
        .drop(columns=["subsource_id", "source"], errors="ignore")
        [["table_name", "column_name", "igam_entitlement", "environment"]]
    )
    # ---- Branch B: source == service_name -> CROSS JOIN with igamRoleDF ----
    left_b = (
        df.loc[
            (df["rar3_type_of_access"] == "2a")
            & (df["source"] == config.service_name.upper()),
            ["table_name", "column_name", "source"]
        ]
    )
    if not left_b.empty:
        try:
            branch_b = (
                left_b.merge(roles, how="cross")
                .drop(columns=["subsource_id", "source"], errors="ignore")
                [["table_name", "column_name", "igam_entitlement", "environment"]]
            )
        except TypeError:
            # pandas < 1.2 fallback: emulate cross join with a constant key
            left_b["_cj"] = 1
            roles["_cj"] = 1
            branch_b = (
                left_b.merge(roles, on="_cj")
                .drop(columns=["_cj", "subsource_id", "source"], errors="ignore")
                [["table_name", "column_name", "igam_entitlement", "environment"]]
            )
            # cleanup the helper column in case roles is reused later
            roles.drop(columns=["_cj"], inplace=True, errors="ignore")
    else:
        branch_b = pd.DataFrame(columns=["table_name", "column_name", "igam_entitlement", "environment"])
    # ---- UNION of both branches (no de-dup here, unlike type 1) ----
    one_df = (
        pd.concat([branch_a, branch_b], ignore_index=True)
        .reset_index(drop=True)
    )
    # ---- Group 1: (table_name, igam_entitlement) -> sorted, comma-joined column_list ----
    tmp = one_df.sort_values(["table_name", "igam_entitlement", "column_name"], kind="mergesort")
    new_df = (
        tmp.groupby(["table_name", "igam_entitlement"], as_index=False)["column_name"]
        .apply(lambda s: ",".join(s.dropna().astype(str).tolist()))
        .rename(columns={"column_name": "column_list"})
    )
    # Columns: table_name, igam_entitlement, column_list
    # ---- Group 2: (table_name, column_list) -> comma-joined igam_entitlement ----
    grouped = (
        new_df.groupby(["table_name", "column_list"], as_index=False)["igam_entitlement"]
        .apply(lambda s: ",".join(s.dropna().astype(str).tolist()))
    )
    # Columns: table_name, column_list, igam_entitlement
    # ---- ROW_NUMBER() OVER (ORDER BY column_list) -> policy_id ----
    grouped = grouped.sort_values(["column_list"], kind="mergesort")
    grouped["policy_id"] = np.arange(1, len(grouped) + 1).astype(int)
    # ---- Emit policies: one per (table_name, column_list) row ----
    for _, row in grouped.iterrows():
        entitlements_list = [e.strip() for e in str(row["igam_entitlement"]).split(",") if e.strip()]
        columns_list = [c.strip() for c in str(row["column_list"]).split(",") if c.strip()]
        policy_id = str(int(row["policy_id"]))
        params = ranger.add_table_permission_groups(
            config.corporate_store,
            config.target_table,
            config.access_type,  # "2a"
            config.source_table,
            entitlements_list,
            columns_list=columns_list
        )
        ranger.generate_policy(params, env_config, policy_id)
    # ---- Optional: append full-access policy if list provided on config ----
    if getattr(config, "full_access_entitlement_list", None):
        # Accept either a ready list or a comma-separated string.
        if isinstance(config.full_access_entitlement_list, list):
            full_access_list = config.full_access_entitlement_list
        else:
            full_access_list = [s.strip() for s in str(config.full_access_entitlement_list).split(",") if s.strip()]
        params_full = ranger.add_table_permission_groups(
            config.corporate_store,
            config.target_table,
            config.access_type,  # keep same access type per the existing pattern
            config.source_table,
            full_access_list
        )
        ranger.generate_policy(params_full, env_config, "full_access")
def accessType_2B(config, env_config,igamRoleDF):
    """Grant row-level access for access type 2b.

    Aggregates IGAM roles per (environment, entitlement), builds row-level
    policy parameters from the unique entitlements, and emits the 2b
    row-level YAML. A combined table-level policy is generated only when a
    full-access entitlement list is configured.

    NOTE(review): accessType2bValidList is computed but never used beyond
    mirroring the original log payload — confirm it can be removed.
    NOTE(review): ranger.yaml_format_2b is always passed
    config.full_access_entitlement_list, even when it is falsy — confirm the
    helper tolerates that.
    """
    logger.info(f"Grant privileges for access type {config.access_type}")
    logger.info("Fetch the metadata in Oracle for access type 2b")
    # --- Validate required columns ---
    required = {"environment", "igam_entitlement", "subsource_id"}
    missing = required - set(igamRoleDF.columns)
    if missing:
        raise KeyError(f"igamRoleDF missing required column(s): {sorted(missing)}")
    # --- Normalize to strings (robust against None/NaN); work on a copy ---
    igamRoleDF = igamRoleDF.copy()
    igamRoleDF["environment"] = igamRoleDF["environment"].astype(str).str.strip()
    igamRoleDF["igam_entitlement"] = igamRoleDF["igam_entitlement"].astype(str).str.strip()
    igamRoleDF["subsource_id"] = igamRoleDF["subsource_id"].astype(str).str.strip()
    # --- Aggregation: per (environment, igam_entitlement) collect unique subsource_id list ---
    # Keep a stable order by sorting (mergesort is stable); remove empties.
    agg_df = (
        igamRoleDF.loc[igamRoleDF["subsource_id"].ne(""), ["environment", "igam_entitlement", "subsource_id"]]
        .drop_duplicates()
        .sort_values(["environment", "igam_entitlement", "subsource_id"], kind="mergesort")
        .groupby(["environment", "igam_entitlement"], as_index=False)["subsource_id"]
        .agg(lambda s: ",".join(s.unique()))
        .rename(columns={"subsource_id": "subsource_id_list"})
    )
    # List of tuples (IGAM_ENTITLEMENT, subsource_id_list) — mirrors the log payload
    accessType2bValidList = list(zip(
        agg_df["igam_entitlement"].astype(str),
        agg_df["subsource_id_list"].astype(str)
    ))
    # --- Entitlements for policy generation (unique, non-empty) ---
    igam_entitlements = (
        igamRoleDF["igam_entitlement"]
        .dropna()
        .map(str)
        .str.strip()
        .loc[lambda s: s.ne("")]
        .drop_duplicates()
        .tolist()
    )
    # --- Row-level permission parameters ---
    params_row_level = ranger.add_table_permission_groups(
        config.corporate_store,
        config.target_table,
        config.access_type,
        config.source_table,
        igam_entitlements
    )
    # --- Table-level permissions, merging in full-access entitlements if provided ---
    if getattr(config, "full_access_entitlement_list", None):
        combined_entitlements = igam_entitlements + config.full_access_entitlement_list
    else:
        combined_entitlements = igam_entitlements
    # --- Emit policies: table-level only when full-access roles exist ---
    if getattr(config, "full_access_entitlement_list", None):
        params = ranger.add_table_permission_groups(
            config.corporate_store,
            config.target_table,
            config.access_type,
            config.source_table,
            combined_entitlements
        )
        ranger.generate_policy(params, env_config)
    ranger.yaml_format_2b(params_row_level,env_config, config.full_access_entitlement_list) # row-level policy
    logger.info("Final YAML format emitted for 2B.")
def accessType_3(config,env_config, igamRoleDF):
    """Grant portfolio-scoped (type 3) access: row filter + table-level policy.

    Python/pandas translation of the Scala accessType_3.

    Selects the row-filter SQL expression by source table (special cases for
    nh_portfoliotree, nh_position, nh_portfolio_access, nh_limit_access;
    otherwise a configurable standard lookup), restricts entitlements to
    subsource_id == 'TMS', then emits the row-level policy via
    ranger.yaml_format_3 and a table-level policy via ranger.yaml_format_1.

    Expects igamRoleDF to have at least: ['igam_entitlement', 'subsource_id'].

    NOTE(review): the locally-built ``rowFilter`` string below is never used —
    the raw filter expression is passed to yaml_format_3 instead; confirm the
    dead code can be deleted.
    """
    # --- 1) Filter entitlements where subsource_id = 'TMS' ---
    if not {"igam_entitlement", "subsource_id"}.issubset(igamRoleDF.columns):
        missing = {"igam_entitlement", "subsource_id"} - set(igamRoleDF.columns)
        raise KeyError(f"igamRoleDF missing required column(s): {sorted(missing)}")
    new_df = (
        igamRoleDF.loc[
            igamRoleDF["subsource_id"].astype(str).str.upper() == "TMS",
            ["igam_entitlement"]
        ].drop_duplicates()
    )
    accessType3ValidList = new_df["igam_entitlement"].astype(str).str.strip().tolist()
    # --- 2) Build params for row-level groups (type 3) ---
    params_row_level = ranger.add_table_permission_groups(
        config.corporate_store,
        config.target_table,
        config.access_type,
        config.source_table,
        accessType3ValidList
    )
    corp = str(config.corporate_store).lower()
    src_tbl = str(config.source_table).lower()
    # --- 3) Compose the filter expressions (match Scala strings) ---
    sqlCreateView3NonRestrString_Ptree = (
        "(parent_fk in ( "
        f"select portfolio_fk from {corp}.nh_portfolio_access "
        "where lower(user_id) LIKE concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%') "
        "AND to_date(a_valid_to) > current_timestamp() "
        ")) AND (child_fk in ( "
        f"select portfolio_fk from {corp}.nh_portfolio_access "
        "where lower(user_id) LIKE concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%') "
        "AND to_date(a_valid_to) > current_timestamp() "
        "))"
    )
    sqlCreateView3NonRestrString_Pos = (
        "position_key in ( "
        f"select position_key from {corp}.nh_portfolio_access a "
        f"inner join {corp}.nh_position b on ( "
        "(b.portfolio_fk = a.portfolio_fk and b.portfolio_fk is not NULL) or "
        "(b.portfolio_compare_fk = a.portfolio_fk and b.portfolio_compare_fk is not NULL) "
        ") "
        "where lower(user_id) LIKE concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%') "
        "AND to_date(a_valid_to) > current_timestamp() "
        ")"
    )
    sqlCreateView3PortAccess = "lower(user_id) LIKE concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%')"
    sqlCreateView3LimAccess = "lower(user_id) LIKE concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%')"
    # Standard case uses the configured key columns/table names
    key_col = getattr(config, "type3SourceTableKeyColumn", None)
    acc_col = getattr(config, "type3AccessTableKeyColumn", None)
    acc_table= getattr(config, "type3AccessTable", None)
    if not all([key_col, acc_col, acc_table]):
        # Only needed for the default branch; keep None if the config doesn't use the default
        key_col = key_col or "source_key_col"
        acc_col = acc_col or "access_key_col"
        acc_table = acc_table or "type3_access_table"
    sqlCreateView3NonRestrString_Stdrd = (
        f"{key_col} in (select {acc_col} from {corp}.{acc_table} "
        "where lower(user_id) LIKE concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%') "
        "AND to_date(a_valid_to) > current_timestamp())"
    )
    # --- 4) Choose the filter by source table (matches Scala match/case) ---
    if src_tbl == "nh_portfoliotree":
        sqlCreateViewType3Filter = sqlCreateView3NonRestrString_Ptree
    elif src_tbl == "nh_position":
        sqlCreateViewType3Filter = sqlCreateView3NonRestrString_Pos
    elif src_tbl == "nh_portfolio_access":
        sqlCreateViewType3Filter = sqlCreateView3PortAccess
    elif src_tbl == "nh_limit_access":
        sqlCreateViewType3Filter = sqlCreateView3LimAccess
    else:
        sqlCreateViewType3Filter = sqlCreateView3NonRestrString_Stdrd
    # --- 5) Row filter YAML block (uses groups from params_row_level) ---
    # Expecting params_row_level like {'igam_roles': '...'}; adjust key if the API differs.
    igam_roles_lower = str(params_row_level.get("igam_roles", "")).lower()
    rowFilter = (
        "- groups:\n"
        f"  {igam_roles_lower}\n"
        "  accesses:\n"
        "  - select\n"
        f"  filterExpr: \"{sqlCreateViewType3Filter}\"\n"
        "  "
    )
    # --- 6) Handle optional full access entitlements ---
    if config.full_access_entitlement_list:
        paramsFullAccess = ranger.add_table_permission_groups(
            config.corporate_store,
            config.target_table,
            config.access_type,
            config.source_table,
            config.full_access_entitlement_list
        )
        full_groups_lower = str(paramsFullAccess.get("igam_roles", "")).lower()
        """
        fullAccessFilter = (
        "- groups:\n"
        f"  {full_groups_lower}\n"
        "  accesses:\n"
        "  - select\n"
        "  filterExpr: \"1=1\"\n"
        "  "
        )
        """
        params_table_level = ranger.add_table_permission_groups(
            config.corporate_store,
            config.target_table,
            config.access_type,
            config.source_table,
            accessType3ValidList + config.full_access_entitlement_list
        )
    else:
        fullAccessFilter = ""
        params_table_level = ranger.add_table_permission_groups(
            config.corporate_store,
            config.target_table,
            config.access_type,
            config.source_table,
            accessType3ValidList
        )
    # --- 7) Render the policies (row-level then table-level) ---
    ranger.yaml_format_3(params_row_level,env_config,sqlCreateViewType3Filter,config.full_access_entitlement_list ) # base type 3 yaml
    ranger.yaml_format_1(params_table_level,env_config) # table-level yaml
#5 create extra policies for super-users
#6 refresh metadata
def run_process(env_file, env, service_name,source_schema,source_table,sentry_role_environment):
    """End-to-end replication flow for one Oracle source table.

    Steps: load environment config and secrets, resolve the target table
    alias and its access type from Oracle, (re)create the external/CTAS
    table in the corporate store, then emit the matching Ranger policies.

    Args:
        env_file: path to the YAML environment configuration.
        env: environment key inside the config (e.g. 'dev', 'tst').
        service_name: data service key (e.g. 'rar', 'rqsd', 'mopdb').
        source_schema: Oracle owner of the source table.
        source_table: source table name.
        sentry_role_environment: value matched against the IGAM role table.

    Raises:
        Exception: when secrets retrieval fails; the original Oracle lookup
        exceptions are re-raised after logging.
    """
    #1 receive table name and check for target table and access type
    env_dict=initialize_config(env_file)
    env_config=env_dict[env]
    # RQSD uses dedicated credentials; swap them in before resolving secrets.
    if service_name.lower()=='rqsd':
        env_config["DEVO_SECRET"]=env_config["DEVO_SECRET_RQSD"]
        env_config["DEVO_USERNAME"]=env_config["DEVO_USERNAME_RQSD"]
    try:
        devo_secret_name = env_config["DEVO_SECRET"]
        env_config["DEVO_SECRET"]= get_secret(devo_secret_name)
    # NOTE(review): bare except + raise(Exception) discards the original
    # error and raises the bare Exception class — consider `raise` instead.
    except:
        logger.error("Failed to retrieve credentials from secrets")
        raise(Exception)
    db_config=env_dict[service_name]
    try:
        target_table=get_target_table(db_config['oracle_mgmt_table'],source_schema,source_table,env)['table_alias'][0]
    except Exception as e:
        logger.error("Table not found in oracle management table")
        logger.error("Exception: %s", e)
        logger.error("Traceback:\n%s", traceback.format_exc())
        raise
    try:
        access_type=get_type_ofAccess(db_config['oracle_metadata_table'],source_schema,source_table,env)['rar3_type_of_access'][0].strip()
    except Exception as e:
        logger.error("Table not found in oracle metadata inventory")
        logger.error("Exception: %s", e)
        logger.error("Traceback:\n%s", traceback.format_exc())
        raise
    # Assemble the flat option set consumed by FlowOptions.
    args={
        'corporate_store':db_config['corporate_store'],
        'service_name': service_name,
        'source_schema':source_schema,
        'source_table':source_table,
        'oracle_metadata_table':db_config['oracle_metadata_table'],
        'oracle_igam_table':db_config['oracle_igam_table'],
        'oracle_mgmt_table': db_config['oracle_mgmt_table'],
        'target_table':target_table,
        'sentry_role_environment':sentry_role_environment,
        'target_s3_bucket': env_config["BUCKET_PREFIX"]+db_config['target_s3_bucket'] ,
        'tech_meta_data_fields': db_config['tech_meta_data_fields'],
        'full_access_entitlement_list':env_config[f"FULL_ACCESS_LIST_{service_name.upper()}"].split(','),
        'access_type': access_type
    }
    config=fo.Options(args)
    #2 load metadata
    tableFields=loadMetadataTable(config,env)
    igamRoles=readIGAMRoles(config,env)
    #3 drop table and policies
    deleteExternalTable(config,env_config)
    #4 create external table and policies
    # Target names ending in _EXT get a plain external table; others are CTAS'd
    # from their _EXT counterpart.
    if (config.target_table[-4:].upper() == '_EXT'):
        createExternalTables( config, tableFields,env_config )
    else:
        createTableFromExternal( config, tableFields,env_config)
    accessTypeMapper(config,env_config,igamRoles)
    #5 refresh metadata
    #execute_query(f"INVALIDATE METADATA {config.corporate_store}.{config.target_table}",env_config["DEVO_USERNAME"],env_config['IMPALA_HOSTNAME'],env_config['DEVO_SECRET'])
    #execute_query(f"COMPUTE STATS {config.corporate_store}.{config.target_table}",env_config["DEVO_USERNAME"],env_config['IMPALA_HOSTNAME'],env_config['DEVO_SECRET'])
#run_process("/home/dbt/Marco/mrds_elt/python/devo_replicator/env_config.yaml",'tst','mopdb','MPEC','T_MPEC','TEST/INTEGRATION')
#run_process("/home/dbt/Marco/mrds_elt/python/devo_replicator/config/env_config.yaml",'tst','rar','CORR_RAR','NH_ASSET','TEST/INTEGRATION')
#run_process("/home/dbt/Marco/mrds_elt/python/devo_replicator/config/env_config.yaml",'dev','rar','CORR_RAR','NH_LIMIT','TEST/INTEGRATION')
# NOTE(review): module-level side effect — this executes the whole replication
# flow on import; consider moving it under `if __name__ == "__main__":`.
run_process("/home/dbt/Marco/mrds_elt/python/devo_replicator/config/env_config.yaml",'dev','rar','CORR_RAR','NH_Asset_transactial_data'.upper(),'TEST/INTEGRATION')
"""
df=execute_oracle_query('select owner, table_name from CT_MRDS.A_DEVO_REPLICA_MGMT_RQSD')
listfail=[]
for index, row in df.iterrows():
try:
print("running table: ",row["table_name"])
run_process("/home/dbt/Marco/mrds_elt/python/devo_replicator/config/env_config.yaml",'tst','rqsd',row['owner'],row['table_name'].upper(),'TEST/INTEGRATION')
except:
print("failed")
listfail.append(row["table_name"])
print("succeded")
print(listfail)
"""
'''{"id": 48754, "guid": "d75f1491-538d-402a-a8ac-e7e21ac0be53", "isEnabled": true, "version": 1, "service": "cm_hive", "name": "cpo_crp_rar_mu_asset_code_map_2a_policy_1", "policyType": 0, "policyPriority": 0, "description": "created-ranger_client-v0.0.6-2025-10-17T14:28:24.135108", "isAuditEnabled": true, "resources": {"database": {"values": ["crp_rar"], "isExcludes": false, "isRecursive": false},
"column": {"values": ["ASSET_FK", "ASSET_FK", "A_DWH_LOAD_SET_FK", "A_DWH_LOAD_SET_FK", "A_VALID_FROM", "A_VALID_FROM", "A_VALID_TO", "A_VALID_TO", "CODE_TYPE_NO_ID", "CODE_TYPE_NO_ID", "CODE_VALUE", "CODE_VALUE", "INDEP_SUBPROCESS_FK", "INDEP_SUBPROCESS_FK", "TEC_EXECUTION_DATE", "TEC_EXECUTION_DATE", "TEC_INGESTION_DATE", "TEC_INGESTION_DATE", "TEC_RUN_ID", "TEC_RUN_ID"], "isExcludes": false, "isRecursive": false},
"table": {"values": ["MU_ASSET_CODE_MAP"], "isExcludes": false, "isRecursive": false}}, "policyItems": [{"accesses": [{"type": "select", "isAllowed": true}],
"groups": ["a_mopdb_ea", "disc-au-bda"], "delegateAdmin": false}], "serviceType": "hive", "isDenyAllElse": false}
{"id": 48755, "guid": "5ff857c2-3683-4178-98ce-5932c0677cd4", "isEnabled": true, "version": 1, "service": "cm_hive", "name": "cpo_crp_rar_mu_asset_code_map_2a_policy_2", "policyType": 0, "policyPriority": 0, "description": "created-ranger_client-v0.0.6-2025-10-17T14:28:24.135108", "isAuditEnabled": true, "resources": {"database": {"values": ["crp_rar"], "isExcludes": false, "isRecursive": false},
"column": {"values": ["ASSET_FK", "A_DWH_LOAD_SET_FK", "A_VALID_FROM", "A_VALID_TO", "CODE_TYPE_NO_ID", "CODE_VALUE", "INDEP_SUBPROCESS_FK", "TEC_EXECUTION_DATE", "TEC_INGESTION_DATE", "TEC_RUN_ID"], "isExcludes": false, "isRecursive": false},
"table": {"values": ["MU_ASSET_CODE_MAP"], "isExcludes": false, "isRecursive": false}}, "policyItems": [{"accesses": [{"type": "select", "isAllowed": true}], "
groups": ["su-omd-reuters-users", "a_mopdb_excess_liquidity", "a-mora-lba-exp-a", "a_rar_csdb_reference_data", "a_mopdb_uc", "a_rar_csdb_ratings_data", "a_mopdb_credit_operations", "a_rar_fxcd_data", "a_rar_mdp_bbg_data", "disc-ac-riad_cnf_n-r", "a-mora-lba-ana-a", "a_mopdb_tms_data", "disc-ac-riad_core-r", "a_mopdb_mpec", "a-led-ana-a", "a-led-exp-a", "a_mopdb_ela_all"], "delegateAdmin": false}], "serviceType": "hive", "isDenyAllElse": false}
{"id": 48756, "guid": "1071767f-8ef6-47be-bb9b-7077ed9e9a90", "isEnabled": true, "version": 1, "service": "cm_hive", "name": "cpo_crp_rar_mu_asset_code_map_2a_policy_full_access", "policyType": 0, "policyPriority": 0, "description": "created-ranger_client-v0.0.6-2025-10-17T14:28:24.135108", "isAuditEnabled": true, "resources": {"database": {"values": ["crp_rar"], "isExcludes": false, "isRecursive": false}, "column": {"values": ["*"], "isExcludes": false, "isRecursive": false}, "table": {"values": ["MU_ASSET_CODE_MAP"], "isExcludes": false, "isRecursive": false}},
"policyItems": [{"accesses": [{"type": "select", "isAllowed": true}], "groups": ["disc-ac-rar-r"], "delegateAdmin": false}], "serviceType": "hive", "isDenyAllElse": false}'''
'''
{"isEnabled": true, "isDenyAllElse": false, "service": "cm_hive", "name": "cpo_crp_rar_mu_asset_code_map_ext_2a_policy_1", "resources": {"database": {"values": ["crp_rar"], "isExcludes": false, "isRecursive": false}, "table": {"values": ["MU_ASSET_CODE_MAP_EXT"], "isExcludes": false, "isRecursive": false},
"column": {"values": ["ASSET_FK", "ASSET_FK", "A_DWH_LOAD_SET_FK", "A_DWH_LOAD_SET_FK", "A_VALID_FROM", "A_VALID_FROM", "A_VALID_TO", "A_VALID_TO", "CODE_TYPE_NO_ID", "CODE_TYPE_NO_ID", "CODE_VALUE", "CODE_VALUE", "INDEP_SUBPROCESS_FK", "INDEP_SUBPROCESS_FK", "TEC_EXECUTION_DATE", "TEC_EXECUTION_DATE", "TEC_INGESTION_DATE", "TEC_INGESTION_DATE", "TEC_RUN_ID", "TEC_RUN_ID"], "isExcludes": false, "isRecursive": false}}, "policyItems": [{"delegateAdmin": false,
"groups": ["disc-tu-bda", "t_mopdb_ea"], "accesses": [{"type": "select", "isAllowed": true}]}]}
{"isEnabled": true, "isDenyAllElse": false, "service": "cm_hive", "name": "cpo_crp_rar_mu_asset_code_map_ext_2a_policy_2", "resources": {"database": {"values": ["crp_rar"], "isExcludes": false, "isRecursive": false},
"table": {"values": ["MU_ASSET_CODE_MAP_EXT"], "isExcludes": false, "isRecursive": false},
"column": {"values": ["ASSET_FK", "A_DWH_LOAD_SET_FK", "A_VALID_FROM", "A_VALID_TO", "CODE_TYPE_NO_ID", "CODE_VALUE", "INDEP_SUBPROCESS_FK", "TEC_EXECUTION_DATE", "TEC_INGESTION_DATE", "TEC_RUN_ID"], "isExcludes": false, "isRecursive": false}}, "policyItems": [{"delegateAdmin": false,
"groups": ["a-led-ana-t", "a-led-exp-t", "a-mora-lba-ana-t", "a-mora-lba-exp-t", "disc-tc-riad_cnf_n-r", "disc-tc-riad_core-r", "su-omd-reuters-users", "t_mopdb_credit_operations", "t_mopdb_ela_all", "t_mopdb_excess_liquidity", "t_mopdb_mpec", "t_mopdb_tms_data", "t_mopdb_uc", "t_rar_csdb_ratings_data", "t_rar_csdb_reference_data", "t_rar_fxcd_data", "t_rar_mdp_bbg_data"],
"accesses": [{"type": "select", "isAllowed": true}]}]}
{"isEnabled": true, "isDenyAllElse": false, "service": "cm_hive", "name": "cpo_crp_rar_mu_asset_code_map_ext_2a_policy_full_access", "resources": {"database": {"values": ["crp_rar"], "isExcludes": false, "isRecursive": false},
"table": {"values": ["MU_ASSET_CODE_MAP_EXT"], "isExcludes": false, "isRecursive": false}, "column": {"values": ["*"], "isExcludes": false, "isRecursive": false}}, "policyItems": [{"delegateAdmin": false,
"groups": ["disc-dc-rar-r"], "accesses": [{"type": "select", "isAllowed": true}]}]}
'''

View File

@@ -0,0 +1,130 @@
# Default fully-qualified Oracle metadata inventory table used by the query
# builders below when no explicit table name is supplied.
metadata_table = "DW_RAR.NH_METADATA_INVENTORY"
def get_query_metadata(metadata_table, owner, table_name):
    """Build the Oracle SQL that maps one table's column metadata to Hive types.

    The query reads the metadata inventory, keeps only currently valid rows
    (``a_valid_to > sysdate``) for ``owner.table_name`` (case-insensitive) and
    derives per column:

    * ``data_precision_hive`` / ``data_scale_hive`` — precision/scale clamped to
      what a Hive DECIMAL can represent (NULL when out of range).
    * ``data_type_hive`` — 'Decimal' for representable NUMBER columns, 'String'
      for everything else (CHAR/VARCHAR2/TIMESTAMP/DATE and the fallback).
    * ``data_type_string`` — rendered type text, e.g. ``Decimal(10,2)``.
    * ``data_description`` — description with single quotes escaped so it can be
      embedded inside a quoted literal.

    Args:
        metadata_table: Fully qualified inventory table, e.g.
            ``DW_RAR.NH_METADATA_INVENTORY``.
        owner: Oracle schema owner of the described table.
        table_name: Name of the table whose columns are described.

    Returns:
        str: Complete SQL text. Values are interpolated via ``str.format`` with
        no quoting/escaping — callers must pass trusted identifiers only.
    """
    query_metadata = (
        "WITH metaDF AS ( "
        "SELECT owner, table_name, column_id, column_name, data_type, data_precision, data_scale, "
        # Precision usable by Hive DECIMAL (NULL when not representable).
        "CASE WHEN data_precision IS NULL AND data_scale IS NULL THEN NULL "
        "WHEN data_precision IS NOT NULL AND data_scale IS NULL THEN data_precision "
        "WHEN CAST(data_precision AS INT) >= CAST(data_scale AS INT) AND CAST(data_scale AS INT) >= 0 THEN data_precision "
        "WHEN CAST(data_precision AS INT) < CAST(data_scale AS INT) AND CAST(data_scale AS INT) <= 38 AND CAST(data_scale AS INT) > 0 THEN data_scale "
        "WHEN CAST(data_precision AS INT) < CAST(data_scale AS INT) AND (CAST(data_scale AS INT) > 38 OR CAST(data_scale AS INT) < 0) THEN NULL "
        "ELSE NULL END AS data_precision_hive, "
        # Scale usable by Hive DECIMAL (0 when only precision is given; NULL when out of range).
        "CASE WHEN data_precision IS NULL AND data_scale IS NULL THEN NULL "
        "WHEN data_precision IS NOT NULL AND data_scale IS NULL THEN CAST(0 AS INT) "
        "WHEN CAST(data_precision AS INT) >= CAST(data_scale AS INT) AND CAST(data_scale AS INT) >= 0 THEN data_scale "
        "WHEN CAST(data_precision AS INT) < CAST(data_scale AS INT) AND CAST(data_scale AS INT) <= 38 AND CAST(data_scale AS INT) > 0 THEN data_scale "
        "WHEN CAST(data_precision AS INT) < CAST(data_scale AS INT) AND (CAST(data_scale AS INT) > 38 OR CAST(data_scale AS INT) < 0) THEN NULL "
        "ELSE NULL END AS data_scale_hive, "
        # Hive logical type: representable NUMBERs become Decimal, all else String.
        "CASE WHEN data_type LIKE '%NUMBER%' AND data_precision IS NULL AND data_scale IS NULL THEN 'String' "
        "WHEN data_type LIKE '%NUMBER%' AND data_precision IS NOT NULL AND data_scale IS NULL THEN 'Decimal' "
        "WHEN data_type LIKE '%NUMBER%' AND CAST(data_precision AS INT) >= CAST(data_scale AS INT) AND CAST(data_scale AS INT) >= 0 THEN 'Decimal' "
        "WHEN data_type LIKE '%NUMBER%' AND CAST(data_precision AS INT) < CAST(data_scale AS INT) AND CAST(data_scale AS INT) <= 38 AND CAST(data_scale AS INT) > 0 THEN 'Decimal' "
        "WHEN data_type LIKE '%NUMBER%' AND CAST(data_precision AS INT) < CAST(data_scale AS INT) AND (CAST(data_scale AS INT) > 38 OR CAST(data_scale AS INT) < 0) THEN 'String' "
        "WHEN data_type LIKE '%CHAR%' THEN 'String' "
        "WHEN data_type LIKE '%VARCHAR2%' THEN 'String' "
        "WHEN data_type LIKE '%TIMESTAMP%' THEN 'String' "
        "WHEN data_type LIKE '%DATE%' THEN 'String' "
        "ELSE 'String' END AS data_type_hive, "
        # Escape single quotes so the description survives embedding in a quoted literal.
        "REGEXP_REPLACE(data_description, '''', '\\''') AS data_description "
        "FROM {0} "
        # Case-insensitive match on OWNER.TABLE_NAME; only currently valid rows.
        "WHERE lower(owner||'.'||table_name) = lower('{1}'||'.'||'{2}') "
        "AND a_valid_to > sysdate) "
        "SELECT owner, table_name, column_id, column_name, data_type, data_precision, data_scale, "
        "data_precision_hive, data_scale_hive, data_type_hive, "
        # Render e.g. Decimal(10,2); non-decimal types pass through unchanged.
        "CASE WHEN data_type_hive = 'Decimal' THEN 'Decimal(' || COALESCE(CAST(data_precision_hive AS VARCHAR2(30)), '') || ',' || COALESCE(CAST(data_scale_hive AS VARCHAR2(30)), '') || ')' "
        "ELSE data_type_hive END AS data_type_string, data_description "
        "FROM metaDF "
        "ORDER BY CAST(column_id AS INT) "
    ).format(metadata_table, owner, table_name)
    return query_metadata
def get_query_metadata_access_type1(metadata_table):
    """Return SQL selecting access-type-1 tables from the metadata inventory.

    Two UNION-ed branches over the currently valid rows
    (``a_valid_to > sysdate``, ``rar3_type_of_access = '1'``): tables from
    external sources (``list_of_sources NOT IN 'RAR'``) plus everything owned
    by ``CORR_REF_MAIN``. The literal ``$$$1`` token appears to be a
    placeholder substituted downstream with the owner.table_name value —
    confirm with the consumer of this query.

    NOTE(review): ``NOT IN 'RAR'`` — Oracle's IN condition normally takes a
    parenthesised list; confirm this parses as intended.
    """
    shared_filter = (
        "SELECT owner, table_name, list_of_sources as SOURCE, rar3_type_of_access "
        f"FROM {metadata_table} "
        "WHERE a_valid_to > sysdate "
        "AND rar3_type_of_access = '1' "
    )
    external_sources = (
        shared_filter
        + "AND list_of_sources NOT IN 'RAR' "
        + "AND lower(owner||'.'||table_name) = lower($$$1) "
    )
    corr_ref_main = (
        shared_filter
        + "AND owner = 'CORR_REF_MAIN' "
        + "AND lower(owner||'.'||table_name) = lower($$$1) "
    )
    return external_sources + "UNION " + corr_ref_main
def get_query_metadata_access_type2a(metadata_table):
    """Return SQL listing column-level source grants for access-type-2a tables.

    Structure of the generated query:

    * ``rar_columns`` — UNPIVOTs the per-source flag columns (tms, c2d_ea, ...)
      of the inventory into (owner, table_name, column_name, source) rows for
      the table matched via the literal ``$$$1`` placeholder (appears to be
      substituted downstream — confirm with the consumer). Tables whose
      ``list_of_sources`` contains a comma (multi-source) and a column literally
      named DATABASE are excluded.
    * ``dummy_entry`` — one synthetic (owner, table_name, 'RAR') row taken from
      the first unpivoted row.
    * ``disc_tec_fields`` — fans the dummy row out into the three technical
      columns TEC_INGESTION_DATE / TEC_EXECUTION_DATE / TEC_RUN_ID.
    * Final SELECT — UNION of the technical rows and the real column rows.

    Args:
        metadata_table: Fully qualified inventory table interpolated via
            ``str.format`` (trusted identifier only — no escaping is applied).

    Returns:
        str: The complete SQL text.
    """
    query_metadata_access_type2a = (
        "WITH rar_columns AS ( "
        "SELECT owner, table_name, column_name, source, rar3_type_of_access "
        "FROM ( "
        "SELECT owner, table_name, column_name, rar3_type_of_access, list_of_sources, "
        "tms, c2d_ea, c2d_ela, c2d_mpec, c2d_uc, ceph, lm, csdb_reference, "
        "csdb_ratings, fxcd, mdp_bbg, mdp_reu, riad_cl, riad_ou, sdw_estr, sdw_fx, "
        "top, rar, rtm, led, mdp_cma "
        "FROM {0} "
        "WHERE a_valid_to > sysdate "
        "AND rar3_type_of_access = '2a' "
        "AND lower(owner || '.' || table_name) = lower($$$1) "
        "AND list_of_sources NOT LIKE '%,%' "
        "AND upper(column_name) NOT IN ('DATABASE') "
        ") a "
        # Turn one row with N source-flag columns into up to N (column, source) rows.
        "UNPIVOT ( "
        "val FOR (source) IN ( "
        "tms AS 'TMS', c2d_ea AS 'C2D_EA', c2d_ela AS 'C2D_ELA', c2d_mpec AS 'C2D_MPEC', "
        "c2d_uc AS 'C2D_UC', ceph AS 'CEPH', lm AS 'LM', csdb_reference AS 'CSDB_REFERENCE', "
        "csdb_ratings AS 'CSDB_RATINGS', fxcd AS 'FXCD', mdp_bbg AS 'MDP_BBG', mdp_reu AS 'MDP_REU', "
        "riad_cl AS 'RIAD_CL', riad_ou AS 'RIAD_OU', sdw_estr AS 'SDW_ESTR', sdw_fx AS 'SDW_FX', "
        "top AS 'TOP', rar AS 'RAR', rtm AS 'RTM', led AS 'LED', mdp_cma AS 'MDP_CMA') "
        ") "
        "ORDER BY owner, table_name, column_name "
        "), "
        # Single anchor row used to synthesize the technical metadata columns.
        "dummy_entry AS ( "
        "SELECT owner, table_name, 'RAR' as SOURCE, rar3_type_of_access "
        "FROM rar_columns "
        "FETCH FIRST ROW ONLY "
        "), "
        "disc_tec_fields AS ( "
        "SELECT owner, table_name, 'TEC_INGESTION_DATE' AS column_name, source, rar3_type_of_access "
        "FROM dummy_entry "
        "UNION "
        "( "
        "SELECT owner, table_name, 'TEC_EXECUTION_DATE' AS column_name, source, rar3_type_of_access "
        "FROM dummy_entry "
        ") "
        "UNION "
        "( "
        "SELECT owner, table_name, 'TEC_RUN_ID' AS column_name, source, rar3_type_of_access "
        "FROM dummy_entry "
        ") "
        ") "
        "SELECT owner, table_name, column_name, source, rar3_type_of_access "
        "FROM disc_tec_fields "
        "UNION "
        "SELECT owner, table_name, column_name, source, rar3_type_of_access "
        "FROM rar_columns "
    ).format(metadata_table)
    return query_metadata_access_type2a
def get_query_igam_roles(igam_table, service):
    """Return SQL selecting IGAM entitlement mappings for one service.

    Args:
        igam_table: Fully qualified Oracle table holding the IGAM/Sentry
            source mappings (interpolated via ``str.format`` — trusted
            identifier only).
        service: Service name; upper-cased into the SERVICE_NAME predicate.

    Returns:
        str: Query text. The literal ``$$$1`` token is left in place,
        presumably substituted with the environment name downstream — confirm
        with the consumer of this query.
    """
    # NOTE(review): the original implementation branched on service
    # ('rar' / 'mopdb' / other) but assigned the identical value 'mrds' to a
    # variable that was never used; the dead branch/variable was removed with
    # no behavior change.
    query_igam_roles = (
        "SELECT MRDS_subsource_id as Datasource, "
        "MRDS_subsource_id as subsource_id, "
        "MRDS_entitlement as IGAM_Entitlement, "
        "environment "
        "FROM {0} where lower(environment) = lower($$$1) and SERVICE_NAME='{1}'"
    ).format(igam_table, service.upper())
    return query_igam_roles

View File

@@ -0,0 +1,54 @@
"""Driver script for the crp_mopdb corporate store (MPEC.T_MPEC).

Builds an Options object from the hard-coded argument list and runs the
table-builder pipeline. Per the helper names, the steps are: load column
metadata from the Oracle inventory, create the external tables, read the
IGAM role mapping and apply the access-type policy — confirm details in
tableBuilderProcessor_2.
"""
import os
import mrds_elt.python.devo_replicator.FlowOptions as ro
import tableBuilderProcessor_2 as tbp
# NOTE(review): `import os` appears unused in this script — confirm before removing.
# setting variables
# Earlier example configurations (RAR, access types 1 and 2a) kept for reference:
# args = [
#     'corporate_store=crp_rar',
#     'source_schema=CORR_RAR',
#     'source_table=NH_F_RATING',
#     'target_table=NH_F_RATING',
#     'access_type=1',
#     'oracle_metadata_table=CORR_RAR.NH_METADATA_INVENTORY',
#     'oracle_igam_table=CT_REF.RAR_SOURCES_IGAM_SENTRY',
#     'sentry_role_environment=production',
#     'target_s3_bucket=s3a://devo-crp-ffppyd8q',
#     'tech_meta_data_fields=tec_ingestion_date String, tec_execution_date String, tec_run_id String',
#     'full_access_entitlement_list=DISC-PC-RAR-R'
# ]
# args = [
#     'corporate_store=crp_rar',
#     'source_schema=CORR_RAR',
#     'source_table=NH_ASSET',
#     'target_table=NH_ASSET',
#     'access_type=2a',
#     'oracle_metadata_table=CORR_RAR.NH_METADATA_INVENTORY',
#     'oracle_igam_table=CT_REF.RAR_SOURCES_IGAM_SENTRY',
#     'sentry_role_environment=production',
#     'target_s3_bucket=s3a://devo-crp-ffppyd8q',
#     'tech_meta_data_fields=tec_ingestion_date String, tec_execution_date String, tec_run_id String',
#     'full_access_entitlement_list=DISC-PC-RAR-R'
# ]
# Active configuration: MOPDB / MPEC.T_MPEC. Note there is no explicit
# target_table or access_type here, unlike the RAR examples above.
args = [
    'corporate_store=crp_mopdb',
    'source_schema=MPEC',
    'source_table=T_MPEC',
    'oracle_metadata_table=CT_MOPDB.MOPDB_METADATA_INVENTORY',
    'oracle_igam_table=CT_MOPDB.MOPDB_SOURCES_IGAM_SENTRY',
    'sentry_role_environment=production',
    'target_s3_bucket=s3a://devo-crp-sbul3ju3/mopdb/db',
    'tech_meta_data_fields=tec_ingestion_date String, tec_execution_date String, tec_run_id String',
    'full_access_entitlement_list='
]
# Parse the key=value pairs into an options object shared by every step.
rar_options = ro.Options(args)
# Step 1: pull column metadata for the configured table.
tableFields = tbp.loadMetadataTable(rar_options)
# Step 2: create the external tables in the corporate store.
tbp.createExternalTables_CRP_RAR(rar_options,tableFields )
# Step 3: read IGAM role mappings, then apply the access-type policies.
igamRoleDF = tbp.readIGAMRoles(rar_options)
tbp.accessTypeMapper(rar_options, igamRoleDF)

6
python/mrds_common/.gitignore vendored Normal file
View File

@@ -0,0 +1,6 @@
__pycache__
*.log
.venv
.tox
*.egg-info/
build

View File

@@ -0,0 +1,72 @@
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.6.0] - 13-10-2025
### Added
- new type of column xpath_element_id
## [0.5.0] - 08-10-2025
### Added
- added new mandatory configuration parameter `archive_prefix`. App now archives source file to this location, before deleting it from inbox_prefix location.
- log app version at runtime.
### Changed
- improved logging when calling database function CT_MRDS.FILE_MANAGER.PROCESS_SOURCE_FILE
- removed local zip file deletion from version 0.4.0 to accommodate archiving at the end of the processing
## [0.4.1] - 03-10-2025
### Added
- `--version` flag to CLI, now shows package version from `mrds.__version__`. ([#179](https://gitlab.sofa.dev/mrds/mrds_elt/-/merge_requests/179))
## [0.4.0] - 03-10-2025
### Added
- App versioning!
- Streaming algorithm when reading, filtering and enriching csv files. This drastically improves application performance in regards to RAM usage.
- Unzipping now deletes local source zip file, after data has been extracted.
## [0.3.1] - 30-09-2025
### Fixed
- fixed small bug related to the new encoding setting
## [0.3.0] - 29-09-2025
### Added
- new type of config - Application config.
These will be very specific application settings to be overridden in specific cases. Consequently, such configuration will only be optional, because rare usage is expected. First such config is encoding_type
### Changed
- removed output of .log files when running the application
### Fixed
- small bug when unzipping a file
## [0.2.0] - 17-09-2025
### Added
- automatic deletion of the source file, and all temporary files created by the app.
- two new CLI parameters - --keep-source-file and --keep-tmp-dir flags, to be used to avoid deleting the source file and/or temporary working directory when testing.
- row count output in log files after enrichment.
### Fixed
- source and output columns in csv extraction were mistakenly swapped. This is now fixed.

View File

@@ -0,0 +1,328 @@
# MRDS APP
The main purpose of this application is to download XML or CSV files from source, perform some basic ETL and upload them to target.
Below is a simplified workflow of the application.
## Application workflow
```mermaid
flowchart LR
subgraph CoreApplication
direction TB
B[Read and validate config file] --> |If valid| C[Download source file]
C[Download source file] --> D[Unzip if file is ZIP]
D[Unzip if file is ZIP] --> E[Validate source file]
E --> |If valid| G[Start task defined in config file]
G --> H[Build output file with selected data from source]
H --> I[Enrich output file with metadata]
I --> J[Upload the output file]
J --> K[Trigger remote function]
K --> L[Check if more tasks are available in config file]
L --> |Yes| G
L --> |No| M[Archive & Delete source file]
M --> N[Finish workflow]
end
A[Trigger app via CLI or Airflow DAG] --> CoreApplication
```
## Installation
Checkout repository and cd to root project directory
```shell
cd python/mrds_common
```
Create new virtual environment using Python >=3.11
```shell
python3.11 -m venv .venv
```
Activate virtual environment
```shell
source .venv/bin/activate
```
Upgrade pip
```shell
pip install --upgrade pip
```
Install app
```shell
pip install .
```
## Environment variables
There are two operating system environment variables, which are required by the application:
BUCKET_NAMESPACE - OCI namespace where main operating bucket is located (if not found - default value is frcnomajoc7v)
BUCKET - main operating OCI bucket for downloading and uploading files (if not found - default value is mrds_inbox_poc)
## Usage
The application accepts two required and four optional parameters.
### Parameters
| Parameter | Short Flag | Required | Default | Description |
|-------------------------------|------------|----------|---------|----------------------------------------------------------------------------------------------------------------------|
| `--workflow-context` | `-w` | No* | None | JSON string representing the workflow context. Must contain `run_id` and `a_workflow_history_key`. |
| `--generate-workflow-context` | | No* | | Flag type. If provided, app automatically generates and finalizes workflow context. Use this if `--workflow-context` is not provided. |
| `--source-filename` | `-s` | Yes | None | Name of the source file to be looked up in source inbox set in configuration file (`inbox_prefix`). |
| `--config-file` | `-c` | Yes | None | Path to the YAML configuration file. Can be absolute, or relative to current working directory. |
| `--keep-source-file` | | No | | Flag type. If provided, app keeps source file, instead of archiving and deleting it. |
| `--keep-tmp-dir` | | No | | Flag type. If provided, app keeps tmp directory, instead of deleting it. |
*`--workflow-context` and `--generate-workflow-context` are both optional, however - either one of them MUST be provided for the application to run.
### CLI
```shell
mrds-cli --workflow-context '{"run_id": "0ce35637-302c-4293-8069-3186d5d9a57d", "a_workflow_history_key": 352344}' \
--source-filename 'CSDB_Debt_Daily.ZIP' \
--config-file /home/dbt/GEORGI/projects/mrds_elt/airflow/ods/csdb/debt_daily/config/yaml/csdb_debt_daily.yaml
```
### Python module
Import main function from core module and provide needed parameters:
```python
from mrds.core import main
from mrds.utils.manage_runs import init_workflow, finalise_workflow
from mrds.utils.static_vars import status_success, status_failed
import datetime
import logging
import sys
# Configure logging for your needs. This is just a sample
current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
log_filename = f"mrds_{current_time}.log"
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(name)s - %(message)s",
handlers=[
logging.FileHandler(log_filename),
logging.StreamHandler(sys.stdout),
],
)
STATUS_SUCCESS = status_success
STATUS_FAILURE = status_failed
# Run time parameters
run_id = "0ce35637-302c-4293-8069-3186d5d9a57d"
a_workflow_history_key = init_workflow(database_name='ODS', workflow_name='w_OU_C2D_UC_DISSEM', workflow_run_id=run_id)
workflow_context = {
"run_id": run_id,
"a_workflow_history_key": a_workflow_history_key,
}
source_filename = "CSDB_Debt_Daily.ZIP"
config_file = "/home/dbt/GEORGI/projects/mrds_elt/airflow/ods/csdb/debt_daily/config/yaml/csdb_debt_daily.yaml"
main(workflow_context, source_filename, config_file)
# implement your desired error handling logic and provide correct status to function finalize_workflow
finalise_workflow(workflow_context["a_workflow_history_key"], STATUS_SUCCESS)
```
## Configuration
### Generate workflow context
Use this if you are using the application in standalone mode. Workflow context will be generated, and then finalized.
### Source filename
This is the source file name to be looked up in the source inbox set in the configuration file (`inbox_prefix`).
### Workflow context
This is a JSON string (from the application's standpoint, a dictionary) containing run_id and a_workflow_history_key values.
```JSON
workflow_context = {
"run_id": "0ce35637-302c-4293-8069-3186d5d9a57d",
"a_workflow_history_key": 352344,
}
```
run_id - this represents the orchestration ID. Can be any string ID of your choice, for example an Airflow DAG ID.
a_workflow_history_key - can be generated via mrds.utils.manage_runs.init_workflow() function.
If you provide workflow context by yourself, you need to take care of finalizing it too.
### Config file
This is the main place which we can control the application.
At the top, are the Application configurations. These apply to all tasks. These are all optional and are used to override some specific runtime application settings.
```yaml
# System configurations
encoding_type: cp1252 # Overrides default encoding type (utf-8) of the app. This encoding is used when reading source csv/xml files and when writing the output csv files of the app. For codec naming, follow guidelines here - https://docs.python.org/3/library/codecs.html#standard-encodings
```
After that, are the global configurations. These apply to all tasks:
```yaml
# Global configurations
tmpdir: /tmp # root temporary directory to create runtime temporary directory, download source file and perform operations on it, before upload it to target
inbox_prefix: INBOX/C2D/UC_DISSEM # prefix for the inbox containing the source file
archive_prefix: ARCHIVE/C2D/UC_DISSEM # prefix for the archive bucket
workflow_name: w_OU_C2D_UC_DISSEM # name of the particular workflow
validation_schema_path: 'xsd/UseOfCollateralMessage.xsd' # relative path (to runtime location) to schema used to validate XML or CSV file
file_type: xml # file type of the expected source file - either CSV or XML
```
Following, there is a list of tasks to be performed on the source file.
We can have multiple tasks per file, meaning - we can generate more than one output file, from one source file.
Further, one of the key configuration parameters per task is "output_columns". There we define columns of the final output file.
There are several types of columns:
xpath - this type of column is used when the source file is XML. It is a standard xpath expression, pointing to a path in the xml.
xpath_element_id - this type of column is used when we need to identify a particular xml element. Used to create foreign keys between two separate tasks. It is a standard xpath expression, pointing to a path in the xml.
csv_header - this type of column is used when source file is CSV. It just points to the corresponding csv header in the source file.
a_key - generates key unique per row.
workflow_key - generates key unique per run of the application
static - allows the user to define column with static value
The application respects the order of the output columns in the configuration file, when generating the output file.
Data and columns from the source file, not included in the configuration file, will not be present in the final output file.
Example of xml task configuration:
```yaml
# List of tasks
tasks:
- task_name: ou_lm_standing_facilities_header_create_file # name of the particular task
ods_prefix: INBOX/LM/STANDING_FACILITIES/STANDING_FACILITIES_HEADER # prefix for the upload location
output_table: standing_facilities_headers # table in Oracle
namespaces:
ns2: 'http://escb.ecb.int/sf' # XML namespace
output_columns: # Columns in the output file, order will be respected.
- type: 'a_key' # A_KEY type of column
column_header: 'A_KEY' # naming of the column in the output file
- type: 'workflow_key' # WORKFLOW_KEY type of column
column_header: 'A_WORKFLOW_HISTORY_KEY'
- type: 'xpath' # xpath type of column
value: '//ns2:header/ns2:version'
column_header: 'REV_NUMBER'
is_key: 'N' # value is transposed across the rows - YES/NO. Used when there is only single value in source XML
- type: 'xpath'
value: '//ns2:header/ns2:referenceDate'
column_header: 'REF_DATE'
is_key: 'N'
- type: 'static'
value: ''
column_header: 'FREE_TEXT'
- task_name: ou_lm_standing_facilities_create_file
ods_prefix: INBOX/LM/STANDING_FACILITIES/STANDING_FACILITIES
output_table: standing_facilities
namespaces:
ns2: 'http://escb.ecb.int/sf'
output_columns:
- type: 'a_key'
column_header: 'A_KEY'
- type: 'workflow_key'
column_header: 'A_SFH_FK'
- type: 'workflow_key'
column_header: 'A_WORKFLOW_HISTORY_KEY'
- type: 'xpath'
value: '//ns2:disaggregatedStandingFacilities/ns2:standingFacilities/ns2:disaggregatedStandingFacility/ns2:country'
column_header: 'COUNTRY'
- type: 'static'
value: ''
column_header: 'COMMENT_'
```
Example of CSV task configuration:
```yaml
tasks:
- task_name: ODS_CSDB_DEBT_DAILY_process_csv
ods_prefix: ODS/CSDB/DEBT_DAILY
output_table: DEBT_DAILY
output_columns:
- type: 'a_key'
column_header: 'A_KEY'
- type: 'workflow_key'
column_header: 'A_WORKFLOW_HISTORY_KEY'
- type: 'csv_header' # csv_header type of column
value: 'Date last modified' # naming of the column in the SOURCE file
column_header: 'Date last modified' # naming of the column in the OUTPUT file
- type: 'csv_header'
value: 'Extraction date'
column_header: 'Extraction date'
- type: 'csv_header'
value: 'ISIN code'
column_header: 'ISIN code'
```
## Development
### Installing requirements
Install app + dev requirements. For easier workflow, you can install in editable mode
```
pip install -e .[dev]
```
In editable mode, instead of copying the package files to the site-packages directory, pip creates a special link that points to the source code directory. This means any changes you make to your source code will be immediately available without needing to reinstall the package.
### Code formatting
Run black to reformat the code before pushing changes.
Following will reformat all files recursively from current dir.
```
black .
```
Following will only check and report what needs to be formatted, recursively from current dir.
```
black --check --diff .
```
### Tests
Run tests with
```
pytest .
```
### Tox automation
Tox automates runs of black checks and tests
```
tox .
```

View File

@@ -0,0 +1 @@
# Single source of truth for the package version; surfaced via the CLI's
# --version flag (see the mrds CLI module).
__version__ = "0.6.0"

View File

@@ -0,0 +1,117 @@
import click
import json
import logging
import sys
from mrds import __version__
from mrds.core import main
@click.command()
@click.version_option(version=__version__, prog_name="mrds")
@click.option(
    "--workflow-context",
    "-w",
    required=False,
    help="Workflow context to be used by the application. This is required unless --generate-workflow-context is provided.",
)
@click.option(
    "--source-filename",
    "-s",
    required=True,
    help="Source filename to be processed.",
)
@click.option(
    "--config-file",
    "-c",
    type=click.Path(exists=True),
    required=True,
    help="Path to the YAML configuration file.",
)
@click.option(
    "--generate-workflow-context",
    is_flag=True,
    default=False,
    help="Generate a workflow context automatically. If this is set, --workflow-context is not required.",
)
@click.option(
    "--keep-source-file",
    is_flag=True,
    default=False,
    help="Keep source file, instead of deleting it.",
)
@click.option(
    "--keep-tmp-dir",
    is_flag=True,
    default=False,
    help="Keep tmp directory, instead of deleting it.",
)
def cli_main(
    workflow_context,
    source_filename,
    config_file,
    generate_workflow_context,
    keep_source_file,
    keep_tmp_dir,
):
    """Command-line entry point: validate the options and hand off to mrds.core.main."""
    # Log everything at INFO level to stdout.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s - %(message)s",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    # Exactly one of --workflow-context / --generate-workflow-context may be used.
    if workflow_context and generate_workflow_context:
        raise click.UsageError(
            "You cannot use both --workflow-context and --generate-workflow-context at the same time. "
            "Please provide only one."
        )
    if not workflow_context and not generate_workflow_context:
        raise click.UsageError(
            "You must provide --workflow-context or use --generate-workflow-context flag."
        )
    if workflow_context:
        # Decode the JSON payload, then verify it carries the two required keys.
        try:
            workflow_context = json.loads(workflow_context)
        except json.JSONDecodeError as exc:
            raise click.UsageError(f"Invalid JSON for --workflow-context: {exc}")
        structure_ok = isinstance(workflow_context, dict) and {
            "run_id",
            "a_workflow_history_key",
        } <= workflow_context.keys()
        if not structure_ok:
            raise click.UsageError(
                "Invalid workflow context structure. It must be a JSON object with 'run_id' and 'a_workflow_history_key'."
            )
    # Delegate the actual work to the core module.
    main(
        workflow_context,
        source_filename,
        config_file,
        generate_workflow_context,
        keep_source_file,
        keep_tmp_dir,
    )
# Script entry point: translate failures into conventional exit codes
# (2 = usage error, 1 = unexpected error, 0 = success).
if __name__ == "__main__":
    try:
        cli_main()
        sys.exit(0)
    except click.UsageError as e:
        # NOTE(review): click commands run in "standalone mode" by default,
        # which catches UsageError internally and exits with code 2 before it
        # can propagate here — confirm whether this handler is reachable.
        logging.error(f"Usage error: {e}")
        sys.exit(2)
    except Exception as e:
        # Catch-all boundary: log and map any unexpected failure to exit code 1.
        logging.error(f"Unexpected error: {e}")
        sys.exit(1)

View File

@@ -0,0 +1,366 @@
import os
import uuid
import logging
import yaml
import zipfile
import tempfile
from dataclasses import dataclass, field
from mrds import __version__
from mrds.processors import get_file_processor
from mrds.utils import (
manage_runs,
objectstore,
static_vars,
xml_utils,
)
# environment variables
# Deployment environment tag; read here but not referenced elsewhere in the
# visible module — presumably consumed by other modules. TODO confirm.
MRDS_ENV = os.getenv("MRDS_ENV", "poc")
# Main operating bucket. NOTE(review): the env var read is INBOX_BUCKET while
# the README documents a variable called BUCKET — confirm which name
# deployments actually set.
BUCKET = os.getenv("INBOX_BUCKET", "mrds_inbox_poc")
# OCI object-storage namespace that holds the bucket above.
BUCKET_NAMESPACE = os.getenv("BUCKET_NAMESPACE", "frcnomajoc7v")
# Static configuration variables
WORKFLOW_TYPE = "ODS"  # passed as the first argument to manage_runs.init_workflow
ENCODING_TYPE = "utf-8"  # default text encoding; overridable via config key 'encoding_type'
# Keys that must be present at the top level of the YAML configuration file.
CONFIG_REQUIRED_KEYS = [
    "tmpdir",
    "inbox_prefix",
    "archive_prefix",
    "workflow_name",
    "validation_schema_path",
    "tasks",
    "file_type",
]
# Keys that must be present on every entry under 'tasks'.
TASK_REQUIRED_KEYS = [
    "task_name",
    "ods_prefix",
    "output_table",
    "output_columns",
]
# Canonical workflow status values re-exported from static_vars.
STATUS_SUCCESS = static_vars.status_success
STATUS_FAILURE = static_vars.status_failed
@dataclass
class GlobalConfig:
    """Run-wide settings shared by every task of one workflow execution."""

    # Root temporary directory; main() later rebinds this to a per-run subdirectory.
    tmpdir: str
    # Object-store prefix where the source file is expected (the inbox).
    inbox_prefix: str
    # Object-store prefix the original source file is archived to before deletion.
    archive_prefix: str
    # Workflow name used when initialising the run history.
    workflow_name: str
    # Current working source file name; rebound to the extracted member when
    # the downloaded file turns out to be a ZIP archive.
    source_filename: str
    # Path (relative to the runtime location) of the XSD/CSV validation schema.
    validation_schema_path: str
    # Target OCI bucket and its namespace.
    bucket: str
    bucket_namespace: str
    # Expected source file type: "xml" or "csv".
    file_type: str
    # Text encoding used when reading/writing files.
    encoding_type: str

    def __post_init__(self):
        self.original_source_filename = self.source_filename  # keep this in case we have a zip file to archive

    @property
    def source_filepath(self) -> str:
        """Absolute path of the current (possibly extracted) source file."""
        return os.path.join(self.tmpdir, self.source_filename)

    @property
    def original_source_filepath(self) -> str:
        """Absolute path of the file as originally downloaded (e.g. the ZIP)."""
        return os.path.join(self.tmpdir, self.original_source_filename)
@dataclass
class TaskConfig:
    """Per-task settings: one output file produced from the source file."""

    # Human-readable task identifier (used in log messages).
    task_name: str
    # Object-store prefix the task's output file is uploaded to.
    ods_prefix: str
    # Target table the output file feeds.
    output_table: str
    # XML prefix -> namespace URI map (empty for CSV tasks).
    namespaces: dict
    # Ordered column specifications for the output file.
    output_columns: list
def initialize_config(source_filename, config_file_path):
    """Load the YAML config file and build (GlobalConfig, list[TaskConfig]).

    Raises:
        FileNotFoundError: when ``config_file_path`` does not exist.
        ValueError: when required global or per-task keys are missing.
    """
    logging.info(f"Source filename is set to: {source_filename}")
    logging.info(f"Loading configuration from {config_file_path}")
    # Fail fast with a clear message before attempting to parse.
    if not os.path.exists(config_file_path):
        raise FileNotFoundError(f"Configuration file {config_file_path} not found.")
    with open(config_file_path, "r") as cfg_stream:
        config_data = yaml.safe_load(cfg_stream)
    logging.debug(f"Configuration data: {config_data}")
    # Global section must carry every required key.
    missing_keys = [key for key in CONFIG_REQUIRED_KEYS if key not in config_data]
    if missing_keys:
        raise ValueError(f"Missing required keys in configuration: {missing_keys}")
    global_config = GlobalConfig(
        tmpdir=config_data["tmpdir"],
        inbox_prefix=config_data["inbox_prefix"],
        archive_prefix=config_data["archive_prefix"],
        workflow_name=config_data["workflow_name"],
        source_filename=source_filename,
        validation_schema_path=config_data["validation_schema_path"],
        bucket=BUCKET,
        bucket_namespace=BUCKET_NAMESPACE,
        file_type=config_data["file_type"],
        encoding_type=config_data.get("encoding_type", ENCODING_TYPE),
    )

    def build_task(task_data):
        # Validate per-task keys before constructing the dataclass.
        missing_task_keys = [key for key in TASK_REQUIRED_KEYS if key not in task_data]
        if missing_task_keys:
            raise ValueError(
                f"Missing required keys in task configuration: {missing_task_keys}"
            )
        return TaskConfig(
            task_name=task_data["task_name"],
            ods_prefix=task_data["ods_prefix"],
            output_table=task_data["output_table"],
            namespaces=task_data.get("namespaces", {}),
            output_columns=task_data["output_columns"],
        )

    tasks = [build_task(task_data) for task_data in config_data["tasks"]]
    return global_config, tasks
def initialize_workflow(global_config):
    """Register a new workflow run and return its context dictionary."""
    run_id = str(uuid.uuid4())  # fresh orchestration id for this run
    logging.info(f"Initializing workflow '{global_config.workflow_name}'")
    history_key = manage_runs.init_workflow(
        WORKFLOW_TYPE, global_config.workflow_name, run_id
    )
    return {"run_id": run_id, "a_workflow_history_key": history_key}
def download_source_file(client, global_config):
    """Download the source file from the inbox prefix into the local tmp dir.

    Delegates to ``objectstore.download_file``; the positional order used here
    is (client, namespace, bucket, prefix, object name, local path) — confirm
    against mrds.utils.objectstore.
    """
    logging.info(
        f"Downloading source file '{global_config.source_filename}' "
        f"from '{global_config.bucket}/{global_config.inbox_prefix}'"
    )
    objectstore.download_file(
        client,
        global_config.bucket_namespace,
        global_config.bucket,
        global_config.inbox_prefix,
        global_config.source_filename,
        global_config.source_filepath,
    )
    logging.info(f"Source file downloaded to '{global_config.source_filepath}'")
def delete_source_file(client, global_config):
    """Delete the ORIGINAL source object (e.g. the ZIP) from the inbox prefix.

    Uses ``original_source_filename`` so the as-downloaded object is removed
    even after extraction rebinds ``source_filename``. NOTE(review): the
    positional order here (client, name, namespace, bucket, prefix) differs
    from download_file/upload_file — confirm against mrds.utils.objectstore.
    """
    logging.info(
        f"Deleting source file '{global_config.bucket}/{global_config.inbox_prefix}/{global_config.original_source_filename}'"
    )
    objectstore.delete_file(
        client,
        global_config.original_source_filename,
        global_config.bucket_namespace,
        global_config.bucket,
        global_config.inbox_prefix,
    )
    logging.info(
        f"Deleted source file '{global_config.bucket}/{global_config.inbox_prefix}/{global_config.original_source_filename}'"
    )
def archive_source_file(client, global_config):
    """Upload the locally downloaded original file to the archive prefix.

    Uses ``original_source_filepath``/``original_source_filename`` so the
    as-downloaded file (e.g. the ZIP) is archived, not the extracted member.
    Positional order assumed: (client, local path, namespace, bucket, prefix,
    object name) — confirm against mrds.utils.objectstore.
    """
    logging.info(
        f"Archiving source file to '{global_config.bucket}/{global_config.archive_prefix}/{global_config.original_source_filename}'"
    )
    objectstore.upload_file(
        client,
        global_config.original_source_filepath,
        global_config.bucket_namespace,
        global_config.bucket,
        global_config.archive_prefix,
        global_config.original_source_filename,
    )
    logging.info(
        f"Source file archived to '{global_config.bucket}/{global_config.archive_prefix}/{global_config.original_source_filename}'"
    )
def unzip_source_file_if_needed(global_config):
    """Extract the source file in place when it is a ZIP archive.

    Expects the archive to contain exactly one regular file. On success the
    member is extracted next to the archive and
    ``global_config.source_filename`` is rebound to point at it.

    Returns:
        bool: True when no extraction was needed or it succeeded; False when
        the archive is malformed (wrong member count, directory entry, unsafe
        member path) or extraction raised.
    """
    source_filepath = global_config.source_filepath
    # If it's not a zip, nothing to do
    if not zipfile.is_zipfile(source_filepath):
        logging.info(f"File '{source_filepath}' is not a ZIP file.")
        return True
    logging.info(f"File '{source_filepath}' is a ZIP file. Unzipping...")
    extract_dir = os.path.dirname(source_filepath)
    try:
        with zipfile.ZipFile(source_filepath, "r") as zip_ref:
            extracted_files = zip_ref.namelist()
            if len(extracted_files) != 1:
                logging.error(
                    f"Expected one file in the ZIP, but found {len(extracted_files)} files."
                )
                return False
            member = extracted_files[0]
            # The single member must be a regular file, not a directory entry.
            if member.endswith("/"):
                logging.error(f"ZIP member '{member}' is a directory, not a file.")
                return False
            # Zip-slip guard: the member must resolve inside extract_dir
            # (rejects absolute paths and '..' traversal).
            target_path = os.path.realpath(os.path.join(extract_dir, member))
            if not target_path.startswith(os.path.realpath(extract_dir) + os.sep):
                logging.error(f"ZIP member '{member}' escapes the extraction directory.")
                return False
            zip_ref.extractall(extract_dir)
    except Exception as e:
        logging.error(f"Error while extracting '{source_filepath}': {e}")
        return False
    # Point the config at the extracted file for the rest of the pipeline.
    global_config.source_filename = member
    logging.info(
        f"Extracted '{member}' to '{extract_dir}'. "
        f"Updated source_filepath to '{global_config.source_filepath}'."
    )
    return True
def validate_source_file(global_config):
    """Validate the downloaded source file according to its configured type.

    XML files are checked against the configured schema; CSV validation is not
    implemented yet and always passes. Raises ValueError on a failed XML
    validation or an unknown file type; returns True otherwise.
    """
    kind = global_config.file_type.lower()
    if kind == "csv":
        # TODO: add CSV validation here
        return True
    if kind == "xml":
        is_valid, message = xml_utils.validate_xml(
            global_config.source_filepath, global_config.validation_schema_path
        )
        if not is_valid:
            raise ValueError(f"XML validation failed: {message}")
        logging.info(message)
        return True
    raise ValueError(f"Unsupported file type: {kind}")
def process_tasks(tasks, global_config, workflow_context, client):
    """Run every configured task through the processor for the file type."""
    # One processor class serves all tasks of this run.
    processor_class = get_file_processor(global_config)
    for task in tasks:
        logging.info(f"Starting task '{task.task_name}'")
        processor = processor_class(global_config, task, client, workflow_context)
        processor.process()
def finalize_workflow(workflow_context, success=True):
    """Record the terminal workflow status and log the outcome."""
    manage_runs.finalise_workflow(
        workflow_context["a_workflow_history_key"],
        STATUS_SUCCESS if success else STATUS_FAILURE,
    )
    if success:
        logging.info("Workflow completed successfully")
    else:
        logging.error("Workflow failed")
def main(
    workflow_context: dict,
    source_filename: str,
    config_file_path: str,
    generate_workflow_context=False,
    keep_source_file=False,
    keep_tmp_dir=False,
):
    """
    Run the full ingestion workflow for one source file.

    Steps: load configuration, create a temporary working directory,
    optionally generate a workflow context, download/unzip/validate the
    source file, process all configured tasks, then finalize, archive and
    clean up.

    Args:
        workflow_context: Run identifiers; ignored and regenerated when
            generate_workflow_context is True.
        source_filename: Name of the source file to process.
        config_file_path: Path to the workflow configuration file.
        generate_workflow_context: Create and finalize the workflow context
            here instead of receiving it from the caller.
        keep_source_file: Skip archiving/deleting the source after success.
        keep_tmp_dir: Keep the temporary working directory for debugging.

    Raises:
        RuntimeError: if any step fails; the original exception is chained.
    """
    logging.info(f"Initializing mrds app, version {__version__}")
    tmpdir_manager = None
    try:
        # get configs
        global_config, tasks = initialize_config(source_filename, config_file_path)
        # Handle temporary dirs
        if keep_tmp_dir:
            tmpdir = tempfile.mkdtemp(
                prefix="mrds_", dir=global_config.tmpdir
            )  # dir is created and never deleted
            logging.info(
                f"Created temporary working directory (not auto-deleted): {tmpdir}"
            )
        else:
            tmpdir_manager = tempfile.TemporaryDirectory(
                prefix="mrds_", dir=global_config.tmpdir
            )
            tmpdir = tmpdir_manager.name
            logging.info(
                f"Created temporary working directory (auto-deleted): {tmpdir}"
            )
        # override tmpdir with newly created tmpdir
        global_config.tmpdir = tmpdir
        client = objectstore.get_client()
        # Handle workflow_context generation if required
        if generate_workflow_context:
            logging.info("Generating workflow context automatically.")
            workflow_context = initialize_workflow(global_config)
            logging.info(f"Generated workflow context: {workflow_context}")
        else:
            logging.info(f"Using provided workflow context: {workflow_context}")
        download_source_file(client, global_config)
        unzip_source_file_if_needed(global_config)
        validate_source_file(global_config)
        process_tasks(tasks, global_config, workflow_context, client)
        if generate_workflow_context:
            finalize_workflow(workflow_context)
        if not keep_source_file:
            archive_source_file(client, global_config)
            delete_source_file(client, global_config)
    except Exception as e:
        logging.error(f"Critical error: {str(e)}")
        # Finalize workflow with failure if needed
        if generate_workflow_context and "workflow_context" in locals():
            finalize_workflow(workflow_context, success=False)
        # Chain the cause so the original traceback is preserved for callers.
        raise RuntimeError(f"Workflow failed due to: {e}") from e
    finally:
        # Always attempt to remove tmpdir if we created a TemporaryDirectory manager
        if tmpdir_manager and not keep_tmp_dir:
            try:
                tmpdir_manager.cleanup()
                logging.info(f"Deleted temporary working directory {tmpdir}")
            except Exception:
                logging.exception(
                    f"Failed to delete temporary working directory {tmpdir}"
                )

View File

@@ -0,0 +1,186 @@
# static configs
# Workflow-level settings for the RQSD_PROCESS ingestion.
tmpdir: /tmp
inbox_prefix: INBOX/RQSD/RQSD_PROCESS
workflow_name: w_ODS_RQSD_PROCESS_DEVO
# NOTE(review): YAML parses `None` as the string "None", not as null.
# Harmless while file_type is csv (no schema validation runs), but confirm
# the loader's expectation before switching this workflow to xml.
validation_schema_path: None
file_type: csv
# task configs
# Each task maps source columns to one output table. Entry types:
#   workflow_key - filled with the run's A_WORKFLOW_HISTORY_KEY
#   csv_header   - copy source column `value` into output `column_header`
#   static       - constant `value` repeated in every row
#   a_key        - generated surrogate row key
tasks:
  - task_name: m_ODS_RQSD_OBSERVATIONS_PARSE
    ods_prefix: INBOX/RQSD/RQSD_PROCESS/RQSD_OBSERVATIONS
    output_table: RQSD_OBSERVATIONS
    output_columns:
      - type: 'workflow_key'
        column_header: 'A_WORKFLOW_HISTORY_KEY'
      - type: 'csv_header'
        value: 'datacollectioncode'
        column_header: 'datacollectioncode'
      - type: 'csv_header'
        value: 'datacollectionname'
        column_header: 'datacollectionname'
      - type: 'csv_header'
        value: 'datacollectionowner'
        column_header: 'datacollectionowner'
      - type: 'csv_header'
        value: 'reportingcyclename'
        column_header: 'reportingcyclename'
      - type: 'csv_header'
        value: 'reportingcyclestatus'
        column_header: 'reportingcyclestatus'
      - type: 'csv_header'
        value: 'modulecode'
        column_header: 'modulecode'
      - type: 'csv_header'
        value: 'modulename'
        column_header: 'modulename'
      - type: 'csv_header'
        value: 'moduleversionnumber'
        column_header: 'moduleversionnumber'
      - type: 'csv_header'
        value: 'reportingentitycollectionuniqueid'
        column_header: 'reportingentitycollectionuniqueid'
      - type: 'csv_header'
        value: 'entityattributereportingcode'
        column_header: 'entityattributereportingcode'
      - type: 'csv_header'
        value: 'reportingentityname'
        column_header: 'reportingentityname'
      - type: 'csv_header'
        value: 'reportingentityentitytype'
        column_header: 'reportingentityentitytype'
      - type: 'csv_header'
        value: 'entityattributecountry'
        column_header: 'entityattributecountry'
      - type: 'csv_header'
        value: 'entitygroupentityname'
        column_header: 'entitygroupentityname'
      - type: 'csv_header'
        value: 'obligationmodulereferencedate'
        column_header: 'obligationmodulereferencedate'
      - type: 'csv_header'
        value: 'obligationmoduleremittancedate'
        column_header: 'obligationmoduleremittancedate'
      - type: 'csv_header'
        value: 'receivedfilereceiveddate'
        column_header: 'receivedfilereceiveddate'
      - type: 'csv_header'
        value: 'obligationmoduleexpected'
        column_header: 'obligationmoduleexpected'
      - type: 'csv_header'
        value: 'receivedfileversionnumber'
        column_header: 'receivedfileversionnumber'
      - type: 'csv_header'
        value: 'revalidationversionnumber'
        column_header: 'revalidationversionnumber'
      - type: 'csv_header'
        value: 'revalidationdate'
        column_header: 'revalidationdate'
      - type: 'csv_header'
        value: 'receivedfilesystemfilename'
        column_header: 'receivedfilesystemfilename'
      - type: 'csv_header'
        value: 'obligationstatusstatus'
        column_header: 'obligationstatusstatus'
      - type: 'csv_header'
        value: 'filestatussetsubmissionstatus'
        column_header: 'filestatussetsubmissionstatus'
      - type: 'csv_header'
        value: 'filestatussetvalidationstatus'
        column_header: 'filestatussetvalidationstatus'
      - type: 'csv_header'
        value: 'filestatussetexternalvalidationstatus'
        column_header: 'filestatussetexternalvalidationstatus'
      - type: 'csv_header'
        value: 'numberoferrors'
        column_header: 'numberoferrors'
      - type: 'csv_header'
        value: 'numberofwarnings'
        column_header: 'numberofwarnings'
      - type: 'csv_header'
        value: 'delayindays'
        column_header: 'delayindays'
      - type: 'csv_header'
        value: 'failedattempts'
        column_header: 'failedattempts'
      - type: 'csv_header'
        value: 'observationvalue'
        column_header: 'observationvalue'
      - type: 'csv_header'
        value: 'observationtextvalue'
        column_header: 'observationtextvalue'
      - type: 'csv_header'
        value: 'observationdatevalue'
        column_header: 'observationdatevalue'
      - type: 'csv_header'
        value: 'datapointsetdatapointidentifier'
        column_header: 'datapointsetdatapointidentifier'
      - type: 'csv_header'
        value: 'datapointsetlabel'
        column_header: 'datapointsetlabel'
      - type: 'csv_header'
        value: 'obsrvdescdatatype'
        column_header: 'obsrvdescdatatype'
      - type: 'csv_header'
        value: 'ordinatecode'
        column_header: 'ordinatecode'
      - type: 'csv_header'
        value: 'ordinateposition'
        column_header: 'ordinateposition'
      - type: 'csv_header'
        value: 'tablename'
        column_header: 'tablename'
      - type: 'csv_header'
        value: 'isstock'
        column_header: 'isstock'
      - type: 'csv_header'
        value: 'scale'
        column_header: 'scale'
      - type: 'csv_header'
        value: 'currency'
        column_header: 'currency'
      - type: 'csv_header'
        value: 'numbertype'
        column_header: 'numbertype'
      - type: 'csv_header'
        value: 'ismandatory'
        column_header: 'ismandatory'
      - type: 'csv_header'
        value: 'decimalplaces'
        column_header: 'decimalplaces'
      - type: 'csv_header'
        value: 'serieskey'
        column_header: 'serieskey'
      - type: 'csv_header'
        value: 'tec_source_system'
        column_header: 'tec_source_system'
      - type: 'csv_header'
        value: 'tec_dataset'
        column_header: 'tec_dataset'
      - type: 'csv_header'
        value: 'tec_surrogate_key'
        column_header: 'tec_surrogate_key'
      - type: 'csv_header'
        value: 'tec_crc'
        column_header: 'tec_crc'
      - type: 'csv_header'
        value: 'tec_ingestion_date'
        column_header: 'tec_ingestion_date'
      - type: 'csv_header'
        value: 'tec_version_id'
        column_header: 'tec_version_id'
      - type: 'csv_header'
        value: 'tec_execution_date'
        column_header: 'tec_execution_date'
      - type: 'csv_header'
        value: 'tec_run_id'
        column_header: 'tec_run_id'
      # NOTE(review): the 'test test'/'BLABLA' static column and the
      # 'tec_business_dateTest!' header below look like test leftovers -
      # confirm before using this config in production.
      - type: 'static'
        value: 'test test'
        column_header: 'BLABLA'
      - type: 'a_key'
        column_header: 'A_KEY'
      - type: 'csv_header'
        value: 'tec_business_date'
        column_header: 'tec_business_dateTest!'

View File

@@ -0,0 +1,50 @@
# file uploader
import os
import sys
import logging
from mrds.utils import objectstore
# Target bucket and namespace; overridable via environment variables.
BUCKET = os.getenv("INBOX_BUCKET", "mrds_inbox_poc")
BUCKET_NAMESPACE = os.getenv("BUCKET_NAMESPACE", "frcnomajoc7v")
# Configure logging (INFO to stdout)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
    ],
)
# Hard-coded upload job parameters - edit these before running the script.
source_filepath = '/home/dbt/tmp/mrds_4twsw_ib/20250630_Pre-Production_DV_P2_DBT_I4.zip'
source_filename = '20250630_Pre-Production_DV_P2_DBT_I4.zip'
target_prefix = 'INBOX/CSDB/STC_CentralizedSecuritiesDissemination_ECB'
def upload_file():
    """Upload the configured local file to the inbox bucket via objectstore."""
    client = objectstore.get_client()
    destination = f"{BUCKET}/{target_prefix}/{source_filename}"
    logging.info(f"uploading source file to '{destination}'")
    objectstore.upload_file(
        client,
        source_filepath,
        BUCKET_NAMESPACE,
        BUCKET,
        target_prefix,
        source_filename,
    )
    logging.info(f"Source file uploaded to '{destination}'")
if __name__ == "__main__":
    # Entry point: run the upload; any failure maps to a non-zero exit code.
    try:
        upload_file()
    except Exception as e:
        logging.error(f"Unexpected error: {e}")
        sys.exit(1)
    sys.exit(0)

View File

@@ -0,0 +1,15 @@
from .xml_processor import XMLTaskProcessor
from .csv_processor import CSVTaskProcessor
def get_file_processor(global_config):
    """
    Factory: return the task-processor class matching the configured file type.

    Raises:
        ValueError: for any file type other than 'xml' or 'csv'.
    """
    file_type = global_config.file_type.lower()
    if file_type == "xml":
        return XMLTaskProcessor
    if file_type == "csv":
        return CSVTaskProcessor
    raise ValueError(f"Unsupported file type: {file_type}")

View File

@@ -0,0 +1,211 @@
import logging
import os
import csv
from abc import ABC, abstractmethod
from mrds.utils.utils import parse_output_columns
from mrds.utils import (
manage_files,
manage_runs,
objectstore,
static_vars,
)
# Per-task output file name; the task history key keeps re-runs unique.
OUTPUT_FILENAME_TEMPLATE = "{output_table}-{task_history_key}.csv"
STATUS_SUCCESS = static_vars.status_success  # duplicated, needs to be moved #TODO
class TaskProcessor(ABC):
    """Template-method base class for one ingestion task.

    Lifecycle (see process()): extract data from the source file into a CSV,
    enrich it with static/key columns, upload it to the object store, trigger
    remote database processing, then finalize the task run. Subclasses
    implement _extract() for their file type.
    """

    def __init__(self, global_config, task_conf, client, workflow_context):
        """Store collaborators, then run common init and the subclass hook."""
        self.global_config = global_config
        self.task_conf = task_conf
        self.client = client
        self.workflow_context = workflow_context
        self._init_common()
        self._post_init()

    def _init_common(self):
        """Register the task run, derive output paths, and parse column config."""
        # Initialize task
        self.a_task_history_key = manage_runs.init_task(
            self.task_conf.task_name,
            self.workflow_context["run_id"],
            self.workflow_context["a_workflow_history_key"],
        )
        logging.info(f"Task initialized with history key: {self.a_task_history_key}")
        # Define output file paths
        self.output_filename = OUTPUT_FILENAME_TEMPLATE.format(
            output_table=self.task_conf.output_table,
            task_history_key=self.a_task_history_key,
        )
        self.output_filepath = os.path.join(
            self.global_config.tmpdir, self.output_filename
        )
        # Parse the output_columns into per-type entry lists plus the final
        # column order of the output CSV.
        (
            self.xpath_entries,
            self.csv_entries,
            self.static_entries,
            self.a_key_entries,
            self.workflow_key_entries,
            self.xml_position_entries,
            self.column_order,
        ) = parse_output_columns(self.task_conf.output_columns)

    def _post_init(self):
        """Optional hook for subclasses to override."""
        pass

    @abstractmethod
    def _extract(self):
        """Non-optional hook: write the extracted data to self.output_filepath."""
        pass

    def _enrich(self):
        """
        Stream-based enrich: read one row at a time, append static/A-key/workflow-key,
        reorder columns, and write out immediately.

        Rewrites self.output_filepath in place via a .tmp file + atomic rename.
        A-key values are a_task_history_key * 1e9 + 1-based row number.
        """
        TASK_HISTORY_MULTIPLIER = 1_000_000_000
        logging.info(f"Enriching CSV file at '{self.output_filepath}'")
        temp_output = self.output_filepath + ".tmp"
        encoding = self.global_config.encoding_type
        with open(self.output_filepath, newline="", encoding=encoding) as inf, open(
            temp_output, newline="", encoding=encoding, mode="w"
        ) as outf:
            reader = csv.reader(inf)
            writer = csv.writer(outf, quoting=csv.QUOTE_ALL)
            # Read the original header
            original_headers = next(reader)
            # Compute the full set of headers
            headers = list(original_headers)
            # Add static column headers if missing
            for col_name, _ in self.static_entries:
                if col_name not in headers:
                    headers.append(col_name)
            # Add A-key column headers if missing
            for col_name in self.a_key_entries:
                if col_name not in headers:
                    headers.append(col_name)
            # Add workflow key column headers if missing
            for col_name in self.workflow_key_entries:
                if col_name not in headers:
                    headers.append(col_name)
            # Rearrange headers to the desired order (columns absent from
            # column_order are dropped from the output)
            header_to_index = {h: i for i, h in enumerate(headers)}
            out_indices = [
                header_to_index[h] for h in self.column_order if h in header_to_index
            ]
            out_headers = [headers[i] for i in out_indices]
            # Write the new header
            writer.writerow(out_headers)
            # Stream each row, enrich in-place, reorder, and write
            row_count = 0
            base_task_history = int(self.a_task_history_key) * TASK_HISTORY_MULTIPLIER
            for i, in_row in enumerate(reader, start=1):
                # Build a working list that matches `headers` order.
                # Start from the existing columns; cells stay None until filled.
                work_row = [None] * len(headers)
                for j, h in enumerate(original_headers):
                    idx = header_to_index[h]
                    work_row[idx] = in_row[j]
                # Fill static columns
                for col_name, value in self.static_entries:
                    idx = header_to_index[col_name]
                    work_row[idx] = value
                # Fill A-key columns
                for col_name in self.a_key_entries:
                    idx = header_to_index[col_name]
                    a_key_value = base_task_history + i
                    work_row[idx] = str(a_key_value)
                # Fill workflow key columns
                wf_val = self.workflow_context["a_workflow_history_key"]
                for col_name in self.workflow_key_entries:
                    idx = header_to_index[col_name]
                    work_row[idx] = wf_val
                # Reorder to output order and write
                out_row = [work_row[j] for j in out_indices]
                writer.writerow(out_row)
                row_count += 1
        # Atomically replace
        os.replace(temp_output, self.output_filepath)
        logging.info(
            f"CSV file enriched at '{self.output_filepath}', {row_count} rows generated"
        )

    def _upload(self):
        """Upload the enriched CSV to the configured object-store prefix."""
        # Upload CSV to object store
        logging.info(
            f"Uploading CSV file to '{self.global_config.bucket}/{self.task_conf.ods_prefix}/{self.output_filename}'"
        )
        objectstore.upload_file(
            self.client,
            self.output_filepath,
            self.global_config.bucket_namespace,
            self.global_config.bucket,
            self.task_conf.ods_prefix,
            self.output_filename,
        )
        logging.info(
            f"CSV file uploaded to '{self.global_config.bucket}/{self.task_conf.ods_prefix}/{self.output_filename}'"
        )

    def _process_remote(self):
        """Trigger database-side processing; on failure, delete the uploaded file."""
        # Process the source file
        logging.info(f"Processing source file '{self.output_filename}' with CT_MRDS.FILE_MANAGER.PROCESS_SOURCE_FILE database function.")
        try:
            manage_files.process_source_file(
                self.task_conf.ods_prefix, self.output_filename
            )
        except Exception as e:
            logging.error(
                f"Processing source file '{self.output_filename}' failed. Cleaning up..."
            )
            # Remove the uploaded CSV so a failed run leaves no orphan object
            objectstore.delete_file(
                self.client,
                self.output_filename,
                self.global_config.bucket_namespace,
                self.global_config.bucket,
                self.task_conf.ods_prefix,
            )
            logging.error(
                f"CSV file '{self.global_config.bucket}/{self.task_conf.ods_prefix}/{self.output_filename}' deleted."
            )
            raise
        else:
            logging.info(f"Source file '{self.output_filename}' processed")

    def _finalize(self):
        """Mark the task run successful in the run-history tables."""
        # Finalize task
        manage_runs.finalise_task(self.a_task_history_key, STATUS_SUCCESS)
        logging.info(f"Task '{self.task_conf.task_name}' completed successfully")

    def process(self):
        """Main processor function: run the full extract -> finalize pipeline."""
        self._extract()
        self._enrich()
        self._upload()
        self._process_remote()
        self._finalize()

View File

@@ -0,0 +1,52 @@
import logging
import csv
import os
from .base import TaskProcessor
class CSVTaskProcessor(TaskProcessor):
    """Task processor for CSV sources: keeps and renames a configured column subset."""

    def _extract(self):
        """Stream the source CSV, keeping only the configured columns (renamed),
        and atomically write the result to self.output_filepath."""
        source_path = self.global_config.source_filepath
        target_path = self.output_filepath
        enc = self.global_config.encoding_type
        logging.info(f"Reading source CSV file at '{source_path}'")
        # Stream row-by-row through a scratch file, then rename atomically.
        scratch_path = target_path + ".tmp"
        with open(source_path, newline="", encoding=enc) as src, open(
            scratch_path, newline="", encoding=enc, mode="w"
        ) as dst:
            rows = csv.reader(src)
            out = csv.writer(dst, quoting=csv.QUOTE_ALL)
            source_headers = next(rows)
            # csv_entries holds (new_name, original_name) pairs.
            wanted = [old for _, old in self.csv_entries]
            renamed = [new for new, _ in self.csv_entries]
            # Fail fast if any configured source header is absent.
            missing = [h for h in wanted if h not in source_headers]
            if missing:
                raise ValueError(
                    f"The following headers are not in the input CSV: {missing}"
                )
            positions = [source_headers.index(h) for h in wanted]
            out.writerow(renamed)
            for record in rows:
                out.writerow([record[pos] for pos in positions])
        os.replace(scratch_path, target_path)
        logging.info(f"Core data written to CSV file at '{target_path}'")

View File

@@ -0,0 +1,30 @@
import logging
from .base import TaskProcessor
from mrds.utils import (
xml_utils,
csv_utils,
)
class XMLTaskProcessor(TaskProcessor):
    """Task processor for XML sources: XPath-extracts rows and writes them as CSV."""

    def _extract(self):
        """Extract the configured XPath values from the source XML and dump them to CSV."""
        extracted = xml_utils.extract_data(
            self.global_config.source_filepath,
            self.xpath_entries,
            self.xml_position_entries,
            self.task_conf.namespaces,
            self.workflow_context,
            self.global_config.encoding_type,
        )
        logging.info(f"CSV data extracted for task '{self.task_conf.task_name}'")
        logging.info(f"Writing core data to CSV file at '{self.output_filepath}'")
        csv_utils.write_data_to_csv_file(
            self.output_filepath, extracted, self.global_config.encoding_type
        )
        logging.info(f"Core data written to CSV file at '{self.output_filepath}'")

View File

@@ -0,0 +1,69 @@
import csv
import os
# Surrogate keys are task_history_key * multiplier + row number, so each
# task run occupies a disjoint key range (up to 1e9 rows per task).
TASK_HISTORY_MULTIPLIER = 1_000_000_000
def read_csv_file(csv_filepath, encoding_type="utf-8"):
    """Read a whole CSV file; return (header_row, data_rows)."""
    with open(csv_filepath, "r", newline="", encoding=encoding_type) as csvfile:
        all_rows = list(csv.reader(csvfile))
    return all_rows[0], all_rows[1:]
def write_data_to_csv_file(csv_filepath, data, encoding_type="utf-8"):
    """Write data ({'headers': [...], 'rows': [...]}) to csv_filepath,
    fully quoted, via a .tmp file and an atomic rename."""
    scratch = csv_filepath + ".tmp"
    with open(scratch, "w", newline="", encoding=encoding_type) as handle:
        out = csv.writer(handle, quoting=csv.QUOTE_ALL)
        out.writerows([data["headers"], *data["rows"]])
    os.replace(scratch, csv_filepath)
def add_static_columns(data_rows, headers, static_entries):
    """Set each static column to its fixed value, appending the column when
    absent. Mutates headers and data_rows in place."""
    for column_header, value in static_entries:
        if column_header in headers:
            idx = headers.index(column_header)
            for row in data_rows:
                row[idx] = value
        else:
            headers.append(column_header)
            for row in data_rows:
                row.append(value)
def add_a_key_columns(data_rows, headers, a_key_entries, task_history_key):
    """Fill surrogate-key columns with key*multiplier + 1-based row number,
    appending the column when absent. Mutates in place."""
    base = int(task_history_key) * TASK_HISTORY_MULTIPLIER
    for column_header in a_key_entries:
        if column_header in headers:
            idx = headers.index(column_header)
            for rownum, row in enumerate(data_rows, start=1):
                row[idx] = str(base + rownum)
        else:
            headers.append(column_header)
            for rownum, row in enumerate(data_rows, start=1):
                row.append(str(base + rownum))
def add_workflow_key_columns(data_rows, headers, workflow_key_entries, workflow_key):
    """Fill workflow-key columns with the run's key, appending the column
    when absent. Mutates in place."""
    for column_header in workflow_key_entries:
        if column_header in headers:
            idx = headers.index(column_header)
            for row in data_rows:
                row[idx] = workflow_key
        else:
            headers.append(column_header)
            for row in data_rows:
                row.append(workflow_key)
def rearrange_columns(headers, data_rows, column_order):
    """Return (headers, rows) reordered to column_order; names missing from
    headers are silently dropped."""
    positions = {name: pos for pos, name in enumerate(headers)}
    keep = [positions[name] for name in column_order if name in positions]
    new_headers = [headers[pos] for pos in keep]
    new_rows = [[row[pos] for pos in keep] for row in data_rows]
    return new_headers, new_rows

View File

@@ -0,0 +1,177 @@
from . import oraconn
from . import sql_statements
from . import utils
# Get the next load id from the sequence
#
# Workflows
#
def process_source_file_from_event(resource_id: str):
    """
    Handle an object-storage event: parse the object URI and process the file.

    Expects an object URI in the form /n/<namespace>/b/<bucket>/o/<object>,
    e.g. /n/frcnomajoc7v/b/dmarsdb1/o/sqlnet.log, and calls
    process_source_file with the prefix and file name extracted from it.
    """
    _, _, prefix, file_name = utils.parse_uri_with_regex(resource_id)
    process_source_file(prefix, file_name)
def process_source_file(prefix: str, filename: str):
    """Invoke CT_MRDS.FILE_MANAGER.PROCESS_SOURCE_FILE for <prefix>/<filename>."""
    # rstrip to cater for cases where the prefix is passed with a trailing slash.
    # Fixed: the object key now uses the filename argument (it was previously
    # a literal placeholder and the parameter was unused).
    sourcefile = f"{prefix.rstrip('/')}/{filename}"
    # Connect outside the try so a failed connect cannot raise NameError
    # on conn.close() in the finally block.
    conn = oraconn.connect("MRDS_LOADER")
    try:
        oraconn.run_proc(conn, "CT_MRDS.FILE_MANAGER.PROCESS_SOURCE_FILE", [sourcefile])
        conn.commit()
    finally:
        conn.close()
def execute_query(query, query_parameters=None, account_alias="MRDS_LOADER"):
    """Run a query, commit, and return the first column of every fetched row."""
    # Connect outside the try so a failed connect cannot raise NameError
    # on conn.close() in the finally block.
    conn = oraconn.connect(account_alias)
    try:
        curs = conn.cursor()
        if query_parameters is not None:
            curs.execute(query, query_parameters)
        else:
            curs.execute(query)
        query_result = curs.fetchall()
        conn.commit()
    finally:
        conn.close()
    return [t[0] for t in query_result]
def get_file_prefix(source_key, source_file_id, table_id):
    """Return the bucket path configured for (source_key, source_file_id, table_id)."""
    # Connect outside the try: a failed connect must not hit conn.close().
    conn = oraconn.connect("MRDS_LOADER")
    try:
        curs = conn.cursor()
        curs.execute(
            sql_statements.get_sql("get_file_prefix"),
            [source_key, source_file_id, table_id],
        )
        query_result = curs.fetchone()
        conn.commit()
    finally:
        conn.close()
    return query_result[0]
def get_inbox_bucket():
    """Return the inbox bucket name from the FILE_MANAGER database package."""
    # Connect outside the try: a failed connect must not hit conn.close().
    conn = oraconn.connect("MRDS_LOADER")
    try:
        ret = oraconn.run_func(conn, "CT_MRDS.FILE_MANAGER.GET_INBOX_BUCKET", str, [])
        conn.commit()
    finally:
        conn.close()
    return ret
def get_data_bucket():
    """Return the data bucket name from the FILE_MANAGER database package."""
    # Connect outside the try: a failed connect must not hit conn.close().
    conn = oraconn.connect("MRDS_LOADER")
    try:
        ret = oraconn.run_func(conn, "CT_MRDS.FILE_MANAGER.GET_DATA_BUCKET", str, [])
        conn.commit()
    finally:
        conn.close()
    return ret
def add_source_file_config(
    source_key,
    source_file_type,
    source_file_id,
    source_file_desc,
    source_file_name_pattern,
    table_id,
    template_table_name,
):
    """Register a source-file configuration via FILE_MANAGER.ADD_SOURCE_FILE_CONFIG."""
    # Connect outside the try: a failed connect must not hit conn.close().
    conn = oraconn.connect("MRDS_LOADER")
    try:
        ret = oraconn.run_proc(
            conn,
            "CT_MRDS.FILE_MANAGER.ADD_SOURCE_FILE_CONFIG",
            [
                source_key,
                source_file_type,
                source_file_id,
                source_file_desc,
                source_file_name_pattern,
                table_id,
                template_table_name,
            ],
        )
        conn.commit()
    finally:
        conn.close()
    return ret
def add_column_date_format(template_table_name, column_name, date_format):
    """Register a column date format via FILE_MANAGER.ADD_COLUMN_DATE_FORMAT."""
    # Connect outside the try: a failed connect must not hit conn.close().
    conn = oraconn.connect("MRDS_LOADER")
    try:
        ret = oraconn.run_proc(
            conn,
            "CT_MRDS.FILE_MANAGER.ADD_column_date_format",
            [template_table_name, column_name, date_format],
        )
        conn.commit()
    finally:
        conn.close()
    return ret
def execute(stmt):
    """Execute a single statement as MRDS_LOADER and commit."""
    # Connect outside the try: a failed connect must not hit conn.close().
    conn = oraconn.connect("MRDS_LOADER")
    try:
        curs = conn.cursor()
        curs.execute(stmt)
        conn.commit()
    finally:
        conn.close()
def create_external_table(table_name, template_table_name, prefix):
    """Create an external table over the ODS bucket via FILE_MANAGER.CREATE_EXTERNAL_TABLE."""
    # Connect outside the try: a failed connect must not hit conn.close().
    conn = oraconn.connect("ODS_LOADER")
    try:
        ret = oraconn.run_proc(
            conn,
            "CT_MRDS.FILE_MANAGER.CREATE_EXTERNAL_TABLE",
            [table_name, template_table_name, prefix, get_bucket("ODS")],
        )
        conn.commit()
    finally:
        conn.close()
    return ret
def get_bucket(bucket):
    """Return the URI of the named bucket from FILE_MANAGER.GET_BUCKET_URI."""
    # Connect outside the try: a failed connect must not hit conn.close().
    conn = oraconn.connect("MRDS_LOADER")
    try:
        ret = oraconn.run_func(
            conn, "CT_MRDS.FILE_MANAGER.GET_BUCKET_URI", str, [bucket]
        )
        conn.commit()
    finally:
        conn.close()
    return ret

View File

@@ -0,0 +1,97 @@
from . import oraconn
from . import sql_statements
from . import static_vars
from . import manage_files
def init_workflow(database_name: str, workflow_name: str, workflow_run_id: str):
    """Register a new workflow run and return its A_WORKFLOW_HISTORY_KEY."""
    # Connect outside the try so a failed connect cannot raise NameError
    # on conn.close() in the finally block.
    conn = oraconn.connect("MRDS_LOADER")
    try:
        a_workflow_history_key = oraconn.run_func(
            conn,
            "CT_MRDS.WORKFLOW_MANAGER.INIT_WORKFLOW",
            int,
            [database_name, workflow_run_id, workflow_name],
        )
        conn.commit()
    finally:
        conn.close()
    return a_workflow_history_key
def finalise_workflow(a_workflow_history_key: int, workflow_status: str):
    """Mark a workflow run finished via WORKFLOW_MANAGER.FINALISE_WORKFLOW."""
    # Connect outside the try: a failed connect must not hit conn.close().
    conn = oraconn.connect("MRDS_LOADER")
    try:
        oraconn.run_proc(
            conn,
            "CT_MRDS.WORKFLOW_MANAGER.FINALISE_WORKFLOW",
            [a_workflow_history_key, workflow_status],
        )
        conn.commit()
    finally:
        conn.close()
def init_task(task_name: str, task_run_id: str, a_workflow_history_key: int):
    """Register a new task run and return its A_TASK_HISTORY_KEY."""
    # Connect outside the try: a failed connect must not hit conn.close().
    conn = oraconn.connect("MRDS_LOADER")
    try:
        a_task_history_key = oraconn.run_func(
            conn,
            "CT_MRDS.WORKFLOW_MANAGER.INIT_TASK",
            int,
            [task_run_id, task_name, a_workflow_history_key],
        )
        conn.commit()
    finally:
        conn.close()
    return a_task_history_key
def finalise_task(a_task_history_key: int, task_status: str):
    """Mark a task row finished (TASK_END / TASK_SUCCESSFUL) in A_TASK_HISTORY."""
    # Connect outside the try: a failed connect must not hit conn.close().
    conn = oraconn.connect("MRDS_LOADER")
    try:
        curs = conn.cursor()
        # Binds are positional: [status, key].
        curs.execute(
            sql_statements.get_sql("finalise_task"), [task_status, a_task_history_key]
        )
        conn.commit()
    finally:
        conn.close()
def set_workflow_property(
    wf_history_key: int, service_name: str, property: str, value: str
):
    """Set a workflow property via WORKFLOW_MANAGER.SET_WORKFLOW_PROPERTY."""
    # Connect outside the try: a failed connect must not hit conn.close().
    conn = oraconn.connect("MRDS_LOADER")
    try:
        ret = oraconn.run_proc(
            conn,
            "CT_MRDS.WORKFLOW_MANAGER.SET_WORKFLOW_PROPERTY",
            [wf_history_key, service_name, property, value],
        )
        conn.commit()
    finally:
        conn.close()
    return ret
def select_ods_tab(table_name: str, value: str, condition="1 = 1"):
    """
    SELECT `value` FROM `table_name` WHERE `condition` via the ODS_LOADER account.

    NOTE(review): the query is assembled by string interpolation, so arguments
    must come from trusted code only (SQL injection risk otherwise).
    """
    query = f"select {value} from {table_name} where {condition}"
    print(f"query = |{query}|")
    return manage_files.execute_query(query=query, account_alias="ODS_LOADER")

View File

@@ -0,0 +1,53 @@
import oci
def get_client():
    """
    Build an OCI ObjectStorageClient.

    Authentication uses Resource Principals (OCI Container Instances) when
    available and falls back to Instance Principals (VMs) on error.
    """
    try:
        signer = oci.auth.signers.get_resource_principals_signer()
    except Exception:
        # Not running as a resource principal - fall back to instance principal.
        # (Was a bare `except:`, which would also swallow KeyboardInterrupt.)
        signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
    # The empty dict is an empty config: all auth comes from the signer.
    client = oci.object_storage.ObjectStorageClient({}, signer=signer)
    return client
def list_bucket(client, namespace, bucket, prefix):
    """List objects under prefix and return the response .data.

    See https://docs.oracle.com/en-us/iaas/tools/python/2.135.0/api/request_and_response.html#oci.response.Response
    for all response attributes.
    """
    response = client.list_objects(namespace, bucket, prefix=prefix)
    return response.data
def upload_file(client, source_filename, namespace, bucket, prefix, target_filename):
    """Upload a local file to <bucket>/<prefix>/<target_filename>."""
    object_name = f"{prefix.rstrip('/')}/{target_filename}"
    with open(source_filename, "rb") as in_file:
        client.put_object(namespace, bucket, object_name, in_file)
def clean_folder(client, namespace, bucket, prefix):
    """Delete every object under the given prefix."""
    objects = client.list_objects(namespace, bucket, prefix=prefix)
    for o in objects.data.objects:
        # o.name is already the full object key (the delete below uses it
        # directly); the old log line wrongly prepended the prefix again.
        print(f"Deleting {o.name}")
        client.delete_object(namespace, bucket, o.name)
def delete_file(client, file, namespace, bucket, prefix):
    """Delete the single object at <prefix>/<file>."""
    object_name = f"{prefix.rstrip('/')}/{file}"
    client.delete_object(namespace, bucket, object_name)
def download_file(client, namespace, bucket, prefix, source_filename, target_filename):
    """Download <prefix>/<source_filename> to a local file, streaming 1 MiB chunks."""
    response = client.get_object(
        namespace, bucket, f"{prefix.rstrip('/')}/{source_filename}"
    )
    with open(target_filename, "wb") as out:
        for chunk in response.data.raw.stream(1024 * 1024, decode_content=False):
            out.write(chunk)

View File

@@ -0,0 +1,38 @@
import oracledb
import os
import traceback
import sys
def connect(alias):
    """
    Open an oracledb connection using <alias>_DB_USER/_DB_PASS/_DB_TNS env vars.

    NOTE(review): on connection failure this exits the whole process with
    status 1 rather than raising - confirm no caller ever needs to recover
    from a failed connect.
    """
    username = os.getenv(alias + "_DB_USER")
    password = os.getenv(alias + "_DB_PASS")
    tnsalias = os.getenv(alias + "_DB_TNS")
    connstr = username + "/" + password + "@" + tnsalias
    # Force thick mode (Oracle Client libraries).
    oracledb.init_oracle_client()
    try:
        return oracledb.connect(connstr)
    except oracledb.DatabaseError as db_err:
        tb = traceback.format_exc()
        print(f"DatabaseError connecting to '{alias}': {db_err}\n{tb}", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        tb = traceback.format_exc()
        print(f"Unexpected error connecting to '{alias}': {exc}\n{tb}", file=sys.stderr)
        sys.exit(1)
def run_proc(connection, proc: str, param: list):
    """Call stored procedure `proc` with positional parameters `param`.

    The cursor is always closed (the old version leaked it), and the
    annotation `param: []` (a list literal) is corrected to `list`.
    """
    curs = connection.cursor()
    try:
        curs.callproc(proc, param)
    finally:
        curs.close()
def run_func(connection, proc: str, rettype, param: list):
    """Call stored function `proc`, returning its result coerced to `rettype`.

    The cursor is always closed (the old version leaked it), and the
    annotation `param: []` (a list literal) is corrected to `list`.
    """
    curs = connection.cursor()
    try:
        return curs.callfunc(proc, rettype, param)
    finally:
        curs.close()

View File

@@ -0,0 +1,46 @@
import oci
import ast
import base64
# Specify the OCID of the secret to retrieve
def get_secretcontents(ocid):
    """
    Fetch the raw secret bundle for `ocid` from OCI Vault.

    Authentication uses Resource Principals (OCI Container Instances) when
    available and falls back to Instance Principals (VMs) on error.
    """
    try:
        signer = oci.auth.signers.get_resource_principals_signer()
    except Exception:
        # Not running as a resource principal - fall back to instance principal.
        # (Was a bare `except:`, which would also swallow KeyboardInterrupt.)
        signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
    # Create secret client and retrieve content
    secretclient = oci.secrets.SecretsClient({}, signer=signer)
    secretcontents = secretclient.get_secret_bundle(secret_id=ocid)
    return secretcontents
def get_password(ocid):
    """Return the 'password' field of the dict stored (base64-encoded) in the secret."""
    bundle = get_secretcontents(ocid)
    encoded = bundle.data.secret_bundle_content.content
    decoded = base64.b64decode(encoded.encode("ascii")).decode("ascii")
    # The secret content is a Python-literal dict, e.g. "{'password': '...'}"
    return ast.literal_eval(decoded)["password"]
def get_secret(ocid):
    """Return the secret content decoded from base64 as UTF-8 text."""
    bundle = get_secretcontents(ocid)
    encoded = bundle.data.secret_bundle_content.content
    return base64.b64decode(encoded).decode("UTF-8")

View File

@@ -0,0 +1,106 @@
import re
import logging
def verify_run_id(run_id, context=None):
    r"""
    Verify run_id for security compliance.

    Args:
        run_id (str): The run_id to verify.
        context (dict, optional): Airflow context, used only for logging.

    Returns:
        str: The verified (stripped) run_id.

    Raises:
        ValueError: If run_id is empty, not a string, too long, contains
            characters outside [a-zA-Z0-9_\-:+.T], or matches a known
            attack pattern.
    """
    dangerous_patterns = [
        r"\.\./",
        r"\.\.\\",
        r"<script",
        r"javascript:",
        r"union\s+select",
        r"drop\s+table",
        r"insert\s+into",
        r"delete\s+from",
        r"exec\s*\(",
        r"system\s*\(",
        r"eval\s*\(",
        r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]",
    ]
    try:
        # Basic shape checks
        if not run_id or not isinstance(run_id, str):
            raise ValueError(
                f"Invalid run_id: must be non-empty string, got: {type(run_id).__name__}"
            )
        run_id = run_id.strip()
        if not 1 <= len(run_id) <= 250:
            raise ValueError(
                f"Invalid run_id: length must be 1-250 chars, got: {len(run_id)}"
            )
        # Allow only safe characters
        if re.match(r"^[a-zA-Z0-9_\-:+.T]+$", run_id) is None:
            suspicious_chars = "".join(
                {c for c in run_id if not re.match(r"[a-zA-Z0-9_\-:+.T]", c)}
            )
            logging.warning(f"SECURITY: Invalid chars in run_id: '{suspicious_chars}'")
            raise ValueError("Invalid run_id: contains unsafe characters")
        # Check for known attack patterns
        for pattern in dangerous_patterns:
            if re.search(pattern, run_id, re.IGNORECASE):
                logging.error(f"SECURITY: Dangerous pattern in run_id: '{run_id}'")
                raise ValueError("Invalid run_id: contains dangerous pattern")
        # Log success, including the DAG id when a context is supplied
        if context:
            dag = context.get("dag")
            dag_id = getattr(dag, "dag_id", "unknown") if dag else "unknown"
            logging.info(f"run_id verified: '{run_id}' for DAG: '{dag_id}'")
        return run_id
    except Exception as e:
        logging.error(
            f"SECURITY: run_id verification failed: '{run_id}', Error: {str(e)}"
        )
        raise ValueError(f"run_id verification failed: {str(e)}")
def get_verified_run_id(context):
    """
    Extract the run_id from an Airflow context (ti.run_id preferred, then
    the 'run_id' key) and verify it.

    Args:
        context (dict): Airflow context.

    Returns:
        str: Verified run_id.

    Raises:
        ValueError: if no run_id can be extracted or verification fails.
    """
    try:
        run_id = None
        if context:
            if "ti" in context:
                run_id = context["ti"].run_id
            elif "run_id" in context:
                run_id = context["run_id"]
        if not run_id:
            raise ValueError("Could not extract run_id from context")
        return verify_run_id(run_id, context)
    except Exception as e:
        logging.error(f"Failed to get verified run_id: {str(e)}")
        raise

View File

@@ -0,0 +1,68 @@
# Registry of named SQL statements used by manage_runs/manage_files.
# All statements are executed with positional binding (a plain list), so the
# bind names are documentation only - but they should still match the columns.
sql_statements = {}
#
# Workflows
#
# register_workflow: Register a new workflow run.
# Fixed: column name was misspelled WORKFLOW_SSUCCESSFUL.
sql_statements[
    "register_workflow"
] = """INSERT INTO CT_MRDS.A_WORKFLOW_HISTORY
(A_WORKFLOW_HISTORY_KEY, WORKFLOW_RUN_ID,
WORKFLOW_NAME, WORKFLOW_START, WORKFLOW_SUCCESSFUL)
VALUES (:a_workflow_history_key, :workflow_run_id, :workflow_name, SYSTIMESTAMP, :running_status)
"""
# get_a_workflow_history_key: get new key from sequence
sql_statements["get_a_workflow_history_key"] = (
    "SELECT CT_MRDS.A_WORKFLOW_HISTORY_KEY_SEQ.NEXTVAL FROM DUAL"
)
# finalise_workflow: close out the workflow row after completion
sql_statements[
    "finalise_workflow"
] = """UPDATE CT_MRDS.A_WORKFLOW_HISTORY
SET WORKFLOW_END = SYSTIMESTAMP, WORKFLOW_SUCCESSFUL = :workflow_status
WHERE A_WORKFLOW_HISTORY_KEY = :a_workflow_history_key
"""
#
# Tasks
#
# register_task: Register a new task run.
# Fixed: the VALUES clause was a copy of register_workflow's (5 workflow
# binds for 6 task columns); binds now match the column list.
sql_statements[
    "register_task"
] = """INSERT INTO CT_MRDS.A_TASK_HISTORY (A_TASK_HISTORY_KEY,
A_WORKFLOW_HISTORY_KEY, TASK_RUN_ID,
TASK_NAME, TASK_START, TASK_SUCCESSFUL)
VALUES (:a_task_history_key, :a_workflow_history_key, :task_run_id, :task_name, SYSTIMESTAMP, :running_status)
"""
# get_a_task_history_key: get new key from sequence
sql_statements["get_a_task_history_key"] = (
    "SELECT CT_MRDS.A_TASK_HISTORY_KEY_SEQ.NEXTVAL FROM DUAL"
)
# finalise_task: close out the task row after completion.
# Bind names renamed from workflow terms to task terms for clarity; callers
# bind positionally, so behavior is unchanged.
sql_statements[
    "finalise_task"
] = """UPDATE CT_MRDS.A_TASK_HISTORY
SET TASK_END = SYSTIMESTAMP, TASK_SUCCESSFUL = :task_status
WHERE A_TASK_HISTORY_KEY = :a_task_history_key
"""
#
# Files
#
sql_statements["get_file_prefix"] = (
    "SELECT CT_MRDS.FILE_MANAGER.GET_BUCKET_PATH(:source_key, :source_file_id, :table_id) FROM DUAL"
)
def get_sql(stmt_id: str):
if stmt_id in sql_statements:
return sql_statements[stmt_id]
else:
return

View File

@@ -0,0 +1,6 @@
#
# Task management variables
#
# Status values bound into the *_SUCCESSFUL columns of the CT_MRDS history
# tables (see the register_*/finalise_* SQL statements, which bind
# :running_status / :workflow_status).
# NOTE(review): failed/success are single-char flags ("N"/"Y") while running
# is the word "RUNNING" -- confirm the target column accepts both widths.
status_running: str = "RUNNING"
status_failed: str = "N"
status_success: str = "Y"

View File

@@ -0,0 +1,83 @@
import re
def parse_uri_with_regex(uri):
    """
    Parse an Oracle Object Storage URI of the form
    '/n/{namespace}/b/{bucketname}/o/{object_path}'.

    Parameters:
        uri (str): The URI string to parse.

    Returns:
        tuple: (namespace, bucket_name, prefix, object_name) where prefix is
        everything up to and including the last '/' of the object path, or
        '' when the path has no directory part.

    Raises:
        ValueError: if the URI does not match the expected layout.
    """
    match = re.match(r"^/n/(?P<ns>[^/]+)/b/(?P<bucket>[^/]+)/o/(?P<path>.*)$", uri)
    if match is None:
        raise ValueError("Invalid URI format")
    object_path = match.group("path")
    # rpartition splits at the LAST '/'; when no '/' is present the separator
    # comes back empty and the whole path is the object name.
    head, sep, object_name = object_path.rpartition("/")
    prefix = head + "/" if sep else ""
    return match.group("ns"), match.group("bucket"), prefix, object_name
def parse_output_columns(output_columns):
    """
    Partition output-column definitions by their "type" field.

    Each entry is a dict with at least "type" and "column_header"; most types
    also carry "value", and "xpath" entries carry "is_key". Entries with an
    unrecognised type contribute only to the column order.

    Returns a 7-tuple of lists:
    (xpath, csv_header, static, a_key, workflow_key, xml_position, column_order).
    """
    xpath_entries = []
    csv_entries = []
    static_entries = []
    a_key_entries = []
    workflow_key_entries = []
    xml_position_entries = []
    column_order = []
    for entry in output_columns:
        kind = entry["type"]
        header = entry["column_header"]
        # Every entry is recorded in the overall column order, even if its
        # type is not one of the recognised kinds below.
        column_order.append(header)
        if kind == "xpath":
            xpath_entries.append((entry["value"], header, entry["is_key"]))
        elif kind == "csv_header":
            csv_entries.append((header, entry["value"]))
        elif kind == "static":
            static_entries.append((header, entry["value"]))
        elif kind == "a_key":
            a_key_entries.append(header)
        elif kind == "workflow_key":
            workflow_key_entries.append(header)
        elif kind == "xpath_element_id":  # TODO - update all xml_position namings to xpath_element_id
            xml_position_entries.append((entry["value"], header))
    return (
        xpath_entries,
        csv_entries,
        static_entries,
        a_key_entries,
        workflow_key_entries,
        xml_position_entries,
        column_order,
    )

View File

@@ -0,0 +1,23 @@
import oci
import ast
import base64
# Specify the OCID of the secret to retrieve
def get_password(ocid):
    """Fetch the 'password' field from an OCI Vault secret bundle.

    Parameters:
        ocid (str): OCID of the secret to retrieve.

    Returns:
        str: The value stored under the "password" key of the decoded
        secret, which is expected to be a base64-encoded Python dict literal.

    Raises:
        KeyError: if the decoded secret has no "password" key.
    """
    # FIX: original had a duplicated assignment ('signer = signer = ...').
    # Authenticate via instance principals (no local ~/.oci/config needed).
    signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
    secret_client = oci.secrets.SecretsClient({}, signer=signer)
    secret_bundle = secret_client.get_secret_bundle(secret_id=ocid)
    # The secret content is base64-encoded text holding a dict literal,
    # e.g. "{'password': '...'}"; decode, then parse safely with
    # ast.literal_eval (never eval) since the vault content is data.
    key_b64 = secret_bundle.data.secret_bundle_content.content
    key_text = base64.b64decode(key_b64.encode("ascii")).decode("ascii")
    secret_dict = ast.literal_eval(key_text)
    return secret_dict["password"]

View File

@@ -0,0 +1,177 @@
import xmlschema
import hashlib
from lxml import etree
from typing import Dict, List
def validate_xml(xml_file, xsd_file):
    """Validate *xml_file* against the XSD schema in *xsd_file*.

    Returns a (bool, str) tuple: (True, success message) when the document
    validates, otherwise (False, a message describing the failure).
    Never raises -- every error is folded into the returned message.
    """
    try:
        # Strict validation: the schema itself must also be well-formed.
        schema = xmlschema.XMLSchema(xsd_file, validation="strict")
        schema.validate(xml_file)
    except xmlschema.validators.exceptions.XMLSchemaValidationError as e:
        return False, f"XML validation error: {str(e)}"
    except xmlschema.validators.exceptions.XMLSchemaException as e:
        return False, f"XML schema error: {str(e)}"
    except Exception as e:
        return False, f"An error occurred during XML validation: {str(e)}"
    return True, "XML file is valid against the provided XSD schema."
def extract_data(
    filename,
    xpath_columns,  # List[(expr, header, is_key)]
    xml_position_columns,  # List[(expr, header)]
    namespaces,
    workflow_context,
    encoding_type="utf-8",
):
    """
    Parses an XML file using XPath expressions and extracts tabular data.

    Parameters:
    - filename (str): The path to the XML file to parse.
    - xpath_columns (list): A list of tuples, each containing:
        - XPath expression (str)
        - CSV column header (str)
        - Indicator if the field is a key ('Y' or 'N')
      Non-key expressions drive the row count; key expressions yield a
      single value repeated on every row.
    - xml_position_columns (list): tuples of (XPath expression, header).
      For each row, the nearest matched ancestor of the row's first
      non-key element is hashed into a run-specific position id.
    - namespaces (dict): Namespace mapping needed for lxml's xpath()
    - workflow_context (dict): must contain "a_workflow_history_key",
      used as the salt for position hashing.
    - encoding_type (str): currently unused in this body -- TODO confirm
      whether it should be passed to the parser.

    Returns:
    - dict: {"headers": [...], "rows": [[...], ...]} with extracted data.
    """
    parser = etree.XMLParser(remove_blank_text=True)
    tree = etree.parse(filename, parser)
    root = tree.getroot()
    # Separate out key vs nonkey columns
    key_cols = [ (expr, h) for expr, h, k in xpath_columns if k == "Y" ]
    nonkey_cols = [ (expr, h) for expr, h, k in xpath_columns if k == "N" ]
    # Evaluate every nonkey XPath and keep the ELEMENT nodes
    # (xpath() may also return strings for text()/attribute expressions;
    # both cases are handled when rows are built below).
    nonkey_elements = {}
    for expr, header in nonkey_cols:
        elems = root.xpath(expr, namespaces=namespaces)
        nonkey_elements[header] = elems
    # figure out how many rows total we need
    # that's the maximum length of any of the nonkey lists
    if nonkey_elements:
        row_count = max(len(lst) for lst in nonkey_elements.values())
    else:
        row_count = 0
    # pad every nonkey list up to row_count with `None`
    # so ragged columns still yield rectangular rows ("" in short columns).
    for header, lst in nonkey_elements.items():
        if len(lst) < row_count:
            lst.extend([None] * (row_count - len(lst)))
    # key columns: take only the FIRST match of each key expression;
    # missing keys become "".
    key_values = []
    for expr, header in key_cols:
        nodes = root.xpath(expr, namespaces=namespaces)
        if not nodes:
            key_values.append("")
        else:
            first = nodes[0]
            txt = (first.text if isinstance(first, etree._Element) else str(first)) or ""
            key_values.append(txt.strip())
    # xml_position columns: pre-evaluate each expression once; the results
    # act as the candidate ancestor sets searched per row below.
    xml_positions = {}
    for expr, header in xml_position_columns:
        xml_positions[header] = root.xpath(expr, namespaces=namespaces)
    # prepare headers -- order is: nonkey, then key, then position columns,
    # matching the per-row append order below.
    headers = [h for _, h in nonkey_cols] + [h for _, h in key_cols] + [h for _, h in xml_position_columns]
    # build rows
    rows = []
    for i in range(row_count):
        row = []
        # nonkey data: element text, stringified non-element result, or ""
        for expr, header in nonkey_cols:
            elem = nonkey_elements[header][i]
            text = ""
            if isinstance(elem, etree._Element):
                text = elem.text or ""
            elif elem is not None:
                text = str(elem)
            row.append(text.strip())
        # key columns (same values on every row)
        row.extend(key_values)
        # xml_position columns: anchor on this row's FIRST nonkey element
        # and climb toward the root until an element from the pre-evaluated
        # candidate list is found.
        for expr, header in xml_position_columns:
            if not nonkey_cols:
                row.append("")
                continue
            first_header = nonkey_cols[0][1]
            data_elem = nonkey_elements[first_header][i]
            if data_elem is None:
                row.append("")
                continue
            target_list = xml_positions[header]
            current = data_elem
            found = None
            while current is not None:
                # membership test is by element identity in lxml, so this
                # matches the exact node returned by the position XPath
                if current in target_list:
                    found = current
                    break
                current = current.getparent()
            if not found:
                row.append("")
            else:
                # compute fullpath with indices: each step is tag[n] where n
                # is the 1-based position among preceding same-tag siblings
                path_elems = []
                walk = found
                while walk is not None:
                    idx = 1 + sum(1 for s in walk.itersiblings(preceding=True) if s.tag == walk.tag)
                    path_elems.append(f"{walk.tag}[{idx}]")
                    walk = walk.getparent()
                full_path = "/" + "/".join(reversed(path_elems))
                # salt with the workflow key so ids are unique per run
                row.append(_xml_pos_hasher(full_path, workflow_context["a_workflow_history_key"]))
        rows.append(row)
    return {"headers": headers, "rows": rows}
def _xml_pos_hasher(input_string, salt, hash_length=15):
    """
    Hash an XML position path into a deterministic integer id.

    Parameters:
        input_string (str): The string to hash.
        salt (int): Run-specific salt mixed into the input so ids are
            deterministic within a run but differ between runs.
        hash_length (int): Maximum number of decimal digits kept from the
            hash (default 15).

    Returns:
        int: A deterministic integer derived from SHA-256 of "salt:input".

    Raises:
        ValueError: if hash_length is not a positive integer.
    """
    if hash_length <= 0:
        raise ValueError("Hash length must be a positive integer.")
    # Salt first, then hash, then take the leading decimal digits of the
    # 256-bit digest interpreted as an integer.
    digest = hashlib.sha256(f"{salt}:{input_string}".encode()).hexdigest()
    return int(str(int(digest, 16))[:hash_length])

View File

@@ -0,0 +1,50 @@
import re
from pathlib import Path
from setuptools import setup, find_packages
# extract version from mrds/__init__.py so the version lives in one place
here = Path(__file__).parent
init_py = here / "mrds" / "__init__.py"
_match = re.search(
    r'^__version__\s*=\s*["\']([^"\']+)["\']', init_py.read_text(), re.MULTILINE
)
# FIX: fail with a clear message instead of an opaque AttributeError when
# __version__ cannot be found.
if _match is None:
    raise RuntimeError(f"Unable to find __version__ in {init_py}")
_version = _match.group(1)
setup(
    name="mrds",
    version=_version,
    packages=find_packages(),
    install_requires=[
        "click>=8.0.0,<9.0.0",
        "oci>=2.129.3,<3.0.0",
        "oracledb>=2.5.1,<3.0.0",
        "PyYAML>=6.0.0,<7.0.0",
        "lxml>=5.0.0,<5.3.0",
        "xmlschema>=3.4.0,<3.4.3",
        "cryptography>=3.3.1,<42.0.0",
        "PyJWT>=2.0.0,<3.0.0",
        "requests>=2.25.0,<3.0.0",
    ],
    extras_require={
        "dev": [
            "black==24.10.0",
            "tox==4.23.2",
            "pytest==8.3.4",
        ],
    },
    entry_points={
        "console_scripts": [
            "mrds-cli=mrds.cli:cli_main",
        ],
    },
    author="",
    author_email="",
    description="MRDS module for MarS ETL POC",
    # FIX: original used open("README.md").read(), which leaked the file
    # handle and relied on the platform default encoding.
    long_description=(here / "README.md").read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    url="",
    classifiers=[
        "Programming Language :: Python :: 3",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.11",
)

View File

@@ -0,0 +1,17 @@
# tox.ini
[tox]
envlist = py311, format
[testenv]
deps =
pytest
commands =
pytest
[testenv:format]
basepython = python3
deps =
black
commands =
black --check --diff .