init
This commit is contained in:
73
python/code_generation/generate_code.py
Normal file
73
python/code_generation/generate_code.py
Normal file
@@ -0,0 +1,73 @@
|
||||
# Generate dbt SQL models, Airflow DAGs and DB migration scripts from a
# (schema, table) list. One source-qualifier model, one target model, one
# DAG and one migration statement are rendered per table from templates/.
from jinja2 import Environment, FileSystemLoader
import csv

environment = Environment(loader=FileSystemLoader("templates/"))
template_sq = environment.get_template("m_Template_sq.sql")
template_tgt = environment.get_template("m_Template_target.sql")
template_db = environment.get_template("db_Template.sql")
template_dag = environment.get_template("dag_Template.py")


def _write(filename, content, mode="w"):
    """Write *content* to *filename* (mode 'a' appends) and log the path.

    BUG FIX: the progress messages previously printed a corrupted
    placeholder instead of the actual file name.
    """
    with open(filename, mode=mode, encoding="utf-8") as out:
        out.write(content)
    print(f"... wrote {filename}")


with open("ods_mopdb_plain.txt", "r") as source_file:
    csvFile = csv.reader(source_file)
    header = next(csvFile)  # skip header row (SCHEMA,TABLE)

    for row in csvFile:
        (schema, table) = row

        # Naming convention example:
        # ou_tms,ACTIVITYLOGDUE,TMS,T_ACTIVITYLOGDUE,
        # m_MOPDB_TMS_T_ACTIVITYLOGDUE_OU_TMS_ACTIVITYLOGDUE,w_ODS_TMS_ACTIVITYLOGDUE
        target_table = f"T_{table}"
        source_schema = f"OU_{schema}"
        source_schema_lower = source_schema.lower()
        source_table = table
        mapping_name = f"m_MOPDB_{schema}_{target_table}_{source_schema}_{source_table}"
        workflow_name = f"w_MOPDB_{schema}_{target_table}"

        # Both SQL templates take the same context.
        render_args = dict(
            source_schema=source_schema_lower,
            source_table=table,
            target_schema=schema,
            target_table=target_table,
            mapping_name=mapping_name,
            workflow_name=workflow_name,
        )

        # Source-qualifier model
        _write(f"source_qualifiers/{mapping_name}_SQ.sql",
               template_sq.render(**render_args))

        # Target model
        _write(f"targets/{mapping_name}.sql",
               template_tgt.render(**render_args))

        # Airflow DAG
        _write(f"dags/{workflow_name}.py", template_dag.render(table=table))

        # DB migration: one file per table plus a combined change log.
        db_content = template_db.render(table=table, schema=schema)
        _write(f"db/{source_schema}_{source_table}.sql", db_content)
        _write("db/table_changes.sql", f"{db_content}\n", mode="a")
||||
30
python/code_generation/ods_mopdb_plain.txt
Normal file
30
python/code_generation/ods_mopdb_plain.txt
Normal file
@@ -0,0 +1,30 @@
|
||||
SCHEMA,TABLE
|
||||
TMS,ACMENTRYSTATELEDGERGROUP
|
||||
TMS,ACTIVITYLOGDUE
|
||||
TMS,ACTIVITY_LOG
|
||||
TMS,BALANCE
|
||||
TMS,BLACKOUT_LOG
|
||||
TMS,BRANCH
|
||||
TMS,CALENDAR
|
||||
TMS,CASHFLOW
|
||||
TMS,CLIENT
|
||||
TMS,CUSTODYBALANCE
|
||||
TMS,ECBINSTRUMENTBONDCASHFLOW
|
||||
TMS,EFFECTIVEROLEPROFILE
|
||||
TMS,FINMESSAGELOG
|
||||
TMS,HISTORY_LOG
|
||||
TMS,INSTRUMENTBONDCASHFLOW
|
||||
TMS,INSTRUMENT_REPORT
|
||||
TMS,MARKETINFO
|
||||
TMS,PARAMETER
|
||||
TMS,PORTFOLIOTREE
|
||||
TMS,PRICES
|
||||
TMS,PROPERTY
|
||||
TMS,RECONCILIATION
|
||||
TMS,ROLEPORTFOLIOPROFILE
|
||||
TMS,RULES
|
||||
TMS,SDM_ENTITY_STATE
|
||||
TMS,SECURITYPOSITION
|
||||
TMS,SETTLEMENTCASHFLOW
|
||||
TMS,SETTLEMENTLOG
|
||||
TMS,USERINFORMATION
|
||||
95
python/code_generation/templates/dag_Template.py
Normal file
95
python/code_generation/templates/dag_Template.py
Normal file
@@ -0,0 +1,95 @@
|
||||
from airflow.decorators import dag
|
||||
from airflow.operators.bash import BashOperator
|
||||
from airflow.operators.python import PythonOperator
|
||||
from airflow.utils.dates import days_ago
|
||||
from airflow.utils.trigger_rule import TriggerRule
|
||||
from cosmos import DbtTaskGroup, ProfileConfig, ProjectConfig, RenderConfig
|
||||
|
||||
# Define paths to your dbt files
dbt_root_path = "/home/dbt/DBT/mrds"
dbt_profiles_dir = "/home/dbt/.dbt/profiles.yml"

# "{{table}}" is substituted by the code generator (generate_code.py) when
# this template is rendered into a concrete DAG file.
ODS_TABLE = "{{table}}"
DATABASE_NAME = "MOPDB"
# Naming convention: w_/m_<DB>_<schema>_T_<table>_OU_<schema>_<table>
DAG_NAME = f"w_{DATABASE_NAME}_TMS_T_{ODS_TABLE}_OU_TMS_{ODS_TABLE}"
MAPPING_NAME = f"m_{DATABASE_NAME}_TMS_T_{ODS_TABLE}_OU_TMS_{ODS_TABLE}"
|
||||
|
||||
|
||||
# Capture the current Airflow run_id and share it via XCom.
def retrieve_run_id(**kwargs):
    """Read the run_id from the Airflow context, publish it to XCom under
    the key 'run_id', and return it."""
    context_run_id = kwargs['run_id']
    # Persist the run_id so downstream tasks can fetch it with xcom_pull.
    kwargs['ti'].xcom_push(key='run_id', value=context_run_id)
    return context_run_id
|
||||
|
||||
def check_dag_status(**kwargs):
    """Raise if any other task instance in the current DAG run failed."""
    own_task_id = kwargs['task_instance'].task_id
    for ti in kwargs['dag_run'].get_task_instances():
        is_other_task = ti.task_id != own_task_id
        if is_other_task and ti.state == 'failed':
            raise Exception("Task {} failed. Failing this DAG run".format(ti.task_id))
||||
|
||||
# Define function for the check of the status of the previous tasks
def determine_workflow_status(**kwargs):
    """Return 'Y' when every monitored upstream task succeeded, else 'N'."""
    monitored_tasks = ['retrieve_run_id', 'control_external_run_start', 'mapping_mopdb']
    statuses = kwargs['ti'].xcom_pull(task_ids=monitored_tasks)
    if all(status == 'success' for status in statuses):
        return 'Y'
    return 'N'
|
||||
|
||||
# DAG definition: one run per MOPDB table load. The {% raw %}…{% endraw %}
# markers protect Airflow's own Jinja templating from the code-generation
# render pass.
@dag(
    dag_id=DAG_NAME,
    schedule_interval=None,  # triggered externally; no fixed schedule
    start_date=days_ago(2),
    catchup=False
)
def run_dag():
    """Record the run id, open the external run in dbt, execute the tagged
    dbt models, close the run, then fail the DAG if any task failed."""
    # Retrieve run_id
    retrieve_run_id_task = PythonOperator(
        task_id='retrieve_run_id',
        python_callable=retrieve_run_id,
        provide_context=True,
        # pool='my_custom_pool', # Create pool in Airflow Web UI with one slot to ensure that only one dag can run it at a time.
    )
    # Run dbt macro control_external_run_start
    control_external_run_start = BashOperator(
        task_id='control_external_run_start',
        bash_command=(
            'cd /home/dbt/DBT/mrds && '
            'dbt run-operation control_external_run_start --vars \'{"orchestration_run_id": "{% raw %}{{{% endraw %} task_instance.xcom_pull(task_ids="retrieve_run_id", key="run_id") {% raw %}}}{% endraw %}", "input_service_name": "' + DATABASE_NAME + '", "workflow_name": "' + DAG_NAME + '"}\' '
            '--profiles-dir /home/dbt/.dbt/ --target dev'
        )
    )
    # run dbt taskGroup with tag of the mapping name
    dbtTaskGroup = DbtTaskGroup(
        group_id=MAPPING_NAME,
        project_config=ProjectConfig(
            dbt_project_path = dbt_root_path),
        profile_config=ProfileConfig(
            profiles_yml_filepath = dbt_profiles_dir,
            profile_name="mrds",
            target_name="dev"),
        # Only models tagged with this mapping name are rendered/run.
        render_config=RenderConfig(select=[f"tag:{MAPPING_NAME}"],),
        operator_args={'vars': {'orchestration_run_id': '{% raw %}{{{% endraw %} task_instance.xcom_pull(task_ids="retrieve_run_id", key="run_id") {% raw %}}}{% endraw %}', "input_service_name": DATABASE_NAME, "workflow_name": DAG_NAME }}
    )

    # Close the external run in dbt regardless of upstream outcomes.
    control_external_run_end = BashOperator(
        task_id='control_external_run_end',
        bash_command=(
            'cd /home/dbt/DBT/mrds && '
            'dbt run-operation control_external_run_end --vars \'{"orchestration_run_id": "{% raw %}{{{% endraw %} task_instance.xcom_pull(task_ids="retrieve_run_id", key="run_id") {% raw %}}}{% endraw %}", "input_service_name": "' + DATABASE_NAME + '", "workflow_name": "' + DAG_NAME + '"}\' '
            '--profiles-dir /home/dbt/.dbt/ --target dev'
        ),
        trigger_rule=TriggerRule.ALL_DONE # Run regardless of previous task outcomes
    )

    # Final gate: propagates any upstream failure to the DAG-run state.
    # NOTE(review): determine_workflow_status is defined above but not wired
    # into this DAG — confirm whether it is still needed.
    dag_status = PythonOperator(
        task_id='dag_status',
        provide_context=True,
        python_callable=check_dag_status,
        trigger_rule=TriggerRule.ALL_DONE, # Ensures this task runs even if upstream fails
    )

    # Set task dependencies
    retrieve_run_id_task >> control_external_run_start >> [dbtTaskGroup] >> control_external_run_end >> dag_status

# Register the DAG at module level under its dynamic name so Airflow's
# DagBag discovers it.
globals()[DAG_NAME] = run_dag()
|
||||
2
python/code_generation/templates/db_Template.sql
Normal file
2
python/code_generation/templates/db_Template.sql
Normal file
@@ -0,0 +1,2 @@
|
||||
ALTER TABLE OU_{{schema}}.{{table}} RENAME COLUMN A_ETL_LOAD_SET_FK TO A_WORKFLOW_HISTORY_KEY;
|
||||
DROP TABLE {{schema}}.T_{{table}};
|
||||
11
python/code_generation/templates/m_Template_sq.sql
Normal file
11
python/code_generation/templates/m_Template_sq.sql
Normal file
@@ -0,0 +1,11 @@
|
||||
{% raw %}{{{% endraw %}
|
||||
config(
|
||||
materialized="table",
|
||||
tags=["{{mapping_name}}", "{{source_schema}}", "{{target_schema}}"],
|
||||
alias="{{target_table}}_SQ",
|
||||
schema="{{target_schema}}"
|
||||
)
|
||||
{% raw %}}}{% endraw %}
|
||||
{% raw %}{{{% endraw %}
|
||||
create_table_from_source("{{source_schema}}","{{source_table}}","{{workflow_name}}",get_main_task_name(model.name),['A_KEY','A_WORKFLOW_HISTORY_KEY'])
|
||||
{% raw %}}}{% endraw %}
|
||||
13
python/code_generation/templates/m_Template_target.sql
Normal file
13
python/code_generation/templates/m_Template_target.sql
Normal file
@@ -0,0 +1,13 @@
|
||||
{% raw %}-- depends_on: {{{% endraw %} ref('{{mapping_name}}_SQ') {% raw %}}} {% endraw %}
|
||||
{% raw %}{{{% endraw %}
|
||||
config(
|
||||
tags=["{{mapping_name}}","MOPDB","{{target_schema}}"],
|
||||
alias="{{target_table}}",
|
||||
schema="{{target_schema}}",
|
||||
materialized="incremental" ,
|
||||
incremental_strategy="merge"
|
||||
)
|
||||
{% raw %}}}{% endraw %}
|
||||
{% raw %}{{{% endraw %}
|
||||
create_table_target('{{mapping_name}}_SQ')
|
||||
{% raw %}}}{% endraw %}
|
||||
201
python/connectors/casper/casper_rqsd.py
Normal file
201
python/connectors/casper/casper_rqsd.py
Normal file
@@ -0,0 +1,201 @@
|
||||
import requests
|
||||
import io
|
||||
import zipfile
|
||||
import pandas as pd
|
||||
import os
|
||||
from datetime import datetime
|
||||
import oci
|
||||
from mrds.utils.secrets import get_secret
|
||||
import mrds.utils.manage_runs as runManager
|
||||
import mrds.utils.manage_files as fileManager
|
||||
import mrds.utils.sql_statements as sqls
|
||||
import sys
|
||||
import yaml
|
||||
TASK_HISTORY_MULTIPLIER = 1_000_000_000
|
||||
|
||||
def initialize_task(workflow_context, task_name):
    """Register the start of *task_name* with the run manager.

    Returns the task history key created for this task run.
    """
    run_id = workflow_context["run_id"]
    workflow_key = workflow_context["a_workflow_history_key"]
    return runManager.init_task(task_name, run_id, workflow_key)
|
||||
|
||||
def rqsd_parser(fileName, bucket_path, file, bucket_name):
    """Dispatch a downloaded RQSD workbook to the matching annex parser(s)
    based on markers in its file name.

    SCOPA/SCOPF files feed both annex-1 sheets; RQSDC files feed annex 2.
    Files matching neither marker are ignored.
    """
    is_scop = any(marker in fileName for marker in ("SCOPA", "SCOPF"))
    if is_scop:
        print("SCOP")
        annex_1_1(fileName, bucket_path, file, bucket_name)
        annex_1_2(fileName, bucket_path, file, bucket_name)
        return None
    if "RQSDC" in fileName:
        print("RQSDC")
        return annex_2(fileName, bucket_path, file, bucket_name)
|
||||
|
||||
def _annex_1_upload(fileName, bucket_path, file, bucket_name, sheet_name, annex_suffix):
    """Shared implementation for the two annex-1 sheets.

    Reads *sheet_name* from the Excel payload, enriches it with metadata
    columns derived from the file name, and uploads the result as CSV under
    bucket_path/<annex_suffix>/.

    (annex_1_1 and annex_1_2 were previously byte-for-byte duplicates apart
    from the sheet name and path suffix.)
    """
    fileData = fileName.split("_")
    csv_file_path = fileName[:-4] + ".csv"
    # File-name convention: tokens 2/4/6 carry exercise, NCB and version.
    version_number = fileData[6]
    ref_exercise = fileData[2]
    ncb = fileData[4]

    df = pd.read_excel(file, sheet_name=sheet_name, skiprows=3)
    df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')

    # Metadata columns for lineage/auditing downstream.
    df['file_name'] = os.path.basename(fileName)
    df['ingestion_timestamp'] = datetime.now().isoformat()
    df['version_number'] = version_number
    df['ref_exercise'] = ref_exercise
    df['ncb'] = ncb

    signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
    client = oci.object_storage.ObjectStorageClient({}, signer=signer)  # the first empty dict is an empty config
    client.put_object(
        "frcnomajoc7v",
        bucket_name,
        bucket_path + annex_suffix + "/" + csv_file_path,
        bytes(df.to_csv(index=False), encoding='utf-8'),
    )
    print("Finished uploading {}".format(csv_file_path))
    print(f"CSV saved to {csv_file_path}")


def annex_1_1(fileName, bucket_path, file, bucket_name):
    """Parse the 'Counterparties in scope' sheet and upload it as annex 1_1."""
    _annex_1_upload(fileName, bucket_path, file, bucket_name,
                    "Counterparties in scope", "1_1")


def annex_1_2(fileName, bucket_path, file, bucket_name):
    """Parse the 'Entities to which data relates' sheet and upload it as annex 1_2."""
    _annex_1_upload(fileName, bucket_path, file, bucket_name,
                    "Entities to which data relates", "1_2")
|
||||
|
||||
def annex_2(fileName,bucket_path,file,bucket_name):
    """Parse the 'Data collection template' sheet of an RQSDC workbook,
    enrich it with metadata columns and upload it as CSV under
    bucket_path/2/."""
    fileData=fileName.split("_")
    # Parameters derived from the file-name convention (tokens 2/4/6).
    version_number = fileData[6]
    ref_exercise = fileData[2]
    ncb = fileData[4]

    # Read the first sheet, skip the metadata rows
    df = pd.read_excel(file.getvalue(), sheet_name="Data collection template", skiprows=6)

    # Clean empty rows/columns
    df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')

    # Add metadata columns for lineage/auditing downstream.
    df['file_name'] = os.path.basename(fileName)
    df['ingestion_timestamp'] = datetime.now().isoformat()
    df['version_number'] = version_number
    df['ref_exercise'] = ref_exercise
    df['ncb'] = ncb

    # NOTE(review): no dot before "csv" — correct only if inputs end in a
    # 4-char extension like ".xlsx" (so [:-4] keeps the dot); the annex-1
    # parsers use [:-4] + ".csv" instead. Confirm the expected extension.
    csvName=fileName[:-4]+"csv"
    # Save to CSV in the object store (instance-principal auth).

    signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
    client = oci.object_storage.ObjectStorageClient({}, signer = signer) # the first empty bracket is an empty config
    client.put_object("frcnomajoc7v",bucket_name,bucket_path+"2/"+csvName,bytes(df.to_csv( index=False), encoding='utf-8'))
    print("Finished uploading {}".format(csvName))
    print(f"CSV saved to {csvName}")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def rqsd_preflow(secret_crt_id, secret_key_id, casper_api_url, collection_id):  # downloads the list of files
    """Write the client certificate/key secrets to local files, then query
    the CASPER filevault API for completed, non-test files newer than the
    latest already-ingested processing end time.

    Returns the JSON list of downloadable file entries.
    Raises RuntimeError (chained to the original error) on any failure.
    """
    crt_path = os.getcwd() + "/rqsd_tst.crt"
    key_path = os.getcwd() + "/rqsd_tst.key.pem"
    try:
        with open(key_path, "w") as keyfile:
            keyfile.write(get_secret(secret_key_id))
        with open(crt_path, "w") as crtfile:
            crtfile.write(get_secret(secret_crt_id))
    except Exception as exc:
        # BUG FIX: was a bare `except:` that unconditionally removed both
        # files (crashing if one was never created) and raised a fresh
        # Exception, discarding the cause.
        print("Failed to retrieve certificates from secrets")
        for path in (crt_path, key_path):
            if os.path.exists(path):
                os.remove(path)
        raise RuntimeError("Failed to retrieve certificates from secrets") from exc

    protected_resource_url = casper_api_url + "/casper-api/filevault/"
    try:
        max_date = fileManager.execute_query(
            "SELECT to_char(max(processing_end_time),'YYYY-MM-DD HH24:mi:ss') as MAX_PROCESSING_END_TIME FROM ct_ods.a_casper_filevault"
        )
        # BUG FIX: `max_date is not []` was always True (identity check
        # against a fresh list). Use truthiness and guard against a NULL
        # max() result from an empty table.
        if max_date and max_date[0]:
            filterString = 'isTest eq False and processingStatus eq "PS_COMPLETED" and processingEndTime gt ' + max_date[0].split(' ')[0]
        else:
            filterString = 'isTest eq False and processingStatus eq "PS_COMPLETED"'
        response = requests.get(
            protected_resource_url + "files/" + collection_id,
            headers={"accept": "application/json"},
            cert=(crt_path, key_path),
            verify=False,  # NOTE(review): TLS verification disabled — confirm this is intended for this API
            params={"filter": filterString},
        )

        print(response.text)
        files = response.json()
    except Exception as exc:
        print("Failed to retrieve ACC metadata, error during connection or request")
        raise RuntimeError("Failed to retrieve ACC metadata") from exc
    return files
|
||||
|
||||
|
||||
def rqsd_process(files, casper_api_url, bucket_path, bucket_name):
    """Download each filevault entry and hand it to rqsd_parser for upload.

    On any failure the local certificate files are removed and the error is
    re-raised (chained) so the surrounding task is marked failed.
    """
    crt_path = os.getcwd() + "/rqsd_tst.crt"
    key_path = os.getcwd() + "/rqsd_tst.key.pem"

    # GET request to a protected resource per downloadable file.
    for downloadable in files:
        try:
            print("\n\n")
            response = requests.get(
                casper_api_url + "/casper-api/filevault/download/" + str(downloadable["dcId"]) + '/' + str(downloadable["fileID"]),
                headers={"accept": "application/json"},
                cert=(crt_path, key_path),
                verify=False,  # NOTE(review): TLS verification disabled — confirm intended
            )
            rqsd_parser(downloadable["fileName"], bucket_path, io.BytesIO(response.content), bucket_name)
        except Exception as exc:
            # BUG FIX: was a bare `except:` raising a fresh Exception class,
            # which discarded the original error entirely.
            print(f"Failed to upload file into target bucket, files saved locally in {os.getcwd()}")
            os.remove(crt_path)
            os.remove(key_path)
            raise RuntimeError("Failed to process filevault download") from exc
|
||||
|
||||
|
||||
def add_a_key_column(headers, data_rows, task_history_key):
    """Prepend an A_KEY column: a unique key derived from the task history
    key and the 1-based row position. Mutates *headers* and *data_rows*
    in place."""
    headers.insert(0, 'A_KEY')
    base = int(task_history_key) * TASK_HISTORY_MULTIPLIER
    position = 0
    for row in data_rows:
        position += 1
        row.insert(0, str(base + position))
|
||||
|
||||
def add_workflow_key_column(headers, data_rows, workflow_key):
    """Insert an A_WORKFLOW_HISTORY_KEY column right after A_KEY when
    present (otherwise at position 0). Mutates *headers* and *data_rows*.

    BUG FIX: the header was inserted at index 1 but each row value at
    index 0, misaligning every data column by one. Header and values now
    use the same index (mirrors DevoConnector.add_workflow_key_column).
    """
    insert_idx = 1 if headers and headers[0] == 'A_KEY' else 0
    headers.insert(insert_idx, 'A_WORKFLOW_HISTORY_KEY')
    for row in data_rows:
        row.insert(insert_idx, workflow_key)
|
||||
|
||||
def initialize_config(config_file_path):
    """Load and return the YAML configuration at *config_file_path*.

    Raises FileNotFoundError when the file does not exist.
    """
    if not os.path.exists(config_file_path):
        raise FileNotFoundError(f"Configuration file {config_file_path} not found.")
    with open(config_file_path, "r") as handle:
        return yaml.safe_load(handle)
|
||||
|
||||
|
||||
def main(workflow_context, flow_config_path, env_config_path, env):
    """Entry point for the RQSD CASPER ingestion task.

    Loads flow and environment configuration, registers the task with the
    run manager, retrieves and processes the filevault files, and marks the
    task successful. On failure the chained error propagates to the caller.
    """
    # init setup
    flow_info = initialize_config(flow_config_path)
    envs_info = initialize_config(env_config_path)
    environment_info = envs_info[env]

    a_task_history_key = initialize_task(workflow_context, flow_info['TASK_NAME'])

    # get list of files and process them
    try:
        files = rqsd_preflow(
            environment_info["CERTIFICATE_FILE"],
            environment_info["CERTIFICATE_KEY"],
            environment_info["CASPER_URL"],
            flow_info["COLLECTION_ID"],
        )
        rqsd_process(files, environment_info["CASPER_URL"], flow_info["ODS_PREFIX"], environment_info["BUCKET"])
    except Exception as exc:
        # BUG FIX: was a bare `except:` with `raise(Exception)`, which
        # discarded the original error. (Also removed an unused
        # `current_time` local.)
        print("Failed to retrieve DEVO data, error during connection or request")
        raise RuntimeError("RQSD data retrieval failed") from exc

    # Finalize task
    runManager.finalise_task(a_task_history_key, 'Y')
|
||||
27
python/connectors/casper/config/env_config.yaml
Normal file
27
python/connectors/casper/config/env_config.yaml
Normal file
@@ -0,0 +1,27 @@
|
||||
# Environment Configuration
|
||||
dev:
|
||||
BUCKET: "mrds_inbox_dev"
|
||||
CERTIFICATE_KEY: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya7r33ocatalf6jn6kg2xjhnya6kazlqd3e5gw6yghpd5q"
|
||||
CERTIFICATE_FILE: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyaeva4zvj6xdihljookamhse7jlyassfjb4p45xp46bwba"
|
||||
CASPER_URL: "https://internet.api.casper.tst.aws.tadnet.net"
|
||||
RQSD_COLLECTION_ID: "1537"
|
||||
tst:
|
||||
BUCKET: "mrds_inbox_tst"
|
||||
CERTIFICATE_KEY: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya7r33ocatalf6jn6kg2xjhnya6kazlqd3e5gw6yghpd5q"
|
||||
CERTIFICATE_FILE: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyaeva4zvj6xdihljookamhse7jlyassfjb4p45xp46bwba"
|
||||
CASPER_URL: "https://internet.api.casper.tst.aws.tadnet.net"
|
||||
RQSD_COLLECTION_ID: "1537"
|
||||
acc:
|
||||
BUCKET: "mrds_inbox_acc"
|
||||
CERTIFICATE_KEY: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya5snmftggydoszwchjra3ifa4pyiilgc26uqlhejnhcca"
|
||||
CERTIFICATE_FILE: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyaho5t4qgmlqctew6g6mcnwpz2p7z4nhxooyl6hc5sonfa"
|
||||
CASPER_URL: "https://internet.api.casper.stg.aws.ecb.de"
|
||||
RQSD_COLLECTION_ID: "1116"
|
||||
prd:
|
||||
BUCKET: "mrds_inbox_prd"
|
||||
CERTIFICATE_KEY: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyahmv5sopfsv7nytxdyycehoyl5pd7sz5t2drn27qaneta"
|
||||
CERTIFICATE_FILE: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyame3chyqs6cdl2igeyrvzpj3s4vrndhbgeayt533uhgqa"
|
||||
CASPER_URL: "https://internet.api.casper.prd.aws.ecb.de"
|
||||
RQSD_COLLECTION_ID: "1030"
|
||||
|
||||
|
||||
25
python/connectors/devo/config/env_config.yaml
Normal file
25
python/connectors/devo/config/env_config.yaml
Normal file
@@ -0,0 +1,25 @@
|
||||
# Environment Configuration
|
||||
dev:
|
||||
BUCKET_NAMESPACE: "frcnomajoc7v"
|
||||
BUCKET: "mrds_inbox_dev"
|
||||
DEVO_USERNAME: "ap-informatica-ipcwt"
|
||||
DEVO_HOSTNAME: "impala-proxy-devo-lab21-impala01.dw-devo-lab21.om2y56.b0.cloudera.site"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyavrevwxke46wjgj5nz3cc5kwwsybmngbji4zepones55q"
|
||||
tst:
|
||||
BUCKET_NAMESPACE: "frcnomajoc7v"
|
||||
BUCKET: "mrds_inbox_tst"
|
||||
DEVO_USERNAME: "ap-informatica-ipcwt"
|
||||
DEVO_HOSTNAME: "t-impala.devo.escb.eu"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyaxxx7yfifpgpdnxuj6dcowpoktwa6745kwwpezysd44oa"
|
||||
acc:
|
||||
BUCKET_NAMESPACE: "frcnomajoc7v"
|
||||
BUCKET: "mrds_inbox_acc"
|
||||
DEVO_USERNAME: "ap-informatica-ipcwa"
|
||||
DEVO_HOSTNAME: "impala-proxy-devo-acc21-impala01.dw-devo-acc21.inym23.b0.cloudera.site"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya4uttfadlzreloouw2e5bifgl2dvihffym5xoq3b3jmva"
|
||||
prd:
|
||||
BUCKET_NAMESPACE: "frcnomajoc7v"
|
||||
BUCKET: "mrds_inbox_prd"
|
||||
DEVO_USERNAME: "ap-informatica-ipcwp"
|
||||
DEVO_HOSTNAME: "impala-proxy-devo-prd21-impala01.dw-devo-prd21.inym23.b0.cloudera.site"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyanbahqlucid7qtzvoohsf4xrlul7cvhlsqttmbro4n66a"
|
||||
25
python/connectors/devo/config/env_config_rqsd.yaml
Normal file
25
python/connectors/devo/config/env_config_rqsd.yaml
Normal file
@@ -0,0 +1,25 @@
|
||||
# Environment Configuration
|
||||
dev:
|
||||
BUCKET_NAMESPACE: "frcnomajoc7v"
|
||||
BUCKET: "mrds_inbox_dev"
|
||||
DEVO_USERNAME: "ap-devo-rqsd-tst"
|
||||
DEVO_HOSTNAME: "t-impala.devo.escb.eu"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyap6wtzobzob7qizvk4nocszlcaxhwijgzejbvryt3uzbq"
|
||||
tst:
|
||||
BUCKET_NAMESPACE: "frcnomajoc7v"
|
||||
BUCKET: "mrds_inbox_tst"
|
||||
DEVO_USERNAME: "ap-devo-rqsd-tst"
|
||||
DEVO_HOSTNAME: "t-impala.devo.escb.eu"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyap6wtzobzob7qizvk4nocszlcaxhwijgzejbvryt3uzbq"
|
||||
acc:
|
||||
BUCKET_NAMESPACE: "frcnomajoc7v"
|
||||
BUCKET: "mrds_inbox_acc"
|
||||
DEVO_USERNAME: "ap-devo-rqsd-acc"
|
||||
DEVO_HOSTNAME: "impala-proxy-devo-acc21-impala01.dw-devo-acc21.inym23.b0.cloudera.site"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyamzhgatnso57mubvg3c6k4ens3orcx4dieo6efukuvm4a"
|
||||
prd:
|
||||
BUCKET_NAMESPACE: "frcnomajoc7v"
|
||||
BUCKET: "mrds_inbox_prd"
|
||||
DEVO_USERNAME: "ap-devo-rqsd-prd"
|
||||
DEVO_HOSTNAME: "impala-proxy-devo-prd21-impala01.dw-devo-prd21.inym23.b0.cloudera.site"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyawpahgevgxv6csqnwil3p37vi6pthl466onnkg6k7undq"
|
||||
259
python/connectors/devo/devo_connector.py
Normal file
259
python/connectors/devo/devo_connector.py
Normal file
@@ -0,0 +1,259 @@
|
||||
# devo_impala_exporter.py
|
||||
|
||||
import os
|
||||
import io
|
||||
import yaml
|
||||
import datetime
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import pandas as pd
|
||||
from mrds.utils.secrets import get_secret
|
||||
import mrds.utils.manage_runs as runManager
|
||||
import mrds.utils.objectstore as objectstore
|
||||
|
||||
import oci
|
||||
|
||||
from impala.dbapi import (
|
||||
connect,
|
||||
ProgrammingError,
|
||||
DatabaseError,
|
||||
IntegrityError,
|
||||
OperationalError,
|
||||
)
|
||||
from impala.error import HiveServer2Error
|
||||
|
||||
TASK_HISTORY_MULTIPLIER = 1_000_000_000
|
||||
|
||||
|
||||
class DevoConnector:
    """
    Export the result of an Impala (DEVO) query to OCI Object Storage as CSV,
    while recording task run metadata via mrds.runManager.

    Usage:
        exporter = DevoConnector(
            flow_config_path="/path/to/flow.yaml",
            env_config_path="/path/to/env.yaml",
            env="dev",
            logger=my_logger,              # optional
            oci_client=my_object_storage,  # optional ObjectStorageClient
            oci_signer=my_signer,          # optional signer (used if client not provided)
        )
        exporter.run({"run_id": 34, "a_workflow_history_key": 6})
    """

    def __init__(
        self,
        flow_config_path: str,
        env_config_path: str,
        env: str,
        logger: Optional[logging.Logger] = None,
        oci_client: Optional[oci.object_storage.ObjectStorageClient] = None,
        oci_signer: Optional[Any] = None,
    ) -> None:
        self.flow_info = self._initialize_config(flow_config_path)
        envs_info = self._initialize_config(env_config_path)
        # The namespace may be overridden via the process environment.
        BUCKET_NAMESPACE = os.getenv("BUCKET_NAMESPACE", "frcnomajoc7v")

        if env not in envs_info:
            raise KeyError(f"Environment '{env}' not found in {env_config_path}")
        self.environment_info = envs_info[env]
        self.environment_info["BUCKET_NAMESPACE"] = BUCKET_NAMESPACE
        self.env = env

        # logging
        self.logger = logger or self._default_logger(self.flow_info.get("TASK_NAME", "devo_task"))

        # OCI client/signer
        self.oci_client = oci_client
        self.oci_signer = oci_signer

    # -------------------------
    # Public API
    # -------------------------

    def run(self, workflow_context: Dict[str, Any]) -> int:
        """Main entry point; executes query, uploads CSV, and finalizes task.

        Returns the Impala rowcount (0 when there was nothing to upload).
        """
        task_name = self.flow_info["TASK_NAME"]
        a_task_history_key = self._initialize_task(workflow_context, task_name)

        try:
            # credentials
            devo_secret_name = self.environment_info["DEVO_SECRET"]
            password = get_secret(devo_secret_name)
            self.logger.info("Retrieved secret for DEVO connection.")

            # query
            query = self.flow_info["DEVO_QUERY"]
            user = self.environment_info["DEVO_USERNAME"]
            host = self.environment_info["DEVO_HOSTNAME"]

            columns, data, rowcount = self._execute_query(query=query, user=user, hostname=host, password=password)
            df = self._tuple_to_dataframe((columns, data))
            self.logger.info("Query executed and DataFrame created with %d rows.", len(df))

            # upload
            if rowcount > 0:
                csv_name = f"{self.flow_info['OUTPUT_TABLE']}.csv"
                file_path = self._compose_object_path(self.flow_info["ODS_PREFIX"], csv_name)
                self._upload_dataframe_to_oci(df, csv_name, file_path)
                self.logger.info("Finished uploading %s to %s.", csv_name, file_path)
            else:
                # TODO(review): this early return skips finalise_task, leaving
                # the task history entry open for empty result sets — confirm
                # whether that is intended.
                return 0

            # success
            runManager.finalise_task(a_task_history_key, "Y")
            self.logger.info("Task %s finalized successfully.", task_name)
            return rowcount

        except Exception as e:
            # failure: mark the task as failed, then re-raise for the caller
            self.logger.exception("Run failed: %s", e)
            try:
                runManager.finalise_task(a_task_history_key, "N")
            finally:
                # re-raise for upstream handling if used as a library
                raise

    # -------------------------
    # Impala / DEVO
    # -------------------------

    @staticmethod
    def _get_impala_connection(hostname: str, user: str, secret: str):
        """Open an SSL/HTTP Impala connection using PLAIN authentication."""
        return connect(
            host=hostname,
            port=443,
            auth_mechanism="PLAIN",
            user=user,
            password=secret,
            use_http_transport=True,
            http_path="cliservice",
            use_ssl=True,
        )

    def _execute_query(self, query: str, user: str, hostname: str, password: str) -> Tuple[List[str], List[List[Any]], int]:
        """Run *query* against Impala and return (columns, rows, rowcount).

        For statements without a SELECT, columns is empty and rows holds a
        single [rowcount] entry.
        """
        conn = self._get_impala_connection(hostname, user, password)
        cursor = None
        self.logger.info("Executing Impala query against host '%s' as user '%s'.", hostname, user)
        try:
            cursor = conn.cursor()
            cursor.execute(query)

            # Substring test (not startswith) so CTEs like "WITH ... SELECT"
            # also take the fetch path.
            if "select" in query.strip().lower():
                rows = cursor.fetchall()
                columns = [col[0] for col in cursor.description]
                return columns, rows, cursor.rowcount
            # Non-SELECT: no columns; expose the rowcount both as the single
            # data row and as the third tuple element.
            # BUG FIX: this branch previously returned a 2-tuple, which broke
            # the 3-way unpacking in run().
            return [], [[cursor.rowcount]], cursor.rowcount

        except OperationalError as oe:
            raise RuntimeError("Failed to connect to Impala: " + str(oe)) from oe
        except ProgrammingError as pe:
            raise ValueError("Query syntax error: " + str(pe)) from pe
        except IntegrityError as ie:
            raise PermissionError("Insufficient permissions: " + str(ie)) from ie
        except DatabaseError as db_err:
            raise RuntimeError("Database error: " + str(db_err)) from db_err
        except HiveServer2Error as au_err:
            raise PermissionError("HiveServer2Error error: " + str(au_err)) from au_err
        except Exception as e:
            raise RuntimeError("An unexpected error occurred: " + str(e)) from e
        finally:
            try:
                if cursor:
                    cursor.close()
            finally:
                try:
                    conn.close()
                except Exception:
                    # log but don't mask the original exception
                    self.logger.warning("Failed to close Impala connection cleanly.", exc_info=True)

    # -------------------------
    # OCI Upload
    # -------------------------

    def _upload_dataframe_to_oci(self, df: pd.DataFrame, csv_name: str, object_path: str) -> None:
        """Serialize *df* to CSV (no index) and put it at *object_path* in
        the configured bucket."""
        namespace = self.environment_info["BUCKET_NAMESPACE"]
        bucket = self.environment_info["BUCKET"]
        # convert DataFrame to CSV bytes without index
        csv_bytes = df.to_csv(index=False).encode("utf-8")
        client = objectstore.get_client()
        client.put_object(namespace, bucket, object_path, csv_bytes)
        self.logger.info("CSV '%s' uploaded to bucket '%s' (ns: '%s', key: '%s').", csv_name, bucket, namespace, object_path)

    # -------------------------
    # Utilities
    # -------------------------

    @staticmethod
    def _tuple_to_dataframe(data_tuple: Tuple[List[str], List[List[Any]]]) -> pd.DataFrame:
        """Build a DataFrame from (columns, rows); non-SELECT results (empty
        columns) become a single 'rowcount' column."""
        columns, data = data_tuple
        if not columns:
            # for non-SELECT queries we returned rowcount; represent it in a DataFrame
            return pd.DataFrame(data, columns=["rowcount"])
        return pd.DataFrame(data, columns=columns)

    @staticmethod
    def _initialize_config(config_file_path: str) -> Dict[str, Any]:
        """Load a YAML configuration file; raise FileNotFoundError if absent."""
        if not os.path.exists(config_file_path):
            raise FileNotFoundError(f"Configuration file {config_file_path} not found.")
        with open(config_file_path, "r") as f:
            return yaml.safe_load(f)

    @staticmethod
    def _initialize_task(workflow_context: Dict[str, Any], task_name: str) -> int:
        """Register the task start with the run manager; returns the task history key."""
        return runManager.init_task(
            task_name,
            workflow_context["run_id"],
            workflow_context["a_workflow_history_key"],
        )

    @staticmethod
    def add_a_key_column(headers: List[str], data_rows: List[List[Any]], task_history_key: int) -> None:
        """Optionally add an A_KEY column (kept for parity with original script)."""
        headers.insert(0, "A_KEY")
        for i, row in enumerate(data_rows, start=1):
            a_key_value = int(task_history_key) * TASK_HISTORY_MULTIPLIER + i
            row.insert(0, str(a_key_value))

    @staticmethod
    def add_workflow_key_column(headers: List[str], data_rows: List[List[Any]], workflow_key: int) -> None:
        """Optionally add the workflow key column right after A_KEY if present, otherwise at position 0."""
        insert_idx = 1 if headers and headers[0] == "A_KEY" else 0
        headers.insert(insert_idx, "A_WORKFLOW_HISTORY_KEY")
        for row in data_rows:
            row.insert(insert_idx, workflow_key)

    @staticmethod
    def _compose_object_path(prefix: str, filename: str) -> str:
        """Join *prefix* and *filename* into an object-store key, inserting a
        separating slash only when the prefix lacks one.

        BUG FIX: the filename placeholder had been lost from both f-strings,
        so every object path ended in a literal placeholder instead of the
        file name.
        """
        if prefix.endswith("/"):
            return f"{prefix}{filename}"
        return f"{prefix}/{filename}"

    @staticmethod
    def _default_logger(task_name: str) -> logging.Logger:
        """Create (at most once) a stream logger named after the task."""
        logger = logging.getLogger(f"{task_name}_logger")
        if not logger.handlers:
            logger.setLevel(logging.INFO)
            handler = logging.StreamHandler()
            # (Removed an unused timestamp local that was computed here.)
            fmt = logging.Formatter(f"%(asctime)s [{task_name}] %(levelname)s: %(message)s")
            handler.setFormatter(fmt)
            logger.addHandler(handler)
        return logger
|
||||
|
||||
|
||||
# Optional: quick-run convenience if you ever want to execute this module directly.
if __name__ == "__main__":
    # Example only—adjust paths/env/context as needed or remove this block.
    demo_exporter = DevoConnector(
        flow_config_path="/home/dbt/Marco/mrds_elt/airflow/ods/rqsd/rqsd_process/config/yaml/m_ODS_RQSD_OBSERVATIONS.yaml",
        env_config_path="/home/dbt/Marco/mrds_elt/python/connectors/devo/config/env_config_rqsd.yaml",
        env="dev",
    )
    demo_exporter.run({"run_id": 34, "a_workflow_history_key": 6})
||||
294
python/connectors/tms/TMSDBT.py
Normal file
294
python/connectors/tms/TMSDBT.py
Normal file
@@ -0,0 +1,294 @@
|
||||
|
||||
|
||||
import argparse
|
||||
from TMSQuery import XMLQuery
|
||||
|
||||
import mrds.utils.objectstore
|
||||
import tempfile
|
||||
import re
|
||||
import csv
|
||||
from io import StringIO
|
||||
|
||||
import os.path
|
||||
import os, psutil
|
||||
import sys
|
||||
|
||||
|
||||
# Object-storage namespace; overridable via the BUCKET_NAMESPACE env var.
namespace = os.getenv("BUCKET_NAMESPACE", "frcnomajoc7v")
||||
|
||||
def memory_usage():
    # Return this process's resident set size in GiB (rss / 1024**3).
    # NOTE(review): the original comment claimed "percentage like top", but
    # the value returned is an absolute GiB figure, not a percentage.
    process = psutil.Process(os.getpid())
    mem = process.memory_info().rss/(1024*1024*1024)
    return mem
||||
|
||||
|
||||
def protect_keyword(s):
    """Normalize a column name and escape Oracle reserved words.

    Lower-cases the name, turns spaces into underscores, and appends a
    trailing underscore to 'comment', 'date' and 'number' so generated DDL
    never collides with Oracle keywords.
    """
    normalized = s.lower().replace(' ', '_')
    reserved = {'comment': 'comment_', 'date': 'date_', 'number': 'number_'}
    return reserved.get(normalized, normalized)
||||
|
||||
|
||||
# Directory holding the generated dbt ODS model files (.yml / .sql).
cModelsDir = sys.path[0] + '/../dbt/mrds/models/ods/'
# Multiplier used to build per-dataset synthetic A_KEY values:
# a_key = dataset_id * cDatasetMultiplier + row_index.
cDatasetMultiplier = 10000000
|
||||
|
||||
|
||||
# Command-line interface for the TMS connector.
parser = argparse.ArgumentParser()
# Bug fix: 'create-ods-model' added to the choices — the dispatch code below
# has an `elif args.command == 'create-ods-model'` branch that was unreachable
# while the choice was missing from this list (argparse rejected the value).
parser.add_argument("command", choices=['create-model', 'create-ods-model', 'create-oracle-table', 'retrieve'], help="create-model retrieve")
parser.add_argument("-n", "--name", help="Name")
parser.add_argument("-u", "--url", required=True, help="URL of TMS service")
parser.add_argument("-U", "--user", required=True, help="TMS user")
parser.add_argument("-P", "--password", required=True, help="TMS password")
parser.add_argument("-x", "--xmlfile", help="XML file")
parser.add_argument("-l", "--layoutfile", help="layout file")
parser.add_argument("-f", "--format", help="output format")
parser.add_argument("-p", "--parameter", action="append", help="Parameter")
parser.add_argument("-c", "--column", action="append", help="Additional column")
parser.add_argument("-d", "--destination", help="destination")
parser.add_argument("-s", "--dataset", help="data set ID", type=int)
parser.add_argument("-v", "--version", help="data model version", type=int, default=1)
args = parser.parse_args()
||||
|
||||
|
||||
# Build the report query object from the command-line inputs.
query = XMLQuery()

# A full query definition may come from an XML file (layout/format/parameters
# embedded in the document).
if args.xmlfile:
    with open(args.xmlfile) as f:
        xml = f.read()
    query.xml = xml

# A raw layout (.fkr) file provides/overrides the report layout.
if args.layoutfile:
    with open(args.layoutfile) as f:
        layout = f.read()
    query.layout = layout

if args.format:
    query.format = args.format

# -p name=value pairs become report parameters (the value may contain '=').
if args.parameter:
    for p in args.parameter:
        [name, value] = p.split('=', 1)
        query.parameter[name] = value

# -c switches declare additional literal columns: "name[|type]=value"
# ('/' and ':' are accepted as type separators too).
additional_columns = []
if args.column:
    for p in args.column:
        [name, value] = p.split('=', 1)
        t = re.split(r'(?:\|)|(?:/)|(?::)', name, maxsplit = 2)
        name = t[0]
        type = None
        # NOTE(review): with two separators len(t) == 3 and the explicit type
        # is silently ignored (falls back to the default) — verify intended.
        if len(t) == 2:
            type = t[1]
        if not type:
            type = 'varchar2(255)'
        additional_columns.append((name, type, value))

# Force stable date/time/number formatting directives in the layout.
query.normalize_output()

# NOTE(review): debug leftover — dumps the final query XML to /tmp/kurt.xml on
# every invocation, and 'pprint' is imported but never used.
from pathlib import Path
import pprint
p = Path('/tmp/kurt.xml')
p.write_text(str(query))
|
||||
if args.command == 'create-oracle-table':

    # Ask TMS to describe the report so we know each column's name and type.
    d = query.describe(args.url, args.user, args. password)

    # Every staging table starts with the two technical key columns, followed
    # by any caller-supplied literal columns (-c switches).
    columns = [" a_key number(38, 0)", "a_workflow_history_key number(38, 0)"]
    for c in additional_columns:
        columns.append("%s %s"%(c[0], c[1]))

    # Map TMS report types onto Oracle column types.
    # NOTE(review): report types not listed here are silently omitted from
    # the generated DDL — verify that is intended.
    for col in d:
        name = protect_keyword(col[0])
        match col[1]:
            case 'text':
                columns.append(name + " varchar2(512 char)")
            case 'int':
                columns.append(name + " number(38,0)")
            case 'money':
                columns.append(name + " number(19,4)")
            case 'floating':
                columns.append(name + " binary_double")
            case 'datetime':
                columns.append(name + " date")
            case 'integer':
                columns.append(name + " number(12, 0)")

    # Assemble the CREATE TABLE statement in the ct_et_templates schema.
    sql = "create table ct_et_templates." + args.name + " (\n"
    sql = sql + ",\n  ".join(columns)
    sql = sql + "\n)\n"

    # Write the DDL to stdout ('-' or no destination) or to the given file.
    if not args.destination or args.destination == '-':
        print(sql)
    else:
        with open(args.destination, 'w') as f:
            f.write(sql)
||||
|
||||
|
||||
|
||||
elif args.command == 'create-ods-model':
    # NOTE(review): 'create-ods-model' is not among the argparse 'command'
    # choices declared above, so argparse rejects the value before this branch
    # can run — confirm whether the choice list or this literal is the
    # intended name.

    d = query.describe(args.url, args.user, args. password)

    # --- dbt schema file (<name>.yml) describing the generated model ---
    file_name = cModelsDir + args.name + '.yml'
    f = open(file_name, 'w')  # overwrites any previous model definition

    f.write('version: %d\n' % args.version)

    f.write('models:' + '\n')
    f.write(' - name: ' + args.name + '_dbt\n')
    f.write(' description: "A starter dbt model"' + '\n')
    f.write(' columns:' + '\n')
    for col in d:
        f.write(' - name: ' + col[0] + '\n')
        f.write(' data_type: ' + col[1] + '\n')
    f.close()

    # --- dbt SQL file (<name>.sql): dummy typed select + external-table hook ---
    file_name = cModelsDir + args.name + '.sql'
    f = open(file_name, 'w')  # overwrites any previous model SQL

    # Work out the object-storage prefix for the external-table post-hook:
    # an explicit "bucket:path" or plain path destination wins, otherwise the
    # conventional INBOX/TMS/<NAME>/ location is used.
    if args.destination and args.destination != '-':
        if ':' in args.destination:
            dest = args.destination.split(':', 2)
            path = dest[1]
        else:
            path = args.destination
        prefix = os.path.dirname(path)
    else:
        prefix = 'INBOX/TMS/' + args.name.upper() + '/'

    pars = "ptablename => '%s', ptemplatetablename => 'ou_tms.%s', pprefix => '%s'" % (args.name, args.name, prefix)
    print(f"creating table {args.name}")
    # NOTE(review): two config() blocks are emitted (post_hook and
    # materialized) — verify dbt merges them rather than ignoring one.
    f.write('{{\n config(\n post_hook = "call ct_mrds.file_manager.create_external_table(%s)"\n )\n}}\n\n' % pars)
    f.write("{{ config(materialized='table') }}" + "\n")
    f.write('with source_data as (' + "\n")
    columns = []
    columns.append("cast (1 as number(38,0)) as a_key")
    columns.append("cast (1 as number(38,0)) as a_workflow_history_key")
    # One dummy literal per described column, cast to the mapped Oracle type,
    # so dbt can materialize the table with the right shape.
    for col in d:
        name = protect_keyword(col[0])
        match col[1]:
            case 'text':
                columns.append("cast ('x' as varchar2(255 char)) as " + name)
            case 'int':
                columns.append("cast (1 as number(38, 0)) as " + name)
            case 'money':
                columns.append("cast (1.0 as number(19,4)) as " + name)
            case 'floating':
                columns.append("cast (1.0 as binary_double) as " + name)
            case 'datetime':
                columns.append("cast (sysdate as date) as " + name)
            case 'integer':
                columns.append("cast (1 as number(12, 0)) as " + name)
    f.write(' select\n ' + ',\n '.join(columns) + '\n')
    f.write(')\nselect * from source_data\n ')
    f.close()
||||
|
||||
|
||||
|
||||
elif args.command == 'retrieve':
    # Run the report; ret holds the raw response body (CSV/XML/... as requested).
    ret = query.execute(args.url, args.user, args. password)

    # CSV post-processing applies only to standard-CSV output tied to a dataset id.
    if query.format in ('scsv', 'standard_csv') and args.dataset:

        # Save result to temporary spooled file for further processing
        # We avoid doing this in memory to prevent issues with flow EffectivePermissions
        f = tempfile.SpooledTemporaryFile(mode = 'w+', max_size = 200*1024*1024)
        f.write(ret)
        del ret  # free the (potentially large) response body early
        f.seek(0)

        # Replace embedded newlines for '<br/>'
        # so every CSV record stays on a single physical line.
        reader = csv.reader(f)

        sio = StringIO()

        writer = csv.writer(sio)
        for l in reader:
            l_tmp = [s.replace('\n', '<br/>') for s in l]
            writer.writerow(l_tmp)
        f.close()

        # Necessary to read the data into an array of lines for further processing
        sio.seek(0)
        lines_tmp = sio.readlines()
        del sio

        if not lines_tmp:
            ret = ""
        else:
            # Adding artificial columns A_KEY and A_WORKFLOW_HISTORY_KEY and added columns
            additional_headers = [t[0] for t in additional_columns]
            additional_values = [t[2] for t in additional_columns]
            headers = ['A_KEY','A_WORKFLOW_HISTORY_KEY'] + additional_headers + [protect_keyword(h) for h in lines_tmp[0].split(',')]
            lines = [','.join(headers) ]

            # A_KEY = dataset * cDatasetMultiplier + row index;
            # the workflow key column carries the dataset id itself.
            i = 0
            for l in lines_tmp[1:]:
                lines.append(str(args.dataset*cDatasetMultiplier + i) + ',' + str(args.dataset) + ',' + ','.join(additional_values + [l]) )
                i += 1

            del lines_tmp

            # Spooling again to temporary file to avoid duplication memory needs
            f = tempfile.SpooledTemporaryFile(mode = 'w+', max_size = 200*1024*1024)
            f.writelines(lines)
            del lines
            f.seek(0)
            ret = f.read()
            f.close()

    # Deliver the (possibly rewritten) payload: stdout, a local file, or
    # object storage for "bucket:path" destinations.
    if not args.destination or args.destination == '-':
        print(ret, end='')
    elif ':' not in args.destination:
        with open(args.destination, 'w') as f:
            f.write(ret)
    else:
        # Stage to a named temp file, then upload it to the bucket.
        f = tempfile.NamedTemporaryFile(delete = False, mode = 'w', prefix = 'TMSDBT-', suffix = '.csv')
        f.write(ret)
        f.close()

        dest = args.destination.split(':', 2)
        bucket = dest[0]
        dirname = os.path.dirname(dest[1])
        filename = os.path.basename(dest[1])
        client = mrds.utils.objectstore.get_client()
        # NOTE(review): debug leftover — this echoes the entire CSV to stdout
        # and pulls the whole file into memory right before the upload.
        with open(f.name, "r") as file:
            print(file.read())
        mrds.utils.objectstore.upload_file(client, f.name,namespace, bucket, dirname, filename)

        os.remove(f.name)

    # Exit status: 0 when data was produced, 1 when the report came back empty.
    if ret:
        sys.exit(0)
    else:
        sys.exit(1)
||||
|
||||
|
||||
197
python/connectors/tms/TMSQuery.py
Normal file
197
python/connectors/tms/TMSQuery.py
Normal file
@@ -0,0 +1,197 @@
|
||||
|
||||
import xml.etree.ElementTree as ET
|
||||
import re
|
||||
import base64
|
||||
import sys
|
||||
|
||||
|
||||
class XMLQuery:
    """Builder/runner for TMS report-generator XML queries.

    Attribute access is routed through __getattr__/__setattr__, so the public
    names (format, layout, parameter, xml) transparently map onto
    underscore-prefixed storage slots, with per-attribute rules applied on
    assignment (format validation, layout newline-termination, xml parsing).
    """

    def __init__(self, xml = None):
        # Defaults; these assignments also go through __setattr__ below.
        self._format = 'xml'
        self._layout = ''
        self._parameter = {}

        if xml:
            self._parse_xml(xml)

    def _parse_xml(self, xml):
        """Populate layout/format/parameters from a full request XML document."""
        self._tree = ET.fromstring(xml)

        # The layout is transported base64-encoded inside <layout>.
        layout_b64 = self._tree.find('layout').text
        self._layout = base64.b64decode(layout_b64).decode('utf-8')

        self._format = self._tree.find('format').get('type')

        self._parameter = {}
        for p in self._tree.findall('parameters/parameter'):
            self._parameter[p.get('name')] = p.text

    def execute(self, url, user, password):
        """POST this query to the TMS report service.

        Returns the UTF-8-decoded response text on HTTP 200, otherwise None.
        """
        # curl -X POST --basic -u schilli:chili03 --data @tms_activity_interval.xml https://tmsxd104.ecbt1.tadnet.net:9443/report/
        import requests
        from requests.auth import HTTPBasicAuth

        data = str(self)
        basic = HTTPBasicAuth(user, password)

        # NOTE(review): verify=False disables TLS certificate checking —
        # confirm this is acceptable for the target environment.
        response = requests.post(url, data=data, auth=basic, verify=False)

        if response.status_code == 200:
            response.encoding = "utf-8"
            return response.text
        else:
            return None

    def describe(self, url, user, password):
        """Return [(column_name, type), ...] for this query's result set.

        Temporarily forces XML output, runs the query, and parses the first
        <PlainRow> element of the response.
        """
        orig_format = self.format
        self.format = 'xml'

        ret = self.execute(url, user, password)

        # Cut the response down to the first PlainRow and close the root tag
        # manually so only that fragment needs to be parsed.
        m = re.match('^.*?\<PlainRow\>.*?\<\/PlainRow\>', ret, re.DOTALL)
        s = m[0] + '\n</report-generator>'

        tree = ET.fromstring(s)

        ret = []
        row = tree.find('PlainRow')
        for c in row.findall('Column'):
            #name = c.get('name')
            name = c.text
            type = c.get('type')
            # 'unknown' columns are treated as integers by convention.
            if type == 'unknown': type = 'integer'

            ret.append((name, type))

        # NOTE(review): orig_format is captured but never restored — the query
        # stays in 'xml' format after describe(); confirm that is intended.
        return ret

    def describe_simple(url, user, password, xml):
        """Describe a query given its raw XML document.

        NOTE(review): defined without self/@staticmethod — it only works when
        invoked through the class (XMLQuery.describe_simple(...)), never
        through an instance.
        """
        query = XMLQuery(xml)

        query.format='xml'

        ret = query.execute(url = url, user = user, password = password)

        tree = ET.fromstring(ret)

        ret = []
        row = tree.find('PlainRow')
        for c in row.findall('Column'):
            #name = c.get('name')
            name = c.text
            type = c.get('type')
            if type == 'unknown': type = 'integer'

            ret.append((name, type))

        return ret

    def normalize_output(self, date_format = 'dd/MM/yyyy', time_format = 'HH:mm:ss'):
        """Rewrite layout directives so date/time/number formatting is fixed."""
        lines = self.layout.splitlines()

        lines = [re.sub(r'^date_format\s*=.*', 'date_format=' + date_format, l) for l in lines]
        lines = [re.sub(r'^time_format\s*=.*', 'time_format=' + time_format, l) for l in lines]
        lines = [re.sub(r'^NoNumberFormatting\s*=.*', 'NoNumberFormatting=1', l) for l in lines]

        self.layout = '\n'.join(lines)

    def __setattr__(self, name, value):
        """Map public attribute writes to _-slots, applying per-attribute rules."""
        # Validate report format values up front.
        if name == 'format' and value not in ('bin','xml','xml3','html','txt','csv','standard_csv', 'scsv', 'pdf'):
            raise Exception("Invalid report format '" + value + "'")

        if not name.startswith('_'):
            name = '_' + name

        # Layouts must be newline-terminated for the report generator.
        if name == '_layout' and not value.endswith('\n'):
            value = value + '\n'

        # Assigning .xml re-parses the document instead of storing it.
        if name == '_xml':
            self._parse_xml(value)
            return

        try:
            self.__dict__[name] = value
        except KeyError:
            raise AttributeError

    def __getattr__(self, name):
        """Resolve public reads against the _-prefixed storage slots."""
        if not name.startswith('_'):
            name = '_' + name

        try:
            return self.__dict__[name]
        except KeyError:
            raise AttributeError(name)

    def __str__(self):
        """Render the query as a report-generator request XML document."""
        parameters = ''
        for k in self._parameter:
            parameters = parameters + "\n<parameter name='%s'>%s</parameter>" % (k, self._parameter[k])

        # The layout travels base64-encoded inside <layout>.
        layout_b64 = base64.b64encode(self.layout.encode('utf-8')).decode('utf-8')
        return ('<?xml version="1.0" encoding="utf-8"?>\n' + \
                '<report-generator>\n' + \
                ' <format type="%s"/>\n' + \
                ' <layout>\n%s</layout>\n' + \
                ' <parameters>%s\n</parameters>' + \
                '</report-generator>') % (self._format, layout_b64, parameters)
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Ad-hoc smoke test: load a query XML given on the command line and show
    # the layout before/after normalization, then describe it against TMS.
    file = sys.argv[1]
    print(file)

    with open(file) as f:
        xml = f.read()

    query = XMLQuery(xml)

    print(query.layout)
    query.normalize_output()
    print(query.layout)

    #query.format='xml'

    #ret = query.execute(url = 'https://tmsxd104.ecbt1.tadnet.net:9443/report/', user = 'schilli', password = 'chili03')

    #print(ret)

    # NOTE(review): live call with hard-coded credentials — these should be
    # moved out of source control.
    desc = XMLQuery.describe_simple(url = 'https://tmsxd104.ecbt1.tadnet.net:9443/report/', user = 'schilli', password = 'chili03', xml = xml)

    print(str(desc))
||||
355
python/connectors/tms/sample_DAG.py
Normal file
355
python/connectors/tms/sample_DAG.py
Normal file
@@ -0,0 +1,355 @@
|
||||
"""
|
||||
DAG: w_ODS_TMS_TRANSACTION (expanded example)
|
||||
Purpose:
|
||||
- Load layout+parameter metadata from TMS-layouts/w_ODS_TMS_TRANSACTION.yml
|
||||
- Call connectors/tms/TMSDBT.py to retrieve data into CSV in object storage
|
||||
- On first run, generate Oracle DDL and create an external table
|
||||
- Process file and record status in MRDS workflow tables
|
||||
Notes:
|
||||
- This is an expanded, readable version of the factory-generated DAG.
|
||||
- Replace paths/usernames/password references as appropriate.
|
||||
"""
|
||||
|
||||
import copy
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import traceback
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
from airflow import DAG
|
||||
from airflow.operators.python import PythonOperator
|
||||
from pytz import timezone
|
||||
|
||||
# --- Project-specific deps (must exist in your Airflow image) ---
|
||||
from mrds.core import main # noqa: F401 # imported to mirror the factory env
|
||||
import mrds.utils.manage_files as mf
|
||||
import mrds.utils.manage_runs as mr
|
||||
|
||||
# ---------- Paths & constants ----------
gScriptDir = Path(globals().get("__file__", "./_")).absolute().parent
gDataDir = str(gScriptDir / "TMS-layouts") + "/"    # report layout + parameter YAML
gConfigDir = str(gScriptDir / "config")             # TMS connection config dir
gConnDir = "/opt/airflow/python/connectors/tms/"    # TMSDBT.py connector location
gTableDir = str(gScriptDir / "TMS-tables") + "/"    # generated DDL output dir

DAG_NAME = "w_ODS_TMS_TRANSACTION"
ODS_TABLE = DAG_NAME        # target ODS table carries the DAG's name
DATABASE_NAME = "ODS"
WF_NAME = DAG_NAME          # workflow name mirrors the DAG id

default_args = {
    "owner": "ecb",
    "depends_on_past": False,
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 0,
    "execution_timeout": timedelta(minutes=60),
    "retry_delay": timedelta(minutes=5),
}

# ---------- Load YAML configs once on parse ----------
# Layout metadata for this report (parameters, hidden flags, ...).
with open(gDataDir + DAG_NAME + ".yml", "r") as f:
    report_desc = yaml.safe_load(f) or {}

# Shared TMS connection + object-storage settings.
with open(gConfigDir + "/TMS.yml", "r") as f:
    tms_config = yaml.safe_load(f)

# TMS + storage config
tms_url = tms_config["TMS-URL"]
tms_user = tms_config["TMS-user"]
tms_pwd = tms_config["TMS-password"]
prefix = tms_config["dest-prefix"] + DAG_NAME + "/" + DAG_NAME + "/"
data_prefix = tms_config["data-prefix"] + DAG_NAME + "/"
dest = tms_config["dest-bucket"] + ":" + prefix     # "bucket:path" connector target

# Visible vs hidden params (from layout YAML): visible ones surface in the
# Airflow UI, hidden ones are merged back in at execution time.
params_visible = {}
params_hidden = {}
params_dict = report_desc.get("parameters") or {}
for p, meta in params_dict.items():
    val = meta.get("value", None)
    if not meta.get("hidden", False):
        params_visible[p] = val
    else:
        params_hidden[p] = val
||||
|
||||
# ---------- Helpers (parameter handling) ----------
|
||||
def _enum_param_combinations_recursive(params, keys):
    """
    Build all combinations of params (cartesian product), supporting
    'column(<name>)' derived lists aligned by index.

    Returns a list of combinations; each combination is a list of
    (key, value) pairs. Keys with a falsy value (other than "") are skipped.
    """
    k = None
    result = []
    keys = list(keys)  # safe copy; we pop from it below

    # Advance to the first key with a usable value ("" counts as usable).
    while keys:
        k = keys.pop(0)
        v = params[k]
        if v or v == "":
            break

    # No keys at all -> nothing to combine.
    # NOTE(review): if every value is unusable, k still names the last popped
    # key and its value is used below — verify that is intended.
    if not k:
        return []

    v = v if isinstance(v, list) else [v]

    # derived columns aligned with v (same length)
    derived_columns = []
    # params_dict[k] holds the definition, not just the value
    pdef = params_dict.get(k, {})
    for c in list(pdef):
        if re.match(r"column\(.*\)$", c):
            vtmp = pdef[c]
            vtmp = vtmp if isinstance(vtmp, list) else [vtmp]
            derived_columns.append((c, vtmp))

    # Base case: last key -> one combination per value, plus aligned columns.
    if not keys:
        for i, value in enumerate(v):
            row = [(k, value)]
            for col_key, aligned_values in derived_columns:
                row.append((col_key, aligned_values[i]))
            result.append(row)
        return result

    # Recursive case: cross this key's values with combinations of the rest.
    combinations = _enum_param_combinations_recursive(params, keys)
    for row in combinations:
        for i, vtmp in enumerate(v):
            new_row = copy.deepcopy(row)
            new_row.append((k, vtmp))
            for col_key, aligned_values in derived_columns:
                new_row.append((col_key, aligned_values[i]))
            result.append(new_row)

    return result
||||
|
||||
|
||||
def _enum_param_combinations(params, sequential=False):
    """Enumerate parameter combinations (cartesian product only).

    The *sequential* flag is accepted for signature compatibility but ignored
    — the factory's sequential path was buggy and unused there as well.
    """
    return _enum_param_combinations_recursive(params, list(params))
||||
|
||||
|
||||
def _allowed_select(table, expression, condition="1 = 1"):
    """
    Guarded select used by eval_params(select(...)).
    Only whitelisted tables may be read, to avoid arbitrary DB access.
    """
    allowed_tables = (
        ODS_TABLE.upper(),
        "DUAL",
        "CT_MRDS.A_WORKFLOW_HISTORY",
    )
    if table.upper() not in allowed_tables:
        raise Exception(f"Not allowed to select from {table}")
    rows = mr.select_ods_tab(table, expression, condition)
    return rows[0]
||||
|
||||
|
||||
def _eval_param(v):
|
||||
"""
|
||||
Evaluate special functional values:
|
||||
- select(...) => guarded DB helper above
|
||||
- eval(...) => strongly discouraged; keep disabled or restricted
|
||||
"""
|
||||
s = str(v) if v is not None else ""
|
||||
if re.match(r"\s*select\(.*\)", s):
|
||||
# Expose only 'select' symbol to eval
|
||||
return eval(s, {"select": _allowed_select}, {})
|
||||
if re.match(r"\s*eval\(.*\)\s*$", s):
|
||||
# If you really must support eval, strictly sandbox or remove this path.
|
||||
raise ValueError("eval(...) not allowed in this hardened DAG.")
|
||||
return v
|
||||
|
||||
|
||||
def _finalize_param_list(param_list):
    """
    Apply replacements and drop virtual params according to YAML definitions.

    Returns (name, value) pairs with 'virtual' parameters removed and
    replace_parameter token substitutions applied.
    """
    resolved = dict(param_list)

    # Replace parameter tokens inside another parameter (string replace)
    for name, meta in params_dict.items():
        target = meta.get("replace_parameter")
        if target and target in resolved and name in resolved and isinstance(resolved[target], str):
            resolved[target] = resolved[target].replace(name, str(resolved[name]))

    # Drop 'virtual' params
    return [
        (name, value)
        for name, value in resolved.items()
        if not params_dict.get(name, {}).get("virtual", False)
    ]
|
||||
|
||||
|
||||
# ---------- Core work ----------
|
||||
def execute_report(**context):
    """
    For each parameter combination:
      - create workflow key
      - call TMSDBT.py retrieve to land CSV
      - if first time, generate Oracle DDL and create the external table
      - process file, finalize workflow Y/N

    Connector exit codes: 0 = data landed, 1 = empty result (not an error),
    anything else = failure.
    """
    logger = logging.getLogger("airflow.task")
    logger.setLevel(logging.DEBUG)

    run_id = context["dag_run"].run_id
    # Hidden parameters rejoin the user-visible ones for execution.
    all_params = {**params_visible, **params_hidden}

    # 1) Compute combinations
    combos = _enum_param_combinations(all_params)

    # 2) Evaluate select(...) etc and finalize
    evaluated = []
    for combo in combos or [[]]:
        # first pass: special evaluations
        pair_list = []
        for k, v in combo:
            pair_list.append((k, _eval_param(v)))
        # second pass: replacements + pruning
        evaluated.append(_finalize_param_list(pair_list))

    # if no combos at all, ensure we run once
    if not evaluated:
        evaluated = [[]]

    # Timing + workflow: one shared timestamp for all landed file names.
    ts = "{:%Y%m%d_%H%M%S}".format(datetime.now(timezone("Europe/Berlin")))

    for idx, param_list in enumerate(evaluated, start=1):
        wf_key = mr.init_workflow(DATABASE_NAME, WF_NAME, run_id)
        file_name = f"{WF_NAME}.{wf_key}.{ts}.csv"

        try:
            # Build connector command safely (no shell quoting games)
            cmd = [
                sys.executable,  # 'python'
                os.path.join(gConnDir, "TMSDBT.py"),
                "retrieve",
                "--name", WF_NAME,
                "--url", tms_url,
                "-U", tms_user,
                "--password", tms_pwd,
                "--layoutfile", gDataDir + DAG_NAME + ".fkr",
                "-f", "scsv",
                "--dataset", str(wf_key),
                "-d", dest + file_name,
            ]

            # Map params to -p or -c switches ('column(<name>)' keys become -c)
            for k, v in param_list:
                sval = "" if v is None else str(v).rstrip()
                m = re.match(r"column\((.*)\)$", k)
                if m:
                    cmd.extend(["-c", f'{m.group(1)}={sval}'])
                else:
                    cmd.extend(["-p", f"{k}={sval}"])
                # Persist the effective parameter value on the workflow record.
                mr.set_workflow_property(wf_key, DATABASE_NAME, k, sval)

            logger.debug("Running connector: %s", json.dumps(cmd))
            res = subprocess.run(cmd, capture_output=True, check=False)
            logger.debug("stdout: %s", res.stdout.decode(errors="ignore"))
            logger.debug("stderr: %s", res.stderr.decode(errors="ignore"))

            if res.returncode is None:
                raise RuntimeError("Connector returned no status")
            if res.returncode == 1:
                # Empty report: close the workflow successfully and move on.
                logger.info("No data returned for wf_key=%s (continuing)", wf_key)
                mr.finalise_workflow(wf_key, "Y")
                continue
            if res.returncode != 0:
                raise RuntimeError(f"Connector failed (rc={res.returncode})")

            # Data landed -> ensure source config exists, bootstrap table if needed
            cfg = mf.execute_query(
                "select * from CT_MRDS.A_SOURCE_FILE_CONFIG "
                f"where a_source_key = 'TMS' and table_id = '{ODS_TABLE}'"
            )

            if not cfg:
                # First run for this table: have the connector generate the DDL.
                ddl_cmd = [
                    sys.executable,
                    os.path.join(gConnDir, "TMSDBT.py"),
                    "create-oracle-table",
                    "--name", WF_NAME,
                    "--url", tms_url,
                    "-U", tms_user,
                    "--password", tms_pwd,
                    "--layoutfile", gDataDir + DAG_NAME + ".fkr",
                    "-d", gTableDir + WF_NAME + ".sql",
                ]
                for k, v in param_list:
                    sval = "" if v is None else str(v).rstrip()
                    m = re.match(r"column\((.*)\)$", k)
                    if m:
                        ddl_cmd.extend(["-c", f'{m.group(1)}={sval}'])
                    else:
                        ddl_cmd.extend(["-p", f"{k}={sval}"])

                logger.debug("Generating DDL: %s", json.dumps(ddl_cmd))
                ddl_res = subprocess.run(ddl_cmd, capture_output=True, check=True)
                logger.debug("DDL stdout: %s", ddl_res.stdout.decode(errors="ignore"))
                logger.debug("DDL stderr: %s", ddl_res.stderr.decode(errors="ignore"))

                # Execute DDL and create external table
                sql = Path(gTableDir + WF_NAME + ".sql").read_text()
                mf.execute(sql)
                mf.add_column_date_format(
                    f"CT_ET_TEMPLATES.{ODS_TABLE}", "DEFAULT", "DD/MM/YYYY HH24:MI:SS"
                )
                mf.create_external_table(ODS_TABLE, f"CT_ET_TEMPLATES.{ODS_TABLE}", data_prefix)
                mf.add_source_file_config(
                    "TMS",
                    "INPUT",
                    DAG_NAME,
                    DAG_NAME,
                    r".*\.csv",
                    ODS_TABLE,
                    f"CT_ET_TEMPLATES.{ODS_TABLE}",
                )

            # Process landed file (register, move, etc. as per your mf impl)
            mf.process_source_file(prefix, file_name)
            mr.finalise_workflow(wf_key, "Y")

        except BaseException as ex:
            # rich error logging, then mark workflow failed and re-raise
            ex_type, ex_value, ex_tb = sys.exc_info()
            tb = traceback.extract_tb(ex_tb)
            stack = [
                f"File: {t[0]}, Line: {t[1]}, Func: {t[2]}, Code: {t[3]}"
                for t in tb
            ]
            logging.error("Exception type: %s", ex_type.__name__)
            logging.error("Exception message: %s", ex_value)
            logging.error("Stack trace: %s", stack)
            mr.finalise_workflow(wf_key, "N")
            raise
||||
|
||||
|
||||
# ---------- DAG definition ----------
with DAG(
    dag_id=DAG_NAME,
    default_args=default_args,
    description=DAG_NAME,
    schedule_interval=None,  # manual trigger
    params=params_visible,  # visible-only; hidden merged inside task
    start_date=datetime(2025, 1, 1),
    catchup=False,
    tags=[DAG_NAME],
) as dag:

    # Single task: retrieve the TMS report(s) and land them in object storage.
    retrieve_report = PythonOperator(
        task_id="retrieve_report",
        python_callable=execute_report,
        execution_timeout=timedelta(minutes=30),
    )
||||
0
python/devo_replicator/config/.gitkeep
Normal file
0
python/devo_replicator/config/.gitkeep
Normal file
86
python/devo_replicator/config/env_config.yaml
Normal file
86
python/devo_replicator/config/env_config.yaml
Normal file
@@ -0,0 +1,86 @@
|
||||
dev:
|
||||
DEVO_USERNAME: "ap-devo_lab-mrds"
|
||||
IMPALA_HOSTNAME: "impala-proxy-devo-lab21-impala01.dw-devo-lab21.om2y56.b0.cloudera.site"
|
||||
HIVE_HOSTNAME: "hs2-devo-lab21-hive01.dw-devo-lab21.om2y56.b0.cloudera.site"
|
||||
RANGER_HOSTNAME: "https://devo-lab21-dl-gateway.devo-lab.om2y56.b0.cloudera.site:443/devo-lab21-dl/cdp-proxy-api/ranger"
|
||||
BUCKET_PREFIX: "s3a://devo-crp-ffppyd8q/"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya3tsglrzfgiyfisxchref774l5y4nrler2vn54lr3li7q"
|
||||
S3_LOCATION_URI: "https://devo-crp-ffppyd8q.bucket.vpce-040b28f5818b670c1-owicl3ow.s3.eu-central-1.vpce.amazonaws.com/{0}/db/"
|
||||
DEVO_USERNAME_RQSD: "ap-devo-rqsd-lab"
|
||||
DEVO_SECRET_RQSD: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyap6wtzobzob7qizvk4nocszlcaxhwijgzejbvryt3uzbq"
|
||||
FULL_ACCESS_LIST_RAR: "DISC-DC-RAR-R"
|
||||
FULL_ACCESS_LIST_MOPDB: ""
|
||||
FULL_ACCESS_LIST_RQSD: ""
|
||||
tst:
|
||||
DEVO_USERNAME: "ap-devo_tst-mrds"
|
||||
IMPALA_HOSTNAME: "t-impala.devo.escb.eu"
|
||||
HIVE_HOSTNAME: "hs2-devo-tst21-hive01.dw-devo-tst21.om2y56.b0.cloudera.site"
|
||||
RANGER_HOSTNAME: "https://devo-tst21-dl-gateway.devo-tst.om2y56.b0.cloudera.site:443/devo-tst21-dl/cdp-proxy-api/ranger"
|
||||
BUCKET_PREFIX: "s3a://devo-crp-sbul3ju3/"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyayqqotyowhpoml3v5szkwhmtu4rq6bplpkvdruzupz3ma"
|
||||
S3_LOCATION_URI: "https://devo-crp-sbul3ju3.bucket.vpce-040b28f5818b670c1-owicl3ow.s3.eu-central-1.vpce.amazonaws.com/{0}/db/"
|
||||
DEVO_USERNAME_RQSD: "ap-devo-rqsd-tst"
|
||||
DEVO_SECRET_RQSD: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyap6wtzobzob7qizvk4nocszlcaxhwijgzejbvryt3uzbq"
|
||||
FULL_ACCESS_LIST_RAR: "DISC-TC-RAR-R"
|
||||
FULL_ACCESS_LIST_MOPDB: ""
|
||||
FULL_ACCESS_LIST_RQSD: ""
|
||||
acc:
|
||||
DEVO_USERNAME: "ap-devo_acc-mrds"
|
||||
IMPALA_HOSTNAME: "impala-proxy-devo-acc21-impala01.dw-devo-acc21.inym23.b0.cloudera.site"
|
||||
HIVE_HOSTNAME: "hs2-devo-acc21-hive01.dw-devo-acc21.inym23.b0.cloudera.site"
|
||||
RANGER_HOSTNAME: "https://devo-acc21-dl-gateway.devo-acc.inym23.b0.cloudera.site/devo-acc21-dl/cdp-proxy-api/ranger/"
|
||||
BUCKET_PREFIX: "s3a://devo-crp-sbc9vbsu/"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya3x3nic3vxsnpzlfshz2ubj6kekny5tvaqsnwkuh2hw2a"
|
||||
S3_LOCATION_URI: "https://devo-crp-sbc9vbsu.bucket.vpce-0bf4fa440fb60935d-6m9iqoo9.s3.eu-central-1.vpce.amazonaws.com/{0}/db/"
|
||||
DEVO_USERNAME_RQSD: "ap-devo-rqsd-acc"
|
||||
DEVO_SECRET_RQSD: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyamzhgatnso57mubvg3c6k4ens3orcx4dieo6efukuvm4a"
|
||||
FULL_ACCESS_LIST_RAR: "DISC-AC-RAR-R"
|
||||
FULL_ACCESS_LIST_MOPDB: ""
|
||||
FULL_ACCESS_LIST_RQSD: ""
|
||||
prd:
|
||||
DEVO_USERNAME: "ap-devo_prd-mrds"
|
||||
IMPALA_HOSTNAME: "impala-proxy-devo-prd21-impala01.dw-devo-prd21.inym23.b0.cloudera.site"
|
||||
HIVE_HOSTNAME: "hs2-devo-prd21-hive01.dw-devo-prd21.inym23.b0.cloudera.site"
|
||||
RANGER_HOSTNAME: "https://devo-prd21-dl-gateway.devo-prd.inym23.b0.cloudera.site/devo-prd21-dl/cdp-proxy-api/ranger/"
|
||||
BUCKET_PREFIX: "s3a://devo-crp-2gn5maj9/"
|
||||
DEVO_SECRET: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyace73o3xowa3f3jkw4diqzoiyc6skt34sqnnx4yrbykmq"
|
||||
S3_LOCATION_URI: "https://devo-crp-2gn5maj9.bucket.vpce-0aa6cf4490536dfd5-qgy4w5sz.s3.eu-central-1.vpce.amazonaws.com/{0}/db/"
|
||||
DEVO_USERNAME_RQSD: "ap-devo-rqsd-prd"
|
||||
DEVO_SECRET_RQSD: "ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjyacodc43tfgumkw4qyzw4s3j4jp42vp2elakkpwwrmivqa"
|
||||
FULL_ACCESS_LIST_RAR: "DISC-PC-RAR-R"
|
||||
FULL_ACCESS_LIST_MOPDB: ""
|
||||
FULL_ACCESS_LIST_RQSD: ""
|
||||
|
||||
|
||||
rar:
|
||||
corporate_store: "crp_rar"
|
||||
oracle_metadata_table: "CORR_RAR.NH_METADATA_INVENTORY"
|
||||
oracle_igam_table: "CT_MRDS.A_DEVO_SOURCES_IGAM"
|
||||
oracle_mgmt_table: "CT_MRDS.A_DEVO_REPLICA_MGMT_RAR"
|
||||
target_s3_bucket: "rar/db"
|
||||
tech_meta_data_fields: "tec_ingestion_date String, tec_execution_date String, tec_run_id String"
|
||||
mopdb:
|
||||
corporate_store: "crp_mopdb"
|
||||
oracle_metadata_table: "CT_MOPDB.MOPDB_METADATA_INVENTORY"
|
||||
oracle_igam_table: "CT_MRDS.A_DEVO_SOURCES_IGAM"
|
||||
oracle_mgmt_table: "CT_MRDS.A_DEVO_REPLICA_MGMT_MOPDB"
|
||||
target_s3_bucket: "mopdb/db"
|
||||
tech_meta_data_fields: "tec_ingestion_date String, tec_execution_date String, tec_run_id String"
|
||||
rqsd:
|
||||
corporate_store: "crp_rqsd"
|
||||
oracle_metadata_table: "CT_MRDS.A_DEVO_METADATA_INVENTORY"
|
||||
oracle_igam_table: "CT_MRDS.A_DEVO_SOURCES_IGAM"
|
||||
oracle_mgmt_table: "CT_MRDS.A_DEVO_REPLICA_MGMT_RQSD"
|
||||
target_s3_bucket: "rqsd/db"
|
||||
tech_meta_data_fields: "tec_ingestion_date String, tec_execution_date String, tec_run_id String"
|
||||
|
||||
# -- target table name as
|
||||
# SELECT DISTINCT TABLE_ALIAS FROM {oracle_mgmt_table}
|
||||
# WHERE OWNER = ''
|
||||
# AND TABLE_NAME = '';
|
||||
|
||||
# -- type of access
|
||||
# SELECT DISTINCT RAR3_TYPE_OF_ACCESS FROM {oracle_metadata_table}
|
||||
# WHERE A_VALID_TO > SYSDATE AND
|
||||
# OWNER = ''
|
||||
# AND TABLE_NAME = '';
|
||||
0
python/devo_replicator/data_replicator/.gitkeep
Normal file
0
python/devo_replicator/data_replicator/.gitkeep
Normal file
65
python/devo_replicator/data_replicator/diag_s3_access.py
Normal file
65
python/devo_replicator/data_replicator/diag_s3_access.py
Normal file
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys, json
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError
|
||||
from botocore.config import Config
|
||||
|
||||
BUCKET = "devo-crp-sbc9vbsu"
|
||||
PREFIX = "mopdb/db/" # adjust if needed
|
||||
|
||||
def show(e):
    """Print a one-line summary of a botocore-style error to stderr.

    Reads the structured ``response`` attribute when present; missing
    fields are rendered as ``None``.
    """
    resp = getattr(e, "response", {})
    err = resp.get("Error", {})
    meta = resp.get("ResponseMetadata", {})
    print(
        f"{type(e).__name__}: {err.get('Code')} {err.get('Message')} "
        f"(RequestId={meta.get('RequestId')})",
        file=sys.stderr,
    )
|
||||
|
||||
def main(endpoint_url=None, region=None, force_path=False):
    """Run a staged S3 access diagnostic against BUCKET/PREFIX.

    Stages: (1) STS identity, (2) head_bucket reachability, (3) ListBucket
    permission with a zero-key request, (4) fetch one key to confirm the
    data path. Returns 0 on success or 1-4 identifying the failing stage.
    """
    session = boto3.Session()
    cfg = Config(s3={"addressing_style": "path" if force_path else "auto"})
    s3 = session.client("s3", region_name=region, endpoint_url=endpoint_url, config=cfg)
    sts = session.client("sts", region_name=region)

    # Stage 1: who am I?
    try:
        ident = sts.get_caller_identity()
        print(f"Caller: {ident['Arn']} (acct {ident['Account']})")
    except Exception as e:
        print("Could not call STS get-caller-identity — credentials not valid for STS.", file=sys.stderr)
        show(e)
        return 1

    # Stage 2: is the bucket reachable at all?
    try:
        s3.head_bucket(Bucket=BUCKET)
        print(f"head_bucket OK on s3://{BUCKET}")
    except ClientError as e:
        print("head_bucket failed:", file=sys.stderr)
        show(e)
        return 2

    # Stage 3: list with zero keys to test just the ListBucket permission.
    try:
        s3.list_objects_v2(Bucket=BUCKET, Prefix=PREFIX, MaxKeys=0)
        print(f"list_objects_v2 OK on prefix '{PREFIX}' (permission exists)")
    except ClientError as e:
        print("list_objects_v2 failed:", file=sys.stderr)
        show(e)
        return 3

    # Stage 4: ask for 1 key to confirm the data path works.
    try:
        resp = s3.list_objects_v2(Bucket=BUCKET, Prefix=PREFIX, MaxKeys=1)
        # BUG FIX: the original indexed resp.get("Contents", [{}])[0], which
        # raises IndexError when "Contents" is present but empty.
        contents = resp.get("Contents") or []
        print("First key:", contents[0].get("Key") if contents else None)
    except ClientError as e:
        print("list_objects_v2 (MaxKeys=1) failed:", file=sys.stderr)
        show(e)
        return 4

    return 0
|
||||
|
||||
if __name__ == "__main__":
    # Minimal hand-rolled CLI: --endpoint-url URL --region REGION --force-path
    url = None
    reg = None
    force = False
    argv = sys.argv
    for i, arg in enumerate(argv):
        # BUG FIX: the original read sys.argv[i+1] unconditionally and
        # crashed with IndexError when a value flag was the last token.
        if arg == "--endpoint-url" and i + 1 < len(argv):
            url = argv[i + 1]
        elif arg == "--region" and i + 1 < len(argv):
            reg = argv[i + 1]
        elif arg == "--force-path":
            force = True
    sys.exit(main(endpoint_url=url, region=reg, force_path=force))
|
||||
129
python/devo_replicator/data_replicator/impala_refresher.py
Normal file
129
python/devo_replicator/data_replicator/impala_refresher.py
Normal file
@@ -0,0 +1,129 @@
|
||||
import os
|
||||
import yaml
|
||||
import datetime
|
||||
import pandas as pd
|
||||
from mrds.utils.secrets import get_secret
|
||||
import mrds.utils.manage_runs as runManager
|
||||
import mrds.utils.manage_files as fileManager
|
||||
import mrds.utils.sql_statements as sqls
|
||||
|
||||
import oci
|
||||
|
||||
from impala.dbapi import (
|
||||
connect,
|
||||
ProgrammingError,
|
||||
DatabaseError,
|
||||
IntegrityError,
|
||||
OperationalError,
|
||||
)
|
||||
from impala.error import HiveServer2Error
|
||||
|
||||
|
||||
def get_impala_connection(hostname: str, user: str, secret: str):
    """Open an Impala connection over HTTPS (HiveServer2 HTTP transport, PLAIN auth)."""
    return connect(
        host=hostname,
        port=443,
        auth_mechanism="PLAIN",
        user=user,
        password=secret,
        use_http_transport=True,
        http_path="cliservice",
        use_ssl=True,
    )
|
||||
|
||||
def execute_query(query: str, user, hostname, password):
    """Open an Impala connection, execute *query*, and return (columns, result).

    The connection is closed by execute_devo_query's ``finally`` block,
    so every call uses a fresh connection.
    """
    conn = get_impala_connection(hostname, user, password)
    # BUG FIX: removed leftover ``print(conn)`` debug output.
    return execute_devo_query(query, conn)
|
||||
|
||||
def execute_devo_query(query: str, conn):
    """Execute *query* on an open Impala connection and return (None, rowcount).

    This variant never fetches result rows — it is intended for refresh/DDL
    statements (INVALIDATE METADATA / COMPUTE STATS).

    Raises:
        Exception: wrapping the driver-specific error with a message
        identifying the failure category.

    Note: closes both the cursor and the caller-supplied connection in the
    ``finally`` block, so *conn* must not be reused after this call.
    """
    # BUG FIX: every ``raise Exception(status_code=..., detail=...)`` below
    # originally failed with TypeError — Exception takes no keyword
    # arguments. Replaced with positional messages and ``from`` chaining.
    cursor = None
    try:
        cursor = conn.cursor()
        cursor.execute(query)
        return None, cursor.rowcount  # rows affected; no fetch for DDL
    except OperationalError as oe:
        raise Exception("Failed to connect to Impala: " + str(oe)) from oe
    except ProgrammingError as pe:
        raise Exception("Query syntax error: " + str(pe)) from pe
    except IntegrityError as ie:
        raise Exception("Insufficient permissions: " + str(ie)) from ie
    except DatabaseError as db_err:
        raise Exception("Database error: " + str(db_err)) from db_err
    except HiveServer2Error as au_err:
        raise Exception("HiveServer2Error error: " + str(au_err)) from au_err
    except Exception as e:
        raise Exception("An unexpected error occurred: " + str(e)) from e
    finally:
        try:
            if cursor:
                cursor.close()
            if conn:
                conn.close()
        except Exception as close_err:
            raise Exception("Failed to close the connection: " + str(close_err)) from close_err
|
||||
|
||||
def initialize_task(workflow_context, task_name):
    """Register *task_name* under the current workflow run and return its task-history key."""
    return runManager.init_task(
        task_name,
        workflow_context["run_id"],
        workflow_context["a_workflow_history_key"],
    )
|
||||
|
||||
def initialize_config(config_file_path):
    """Load the YAML configuration at *config_file_path* and return its contents.

    Raises:
        FileNotFoundError: when the path does not exist.
    """
    if not os.path.exists(config_file_path):
        raise FileNotFoundError(f"Configuration file {config_file_path} not found.")
    with open(config_file_path, "r") as handle:
        return yaml.safe_load(handle)
|
||||
|
||||
def main(env_config_path, env, table, corporate_store):
    """Refresh Impala metadata and statistics for one replicated table.

    Loads per-environment settings from *env_config_path*, resolves the DEVO
    service password from the OCI vault, then runs INVALIDATE METADATA and
    COMPUTE STATS on ``<corporate_store>.<table>``.

    Returns True on success; raises on any failure.
    """
    # init setup
    envs_info = initialize_config(env_config_path)
    environment_info = envs_info[env]

    try:
        devo_secret_name = environment_info["DEVO_SECRET"]
        password = get_secret(devo_secret_name)
    except Exception as exc:
        # BUG FIX: the original bare ``except:`` + ``raise(Exception)`` could
        # catch KeyboardInterrupt/SystemExit and discarded the original cause.
        print("Failed to retrieve credentials from secrets")
        raise Exception("Failed to retrieve credentials from secrets") from exc

    # refresh Impala's view of the replicated table
    try:
        execute_query(f"INVALIDATE METADATA {corporate_store}.{table}", environment_info['DEVO_USERNAME'], environment_info['IMPALA_HOSTNAME'], password)
        execute_query(f"COMPUTE STATS {corporate_store}.{table}", environment_info['DEVO_USERNAME'], environment_info['IMPALA_HOSTNAME'], password)
    except Exception as exc:
        print("Failed to retrieve DEVO data, error during connection or request")
        raise Exception("Failed to retrieve DEVO data, error during connection or request") from exc
    return True
|
||||
|
||||
128
python/devo_replicator/data_replicator/list_s3_files_fast.py
Normal file
128
python/devo_replicator/data_replicator/list_s3_files_fast.py
Normal file
@@ -0,0 +1,128 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse, sys
|
||||
from urllib.parse import urlparse
|
||||
import boto3
|
||||
from botocore.config import Config
|
||||
from botocore.exceptions import ClientError, EndpointConnectionError, NoCredentialsError, ReadTimeoutError, ConnectTimeoutError
|
||||
|
||||
def parse_s3_uri(s3_uri: str):
    """Split an ``s3://bucket/key`` URI into (bucket, key-prefix).

    Raises:
        ValueError: when the scheme is not ``s3://`` or the bucket is missing.
    """
    if not s3_uri.startswith("s3://"):
        raise ValueError("S3 URI must start with 's3://'")
    parsed = urlparse(s3_uri)
    bucket = parsed.netloc
    if not bucket:
        raise ValueError("Missing bucket in S3 URI")
    return bucket, parsed.path.lstrip("/")
|
||||
|
||||
def parse_location(location: str):
    """Accept s3://... OR https://...amazonaws.com/... and return (bucket, prefix)."""
    if location.startswith("s3://"):
        return parse_s3_uri(location)

    if not location.startswith(("http://", "https://")):
        raise ValueError(f"Unsupported location: {location}")

    parsed = urlparse(location)
    host = parsed.netloc
    key_path = parsed.path.lstrip("/")

    # Bucket-scoped VPCe host: <bucket>.bucket.vpce-xxxx.s3.<region>.vpce.amazonaws.com
    if ".bucket." in host:
        return host.split(".bucket.", 1)[0], key_path
    # Virtual-hosted style: <bucket>.s3.<region>...
    if ".s3." in host and not host.startswith("s3."):
        return host.split(".s3.", 1)[0], key_path
    # Path style: s3.<region>.../<bucket>/<prefix>
    if host.startswith("s3."):
        bucket, _, prefix = key_path.partition("/")
        return bucket, prefix

    raise ValueError(f"Unsupported location: {location}")
|
||||
|
||||
def iter_keys(s3, bucket: str, prefix: str, page_size: int, max_items: int, verbose: bool):
    """Yield object keys under bucket/prefix via paginated ListObjectsV2.

    ``max_items == 0`` means "no limit". When *verbose*, page progress goes
    to stderr so stdout stays a clean key listing.
    """
    # BUG FIX: removed a leftover ``print('here')`` that polluted stdout —
    # callers pipe this generator's output as a key list.
    paginator = s3.get_paginator("list_objects_v2")
    pagination = {"PageSize": page_size}
    if max_items > 0:
        pagination["MaxItems"] = max_items

    total = 0
    page_num = 0
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix, PaginationConfig=pagination):
        page_num += 1
        contents = page.get("Contents", []) or []
        if verbose:
            print(f"[page {page_num}] fetched {len(contents)} keys (running total={total + len(contents)})",
                  file=sys.stderr, flush=True)
        for obj in contents:
            yield obj["Key"]
            total += 1
||||
|
||||
def main():
    """CLI entry point: parse arguments, preflight the S3 endpoint, then
    stream every key under the given location to stdout.

    Exit codes: 1 for auth/endpoint/AWS errors, 130 on Ctrl-C.
    """
    ap = argparse.ArgumentParser(description="List files under an S3 location quickly and safely.")
    ap.add_argument("location", help="s3://bucket/prefix/ OR https://<vpc-endpoint-host>/<prefix>")
    ap.add_argument("--region", default=None, help="AWS region (e.g., eu-central-1)")
    ap.add_argument("--profile", default=None, help="AWS profile to use")
    ap.add_argument("--endpoint-url", default=None,
                    help="Custom S3 endpoint (e.g., https://s3.eu-central-1.vpce.amazonaws.com)")
    ap.add_argument("--force-path-addressing", action="store_true",
                    help="Force path-style addressing (useful with bucket-scoped VPCe hostnames)")
    ap.add_argument("--page-size", type=int, default=1000, help="S3 page size (default 1000)")
    ap.add_argument("--max-items", type=int, default=0, help="Stop after N keys (0 = no limit)")
    ap.add_argument("--connect-timeout", type=float, default=10.0, help="Seconds (default 10)")
    ap.add_argument("--read-timeout", type=float, default=30.0, help="Seconds (default 30)")
    ap.add_argument("--retries", type=int, default=3, help="Max retry attempts (default 3)")
    ap.add_argument("--relative", action="store_true", help="Print keys relative to the prefix")
    ap.add_argument("--verbose", "-v", action="store_true", help="Print progress to stderr")
    args = ap.parse_args()

    # Location may be an s3:// URI or any of the https:// host styles.
    bucket, prefix = parse_location(args.location)

    # Session & client with explicit timeouts and optional path addressing
    sess_kwargs = {}
    if args.profile:
        sess_kwargs["profile_name"] = args.profile
    session = boto3.Session(**sess_kwargs)

    cfg = Config(
        connect_timeout=args.connect_timeout,
        read_timeout=args.read_timeout,
        retries={"max_attempts": args.retries, "mode": "standard"},
        s3={"addressing_style": "path" if args.force_path_addressing else "auto"},
    )

    s3 = session.client("s3", region_name=args.region, endpoint_url=args.endpoint_url, config=cfg)

    # Quick preflight: try a 0-key list to surface auth/endpoint issues fast
    try:
        _ = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=0)
    except ClientError as e:
        print(f"Preflight failed (auth/permissions/endpoint): {e}", file=sys.stderr)
        sys.exit(1)
    except (EndpointConnectionError, ReadTimeoutError, ConnectTimeoutError) as e:
        print(f"Network/endpoint error: {e}", file=sys.stderr)
        sys.exit(1)

    # Main listing loop: keys go to stdout, one per line.
    try:
        for key in iter_keys(s3, bucket, prefix, args.page_size, args.max_items, args.verbose):
            if args.relative and prefix and key.startswith(prefix):
                print(key[len(prefix):].lstrip("/"))
            else:
                print(f"s3://{bucket}/{key}")
    except KeyboardInterrupt:
        print("\nInterrupted.", file=sys.stderr)
        sys.exit(130)
    except NoCredentialsError:
        print("No AWS credentials found. Set env vars or use --profile.", file=sys.stderr)
        sys.exit(1)
    except (EndpointConnectionError, ReadTimeoutError, ConnectTimeoutError) as e:
        print(f"Network/timeout listing objects: {e}", file=sys.stderr)
        sys.exit(1)
    except ClientError as e:
        print(f"AWS error: {e}", file=sys.stderr)
        sys.exit(1)
||||
|
||||
# Script entry point — safe to import without side effects.
if __name__ == "__main__":
    main()
|
||||
0
python/devo_replicator/table_generator/.gitkeep
Normal file
0
python/devo_replicator/table_generator/.gitkeep
Normal file
145
python/devo_replicator/table_generator/FlowOptions.py
Normal file
145
python/devo_replicator/table_generator/FlowOptions.py
Normal file
@@ -0,0 +1,145 @@
|
||||
class Options:
    """Read-mostly container for replication-flow configuration options.

    Recognized keys from the constructor's *args* dict are copied into
    ``self.options``; unknown keys are silently ignored. Each recognized key
    is also exposed as a read-only property of the same name.
    """

    # Every option key this container recognizes. One read-only property per
    # key is generated below the class body.
    _OPTION_KEYS = (
        "corporate_store", "service_name", "source_schema", "source_table",
        "access_type", "oracle_metadata_table", "oracle_igam_table",
        "query_metadata_access_type1", "query_metadata_access_type2a",
        "query_igam_roles", "ora_jdbc_url_dwh", "ora_jdbc_url_ods",
        "sql_file_path", "sql_filename_grants", "sentry_role_environment",
        "ranger_script", "type3_access_table", "type3_access_table_key_column",
        "type3_source_table_key_column", "target_s3_bucket", "ranger_s3_bucket",
        "ranger_s3_path", "rar_full_access_entitlement_list", "target_table",
        "tech_meta_data_fields", "full_access_entitlement_list",
    )

    def __init__(self, args):
        """Initialize every option to None, then copy recognized keys from *args*."""
        self.options = {key: None for key in self._OPTION_KEYS}
        self.initialize_options(args)

    def initialize_options(self, args):
        """Copy recognized key/value pairs from the *args* dict into the options dict."""
        for key, value in args.items():
            if key in self.options:
                self.options[key] = value

    def get_option_value(self, key):
        """Return the stored value for *key*, or "" when the key is unknown."""
        return self.options.get(key, "")


def _make_option_property(option_key):
    # Factory avoids the late-binding-closure pitfall: each property captures
    # its own option_key.
    def _getter(self):
        return self.get_option_value(option_key)
    return property(_getter)


# Generate one read-only property per option key (replaces 26 identical
# hand-written @property getters with the exact same behavior).
for _key in Options._OPTION_KEYS:
    setattr(Options, _key, _make_option_property(_key))
del _key
|
||||
73
python/devo_replicator/table_generator/devo_query.py
Normal file
73
python/devo_replicator/table_generator/devo_query.py
Normal file
@@ -0,0 +1,73 @@
|
||||
from impala.dbapi import (
|
||||
connect,
|
||||
ProgrammingError,
|
||||
DatabaseError,
|
||||
IntegrityError,
|
||||
OperationalError,
|
||||
)
|
||||
from impala.error import HiveServer2Error
|
||||
|
||||
|
||||
def get_DEVO_connection(hostname: str, user: str, secret: str):
    """Open an HTTPS (HiveServer2-over-HTTP, PLAIN auth) connection to the DEVO Impala proxy."""
    return connect(
        host=hostname,
        port=443,
        auth_mechanism="PLAIN",
        user=user,
        password=secret,
        use_http_transport=True,
        http_path="cliservice",
        use_ssl=True,
    )
|
||||
|
||||
|
||||
def execute_devo_query(query: str, conn):
    """Execute *query* on an open DEVO (Impala) connection.

    Returns:
        (columns, rows) for SELECT statements — column-name list plus the
        fetched result set;
        (None, rowcount) for any other statement.

    Raises:
        Exception: wrapping the driver-specific error with a message
        identifying the failure category.

    Note: both the cursor and the caller-supplied connection are closed in
    the ``finally`` block, so *conn* must not be reused after this call.
    """
    cursor = None
    try:
        cursor = conn.cursor()
        cursor.execute(query)
        # Check if the query is a SELECT query (i.e., reads data)
        if query.strip().lower().startswith("select"):
            rows = cursor.fetchall()
            columns = [col[0] for col in cursor.description]
            return columns, rows
        else:
            # For non-SELECT queries (e.g., INSERT, UPDATE, DELETE), just return affected rows
            return None, cursor.rowcount  # rowcount returns the number of rows affected

    except OperationalError as oe:
        raise Exception("Failed to connect to DEVO: " + str(oe)) from oe

    except ProgrammingError as pe:
        raise Exception("Query syntax error: " + str(pe)) from pe

    except IntegrityError as ie:
        raise Exception("Insufficient permissions: " + str(ie)) from ie

    except DatabaseError as db_err:
        raise Exception("Database error: " + str(db_err)) from db_err

    except HiveServer2Error as au_err:
        raise Exception("HiveServer2Error error: " + str(au_err)) from au_err

    finally:
        try:
            if cursor:
                cursor.close()
            # BUG FIX: the original tested ``if not conn`` and therefore never
            # closed a live connection — every call leaked one connection.
            if conn:
                conn.close()
        except Exception as e:
            # BUG FIX: the original raised Exception(status_code=..., detail=...)
            # here, which itself fails (Exception takes no keyword arguments).
            raise Exception(f"Failed to close the cursor or impala connection: {str(e)}") from e
|
||||
|
||||
def execute_query(query: str, user: str, hostname: str, password):
    """Convenience wrapper: open a DEVO connection, run *query*, return (columns, result)."""
    connection = get_DEVO_connection(hostname, user, password)
    return execute_devo_query(query, connection)
|
||||
|
||||
|
||||
#sql="CREATE EXTERNAL TABLE IF NOT EXISTS crp_rar.testInternalTable ( iid STRING,RANDOM_DATE DATE, number int) ;"
|
||||
#sql_drop="DROP TABLE IF EXISTS crp_rar.NH_PRICE"
|
||||
|
||||
#print( execute_query("SELECT 1","ap-informatica-ipcwt","t-impala.devo.escb.eu","Start_123456789"))
|
||||
#print( execute_query("SELECT 1","ap-devo_tst-mrds","t-impala.devo.escb.eu","V1XqZ*#fvwQl=nRG*idI"))
|
||||
#print( execute_query("SELECT 1","ap-devo_lab-mrds","impala-proxy-devo-lab21-impala01.dw-devo-lab21.om2y56.b0.cloudera.site","PHkvyVonyePAmZD8wUuw!"))
|
||||
69
python/devo_replicator/table_generator/ranger_test.py
Normal file
69
python/devo_replicator/table_generator/ranger_test.py
Normal file
@@ -0,0 +1,69 @@
|
||||
## Ad-hoc exploration script: build/inspect/delete Apache Ranger policies.
# NOTE(review): this script performs live network calls at import time
# against the LAB Ranger admin; keep it out of production code paths.

from apache_ranger.model.ranger_service import *
from apache_ranger.client.ranger_client import *
from apache_ranger.model.ranger_policy import *
from mrds.utils.secrets import get_secret

## Step 1: create a client to connect to Apache Ranger admin
ranger_url ="https://devo-lab21-dl-gateway.devo-lab.om2y56.b0.cloudera.site:443/devo-lab21-dl/cdp-proxy-api/ranger"
# Password is resolved from the OCI vault by secret OCID (not stored in code).
password= get_secret("ocid1.vaultsecret.oc1.eu-frankfurt-1.amaaaaaa2ky4jjya3tsglrzfgiyfisxchref774l5y4nrler2vn54lr3li7q")
ranger_auth = ('ap-devo_lab-mrds', password)

# For Kerberos authentication
#
# from requests_kerberos import HTTPKerberosAuth
#
# ranger_auth = HTTPKerberosAuth()

ranger = RangerClient(ranger_url, ranger_auth)
# SECURITY: SSL certificate validation is disabled — not recommended for
# production use.
ranger.session.verify = False


## Step 2: build a policy object (hard-coded sample values)

policy = RangerPolicy()
policy.service = "cm_hive"  # hard-coded service name
policy.name = 'cpo_crp_mopdb_sgroi_1'  # naming: corporatestore_table_accessType
policy.resources = { 'database': RangerPolicyResource({ 'values': ['crp_RQSD'] }),
                     'table': RangerPolicyResource({ 'values': ['ANNEX_1_1_ALL'] }),
                     'column': RangerPolicyResource({ 'values': ['*'] }) }  # change with correct values
allowItem1 = RangerPolicyItem()  # to try allowItem1.groups
allowItem1.groups = ["d_mopdb_mpec"]
#allowItem1.users = [] #to try for single users
allowItem1.accesses = [ RangerPolicyItemAccess({ 'type': 'create' }),
                        RangerPolicyItemAccess({ 'type': 'alter' }),
                        RangerPolicyItemAccess({ 'type': 'select' }),
                        RangerPolicyItemAccess({ 'type': 'drop' }) ]

"""denyItem1 = RangerPolicyItem()
denyItem1.users = [ 'admin' ] #does it make sense to deny and not allow?
denyItem1.accesses = [ RangerPolicyItemAccess({ 'type': 'drop' }) ]"""

policy.policyItems = [ allowItem1 ]
#policy.denyPolicyItems = [ denyItem1 ]
#policy2=ranger.get_policy_by_id(policyId=5086)
#print(ranger.get_policy(serviceName="cm_hive",policyName='crp_rar_testinternalTable_alcesso1'))
#print(ranger.find_policies({"service": "cm_hive", "resources": {"database": {"values": ["crp_rar"], "isExcludes": False , "isRecursive": False}, "column": {"values": ["*"], "isExcludes": False, "isRecursive": False}, "table": {"values": ["testInternalTable"], "isExcludes": False, "isRecursive": False}}}))
#print(ranger.delete_policy(serviceName="cm_hive",policyName="crp_rar_testinternalTable_alcesso1"))
#print(policy2)
#print('Creating policy: name=' + policy.name)
#created_policy = ranger.create_policy(policy)

#print(' created policy: name=' + created_policy.name + ', id=' + str(created_policy.id))

## Step 4: Delete policy and service created above
#print('Deleting policy: id=' + str(created_policy.id))

#ranger.delete_policy_by_id(created_policy.id)

# Dump every cm_hive policy for offline inspection.
data=ranger.get_policies_in_service(serviceName="cm_hive")
with open("output.txt", "w") as file:
    for string in data:
        file.write(str(string))
# BUG FIX: removed the redundant ``file.close()`` that sat inside the
# ``with`` block — the context manager already closes the file on exit.
||||
250
python/devo_replicator/table_generator/ranger_updater.py
Normal file
250
python/devo_replicator/table_generator/ranger_updater.py
Normal file
@@ -0,0 +1,250 @@
|
||||
from typing import List, Optional
|
||||
from apache_ranger.model.ranger_service import *
|
||||
from apache_ranger.client.ranger_client import *
|
||||
from apache_ranger.model.ranger_policy import *
|
||||
import re
|
||||
def add_table_permission_groups(corporate_store: str, target_table: str, access_type: str, source_table: str, igam_entitlement_list: List[str], columns_list: Optional[List[str]] = None, row_list: Optional[List[str]] = None):
    """Build the permission-spec dict used to create a Ranger policy.

    The IGAM entitlements are lower-cased (blank entries dropped); the
    special IGAM sentry source additionally grants the ``public`` group.
    ``columns_list``/``row_list`` default to the wildcard ``["*"]``.
    """
    entitlements = list(igam_entitlement_list)
    if source_table.lower() == "rar_sources_igam_sentry":
        entitlements = entitlements + ["public"]

    columns = columns_list if columns_list is not None else ["*"]
    row_filter = row_list if row_list is not None else ["*"]
    filter_condition = ",".join(f"'{row}'" for row in row_filter)
    igam_roles = [entry.lower() for entry in entitlements if entry != ""]

    return {
        'corporate_store': corporate_store,
        'target_table': target_table,
        'access_type': access_type,
        'columns': columns,
        'rows': filter_condition,
        'igam_roles': igam_roles,
    }
|
||||
|
||||
from typing import List, Optional
|
||||
# --- helpers ---------------------------------------------------------------
|
||||
|
||||
def _policy_name_from_params(config, policy_id: Optional[str] = None) -> Optional[str]:
|
||||
"""
|
||||
Build the exact policy name used by your create functions.
|
||||
Returns None for types where we need to match multiple (e.g., 2a without id).
|
||||
"""
|
||||
cs = config['corporate_store'].lower()
|
||||
tbl = config['target_table'].lower()
|
||||
at = config['access_type'].lower()
|
||||
base = f"cpo_{cs}_{tbl}_{at}"
|
||||
|
||||
if at == "1":
|
||||
# yaml_format_1
|
||||
return base
|
||||
elif at == "2a":
|
||||
# yaml_format_2a -> requires policy_id to be exact
|
||||
if policy_id:
|
||||
return f"{base}_policy_{policy_id}"
|
||||
# without policy_id, we’ll delete all that start with this prefix
|
||||
return None
|
||||
elif at == "2b":
|
||||
# yaml_format_2b
|
||||
return f"{base}_row_level_policy"
|
||||
elif at == "3":
|
||||
# yaml_format_3 uses same name pattern as 2b in your script
|
||||
return f"{base}_row_level_policy"
|
||||
else:
|
||||
raise ValueError(f"Invalid access type '{config['access_type']}'. Expected one of: 1, 2a, 2b, 3.")
|
||||
|
||||
|
||||
def _ranger_client(env_config) -> RangerClient:
    """Create a RangerClient authenticated with the Devo service credentials."""
    credentials = (env_config['DEVO_USERNAME'], env_config['DEVO_SECRET'])
    client = RangerClient(env_config['RANGER_HOSTNAME'], credentials)
    # NOTE(review): TLS certificate verification is disabled — confirm intended.
    client.session.verify = False
    return client
|
||||
|
||||
|
||||
# --- main deletion API -----------------------------------------------------
|
||||
|
||||
def delete_policy(config, env_config, policy_id: Optional[str] = None) -> List[str]:
    """
    Delete Ranger policy/policies by name based on:
      - config['corporate_store']
      - config['target_table']
      - config['access_type']: "1", "2a", "2b", "3"
      - policy_id: optional (only meaningful for '2a')

    All policies in service 'cm_hive' whose name starts with the computed
    prefix and matches the known naming suffixes are deleted.

    Returns a list of deleted policy names.

    Raises:
        RuntimeError: when no matching policy was deleted.
    """
    ranger = _ranger_client(env_config)
    service_name = "cm_hive"  # BUG FIX: was assigned twice
    page_size = 1000

    cs = config['corporate_store'].lower()
    tbl = config['target_table'].lower()
    at = config['access_type'].lower()  # kept: validates the key is present
    prefix = f"cpo_{cs}_{tbl}_"
    print(prefix)

    # Fetch all policies for the service in pages and filter client-side
    # to reduce API calls.
    start = 0
    candidates: List[dict] = []
    while True:
        page = ranger.get_policies_in_service(
            service_name, params={"pageSize": page_size, "startIndex": start}
        ) or []
        candidates.extend(page)
        if len(page) < page_size:
            break
        start += len(page)

    # BUG FIX: the prefix contains caller-supplied names; escape it so regex
    # metacharacters in table/store names cannot widen (or break) the match.
    name_pattern = re.compile(
        re.escape(prefix)
        + r"([0-9]?[a-z]?)(_policy_)?([0-9]*)?(_row_level_policy)?(full_access)?$"
    )

    deleted: List[str] = []
    for policy in candidates:
        name = policy["name"]
        print(f"analizing policy:{name}")
        if name_pattern.fullmatch(name) is not None:  # BUG FIX: was `!= None`
            try:
                ranger.delete_policy_by_id(policy["id"])
                deleted.append(name)
            except Exception as exc:
                # best effort: keep deleting the remaining matches,
                # but don't silently hide the failure anymore
                print(f"failed to delete policy '{name}': {exc}")

    if not deleted:
        raise RuntimeError(
            f"No matching policies found for deletion with prefix '{prefix}'. "
        )
    return deleted
|
||||
|
||||
|
||||
|
||||
def generate_policy(params, env_config, policy_id: Optional[str] = None,
                    filter_string: Optional[str] = None,
                    full_access_list: Optional[List] = None):
    """Dispatch to the policy builder matching params['access_type'].

    Backward-compatible optional pass-throughs for the row-level builders:
      - filter_string: row-filter expression used by access type 3
      - full_access_list: groups granted an unfiltered view (types 2b/3)

    Raises:
        Exception: when the access type is not one of 1, 2a, 2b, 3.
    """
    access_type = params['access_type'].lower()
    if access_type == "1":
        return yaml_format_1(params, env_config)
    elif access_type == "2a":
        return yaml_format_2a(params, env_config, policy_id)
    elif access_type == "2b":
        # BUG FIX: previously dispatched to yaml_format_1, skipping row-level rules
        return yaml_format_2b(params, env_config, full_access_list)
    elif access_type == "3":
        # BUG FIX: yaml_format_3 requires env_config, filter string and full-access list
        return yaml_format_3(params, env_config, filter_string, full_access_list)
    else:
        raise Exception(f"Invalid access type {params['access_type']}. Please check the input param")
|
||||
|
||||
def yaml_format_1(params, env_config) -> "RangerPolicy":
    """Create a column-level SELECT policy in Ranger (access type 1).

    Grants SELECT on the configured database/table/columns to the IGAM groups.
    Returns the RangerPolicy object that was submitted.
    (BUG FIX: the annotation said `-> str` but a RangerPolicy is returned.)
    """
    ranger = _ranger_client(env_config)

    policy = RangerPolicy()
    policy.service = "cm_hive"  # hardcoded Hive service name
    # naming scheme: cpo_<corporatestore>_<table>_<accessType>
    policy.name = f"cpo_{params['corporate_store'].lower()}_{params['target_table'].lower()}_{params['access_type'].lower()}"
    policy.resources = {
        'database': RangerPolicyResource({'values': [params['corporate_store'].lower()]}),
        'table': RangerPolicyResource({'values': [params['target_table']]}),
        'column': RangerPolicyResource({'values': params['columns']}),
    }

    allowItem1 = RangerPolicyItem()
    allowItem1.groups = params['igam_roles']
    allowItem1.accesses = [RangerPolicyItemAccess({'type': 'select'})]
    policy.policyItems = [allowItem1]

    created_policy = ranger.create_policy(policy)
    print('Created policy: name=' + created_policy.name + ', id=' + str(created_policy.id))
    return policy
|
||||
|
||||
def yaml_format_2a(params, env_config, policy_id: Optional[str]) -> "RangerPolicy":
    """Create a per-policy-id column-level SELECT policy (access type 2a).

    policy_id defaults to "0" when None so the generated name stays unique
    per id. Returns the RangerPolicy object that was submitted.
    (BUG FIX: the annotation said `-> str` but a RangerPolicy is returned.)
    """
    policy_ID = policy_id if policy_id is not None else "0"

    ranger = _ranger_client(env_config)

    policy = RangerPolicy()
    policy.service = "cm_hive"  # hardcoded Hive service name
    # naming scheme: cpo_<corporatestore>_<table>_<accessType>_policy_<id>
    policy.name = f"cpo_{params['corporate_store'].lower()}_{params['target_table'].lower()}_{params['access_type'].lower()}_policy_{policy_ID}"
    policy.resources = {
        'database': RangerPolicyResource({'values': [params['corporate_store'].lower()]}),
        'table': RangerPolicyResource({'values': [params['target_table']]}),
        'column': RangerPolicyResource({'values': params['columns']}),
    }

    allowItem1 = RangerPolicyItem()
    allowItem1.groups = params['igam_roles']
    allowItem1.accesses = [RangerPolicyItemAccess({'type': 'select'})]
    policy.policyItems = [allowItem1]

    created_policy = ranger.create_policy(policy)
    print(' created policy: name=' + created_policy.name + ', id=' + str(created_policy.id))
    return policy
|
||||
|
||||
def yaml_format_2b(params, env_config, full_access_list: Optional[List]) -> "RangerPolicy":
    """Create a row-level-filter policy (access type 2b).

    Item 1 filters rows for the IGAM groups via the sentry/AD mapping tables;
    item 2 grants an unfiltered (1=1) view to the full-access groups.
    Returns the RangerPolicy object that was submitted.
    """
    ranger = _ranger_client(env_config)

    policy = RangerPolicy()
    policy.service = "cm_hive"  # hardcoded Hive service name
    policy.name = f"cpo_{params['corporate_store'].lower()}_{params['target_table'].lower()}_{params['access_type'].lower()}_row_level_policy"
    policy.isEnabled = True
    policy.resources = {
        'database': RangerPolicyResource({'values': [params['corporate_store'].lower()]}),
        'table': RangerPolicyResource({'values': [params['target_table']]}),
    }

    rowFilterAllowItem1 = RangerRowFilterPolicyItem()
    rowFilterAllowItem1.groups = params['igam_roles']
    rowFilterAllowItem1.accesses = [RangerPolicyItemAccess({'type': 'select'})]
    rowFilterAllowItem1.rowFilterInfo = RangerPolicyItemRowFilterInfo({ 'filterExpr': f"lower(source) IN (select lower(rar_subsource_id) from {params['corporate_store'].lower()}.t_ref_rar_sources_igam_sentry where lower(rar_igam_entitlement) IN (select ad_group from {params['corporate_store'].lower()}.active_directory_user_groups where username = lower(regexp_extract(current_user(),'[^@]*',0))))" })

    rowFilterAllowItem2 = RangerRowFilterPolicyItem()
    # BUG FIX: full_access_list is Optional but was iterated unconditionally;
    # None now yields an empty group list instead of a TypeError.
    rowFilterAllowItem2.groups = [x.lower() for x in (full_access_list or [])]
    rowFilterAllowItem2.accesses = [RangerPolicyItemAccess({'type': 'select'})]
    rowFilterAllowItem2.rowFilterInfo = RangerPolicyItemRowFilterInfo({'filterExpr': "1=1"})

    policy.rowFilterPolicyItems = [rowFilterAllowItem1, rowFilterAllowItem2]

    created_policy = ranger.create_policy(policy)
    print(' created policy: name=' + created_policy.name + ', id=' + str(created_policy.id))
    return policy
|
||||
|
||||
|
||||
def yaml_format_3(params, env_config, filterString, full_access_list: Optional[List]) -> "RangerPolicy":
    """Create a row-level-filter policy with a caller-supplied filter (access type 3).

    Item 1 applies *filterString* to the IGAM groups; item 2 grants an
    unfiltered (1=1) view to the full-access groups.
    Returns the RangerPolicy object that was submitted.
    """
    ranger = _ranger_client(env_config)

    policy = RangerPolicy()
    policy.service = "cm_hive"  # hardcoded Hive service name
    policy.name = (
        f"cpo_{params['corporate_store'].lower()}_"
        f"{params['target_table'].lower()}_"
        f"{params['access_type'].lower()}_row_level_policy"
    )
    policy.isEnabled = True
    policy.resources = {
        "database": RangerPolicyResource({"values": [params["corporate_store"].lower()]}),
        "table": RangerPolicyResource({"values": [params["target_table"]]}),
    }

    # Row filter item: caller-supplied expression for the IGAM groups
    rowFilterAllowItem = RangerRowFilterPolicyItem()
    rowFilterAllowItem.groups = params["igam_roles"]
    rowFilterAllowItem.accesses = [RangerPolicyItemAccess({"type": "select"})]
    rowFilterAllowItem.rowFilterInfo = RangerPolicyItemRowFilterInfo(
        {
            "filterExpr": filterString
        }
    )

    rowFilterAllowItem2 = RangerRowFilterPolicyItem()
    # BUG FIX: full_access_list is Optional but was iterated unconditionally;
    # None now yields an empty group list instead of a TypeError.
    rowFilterAllowItem2.groups = [x.lower() for x in (full_access_list or [])]
    rowFilterAllowItem2.accesses = [RangerPolicyItemAccess({'type': 'select'})]
    rowFilterAllowItem2.rowFilterInfo = RangerPolicyItemRowFilterInfo({'filterExpr': "1=1"})

    policy.rowFilterPolicyItems = [rowFilterAllowItem, rowFilterAllowItem2]

    # Create policy in Ranger
    created_policy = ranger.create_policy(policy)
    print(f" created policy: name={created_policy.name}, id={created_policy.id}")

    return policy
|
||||
345
python/devo_replicator/table_generator/ranger_updater_old.py
Normal file
345
python/devo_replicator/table_generator/ranger_updater_old.py
Normal file
@@ -0,0 +1,345 @@
|
||||
from typing import List, Optional
|
||||
from apache_ranger.model.ranger_service import *
|
||||
from apache_ranger.client.ranger_client import *
|
||||
from apache_ranger.model.ranger_policy import *
|
||||
import re
|
||||
def add_table_permission_groups(corporate_store: str, target_table: str, access_type: str, source_table: str, igam_entitlement_list: List[str], columns_list: Optional[List[str]] = None, row_list: Optional[List[str]] = None):
    """Build the permission-config dict that the policy builders consume.

    Appends the 'public' group only for the IGAM/Sentry mapping table and
    substitutes '*' wildcards for missing column/row restrictions.
    """
    if source_table.lower() == "rar_sources_igam_sentry":
        groups = igam_entitlement_list + ["public"]
    else:
        groups = igam_entitlement_list

    effective_columns = columns_list if columns_list is not None else ["*"]
    effective_rows = row_list if row_list is not None else ["*"]

    # e.g. ["a", "b"] -> "'a','b'"
    row_condition = ",".join(f"'{item}'" for item in effective_rows)

    # normalise group names: lower-case, empty entries removed
    normalised_roles = [g.lower() for g in groups if g != ""]

    return {
        'corporate_store': corporate_store,
        'target_table': target_table,
        'access_type': access_type,
        'columns': effective_columns,
        'rows': row_condition,
        'igam_roles': normalised_roles,
    }
|
||||
|
||||
from typing import List, Optional
|
||||
# --- helpers ---------------------------------------------------------------
|
||||
|
||||
def _policy_name_from_params(config, policy_id: Optional[str] = None) -> Optional[str]:
|
||||
"""
|
||||
Build the exact policy name used by your create functions.
|
||||
Returns None for types where we need to match multiple (e.g., 2a without id).
|
||||
"""
|
||||
cs = config.corporate_store.lower()
|
||||
tbl = config.target_table.lower()
|
||||
at = config.access_type.lower()
|
||||
base = f"cpo_{cs}_{tbl}_{at}"
|
||||
|
||||
if at == "1":
|
||||
# yaml_format_1
|
||||
return base
|
||||
elif at == "2a":
|
||||
# yaml_format_2a -> requires policy_id to be exact
|
||||
if policy_id:
|
||||
return f"{base}_policy_{policy_id}"
|
||||
# without policy_id, we’ll delete all that start with this prefix
|
||||
return None
|
||||
elif at == "2b":
|
||||
# yaml_format_2b
|
||||
return f"{base}_row_level_policy"
|
||||
elif at == "3":
|
||||
# yaml_format_3 uses same name pattern as 2b in your script
|
||||
return f"{base}_row_level_policy"
|
||||
else:
|
||||
raise ValueError(f"Invalid access type '{config.access_type}'. Expected one of: 1, 2a, 2b, 3.")
|
||||
|
||||
|
||||
def _ranger_client(env_config) -> RangerClient:
    """Build an Apache Ranger client from the environment configuration dict."""
    ranger_url = env_config['RANGER_HOSTNAME']
    # basic-auth tuple (username, secret) for the Devo service account
    ranger_auth = ( env_config['DEVO_USERNAME'], env_config['DEVO_SECRET'])
    client = RangerClient(ranger_url, ranger_auth)
    # NOTE(review): TLS certificate verification is disabled — confirm intended.
    client.session.verify = False
    return client
|
||||
|
||||
|
||||
# --- main deletion API -----------------------------------------------------
|
||||
|
||||
def delete_policy(config, env_config, policy_id: Optional[str] = None) -> List[str]:
    """
    Delete Ranger policy/policies by name based on:
      - config.corporate_store
      - config.target_table
      - config.access_type: "1", "2a", "2b", "3"
      - policy_id: optional (only meaningful for '2a')

    Returns a list of deleted policy names.
    """
    ranger = _ranger_client(env_config)
    service_name = "cm_hive"

    # Try build exact name
    deleted: List[str] = []

    # If we don’t have an exact name (e.g. type 2a without policy_id),
    # delete *all* that match the expected prefix.
    cs = config.corporate_store.lower()
    tbl = config.target_table.lower()
    at = config.access_type.lower()  # NOTE(review): computed but unused below
    prefix = f"cpo_{cs}_{tbl}_"
    # Fetch all policies for the table and filter client-side to reduce calls.
    start = 0
    candidates = []
    page_size=1000
    service_name="cm_hive"  # NOTE(review): re-assigned; same value as above
    while True:
        # page through the full policy list of the Hive service
        params = {"pageSize": page_size, "startIndex": start}
        page = ranger.get_policies_in_service(service_name, params=params) or []
        candidates.extend(page)
        if len(page) < page_size:
            break
        start += len(page)
    for p in candidates:
        name = p["name"]
        print(f"analizing policy:{name}")
        # NOTE(review): prefix is not re.escape()d — regex metacharacters in
        # store/table names would change the match; confirm inputs are plain.
        if re.fullmatch(f"{prefix}([0-9]?[a-z]?)(_policy_)?([0-9]*)?(_row_level_policy)?(full_access)?$",name) != None:
            try:
                ranger.delete_policy_by_id(p["id"])
                deleted.append(name)
            except Exception:
                # continue attempting others
                pass
    if not deleted:
        raise RuntimeError(
            f"No matching policies found for deletion with prefix '{prefix}'. "
            f"Provide 'policy_id' to delete a specific 2a policy."
        )
    return deleted
|
||||
|
||||
|
||||
|
||||
def generate_policy(params, env_config, policy_id: Optional[str] = None,
                    filter_string: Optional[str] = None,
                    full_access_list: Optional[List] = None):
    """Dispatch to the policy builder matching params['access_type'].

    Backward-compatible optional pass-throughs for the row-level builders:
      - filter_string: row-filter expression used by access type 3
      - full_access_list: groups granted an unfiltered view (types 2b/3)

    Raises:
        Exception: when the access type is not one of 1, 2a, 2b, 3.
    """
    access_type = params['access_type'].lower()
    if access_type == "1":
        return yaml_format_1(params, env_config)
    elif access_type == "2a":
        return yaml_format_2a(params, env_config, policy_id)
    elif access_type == "2b":
        # BUG FIX: previously dispatched to yaml_format_1, skipping row-level rules
        return yaml_format_2b(params, env_config, full_access_list)
    elif access_type == "3":
        # BUG FIX: yaml_format_3 requires env_config, filter string and full-access list
        return yaml_format_3(params, env_config, filter_string, full_access_list)
    else:
        raise Exception(f"Invalid access type {params['access_type']}. Please check the input param")
|
||||
|
||||
def yaml_format_1(params, env_config) -> "RangerPolicy":
    """Create a column-level SELECT policy in Ranger (access type 1), best effort.

    Creation failures are logged and swallowed so callers can continue;
    the constructed RangerPolicy object is always returned.
    """
    ranger = _ranger_client(env_config)

    policy = RangerPolicy()
    policy.service = "cm_hive"  # hardcoded Hive service name
    # naming scheme: cpo_<corporatestore>_<table>_<accessType>
    policy.name = f"cpo_{params['corporate_store'].lower()}_{params['target_table'].lower()}_{params['access_type'].lower()}"
    policy.resources = {
        'database': RangerPolicyResource({'values': [params['corporate_store'].lower()]}),
        'table': RangerPolicyResource({'values': [params['target_table']]}),
        'column': RangerPolicyResource({'values': params['columns']}),
    }

    allowItem1 = RangerPolicyItem()
    allowItem1.groups = params['igam_roles']
    allowItem1.accesses = [RangerPolicyItemAccess({'type': 'select'})]
    policy.policyItems = [allowItem1]
    print(policy)
    try:
        created_policy = ranger.create_policy(policy)
        print('Created policy: name=' + created_policy.name + ', id=' + str(created_policy.id))
    except Exception as exc:
        # BUG FIX: was a bare `except: pass`, which hid every failure
        # (including KeyboardInterrupt); keep best-effort semantics but log.
        print(f"Failed to create policy '{policy.name}': {exc}")
    # Dead commented-out YAML template removed. Return the policy object
    # (the function previously fell through and returned None).
    return policy
|
||||
|
||||
def yaml_format_2a(params, env_config, policy_id: Optional[str]) -> "RangerPolicy":
    """Build the per-policy-id column policy for access type 2a.

    NOTE(review): in this legacy version the ranger.create_policy call is
    commented out — the policy is only printed, never submitted. Confirm
    that the dry-run behaviour is intended before re-enabling it.
    """
    policy_ID = policy_id if policy_id is not None else "0"

    ranger = _ranger_client(env_config)

    policy = RangerPolicy()
    policy.service = "cm_hive"  # hardcoded Hive service name
    # naming scheme: cpo_<corporatestore>_<table>_<accessType>_policy_<id>
    policy.name = f"cpo_{params['corporate_store'].lower()}_{params['target_table'].lower()}_{params['access_type'].lower()}_policy_{policy_ID}"
    policy.resources = {
        'database': RangerPolicyResource({'values': [params['corporate_store'].lower()]}),
        'table': RangerPolicyResource({'values': [params['target_table']]}),
        'column': RangerPolicyResource({'values': params['columns']}),
    }
    allowItem1 = RangerPolicyItem()
    allowItem1.groups = params['igam_roles']
    allowItem1.accesses = [RangerPolicyItemAccess({'type': 'select'})]
    policy.policyItems = [allowItem1]

    print(policy)
    print("\n\n")

    # Left disabled on purpose in this legacy module:
    #created_policy = ranger.create_policy(policy)
    #print(' created policy: name=' + created_policy.name + ', id=' + str(created_policy.id))

    # Dead commented-out YAML template removed. Return the built policy
    # (the function previously returned None despite its annotation).
    return policy
|
||||
|
||||
def yaml_format_2b(params, env_config, full_access_list: Optional[List]) -> "RangerPolicy":
    """Create a row-level-filter policy (access type 2b).

    Item 1 filters rows for the IGAM groups via the sentry/AD mapping tables;
    item 2 grants an unfiltered (1=1) view to the full-access groups.
    Returns the RangerPolicy object that was submitted.
    """
    ranger = _ranger_client(env_config)

    policy = RangerPolicy()
    policy.service = "cm_hive"  # hardcoded Hive service name
    policy.name = f"cpo_{params['corporate_store'].lower()}_{params['target_table'].lower()}_{params['access_type'].lower()}_row_level_policy"
    policy.isEnabled = True
    policy.resources = {
        'database': RangerPolicyResource({'values': [params['corporate_store'].lower()]}),
        'table': RangerPolicyResource({'values': [params['target_table']]}),
    }

    rowFilterAllowItem1 = RangerRowFilterPolicyItem()
    rowFilterAllowItem1.groups = params['igam_roles']
    rowFilterAllowItem1.accesses = [RangerPolicyItemAccess({'type': 'select'})]
    rowFilterAllowItem1.rowFilterInfo = RangerPolicyItemRowFilterInfo({ 'filterExpr': f"lower(source) IN (select lower(rar_subsource_id) from {params['corporate_store'].lower()}.t_ref_rar_sources_igam_sentry where lower(rar_igam_entitlement) IN (select ad_group from {params['corporate_store'].lower()}.active_directory_user_groups where username = lower(regexp_extract(current_user(),'[^@]*',0))))" })

    rowFilterAllowItem2 = RangerRowFilterPolicyItem()
    # BUG FIX: full_access_list is Optional but was iterated unconditionally;
    # None now yields an empty group list instead of a TypeError.
    rowFilterAllowItem2.groups = [x.lower() for x in (full_access_list or [])]
    rowFilterAllowItem2.accesses = [RangerPolicyItemAccess({'type': 'select'})]
    rowFilterAllowItem2.rowFilterInfo = RangerPolicyItemRowFilterInfo({'filterExpr': "1=1"})

    policy.rowFilterPolicyItems = [rowFilterAllowItem1, rowFilterAllowItem2]
    print(policy)

    created_policy = ranger.create_policy(policy)
    print(' created policy: name=' + created_policy.name + ', id=' + str(created_policy.id))
    # Dead commented-out YAML template removed. Return the policy object
    # (the function previously returned None).
    return policy
|
||||
|
||||
def yaml_format_3(params, env_config, filterString, full_access_list: Optional[List]) -> "RangerPolicy":
    """Create a row-level-filter policy with a caller-supplied filter (access type 3).

    Item 1 applies *filterString* to the IGAM groups; item 2 grants an
    unfiltered (1=1) view to the full-access groups.
    Returns the created policy object from Ranger.
    """
    ranger = _ranger_client(env_config)

    policy = RangerPolicy()
    policy.service = "cm_hive"  # hardcoded Hive service name
    policy.name = (
        f"cpo_{params['corporate_store'].lower()}_"
        f"{params['target_table'].lower()}_"
        f"{params['access_type'].lower()}_row_level_policy"
    )
    policy.isEnabled = True
    policy.resources = {
        "database": RangerPolicyResource({"values": [params["corporate_store"].lower()]}),
        "table": RangerPolicyResource({"values": [params["target_table"]]}),
    }

    # Row filter item: caller-supplied expression for the IGAM groups
    rowFilterAllowItem = RangerRowFilterPolicyItem()
    rowFilterAllowItem.groups = params["igam_roles"]
    rowFilterAllowItem.accesses = [RangerPolicyItemAccess({"type": "select"})]
    rowFilterAllowItem.rowFilterInfo = RangerPolicyItemRowFilterInfo(
        {
            "filterExpr": filterString
        }
    )

    rowFilterAllowItem2 = RangerRowFilterPolicyItem()
    # BUG FIX: full_access_list is Optional but was iterated unconditionally;
    # None now yields an empty group list instead of a TypeError.
    rowFilterAllowItem2.groups = [x.lower() for x in (full_access_list or [])]
    rowFilterAllowItem2.accesses = [RangerPolicyItemAccess({'type': 'select'})]
    rowFilterAllowItem2.rowFilterInfo = RangerPolicyItemRowFilterInfo({'filterExpr': "1=1"})

    policy.rowFilterPolicyItems = [rowFilterAllowItem, rowFilterAllowItem2]
    print(policy)

    # Create policy in Ranger
    created_policy = ranger.create_policy(policy)
    print(f" created policy: name={created_policy.name}, id={created_policy.id}")

    # Trailing dead string literal (commented-out YAML template) removed.
    return created_policy
|
||||
@@ -0,0 +1,793 @@
|
||||
import pandasql as ps
|
||||
import pandas as pd
|
||||
import mrds.utils.manage_files as fileManager
|
||||
import logging
|
||||
import tableBuilderQueries as tbq
|
||||
from devo_query import execute_query
|
||||
import ranger_updater_old as ranger
|
||||
import os
|
||||
import yaml
|
||||
import FlowOptions as fo
|
||||
import numpy as np
|
||||
from mrds.utils.secrets import get_secret
|
||||
import traceback
|
||||
from mrds.utils import oraconn
|
||||
|
||||
# Set up basic configuration for logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
# Create a logger object
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
|
||||
#0 utilities
|
||||
def initialize_config(config_file_path):
    """Load a YAML configuration file and return its parsed contents.

    Raises:
        FileNotFoundError: when the path does not exist.
    """
    if not os.path.exists(config_file_path):
        raise FileNotFoundError(f"Configuration file {config_file_path} not found.")

    with open(config_file_path, "r") as handle:
        return yaml.safe_load(handle)
|
||||
|
||||
def fix_impala_sql(sql: str) -> str:
    """Backtick-quote reserved or non-identifier column names in a CREATE TABLE.

    Only the column list between the first '(' and the 'ROW FORMAT SERDE'
    clause is rewritten; the remainder of the statement passes through.

    Raises:
        ValueError: if the statement lacks '(' or the 'ROW FORMAT SERDE' clause.
    """
    # List of reserved keywords in Impala that need backticks if used as column names
    impala_reserved_keywords = {
        'date', 'value', 'source', 'comment', 'partition', 'row', 'select', 'insert',
        'table', 'external', 'format', 'location', 'stored', 'inputformat', 'outputformat',
        'scenario', 'string', 'int', 'decimal', 'timestamp', 'float', 'double','procedure', 'floor'
    }

    # Regex pattern to find column definitions:
    #   <col> <TYPE[(args)]> [comment '...']   (case-insensitive)
    pattern = re.compile(
        r'(?P<col>`?\w+`?)\s+(?P<type>[A-Za-z]+\s*(?:\([^)]+\))?)\s*(?P<comment>comment\s*\'[^\']*\'|)?',
        re.IGNORECASE
    )

    def replace(match):
        # Rewrite one "<col> <type> [comment]" match, quoting the column when needed.
        col = match.group('col').strip('`')
        dtype = match.group('type')
        comment = match.group('comment') or ''
        # Add backticks only if column name is a reserved keyword or contains special chars
        if col.lower() in impala_reserved_keywords or not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', col):
            col = f'`{col}`'
        return f"{col} {dtype} {comment}".strip()

    # Only replace column list part between parentheses
    table_def_start = sql.find('(')
    table_def_end = sql.find('ROW FORMAT SERDE', table_def_start)
    if table_def_start == -1 or table_def_end == -1:
        raise ValueError("Invalid SQL format: Missing column definition parentheses.")

    before = sql[:table_def_start + 1]
    columns = sql[table_def_start + 1:table_def_end]
    after = sql[table_def_end:]

    # Replace all columns inside definition
    fixed_columns = pattern.sub(replace, columns)

    # Combine and return.
    final= before + fixed_columns + after
    # Strip escaped quotes and collapse doubled backslashes introduced upstream
    # (see format_column_definition, which escapes quotes in comments).
    final=final.replace("\\'", "").replace('\\\\', '\\')
    return final
|
||||
|
||||
|
||||
def applyQueryParameters(query: str, parameters: str) -> str:
    """
    Replaces placeholders in the query with values from parameters.

    Parameters:
    - query: Original query string with placeholders like $$$1, $$$2, etc.
    - parameters: Semicolon-separated string of parameter values.

    Returns:
    - String with the query filled with parameter values.
    """
    if not parameters:
        return query

    values = parameters.split(';')
    result = query
    # Substitute highest-numbered placeholders first so $$$10 is not
    # clobbered by a partial $$$1 replacement.
    for position in range(len(values), 0, -1):
        result = result.replace(f"$$${position}", values[position - 1])
    return result
|
||||
|
||||
def format_column_definition(row):
    """Render one metadata row as a Hive/Impala column DDL fragment."""
    name_and_type = f"{row['column_name']} {row['data_type_string']}"
    if pd.isnull(row['data_description']):
        # no description -> just "<name> <type>"
        return name_and_type
    # escape single quotes so the description survives inside a quoted comment
    description = str(row['data_description']).replace("'", "\\'")
    return f"{name_and_type} comment '{description}'"
|
||||
#1 receive table name and check for target table and access type
|
||||
|
||||
def execute_oracle_query(sql):
    """Run *sql* on the MRDS_LOADER_MOPDB Oracle connection, return a DataFrame.

    Column names are lower-cased from the cursor description.
    BUG FIX: connection and cursor are now closed even when the query raises
    (previously an exception leaked both handles).
    """
    oracle_conn = oraconn.connect('MRDS_LOADER_MOPDB')
    try:
        cursor = oracle_conn.cursor()
        try:
            options = cursor.execute(sql).fetchall()
            oracle_conn.commit()
            # capture dtypes/column names before the cursor is closed
            columns = [row[0].lower() for row in cursor.description]
        finally:
            cursor.close()
    finally:
        oracle_conn.close()
    return pd.DataFrame(options, columns=columns)
|
||||
|
||||
|
||||
def get_target_table(oracle_mgmt_table, source_schema, source_table, env):
    """Look up the distinct TABLE_ALIAS for a source schema/table pair."""
    # NOTE(review): values are interpolated directly into the SQL text —
    # confirm inputs are trusted (internal metadata, not user input).
    query = (
        f"SELECT DISTINCT TABLE_ALIAS FROM {oracle_mgmt_table} "
        f"WHERE OWNER = '{source_schema}' AND TABLE_NAME = '{source_table}'"
    )
    return execute_oracle_query(query)
|
||||
|
||||
def get_type_ofAccess(oracle_metadata_table, source_schema, source_table, env):
    """Fetch the distinct, currently-valid RAR3 access types for a source table."""
    # BUG FIX: there was no space between '{source_schema}' and AND in the SQL.
    sql = (
        f"SELECT DISTINCT RAR3_TYPE_OF_ACCESS FROM {oracle_metadata_table} "
        f"WHERE A_VALID_TO > SYSDATE AND OWNER = '{source_schema}' "
        f"AND TABLE_NAME = '{source_table}'"
    )
    return execute_oracle_query(sql)
|
||||
|
||||
#2 load metadata
|
||||
def readIGAMRoles(config, env):
    """Load the IGAM role mapping DataFrame for the configured Sentry environment."""
    role_query = tbq.get_query_igam_roles(config.oracle_igam_table, config.service_name)
    logger.info("Querying the IGAM Table")

    # single bound parameter: the quoted sentry role environment
    bound_query = applyQueryParameters(role_query, "'" + config.sentry_role_environment + "'")
    logger.info("Replaced params to IGAM Table:")

    return execute_oracle_query(bound_query)
|
||||
|
||||
def loadMetadataTable(config, env):
    """Build the comma-separated Hive column definition list for the source table."""
    metadataQuery = tbq.get_query_metadata(config.oracle_metadata_table, config.source_schema, config.source_table)

    logger.info("Map Oracle metadata (data types) to Hive query: ")

    # BUG FIX: was `jdbcMetaDataDF = df = execute_oracle_query(...)`,
    # leaking a stray `df` alias.
    jdbcMetaDataDF = execute_oracle_query(metadataQuery)

    logger.info("Fetch all fields for table and concatenate them separated by ','")
    tableDataList = jdbcMetaDataDF.apply(format_column_definition, axis=1).tolist()
    return ",".join(tableDataList)
|
||||
|
||||
|
||||
#3 drop table and policies
|
||||
def deleteExternalTable(config, env_config):
    """Drop the target table and, best-effort, its Ranger policies."""
    try:
        ranger.delete_policy(config, env_config)
    except Exception as e:
        # BUG FIX: the failure was silently discarded; keep the best-effort
        # semantics (table drop still proceeds) but record what went wrong.
        logger.warning("Could not delete Ranger policies: %s", e)
    sql_drop = f"DROP TABLE IF EXISTS {config.corporate_store}.{config.target_table}"
    execute_query(
        sql_drop,
        env_config['DEVO_USERNAME'], env_config['IMPALA_HOSTNAME'], env_config['DEVO_SECRET'],
    )
|
||||
|
||||
#4 create external table and policies
|
||||
def createExternalTables(config, tableFields, env_config):
    """Create the Parquet-backed external table in Hive for the target table."""
    ddl = (
        f"CREATE EXTERNAL TABLE {config.corporate_store}.{config.target_table} "
        f"({tableFields}, {config.tech_meta_data_fields}) "
        "ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' "
        "STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' "
        "OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' "
        f"LOCATION '{config.target_s3_bucket}/{config.target_table}' "
        "TBLPROPERTIES ("
        "'external.table.purge'='true', "
        "'parquet.compression'='snappy')"
    )
    # quote reserved column names before submitting the DDL
    ddl = fix_impala_sql(ddl)
    execute_query(ddl, env_config['DEVO_USERNAME'], env_config['HIVE_HOSTNAME'], env_config['DEVO_SECRET'])
|
||||
|
||||
def createTableFromExternal(config, tableFields, env_config):
    """Materialise the target table as a CTAS from its _EXT staging table."""
    # NOTE(review): tableFields is accepted but unused here — confirm whether
    # it should shape the CTAS column list.
    ctas = (
        f"CREATE EXTERNAL TABLE {config.corporate_store}.{config.target_table} AS "
        f"SELECT * FROM {config.corporate_store}.{config.target_table}_EXT"
    )
    execute_query(ctas, env_config['DEVO_USERNAME'], env_config['HIVE_HOSTNAME'], env_config['DEVO_SECRET'])
|
||||
|
||||
def accessTypeMapper(config, env_config, igamRoleDF):
    """Route to the grant routine for the configured access type."""
    handlers = {
        '1': accessType_1,
        '2a': accessType_2A,
        '2b': accessType_2B,
        '3': accessType_3,
    }
    handler = handlers.get(config.access_type.lower())
    if handler is None:
        # unknown type is logged, not raised, matching the existing behaviour
        logger.info(f"Invalid access type {config.access_type}. Please check the input param")
        return
    handler(config, env_config, igamRoleDF)
|
||||
|
||||
def accessType_1(config, env_config, igamRoleDF):
    """Generate the Ranger policy for access type 1 (table-level grants).

    Steps:
      1. Fetch the type-1 metadata rows for the source table from Oracle.
      2. Branch A: rows whose source differs from the service name are
         inner-joined to the IGAM roles on source == datasource.
      3. Branch B: rows whose source equals the service name receive every
         IGAM role (cross join).
      4. The distinct union of both branches yields the entitlements for a
         single table-level policy (plus any configured full-access list).

    Returns the formatted YAML produced by ``ranger.generate_policy``.
    """
    logger.info("Grant privileges for access type 1")
    logger.info("Fetch metadata from Oracle for access type 1")

    # ---- Construct query and fetch from Oracle ----
    queryParams = f"'{config.source_schema}.{config.source_table}'"
    queryMetadataAccessType1 = tbq.get_query_metadata_access_type1(config.oracle_metadata_table)
    queryWithParamsAccessType1 = applyQueryParameters(queryMetadataAccessType1, queryParams)

    # Fix: the original logged an empty message here; include the query text.
    logger.info("Metadata table query: %s", queryWithParamsAccessType1)
    # Fix: dropped the redundant chained assignment (`... = df = ...`).
    jdbcMetaDataAccessType1DF = execute_oracle_query(queryWithParamsAccessType1)

    # ---- Normalize columns ----
    df = jdbcMetaDataAccessType1DF.copy()
    df["rar3_type_of_access"] = df["rar3_type_of_access"].astype(str).str.strip()
    df["source"] = df["source"].astype(str).str.strip().str.upper()
    # Fix: normalize a copy so the caller's frame is not mutated in place.
    roles = igamRoleDF.copy()
    roles["datasource"] = roles["datasource"].astype(str).str.strip().str.upper()

    # ---- Branch A: source != service name ----
    left_a = (
        df.loc[
            (df["rar3_type_of_access"] == "1") & (df["source"] != config.service_name),
            ["table_name", "source"]
        ]
        .drop_duplicates()
    )
    branch_a = (
        left_a.merge(
            roles,
            left_on="source",
            right_on="datasource",
            how="inner"
        )
        [["table_name", "source", "subsource_id", "igam_entitlement", "environment"]]
        .drop_duplicates()
    )

    # ---- Branch B: source == service name (CROSS JOIN with IGAM roles) ----
    left_b = (
        df.loc[
            (df["rar3_type_of_access"] == "1") & (df["source"] == config.service_name),
            ["table_name", "source"]
        ]
        .drop_duplicates()
    )
    if not left_b.empty:
        branch_b = (
            left_b.merge(roles, how="cross")
            [["table_name", "source", "subsource_id", "igam_entitlement", "environment"]]
            .drop_duplicates()
        )
    else:
        # Keep the schema stable so the concat below always works.
        branch_b = pd.DataFrame(columns=["table_name", "source", "subsource_id", "igam_entitlement", "environment"])

    # ---- UNION (distinct) ----
    typeOneDF = (
        pd.concat([branch_a, branch_b], ignore_index=True)
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # ---- Collect IGAM entitlements ----
    igam_entitlements = (
        typeOneDF["igam_entitlement"]
        .dropna()
        .astype(str)
        .str.strip()
        .tolist()
    )

    # ---- Merge with the optional full-access list ----
    if config.full_access_entitlement_list is None:
        combined_entitlements = igam_entitlements
    else:
        combined_entitlements = igam_entitlements + config.full_access_entitlement_list

    # ---- Add table permission groups using the YAML formatter ----
    params = ranger.add_table_permission_groups(
        config.corporate_store,
        config.target_table,
        config.access_type,
        config.source_table,
        combined_entitlements
    )

    # Generate the final YAML policy; type 1 has no per-policy id.
    formattedYaml = ranger.generate_policy(params, env_config, None)
    logger.info("Final YAML format")
    return formattedYaml
||||
def accessType_2A(config, env_config, igamRoleDF):
    """Generate Ranger policies for access type 2a (column-level grants).

    Columns are grouped per entitlement, entitlements that share an identical
    column list are merged into one numbered policy, and an optional
    full-access policy (all columns) is appended when configured.
    """
    logger.info("Grant privileges for access type 2a")
    logger.info("Fetch the metadata in Oracle for access type 2a")

    # ---- Construct query and fetch from Oracle ----
    queryParams = f"'{config.source_schema}.{config.source_table}'"
    queryMetadataAccessType2a = tbq.get_query_metadata_access_type2a(config.oracle_metadata_table)
    queryWithParamsAccessType2a = applyQueryParameters(queryMetadataAccessType2a, queryParams)

    logger.info(f"Meta data table query: {queryWithParamsAccessType2a} ")
    jdbcMetaDataAccessType2aDF = execute_oracle_query(queryWithParamsAccessType2a)

    # ---- Normalize columns ----
    df = jdbcMetaDataAccessType2aDF.copy()
    df["rar3_type_of_access"] = df["rar3_type_of_access"].astype(str).str.strip().str.lower()
    df["source"] = df["source"].astype(str).str.strip().str.upper()
    # Fix: replaced a stray debug `print(df)` with a proper debug-level log.
    logger.debug("Normalized 2a metadata:\n%s", df)

    roles = igamRoleDF.copy()
    # expected columns in igamRoleDF: subsource_id, igam_entitlement, environment
    roles["subsource_id"] = roles["subsource_id"].astype(str).str.strip().str.upper()
    roles["igam_entitlement"] = roles["igam_entitlement"].astype(str).str.strip()

    # ---- Branch A: source != service_name -> INNER JOIN on source == subsource_id ----
    left_a = df.loc[
        (df["rar3_type_of_access"] == "2a")
        & (df["source"] != config.service_name.upper()),
        ["table_name", "column_name", "source"]
    ]
    branch_a = (
        left_a.merge(
            roles,
            left_on="source",
            right_on="subsource_id",
            how="inner"
        )
        .drop(columns=["subsource_id", "source"], errors="ignore")
        [["table_name", "column_name", "igam_entitlement", "environment"]]
    )

    # ---- Branch B: source == service_name -> CROSS JOIN with igamRoleDF ----
    left_b = df.loc[
        (df["rar3_type_of_access"] == "2a")
        & (df["source"] == config.service_name.upper()),
        ["table_name", "column_name", "source"]
    ]
    if not left_b.empty:
        try:
            branch_b = (
                left_b.merge(roles, how="cross")
                .drop(columns=["subsource_id", "source"], errors="ignore")
                [["table_name", "column_name", "igam_entitlement", "environment"]]
            )
        except TypeError:
            # pandas < 1.2 has no how="cross"; emulate with a constant key.
            left_b["_cj"] = 1
            roles["_cj"] = 1
            branch_b = (
                left_b.merge(roles, on="_cj")
                .drop(columns=["_cj", "subsource_id", "source"], errors="ignore")
                [["table_name", "column_name", "igam_entitlement", "environment"]]
            )
            # Cleanup in case `roles` is reused later.
            roles.drop(columns=["_cj"], inplace=True, errors="ignore")
    else:
        branch_b = pd.DataFrame(columns=["table_name", "column_name", "igam_entitlement", "environment"])

    # ---- UNION ----
    one_df = (
        pd.concat([branch_a, branch_b], ignore_index=True)
        .reset_index(drop=True)
    )

    # ---- Group 1: (table_name, igam_entitlement) -> sorted, comma-joined column_list ----
    tmp = one_df.sort_values(["table_name", "igam_entitlement", "column_name"], kind="mergesort")
    new_df = (
        tmp.groupby(["table_name", "igam_entitlement"], as_index=False)["column_name"]
        .apply(lambda s: ",".join(s.dropna().astype(str).tolist()))
        .rename(columns={"column_name": "column_list"})
    )
    # Columns: table_name, igam_entitlement, column_list

    # ---- Group 2: (table_name, column_list) -> comma-joined igam_entitlement ----
    grouped = (
        new_df.groupby(["table_name", "column_list"], as_index=False)["igam_entitlement"]
        .apply(lambda s: ",".join(s.dropna().astype(str).tolist()))
    )
    # Columns: table_name, column_list, igam_entitlement

    # ---- ROW_NUMBER() OVER (ORDER BY column_list) -> policy_id ----
    grouped = grouped.sort_values(["column_list"], kind="mergesort")
    grouped["policy_id"] = np.arange(1, len(grouped) + 1).astype(int)

    # ---- Emit policies: one per (table_name, column_list) row ----
    for _, row in grouped.iterrows():
        entitlements_list = [e.strip() for e in str(row["igam_entitlement"]).split(",") if e.strip()]
        columns_list = [c.strip() for c in str(row["column_list"]).split(",") if c.strip()]
        policy_id = str(int(row["policy_id"]))

        params = ranger.add_table_permission_groups(
            config.corporate_store,
            config.target_table,
            config.access_type,  # "2a"
            config.source_table,
            entitlements_list,
            columns_list=columns_list
        )
        ranger.generate_policy(params, env_config, policy_id)

    # ---- Optional: append a full-access policy if configured ----
    if getattr(config, "full_access_entitlement_list", None):
        # Accept either a ready-made list or a comma-separated string.
        if isinstance(config.full_access_entitlement_list, list):
            full_access_list = config.full_access_entitlement_list
        else:
            full_access_list = [s.strip() for s in str(config.full_access_entitlement_list).split(",") if s.strip()]

        params_full = ranger.add_table_permission_groups(
            config.corporate_store,
            config.target_table,
            config.access_type,  # keep same access type per your pattern
            config.source_table,
            full_access_list
        )
        ranger.generate_policy(params_full, env_config, "full_access")
||||
def accessType_2B(config, env_config, igamRoleDF):
    """Generate Ranger policies for access type 2b (row-level grants).

    All IGAM entitlements receive a row-level policy (emitted via
    ``ranger.yaml_format_2b``); when a full-access entitlement list is
    configured an additional table-level policy is generated for the
    combined entitlements.
    """
    logger.info(f"Grant privileges for access type {config.access_type}")
    logger.info("Fetch the metadata in Oracle for access type 2b")

    # --- Validate required columns ---
    required = {"environment", "igam_entitlement", "subsource_id"}
    missing = required - set(igamRoleDF.columns)
    if missing:
        raise KeyError(f"igamRoleDF missing required column(s): {sorted(missing)}")

    # --- Normalize to strings on a copy (robust against None/NaN) ---
    igamRoleDF = igamRoleDF.copy()
    igamRoleDF["environment"] = igamRoleDF["environment"].astype(str).str.strip()
    igamRoleDF["igam_entitlement"] = igamRoleDF["igam_entitlement"].astype(str).str.strip()
    igamRoleDF["subsource_id"] = igamRoleDF["subsource_id"].astype(str).str.strip()

    # --- Aggregation: per (environment, igam_entitlement) collect unique subsource_id list ---
    # Keep a stable order by sorting; remove empties.
    agg_df = (
        igamRoleDF.loc[igamRoleDF["subsource_id"].ne(""), ["environment", "igam_entitlement", "subsource_id"]]
        .drop_duplicates()
        .sort_values(["environment", "igam_entitlement", "subsource_id"], kind="mergesort")
        .groupby(["environment", "igam_entitlement"], as_index=False)["subsource_id"]
        .agg(lambda s: ",".join(s.unique()))
        .rename(columns={"subsource_id": "subsource_id_list"})
    )

    # (IGAM_ENTITLEMENT, subsource_id_list) pairs — inspection/log payload only.
    accessType2bValidList = list(zip(
        agg_df["igam_entitlement"].astype(str),
        agg_df["subsource_id_list"].astype(str)
    ))
    logger.debug("2b entitlement/subsource pairs: %s", accessType2bValidList)

    # --- Entitlements for policy generation (unique, non-empty) ---
    igam_entitlements = (
        igamRoleDF["igam_entitlement"]
        .dropna()
        .map(str)
        .str.strip()
        .loc[lambda s: s.ne("")]
        .drop_duplicates()
        .tolist()
    )

    # --- Row-level permissions ---
    params_row_level = ranger.add_table_permission_groups(
        config.corporate_store,
        config.target_table,
        config.access_type,
        config.source_table,
        igam_entitlements
    )

    # --- Table-level policy, only when full-access entitlements are configured ---
    # (The two duplicate `getattr` checks from the original are merged here.)
    if getattr(config, "full_access_entitlement_list", None):
        combined_entitlements = igam_entitlements + config.full_access_entitlement_list
        params = ranger.add_table_permission_groups(
            config.corporate_store,
            config.target_table,
            config.access_type,
            config.source_table,
            combined_entitlements
        )
        # Fix: pass the policy id explicitly — every other call site supplies
        # a third argument (None / policy_id / "full_access").
        ranger.generate_policy(params, env_config, None)

    ranger.yaml_format_2b(params_row_level, env_config, config.full_access_entitlement_list)  # row-level policy
    logger.info("Final YAML format emitted for 2B.")
||||
def accessType_3(config,env_config, igamRoleDF):
    """
    Generate Ranger policies for access type 3 (row-filtered access).

    Python/pandas translation of the Scala accessType_3.
    Expects igamRoleDF to have at least: ['igam_entitlement', 'subsource_id'].
    The `config` object should expose the attributes used below (names match your Scala/Python usage).
    Uses a YAML formatter module `ranger` with:
    - add_table_permission_groups(corporate_store, target_table, access_type, source_table, entitlements)
    - yaml_format_3(params)
    - yaml_format_1(params)
    """

    # --- 1) Filter entitlements where subsource_id = 'TMS' ---
    # NOTE(review): 'TMS' is hard-coded — confirm type-3 row filtering really
    # applies to TMS only, or lift the value into configuration.
    if not {"igam_entitlement", "subsource_id"}.issubset(igamRoleDF.columns):
        missing = {"igam_entitlement", "subsource_id"} - set(igamRoleDF.columns)
        raise KeyError(f"igamRoleDF missing required column(s): {sorted(missing)}")

    new_df = (
        igamRoleDF.loc[
            igamRoleDF["subsource_id"].astype(str).str.upper() == "TMS",
            ["igam_entitlement"]
        ].drop_duplicates()
    )

    accessType3ValidList = new_df["igam_entitlement"].astype(str).str.strip().tolist()

    # --- 2) Build params for row-level groups (type 3) ---
    params_row_level = ranger.add_table_permission_groups(
        config.corporate_store,
        config.target_table,
        config.access_type,
        config.source_table,
        accessType3ValidList
    )

    corp = str(config.corporate_store).lower()
    src_tbl = str(config.source_table).lower()

    # --- 3) Compose the filter expressions (match Scala strings) ---
    # Portfolio tree: both parent_fk and child_fk must be portfolios visible
    # to the current user via nh_portfolio_access.
    sqlCreateView3NonRestrString_Ptree = (
        "(parent_fk in ( "
        f"select portfolio_fk from {corp}.nh_portfolio_access "
        "where lower(user_id) LIKE concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%') "
        "AND to_date(a_valid_to) > current_timestamp() "
        ")) AND (child_fk in ( "
        f"select portfolio_fk from {corp}.nh_portfolio_access "
        "where lower(user_id) LIKE concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%') "
        "AND to_date(a_valid_to) > current_timestamp() "
        "))"
    )

    # Positions: visible through either portfolio_fk or portfolio_compare_fk.
    sqlCreateView3NonRestrString_Pos = (
        "position_key in ( "
        f"select position_key from {corp}.nh_portfolio_access a "
        f"inner join {corp}.nh_position b on ( "
        "(b.portfolio_fk = a.portfolio_fk and b.portfolio_fk is not NULL) or "
        "(b.portfolio_compare_fk = a.portfolio_fk and b.portfolio_compare_fk is not NULL) "
        ") "
        "where lower(user_id) LIKE concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%') "
        "AND to_date(a_valid_to) > current_timestamp() "
        ")"
    )

    # Access tables themselves: each user sees only their own rows.
    sqlCreateView3PortAccess = "lower(user_id) LIKE concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%')"
    sqlCreateView3LimAccess = "lower(user_id) LIKE concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%')"

    # Standard case uses the configured key columns/table names
    key_col = getattr(config, "type3SourceTableKeyColumn", None)
    acc_col = getattr(config, "type3AccessTableKeyColumn", None)
    acc_table= getattr(config, "type3AccessTable", None)
    if not all([key_col, acc_col, acc_table]):
        # Only needed for the default branch; keep None if your config doesn't use the default
        key_col = key_col or "source_key_col"
        acc_col = acc_col or "access_key_col"
        acc_table = acc_table or "type3_access_table"

    sqlCreateView3NonRestrString_Stdrd = (
        f"{key_col} in (select {acc_col} from {corp}.{acc_table} "
        "where lower(user_id) LIKE concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%') "
        "AND to_date(a_valid_to) > current_timestamp())"
    )

    # --- 4) Choose the filter by source table (matches Scala match/case) ---
    if src_tbl == "nh_portfoliotree":
        sqlCreateViewType3Filter = sqlCreateView3NonRestrString_Ptree
    elif src_tbl == "nh_position":
        sqlCreateViewType3Filter = sqlCreateView3NonRestrString_Pos
    elif src_tbl == "nh_portfolio_access":
        sqlCreateViewType3Filter = sqlCreateView3PortAccess
    elif src_tbl == "nh_limit_access":
        sqlCreateViewType3Filter = sqlCreateView3LimAccess
    else:
        sqlCreateViewType3Filter = sqlCreateView3NonRestrString_Stdrd

    # --- 5) Row filter YAML block (uses groups from params_row_level) ---
    # Expecting params_row_level like {'igam_roles': '...'}; adjust key if your API differs.
    # NOTE(review): rowFilter is built but never used afterwards —
    # yaml_format_3 receives sqlCreateViewType3Filter directly. Candidate
    # for removal.
    igam_roles_lower = str(params_row_level.get("igam_roles", "")).lower()
    rowFilter = (
        "- groups:\n"
        f"    {igam_roles_lower}\n"
        "  accesses:\n"
        "    - select\n"
        f"  filterExpr: \"{sqlCreateViewType3Filter}\"\n"
        "    "
    )

    # --- 6) Handle optional full access entitlements ---
    if config.full_access_entitlement_list:
        paramsFullAccess = ranger.add_table_permission_groups(
            config.corporate_store,
            config.target_table,
            config.access_type,
            config.source_table,
            config.full_access_entitlement_list
        )

        full_groups_lower = str(paramsFullAccess.get("igam_roles", "")).lower()
        # NOTE(review): disabled draft of a "1=1" full-access row filter,
        # kept here as an inert string literal.
        """
        fullAccessFilter = (
            "- groups:\n"
            f"    {full_groups_lower}\n"
            "  accesses:\n"
            "    - select\n"
            "  filterExpr: \"1=1\"\n"
            "    "
        )
        """
        params_table_level = ranger.add_table_permission_groups(
            config.corporate_store,
            config.target_table,
            config.access_type,
            config.source_table,
            accessType3ValidList + config.full_access_entitlement_list
        )
    else:
        # NOTE(review): fullAccessFilter is assigned but never used.
        fullAccessFilter = ""
        params_table_level = ranger.add_table_permission_groups(
            config.corporate_store,
            config.target_table,
            config.access_type,
            config.source_table,
            accessType3ValidList
        )

    # --- 7) Render YAML and merge like Scala ---
    ranger.yaml_format_3(params_row_level,env_config,sqlCreateViewType3Filter,config.full_access_entitlement_list )  # base type 3 yaml
    ranger.yaml_format_1(params_table_level,env_config)  # table-level yaml
|
||||
#5 create extra policies for super-users
|
||||
#6 refresh metadata
|
||||
|
||||
def run_process(env_file, env, service_name, source_schema, source_table, sentry_role_environment):
    """End-to-end replication run for one source table.

    Loads environment/service configuration, resolves the target table and
    access type from the Oracle management/metadata tables, (re)creates the
    table in Hive and generates the matching Ranger policies.

    Raises on any configuration/lookup failure; errors are logged first.
    """
    #1 receive table name and check for target table and access type
    env_dict = initialize_config(env_file)
    env_config = env_dict[env]
    # RQSD uses a dedicated technical user/secret pair.
    if service_name.lower() == 'rqsd':
        env_config["DEVO_SECRET"] = env_config["DEVO_SECRET_RQSD"]
        env_config["DEVO_USERNAME"] = env_config["DEVO_USERNAME_RQSD"]
    try:
        devo_secret_name = env_config["DEVO_SECRET"]
        env_config["DEVO_SECRET"] = get_secret(devo_secret_name)
    except Exception:
        # Fix: narrowed the bare `except:` and re-raise the original error
        # with its traceback instead of `raise(Exception)`, which discarded
        # the real cause.
        logger.exception("Failed to retrieve credentials from secrets")
        raise
    db_config = env_dict[service_name]
    try:
        target_table = get_target_table(db_config['oracle_mgmt_table'], source_schema, source_table, env)['table_alias'][0]
    except Exception as e:
        logger.error("Table not found in oracle management table")
        logger.error("Exception: %s", e)
        logger.error("Traceback:\n%s", traceback.format_exc())
        raise
    try:
        access_type = get_type_ofAccess(db_config['oracle_metadata_table'], source_schema, source_table, env)['rar3_type_of_access'][0].strip()
    except Exception as e:
        logger.error("Table not found in oracle metadata inventory")
        logger.error("Exception: %s", e)
        logger.error("Traceback:\n%s", traceback.format_exc())
        raise
    args = {
        'corporate_store': db_config['corporate_store'],
        'service_name': service_name,
        'source_schema': source_schema,
        'source_table': source_table,
        'oracle_metadata_table': db_config['oracle_metadata_table'],
        'oracle_igam_table': db_config['oracle_igam_table'],
        'oracle_mgmt_table': db_config['oracle_mgmt_table'],
        'target_table': target_table,
        'sentry_role_environment': sentry_role_environment,
        'target_s3_bucket': env_config["BUCKET_PREFIX"] + db_config['target_s3_bucket'],
        'tech_meta_data_fields': db_config['tech_meta_data_fields'],
        'full_access_entitlement_list': env_config[f"FULL_ACCESS_LIST_{service_name.upper()}"].split(','),
        'access_type': access_type
    }
    config = fo.Options(args)
    #2 load metadata
    tableFields = loadMetadataTable(config, env)
    igamRoles = readIGAMRoles(config, env)
    #3 drop table and policies
    deleteExternalTable(config, env_config)
    #4 create external table and policies
    if config.target_table[-4:].upper() == '_EXT':
        createExternalTables(config, tableFields, env_config)
    else:
        createTableFromExternal(config, tableFields, env_config)

    accessTypeMapper(config, env_config, igamRoles)
    #5 refresh metadata
    #execute_query(f"INVALIDATE METADATA {config.corporate_store}.{config.target_table}",env_config["DEVO_USERNAME"],env_config['IMPALA_HOSTNAME'],env_config['DEVO_SECRET'])
    #execute_query(f"COMPUTE STATS {config.corporate_store}.{config.target_table}",env_config["DEVO_USERNAME"],env_config['IMPALA_HOSTNAME'],env_config['DEVO_SECRET'])
|
||||
|
||||
# ---- Script entry (development/ad-hoc runs) ----
# NOTE(review): the call below executes at import time; consider wrapping it
# in `if __name__ == "__main__":` so the module can be imported without
# triggering a full replication run.
#run_process("/home/dbt/Marco/mrds_elt/python/devo_replicator/env_config.yaml",'tst','mopdb','MPEC','T_MPEC','TEST/INTEGRATION')
#run_process("/home/dbt/Marco/mrds_elt/python/devo_replicator/config/env_config.yaml",'tst','rar','CORR_RAR','NH_ASSET','TEST/INTEGRATION')
#run_process("/home/dbt/Marco/mrds_elt/python/devo_replicator/config/env_config.yaml",'dev','rar','CORR_RAR','NH_LIMIT','TEST/INTEGRATION')

run_process("/home/dbt/Marco/mrds_elt/python/devo_replicator/config/env_config.yaml",'dev','rar','CORR_RAR','NH_Asset_transactial_data'.upper(),'TEST/INTEGRATION')
|
||||
"""
|
||||
df=execute_oracle_query('select owner, table_name from CT_MRDS.A_DEVO_REPLICA_MGMT_RQSD')
|
||||
listfail=[]
|
||||
for index, row in df.iterrows():
|
||||
|
||||
try:
|
||||
print("running table: ",row["table_name"])
|
||||
run_process("/home/dbt/Marco/mrds_elt/python/devo_replicator/config/env_config.yaml",'tst','rqsd',row['owner'],row['table_name'].upper(),'TEST/INTEGRATION')
|
||||
except:
|
||||
print("failed")
|
||||
listfail.append(row["table_name"])
|
||||
print("succeded")
|
||||
print(listfail)
|
||||
"""
|
||||
|
||||
|
||||
'''{"id": 48754, "guid": "d75f1491-538d-402a-a8ac-e7e21ac0be53", "isEnabled": true, "version": 1, "service": "cm_hive", "name": "cpo_crp_rar_mu_asset_code_map_2a_policy_1", "policyType": 0, "policyPriority": 0, "description": "created-ranger_client-v0.0.6-2025-10-17T14:28:24.135108", "isAuditEnabled": true, "resources": {"database": {"values": ["crp_rar"], "isExcludes": false, "isRecursive": false},
|
||||
"column": {"values": ["ASSET_FK", "ASSET_FK", "A_DWH_LOAD_SET_FK", "A_DWH_LOAD_SET_FK", "A_VALID_FROM", "A_VALID_FROM", "A_VALID_TO", "A_VALID_TO", "CODE_TYPE_NO_ID", "CODE_TYPE_NO_ID", "CODE_VALUE", "CODE_VALUE", "INDEP_SUBPROCESS_FK", "INDEP_SUBPROCESS_FK", "TEC_EXECUTION_DATE", "TEC_EXECUTION_DATE", "TEC_INGESTION_DATE", "TEC_INGESTION_DATE", "TEC_RUN_ID", "TEC_RUN_ID"], "isExcludes": false, "isRecursive": false},
|
||||
"table": {"values": ["MU_ASSET_CODE_MAP"], "isExcludes": false, "isRecursive": false}}, "policyItems": [{"accesses": [{"type": "select", "isAllowed": true}],
|
||||
"groups": ["a_mopdb_ea", "disc-au-bda"], "delegateAdmin": false}], "serviceType": "hive", "isDenyAllElse": false}
|
||||
|
||||
{"id": 48755, "guid": "5ff857c2-3683-4178-98ce-5932c0677cd4", "isEnabled": true, "version": 1, "service": "cm_hive", "name": "cpo_crp_rar_mu_asset_code_map_2a_policy_2", "policyType": 0, "policyPriority": 0, "description": "created-ranger_client-v0.0.6-2025-10-17T14:28:24.135108", "isAuditEnabled": true, "resources": {"database": {"values": ["crp_rar"], "isExcludes": false, "isRecursive": false},
|
||||
"column": {"values": ["ASSET_FK", "A_DWH_LOAD_SET_FK", "A_VALID_FROM", "A_VALID_TO", "CODE_TYPE_NO_ID", "CODE_VALUE", "INDEP_SUBPROCESS_FK", "TEC_EXECUTION_DATE", "TEC_INGESTION_DATE", "TEC_RUN_ID"], "isExcludes": false, "isRecursive": false},
|
||||
"table": {"values": ["MU_ASSET_CODE_MAP"], "isExcludes": false, "isRecursive": false}}, "policyItems": [{"accesses": [{"type": "select", "isAllowed": true}], "
|
||||
groups": ["su-omd-reuters-users", "a_mopdb_excess_liquidity", "a-mora-lba-exp-a", "a_rar_csdb_reference_data", "a_mopdb_uc", "a_rar_csdb_ratings_data", "a_mopdb_credit_operations", "a_rar_fxcd_data", "a_rar_mdp_bbg_data", "disc-ac-riad_cnf_n-r", "a-mora-lba-ana-a", "a_mopdb_tms_data", "disc-ac-riad_core-r", "a_mopdb_mpec", "a-led-ana-a", "a-led-exp-a", "a_mopdb_ela_all"], "delegateAdmin": false}], "serviceType": "hive", "isDenyAllElse": false}
|
||||
|
||||
{"id": 48756, "guid": "1071767f-8ef6-47be-bb9b-7077ed9e9a90", "isEnabled": true, "version": 1, "service": "cm_hive", "name": "cpo_crp_rar_mu_asset_code_map_2a_policy_full_access", "policyType": 0, "policyPriority": 0, "description": "created-ranger_client-v0.0.6-2025-10-17T14:28:24.135108", "isAuditEnabled": true, "resources": {"database": {"values": ["crp_rar"], "isExcludes": false, "isRecursive": false}, "column": {"values": ["*"], "isExcludes": false, "isRecursive": false}, "table": {"values": ["MU_ASSET_CODE_MAP"], "isExcludes": false, "isRecursive": false}},
|
||||
"policyItems": [{"accesses": [{"type": "select", "isAllowed": true}], "groups": ["disc-ac-rar-r"], "delegateAdmin": false}], "serviceType": "hive", "isDenyAllElse": false}'''
|
||||
|
||||
|
||||
'''
|
||||
{"isEnabled": true, "isDenyAllElse": false, "service": "cm_hive", "name": "cpo_crp_rar_mu_asset_code_map_ext_2a_policy_1", "resources": {"database": {"values": ["crp_rar"], "isExcludes": false, "isRecursive": false}, "table": {"values": ["MU_ASSET_CODE_MAP_EXT"], "isExcludes": false, "isRecursive": false},
|
||||
"column": {"values": ["ASSET_FK", "ASSET_FK", "A_DWH_LOAD_SET_FK", "A_DWH_LOAD_SET_FK", "A_VALID_FROM", "A_VALID_FROM", "A_VALID_TO", "A_VALID_TO", "CODE_TYPE_NO_ID", "CODE_TYPE_NO_ID", "CODE_VALUE", "CODE_VALUE", "INDEP_SUBPROCESS_FK", "INDEP_SUBPROCESS_FK", "TEC_EXECUTION_DATE", "TEC_EXECUTION_DATE", "TEC_INGESTION_DATE", "TEC_INGESTION_DATE", "TEC_RUN_ID", "TEC_RUN_ID"], "isExcludes": false, "isRecursive": false}}, "policyItems": [{"delegateAdmin": false,
|
||||
"groups": ["disc-tu-bda", "t_mopdb_ea"], "accesses": [{"type": "select", "isAllowed": true}]}]}
|
||||
|
||||
|
||||
|
||||
{"isEnabled": true, "isDenyAllElse": false, "service": "cm_hive", "name": "cpo_crp_rar_mu_asset_code_map_ext_2a_policy_2", "resources": {"database": {"values": ["crp_rar"], "isExcludes": false, "isRecursive": false},
|
||||
"table": {"values": ["MU_ASSET_CODE_MAP_EXT"], "isExcludes": false, "isRecursive": false},
|
||||
"column": {"values": ["ASSET_FK", "A_DWH_LOAD_SET_FK", "A_VALID_FROM", "A_VALID_TO", "CODE_TYPE_NO_ID", "CODE_VALUE", "INDEP_SUBPROCESS_FK", "TEC_EXECUTION_DATE", "TEC_INGESTION_DATE", "TEC_RUN_ID"], "isExcludes": false, "isRecursive": false}}, "policyItems": [{"delegateAdmin": false,
|
||||
"groups": ["a-led-ana-t", "a-led-exp-t", "a-mora-lba-ana-t", "a-mora-lba-exp-t", "disc-tc-riad_cnf_n-r", "disc-tc-riad_core-r", "su-omd-reuters-users", "t_mopdb_credit_operations", "t_mopdb_ela_all", "t_mopdb_excess_liquidity", "t_mopdb_mpec", "t_mopdb_tms_data", "t_mopdb_uc", "t_rar_csdb_ratings_data", "t_rar_csdb_reference_data", "t_rar_fxcd_data", "t_rar_mdp_bbg_data"],
|
||||
"accesses": [{"type": "select", "isAllowed": true}]}]}
|
||||
|
||||
|
||||
|
||||
{"isEnabled": true, "isDenyAllElse": false, "service": "cm_hive", "name": "cpo_crp_rar_mu_asset_code_map_ext_2a_policy_full_access", "resources": {"database": {"values": ["crp_rar"], "isExcludes": false, "isRecursive": false},
|
||||
"table": {"values": ["MU_ASSET_CODE_MAP_EXT"], "isExcludes": false, "isRecursive": false}, "column": {"values": ["*"], "isExcludes": false, "isRecursive": false}}, "policyItems": [{"delegateAdmin": false,
|
||||
"groups": ["disc-dc-rar-r"], "accesses": [{"type": "select", "isAllowed": true}]}]}
|
||||
'''
|
||||
130
python/devo_replicator/table_generator/tableBuilderQueries.py
Normal file
130
python/devo_replicator/table_generator/tableBuilderQueries.py
Normal file
@@ -0,0 +1,130 @@
|
||||
metadata_table = "DW_RAR.NH_METADATA_INVENTORY"
|
||||
|
||||
def get_query_metadata(metadata_table, owner, table_name):
    """Build the Oracle query mapping a source table's columns to Hive types.

    The CTE classifies each column's precision/scale/type into Hive-friendly
    values; the outer SELECT renders the final ``data_type_string``
    (e.g. ``Decimal(p,s)``) ordered by column position.
    """
    # Precision: keep data_precision where usable, fall back to data_scale,
    # NULL when the combination is out of range.
    precision_rule = (
        "CASE WHEN data_precision IS NULL AND data_scale IS NULL THEN NULL "
        "WHEN data_precision IS NOT NULL AND data_scale IS NULL THEN data_precision "
        "WHEN CAST(data_precision AS INT) >= CAST(data_scale AS INT) AND CAST(data_scale AS INT) >= 0 THEN data_precision "
        "WHEN CAST(data_precision AS INT) < CAST(data_scale AS INT) AND CAST(data_scale AS INT) <= 38 AND CAST(data_scale AS INT) > 0 THEN data_scale "
        "WHEN CAST(data_precision AS INT) < CAST(data_scale AS INT) AND (CAST(data_scale AS INT) > 38 OR CAST(data_scale AS INT) < 0) THEN NULL "
        "ELSE NULL END AS data_precision_hive, "
    )
    # Scale: mirrors the precision rules, defaulting to 0 when only a
    # precision is present.
    scale_rule = (
        "CASE WHEN data_precision IS NULL AND data_scale IS NULL THEN NULL "
        "WHEN data_precision IS NOT NULL AND data_scale IS NULL THEN CAST(0 AS INT) "
        "WHEN CAST(data_precision AS INT) >= CAST(data_scale AS INT) AND CAST(data_scale AS INT) >= 0 THEN data_scale "
        "WHEN CAST(data_precision AS INT) < CAST(data_scale AS INT) AND CAST(data_scale AS INT) <= 38 AND CAST(data_scale AS INT) > 0 THEN data_scale "
        "WHEN CAST(data_precision AS INT) < CAST(data_scale AS INT) AND (CAST(data_scale AS INT) > 38 OR CAST(data_scale AS INT) < 0) THEN NULL "
        "ELSE NULL END AS data_scale_hive, "
    )
    # Type: NUMBER -> Decimal/String depending on precision/scale validity;
    # everything else maps to String.
    type_rule = (
        "CASE WHEN data_type LIKE '%NUMBER%' AND data_precision IS NULL AND data_scale IS NULL THEN 'String' "
        "WHEN data_type LIKE '%NUMBER%' AND data_precision IS NOT NULL AND data_scale IS NULL THEN 'Decimal' "
        "WHEN data_type LIKE '%NUMBER%' AND CAST(data_precision AS INT) >= CAST(data_scale AS INT) AND CAST(data_scale AS INT) >= 0 THEN 'Decimal' "
        "WHEN data_type LIKE '%NUMBER%' AND CAST(data_precision AS INT) < CAST(data_scale AS INT) AND CAST(data_scale AS INT) <= 38 AND CAST(data_scale AS INT) > 0 THEN 'Decimal' "
        "WHEN data_type LIKE '%NUMBER%' AND CAST(data_precision AS INT) < CAST(data_scale AS INT) AND (CAST(data_scale AS INT) > 38 OR CAST(data_scale AS INT) < 0) THEN 'String' "
        "WHEN data_type LIKE '%CHAR%' THEN 'String' "
        "WHEN data_type LIKE '%VARCHAR2%' THEN 'String' "
        "WHEN data_type LIKE '%TIMESTAMP%' THEN 'String' "
        "WHEN data_type LIKE '%DATE%' THEN 'String' "
        "ELSE 'String' END AS data_type_hive, "
    )
    cte = (
        "WITH metaDF AS ( "
        "SELECT owner, table_name, column_id, column_name, data_type, data_precision, data_scale, "
        + precision_rule
        + scale_rule
        + type_rule
        + "REGEXP_REPLACE(data_description, '''', '\\''') AS data_description "
        + f"FROM {metadata_table} "
        + f"WHERE lower(owner||'.'||table_name) = lower('{owner}'||'.'||'{table_name}') "
        + "AND a_valid_to > sysdate) "
    )
    final_select = (
        "SELECT owner, table_name, column_id, column_name, data_type, data_precision, data_scale, "
        "data_precision_hive, data_scale_hive, data_type_hive, "
        "CASE WHEN data_type_hive = 'Decimal' THEN 'Decimal(' || COALESCE(CAST(data_precision_hive AS VARCHAR2(30)), '') || ',' || COALESCE(CAST(data_scale_hive AS VARCHAR2(30)), '') || ')' "
        "ELSE data_type_hive END AS data_type_string, data_description "
        "FROM metaDF "
        "ORDER BY CAST(column_id AS INT) "
    )
    return cte + final_select
||||
|
||||
def get_query_metadata_access_type1(metadata_table):
    """Build the SQL query selecting access-type '1' tables from *metadata_table*.

    The returned SQL keeps the literal ``$$$1`` placeholder untouched; the
    caller is expected to substitute it with the lowercase "owner.table"
    value later. Two branches are UNIONed: currently-valid rows whose
    source list is not 'RAR', plus all rows owned by CORR_REF_MAIN.
    """
    select_clause = (
        "SELECT owner, table_name, list_of_sources as SOURCE, rar3_type_of_access "
    )
    # Filters shared by both UNION branches.
    base_filter = (
        f"FROM {metadata_table} "
        "WHERE a_valid_to > sysdate "
        "AND rar3_type_of_access = '1' "
    )
    name_filter = "AND lower(owner||'.'||table_name) = lower($$$1) "

    return (
        select_clause
        + base_filter
        + "AND list_of_sources NOT IN 'RAR' "
        + name_filter
        + "UNION "
        + select_clause
        + base_filter
        + "AND owner = 'CORR_REF_MAIN' "
        + name_filter
    )
|
||||
|
||||
def get_query_metadata_access_type2a(metadata_table):
    """Build the SQL query listing column-level source access for access-type '2a'.

    The query UNPIVOTs per-source flag columns of *metadata_table* into
    (owner, table_name, column_name, source) rows, adds three technical
    metadata columns (TEC_INGESTION_DATE, TEC_EXECUTION_DATE, TEC_RUN_ID)
    attributed to a dummy 'RAR' source entry, and returns the union of both.

    The literal ``$$$1`` placeholder is left in the SQL for the caller to
    substitute with the lowercase "owner.table" value later.
    """
    query_metadata_access_type2a = (
        # CTE 1: one row per (table, column, source) for type-2a tables.
        "WITH rar_columns AS ( "
        "SELECT owner, table_name, column_name, source, rar3_type_of_access "
        "FROM ( "
        "SELECT owner, table_name, column_name, rar3_type_of_access, list_of_sources, "
        # One flag column per known source system; unpivoted below.
        "tms, c2d_ea, c2d_ela, c2d_mpec, c2d_uc, ceph, lm, csdb_reference, "
        "csdb_ratings, fxcd, mdp_bbg, mdp_reu, riad_cl, riad_ou, sdw_estr, sdw_fx, "
        "top, rar, rtm, led, mdp_cma "
        "FROM {0} "
        "WHERE a_valid_to > sysdate "
        "AND rar3_type_of_access = '2a' "
        "AND lower(owner || '.' || table_name) = lower($$$1) "
        # Only single-source rows qualify (no comma-separated source lists).
        "AND list_of_sources NOT LIKE '%,%' "
        "AND upper(column_name) NOT IN ('DATABASE') "
        ") a "
        "UNPIVOT ( "
        "val FOR (source) IN ( "
        "tms AS 'TMS', c2d_ea AS 'C2D_EA', c2d_ela AS 'C2D_ELA', c2d_mpec AS 'C2D_MPEC', "
        "c2d_uc AS 'C2D_UC', ceph AS 'CEPH', lm AS 'LM', csdb_reference AS 'CSDB_REFERENCE', "
        "csdb_ratings AS 'CSDB_RATINGS', fxcd AS 'FXCD', mdp_bbg AS 'MDP_BBG', mdp_reu AS 'MDP_REU', "
        "riad_cl AS 'RIAD_CL', riad_ou AS 'RIAD_OU', sdw_estr AS 'SDW_ESTR', sdw_fx AS 'SDW_FX', "
        "top AS 'TOP', rar AS 'RAR', rtm AS 'RTM', led AS 'LED', mdp_cma AS 'MDP_CMA') "
        ") "
        "ORDER BY owner, table_name, column_name "
        "), "
        # CTE 2: a single anchor row, with the source forced to 'RAR'.
        "dummy_entry AS ( "
        "SELECT owner, table_name, 'RAR' as SOURCE, rar3_type_of_access "
        "FROM rar_columns "
        "FETCH FIRST ROW ONLY "
        "), "
        # CTE 3: the three synthetic technical-metadata columns.
        "disc_tec_fields AS ( "
        "SELECT owner, table_name, 'TEC_INGESTION_DATE' AS column_name, source, rar3_type_of_access "
        "FROM dummy_entry "
        "UNION "
        "( "
        "SELECT owner, table_name, 'TEC_EXECUTION_DATE' AS column_name, source, rar3_type_of_access "
        "FROM dummy_entry "
        ") "
        "UNION "
        "( "
        "SELECT owner, table_name, 'TEC_RUN_ID' AS column_name, source, rar3_type_of_access "
        "FROM dummy_entry "
        ") "
        ") "
        # Final result: technical fields plus the unpivoted real columns.
        "SELECT owner, table_name, column_name, source, rar3_type_of_access "
        "FROM disc_tec_fields "
        "UNION "
        "SELECT owner, table_name, column_name, source, rar3_type_of_access "
        "FROM rar_columns "
    ).format(metadata_table)

    return query_metadata_access_type2a
|
||||
|
||||
def get_query_igam_roles(igam_table, service):
    """Build the SQL query selecting IGAM role mappings for *service*.

    Args:
        igam_table: fully-qualified name of the IGAM/Sentry mapping table.
        service: service name (e.g. 'rar', 'mopdb'); matched upper-cased
            against SERVICE_NAME in the table.

    Returns:
        SQL string. The literal ``$$$1`` placeholder is left in place for
        the caller to substitute with the environment name later.
    """
    # NOTE(review): the original computed a `service_entitlement` local whose
    # if/elif/else branches all assigned the same value ('mrds') and never
    # used it in the query — removed as dead code. Behavior is unchanged.
    return (
        "SELECT MRDS_subsource_id as Datasource, "
        "MRDS_subsource_id as subsource_id, "
        "MRDS_entitlement as IGAM_Entitlement, "
        "environment "
        f"FROM {igam_table} where lower(environment) = lower($$$1) "
        f"and SERVICE_NAME='{service.upper()}'"
    )
|
||||
54
python/devo_replicator/table_generator/testScript.py
Normal file
54
python/devo_replicator/table_generator/testScript.py
Normal file
@@ -0,0 +1,54 @@
|
||||
# Ad-hoc developer test script: drives the table-generator pipeline end to end
# with a hard-coded argument set. Not part of the production workflow.
import os

import mrds_elt.python.devo_replicator.FlowOptions as ro
import tableBuilderProcessor_2 as tbp

# setting variables
# Previously-used argument sets kept for reference (CORR_RAR, access types 1/2a):
# args = [
#     'corporate_store=crp_rar',
#     'source_schema=CORR_RAR',
#     'source_table=NH_F_RATING',
#     'target_table=NH_F_RATING',
#     'access_type=1',
#     'oracle_metadata_table=CORR_RAR.NH_METADATA_INVENTORY',
#     'oracle_igam_table=CT_REF.RAR_SOURCES_IGAM_SENTRY',
#     'sentry_role_environment=production',
#     'target_s3_bucket=s3a://devo-crp-ffppyd8q',
#     'tech_meta_data_fields=tec_ingestion_date String, tec_execution_date String, tec_run_id String',
#     'full_access_entitlement_list=DISC-PC-RAR-R'
# ]

# args = [
#     'corporate_store=crp_rar',
#     'source_schema=CORR_RAR',
#     'source_table=NH_ASSET',
#     'target_table=NH_ASSET',
#     'access_type=2a',
#     'oracle_metadata_table=CORR_RAR.NH_METADATA_INVENTORY',
#     'oracle_igam_table=CT_REF.RAR_SOURCES_IGAM_SENTRY',
#     'sentry_role_environment=production',
#     'target_s3_bucket=s3a://devo-crp-ffppyd8q',
#     'tech_meta_data_fields=tec_ingestion_date String, tec_execution_date String, tec_run_id String',
#     'full_access_entitlement_list=DISC-PC-RAR-R'
# ]


# Active run: MOPDB MPEC table. key=value pairs parsed by FlowOptions.Options.
# NOTE(review): unlike the commented sets above, this one omits target_table
# and access_type — presumably optional in Options; confirm against FlowOptions.
args = [
    'corporate_store=crp_mopdb',
    'source_schema=MPEC',
    'source_table=T_MPEC',
    'oracle_metadata_table=CT_MOPDB.MOPDB_METADATA_INVENTORY',
    'oracle_igam_table=CT_MOPDB.MOPDB_SOURCES_IGAM_SENTRY',
    'sentry_role_environment=production',
    'target_s3_bucket=s3a://devo-crp-sbul3ju3/mopdb/db',
    'tech_meta_data_fields=tec_ingestion_date String, tec_execution_date String, tec_run_id String',
    'full_access_entitlement_list='
]

rar_options = ro.Options(args)

# Pipeline: load metadata, create external tables, read IGAM roles, map access.
tableFields = tbp.loadMetadataTable(rar_options)
tbp.createExternalTables_CRP_RAR(rar_options,tableFields )
igamRoleDF = tbp.readIGAMRoles(rar_options)

tbp.accessTypeMapper(rar_options, igamRoleDF)
|
||||
6
python/mrds_common/.gitignore
vendored
Normal file
6
python/mrds_common/.gitignore
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
__pycache__
|
||||
*.log
|
||||
.venv
|
||||
.tox
|
||||
*.egg-info/
|
||||
build
|
||||
72
python/mrds_common/CHANGELOG.md
Normal file
72
python/mrds_common/CHANGELOG.md
Normal file
@@ -0,0 +1,72 @@
|
||||
# Changelog
|
||||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [0.6.0] - 13-10-2025
|
||||
|
||||
### Added
|
||||
|
||||
- new type of column xpath_element_id
|
||||
|
||||
## [0.5.0] - 08-10-2025
|
||||
|
||||
### Added
|
||||
|
||||
- added new mandatory configuration parameter `archive_prefix`. App now archives source file to this location, before deleting it from inbox_prefix location.
|
||||
- log app version at runtime.
|
||||
|
||||
### Changed
|
||||
|
||||
- improved logging when calling database function CT_MRDS.FILE_MANAGER.PROCESS_SOURCE_FILE
|
||||
- removed local zip file deletion from version 0.4.0 to accommodate archiving at the end of the processing
|
||||
|
||||
|
||||
## [0.4.1] - 03-10-2025
|
||||
|
||||
### Added
|
||||
|
||||
- `--version` flag to CLI, now shows package version from `mrds.__version__`. ([#179](https://gitlab.sofa.dev/mrds/mrds_elt/-/merge_requests/179))
|
||||
|
||||
## [0.4.0] - 03-10-2025
|
||||
|
||||
### Added
|
||||
|
||||
- App versioning!
|
||||
- Streaming algorithm when reading, filtering and enriching csv files. This drastically improves application performance in regards to RAM usage.
|
||||
- Unzipping now deletes local source zip file, after data has been extracted.
|
||||
|
||||
## [0.3.1] - 30-09-2025
|
||||
|
||||
### Fixed
|
||||
|
||||
- fixed small bug related to the new encoding setting
|
||||
|
||||
## [0.3.0] - 29-09-2025
|
||||
|
||||
### Added
|
||||
|
||||
- new type of config - Application config.
|
||||
These will be very specific application settings to be overridden in specific cases. Consequently, such configuration will only be optional, because rare usage is expected. First such config is encoding_type
|
||||
|
||||
### Changed
|
||||
|
||||
- removed output of .log files when running the application
|
||||
|
||||
### Fixed
|
||||
|
||||
- small bug when unzipping a file
|
||||
|
||||
## [0.2.0] - 17-09-2025
|
||||
|
||||
### Added
|
||||
|
||||
- automatic deletion of the source file, and all temporary files created by the app.
|
||||
- two new CLI parameters - --keep-source-file and --keep-tmp-dir flags, to be used to avoid deleting the source file and/or temporary working directory when testing.
|
||||
- row count output in log files after enrichment.
|
||||
|
||||
### Fixed
|
||||
|
||||
- source and output columns in csv extraction were mistakenly swapped. This is now fixed.
|
||||
328
python/mrds_common/README.md
Normal file
328
python/mrds_common/README.md
Normal file
@@ -0,0 +1,328 @@
|
||||
# MRDS APP
|
||||
|
||||
The main purpose of this application is to download XML or CSV files from source, perform some basic ETL and upload them to target.
|
||||
Below is a simplified workflow of the application.
|
||||
|
||||
## Application workflow
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
subgraph CoreApplication
|
||||
direction TB
|
||||
B[Read and validate config file] --> |If valid| C[Download source file]
|
||||
C[Download source file] --> D[Unzip if file is ZIP]
|
||||
D[Unzip if file is ZIP] --> E[Validate source file]
|
||||
E --> |If valid| G[Start task defined in config file]
|
||||
G --> H[Build output file with selected data from source]
|
||||
H --> I[Enrich output file with metadata]
|
||||
I --> J[Upload the output file]
|
||||
J --> K[Trigger remote function]
|
||||
K --> L[Check if more tasks are available in config file]
|
||||
L --> |Yes| G
|
||||
L --> |No| M[Archive & Delete source file]
|
||||
M --> N[Finish workflow]
|
||||
end
|
||||
A[Trigger app via CLI or Airflow DAG] --> CoreApplication
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
Checkout repository and cd to root project directory
|
||||
|
||||
```shell
|
||||
cd python/mrds_common
|
||||
```
|
||||
|
||||
Create new virtual environment using Python >=3.11
|
||||
|
||||
```shell
|
||||
python3.11 -m venv .venv
|
||||
```
|
||||
|
||||
Activate virtual environment
|
||||
|
||||
```shell
|
||||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
Upgrade pip
|
||||
|
||||
```shell
|
||||
pip install --upgrade pip
|
||||
```
|
||||
|
||||
Install app
|
||||
|
||||
```shell
|
||||
pip install .
|
||||
```
|
||||
|
||||
## Environment variables
|
||||
|
||||
There are two operating system environment variables which are required by the application:
|
||||
|
||||
BUCKET_NAMESPACE - OCI namespace where main operating bucket is located (if not found - default value is frcnomajoc7v)
|
||||
|
||||
BUCKET - main operating OCI bucket for downloading and uploading files (if not found - default value is mrds_inbox_poc)
|
||||
|
||||
|
||||
## Usage
|
||||
|
||||
The application accepts two required and four optional parameters.
|
||||
|
||||
### Parameters
|
||||
|
||||
| Parameter | Short Flag | Required | Default | Description |
|
||||
|-------------------------------|------------|----------|---------|----------------------------------------------------------------------------------------------------------------------|
|
||||
| `--workflow-context` | `-w` | No* | None | JSON string representing the workflow context. Must contain `run_id` and `a_workflow_history_key`. |
|
||||
| `--generate-workflow-context` | | No* | | Flag type. If provided, app automatically generates and finalizes workflow context. Use this if `--workflow-context` is not provided. |
|
||||
| `--source-filename` | `-s` | Yes | None | Name of the source file to be looked up in source inbox set in configuration file (`inbox_prefix`). |
|
||||
| `--config-file` | `-c` | Yes | None | Path to the YAML configuration file. Can be absolute, or relative to current working directory. |
|
||||
| `--keep-source-file` | | No | | Flag type. If provided, app keeps source file, instead of archiving and deleting it. |
|
||||
| `--keep-tmp-dir` | | No | | Flag type. If provided, app keeps tmp directory, instead of deleting it. |
|
||||
|
||||
*`--workflow-context` and `--generate-workflow-context` are both optional, however - either one of them MUST be provided for the application to run.
|
||||
|
||||
|
||||
### CLI
|
||||
|
||||
```shell
|
||||
mrds-cli --workflow-context '{"run_id": "0ce35637-302c-4293-8069-3186d5d9a57d", "a_workflow_history_key": 352344}' \
|
||||
--source-filename 'CSDB_Debt_Daily.ZIP' \
|
||||
--config-file /home/dbt/GEORGI/projects/mrds_elt/airflow/ods/csdb/debt_daily/config/yaml/csdb_debt_daily.yaml
|
||||
```
|
||||
|
||||
### Python module
|
||||
|
||||
Import main function from core module and provide needed parameters:
|
||||
|
||||
```python
|
||||
from mrds.core import main
|
||||
from mrds.utils.manage_runs import init_workflow, finalise_workflow
|
||||
from mrds.utils.static_vars import status_success, status_failed
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import sys
|
||||
|
||||
# Configure logging for your needs. This is just a sample
|
||||
current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
log_filename = f"mrds_{current_time}.log"
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s %(name)s - %(message)s",
|
||||
handlers=[
|
||||
logging.FileHandler(log_filename),
|
||||
logging.StreamHandler(sys.stdout),
|
||||
],
|
||||
)
|
||||
|
||||
STATUS_SUCCESS = status_success
|
||||
STATUS_FAILURE = status_failed
|
||||
|
||||
# Run time parameters
|
||||
|
||||
run_id = "0ce35637-302c-4293-8069-3186d5d9a57d"
|
||||
a_workflow_history_key = init_workflow(database_name='ODS', workflow_name='w_OU_C2D_UC_DISSEM', workflow_run_id=run_id)
|
||||
|
||||
workflow_context = {
|
||||
"run_id": run_id,
|
||||
"a_workflow_history_key": a_workflow_history_key,
|
||||
}
|
||||
|
||||
source_filename = "CSDB_Debt_Daily.ZIP"
|
||||
config_file = "/home/dbt/GEORGI/projects/mrds_elt/airflow/ods/csdb/debt_daily/config/yaml/csdb_debt_daily.yaml"
|
||||
|
||||
main(workflow_context, source_filename, config_file)
|
||||
|
||||
# implement your desired error handling logic and provide correct status to function finalize_workflow
|
||||
|
||||
finalise_workflow(workflow_context["a_workflow_history_key"], STATUS_SUCCESS)
|
||||
|
||||
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### Generate workflow context
|
||||
|
||||
Use this if you are using the application in standalone mode. Workflow context will be generated, and then finalized.
|
||||
|
||||
### Source filename
|
||||
|
||||
This is the source file name to be looked up in the source inbox set in the configuration file (`inbox_prefix`).
|
||||
|
||||
### Workflow context
|
||||
|
||||
This is a JSON string (or from the application standpoint view - dictionary) containing run_id and a_workflow_history_key values.
|
||||
|
||||
```JSON
|
||||
workflow_context = {
|
||||
"run_id": "0ce35637-302c-4293-8069-3186d5d9a57d",
|
||||
"a_workflow_history_key": 352344,
|
||||
}
|
||||
```
|
||||
|
||||
run_id - this represents the orchestration ID. Can be any string ID of your choice, for example an Airflow DAG ID.
|
||||
a_workflow_history_key - can be generated via mrds.utils.manage_runs.init_workflow() function.
|
||||
|
||||
If you provide workflow context by yourself, you need to take care of finalizing it too.
|
||||
|
||||
### Config file
|
||||
|
||||
This is the main place from which we can control the application.
|
||||
|
||||
At the top, are the Application configurations. These apply to all tasks. These are all optional and are used to override some specific runtime application settings.
|
||||
|
||||
```yaml
|
||||
# System configurations
|
||||
|
||||
encoding_type: cp1252 # Overrides default encoding type (utf-8) of the app. This encoding is used when reading source csv/xml files and when writing the output csv files of the app. For codec naming, follow guidelines here - https://docs.python.org/3/library/codecs.html#standard-encodings
|
||||
```
|
||||
|
||||
After that, are the global configurations. These apply to all tasks:
|
||||
|
||||
```yaml
|
||||
# Global configurations
|
||||
tmpdir: /tmp # root temporary directory to create runtime temporary directory, download source file and perform operations on it, before upload it to target
|
||||
inbox_prefix: INBOX/C2D/UC_DISSEM # prefix for the inbox containing the source file
|
||||
archive_prefix: ARCHIVE/C2D/UC_DISSEM # prefix for the archive bucket
|
||||
workflow_name: w_OU_C2D_UC_DISSEM # name of the particular workflow
|
||||
validation_schema_path: 'xsd/UseOfCollateralMessage.xsd' # relative path (to runtime location) to schema used to validate XML or CSV file
|
||||
file_type: xml # file type of the expected source file - either CSV or XML
|
||||
```
|
||||
|
||||
Following, there is a list of tasks to be performed on the source file.
|
||||
We can have multiple tasks per file, meaning - we can generate more than one output file, from one source file.
|
||||
Further, one of the key configuration parameters per task is "output_columns". There we define columns of the final output file.
|
||||
There are several types of columns:
|
||||
|
||||
xpath - this type of column is used when the source file is XML. It is a standard xpath expression, pointing to a path in the xml.
|
||||
|
||||
xpath_element_id - this type of column is used when we need to identify a particular xml element. Used to create foreign keys between two separate tasks. It is a standard xpath expression, pointing to a path in the xml.
|
||||
|
||||
csv_header - this type of column is used when source file is CSV. It just points to the corresponding csv header in the source file.
|
||||
|
||||
a_key - generates key unique per row.
|
||||
|
||||
workflow_key - generates key unique per run of the application
|
||||
|
||||
static - allows the user to define column with static value
|
||||
|
||||
The application respects the order of the output columns in the configuration file, when generating the output file.
|
||||
Data and columns from the source file, not included in the configuration file, will not be present in the final output file.
|
||||
|
||||
Example of xml task configuration:
|
||||
|
||||
```yaml
|
||||
# List of tasks
|
||||
tasks:
|
||||
- task_name: ou_lm_standing_facilities_header_create_file # name of the particular task
|
||||
ods_prefix: INBOX/LM/STANDING_FACILITIES/STANDING_FACILITIES_HEADER # prefix for the upload location
|
||||
output_table: standing_facilities_headers # table in Oracle
|
||||
namespaces:
|
||||
ns2: 'http://escb.ecb.int/sf' # XML namespace
|
||||
output_columns: # Columns in the output file, order will be respected.
|
||||
- type: 'a_key' # A_KEY type of column
|
||||
column_header: 'A_KEY' # naming of the column in the output file
|
||||
- type: 'workflow_key' # WORKFLOW_KEY type of column
|
||||
column_header: 'A_WORKFLOW_HISTORY_KEY'
|
||||
- type: 'xpath' # xpath type of column
|
||||
value: '//ns2:header/ns2:version'
|
||||
column_header: 'REV_NUMBER'
|
||||
is_key: 'N' # value is transposed across the rows - YES/NO. Used when there is only single value in source XML
|
||||
- type: 'xpath'
|
||||
value: '//ns2:header/ns2:referenceDate'
|
||||
column_header: 'REF_DATE'
|
||||
is_key: 'N'
|
||||
- type: 'static'
|
||||
value: ''
|
||||
column_header: 'FREE_TEXT'
|
||||
|
||||
- task_name: ou_lm_standing_facilities_create_file
|
||||
ods_prefix: INBOX/LM/STANDING_FACILITIES/STANDING_FACILITIES
|
||||
output_table: standing_facilities
|
||||
namespaces:
|
||||
ns2: 'http://escb.ecb.int/sf'
|
||||
output_columns:
|
||||
- type: 'a_key'
|
||||
column_header: 'A_KEY'
|
||||
- type: 'workflow_key'
|
||||
column_header: 'A_SFH_FK'
|
||||
- type: 'workflow_key'
|
||||
column_header: 'A_WORKFLOW_HISTORY_KEY'
|
||||
- type: 'xpath'
|
||||
value: '//ns2:disaggregatedStandingFacilities/ns2:standingFacilities/ns2:disaggregatedStandingFacility/ns2:country'
|
||||
column_header: 'COUNTRY'
|
||||
- type: 'static'
|
||||
value: ''
|
||||
column_header: 'COMMENT_'
|
||||
|
||||
```
|
||||
|
||||
Example of CSV task configuration:
|
||||
|
||||
```yaml
|
||||
tasks:
|
||||
- task_name: ODS_CSDB_DEBT_DAILY_process_csv
|
||||
ods_prefix: ODS/CSDB/DEBT_DAILY
|
||||
output_table: DEBT_DAILY
|
||||
output_columns:
|
||||
- type: 'a_key'
|
||||
column_header: 'A_KEY'
|
||||
- type: 'workflow_key'
|
||||
column_header: 'A_WORKFLOW_HISTORY_KEY'
|
||||
- type: 'csv_header' # csv_header type of column
|
||||
value: 'Date last modified' # naming of the column in the SOURCE file
|
||||
column_header: 'Date last modified' # naming of the column in the OUTPUT file
|
||||
- type: 'csv_header'
|
||||
value: 'Extraction date'
|
||||
column_header: 'Extraction date'
|
||||
- type: 'csv_header'
|
||||
value: 'ISIN code'
|
||||
column_header: 'ISIN code'
|
||||
```
|
||||
|
||||
## Development
|
||||
|
||||
### Installing requirements
|
||||
|
||||
Install app + dev requirements. For easier workflow, you can install in editable mode
|
||||
|
||||
```
|
||||
pip install -e .[dev]
|
||||
```
|
||||
|
||||
In editable mode, instead of copying the package files to the site-packages directory, pip creates a special link that points to the source code directory. This means any changes you make to your source code will be immediately available without needing to reinstall the package.
|
||||
|
||||
### Code formatting
|
||||
|
||||
Run black to reformat the code before pushing changes.
|
||||
|
||||
Following will reformat all files recursively from current dir.
|
||||
|
||||
```
|
||||
black .
|
||||
```
|
||||
|
||||
Following will only check and report what needs to be formatted, recursively from current dir.
|
||||
|
||||
```
|
||||
black --check --diff .
|
||||
```
|
||||
|
||||
### Tests
|
||||
|
||||
Run tests with
|
||||
|
||||
```
|
||||
pytest .
|
||||
```
|
||||
|
||||
### Tox automation
|
||||
|
||||
Tox automates runs of black checks and tests
|
||||
|
||||
```
|
||||
tox .
|
||||
```
|
||||
1
python/mrds_common/mrds/__init__.py
Normal file
1
python/mrds_common/mrds/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
__version__ = "0.6.0"
|
||||
117
python/mrds_common/mrds/cli.py
Normal file
117
python/mrds_common/mrds/cli.py
Normal file
@@ -0,0 +1,117 @@
|
||||
import click
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
|
||||
from mrds import __version__
|
||||
from mrds.core import main
|
||||
|
||||
|
||||
@click.command()
@click.version_option(version=__version__, prog_name="mrds")
@click.option(
    "--workflow-context",
    "-w",
    required=False,
    help="Workflow context to be used by the application. This is required unless --generate-workflow-context is provided.",
)
@click.option(
    "--source-filename",
    "-s",
    required=True,
    help="Source filename to be processed.",
)
@click.option(
    "--config-file",
    "-c",
    type=click.Path(exists=True),
    required=True,
    help="Path to the YAML configuration file.",
)
@click.option(
    "--generate-workflow-context",
    is_flag=True,
    default=False,
    help="Generate a workflow context automatically. If this is set, --workflow-context is not required.",
)
@click.option(
    "--keep-source-file",
    is_flag=True,
    default=False,
    help="Keep source file, instead of deleting it.",
)
@click.option(
    "--keep-tmp-dir",
    is_flag=True,
    default=False,
    help="Keep tmp directory, instead of deleting it.",
)
def cli_main(
    workflow_context,
    source_filename,
    config_file,
    generate_workflow_context,
    keep_source_file,
    keep_tmp_dir,
):
    """CLI entry point for the mrds application.

    Validates the mutually-exclusive --workflow-context /
    --generate-workflow-context options, parses the workflow-context JSON
    when given, configures stdout logging, and delegates to mrds.core.main.

    Raises:
        click.UsageError: on conflicting/missing options or malformed
            workflow-context JSON.
    """

    # Configure logging: INFO to stdout only (no log files — see CHANGELOG 0.3.0).
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s - %(message)s",
        handlers=[
            logging.StreamHandler(sys.stdout),
        ],
    )

    # Handle conflicting options
    if workflow_context and generate_workflow_context:
        raise click.UsageError(
            "You cannot use both --workflow-context and --generate-workflow-context at the same time. "
            "Please provide only one."
        )

    # Enforce that either --workflow-context or --generate-workflow-context must be provided
    if not workflow_context and not generate_workflow_context:
        raise click.UsageError(
            "You must provide --workflow-context or use --generate-workflow-context flag."
        )

    # Parse and validate the workflow_context if provided
    if workflow_context:
        try:
            # Rebind the raw JSON string to its parsed dict form.
            workflow_context = json.loads(workflow_context)
        except json.JSONDecodeError as e:
            raise click.UsageError(f"Invalid JSON for --workflow-context: {e}")

        # Validate that the workflow_context matches the expected structure
        if (
            not isinstance(workflow_context, dict)
            or "run_id" not in workflow_context
            or "a_workflow_history_key" not in workflow_context
        ):
            raise click.UsageError(
                "Invalid workflow context structure. It must be a JSON object with 'run_id' and 'a_workflow_history_key'."
            )

    # Call the core processing function
    main(
        workflow_context,
        source_filename,
        config_file,
        generate_workflow_context,
        keep_source_file,
        keep_tmp_dir,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Map outcomes to conventional exit codes:
    # 0 = success, 2 = CLI usage error, 1 = any other failure.
    try:
        cli_main()
        sys.exit(0)
    except click.UsageError as e:
        logging.error(f"Usage error: {e}")
        sys.exit(2)
    except Exception as e:
        # Top-level boundary: log and exit non-zero instead of a raw traceback.
        logging.error(f"Unexpected error: {e}")
        sys.exit(1)
|
||||
366
python/mrds_common/mrds/core.py
Normal file
366
python/mrds_common/mrds/core.py
Normal file
@@ -0,0 +1,366 @@
|
||||
import os
|
||||
import uuid
|
||||
import logging
|
||||
import yaml
|
||||
import zipfile
|
||||
import tempfile
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from mrds import __version__
|
||||
|
||||
from mrds.processors import get_file_processor
|
||||
from mrds.utils import (
|
||||
manage_runs,
|
||||
objectstore,
|
||||
static_vars,
|
||||
xml_utils,
|
||||
)
|
||||
|
||||
|
||||
# environment variables
|
||||
MRDS_ENV = os.getenv("MRDS_ENV", "poc")
|
||||
BUCKET = os.getenv("INBOX_BUCKET", "mrds_inbox_poc")
|
||||
BUCKET_NAMESPACE = os.getenv("BUCKET_NAMESPACE", "frcnomajoc7v")
|
||||
|
||||
|
||||
# Static configuration variables
|
||||
WORKFLOW_TYPE = "ODS"
|
||||
ENCODING_TYPE = "utf-8"
|
||||
|
||||
CONFIG_REQUIRED_KEYS = [
|
||||
"tmpdir",
|
||||
"inbox_prefix",
|
||||
"archive_prefix",
|
||||
"workflow_name",
|
||||
"validation_schema_path",
|
||||
"tasks",
|
||||
"file_type",
|
||||
]
|
||||
|
||||
TASK_REQUIRED_KEYS = [
|
||||
"task_name",
|
||||
"ods_prefix",
|
||||
"output_table",
|
||||
"output_columns",
|
||||
]
|
||||
|
||||
STATUS_SUCCESS = static_vars.status_success
|
||||
STATUS_FAILURE = static_vars.status_failed
|
||||
|
||||
|
||||
@dataclass
class GlobalConfig:
    """Run-wide configuration assembled from the YAML config file and environment.

    Populated by initialize_config() from the required YAML keys plus the
    BUCKET / BUCKET_NAMESPACE environment-derived defaults.
    """

    tmpdir: str  # root temporary directory for downloaded/working files
    inbox_prefix: str  # object-store prefix where the source file is expected
    archive_prefix: str  # object-store prefix the source file is archived to
    workflow_name: str  # workflow identifier used when initializing a run
    source_filename: str  # current source file name (rewritten after unzip)
    validation_schema_path: str  # path to schema used to validate the source file
    bucket: str  # object-store bucket name
    bucket_namespace: str  # object-store namespace containing the bucket
    file_type: str  # expected source file type from config (e.g. xml/csv)
    encoding_type: str  # text encoding used for file reads/writes

    def __post_init__(self):
        # unzip_source_file_if_needed() rewrites source_filename to the extracted
        # member, but archive/delete must still target the object as downloaded.
        self.original_source_filename = self.source_filename  # keep this in case we have a zip file to archive

    @property
    def source_filepath(self) -> str:
        """Local path of the current (possibly extracted) source file."""
        return os.path.join(self.tmpdir, self.source_filename)

    @property
    def original_source_filepath(self) -> str:
        """Local path of the source file as originally downloaded."""
        return os.path.join(self.tmpdir, self.original_source_filename)
|
||||
|
||||
|
||||
@dataclass
class TaskConfig:
    """Configuration for a single task from the YAML config's `tasks` list."""

    task_name: str  # human-readable name of the task
    ods_prefix: str  # object-store prefix for the task's output upload
    output_table: str  # destination table name for the output
    namespaces: dict  # XML namespace prefix -> URI map (empty for CSV tasks)
    output_columns: list  # ordered output column definitions from the config
|
||||
|
||||
|
||||
def initialize_config(source_filename, config_file_path):
    """Load and validate the YAML configuration file.

    Args:
        source_filename: name of the source file to process (stored on the
            returned GlobalConfig).
        config_file_path: path to the YAML configuration file.

    Returns:
        Tuple of (GlobalConfig, list[TaskConfig]).

    Raises:
        FileNotFoundError: if the configuration file does not exist.
        ValueError: if required global or per-task keys are missing.
    """
    logging.info(f"Source filename is set to: {source_filename}")
    logging.info(f"Loading configuration from {config_file_path}")

    # Fail fast with a clear message when the config path is wrong.
    if not os.path.exists(config_file_path):
        raise FileNotFoundError(f"Configuration file {config_file_path} not found.")

    with open(config_file_path, "r") as handle:
        config_data = yaml.safe_load(handle)
    logging.debug(f"Configuration data: {config_data}")

    # Validate the global (top-level) section.
    missing_keys = [key for key in CONFIG_REQUIRED_KEYS if key not in config_data]
    if missing_keys:
        raise ValueError(f"Missing required keys in configuration: {missing_keys}")

    global_config = GlobalConfig(
        tmpdir=config_data["tmpdir"],
        inbox_prefix=config_data["inbox_prefix"],
        archive_prefix=config_data["archive_prefix"],
        workflow_name=config_data["workflow_name"],
        source_filename=source_filename,
        validation_schema_path=config_data["validation_schema_path"],
        bucket=BUCKET,
        bucket_namespace=BUCKET_NAMESPACE,
        file_type=config_data["file_type"],
        # Optional application-level override; falls back to the module default.
        encoding_type=config_data.get("encoding_type", ENCODING_TYPE),
    )

    # Validate and materialize each task entry.
    tasks = []
    for entry in config_data["tasks"]:
        missing_task_keys = [key for key in TASK_REQUIRED_KEYS if key not in entry]
        if missing_task_keys:
            raise ValueError(
                f"Missing required keys in task configuration: {missing_task_keys}"
            )
        tasks.append(
            TaskConfig(
                task_name=entry["task_name"],
                ods_prefix=entry["ods_prefix"],
                output_table=entry["output_table"],
                namespaces=entry.get("namespaces", {}),  # optional for CSV tasks
                output_columns=entry["output_columns"],
            )
        )

    return global_config, tasks
|
||||
|
||||
|
||||
def initialize_workflow(global_config):
    """Register a new workflow run and return its context.

    Generates a fresh run UUID, records the run via manage_runs.init_workflow,
    and returns a dict with 'run_id' and 'a_workflow_history_key'.
    """
    run_id = str(uuid.uuid4())

    logging.info(f"Initializing workflow '{global_config.workflow_name}'")
    history_key = manage_runs.init_workflow(
        WORKFLOW_TYPE, global_config.workflow_name, run_id
    )

    return {"run_id": run_id, "a_workflow_history_key": history_key}
|
||||
|
||||
|
||||
def download_source_file(client, global_config):
    """Download the configured source file from the inbox into the local tmp dir."""
    bucket = global_config.bucket
    prefix = global_config.inbox_prefix
    filename = global_config.source_filename

    logging.info(
        f"Downloading source file '{filename}' "
        f"from '{bucket}/{prefix}'"
    )
    objectstore.download_file(
        client,
        global_config.bucket_namespace,
        bucket,
        prefix,
        filename,
        global_config.source_filepath,
    )
    logging.info(f"Source file downloaded to '{global_config.source_filepath}'")
|
||||
|
||||
|
||||
def delete_source_file(client, global_config):
    """Delete the originally-downloaded source file from the inbox prefix."""
    # Always target the file as originally downloaded (e.g. the .zip),
    # not any extracted member name.
    location = (
        f"{global_config.bucket}/{global_config.inbox_prefix}/"
        f"{global_config.original_source_filename}"
    )

    logging.info(f"Deleting source file '{location}'")
    objectstore.delete_file(
        client,
        global_config.original_source_filename,
        global_config.bucket_namespace,
        global_config.bucket,
        global_config.inbox_prefix,
    )
    logging.info(f"Deleted source file '{location}'")
|
||||
|
||||
|
||||
def archive_source_file(client, global_config):
    """Copy the original source file to the archive prefix in the object store."""
    cfg = global_config
    destination = f"{cfg.bucket}/{cfg.archive_prefix}/{cfg.original_source_filename}"
    logging.info(f"Archiving source file to '{destination}'")
    objectstore.upload_file(
        client,
        cfg.original_source_filepath,
        cfg.bucket_namespace,
        cfg.bucket,
        cfg.archive_prefix,
        cfg.original_source_filename,
    )
    logging.info(f"Source file archived to '{destination}'")
def unzip_source_file_if_needed(global_config):
    """If the downloaded source file is a ZIP archive, extract its single member in place.

    Returns True when the file is not a ZIP or was extracted successfully,
    False when the archive does not contain exactly one file or extraction fails.
    On success, ``global_config.source_filename`` is updated to the extracted
    member's name.
    """
    source_filepath = global_config.source_filepath

    # If it's not a zip, nothing to do
    if not zipfile.is_zipfile(source_filepath):
        logging.info(f"File '{source_filepath}' is not a ZIP file.")
        return True

    logging.info(f"File '{source_filepath}' is a ZIP file. Unzipping...")

    # Extract next to the archive, inside the workflow's tmp directory.
    extract_dir = os.path.dirname(source_filepath)

    try:
        with zipfile.ZipFile(source_filepath, "r") as zip_ref:
            extracted_files = zip_ref.namelist()

            # Exactly one member is expected; anything else is treated as a failure.
            if len(extracted_files) != 1:
                logging.error(
                    f"Expected one file in the ZIP, but found {len(extracted_files)} files."
                )
                return False

            # Extract everything
            # NOTE(review): extractall() follows member paths as-is; a crafted
            # archive with '../' members could write outside extract_dir
            # (zip-slip). Confirm whether sources are trusted.
            zip_ref.extractall(extract_dir)

    except Exception as e:
        logging.error(f"Error while extracting '{source_filepath}': {e}")
        return False

    # Update the global_config to point to the extracted file
    # NOTE(review): only source_filename is reassigned here — the log below
    # claims source_filepath was updated, which presumably means source_filepath
    # is derived from source_filename on the config object. Verify.
    extracted_filename = extracted_files[0]
    global_config.source_filename = extracted_filename

    logging.info(
        f"Extracted '{extracted_filename}' to '{extract_dir}'. "
        f"Updated source_filepath to '{global_config.source_filepath}'."
    )

    return True
def validate_source_file(global_config):
    """Validate the downloaded source file according to its configured type.

    Returns True when validation passes (CSV validation is not implemented yet
    and always passes). Raises ValueError on a failed XML validation or an
    unsupported file type.
    """
    kind = global_config.file_type.lower()

    if kind == "csv":
        # TODO: add CSV validation here
        return True

    if kind == "xml":
        is_valid, message = xml_utils.validate_xml(
            global_config.source_filepath, global_config.validation_schema_path
        )
        if not is_valid:
            raise ValueError(f"XML validation failed: {message}")
        logging.info(message)
        return True

    raise ValueError(f"Unsupported file type: {kind}")
def process_tasks(tasks, global_config, workflow_context, client):
    """Run every configured task through the processor matching the file type."""
    # get appropriate task processor
    processor_class = get_file_processor(global_config)

    for task in tasks:
        logging.info(f"Starting task '{task.task_name}'")
        processor = processor_class(global_config, task, client, workflow_context)
        processor.process()
def finalize_workflow(workflow_context, success=True):
    """Mark the workflow run as finished with a success or failure status."""
    history_key = workflow_context["a_workflow_history_key"]
    if success:
        manage_runs.finalise_workflow(history_key, STATUS_SUCCESS)
        logging.info("Workflow completed successfully")
    else:
        manage_runs.finalise_workflow(history_key, STATUS_FAILURE)
        logging.error("Workflow failed")
def main(
    workflow_context: dict,
    source_filename: str,
    config_file_path: str,
    generate_workflow_context=False,
    keep_source_file=False,
    keep_tmp_dir=False,
):
    """Orchestrate one end-to-end run: config, download, unzip, validate, process.

    Args:
        workflow_context: pre-existing run context; ignored (and regenerated)
            when generate_workflow_context is True.
        source_filename: name of the file to pull from the inbox prefix.
        config_file_path: path to the YAML task configuration.
        generate_workflow_context: create and finalize a workflow run here.
        keep_source_file: skip archiving/deleting the remote source file.
        keep_tmp_dir: keep the temporary working directory for debugging.

    Raises:
        RuntimeError: wrapping any failure, after finalizing the workflow
        with a failure status when this function owns the workflow context.
    """
    logging.info(f"Initializing mrds app, version {__version__}")

    tmpdir_manager = None

    try:
        # get configs
        global_config, tasks = initialize_config(source_filename, config_file_path)

        # Handle temporary dirs
        if keep_tmp_dir:
            tmpdir = tempfile.mkdtemp(
                prefix="mrds_", dir=global_config.tmpdir
            )  # dir is created and never deleted
            logging.info(
                f"Created temporary working directory (not auto-deleted): {tmpdir}"
            )
        else:
            tmpdir_manager = tempfile.TemporaryDirectory(
                prefix="mrds_", dir=global_config.tmpdir
            )
            tmpdir = tmpdir_manager.name
            logging.info(
                f"Created temporary working directory (auto-deleted): {tmpdir}"
            )

        # override tmpdir with newly created tmpdir
        global_config.tmpdir = tmpdir

        client = objectstore.get_client()

        # Handle workflow_context generation if required
        if generate_workflow_context:
            logging.info("Generating workflow context automatically.")
            workflow_context = initialize_workflow(global_config)
            logging.info(f"Generated workflow context: {workflow_context}")
        else:
            logging.info(f"Using provided workflow context: {workflow_context}")

        download_source_file(client, global_config)
        # Fix: the unzip helper reports failure via its return value, which was
        # previously ignored — a broken/multi-member ZIP would fall through to
        # validation. Fail fast instead.
        if not unzip_source_file_if_needed(global_config):
            raise RuntimeError("Failed to extract source ZIP file")
        validate_source_file(global_config)
        process_tasks(tasks, global_config, workflow_context, client)

        if generate_workflow_context:
            finalize_workflow(workflow_context)

        if not keep_source_file:
            archive_source_file(client, global_config)
            delete_source_file(client, global_config)

    except Exception as e:
        logging.error(f"Critical error: {e}")

        # Finalize workflow with failure if needed (only when we created it;
        # the locals() guard covers failures before initialize_workflow ran)
        if generate_workflow_context and "workflow_context" in locals():
            finalize_workflow(workflow_context, success=False)

        # Chain the original exception so the full traceback is preserved.
        raise RuntimeError(f"Workflow failed due to: {e}") from e

    finally:
        # Always attempt to remove tmpdir if created a TemporaryDirectory manager
        if tmpdir_manager and not keep_tmp_dir:
            try:
                tmpdir_manager.cleanup()
                logging.info(f"Deleted temporary working directory {tmpdir}")
            except Exception:
                # Typo fixed: "delete up" -> "delete"
                logging.exception(
                    f"Failed to delete temporary working directory {tmpdir}"
                )
186
python/mrds_common/mrds/docs/rqsd_sample.yaml
Normal file
186
python/mrds_common/mrds/docs/rqsd_sample.yaml
Normal file
@@ -0,0 +1,186 @@
|
||||
# static configs
|
||||
tmpdir: /tmp
|
||||
inbox_prefix: INBOX/RQSD/RQSD_PROCESS
|
||||
workflow_name: w_ODS_RQSD_PROCESS_DEVO
|
||||
validation_schema_path: None  # NOTE(review): YAML parses this as the string "None", not null — confirm intended
|
||||
file_type: csv
|
||||
|
||||
# task configs
|
||||
tasks:
|
||||
- task_name: m_ODS_RQSD_OBSERVATIONS_PARSE
|
||||
ods_prefix: INBOX/RQSD/RQSD_PROCESS/RQSD_OBSERVATIONS
|
||||
output_table: RQSD_OBSERVATIONS
|
||||
output_columns:
|
||||
- type: 'workflow_key'
|
||||
column_header: 'A_WORKFLOW_HISTORY_KEY'
|
||||
- type: 'csv_header'
|
||||
value: 'datacollectioncode'
|
||||
column_header: 'datacollectioncode'
|
||||
- type: 'csv_header'
|
||||
value: 'datacollectionname'
|
||||
column_header: 'datacollectionname'
|
||||
- type: 'csv_header'
|
||||
value: 'datacollectionowner'
|
||||
column_header: 'datacollectionowner'
|
||||
- type: 'csv_header'
|
||||
value: 'reportingcyclename'
|
||||
column_header: 'reportingcyclename'
|
||||
- type: 'csv_header'
|
||||
value: 'reportingcyclestatus'
|
||||
column_header: 'reportingcyclestatus'
|
||||
- type: 'csv_header'
|
||||
value: 'modulecode'
|
||||
column_header: 'modulecode'
|
||||
- type: 'csv_header'
|
||||
value: 'modulename'
|
||||
column_header: 'modulename'
|
||||
- type: 'csv_header'
|
||||
value: 'moduleversionnumber'
|
||||
column_header: 'moduleversionnumber'
|
||||
- type: 'csv_header'
|
||||
value: 'reportingentitycollectionuniqueid'
|
||||
column_header: 'reportingentitycollectionuniqueid'
|
||||
- type: 'csv_header'
|
||||
value: 'entityattributereportingcode'
|
||||
column_header: 'entityattributereportingcode'
|
||||
- type: 'csv_header'
|
||||
value: 'reportingentityname'
|
||||
column_header: 'reportingentityname'
|
||||
- type: 'csv_header'
|
||||
value: 'reportingentityentitytype'
|
||||
column_header: 'reportingentityentitytype'
|
||||
- type: 'csv_header'
|
||||
value: 'entityattributecountry'
|
||||
column_header: 'entityattributecountry'
|
||||
- type: 'csv_header'
|
||||
value: 'entitygroupentityname'
|
||||
column_header: 'entitygroupentityname'
|
||||
- type: 'csv_header'
|
||||
value: 'obligationmodulereferencedate'
|
||||
column_header: 'obligationmodulereferencedate'
|
||||
- type: 'csv_header'
|
||||
value: 'obligationmoduleremittancedate'
|
||||
column_header: 'obligationmoduleremittancedate'
|
||||
- type: 'csv_header'
|
||||
value: 'receivedfilereceiveddate'
|
||||
column_header: 'receivedfilereceiveddate'
|
||||
- type: 'csv_header'
|
||||
value: 'obligationmoduleexpected'
|
||||
column_header: 'obligationmoduleexpected'
|
||||
- type: 'csv_header'
|
||||
value: 'receivedfileversionnumber'
|
||||
column_header: 'receivedfileversionnumber'
|
||||
- type: 'csv_header'
|
||||
value: 'revalidationversionnumber'
|
||||
column_header: 'revalidationversionnumber'
|
||||
- type: 'csv_header'
|
||||
value: 'revalidationdate'
|
||||
column_header: 'revalidationdate'
|
||||
- type: 'csv_header'
|
||||
value: 'receivedfilesystemfilename'
|
||||
column_header: 'receivedfilesystemfilename'
|
||||
- type: 'csv_header'
|
||||
value: 'obligationstatusstatus'
|
||||
column_header: 'obligationstatusstatus'
|
||||
- type: 'csv_header'
|
||||
value: 'filestatussetsubmissionstatus'
|
||||
column_header: 'filestatussetsubmissionstatus'
|
||||
- type: 'csv_header'
|
||||
value: 'filestatussetvalidationstatus'
|
||||
column_header: 'filestatussetvalidationstatus'
|
||||
- type: 'csv_header'
|
||||
value: 'filestatussetexternalvalidationstatus'
|
||||
column_header: 'filestatussetexternalvalidationstatus'
|
||||
- type: 'csv_header'
|
||||
value: 'numberoferrors'
|
||||
column_header: 'numberoferrors'
|
||||
- type: 'csv_header'
|
||||
value: 'numberofwarnings'
|
||||
column_header: 'numberofwarnings'
|
||||
- type: 'csv_header'
|
||||
value: 'delayindays'
|
||||
column_header: 'delayindays'
|
||||
- type: 'csv_header'
|
||||
value: 'failedattempts'
|
||||
column_header: 'failedattempts'
|
||||
- type: 'csv_header'
|
||||
value: 'observationvalue'
|
||||
column_header: 'observationvalue'
|
||||
- type: 'csv_header'
|
||||
value: 'observationtextvalue'
|
||||
column_header: 'observationtextvalue'
|
||||
- type: 'csv_header'
|
||||
value: 'observationdatevalue'
|
||||
column_header: 'observationdatevalue'
|
||||
- type: 'csv_header'
|
||||
value: 'datapointsetdatapointidentifier'
|
||||
column_header: 'datapointsetdatapointidentifier'
|
||||
- type: 'csv_header'
|
||||
value: 'datapointsetlabel'
|
||||
column_header: 'datapointsetlabel'
|
||||
- type: 'csv_header'
|
||||
value: 'obsrvdescdatatype'
|
||||
column_header: 'obsrvdescdatatype'
|
||||
- type: 'csv_header'
|
||||
value: 'ordinatecode'
|
||||
column_header: 'ordinatecode'
|
||||
- type: 'csv_header'
|
||||
value: 'ordinateposition'
|
||||
column_header: 'ordinateposition'
|
||||
- type: 'csv_header'
|
||||
value: 'tablename'
|
||||
column_header: 'tablename'
|
||||
- type: 'csv_header'
|
||||
value: 'isstock'
|
||||
column_header: 'isstock'
|
||||
- type: 'csv_header'
|
||||
value: 'scale'
|
||||
column_header: 'scale'
|
||||
- type: 'csv_header'
|
||||
value: 'currency'
|
||||
column_header: 'currency'
|
||||
- type: 'csv_header'
|
||||
value: 'numbertype'
|
||||
column_header: 'numbertype'
|
||||
- type: 'csv_header'
|
||||
value: 'ismandatory'
|
||||
column_header: 'ismandatory'
|
||||
- type: 'csv_header'
|
||||
value: 'decimalplaces'
|
||||
column_header: 'decimalplaces'
|
||||
- type: 'csv_header'
|
||||
value: 'serieskey'
|
||||
column_header: 'serieskey'
|
||||
- type: 'csv_header'
|
||||
value: 'tec_source_system'
|
||||
column_header: 'tec_source_system'
|
||||
- type: 'csv_header'
|
||||
value: 'tec_dataset'
|
||||
column_header: 'tec_dataset'
|
||||
- type: 'csv_header'
|
||||
value: 'tec_surrogate_key'
|
||||
column_header: 'tec_surrogate_key'
|
||||
- type: 'csv_header'
|
||||
value: 'tec_crc'
|
||||
column_header: 'tec_crc'
|
||||
- type: 'csv_header'
|
||||
value: 'tec_ingestion_date'
|
||||
column_header: 'tec_ingestion_date'
|
||||
- type: 'csv_header'
|
||||
value: 'tec_version_id'
|
||||
column_header: 'tec_version_id'
|
||||
- type: 'csv_header'
|
||||
value: 'tec_execution_date'
|
||||
column_header: 'tec_execution_date'
|
||||
- type: 'csv_header'
|
||||
value: 'tec_run_id'
|
||||
column_header: 'tec_run_id'
|
||||
- type: 'static'
|
||||
value: 'test test'  # NOTE(review): placeholder value — confirm before production use
|
||||
column_header: 'BLABLA'
|
||||
- type: 'a_key'
|
||||
column_header: 'A_KEY'
|
||||
- type: 'csv_header'
|
||||
value: 'tec_business_date'
|
||||
column_header: 'tec_business_dateTest!'  # NOTE(review): 'Test!' suffix looks like a leftover — verify header name
|
||||
|
||||
50
python/mrds_common/mrds/docs/upload.py
Normal file
50
python/mrds_common/mrds/docs/upload.py
Normal file
@@ -0,0 +1,50 @@
|
||||
# file uploader
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
from mrds.utils import objectstore
|
||||
|
||||
BUCKET = os.getenv("INBOX_BUCKET", "mrds_inbox_poc")
|
||||
BUCKET_NAMESPACE = os.getenv("BUCKET_NAMESPACE", "frcnomajoc7v")
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s %(name)s - %(message)s",
|
||||
handlers=[
|
||||
logging.StreamHandler(sys.stdout),
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
source_filepath = '/home/dbt/tmp/mrds_4twsw_ib/20250630_Pre-Production_DV_P2_DBT_I4.zip'
|
||||
source_filename = '20250630_Pre-Production_DV_P2_DBT_I4.zip'
|
||||
target_prefix = 'INBOX/CSDB/STC_CentralizedSecuritiesDissemination_ECB'
|
||||
|
||||
def upload_file():
    """Upload the module-configured source file to the inbox prefix."""
    client = objectstore.get_client()

    destination = f"{BUCKET}/{target_prefix}/{source_filename}"
    logging.info(f"uploading source file to '{destination}'")
    objectstore.upload_file(
        client,
        source_filepath,
        BUCKET_NAMESPACE,
        BUCKET,
        target_prefix,
        source_filename,
    )
    logging.info(f"Source file uploaded to '{destination}'")
if __name__ == "__main__":
    # Exit 0 on success, 1 on any failure (SystemExit is not caught below).
    try:
        upload_file()
    except Exception as e:
        logging.error(f"Unexpected error: {e}")
        sys.exit(1)
    sys.exit(0)
15
python/mrds_common/mrds/processors/__init__.py
Normal file
15
python/mrds_common/mrds/processors/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from .xml_processor import XMLTaskProcessor
|
||||
from .csv_processor import CSVTaskProcessor
|
||||
|
||||
|
||||
def get_file_processor(global_config):
    """Return the processor class matching the configured file type.

    The comparison is case-insensitive; any type other than 'xml' or 'csv'
    raises ValueError.
    """
    file_type = global_config.file_type.lower()
    if file_type == "csv":
        return CSVTaskProcessor
    if file_type == "xml":
        return XMLTaskProcessor
    raise ValueError(f"Unsupported file type: {file_type}")
211
python/mrds_common/mrds/processors/base.py
Normal file
211
python/mrds_common/mrds/processors/base.py
Normal file
@@ -0,0 +1,211 @@
|
||||
import logging
|
||||
import os
|
||||
import csv
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from mrds.utils.utils import parse_output_columns
|
||||
|
||||
from mrds.utils import (
|
||||
manage_files,
|
||||
manage_runs,
|
||||
objectstore,
|
||||
static_vars,
|
||||
)
|
||||
|
||||
|
||||
OUTPUT_FILENAME_TEMPLATE = "{output_table}-{task_history_key}.csv"
|
||||
STATUS_SUCCESS = static_vars.status_success # duplicated needs to be moved #TODO
|
||||
|
||||
|
||||
class TaskProcessor(ABC):
    """Template-method base class for processing one configured task.

    Subclasses implement ``_extract`` to turn the downloaded source file into
    a CSV at ``self.output_filepath``; ``process`` then enriches that CSV with
    static/key columns, uploads it, triggers remote database processing, and
    finalizes the task run.
    """

    def __init__(self, global_config, task_conf, client, workflow_context):
        """Store configuration/context, then run the common and subclass init hooks."""
        self.global_config = global_config
        self.task_conf = task_conf
        self.client = client
        self.workflow_context = workflow_context
        self._init_common()
        self._post_init()

    def _init_common(self):
        """Register the task run, derive output paths, and parse column config."""
        # Initialize task
        self.a_task_history_key = manage_runs.init_task(
            self.task_conf.task_name,
            self.workflow_context["run_id"],
            self.workflow_context["a_workflow_history_key"],
        )
        logging.info(f"Task initialized with history key: {self.a_task_history_key}")

        # Define output file paths
        self.output_filename = OUTPUT_FILENAME_TEMPLATE.format(
            output_table=self.task_conf.output_table,
            task_history_key=self.a_task_history_key,
        )
        self.output_filepath = os.path.join(
            self.global_config.tmpdir, self.output_filename
        )

        # Parse the output_columns into per-kind entry lists plus the overall
        # output column order.
        (
            self.xpath_entries,
            self.csv_entries,
            self.static_entries,
            self.a_key_entries,
            self.workflow_key_entries,
            self.xml_position_entries,
            self.column_order,
        ) = parse_output_columns(self.task_conf.output_columns)

    def _post_init(self):
        """Optional hook for classes to override"""
        pass

    @abstractmethod
    def _extract(self):
        """Non-optional hook for classes to override"""
        pass

    def _enrich(self):
        """
        Stream-based enrich: read one row at a time, append static/A-key/workflow-key,
        reorder columns, and write out immediately.
        """

        # NOTE: duplicated in csv_utils.TASK_HISTORY_MULTIPLIER — keep in sync.
        TASK_HISTORY_MULTIPLIER = 1_000_000_000

        logging.info(f"Enriching CSV file at '{self.output_filepath}'")

        # Write to a sibling .tmp file, then atomically replace the original.
        temp_output = self.output_filepath + ".tmp"
        encoding = self.global_config.encoding_type

        with open(self.output_filepath, newline="", encoding=encoding) as inf, open(
            temp_output, newline="", encoding=encoding, mode="w"
        ) as outf:

            reader = csv.reader(inf)
            writer = csv.writer(outf, quoting=csv.QUOTE_ALL)

            # Read the original header
            original_headers = next(reader)

            # Compute the full set of headers
            headers = list(original_headers)

            # Add static column headers if missing
            for col_name, _ in self.static_entries:
                if col_name not in headers:
                    headers.append(col_name)

            # Add A-key column headers if missing
            for col_name in self.a_key_entries:
                if col_name not in headers:
                    headers.append(col_name)

            # Add workflow key column headers if missing
            for col_name in self.workflow_key_entries:
                if col_name not in headers:
                    headers.append(col_name)

            # Rearrange headers to desired order; columns in column_order that
            # are absent from headers are silently dropped.
            header_to_index = {h: i for i, h in enumerate(headers)}
            out_indices = [
                header_to_index[h] for h in self.column_order if h in header_to_index
            ]
            out_headers = [headers[i] for i in out_indices]

            # Write the new header
            writer.writerow(out_headers)

            # Stream each row, enrich in-place, reorder, and write
            row_count = 0
            # A-key = task_history_key * 1e9 + 1-based row number.
            base_task_history = int(self.a_task_history_key) * TASK_HISTORY_MULTIPLIER

            for i, in_row in enumerate(reader, start=1):
                # Build a working list that matches `headers` order.
                # Start by copying the existing columns (or '' if missing)
                work_row = [None] * len(headers)
                for j, h in enumerate(original_headers):
                    idx = header_to_index[h]
                    work_row[idx] = in_row[j]

                # Fill static columns
                for col_name, value in self.static_entries:
                    idx = header_to_index[col_name]
                    work_row[idx] = value

                # Fill A-key columns
                for col_name in self.a_key_entries:
                    idx = header_to_index[col_name]
                    a_key_value = base_task_history + i
                    work_row[idx] = str(a_key_value)

                # Fill workflow key columns
                wf_val = self.workflow_context["a_workflow_history_key"]
                for col_name in self.workflow_key_entries:
                    idx = header_to_index[col_name]
                    work_row[idx] = wf_val

                # Reorder to output order and write
                out_row = [work_row[j] for j in out_indices]
                writer.writerow(out_row)
                row_count += 1

        # Atomically replace
        os.replace(temp_output, self.output_filepath)
        logging.info(
            f"CSV file enriched at '{self.output_filepath}', {row_count} rows generated"
        )

    def _upload(self):
        """Upload the enriched CSV to the task's ODS prefix in the object store."""
        # Upload CSV to object store
        logging.info(
            f"Uploading CSV file to '{self.global_config.bucket}/{self.task_conf.ods_prefix}/{self.output_filename}'"
        )
        objectstore.upload_file(
            self.client,
            self.output_filepath,
            self.global_config.bucket_namespace,
            self.global_config.bucket,
            self.task_conf.ods_prefix,
            self.output_filename,
        )
        logging.info(
            f"CSV file uploaded to '{self.global_config.bucket}/{self.task_conf.ods_prefix}/{self.output_filename}'"
        )

    def _process_remote(self):
        """Trigger database-side processing; delete the uploaded file on failure."""
        # Process the source file
        logging.info(f"Processing source file '{self.output_filename}' with CT_MRDS.FILE_MANAGER.PROCESS_SOURCE_FILE database function.")
        try:
            manage_files.process_source_file(
                self.task_conf.ods_prefix, self.output_filename
            )
        except Exception as e:
            # Remote processing failed: remove the uploaded CSV so the object
            # store is not left with an unprocessed file, then re-raise.
            logging.error(
                f"Processing source file '{self.output_filename}' failed. Cleaning up..."
            )
            objectstore.delete_file(
                self.client,
                self.output_filename,
                self.global_config.bucket_namespace,
                self.global_config.bucket,
                self.task_conf.ods_prefix,
            )
            logging.error(
                f"CSV file '{self.global_config.bucket}/{self.task_conf.ods_prefix}/{self.output_filename}' deleted."
            )
            raise
        else:
            logging.info(f"Source file '{self.output_filename}' processed")

    def _finalize(self):
        """Record a successful task completion."""
        # Finalize task
        manage_runs.finalise_task(self.a_task_history_key, STATUS_SUCCESS)
        logging.info(f"Task '{self.task_conf.task_name}' completed successfully")

    def process(self):
        """Run the full pipeline: extract, enrich, upload, remote-process, finalize."""
        # main processor function
        self._extract()
        self._enrich()
        self._upload()
        self._process_remote()
        self._finalize()
52
python/mrds_common/mrds/processors/csv_processor.py
Normal file
52
python/mrds_common/mrds/processors/csv_processor.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import logging
|
||||
import csv
|
||||
import os
|
||||
from .base import TaskProcessor
|
||||
|
||||
|
||||
class CSVTaskProcessor(TaskProcessor):
    """Task processor for CSV sources: keeps and renames the configured columns."""

    def _extract(self):
        """Stream the source CSV, keep/rename configured columns, write output atomically."""
        src = self.global_config.source_filepath
        dst = self.output_filepath
        enc = self.global_config.encoding_type

        logging.info(f"Reading source CSV file at '{src}'")

        # Write to a sibling .tmp file so the final output appears atomically.
        staging = dst + ".tmp"
        with open(src, newline="", encoding=enc) as src_handle, open(
            staging, newline="", encoding=enc, mode="w"
        ) as dst_handle:

            rows = csv.reader(src_handle)
            out = csv.writer(dst_handle, quoting=csv.QUOTE_ALL)

            # Read and parse the header
            source_headers = next(rows)

            # csv_entries is a list of (new_name, old_name) pairs from config.
            kept = []
            renamed = []
            for new_name, old_name in self.csv_entries:
                kept.append(old_name)
                renamed.append(new_name)

            # Fail fast if any configured column is absent from the input.
            missing = [h for h in kept if h not in source_headers]
            if missing:
                raise ValueError(
                    f"The following headers are not in the input CSV: {missing}"
                )

            positions = [source_headers.index(old) for old in kept]

            # Write the renamed header, then stream every data row through
            # the same column projection.
            out.writerow(renamed)
            for record in rows:
                out.writerow([record[pos] for pos in positions])

        # Atomically replace the old file
        os.replace(staging, dst)
        logging.info(f"Core data written to CSV file at '{dst}'")
30
python/mrds_common/mrds/processors/xml_processor.py
Normal file
30
python/mrds_common/mrds/processors/xml_processor.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import logging
|
||||
|
||||
from .base import TaskProcessor
|
||||
|
||||
from mrds.utils import (
|
||||
xml_utils,
|
||||
csv_utils,
|
||||
)
|
||||
|
||||
|
||||
class XMLTaskProcessor(TaskProcessor):
    """Task processor for XML sources: extracts configured values and writes CSV."""

    def _extract(self):
        """Pull configured XPath values from the source XML and persist them as CSV."""
        # Extract data from XML
        extracted = xml_utils.extract_data(
            self.global_config.source_filepath,
            self.xpath_entries,
            self.xml_position_entries,
            self.task_conf.namespaces,
            self.workflow_context,
            self.global_config.encoding_type,
        )
        logging.info(f"CSV data extracted for task '{self.task_conf.task_name}'")

        # Generate CSV
        logging.info(f"Writing core data to CSV file at '{self.output_filepath}'")
        csv_utils.write_data_to_csv_file(
            self.output_filepath, extracted, self.global_config.encoding_type
        )
        logging.info(f"Core data written to CSV file at '{self.output_filepath}'")
0
python/mrds_common/mrds/utils/__init__.py
Normal file
0
python/mrds_common/mrds/utils/__init__.py
Normal file
69
python/mrds_common/mrds/utils/csv_utils.py
Normal file
69
python/mrds_common/mrds/utils/csv_utils.py
Normal file
@@ -0,0 +1,69 @@
|
||||
import csv
|
||||
import os
|
||||
|
||||
TASK_HISTORY_MULTIPLIER = 1_000_000_000
|
||||
|
||||
|
||||
def read_csv_file(csv_filepath, encoding_type="utf-8"):
    """Read a whole CSV file and return (headers, data_rows).

    The first row is treated as the header; everything after it is data.
    """
    with open(csv_filepath, "r", newline="", encoding=encoding_type) as csvfile:
        all_rows = [row for row in csv.reader(csvfile)]
    return all_rows[0], all_rows[1:]
||||
def write_data_to_csv_file(csv_filepath, data, encoding_type="utf-8"):
    """Write data['headers'] and data['rows'] as a fully-quoted CSV, atomically.

    The content is staged in a sibling .tmp file and moved into place with
    os.replace so readers never observe a half-written file.
    """
    staging_path = csv_filepath + ".tmp"
    with open(staging_path, "w", newline="", encoding=encoding_type) as handle:
        out = csv.writer(handle, quoting=csv.QUOTE_ALL)
        out.writerow(data["headers"])
        out.writerows(data["rows"])
    os.replace(staging_path, csv_filepath)
||||
def add_static_columns(data_rows, headers, static_entries):
    """Set each static (header, value) pair on every row, in place.

    A column already present is overwritten; a new one is appended to both
    headers and every row.
    """
    for name, constant in static_entries:
        if name in headers:
            pos = headers.index(name)
            for row in data_rows:
                row[pos] = constant
        else:
            headers.append(name)
            for row in data_rows:
                row.append(constant)
||||
def add_a_key_columns(data_rows, headers, a_key_entries, task_history_key):
    """Populate surrogate-key columns on every row, in place.

    Each key is ``task_history_key * TASK_HISTORY_MULTIPLIER + row_number``
    (row numbers are 1-based), written as a string. A column already present
    is overwritten; a new one is appended to both headers and every row.
    """
    # Hoist the invariant base key out of the per-row loops: the original
    # recomputed int(task_history_key) * TASK_HISTORY_MULTIPLIER on every row.
    base_key = int(task_history_key) * TASK_HISTORY_MULTIPLIER
    for column_header in a_key_entries:
        if column_header not in headers:
            headers.append(column_header)
            for i, row in enumerate(data_rows, start=1):
                row.append(str(base_key + i))
        else:
            idx = headers.index(column_header)
            for i, row in enumerate(data_rows, start=1):
                row[idx] = str(base_key + i)
||||
def add_workflow_key_columns(data_rows, headers, workflow_key_entries, workflow_key):
    """Stamp the workflow history key into every listed column, in place.

    A column already present is overwritten; a new one is appended to both
    headers and every row.
    """
    for name in workflow_key_entries:
        if name in headers:
            pos = headers.index(name)
            for row in data_rows:
                row[pos] = workflow_key
        else:
            headers.append(name)
            for row in data_rows:
                row.append(workflow_key)
||||
def rearrange_columns(headers, data_rows, column_order):
    """Return (headers, rows) projected onto column_order.

    Names in column_order that are absent from headers are silently dropped.
    A dict lookup is used (last occurrence wins for duplicate headers, as in
    the original implementation).
    """
    position_of = {}
    for pos, name in enumerate(headers):
        position_of[name] = pos
    keep = [position_of[name] for name in column_order if name in position_of]
    new_headers = [headers[pos] for pos in keep]
    new_rows = [[row[pos] for pos in keep] for row in data_rows]
    return new_headers, new_rows
177
python/mrds_common/mrds/utils/manage_files.py
Normal file
177
python/mrds_common/mrds/utils/manage_files.py
Normal file
@@ -0,0 +1,177 @@
|
||||
from . import oraconn
|
||||
from . import sql_statements
|
||||
from . import utils
|
||||
|
||||
# Get the next load id from the sequence
|
||||
|
||||
#
|
||||
# Workflows
|
||||
#
|
||||
|
||||
|
||||
def process_source_file_from_event(resource_id: str):
    """Handle an object-storage event URI and delegate to process_source_file.

    Expects the object URI in the form ``/n/<namespace>/b/<bucket>/o/<object>``
    (e.g. ``/n/frcnomajoc7v/b/dmarsdb1/o/sqlnet.log``) and calls
    process_source_file with the prefix and file name extracted from that URI.
    """
    _, _, prefix, file_name = utils.parse_uri_with_regex(resource_id)
    process_source_file(prefix, file_name)
||||
def process_source_file(prefix: str, filename: str):
    """Invoke CT_MRDS.FILE_MANAGER.PROCESS_SOURCE_FILE for prefix/filename."""
    # rstrip to cater for cases where the prefix is passed with a trailing slash.
    # Fix: the filename parameter was not interpolated into the object path.
    sourcefile = f"{prefix.rstrip('/')}/{filename}"
    # Fix: connect BEFORE the try block — if connect() raised inside the try,
    # the finally clause hit a NameError on conn.close(), masking the real error.
    conn = oraconn.connect("MRDS_LOADER")
    try:
        oraconn.run_proc(conn, "CT_MRDS.FILE_MANAGER.PROCESS_SOURCE_FILE", [sourcefile])
        conn.commit()
    finally:
        conn.close()
||||
|
||||
def execute_query(query, query_parameters=None, account_alias="MRDS_LOADER"):
    """Run a query and return the first column of every fetched row.

    Args:
        query: SQL text to execute.
        query_parameters: optional bind parameters.
        account_alias: oraconn account to connect as.
    """
    # Fix: connect BEFORE the try block — a failed connect previously caused
    # a NameError on conn.close() in finally, masking the real error.
    conn = oraconn.connect(account_alias)
    try:
        curs = conn.cursor()
        # Idiom fix: 'is not None' instead of '!= None'.
        if query_parameters is not None:
            curs.execute(query, query_parameters)
        else:
            curs.execute(query)
        query_result = curs.fetchall()
        conn.commit()
    finally:
        conn.close()
    return [t[0] for t in query_result]
||||
|
||||
def get_file_prefix(source_key, source_file_id, table_id):
    """Look up the object-store prefix configured for a source file/table."""
    # Fix: connect BEFORE the try block to avoid NameError on conn.close()
    # in finally when the connect itself fails.
    conn = oraconn.connect("MRDS_LOADER")
    try:
        curs = conn.cursor()
        curs.execute(
            sql_statements.get_sql("get_file_prefix"),
            [source_key, source_file_id, table_id],
        )
        # NOTE(review): fetchone() may return None when no row matches,
        # making the subscript below raise TypeError — confirm callers
        # guarantee a match.
        query_result = curs.fetchone()
        conn.commit()
    finally:
        conn.close()
    return query_result[0]
||||
def get_inbox_bucket():
    """Return the inbox bucket name via CT_MRDS.FILE_MANAGER.GET_INBOX_BUCKET."""
    # Fix: connect BEFORE the try block to avoid NameError in finally on
    # a failed connect.
    conn = oraconn.connect("MRDS_LOADER")
    try:
        ret = oraconn.run_func(conn, "CT_MRDS.FILE_MANAGER.GET_INBOX_BUCKET", str, [])
        conn.commit()
    finally:
        conn.close()
    return ret
||||
def get_data_bucket():
    """Return the data bucket name via CT_MRDS.FILE_MANAGER.GET_DATA_BUCKET."""
    # Fix: connect BEFORE the try block to avoid NameError in finally on
    # a failed connect.
    conn = oraconn.connect("MRDS_LOADER")
    try:
        ret = oraconn.run_func(conn, "CT_MRDS.FILE_MANAGER.GET_DATA_BUCKET", str, [])
        conn.commit()
    finally:
        conn.close()
    return ret
||||
def add_source_file_config(
    source_key,
    source_file_type,
    source_file_id,
    source_file_desc,
    source_file_name_pattern,
    table_id,
    template_table_name,
):
    """Register a source-file configuration via CT_MRDS.FILE_MANAGER."""
    # Fix: connect BEFORE the try block to avoid NameError in finally on
    # a failed connect.
    conn = oraconn.connect("MRDS_LOADER")
    try:
        ret = oraconn.run_proc(
            conn,
            "CT_MRDS.FILE_MANAGER.ADD_SOURCE_FILE_CONFIG",
            [
                source_key,
                source_file_type,
                source_file_id,
                source_file_desc,
                source_file_name_pattern,
                table_id,
                template_table_name,
            ],
        )
        conn.commit()
    finally:
        conn.close()

    return ret
||||
def add_column_date_format(template_table_name, column_name, date_format):
    """Register a date format for a template table column via CT_MRDS.FILE_MANAGER."""
    # Fix: connect BEFORE the try block to avoid NameError in finally on
    # a failed connect.
    conn = oraconn.connect("MRDS_LOADER")
    try:
        # NOTE(review): mixed-case procedure name kept as-is — Oracle resolves
        # unquoted identifiers case-insensitively.
        ret = oraconn.run_proc(
            conn,
            "CT_MRDS.FILE_MANAGER.ADD_column_date_format",
            [template_table_name, column_name, date_format],
        )
        conn.commit()
    finally:
        conn.close()

    return ret
||||
def execute(stmt):
    """Execute *stmt* verbatim against MRDS_LOADER and commit.

    SECURITY NOTE(review): the statement is executed unsanitized — callers
    must never pass untrusted input here.

    Fix: connect before the try block so a failed connect cannot raise
    NameError in the finally clause.
    """
    conn = oraconn.connect("MRDS_LOADER")
    try:
        curs = conn.cursor()
        curs.execute(stmt)
        conn.commit()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def create_external_table(table_name, template_table_name, prefix):
    """Create an external table over objects under *prefix* in the ODS bucket
    via CT_MRDS.FILE_MANAGER.CREATE_EXTERNAL_TABLE.

    Note: connects as ODS_LOADER (unlike the other helpers in this module,
    which use MRDS_LOADER) — intentional, since the table lives in ODS.

    Fix: connect before the try block so the finally clause always has a
    bound ``conn`` to close.
    """
    conn = oraconn.connect("ODS_LOADER")
    try:
        ret = oraconn.run_proc(
            conn,
            "CT_MRDS.FILE_MANAGER.CREATE_EXTERNAL_TABLE",
            [table_name, template_table_name, prefix, get_bucket("ODS")],
        )
        conn.commit()
    finally:
        conn.close()

    return ret
|
||||
|
||||
|
||||
def get_bucket(bucket):
    """Return the URI of logical bucket *bucket* via
    CT_MRDS.FILE_MANAGER.GET_BUCKET_URI.

    Fix: connect before the try block so a failed connect cannot raise
    NameError on ``conn.close()`` in the finally clause.
    """
    conn = oraconn.connect("MRDS_LOADER")
    try:
        ret = oraconn.run_func(
            conn, "CT_MRDS.FILE_MANAGER.GET_BUCKET_URI", str, [bucket]
        )
        conn.commit()
    finally:
        conn.close()

    return ret
|
||||
97
python/mrds_common/mrds/utils/manage_runs.py
Normal file
97
python/mrds_common/mrds/utils/manage_runs.py
Normal file
@@ -0,0 +1,97 @@
|
||||
from . import oraconn
|
||||
from . import sql_statements
|
||||
from . import static_vars
|
||||
from . import manage_files
|
||||
|
||||
|
||||
def init_workflow(database_name: str, workflow_name: str, workflow_run_id: str):
    """Register a workflow run via CT_MRDS.WORKFLOW_MANAGER.INIT_WORKFLOW and
    return the newly allocated A_WORKFLOW_HISTORY key.

    Fix: connect before the try block so a failed connect cannot raise
    NameError on ``conn.close()`` in the finally clause.
    """
    conn = oraconn.connect("MRDS_LOADER")
    try:
        a_workflow_history_key = oraconn.run_func(
            conn,
            "CT_MRDS.WORKFLOW_MANAGER.INIT_WORKFLOW",
            int,
            [database_name, workflow_run_id, workflow_name],
        )
        conn.commit()
    finally:
        conn.close()

    return a_workflow_history_key
|
||||
|
||||
|
||||
def finalise_workflow(a_workflow_history_key: int, workflow_status: str):
    """Mark workflow run *a_workflow_history_key* finished with
    *workflow_status* via CT_MRDS.WORKFLOW_MANAGER.FINALISE_WORKFLOW.

    Fix: connect before the try block so the finally clause always has a
    bound ``conn`` to close.
    """
    conn = oraconn.connect("MRDS_LOADER")
    try:
        oraconn.run_proc(
            conn,
            "CT_MRDS.WORKFLOW_MANAGER.FINALISE_WORKFLOW",
            [a_workflow_history_key, workflow_status],
        )
        conn.commit()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def init_task(task_name: str, task_run_id: str, a_workflow_history_key: int):
    """Register a task run under workflow *a_workflow_history_key* via
    CT_MRDS.WORKFLOW_MANAGER.INIT_TASK and return the new A_TASK_HISTORY key.

    Fix: connect before the try block so a failed connect cannot raise
    NameError on ``conn.close()`` in the finally clause.
    """
    a_task_history_key: int

    conn = oraconn.connect("MRDS_LOADER")
    try:
        a_task_history_key = oraconn.run_func(
            conn,
            "CT_MRDS.WORKFLOW_MANAGER.INIT_TASK",
            int,
            [task_run_id, task_name, a_workflow_history_key],
        )
        conn.commit()
    finally:
        conn.close()

    return a_task_history_key
|
||||
|
||||
|
||||
def finalise_task(a_task_history_key: int, task_status: str):
    """Close task *a_task_history_key* with *task_status* using the
    ``finalise_task`` SQL from sql_statements (binds are applied positionally:
    status first, then the key).

    Fix: connect before the try block so the finally clause always has a
    bound ``conn`` to close.
    """
    conn = oraconn.connect("MRDS_LOADER")
    try:
        curs = conn.cursor()
        curs.execute(
            sql_statements.get_sql("finalise_task"), [task_status, a_task_history_key]
        )
        conn.commit()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def set_workflow_property(
    wf_history_key: int, service_name: str, property: str, value: str
):
    """Store a (service, property, value) triple for a workflow run via
    CT_MRDS.WORKFLOW_MANAGER.SET_WORKFLOW_PROPERTY.

    NOTE(review): the parameter name ``property`` shadows the builtin, but is
    kept so keyword callers keep working. ``ret`` is run_proc's return value
    (currently None) — confirm callers expect no payload.

    Fix: connect before the try block so a failed connect cannot raise
    NameError in the finally clause.
    """
    conn = oraconn.connect("MRDS_LOADER")
    try:
        ret = oraconn.run_proc(
            conn,
            "CT_MRDS.WORKFLOW_MANAGER.SET_WORKFLOW_PROPERTY",
            [wf_history_key, service_name, property, value],
        )
        conn.commit()
    finally:
        conn.close()

    return ret
|
||||
|
||||
|
||||
def select_ods_tab(table_name: str, value: str, condition="1 = 1"):
    # Run "select <value> from <table_name> where <condition>" as ODS_LOADER.
    #
    # SECURITY NOTE(review): the query is assembled by plain string
    # interpolation, so all three arguments are injectable. Table/column
    # names cannot be bind variables, but confirm that only trusted,
    # hard-coded values ever reach this function.

    query = "select %s from %s where %s" % (value, table_name, condition)
    print("query = |%s|" % query)
    return manage_files.execute_query(query=query, account_alias="ODS_LOADER")
|
||||
53
python/mrds_common/mrds/utils/objectstore.py
Normal file
53
python/mrds_common/mrds/utils/objectstore.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import oci
|
||||
|
||||
|
||||
def get_client():
    """Return an OCI ObjectStorageClient.

    Authentication first tries Resource Principal (OCI Container Instances)
    and falls back to Instance Principal (VMs) when none is available.

    Fixes: bare ``except:`` narrowed to ``except Exception`` (so SystemExit /
    KeyboardInterrupt still propagate) and the duplicated ``signer = signer =``
    assignment removed.
    """
    try:
        signer = oci.auth.signers.get_resource_principals_signer()
    except Exception:
        signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()

    # The empty dict is an empty config: everything comes from the signer.
    return oci.object_storage.ObjectStorageClient({}, signer=signer)
|
||||
|
||||
|
||||
def list_bucket(client, namespace, bucket, prefix):
    """Return the ListObjects payload for *prefix* within *bucket*."""
    # Response attributes:
    # https://docs.oracle.com/en-us/iaas/tools/python/2.135.0/api/request_and_response.html
    response = client.list_objects(namespace, bucket, prefix=prefix)
    return response.data
|
||||
|
||||
|
||||
def upload_file(client, source_filename, namespace, bucket, prefix, target_filename):
    """Upload a local file to '<prefix>/<target_filename>' in the bucket."""
    object_name = f"{prefix.rstrip('/')}/{target_filename}"
    with open(source_filename, "rb") as src:
        client.put_object(namespace, bucket, object_name, src)
|
||||
|
||||
|
||||
def clean_folder(client, namespace, bucket, prefix):
    """Delete every object under *prefix* in *bucket*.

    Fix: ``o.name`` already contains the full object path (the listing was
    made with this prefix), so the log line no longer prepends the prefix a
    second time — it now prints exactly the key that is deleted.
    """
    objects = client.list_objects(namespace, bucket, prefix=prefix)
    for o in objects.data.objects:
        print(f"Deleting {o.name}")
        client.delete_object(namespace, bucket, o.name)
|
||||
|
||||
|
||||
def delete_file(client, file, namespace, bucket, prefix):
    """Delete the single object located at '<prefix>/<file>'."""
    object_name = f"{prefix.rstrip('/')}/{file}"
    client.delete_object(namespace, bucket, object_name)
|
||||
|
||||
|
||||
def download_file(client, namespace, bucket, prefix, source_filename, target_filename):
    """Stream '<prefix>/<source_filename>' from the bucket into a local file,
    1 MiB at a time (avoids holding the whole object in memory)."""
    object_name = f"{prefix.rstrip('/')}/{source_filename}"
    response = client.get_object(namespace, bucket, object_name)

    chunk_size = 1024 * 1024
    with open(target_filename, "wb") as out:
        for chunk in response.data.raw.stream(chunk_size, decode_content=False):
            out.write(chunk)
|
||||
38
python/mrds_common/mrds/utils/oraconn.py
Normal file
38
python/mrds_common/mrds/utils/oraconn.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import oracledb
|
||||
import os
|
||||
import traceback
|
||||
import sys
|
||||
|
||||
|
||||
def connect(alias):
    """Open an oracledb connection using the <alias>_DB_USER / _DB_PASS /
    _DB_TNS environment variables.

    Exits the process with status 1 on any failure, preserving the module's
    fail-fast behaviour for batch jobs.

    Fix: unset environment variables now produce a clear message instead of
    an opaque TypeError on string concatenation.
    """
    username = os.getenv(alias + "_DB_USER")
    password = os.getenv(alias + "_DB_PASS")
    tnsalias = os.getenv(alias + "_DB_TNS")

    if not all((username, password, tnsalias)):
        print(
            f"Missing one of {alias}_DB_USER / {alias}_DB_PASS / {alias}_DB_TNS in the environment",
            file=sys.stderr,
        )
        sys.exit(1)

    connstr = username + "/" + password + "@" + tnsalias

    # NOTE(review): called on every connect — confirm this deployment's
    # python-oracledb version tolerates repeated init_oracle_client() calls.
    oracledb.init_oracle_client()

    try:
        conn = oracledb.connect(connstr)
        return conn
    except oracledb.DatabaseError as db_err:
        tb = traceback.format_exc()
        print(f"DatabaseError connecting to '{alias}': {db_err}\n{tb}", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        tb = traceback.format_exc()
        print(f"Unexpected error connecting to '{alias}': {exc}\n{tb}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
def run_proc(connection, proc: str, param: list):
    """Call stored procedure *proc* with *param* and return callproc's result
    (the possibly-modified parameter list).

    Fixes: ``param: []`` annotated the parameter with a list *instance*
    rather than the ``list`` type, and the ``callproc`` result was silently
    discarded even though callers assign this function's return value.
    """
    curs = connection.cursor()
    return curs.callproc(proc, param)
|
||||
|
||||
|
||||
def run_func(connection, proc: str, rettype, param: list):
    """Call stored function *proc* with *param*, returning the result coerced
    to *rettype*.

    Fix: ``param: []`` annotated the parameter with a list instance rather
    than the ``list`` type.
    """
    curs = connection.cursor()
    ret = curs.callfunc(proc, rettype, param)

    return ret
|
||||
46
python/mrds_common/mrds/utils/secrets.py
Normal file
46
python/mrds_common/mrds/utils/secrets.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import oci
|
||||
import ast
|
||||
import base64
|
||||
|
||||
# Specify the OCID of the secret to retrieve
|
||||
|
||||
|
||||
def get_secretcontents(ocid):
    """Fetch the OCI secret bundle for *ocid*.

    Tries Resource Principal first (container instances), then falls back to
    Instance Principal (VMs).

    Fixes: bare ``except:`` narrowed to ``except Exception`` and the
    duplicated ``signer = signer =`` assignment removed.
    """
    try:
        signer = oci.auth.signers.get_resource_principals_signer()
    except Exception:
        signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()

    # Create secret client and retrieve content
    secretclient = oci.secrets.SecretsClient({}, signer=signer)
    secretcontents = secretclient.get_secret_bundle(secret_id=ocid)
    return secretcontents
|
||||
|
||||
|
||||
def get_password(ocid):
    """Return the 'password' field of the dict stored (base64-encoded) in
    secret *ocid*."""
    bundle = get_secretcontents(ocid)

    # The secret body is a base64-encoded Python-literal dict, e.g. {'password': '...'}
    encoded = bundle.data.secret_bundle_content.content
    decoded = base64.b64decode(encoded.encode("ascii")).decode("ascii")
    credentials = ast.literal_eval(decoded)
    return credentials["password"]
|
||||
|
||||
|
||||
def get_secret(ocid):
    """Return the plain-text content of secret *ocid* (base64-decoded UTF-8)."""
    bundle = get_secretcontents(ocid)

    raw = base64.b64decode(bundle.data.secret_bundle_content.content)
    return raw.decode("UTF-8")
|
||||
106
python/mrds_common/mrds/utils/security_utils.py
Normal file
106
python/mrds_common/mrds/utils/security_utils.py
Normal file
@@ -0,0 +1,106 @@
|
||||
import re
|
||||
import logging
|
||||
|
||||
|
||||
def verify_run_id(run_id, context=None):
    """
    Verify run_id for security compliance.

    Args:
        run_id (str): The run_id to verify
        context (dict, optional): Airflow context for logging

    Returns:
        str: Verified (stripped) run_id

    Raises:
        ValueError: If run_id is invalid or suspicious

    Fix: re-raised errors are now chained (``raise ... from e``) so the
    original traceback is preserved.
    """
    try:
        # Basic checks
        if not run_id or not isinstance(run_id, str):
            raise ValueError(
                f"Invalid run_id: must be non-empty string, got: {type(run_id).__name__}"
            )

        run_id = run_id.strip()

        if len(run_id) < 1 or len(run_id) > 250:
            raise ValueError(
                f"Invalid run_id: length must be 1-250 chars, got: {len(run_id)}"
            )

        # Allow only safe characters (letters, digits, and _-:+.T for
        # ISO-8601-style Airflow run ids)
        if not re.match(r"^[a-zA-Z0-9_\-:+.T]+$", run_id):
            suspicious_chars = "".join(
                set(
                    char for char in run_id if not re.match(r"[a-zA-Z0-9_\-:+.T]", char)
                )
            )
            logging.warning(f"SECURITY: Invalid chars in run_id: '{suspicious_chars}'")
            raise ValueError("Invalid run_id: contains unsafe characters")

        # Check for attack patterns (path traversal, XSS, SQL, code exec,
        # control characters)
        dangerous_patterns = [
            r"\.\./",
            r"\.\.\\",
            r"<script",
            r"javascript:",
            r"union\s+select",
            r"drop\s+table",
            r"insert\s+into",
            r"delete\s+from",
            r"exec\s*\(",
            r"system\s*\(",
            r"eval\s*\(",
            r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]",
        ]

        for pattern in dangerous_patterns:
            if re.search(pattern, run_id, re.IGNORECASE):
                logging.error(f"SECURITY: Dangerous pattern in run_id: '{run_id}'")
                raise ValueError("Invalid run_id: contains dangerous pattern")

        # Log success
        if context:
            dag_id = (
                getattr(context.get("dag"), "dag_id", "unknown")
                if context.get("dag")
                else "unknown"
            )
            logging.info(f"run_id verified: '{run_id}' for DAG: '{dag_id}'")

        return run_id

    except Exception as e:
        logging.error(
            f"SECURITY: run_id verification failed: '{run_id}', Error: {str(e)}"
        )
        raise ValueError(f"run_id verification failed: {str(e)}") from e
|
||||
|
||||
|
||||
def get_verified_run_id(context):
    """
    Extract and verify run_id from Airflow context.

    Prefers the TaskInstance's run_id; falls back to the top-level
    'run_id' key.

    Args:
        context (dict): Airflow context

    Returns:
        str: Verified run_id
    """
    try:
        run_id = None
        if context:
            if "ti" in context:
                run_id = context["ti"].run_id
            elif "run_id" in context:
                run_id = context["run_id"]

        if not run_id:
            raise ValueError("Could not extract run_id from context")

        return verify_run_id(run_id, context)

    except Exception as e:
        logging.error(f"Failed to get verified run_id: {str(e)}")
        raise
|
||||
68
python/mrds_common/mrds/utils/sql_statements.py
Normal file
68
python/mrds_common/mrds/utils/sql_statements.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# Registry of named SQL statements, looked up via get_sql().
sql_statements = {}

#
# Workflows
#

# register_workflow: Register new DW load
# NOTE(review): column WORKFLOW_SSUCCESSFUL looks like a typo for
# WORKFLOW_SUCCESSFUL (finalise_workflow below updates WORKFLOW_SUCCESSFUL) —
# confirm the actual column name before running this INSERT.

sql_statements[
    "register_workflow"
] = """INSERT INTO CT_MRDS.A_WORKFLOW_HISTORY
(A_WORKFLOW_HISTORY_KEY, WORKFLOW_RUN_ID,
WORKFLOW_NAME, WORKFLOW_START, WORKFLOW_SSUCCESSFUL)
VALUES (:a_workflow_history_key, :workflow_run_id, :workflow_name, SYSTIMESTAMP, :running_status)
"""

# get_a_workflow_history_key: get new key from sequence
sql_statements["get_a_workflow_history_key"] = (
    "SELECT CT_MRDS.A_WORKFLOW_HISTORY_KEY_SEQ.NEXTVAL FROM DUAL"
)

# finalise: Update load record in A_LOAD_HISTORY after workflow completion
sql_statements[
    "finalise_workflow"
] = """UPDATE CT_MRDS.A_WORKFLOW_HISTORY
SET WORKFLOW_END = SYSTIMESTAMP, WORKFLOW_SUCCESSFUL = :workflow_status
WHERE A_WORKFLOW_HISTORY_KEY = :a_workflow_history_key
"""
#
# Tasks
#

# register_task
# NOTE(review): this INSERT lists 6 columns but supplies only 5 bind values,
# and the bind names appear copied from register_workflow — it would fail if
# executed. Confirm whether it is unused (init_task uses a stored function)
# or fix the bind list.

sql_statements[
    "register_task"
] = """INSERT INTO CT_MRDS.A_TASK_HISTORY (A_TASK_HISTORY_KEY,
A_WORKFLOW_HISTORY_KEY, TASK_RUN_ID,
TASK_NAME, TASK_START, TASK_SUCCESSFUL)
VALUES (:a_workflow_history_key, :workflow_run_id, :workflow_name, SYSTIMESTAMP, :running_status)
"""

# get_a_task_history_key: get new key from sequence
sql_statements["get_a_task_history_key"] = (
    "SELECT CT_MRDS.A_TASK_HISTORY_KEY_SEQ.NEXTVAL FROM DUAL"
)

# finalise: Update load record in A_LOAD_HISTORY after workflow completion
# (callers bind positionally: [task_status, a_task_history_key])
sql_statements[
    "finalise_task"
] = """UPDATE CT_MRDS.A_TASK_HISTORY
SET TASK_END = SYSTIMESTAMP, TASK_SUCCESSFUL = :workflow_status
WHERE A_TASK_HISTORY_KEY = :a_workflow_history_key
"""

#
# Files
#
sql_statements["get_file_prefix"] = (
    "SELECT CT_MRDS.FILE_MANAGER.GET_BUCKET_PATH(:source_key, :source_file_id, :table_id) FROM DUAL"
)
|
||||
|
||||
|
||||
def get_sql(stmt_id: str):
    """Return the SQL text registered under *stmt_id*, or None if unknown."""
    return sql_statements.get(stmt_id)
|
||||
6
python/mrds_common/mrds/utils/static_vars.py
Normal file
6
python/mrds_common/mrds/utils/static_vars.py
Normal file
@@ -0,0 +1,6 @@
|
||||
#
# Task management variables
#
# Status markers written to the *_SUCCESSFUL columns by the workflow/task
# managers.
# NOTE(review): success/failure are single-character flags ("Y"/"N") while
# the in-flight marker is the word "RUNNING" — confirm the target column is
# wide enough for both.
status_running: str = "RUNNING"
status_failed: str = "N"
status_success: str = "Y"
|
||||
83
python/mrds_common/mrds/utils/utils.py
Normal file
83
python/mrds_common/mrds/utils/utils.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import re
|
||||
|
||||
|
||||
def parse_uri_with_regex(uri):
    """
    Parse an Oracle Object Storage URI of the form
    '/n/{namespace}/b/{bucketname}/o/{object_path}'.

    Parameters:
        uri (str): The URI string to parse.

    Returns:
        tuple: (namespace, bucket_name, prefix, object_name). prefix is ""
        when the object sits at the bucket root, otherwise it ends with "/".

    Raises:
        ValueError: if the URI does not match the expected layout.
    """
    match = re.match(r"^/n/([^/]+)/b/([^/]+)/o/(.*)$", uri)
    if match is None:
        raise ValueError("Invalid URI format")

    namespace, bucket_name, object_path = match.groups()

    # rpartition splits at the LAST '/'; an empty separator means the whole
    # path is the object name and there is no prefix.
    prefix, sep, object_name = object_path.rpartition("/")
    if sep:
        prefix += "/"
    else:
        object_name = object_path

    return namespace, bucket_name, prefix, object_name
|
||||
|
||||
|
||||
def parse_output_columns(output_columns):
    """Partition output-column definitions by their 'type' field.

    Every entry contributes its 'column_header' to column_order (in input
    order); the remaining lists collect per-type payloads.

    Returns:
        tuple: (xpath_entries, csv_entries, static_entries, a_key_entries,
        workflow_key_entries, xml_position_entries, column_order)
    """
    xpath_entries = []
    csv_entries = []
    static_entries = []
    a_key_entries = []
    workflow_key_entries = []
    xml_position_entries = []
    column_order = []

    for spec in output_columns:
        header = spec["column_header"]
        column_order.append(header)
        kind = spec["type"]

        if kind == "xpath":
            xpath_entries.append((spec["value"], header, spec["is_key"]))
        elif kind == "csv_header":
            csv_entries.append((header, spec["value"]))
        elif kind == "static":
            static_entries.append((header, spec["value"]))
        elif kind == "a_key":
            a_key_entries.append(header)
        elif kind == "workflow_key":
            workflow_key_entries.append(header)
        elif kind == "xpath_element_id":
            # TODO - update all xml_position namings to xpath_element_id
            xml_position_entries.append((spec["value"], header))

    return (
        xpath_entries,
        csv_entries,
        static_entries,
        a_key_entries,
        workflow_key_entries,
        xml_position_entries,
        column_order,
    )
|
||||
23
python/mrds_common/mrds/utils/vault.py
Normal file
23
python/mrds_common/mrds/utils/vault.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import oci
|
||||
import ast
|
||||
import base64
|
||||
|
||||
# Specify the OCID of the secret to retrieve
|
||||
|
||||
|
||||
def get_password(ocid):
    """Return the 'password' field of the dict stored (base64-encoded) in
    secret *ocid*, authenticating via Instance Principal.

    NOTE(review): unlike secrets.get_password this has no Resource Principal
    fallback — consider delegating to mrds.utils.secrets to remove the
    duplication.

    Fixes: the stale comment claiming \\.oci\\config auth (the code uses an
    Instance Principal signer) and the duplicated ``signer = signer =``
    assignment.
    """
    signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner()

    # Get the secret
    secretclient = oci.secrets.SecretsClient({}, signer=signer)
    secretcontents = secretclient.get_secret_bundle(secret_id=ocid)

    # Decode the base64 Python-literal dict and return its password
    keybase64 = secretcontents.data.secret_bundle_content.content
    keybytes = base64.b64decode(keybase64.encode("ascii"))
    keydict = ast.literal_eval(keybytes.decode("ascii"))
    return keydict["password"]
|
||||
177
python/mrds_common/mrds/utils/xml_utils.py
Normal file
177
python/mrds_common/mrds/utils/xml_utils.py
Normal file
@@ -0,0 +1,177 @@
|
||||
import xmlschema
|
||||
import hashlib
|
||||
from lxml import etree
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
def validate_xml(xml_file, xsd_file):
    """Validate *xml_file* against *xsd_file*.

    Returns:
        tuple: (is_valid: bool, message: str) — never raises; all failures
        are reported through the message.
    """
    try:
        # Strict validation rejects anything not explicitly allowed by the XSD.
        schema = xmlschema.XMLSchema(xsd_file, validation="strict")
        schema.validate(xml_file)
    except xmlschema.validators.exceptions.XMLSchemaValidationError as e:
        return False, f"XML validation error: {str(e)}"
    except xmlschema.validators.exceptions.XMLSchemaException as e:
        return False, f"XML schema error: {str(e)}"
    except Exception as e:
        return False, f"An error occurred during XML validation: {str(e)}"
    return True, "XML file is valid against the provided XSD schema."
|
||||
|
||||
|
||||
def extract_data(
    filename,
    xpath_columns,  # List[(expr, header, is_key)]
    xml_position_columns,  # List[(expr, header)]
    namespaces,
    workflow_context,
    encoding_type="utf-8",
):
    """
    Parses an XML file using XPath expressions and extracts data.

    Parameters:
    - filename (str): The path to the XML file to parse.
    - xpath_columns (list): A list of tuples, each containing:
        - XPath expression (str)
        - CSV column header (str)
        - Indicator if the field is a key ('Y' or 'N')
    - xml_position_columns (list)
    - namespaces (dict): Namespace mapping needed for lxml's xpath()
    - workflow_context (dict): must contain 'a_workflow_history_key', used to
      salt the xml-position hashes.
    - encoding_type (str): NOTE(review) — currently unused; confirm whether
      it was meant to be passed to the parser.

    Returns:
    - dict: A dictionary containing headers and rows with extracted data.

    NOTE(review): rows are built by positional alignment — the i-th node of
    each non-key XPath is assumed to belong to the same record; confirm the
    configured XPaths always yield nodes in matching document order.
    """

    parser = etree.XMLParser(remove_blank_text=True)
    tree = etree.parse(filename, parser)
    root = tree.getroot()

    # Separate out key vs non‐key columns
    key_cols = [ (expr, h) for expr, h, k in xpath_columns if k == "Y" ]
    nonkey_cols = [ (expr, h) for expr, h, k in xpath_columns if k == "N" ]

    # Evaluate every non‐key XPath and keep the ELEMENT nodes
    nonkey_elements = {}
    for expr, header in nonkey_cols:
        elems = root.xpath(expr, namespaces=namespaces)
        nonkey_elements[header] = elems

    # figure out how many rows total we need
    # that's the maximum length of any of the nonkey lists
    if nonkey_elements:
        row_count = max(len(lst) for lst in nonkey_elements.values())
    else:
        row_count = 0

    # pad every nonkey list up to row_count with `None`
    # (shorter lists yield empty cells for the missing rows)
    for header, lst in nonkey_elements.items():
        if len(lst) < row_count:
            lst.extend([None] * (row_count - len(lst)))

    # key columns: evaluated once per document (first match only) and
    # repeated on every row
    key_values = []
    for expr, header in key_cols:
        nodes = root.xpath(expr, namespaces=namespaces)
        if not nodes:
            key_values.append("")
        else:
            first = nodes[0]
            txt = (first.text if isinstance(first, etree._Element) else str(first)) or ""
            key_values.append(txt.strip())

    # xml_position columns: candidate ancestor nodes per position column
    xml_positions = {}
    for expr, header in xml_position_columns:
        xml_positions[header] = root.xpath(expr, namespaces=namespaces)

    # prepare headers (order must match the row-building order below)
    headers = [h for _, h in nonkey_cols] + [h for _, h in key_cols] + [h for _, h in xml_position_columns]

    # build rows
    rows = []
    for i in range(row_count):
        row = []

        # non‐key data
        for expr, header in nonkey_cols:
            elem = nonkey_elements[header][i]
            text = ""
            if isinstance(elem, etree._Element):
                text = elem.text or ""
            elif elem is not None:
                # non-element XPath results (attributes, strings) are stringified
                text = str(elem)
            row.append(text.strip())

        # key columns
        row.extend(key_values)

        # xml_position columns: walk up from this row's FIRST non-key element
        # until an ancestor appears in the position column's candidate list,
        # then hash that ancestor's indexed path.
        # NOTE(review): only the first non-key column anchors the walk —
        # confirm that is the intended record element for all configs.
        for expr, header in xml_position_columns:
            if not nonkey_cols:
                row.append("")
                continue

            first_header = nonkey_cols[0][1]
            data_elem = nonkey_elements[first_header][i]
            if data_elem is None:
                row.append("")
                continue

            target_list = xml_positions[header]
            current = data_elem
            found = None
            while current is not None:
                if current in target_list:
                    found = current
                    break
                current = current.getparent()

            if not found:
                row.append("")
            else:
                # compute full‐path with indices (1-based position among
                # same-tag preceding siblings, XPath style)
                path_elems = []
                walk = found
                while walk is not None:
                    idx = 1 + sum(1 for s in walk.itersiblings(preceding=True) if s.tag == walk.tag)
                    path_elems.append(f"{walk.tag}[{idx}]")
                    walk = walk.getparent()
                full_path = "/" + "/".join(reversed(path_elems))
                # deterministic per-run id: path hashed with the workflow key as salt
                row.append(_xml_pos_hasher(full_path, workflow_context["a_workflow_history_key"]))

        rows.append(row)

    return {"headers": headers, "rows": rows}
|
||||
|
||||
|
||||
def _xml_pos_hasher(input_string, salt, hash_length=15):
|
||||
"""
|
||||
Helps hashing xml positions.
|
||||
|
||||
Parameters:
|
||||
input_string (str): The string to hash.
|
||||
salt (int): The integer salt to ensure deterministic, run-specific behavior.
|
||||
hash_length (int): The desired length of the resulting hash (default is 15 digits).
|
||||
|
||||
Returns:
|
||||
int: A deterministic integer hash of the specified length.
|
||||
"""
|
||||
# Ensure the hash length is valid
|
||||
if hash_length <= 0:
|
||||
raise ValueError("Hash length must be a positive integer.")
|
||||
|
||||
# Combine the input string with the salt to create a deterministic input
|
||||
salted_input = f"{salt}:{input_string}"
|
||||
|
||||
# Generate a SHA-256 hash of the salted input
|
||||
hash_object = hashlib.sha256(salted_input.encode())
|
||||
full_hash = hash_object.hexdigest()
|
||||
|
||||
# Convert the hash to an integer
|
||||
hash_integer = int(full_hash, 16)
|
||||
|
||||
# Truncate or pad the hash to the desired length
|
||||
truncated_hash = str(hash_integer)[:hash_length]
|
||||
|
||||
return int(truncated_hash)
|
||||
50
python/mrds_common/setup.py
Normal file
50
python/mrds_common/setup.py
Normal file
@@ -0,0 +1,50 @@
|
||||
import re
from pathlib import Path

from setuptools import setup, find_packages

# extract version from mrds/__init__.py (single source of truth)
here = Path(__file__).parent
init_py = here / "mrds" / "__init__.py"
_version = re.search(
    r'^__version__\s*=\s*["\']([^"\']+)["\']', init_py.read_text(), re.MULTILINE
).group(1)

setup(
    name="mrds",
    version=_version,
    packages=find_packages(),
    install_requires=[
        "click>=8.0.0,<9.0.0",
        "oci>=2.129.3,<3.0.0",
        "oracledb>=2.5.1,<3.0.0",
        "PyYAML>=6.0.0,<7.0.0",
        "lxml>=5.0.0,<5.3.0",
        "xmlschema>=3.4.0,<3.4.3",
        "cryptography>=3.3.1,<42.0.0",
        "PyJWT>=2.0.0,<3.0.0",
        "requests>=2.25.0,<3.0.0",
    ],
    extras_require={
        "dev": [
            "black==24.10.0",
            "tox==4.23.2",
            "pytest==8.3.4",
        ],
    },
    entry_points={
        "console_scripts": [
            "mrds-cli=mrds.cli:cli_main",
        ],
    },
    author="",
    author_email="",
    description="MRDS module for MarS ETL POC",
    # Fix: read README relative to this file (not the cwd), with an explicit
    # encoding, and without leaking an open file handle.
    long_description=(here / "README.md").read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    url="",
    classifiers=[
        "Programming Language :: Python :: 3",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.11",
)
|
||||
17
python/mrds_common/tox.ini
Normal file
17
python/mrds_common/tox.ini
Normal file
@@ -0,0 +1,17 @@
|
||||
# tox.ini
|
||||
|
||||
[tox]
# NOTE(review): envlist targets py310 but setup.py declares
# python_requires>=3.11 — align these versions.
envlist = py310, format
||||
|
||||
[testenv]
|
||||
deps =
|
||||
pytest
|
||||
commands =
|
||||
pytest
|
||||
|
||||
[testenv:format]
|
||||
basepython = python3
|
||||
deps =
|
||||
black
|
||||
commands =
|
||||
black --check --diff .
|
||||
Reference in New Issue
Block a user