from __future__ import annotations import sys sys.path.append('/opt/airflow/python/mrds_common') sys.path.append('/opt/airflow/python/devo_replicator/table_generator') import pandas as pd import mrds.utils.manage_files as fileManager import logging import tableBuilderQueries as tbq from devo_query import execute_query import ranger_updater as ranger import yaml import numpy as np from mrds.utils.secrets import get_secret import os import logging import yaml from datetime import timedelta from airflow import DAG from airflow.utils.dates import days_ago from airflow.utils.trigger_rule import TriggerRule from airflow.operators.python import PythonOperator import traceback try: from airflow.exceptions import AirflowFailException except Exception: from airflow.exceptions import AirflowException as AirflowFailException from mrds.utils import oraconn ENV_CONFIG_PATH = "/opt/airflow/python/devo_replicator/config/env_config.yaml" # Set up basic configuration for logging logging.basicConfig(level=logging.INFO) # Create a logger object logger = logging.getLogger(__name__) import re #0 utilities def initialize_config(config_file_path): # Ensure the file exists if not os.path.exists(config_file_path): raise FileNotFoundError(f"Configuration file {config_file_path} not found.") # Load the configuration with open(config_file_path, "r") as f: config_data = yaml.safe_load(f) return config_data def fix_impala_sql(sql: str) -> str: # List of reserved keywords in Impala that need backticks if used as column names impala_reserved_keywords = { 'date', 'value', 'source', 'comment', 'partition', 'row', 'select', 'insert', 'table', 'external', 'format', 'location', 'stored', 'inputformat', 'outputformat', 'scenario', 'string', 'int', 'decimal', 'timestamp', 'float', 'double','procedure', 'floor' } # Regex pattern to find column definitions pattern = re.compile( r'(?P`?\w+`?)\s+(?P[A-Za-z]+\s*(?:\([^)]+\))?)\s*(?Pcomment\s*\'[^\']*\'|)?', re.IGNORECASE ) def replace(match): col = 
match.group('col').strip('`') dtype = match.group('type') comment = match.group('comment') or '' # Add backticks only if column name is a reserved keyword or contains special chars if col.lower() in impala_reserved_keywords or not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', col): col = f'`{col}`' return f"{col} {dtype} {comment}".strip() # Only replace column list part between parentheses table_def_start = sql.find('(') table_def_end = sql.find('ROW FORMAT SERDE', table_def_start) if table_def_start == -1 or table_def_end == -1: raise ValueError("Invalid SQL format: Missing column definition parentheses.") before = sql[:table_def_start + 1] columns = sql[table_def_start + 1:table_def_end] after = sql[table_def_end:] # Replace all columns inside definition fixed_columns = pattern.sub(replace, columns) # Combine and return final= before + fixed_columns + after final=final.replace("\\'", "").replace('\\\\', '\\') return final def applyQueryParameters(query: str, parameters: str) -> str: """ Replaces placeholders in the query with values from parameters. Parameters: - query: Original query string with placeholders like $$$1, $$$2, etc. - parameters: Semicolon-separated string of parameter values. Returns: - String with the query filled with parameter values. 
""" filled_query = query if parameters: # Split the parameters string and reverse the list params_array = parameters.split(';')[::-1] index = len(params_array) for param in params_array: # Replace the placeholder $$$ with the parameter placeholder = f"$$${index}" filled_query = filled_query.replace(placeholder, param) index -= 1 # Decrement the index return filled_query def format_column_definition(row): if pd.isnull(row['data_description']): # If data_description is null, only include column_name and data_type_string return f"{row['column_name']} {row['data_type_string']}" else: # If data_description is present, include it with a comment # Ensure data_description does not contain single quotes data_description = str(row['data_description']).replace("'", "\\'") return f"{row['column_name']} {row['data_type_string']} comment '{data_description}'" #1 receive table name and check for target table and access type def execute_oracle_query(sql): oracle_conn = oraconn.connect('MRDS_LOADER') cursor = oracle_conn.cursor() options=cursor.execute(sql).fetchall() oracle_conn.commit() df = pd.DataFrame(options,columns= [row[0].lower() for row in cursor.description]) ## fetch db dtypes cursor.close() oracle_conn.close() return df def get_target_table(oracle_mgmt_table,source_schema,source_table): sql=f"SELECT DISTINCT TABLE_ALIAS FROM {oracle_mgmt_table} WHERE OWNER = '{source_schema}' AND TABLE_NAME = '{source_table}'" df=execute_oracle_query(sql) return df def get_type_ofAccess(oracle_metadata_table,table_owner,source_table,service_name): sql=f"SELECT DISTINCT RAR3_TYPE_OF_ACCESS FROM {oracle_metadata_table} WHERE A_VALID_TO > SYSDATE AND OWNER = '{table_owner}'AND TABLE_NAME = '{source_table}'" df=execute_oracle_query(sql) return df #2 load metadata def readIGAMRoles( config ): queryParams = "'" + config['sentry_role_environment'] + "'" igamRolesQuery = tbq.get_query_igam_roles(config['oracle_igam_table'],config['service_name']) logger.info(f"Querying the IGAM Table: 
{igamRolesQuery}") queryWithParamsIgamSentry = applyQueryParameters(igamRolesQuery, queryParams) logger.info(f"Replaced params to IGAM Table: {queryWithParamsIgamSentry}") igamRoleDF = execute_oracle_query(queryWithParamsIgamSentry) return igamRoleDF def loadMetadataTable( config ): metadataQuery = tbq.get_query_metadata(config['oracle_metadata_table'], config['source_schema'], config['source_table']) logger.info("Map Oracle metadata (data types) to Hive query: {metadataQuery}") jdbcMetaDataDF = execute_oracle_query( metadataQuery) logger.info("Fetch all fields for table and concatenate them separated by ','") tableDataList = jdbcMetaDataDF.apply(format_column_definition, axis=1).tolist() tableFields = ",".join(tableDataList) return tableFields #3 drop table and policies def deleteExternalTable(config,env_config): try: try: deleted=ranger.delete_policy(config,env_config) logger.info(f"deleted policies: {deleted}") except Exception as e: logger.error("Error in dropping table") logger.error("Exception: %s", e) logger.error("Traceback:\n%s", traceback.format_exc()) except RuntimeError as e: logger.error("Error in dropping table") logger.error("Exception: %s", e) logger.error("Traceback:\n%s", traceback.format_exc()) sql_drop = f"DROP TABLE IF EXISTS {config['corporate_store']}.{config['target_table']}" execute_query( sql_drop, env_config['DEVO_USERNAME'], env_config['IMPALA_HOSTNAME'], env_config['DEVO_SECRET'], ) logger.info(f"table {config['corporate_store']}.{config['target_table']} dropped") #4 create external table def createExternalTables(config, tableFields,env_config ): sql_create = ( f"CREATE EXTERNAL TABLE {config['corporate_store']}.{config['target_table']} " f"({tableFields}, {config['tech_meta_data_fields']}) " "ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' " "STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' " "OUTPUTFORMAT 
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' " f"LOCATION '{config['target_s3_bucket']}/{config['target_table']}' " "TBLPROPERTIES (" "'external.table.purge'='true', " "'parquet.compression'='snappy')" ) sql_create=fix_impala_sql(sql_create) execute_query(sql_create,env_config['DEVO_USERNAME'],env_config['HIVE_HOSTNAME'],env_config['DEVO_SECRET']) def createTableFromExternal( config, tableFields,env_config ): sql_create = ( f"CREATE EXTERNAL TABLE {config['corporate_store']}.{config['target_table']} AS " f"SELECT * FROM {config['corporate_store']}.{config['target_table']}_EXT" ) execute_query(sql_create,env_config['DEVO_USERNAME'],env_config['HIVE_HOSTNAME'],env_config['DEVO_SECRET']) #5 create table policies def accessTypeMapper(config, env_config, igamRoleDF): if (config['access_type'].lower() == '1'): return accessType_1(config, env_config, igamRoleDF) elif (config['access_type'].lower() == '2a'): return accessType_2A(config, env_config, igamRoleDF) elif (config['access_type'].lower() == '2b'): return accessType_2B(config, env_config, igamRoleDF) elif (config['access_type'].lower() == '3'): return accessType_3(config, env_config, igamRoleDF) else: logger.info(f"Invalid access type ${config['access_type']}. 
Please check the input param") raise RuntimeError( f"Access type error, access type :{config['access_type'].lower()} unsupported" ) def accessType_1(config, env_config, igamRoleDF): logger.info("Grant privileges for access type 1") logger.info("Fetch metadata from Oracle for access type 1") # ---- Construct query and fetch from Oracle ---- queryParams = f"'{config['source_schema']}.{config['source_table']}'" queryMetadataAccessType1 = tbq.get_query_metadata_access_type1(config['oracle_metadata_table']) queryWithParamsAccessType1 = applyQueryParameters(queryMetadataAccessType1, queryParams) logger.info("Metadata table query: " + queryWithParamsAccessType1) jdbcMetaDataAccessType1DF = execute_oracle_query(queryWithParamsAccessType1) # ---- Normalize columns ---- df = jdbcMetaDataAccessType1DF.copy() df["rar3_type_of_access"] = df["rar3_type_of_access"].astype(str).str.strip() df["source"] = df["source"].astype(str).str.strip().str.upper() igamRoleDF["datasource"] = igamRoleDF["datasource"].astype(str).str.strip().str.upper() # ---- Branch A: source != 'RAR' ---- left_a = ( df.loc[ (df["rar3_type_of_access"] == "1") & (df["source"] != config['service_name']), ["table_name", "source"] ] .drop_duplicates() ) branch_a = ( left_a.merge( igamRoleDF, left_on="source", right_on="datasource", how="inner" ) [["table_name", "source", "subsource_id", "igam_entitlement", "environment"]] .drop_duplicates() ) # ---- Branch B: source == 'RAR' (CROSS JOIN with igamRoleDF) ---- left_b = ( df.loc[ (df["rar3_type_of_access"] == "1") & (df["source"] == config['service_name']), ["table_name", "source"] ] .drop_duplicates() ) if not left_b.empty: branch_b = ( left_b.merge(igamRoleDF, how="cross") [["table_name", "source", "subsource_id", "igam_entitlement", "environment"]] .drop_duplicates() ) else: branch_b = pd.DataFrame(columns=["table_name", "source", "subsource_id", "igam_entitlement", "environment"]) # ---- UNION (distinct) ---- typeOneDF = ( pd.concat([branch_a, branch_b], 
ignore_index=True) .drop_duplicates() .reset_index(drop=True) ) logger.info("typeOneDF:\n%s", typeOneDF) # ---- Collect IGAM entitlements ---- igam_entitlements = ( typeOneDF["igam_entitlement"] .dropna() .astype(str) .str.strip() .tolist() ) # Extract IGAM entitlements # Merge with optional full access list if config['full_access_entitlement_list'] is None: combined_entitlements = igam_entitlements else: full_access_list_clean = config['full_access_entitlement_list'] combined_entitlements = igam_entitlements + full_access_list_clean # Add table permission groups using YAMLFormatter params = ranger.add_table_permission_groups( config['corporate_store'], config['target_table'], config['access_type'], config['source_table'], combined_entitlements ) # Generate the final YAML policy ranger.generate_policy(params,env_config, None) def accessType_2A(config, env_config, igamRoleDF): logger.info("Grant privileges for access type 2a") logger.info("Fetch the metadata in Oracle for access type 2a") # ---- Construct query and fetch from Oracle ---- queryParams = f"'{config['source_schema']}.{config['source_table']}'" queryMetadataAccessType2a = tbq.get_query_metadata_access_type2a(config['oracle_metadata_table']) queryWithParamsAccessType2a = applyQueryParameters(queryMetadataAccessType2a, queryParams) logger.info("Meta data table query: ") jdbcMetaDataAccessType2aDF = execute_oracle_query(queryWithParamsAccessType2a) # ---- Normalize columns ---- df = jdbcMetaDataAccessType2aDF.copy() df["rar3_type_of_access"] = df["rar3_type_of_access"].astype(str).str.strip().str.lower() df["source"] = df["source"].astype(str).str.strip().str.upper() roles = igamRoleDF.copy() # expected columns in igamRoleDF: rar_subsource_id, igam_entitlement, environment (plus anything else you keep) roles["subsource_id"] = roles["subsource_id"].astype(str).str.strip().str.upper() roles["igam_entitlement"] = roles["igam_entitlement"].astype(str).str.strip() # ---- Branch A: source != service_name -> INNER 
JOIN on source == rar_subsource_id ---- left_a = ( df.loc[ (df["rar3_type_of_access"] == "2a") & (df["source"] != config['service_name'].upper()), ["table_name", "column_name", "source"] ] ) branch_a = ( left_a.merge( roles, left_on="source", right_on="subsource_id", how="inner" ) .drop(columns=["subsource_id", "source"], errors="ignore") [["table_name", "column_name", "igam_entitlement", "environment"]] ) # ---- Branch B: source == service_name -> CROSS JOIN with igamRoleDF ---- left_b = ( df.loc[ (df["rar3_type_of_access"] == "2a") & (df["source"] == config['service_name'].upper()), ["table_name", "column_name", "source"] ] ) if not left_b.empty: try: branch_b = ( left_b.merge(roles, how="cross") .drop(columns=["subsource_id", "source"], errors="ignore") [["table_name", "column_name", "igam_entitlement", "environment"]] ) except TypeError: # pandas < 1.2 fallback left_b["_cj"] = 1 roles["_cj"] = 1 branch_b = ( left_b.merge(roles, on="_cj") .drop(columns=["_cj", "subsource_id", "source"], errors="ignore") [["table_name", "column_name", "igam_entitlement", "environment"]] ) # (optional) cleanup if you keep using roles later roles.drop(columns=["_cj"], inplace=True, errors="ignore") else: branch_b = pd.DataFrame(columns=["table_name", "column_name", "igam_entitlement", "environment"]) # ---- UNION (ALL) ---- one_df = ( pd.concat([branch_a, branch_b], ignore_index=True) .reset_index(drop=True) ) # ---- Group 1: (table_name, igam_entitlement) -> sorted, comma-joined column_list ---- tmp = one_df.sort_values(["table_name", "igam_entitlement", "column_name"], kind="mergesort") new_df = ( tmp.groupby(["table_name", "igam_entitlement"], as_index=False)["column_name"] .apply(lambda s: ",".join(s.dropna().astype(str).tolist())) .rename(columns={"column_name": "column_list"}) ) # Columns: table_name, igam_entitlement, column_list # ---- Group 2: (table_name, column_list) -> comma-joined igam_entitlement ---- grouped = ( new_df.groupby(["table_name", "column_list"], 
as_index=False)["igam_entitlement"] .apply(lambda s: ",".join(s.dropna().astype(str).tolist())) ) # Columns: table_name, column_list, igam_entitlement # ---- ROW_NUMBER() OVER (ORDER BY column_list) -> policy_id ---- grouped = grouped.sort_values(["column_list"], kind="mergesort") grouped["policy_id"] = np.arange(1, len(grouped) + 1).astype(int) # ---- Emit policies: one per (table_name, column_list) row ---- for _, row in grouped.iterrows(): entitlements_list = [e.strip() for e in str(row["igam_entitlement"]).split(",") if e.strip()] columns_list = [c.strip() for c in str(row["column_list"]).split(",") if c.strip()] policy_id = str(int(row["policy_id"])) params = ranger.add_table_permission_groups( config['corporate_store'], config['target_table'], config['access_type'], # "2a" config['source_table'], entitlements_list, columns_list=columns_list ) ranger.generate_policy(params, env_config, policy_id) # ---- Optional: append full-access YAML if list provided on config ---- if config["full_access_entitlement_list"] != None: # If your code already provides a list, use it directly; otherwise split string. 
if isinstance(config["full_access_entitlement_list"], list): full_access_list = config["full_access_entitlement_list"] else: full_access_list = [s.strip() for s in str(config["full_access_entitlement_list"]).split(",") if s.strip()] params_full = ranger.add_table_permission_groups( config['corporate_store'], config['target_table'], config['access_type'], config['source_table'], full_access_list ) ranger.generate_policy(params_full, env_config, "full_access") def accessType_2B(config, env_config,igamRoleDF): logger.info(f"Grant privileges for access type {config['access_type']}") logger.info("Fetch the metadata in Oracle for access type 2b") # --- Validate required columns --- required = {"environment", "igam_entitlement", "subsource_id"} missing = required - set(igamRoleDF.columns) if missing: raise KeyError(f"igamRoleDF missing required column(s): {sorted(missing)}") # --- Normalize to strings (robust against None/NaN) --- igamRoleDF = igamRoleDF.copy() igamRoleDF["environment"] = igamRoleDF["environment"].astype(str).str.strip() igamRoleDF["igam_entitlement"] = igamRoleDF["igam_entitlement"].astype(str).str.strip() igamRoleDF["subsource_id"] = igamRoleDF["subsource_id"].astype(str).str.strip() # --- Aggregation: per (environment, igam_entitlement) collect unique rar_subsource_id list --- # Keep a stable order by sorting; remove empties. 
agg_df = ( igamRoleDF.loc[igamRoleDF["subsource_id"].ne(""), ["environment", "igam_entitlement", "subsource_id"]] .drop_duplicates() .sort_values(["environment", "igam_entitlement", "subsource_id"], kind="mergesort") .groupby(["environment", "igam_entitlement"], as_index=False)["subsource_id"] .agg(lambda s: ",".join(s.unique())) .rename(columns={"subsource_id": "subsource_id_list"}) ) # List of tuples (IGAM_ENTITLEMENT, rar_subsource_id_list) — mirrors your log payload accessType2bValidList = list(zip( agg_df["igam_entitlement"].astype(str), agg_df["subsource_id_list"].astype(str) )) logger.info(f"accessType2bValidList : {accessType2bValidList}") # --- Entitlements for policy generation (unique, non-empty) --- igam_entitlements = ( igamRoleDF["igam_entitlement"] .dropna() .map(str) .str.strip() .loc[lambda s: s.ne("")] .drop_duplicates() .tolist() ) logger.info(f"Collected IGAM entitlements ({len(igam_entitlements)}): {igam_entitlements}") # --- Row-level permissions (per your existing API) --- params_row_level = ranger.add_table_permission_groups( config['corporate_store'], config['target_table'], config['access_type'], config['source_table'], igam_entitlements ) # --- Table-level permissions, merging in full-access entitlements if provided --- if ["full_access_entitlement_list"] != None: combined_entitlements = igam_entitlements + config['full_access_entitlement_list'] logger.info(f"Full-access entitlements provided ({len(config['full_access_entitlement_list'])}): {config['full_access_entitlement_list']}") else: combined_entitlements = igam_entitlements # ---- Optional: append full-access YAML if list provided on config ---- if config["full_access_entitlement_list"] != None: # If your code already provides a list, use it directly; otherwise split string. 
params_full = ranger.add_table_permission_groups( config["corporate_store"], config["target_table"], config["access_type"], # keep same access type per your pattern config["source_table"], combined_entitlements ) ranger.generate_policy(params_full, env_config) ranger.yaml_format_2b(params_row_level,env_config, config['full_access_entitlement_list']) # row-level policy logger.info("Final YAML format emitted for 2B.") def accessType_3(config,env_config, igamRoleDF): """ Python/pandas translation of the Scala accessType_3. Expects igamRoleDF to have at least: ['igam_entitlement', 'rar_subsource_id']. The `config` object should expose the attributes used below (names match your Scala/Python usage). Uses a YAML formatter module `ranger` with: - add_table_permission_groups(corporate_store, target_table, access_type, source_table, entitlements) - yaml_format_3(params) - yaml_format_1(params) """ # --- 1) Filter entitlements where rar_subsource_id = 'TMS' --- if not {"igam_entitlement", "subsource_id"}.issubset(igamRoleDF.columns): missing = {"igam_entitlement", "subsource_id"} - set(igamRoleDF.columns) raise KeyError(f"igamRoleDF missing required column(s): {sorted(missing)}") new_df = ( igamRoleDF.loc[ igamRoleDF["subsource_id"].astype(str).str.upper() == "TMS", ["igam_entitlement"] ].drop_duplicates() ) logger.info("new_df :\n%s", new_df.to_string(index=False)) accessType3ValidList = new_df["igam_entitlement"].astype(str).str.strip().tolist() # --- 2) Build params for row-level groups (type 3) --- params_row_level = ranger.add_table_permission_groups( config['corporate_store'], config['target_table'], config['access_type'], config['source_table'], accessType3ValidList ) corp = str(config['corporate_store']).lower() src_tbl = str(config['source_table']).lower() # --- 3) Compose the filter expressions (match Scala strings) --- sqlCreateView3NonRestrString_Ptree = ( "(parent_fk in ( " f"select portfolio_fk from {corp}.nh_portfolio_access " "where lower(user_id) LIKE 
concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%') " "AND to_date(a_valid_to) > current_timestamp() " ")) AND (child_fk in ( " f"select portfolio_fk from {corp}.nh_portfolio_access " "where lower(user_id) LIKE concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%') " "AND to_date(a_valid_to) > current_timestamp() " "))" ) sqlCreateView3NonRestrString_Pos = ( "position_key in ( " f"select position_key from {corp}.nh_portfolio_access a " f"inner join {corp}.nh_position b on ( " "(b.portfolio_fk = a.portfolio_fk and b.portfolio_fk is not NULL) or " "(b.portfolio_compare_fk = a.portfolio_fk and b.portfolio_compare_fk is not NULL) " ") " "where lower(user_id) LIKE concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%') " "AND to_date(a_valid_to) > current_timestamp() " ")" ) sqlCreateView3PortAccess = "lower(user_id) LIKE concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%')" sqlCreateView3LimAccess = "lower(user_id) LIKE concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%')" # Standard case uses the configured key columns/table names key_col = getattr(config, "type3SourceTableKeyColumn", None) acc_col = getattr(config, "type3AccessTableKeyColumn", None) acc_table= getattr(config, "type3AccessTable", None) if not all([key_col, acc_col, acc_table]): # Only needed for the default branch; keep None if your config doesn't use the default key_col = key_col or "source_key_col" acc_col = acc_col or "access_key_col" acc_table = acc_table or "type3_access_table" sqlCreateView3NonRestrString_Stdrd = ( f"{key_col} in (select {acc_col} from {corp}.{acc_table} " "where lower(user_id) LIKE concat('%', lower(regexp_extract(current_user(),'[^@]*',0)), '%') " "AND to_date(a_valid_to) > current_timestamp())" ) # --- 4) Choose the filter by source table (matches Scala match/case) --- if src_tbl == "nh_portfoliotree": sqlCreateViewType3Filter = sqlCreateView3NonRestrString_Ptree elif src_tbl == "nh_position": 
sqlCreateViewType3Filter = sqlCreateView3NonRestrString_Pos elif src_tbl == "nh_portfolio_access": sqlCreateViewType3Filter = sqlCreateView3PortAccess elif src_tbl == "nh_limit_access": sqlCreateViewType3Filter = sqlCreateView3LimAccess else: sqlCreateViewType3Filter = sqlCreateView3NonRestrString_Stdrd # --- 5) Row filter YAML block (uses groups from params_row_level) --- # Expecting params_row_level like {'igam_roles': '...'}; adjust key if your API differs. igam_roles_lower = str(params_row_level.get("igam_roles", "")).lower() rowFilter = ( "- groups:\n" f" {igam_roles_lower}\n" " accesses:\n" " - select\n" f" filterExpr: \"{sqlCreateViewType3Filter}\"\n" " " ) # --- 6) Handle optional full access entitlements --- if config['full_access_entitlement_list']: paramsFullAccess = ranger.add_table_permission_groups( config['corporate_store'], config['target_table'], config['access_type'], config['source_table'], config['full_access_entitlement_list'] ) full_groups_lower = str(paramsFullAccess.get("igam_roles", "")).lower() """ fullAccessFilter = ( "- groups:\n" f" {full_groups_lower}\n" " accesses:\n" " - select\n" " filterExpr: \"1=1\"\n" " " ) """ params_table_level = ranger.add_table_permission_groups( config['corporate_store'], config['target_table'], config['access_type'], config['source_table'], accessType3ValidList + config['full_access_entitlement_list'] ) else: fullAccessFilter = "" params_table_level = ranger.add_table_permission_groups( config['corporate_store'], config['target_table'], config['access_type'], config['source_table'], accessType3ValidList ) # --- 7) Render YAML and merge like Scala --- ranger.yaml_format_3(params_row_level,env_config,sqlCreateViewType3Filter,config['full_access_entitlement_list'] ) # base type 3 yaml ranger.yaml_format_1(params_table_level,env_config) # table-level yaml ######################################################################################################################################################### 
####################################STARTING DAG######################################################################################################### default_args = { 'owner': 'devo', 'depends_on_past': False, 'start_date': days_ago(1), 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=1), } with DAG( dag_id='devo_table_generator_core', default_args=default_args, description='Core Devo table generator workflow for single table', schedule=None, catchup=False, tags=['DevoTableGenerator'], max_active_runs=10, max_active_tasks=16, ) as dag: # Init - read config from context def init_step(**context): dag_run = context.get("dag_run") ti = context["ti"] conf = (dag_run.conf or {}) if dag_run else {} env = os.getenv("MRDS_ENV") if not env: raise ValueError("MRDS_ENV environment variable is required") env = env.lower() store = conf.get("store") if not store: raise ValueError("store parameter is required") store = store.lower() owner_table = conf.get("owner_table") if not owner_table or '.' not in owner_table: raise ValueError("owner_table must be in format 'OWNER.TABLE_NAME'") table_owner, table_name = owner_table.split('.', 1) if env not in {"dev", "tst","acc","prd"}: raise ValueError(f"Unsupported env '{env}'. Expected 'dev', 'tst', 'acc' or 'prd'.") if store not in {"mopdb", "rar", 'rqsd'}: raise ValueError(f"Unsupported store '{store}'. 
Expected 'mopdb', 'rar', 'rqsd'.") if store.lower() == "mopdb": p_service_name = "MOPDB" elif store.lower() == "rar": p_service_name = "RAR" elif store.lower() == 'rqsd': p_service_name = "RQSD" if env == "dev" or env == "tst": sentry_role_environment = "TEST/INTEGRATION" elif env == "acc": sentry_role_environment = "ACCEPTANCE" elif env == 'prd': sentry_role_environment = "PRODUCTION" with open(ENV_CONFIG_PATH, "r") as f: cfg = yaml.safe_load(f) env_cfg = cfg[env] store_cfg = cfg[store] p_objectstore_uri = env_cfg["S3_LOCATION_URI"].replace("{0}", store.lower()) p_run_id = str(ti.run_id) logging.info("=== init_step begins === env=%s store=%s table=%s.%s run_id=%s sentry_role_environment=%s", env, store, table_owner, table_name, p_run_id, sentry_role_environment) if store.lower()=='rqsd': env_cfg["DEVO_SECRET"]=env_cfg["DEVO_SECRET_RQSD"] env_cfg["DEVO_USERNAME"]=env_cfg["DEVO_USERNAME_RQSD"] try: devo_secret_name = env_cfg["DEVO_SECRET"] env_cfg["DEVO_SECRET"]= get_secret(devo_secret_name) except: logger.error("Failed to retrieve credentials from secrets") raise(Exception) logging.info("=== init_step getting table info=== env=%s store=%s table=%s.%s run_id=%s devo_user=%s sentry_role_environment=%s", env, store, table_owner, table_name, p_run_id,env_cfg["DEVO_USERNAME"], sentry_role_environment) try: target_table=get_target_table(store_cfg['oracle_mgmt_table'],table_owner,table_name)['table_alias'][0] except Exception as e: logger.error("Table not found in oracle management table") logger.error("Exception: %s", e) logger.error("Traceback:\n%s", traceback.format_exc()) raise try: access_type=get_type_ofAccess(store_cfg['oracle_metadata_table'],table_owner,table_name,p_service_name)['rar3_type_of_access'][0] except Exception as e: logger.error("Table not found in oracle metadata inventory") logger.error("Exception: %s", e) logger.error("Traceback:\n%s", traceback.format_exc()) raise xcom = { 'db':store, 'env': env, 'run_id':p_run_id, 
'corporate_store':store_cfg['corporate_store'], 'service_name': p_service_name, 'source_schema':table_owner, 'source_table':table_name, 'oracle_metadata_table':store_cfg['oracle_metadata_table'], 'oracle_igam_table':store_cfg['oracle_igam_table'], 'oracle_mgmt_table': store_cfg['oracle_mgmt_table'], 'target_table':target_table, 'sentry_role_environment':sentry_role_environment, 'target_s3_bucket': env_cfg["BUCKET_PREFIX"]+store_cfg['target_s3_bucket'] , 'tech_meta_data_fields': store_cfg['tech_meta_data_fields'], 'full_access_entitlement_list':env_cfg[f'FULL_ACCESS_LIST_{p_service_name.upper()}'].split(','), 'access_type': access_type, 'DEVO_USERNAME': env_cfg["DEVO_USERNAME"], 'DEVO_SECRET': env_cfg["DEVO_SECRET"], 'IMPALA_HOSTNAME': env_cfg["IMPALA_HOSTNAME"], 'HIVE_HOSTNAME': env_cfg["HIVE_HOSTNAME"], 'RANGER_HOSTNAME': env_cfg["RANGER_HOSTNAME"], 'BUCKET_PREFIX': env_cfg['BUCKET_PREFIX'], 'S3_LOCATION_URI': env_cfg["S3_LOCATION_URI"] } for k, v in xcom.items(): ti.xcom_push(key=k, value=v) init = PythonOperator( task_id='init_step', python_callable=init_step, ) # Start log table def start_log_table_task(**context): ti = context["ti"] p_run_id = ti.xcom_pull(task_ids='init_step', key='run_id') p_service_name = ti.xcom_pull(task_ids='init_step', key='service_name') p_table_owner = ti.xcom_pull(task_ids='init_step', key='table_owner') p_table_name = ti.xcom_pull(task_ids='init_step', key='table_name') oracle_conn = None try: oracle_conn = oraconn.connect('MRDS_LOADER') oraconn.run_proc( oracle_conn, 'MRDS_LOADER.DATA_REPLICATOR.end_log_table_generator', [p_run_id, p_service_name, p_table_owner, p_table_name,'success'] ) oracle_conn.commit() logging.info("start_log_table procedure executed successfully") except Exception as e: logging.error(f"Error in start_log_table: {e}") raise finally: if oracle_conn: oracle_conn.close() t1 = PythonOperator( task_id='start_log_table', python_callable=start_log_table_task, ) # Drop table def drop_table_task(**context): ti = 
# ---------------------------------------------------------------------------
# XCom helpers: every task below pulls its settings from the 'init_step' task.
# ---------------------------------------------------------------------------

def _pull_init(ti, keys):
    """Return {key: value} for each XCom key published by the 'init_step' task."""
    return {key: ti.xcom_pull(task_ids='init_step', key=key) for key in keys}


# Connection/environment settings shared by every task; published by init_step.
_ENV_KEYS = (
    'DEVO_USERNAME', 'DEVO_SECRET', 'IMPALA_HOSTNAME', 'HIVE_HOSTNAME',
    'RANGER_HOSTNAME', 'BUCKET_PREFIX', 'S3_LOCATION_URI',
)


def _pull_env_config(ti):
    """Environment config dict passed to the Devo/Hive/Ranger helper calls."""
    return _pull_init(ti, _ENV_KEYS)


def drop_table_task(**context):
    """Drop the external table for this run via deleteExternalTable.

    Re-raises on failure so the final status check (fail_if_any_failed)
    sees this task as failed.

    NOTE(review): the 'def' header and the 'ti =' assignment were cut off at
    the start of the reviewed chunk and were reconstructed from the parallel
    task functions and the PythonOperator reference below — confirm against
    the original file.
    """
    ti = context["ti"]
    config = _pull_init(ti, ('corporate_store', 'target_table', 'access_type', 'env'))
    env_config = _pull_env_config(ti)
    try:
        deleteExternalTable(config, env_config)
        logging.info("drop_table procedure executed successfully")
    except Exception as e:
        logging.error(f"Error in drop_table: {e}")
        raise


t2 = PythonOperator(
    task_id='drop_table',
    python_callable=drop_table_task,
    trigger_rule=TriggerRule.ALL_DONE,
)


# Devo external table creation
def devo_table_creation_task(**context):
    """Create the Hive/Impala table for the replicated Devo source.

    Targets whose name ends in '_EXT' are created as external tables;
    any other target is created as a table from the external one.
    Re-raises on failure so the final status check fails the DAG.
    """
    ti = context["ti"]
    config = _pull_init(ti, (
        'corporate_store', 'target_table', 'access_type', 'source_schema',
        'source_table', 'oracle_metadata_table', 'target_s3_bucket',
        'tech_meta_data_fields', 'env',
    ))
    env_config = _pull_env_config(ti)
    tableFields = loadMetadataTable(config)
    logging.info(
        "Starting table creation on hive with env=%s store=%s corporate_store=%s table=%s devo_user=%s",
        config['env'],
        ti.xcom_pull(task_ids='init_step', key='db'),
        config['corporate_store'],
        config['target_table'],
        env_config["DEVO_USERNAME"],
    )
    # BUG FIX: the original try block wrapped only the success-log statement,
    # so failures in the create calls were never caught or logged here; the
    # real work is now inside the try, and the error message names this task
    # instead of the copy-pasted 'devo_impyla_task'.
    try:
        if config['target_table'][-4:].upper() == '_EXT':
            createExternalTables(config, tableFields, env_config)
        else:
            createTableFromExternal(config, tableFields, env_config)
        logging.info("Impyla (Devo) task finished successfully.")
    except Exception as e:
        logging.error(f"Error in devo_table_creation_task: {e}")
        raise


t3 = PythonOperator(
    task_id='devo_table_creation_task',
    python_callable=devo_table_creation_task,
    trigger_rule=TriggerRule.ALL_DONE,
)


# Ranger policy creation
def ranger_policy_creation(**context):
    """Create Ranger access policies for the newly created managed table.

    External ('_EXT') targets are skipped — policies are created only for
    the final managed table.  Re-raises on failure.
    """
    ti = context["ti"]
    config = _pull_init(ti, (
        'corporate_store', 'source_table', 'target_table', 'access_type',
        'source_schema', 'service_name', 'oracle_metadata_table',
        'target_s3_bucket', 'tech_meta_data_fields', 'sentry_role_environment',
        'full_access_entitlement_list', 'oracle_igam_table', 'env',
    ))
    env_config = _pull_env_config(ti)
    logging.info(
        "Starting Policy creation with env=%s store=%s corporate_store=%s table=%s sentry_role_environment=%s",
        config['env'],
        ti.xcom_pull(task_ids='init_step', key='db'),
        config['corporate_store'],
        config['target_table'],
        config['sentry_role_environment'],
    )
    # BUG FIX: as in devo_table_creation_task, the original try wrapped only
    # a log line copy-pasted from the Devo task ("Impyla (Devo) task finished
    # successfully." / "Error in devo_impyla_task"); the real work is wrapped
    # now and the messages name this task.
    try:
        if config['target_table'][-4:].upper() != '_EXT':
            igamRoles = readIGAMRoles(config)
            logger.info(accessTypeMapper(config, env_config, igamRoles))
        logging.info("Ranger policy creation finished successfully.")
    except Exception as e:
        logging.error(f"Error in ranger_policy_creation: {e}")
        raise


t4 = PythonOperator(
    task_id='ranger_policy_creation',
    python_callable=ranger_policy_creation,
    trigger_rule=TriggerRule.ALL_DONE,
)


# End log table
def end_log_table_task(**context):
    """Record run completion via the Oracle end_log_table procedure.

    Deliberately best-effort: a failure here is logged and swallowed so the
    cleanup/status tasks downstream still run.  The connection is always
    closed in the finally block.

    NOTE(review): this task pulls 'p_service_name'/'p_table_owner'/
    'p_table_name' while fail_if_any_failed pulls the un-prefixed
    'service_name'/'table_owner'/'table_name' — confirm init_step publishes
    both key sets.
    """
    ti = context["ti"]
    p_service_name = ti.xcom_pull(task_ids='init_step', key='p_service_name')
    p_table_owner = ti.xcom_pull(task_ids='init_step', key='p_table_owner')
    p_table_name = ti.xcom_pull(task_ids='init_step', key='p_table_name')
    oracle_conn = None
    try:
        oracle_conn = oraconn.connect('MRDS_LOADER')
        oraconn.run_proc(
            oracle_conn,
            'MRDS_LOADER.DATA_REPLICATOR.end_log_table',
            [p_service_name, p_table_owner, p_table_name]
        )
        oracle_conn.commit()
        logging.info("end_log_table procedure executed successfully")
    except Exception as e:
        logging.error(f"Error in end_log_table: {e}")
        logging.info("Continuing despite end_log_table error (cleanup task)")
    finally:
        if oracle_conn:
            oracle_conn.close()


t5 = PythonOperator(
    task_id='end_log_table',
    python_callable=end_log_table_task,
    trigger_rule=TriggerRule.ALL_DONE,
)


# Check status and fail if needed
def fail_if_any_failed(**context):
    """Fail the DAG if any critical upstream task failed.

    All upstream tasks run with TriggerRule.ALL_DONE so the cleanup chain
    always executes; this final gate inspects their states, records the
    first failure in Oracle (end_log_table_generator), and raises
    AirflowFailException so the DAG run is marked failed.
    """
    dag_run = context['dag_run']
    check_tasks = ['init_step', 'start_log_table', 'drop_table',
                   'devo_table_creation_task', 'ranger_policy_creation',
                   'end_log_table']
    failed = []
    for tid in check_tasks:
        ti_up = dag_run.get_task_instance(tid)
        if ti_up and ti_up.state == 'failed':
            failed.append(tid)
    if failed:
        ti = context["ti"]
        p_run_id = ti.xcom_pull(task_ids='init_step', key='run_id')
        p_service_name = ti.xcom_pull(task_ids='init_step', key='service_name')
        p_table_owner = ti.xcom_pull(task_ids='init_step', key='table_owner')
        p_table_name = ti.xcom_pull(task_ids='init_step', key='table_name')
        oracle_conn = None
        try:
            oracle_conn = oraconn.connect('MRDS_LOADER')
            oraconn.run_proc(
                oracle_conn,
                'MRDS_LOADER.DATA_REPLICATOR.end_log_table_generator',
                [p_run_id, p_service_name, p_table_owner, p_table_name,
                 f"The Following task failed: {failed[0]}"]
            )
            oracle_conn.commit()
            # BUG FIX: the original success/error messages said
            # "start_log_table" although the procedure invoked here is
            # end_log_table_generator.
            logging.info("end_log_table_generator procedure executed successfully")
        except Exception as e:
            logging.error(f"Error in end_log_table_generator: {e}")
            raise
        finally:
            if oracle_conn:
                oracle_conn.close()
        error_msg = f"Critical task(s) failed: {', '.join(failed)}. DAG execution failed."
        logging.error(error_msg)
        raise AirflowFailException(error_msg)
    logging.info("All critical tasks completed successfully: %s", check_tasks)


t6 = PythonOperator(
    task_id='fail_if_any_failed',
    python_callable=fail_if_any_failed,
    trigger_rule=TriggerRule.ALL_DONE,
)

# Dependencies: linear create chain, then cleanup logging, then the status gate.
init >> t1 >> t2 >> t3 >> t4
[t1, t2, t3, t4] >> t5
t5 >> t6