init
This commit is contained in:
0
python/devo_replicator/data_replicator/.gitkeep
Normal file
0
python/devo_replicator/data_replicator/.gitkeep
Normal file
65
python/devo_replicator/data_replicator/diag_s3_access.py
Normal file
65
python/devo_replicator/data_replicator/diag_s3_access.py
Normal file
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys, json
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError
|
||||
from botocore.config import Config
|
||||
|
||||
BUCKET = "devo-crp-sbc9vbsu"
|
||||
PREFIX = "mopdb/db/" # adjust if needed
|
||||
|
||||
def show(e):
    """Dump the structured AWS error attached to *e* (if any) to stderr.

    botocore exceptions carry a ``response`` dict; anything without one
    simply prints ``None`` fields.
    """
    response = getattr(e, "response", {})
    error = response.get("Error", {})
    meta = response.get("ResponseMetadata", {})
    code, msg = error.get("Code"), error.get("Message")
    rid = meta.get("RequestId")
    print(f"{type(e).__name__}: {code} {msg} (RequestId={rid})", file=sys.stderr)
|
||||
|
||||
def main(endpoint_url=None, region=None, force_path=False):
    """Run a staged S3 access diagnostic against BUCKET/PREFIX.

    Checks, in order: STS identity, bucket reachability, ListBucket
    permission, then an actual one-key listing. Returns 0 on success or a
    distinct non-zero code identifying the first failing stage.
    """
    session = boto3.Session()
    addressing = "path" if force_path else "auto"
    client_config = Config(s3={"addressing_style": addressing})
    s3_client = session.client("s3", region_name=region, endpoint_url=endpoint_url, config=client_config)
    sts_client = session.client("sts", region_name=region)

    # Stage 1: who am I?
    try:
        ident = sts_client.get_caller_identity()
        print(f"Caller: {ident['Arn']} (acct {ident['Account']})")
    except Exception as e:
        print("Could not call STS get-caller-identity — credentials not valid for STS.", file=sys.stderr)
        show(e)
        return 1

    # Stage 2: is the bucket reachable at all?
    try:
        s3_client.head_bucket(Bucket=BUCKET)
        print(f"head_bucket OK on s3://{BUCKET}")
    except ClientError as e:
        print("head_bucket failed:", file=sys.stderr)
        show(e)
        return 2

    # Stage 3: zero-key list to probe just the ListBucket permission.
    try:
        s3_client.list_objects_v2(Bucket=BUCKET, Prefix=PREFIX, MaxKeys=0)
        print(f"list_objects_v2 OK on prefix '{PREFIX}' (permission exists)")
    except ClientError as e:
        print("list_objects_v2 failed:", file=sys.stderr)
        show(e)
        return 3

    # Stage 4: fetch a single key to confirm the data path works end to end.
    try:
        listing = s3_client.list_objects_v2(Bucket=BUCKET, Prefix=PREFIX, MaxKeys=1)
        print("First key:", listing.get("Contents", [{}])[0].get("Key"))
    except ClientError as e:
        print("list_objects_v2 (MaxKeys=1) failed:", file=sys.stderr)
        show(e)
        return 4

    return 0
|
||||
|
||||
if __name__ == "__main__":
    # Minimal hand-rolled CLI parsing.
    # Optional args: --endpoint-url URL --region eu-central-1 --force-path
    url = None
    reg = None
    force = False
    argv = sys.argv
    for i, a in enumerate(argv):
        # BUG FIX: guard i + 1 so a flag supplied without its value exits
        # with a clear message instead of crashing with IndexError.
        if a == "--endpoint-url":
            if i + 1 >= len(argv):
                sys.exit("--endpoint-url requires a value")
            url = argv[i + 1]
        elif a == "--region":
            if i + 1 >= len(argv):
                sys.exit("--region requires a value")
            reg = argv[i + 1]
        elif a == "--force-path":
            force = True
    sys.exit(main(endpoint_url=url, region=reg, force_path=force))
|
||||
129
python/devo_replicator/data_replicator/impala_refresher.py
Normal file
129
python/devo_replicator/data_replicator/impala_refresher.py
Normal file
@@ -0,0 +1,129 @@
|
||||
import os
|
||||
import yaml
|
||||
import datetime
|
||||
import pandas as pd
|
||||
from mrds.utils.secrets import get_secret
|
||||
import mrds.utils.manage_runs as runManager
|
||||
import mrds.utils.manage_files as fileManager
|
||||
import mrds.utils.sql_statements as sqls
|
||||
|
||||
import oci
|
||||
|
||||
from impala.dbapi import (
|
||||
connect,
|
||||
ProgrammingError,
|
||||
DatabaseError,
|
||||
IntegrityError,
|
||||
OperationalError,
|
||||
)
|
||||
from impala.error import HiveServer2Error
|
||||
|
||||
|
||||
def get_impala_connection(hostname: str, user: str, secret: str):
    """Open an Impala connection over the HiveServer2 HTTP transport.

    Connects to *hostname* on port 443 with TLS, authenticating via PLAIN
    with the supplied user/secret pair.
    """
    connection_kwargs = dict(
        host=hostname,
        port=443,
        auth_mechanism="PLAIN",
        user=user,
        password=secret,
        use_http_transport=True,
        http_path="cliservice",
        use_ssl=True,
    )
    return connect(**connection_kwargs)
|
||||
|
||||
def execute_query(query: str, user, hostname, password):
    """Open an Impala connection, run *query*, and return (columns, result).

    BUG FIX: removed a leftover ``print(conn)`` debug line that wrote the
    raw connection object to stdout on every call.

    NOTE: execute_devo_query closes the connection in its ``finally``
    block, so no cleanup is needed here.
    """
    conn = get_impala_connection(hostname, user, password)
    columns, result = execute_devo_query(query, conn)
    return columns, result
|
||||
|
||||
def execute_devo_query(query: str, conn):
    """Execute *query* on an open Impala connection and return (None, rowcount).

    This helper is used for statement-style queries (INVALIDATE METADATA /
    COMPUTE STATS), so no result-set columns are fetched; rowcount is the
    number of rows affected as reported by the driver.

    The cursor and connection are always closed in the ``finally`` block.
    Raises Exception with a descriptive message on any failure, chaining
    the original error so the traceback is preserved.
    """
    cursor = conn.cursor()
    try:
        cursor.execute(query)
        return None, cursor.rowcount
    # BUG FIX: the builtin Exception takes no keyword arguments, so every
    # original `raise Exception(status_code=..., detail=...)` crashed with
    # `TypeError: Exception() takes no keyword arguments`, masking the real
    # error. The same information is now encoded in the message, and the
    # original exception is chained with `from`.
    except OperationalError as oe:
        raise Exception("Failed to connect to Impala: " + str(oe)) from oe
    except ProgrammingError as pe:
        raise Exception("Query syntax error: " + str(pe)) from pe
    except IntegrityError as ie:
        raise Exception("Insufficient permissions: " + str(ie)) from ie
    except DatabaseError as db_err:
        raise Exception("Database error: " + str(db_err)) from db_err
    except HiveServer2Error as au_err:
        raise Exception("HiveServer2Error error: " + str(au_err)) from au_err
    except Exception as e:
        raise Exception("An unexpected error occurred: " + str(e)) from e
    finally:
        # Best-effort cleanup; a close failure is surfaced as its own error.
        try:
            if cursor:
                cursor.close()
            if conn:
                conn.close()
        except Exception as e:
            raise Exception("Failed to close the connection: " + str(e)) from e
|
||||
|
||||
def initialize_task(workflow_context, task_name):
    """Register *task_name* with the run manager and return its history key."""
    run_id = workflow_context["run_id"]
    workflow_history_key = workflow_context["a_workflow_history_key"]
    return runManager.init_task(task_name, run_id, workflow_history_key)
|
||||
|
||||
def initialize_config(config_file_path):
    """Load and return the YAML configuration at *config_file_path*.

    Raises FileNotFoundError when the path does not exist.
    """
    if not os.path.exists(config_file_path):
        raise FileNotFoundError(f"Configuration file {config_file_path} not found.")
    with open(config_file_path, "r") as config_handle:
        return yaml.safe_load(config_handle)
|
||||
|
||||
def main(env_config_path, env, table, corporate_store):
    """Refresh Impala metadata and stats for ``<corporate_store>.<table>``.

    Loads per-environment settings from *env_config_path*, resolves the
    Devo credentials from the secret store, then runs INVALIDATE METADATA
    followed by COMPUTE STATS on the table. Returns True on success.

    Raises Exception (with the original error chained) on failure.
    """
    # Init setup: pick the section for the requested environment.
    envs_info = initialize_config(env_config_path)
    environment_info = envs_info[env]

    try:
        devo_secret_name = environment_info["DEVO_SECRET"]
        password = get_secret(devo_secret_name)
    except Exception as e:
        # BUG FIX: was a bare `except:` followed by `raise(Exception)`,
        # which also swallowed KeyboardInterrupt/SystemExit and discarded
        # the original error. Chain the cause so the traceback survives.
        print("Failed to retrieve credentials from secrets")
        raise Exception("Failed to retrieve credentials from secrets") from e

    # Refresh the table's metadata, then recompute its stats.
    try:
        user = environment_info["DEVO_USERNAME"]
        host = environment_info["IMPALA_HOSTNAME"]
        execute_query(f"INVALIDATE METADATA {corporate_store}.{table}", user, host, password)
        execute_query(f"COMPUTE STATS {corporate_store}.{table}", user, host, password)
    except Exception as e:
        # BUG FIX: same bare-except / `raise(Exception)` pattern as above.
        print("Failed to retrieve DEVO data, error during connection or request")
        raise Exception("Failed to retrieve DEVO data, error during connection or request") from e
    return True
|
||||
|
||||
128
python/devo_replicator/data_replicator/list_s3_files_fast.py
Normal file
128
python/devo_replicator/data_replicator/list_s3_files_fast.py
Normal file
@@ -0,0 +1,128 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse, sys
|
||||
from urllib.parse import urlparse
|
||||
import boto3
|
||||
from botocore.config import Config
|
||||
from botocore.exceptions import ClientError, EndpointConnectionError, NoCredentialsError, ReadTimeoutError, ConnectTimeoutError
|
||||
|
||||
def parse_s3_uri(s3_uri: str):
    """Split an ``s3://bucket/key`` URI into ``(bucket, key_prefix)``.

    Raises ValueError when the scheme is wrong or the bucket is missing.
    """
    if not s3_uri.startswith("s3://"):
        raise ValueError("S3 URI must start with 's3://'")
    parsed = urlparse(s3_uri)
    bucket = parsed.netloc
    if not bucket:
        raise ValueError("Missing bucket in S3 URI")
    prefix = parsed.path.lstrip("/")
    return bucket, prefix
|
||||
|
||||
def parse_location(location: str):
    """Accept s3://... OR https://...amazonaws.com/... and return (bucket, prefix)."""
    if location.startswith("s3://"):
        # s3:// form (same contract as parse_s3_uri, inlined here).
        parsed = urlparse(location)
        if not parsed.netloc:
            raise ValueError("Missing bucket in S3 URI")
        return parsed.netloc, parsed.path.lstrip("/")

    if location.startswith(("http://", "https://")):
        parsed = urlparse(location)
        host = parsed.netloc
        path = parsed.path.lstrip("/")
        if ".bucket." in host:
            # Bucket-scoped VPCe host: <bucket>.bucket.vpce-xxxx.s3.<region>.vpce.amazonaws.com
            return host.split(".bucket.", 1)[0], path
        if ".s3." in host and not host.startswith("s3."):
            # Virtual-hosted: <bucket>.s3.<region>...
            return host.split(".s3.", 1)[0], path
        if host.startswith("s3."):
            # Path-style: s3.<region>.../<bucket>/...
            bucket, _, prefix = path.partition("/")
            return bucket, prefix

    raise ValueError(f"Unsupported location: {location}")
|
||||
|
||||
def iter_keys(s3, bucket: str, prefix: str, page_size: int, max_items: int, verbose: bool):
    """Yield object keys under bucket/prefix via list_objects_v2 pagination.

    page_size caps keys per API call; max_items (0 = no limit) caps the
    total via the paginator's MaxItems. When verbose, per-page progress is
    written to stderr so stdout stays clean for the key listing itself.
    """
    # BUG FIX: removed a leftover `print('here')` debug line — it wrote to
    # stdout and corrupted the key listing this script emits there.
    paginator = s3.get_paginator("list_objects_v2")
    kwargs = {"Bucket": bucket, "Prefix": prefix}
    pagination = {"PageSize": page_size}
    if max_items > 0:
        pagination["MaxItems"] = max_items

    total = 0
    page_num = 0
    for page in paginator.paginate(**kwargs, PaginationConfig=pagination):
        page_num += 1
        # `or []` guards against an explicit Contents: None in the page.
        contents = page.get("Contents", []) or []
        if verbose:
            print(f"[page {page_num}] fetched {len(contents)} keys (running total={total + len(contents)})",
                  file=sys.stderr, flush=True)
        for obj in contents:
            yield obj["Key"]
            total += 1
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, build an S3 client, stream keys to stdout."""
    parser = argparse.ArgumentParser(description="List files under an S3 location quickly and safely.")
    parser.add_argument("location", help="s3://bucket/prefix/ OR https://<vpc-endpoint-host>/<prefix>")
    parser.add_argument("--region", default=None, help="AWS region (e.g., eu-central-1)")
    parser.add_argument("--profile", default=None, help="AWS profile to use")
    parser.add_argument("--endpoint-url", default=None,
                        help="Custom S3 endpoint (e.g., https://s3.eu-central-1.vpce.amazonaws.com)")
    parser.add_argument("--force-path-addressing", action="store_true",
                        help="Force path-style addressing (useful with bucket-scoped VPCe hostnames)")
    parser.add_argument("--page-size", type=int, default=1000, help="S3 page size (default 1000)")
    parser.add_argument("--max-items", type=int, default=0, help="Stop after N keys (0 = no limit)")
    parser.add_argument("--connect-timeout", type=float, default=10.0, help="Seconds (default 10)")
    parser.add_argument("--read-timeout", type=float, default=30.0, help="Seconds (default 30)")
    parser.add_argument("--retries", type=int, default=3, help="Max retry attempts (default 3)")
    parser.add_argument("--relative", action="store_true", help="Print keys relative to the prefix")
    parser.add_argument("--verbose", "-v", action="store_true", help="Print progress to stderr")
    args = parser.parse_args()

    bucket, prefix = parse_location(args.location)

    # Session & client with explicit timeouts and optional path addressing.
    session = boto3.Session(profile_name=args.profile) if args.profile else boto3.Session()
    addressing = "path" if args.force_path_addressing else "auto"
    client_config = Config(
        connect_timeout=args.connect_timeout,
        read_timeout=args.read_timeout,
        retries={"max_attempts": args.retries, "mode": "standard"},
        s3={"addressing_style": addressing},
    )
    s3 = session.client("s3", region_name=args.region, endpoint_url=args.endpoint_url,
                        config=client_config)

    # Quick preflight: a zero-key list surfaces auth/endpoint issues fast.
    try:
        s3.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=0)
    except ClientError as e:
        print(f"Preflight failed (auth/permissions/endpoint): {e}", file=sys.stderr)
        sys.exit(1)
    except (EndpointConnectionError, ReadTimeoutError, ConnectTimeoutError) as e:
        print(f"Network/endpoint error: {e}", file=sys.stderr)
        sys.exit(1)

    # Stream keys to stdout; progress (when --verbose) goes to stderr.
    try:
        for key in iter_keys(s3, bucket, prefix, args.page_size, args.max_items, args.verbose):
            if args.relative and prefix and key.startswith(prefix):
                print(key[len(prefix):].lstrip("/"))
            else:
                print(f"s3://{bucket}/{key}")
    except KeyboardInterrupt:
        print("\nInterrupted.", file=sys.stderr)
        sys.exit(130)
    except NoCredentialsError:
        print("No AWS credentials found. Set env vars or use --profile.", file=sys.stderr)
        sys.exit(1)
    except (EndpointConnectionError, ReadTimeoutError, ConnectTimeoutError) as e:
        print(f"Network/timeout listing objects: {e}", file=sys.stderr)
        sys.exit(1)
    except ClientError as e:
        print(f"AWS error: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
# Script entry point: main() parses the CLI args and exits via sys.exit.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user