This commit is contained in:
Grzegorz Michalski
2026-03-02 09:47:35 +01:00
commit 2c225d68ac
715 changed files with 130067 additions and 0 deletions

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env python3
import sys, json
import boto3
from botocore.exceptions import ClientError
from botocore.config import Config
BUCKET = "devo-crp-sbc9vbsu"
PREFIX = "mopdb/db/" # adjust if needed
def show(e):
    """Print a one-line summary of a botocore-style error to stderr.

    Pulls Code/Message/RequestId out of ``e.response`` when present; for a
    non-botocore exception the fields simply print as ``None``.
    """
    response = getattr(e, "response", {})
    error = response.get("Error", {})
    meta = response.get("ResponseMetadata", {})
    summary = (
        f"{type(e).__name__}: {error.get('Code')} {error.get('Message')} "
        f"(RequestId={meta.get('RequestId')})"
    )
    print(summary, file=sys.stderr)
def main(endpoint_url=None, region=None, force_path=False):
    """Run a staged S3 connectivity probe against BUCKET/PREFIX.

    Stages, in order: STS identity check, head_bucket reachability,
    a zero-key list (permission only), then a one-key list (data path).

    Args:
        endpoint_url: optional custom S3 endpoint (e.g. a VPC endpoint URL).
        region: optional AWS region name, passed to both clients.
        force_path: force path-style addressing instead of "auto".

    Returns:
        0 on full success; 1-4 identifying which stage failed.
    """
    session = boto3.Session()
    cfg = Config(s3={"addressing_style": "path" if force_path else "auto"})
    s3 = session.client("s3", region_name=region, endpoint_url=endpoint_url, config=cfg)
    sts = session.client("sts", region_name=region)
    # Who am I? Verifies the credentials are at least valid for STS.
    try:
        ident = sts.get_caller_identity()
        print(f"Caller: {ident['Arn']} (acct {ident['Account']})")
    except Exception as e:
        print("Could not call STS get-caller-identity — credentials not valid for STS.", file=sys.stderr)
        show(e); return 1
    # Is the bucket reachable at all?
    try:
        s3.head_bucket(Bucket=BUCKET)
        print(f"head_bucket OK on s3://{BUCKET}")
    except ClientError as e:
        print("head_bucket failed:", file=sys.stderr)
        show(e); return 2
    # List with zero keys to test just the ListBucket permission
    try:
        s3.list_objects_v2(Bucket=BUCKET, Prefix=PREFIX, MaxKeys=0)
        print(f"list_objects_v2 OK on prefix '{PREFIX}' (permission exists)")
    except ClientError as e:
        print("list_objects_v2 failed:", file=sys.stderr)
        show(e); return 3
    # Ask for 1 key to confirm data path works
    try:
        resp = s3.list_objects_v2(Bucket=BUCKET, Prefix=PREFIX, MaxKeys=1)
        print("First key:", resp.get("Contents", [{}])[0].get("Key"))
    except ClientError as e:
        print("list_objects_v2 (MaxKeys=1) failed:", file=sys.stderr)
        show(e); return 4
    return 0
if __name__ == "__main__":
    # Minimal hand-rolled flag parsing:
    #   --endpoint-url URL --region NAME --force-path
    url = None; reg = None; force = False
    argv = sys.argv
    for i, a in enumerate(argv):
        # Bounds-check i + 1 so a value-taking flag given as the last
        # argument no longer raises IndexError.
        if a == "--endpoint-url" and i + 1 < len(argv):
            url = argv[i + 1]
        elif a == "--region" and i + 1 < len(argv):
            reg = argv[i + 1]
        elif a == "--force-path":
            force = True
    sys.exit(main(endpoint_url=url, region=reg, force_path=force))

View File

@@ -0,0 +1,129 @@
import os
import yaml
import datetime
import pandas as pd
from mrds.utils.secrets import get_secret
import mrds.utils.manage_runs as runManager
import mrds.utils.manage_files as fileManager
import mrds.utils.sql_statements as sqls
import oci
from impala.dbapi import (
connect,
ProgrammingError,
DatabaseError,
IntegrityError,
OperationalError,
)
from impala.error import HiveServer2Error
def get_impala_connection(hostname: str, user: str, secret: str):
    """Open an Impala connection over HTTPS (HiveServer2 HTTP transport)."""
    connection_args = dict(
        host=hostname,
        port=443,
        auth_mechanism="PLAIN",
        user=user,
        password=secret,
        use_http_transport=True,
        http_path="cliservice",
        use_ssl=True,
    )
    return connect(**connection_args)
def execute_query(query: str, user, hostname, password):
    """Open a fresh Impala connection, run *query*, and return (columns, result).

    Thin wrapper over get_impala_connection + execute_devo_query; the
    connection is closed by execute_devo_query's cleanup.
    """
    conn = get_impala_connection(hostname, user, password)
    # Removed leftover debug `print(conn)` that leaked the connection
    # object's repr to stdout on every call.
    columns, result = execute_devo_query(query, conn)
    return columns, result
def execute_devo_query(query: str, conn):
    """Execute *query* on an already-open Impala connection.

    Args:
        query: SQL text to execute.
        conn: open DB-API connection (must provide cursor()/close()).

    Returns:
        (None, rowcount) — column metadata is not collected; rowcount is
        whatever the driver reports for the statement.

    Raises:
        Exception: wrapping the underlying driver error with a descriptive
        message; the original exception is chained as __cause__.

    The cursor and connection are always closed before returning/raising.
    """
    cursor = conn.cursor()
    print("executing query")
    try:
        cursor.execute(query)
        return None, cursor.rowcount  # rowcount = rows affected
    # BUG FIX: the old handlers did `raise Exception(status_code=..., detail=...)`
    # (FastAPI HTTPException style), but Exception() accepts no keyword
    # arguments, so every error path crashed with TypeError instead of the
    # intended message. Raise plain messages and chain the cause instead.
    except OperationalError as oe:
        raise Exception("Failed to connect to Impala: " + str(oe)) from oe
    except ProgrammingError as pe:
        raise Exception("Query syntax error: " + str(pe)) from pe
    except IntegrityError as ie:
        raise Exception("Insufficient permissions: " + str(ie)) from ie
    except DatabaseError as db_err:
        raise Exception("Database error: " + str(db_err)) from db_err
    except HiveServer2Error as au_err:
        raise Exception("HiveServer2Error error: " + str(au_err)) from au_err
    except Exception as e:
        raise Exception("An unexpected error occurred: " + str(e)) from e
    finally:
        # Best-effort cleanup: raising from `finally` (as the old code did)
        # would mask whatever exception was already propagating.
        try:
            cursor.close()
        except Exception:
            pass
        try:
            conn.close()
        except Exception:
            pass
def initialize_task(workflow_context, task_name):
    """Register *task_name* with the run manager and return its history key."""
    run_id = workflow_context["run_id"]
    workflow_history_key = workflow_context["a_workflow_history_key"]
    return runManager.init_task(task_name, run_id, workflow_history_key)
def initialize_config(config_file_path):
    """Load and return the YAML configuration at *config_file_path*.

    Raises:
        FileNotFoundError: if the path does not exist.
    """
    if not os.path.exists(config_file_path):
        raise FileNotFoundError(f"Configuration file {config_file_path} not found.")
    with open(config_file_path, "r") as handle:
        return yaml.safe_load(handle)
def main(env_config_path, env, table, corporate_store):
    """Refresh Impala metadata and statistics for one table.

    Args:
        env_config_path: path to the YAML file with per-environment settings.
        env: key selecting the environment block within that file.
        table: table name to refresh.
        corporate_store: database/schema holding the table.

    Returns:
        True on success.

    Raises:
        Exception: if credentials cannot be fetched or either query fails;
        the underlying error is chained as __cause__.
    """
    # init setup
    envs_info = initialize_config(env_config_path)
    environment_info = envs_info[env]
    try:
        devo_secret_name = environment_info["DEVO_SECRET"]
        password = get_secret(devo_secret_name)
    except Exception as e:
        # Chain the cause instead of the old bare `except:` + `raise(Exception)`,
        # which discarded the underlying error entirely.
        print("Failed to retrieve credentials from secrets")
        raise Exception("Failed to retrieve credentials from secrets") from e
    user = environment_info["DEVO_USERNAME"]
    hostname = environment_info["IMPALA_HOSTNAME"]
    try:
        # INVALIDATE METADATA first so COMPUTE STATS sees the latest files.
        execute_query(f"INVALIDATE METADATA {corporate_store}.{table}", user, hostname, password)
        execute_query(f"COMPUTE STATS {corporate_store}.{table}", user, hostname, password)
    except Exception as e:
        print("Failed to retrieve DEVO data, error during connection or request")
        raise Exception("Failed to retrieve DEVO data, error during connection or request") from e
    return True

View File

@@ -0,0 +1,128 @@
#!/usr/bin/env python3
import argparse, sys
from urllib.parse import urlparse
import boto3
from botocore.config import Config
from botocore.exceptions import ClientError, EndpointConnectionError, NoCredentialsError, ReadTimeoutError, ConnectTimeoutError
def parse_s3_uri(s3_uri: str):
    """Split an ``s3://bucket/key`` URI into (bucket, key-without-leading-slash).

    Raises:
        ValueError: if the scheme is not s3:// or the bucket is missing.
    """
    if not s3_uri.startswith("s3://"):
        raise ValueError("S3 URI must start with 's3://'")
    parsed = urlparse(s3_uri)
    bucket, key = parsed.netloc, parsed.path.lstrip("/")
    if not bucket:
        raise ValueError("Missing bucket in S3 URI")
    return bucket, key
def parse_location(location: str):
    """Accept s3://... OR https://...amazonaws.com/... and return (bucket, prefix)."""
    if location.startswith("s3://"):
        return parse_s3_uri(location)
    if not location.startswith(("http://", "https://")):
        raise ValueError(f"Unsupported location: {location}")
    parsed = urlparse(location)
    host = parsed.netloc
    path = parsed.path.lstrip("/")
    # Bucket-scoped VPCe host: <bucket>.bucket.vpce-xxxx.s3.<region>.vpce.amazonaws.com
    if ".bucket." in host:
        return host.split(".bucket.", 1)[0], path
    # Virtual-hosted: <bucket>.s3.<region>...
    if ".s3." in host and not host.startswith("s3."):
        return host.split(".s3.", 1)[0], path
    # Path-style: s3.<region>.../<bucket>/...
    if host.startswith("s3."):
        bucket, _, prefix = path.partition("/")
        return bucket, prefix
    raise ValueError(f"Unsupported location: {location}")
def iter_keys(s3, bucket: str, prefix: str, page_size: int, max_items: int, verbose: bool):
    """Yield object keys under *prefix* via paginated list_objects_v2 calls.

    Args:
        s3: S3 client (anything exposing get_paginator("list_objects_v2")).
        bucket: bucket name.
        prefix: key prefix to list under.
        page_size: S3 page size per request.
        max_items: stop after this many keys; 0 means no limit.
        verbose: print per-page progress to stderr.

    Yields:
        Object keys (str), in listing order.
    """
    # Removed stray debug `print('here')` that polluted stdout, where the
    # caller prints the actual key listing.
    paginator = s3.get_paginator("list_objects_v2")
    kwargs = {"Bucket": bucket, "Prefix": prefix}
    pagination = {"PageSize": page_size}
    if max_items > 0:
        pagination["MaxItems"] = max_items
    total = 0
    page_num = 0
    for page in paginator.paginate(**kwargs, PaginationConfig=pagination):
        page_num += 1
        # `or []` guards against a page whose Contents is explicitly None.
        contents = page.get("Contents", []) or []
        if verbose:
            print(f"[page {page_num}] fetched {len(contents)} keys (running total={total + len(contents)})",
                  file=sys.stderr, flush=True)
        for obj in contents:
            yield obj["Key"]
            total += 1
def main():
    """CLI entry point: list keys under an S3 location and print them to stdout.

    Exit codes: 0 success, 1 auth/network/AWS error, 130 interrupted.
    """
    ap = argparse.ArgumentParser(description="List files under an S3 location quickly and safely.")
    ap.add_argument("location", help="s3://bucket/prefix/ OR https://<vpc-endpoint-host>/<prefix>")
    ap.add_argument("--region", default=None, help="AWS region (e.g., eu-central-1)")
    ap.add_argument("--profile", default=None, help="AWS profile to use")
    ap.add_argument("--endpoint-url", default=None,
                    help="Custom S3 endpoint (e.g., https://s3.eu-central-1.vpce.amazonaws.com)")
    ap.add_argument("--force-path-addressing", action="store_true",
                    help="Force path-style addressing (useful with bucket-scoped VPCe hostnames)")
    ap.add_argument("--page-size", type=int, default=1000, help="S3 page size (default 1000)")
    ap.add_argument("--max-items", type=int, default=0, help="Stop after N keys (0 = no limit)")
    ap.add_argument("--connect-timeout", type=float, default=10.0, help="Seconds (default 10)")
    ap.add_argument("--read-timeout", type=float, default=30.0, help="Seconds (default 30)")
    ap.add_argument("--retries", type=int, default=3, help="Max retry attempts (default 3)")
    ap.add_argument("--relative", action="store_true", help="Print keys relative to the prefix")
    ap.add_argument("--verbose", "-v", action="store_true", help="Print progress to stderr")
    args = ap.parse_args()
    bucket, prefix = parse_location(args.location)
    # Session & client with explicit timeouts and optional path addressing
    sess_kwargs = {}
    if args.profile:
        sess_kwargs["profile_name"] = args.profile
    session = boto3.Session(**sess_kwargs)
    cfg = Config(
        connect_timeout=args.connect_timeout,
        read_timeout=args.read_timeout,
        retries={"max_attempts": args.retries, "mode": "standard"},
        s3={"addressing_style": "path" if args.force_path_addressing else "auto"},
    )
    s3 = session.client("s3", region_name=args.region, endpoint_url=args.endpoint_url, config=cfg)
    # Quick preflight: try a 0-key list to surface auth/endpoint issues fast
    try:
        _ = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=0)
    except ClientError as e:
        print(f"Preflight failed (auth/permissions/endpoint): {e}", file=sys.stderr)
        sys.exit(1)
    except (EndpointConnectionError, ReadTimeoutError, ConnectTimeoutError) as e:
        print(f"Network/endpoint error: {e}", file=sys.stderr)
        sys.exit(1)
    try:
        for key in iter_keys(s3, bucket, prefix, args.page_size, args.max_items, args.verbose):
            # --relative strips the prefix (and any leading slash) from each key.
            if args.relative and prefix and key.startswith(prefix):
                print(key[len(prefix):].lstrip("/"))
            else:
                print(f"s3://{bucket}/{key}")
    except KeyboardInterrupt:
        print("\nInterrupted.", file=sys.stderr)
        sys.exit(130)
    except NoCredentialsError:
        print("No AWS credentials found. Set env vars or use --profile.", file=sys.stderr)
        sys.exit(1)
    except (EndpointConnectionError, ReadTimeoutError, ConnectTimeoutError) as e:
        print(f"Network/timeout listing objects: {e}", file=sys.stderr)
        sys.exit(1)
    except ClientError as e:
        print(f"AWS error: {e}", file=sys.stderr)
        sys.exit(1)
# Script entry point.
if __name__ == "__main__":
    main()