init
This commit is contained in:
0
python/devo_replicator/data_replicator/.gitkeep
Normal file
0
python/devo_replicator/data_replicator/.gitkeep
Normal file
65
python/devo_replicator/data_replicator/diag_s3_access.py
Normal file
65
python/devo_replicator/data_replicator/diag_s3_access.py
Normal file
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys, json
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError
|
||||
from botocore.config import Config
|
||||
|
||||
BUCKET = "devo-crp-sbc9vbsu"
|
||||
PREFIX = "mopdb/db/" # adjust if needed
|
||||
|
||||
def show(e):
    """Dump the structured AWS error attached to *e* (if any) to stderr.

    botocore exceptions carry a ``response`` dict; anything without one
    simply prints ``None`` fields.
    """
    response = getattr(e, "response", {})
    error = response.get("Error", {})
    meta = response.get("ResponseMetadata", {})
    code, msg = error.get("Code"), error.get("Message")
    rid = meta.get("RequestId")
    print(f"{type(e).__name__}: {code} {msg} (RequestId={rid})", file=sys.stderr)
|
||||
|
||||
def main(endpoint_url=None, region=None, force_path=False):
    """Run a staged S3 access diagnostic against BUCKET/PREFIX.

    Checks, in order: STS identity, bucket reachability, ListBucket
    permission, then an actual one-key listing. Returns 0 on success or a
    distinct non-zero code identifying the first failing stage.
    """
    session = boto3.Session()
    addressing = "path" if force_path else "auto"
    client_config = Config(s3={"addressing_style": addressing})
    s3_client = session.client("s3", region_name=region, endpoint_url=endpoint_url, config=client_config)
    sts_client = session.client("sts", region_name=region)

    # Stage 1: who am I?
    try:
        ident = sts_client.get_caller_identity()
        print(f"Caller: {ident['Arn']} (acct {ident['Account']})")
    except Exception as e:
        print("Could not call STS get-caller-identity — credentials not valid for STS.", file=sys.stderr)
        show(e)
        return 1

    # Stage 2: is the bucket reachable at all?
    try:
        s3_client.head_bucket(Bucket=BUCKET)
        print(f"head_bucket OK on s3://{BUCKET}")
    except ClientError as e:
        print("head_bucket failed:", file=sys.stderr)
        show(e)
        return 2

    # Stage 3: zero-key list to probe just the ListBucket permission.
    try:
        s3_client.list_objects_v2(Bucket=BUCKET, Prefix=PREFIX, MaxKeys=0)
        print(f"list_objects_v2 OK on prefix '{PREFIX}' (permission exists)")
    except ClientError as e:
        print("list_objects_v2 failed:", file=sys.stderr)
        show(e)
        return 3

    # Stage 4: fetch a single key to confirm the data path works end to end.
    try:
        listing = s3_client.list_objects_v2(Bucket=BUCKET, Prefix=PREFIX, MaxKeys=1)
        print("First key:", listing.get("Contents", [{}])[0].get("Key"))
    except ClientError as e:
        print("list_objects_v2 (MaxKeys=1) failed:", file=sys.stderr)
        show(e)
        return 4

    return 0
|
||||
|
||||
if __name__ == "__main__":
    # Minimal hand-rolled CLI parsing.
    # Optional args: --endpoint-url URL --region eu-central-1 --force-path
    url = None
    reg = None
    force = False
    argv = sys.argv
    for i, a in enumerate(argv):
        # BUG FIX: guard i + 1 so a flag supplied without its value exits
        # with a clear message instead of crashing with IndexError.
        if a == "--endpoint-url":
            if i + 1 >= len(argv):
                sys.exit("--endpoint-url requires a value")
            url = argv[i + 1]
        elif a == "--region":
            if i + 1 >= len(argv):
                sys.exit("--region requires a value")
            reg = argv[i + 1]
        elif a == "--force-path":
            force = True
    sys.exit(main(endpoint_url=url, region=reg, force_path=force))
|
||||
129
python/devo_replicator/data_replicator/impala_refresher.py
Normal file
129
python/devo_replicator/data_replicator/impala_refresher.py
Normal file
@@ -0,0 +1,129 @@
|
||||
import os
|
||||
import yaml
|
||||
import datetime
|
||||
import pandas as pd
|
||||
from mrds.utils.secrets import get_secret
|
||||
import mrds.utils.manage_runs as runManager
|
||||
import mrds.utils.manage_files as fileManager
|
||||
import mrds.utils.sql_statements as sqls
|
||||
|
||||
import oci
|
||||
|
||||
from impala.dbapi import (
|
||||
connect,
|
||||
ProgrammingError,
|
||||
DatabaseError,
|
||||
IntegrityError,
|
||||
OperationalError,
|
||||
)
|
||||
from impala.error import HiveServer2Error
|
||||
|
||||
|
||||
def get_impala_connection(hostname: str, user: str, secret: str):
    """Open an Impala connection over the HiveServer2 HTTP transport.

    Connects to *hostname* on port 443 with TLS, authenticating via PLAIN
    with the supplied user/secret pair.
    """
    connection_kwargs = dict(
        host=hostname,
        port=443,
        auth_mechanism="PLAIN",
        user=user,
        password=secret,
        use_http_transport=True,
        http_path="cliservice",
        use_ssl=True,
    )
    return connect(**connection_kwargs)
|
||||
|
||||
def execute_query(query: str, user, hostname, password):
    """Open an Impala connection, run *query*, and return (columns, result).

    BUG FIX: removed a leftover ``print(conn)`` debug line that wrote the
    raw connection object to stdout on every call.

    NOTE: execute_devo_query closes the connection in its ``finally``
    block, so no cleanup is needed here.
    """
    conn = get_impala_connection(hostname, user, password)
    columns, result = execute_devo_query(query, conn)
    return columns, result
|
||||
|
||||
def execute_devo_query(query: str, conn):
    """Execute *query* on an open Impala connection and return (None, rowcount).

    This helper is used for statement-style queries (INVALIDATE METADATA /
    COMPUTE STATS), so no result-set columns are fetched; rowcount is the
    number of rows affected as reported by the driver.

    The cursor and connection are always closed in the ``finally`` block.
    Raises Exception with a descriptive message on any failure, chaining
    the original error so the traceback is preserved.
    """
    cursor = conn.cursor()
    try:
        cursor.execute(query)
        return None, cursor.rowcount
    # BUG FIX: the builtin Exception takes no keyword arguments, so every
    # original `raise Exception(status_code=..., detail=...)` crashed with
    # `TypeError: Exception() takes no keyword arguments`, masking the real
    # error. The same information is now encoded in the message, and the
    # original exception is chained with `from`.
    except OperationalError as oe:
        raise Exception("Failed to connect to Impala: " + str(oe)) from oe
    except ProgrammingError as pe:
        raise Exception("Query syntax error: " + str(pe)) from pe
    except IntegrityError as ie:
        raise Exception("Insufficient permissions: " + str(ie)) from ie
    except DatabaseError as db_err:
        raise Exception("Database error: " + str(db_err)) from db_err
    except HiveServer2Error as au_err:
        raise Exception("HiveServer2Error error: " + str(au_err)) from au_err
    except Exception as e:
        raise Exception("An unexpected error occurred: " + str(e)) from e
    finally:
        # Best-effort cleanup; a close failure is surfaced as its own error.
        try:
            if cursor:
                cursor.close()
            if conn:
                conn.close()
        except Exception as e:
            raise Exception("Failed to close the connection: " + str(e)) from e
|
||||
|
||||
def initialize_task(workflow_context, task_name):
    """Register *task_name* with the run manager and return its history key."""
    run_id = workflow_context["run_id"]
    workflow_history_key = workflow_context["a_workflow_history_key"]
    return runManager.init_task(task_name, run_id, workflow_history_key)
|
||||
|
||||
def initialize_config(config_file_path):
    """Load and return the YAML configuration at *config_file_path*.

    Raises FileNotFoundError when the path does not exist.
    """
    if not os.path.exists(config_file_path):
        raise FileNotFoundError(f"Configuration file {config_file_path} not found.")
    with open(config_file_path, "r") as config_handle:
        return yaml.safe_load(config_handle)
|
||||
|
||||
def main(env_config_path, env, table, corporate_store):
    """Refresh Impala metadata and stats for ``<corporate_store>.<table>``.

    Loads per-environment settings from *env_config_path*, resolves the
    Devo credentials from the secret store, then runs INVALIDATE METADATA
    followed by COMPUTE STATS on the table. Returns True on success.

    Raises Exception (with the original error chained) on failure.
    """
    # Init setup: pick the section for the requested environment.
    envs_info = initialize_config(env_config_path)
    environment_info = envs_info[env]

    try:
        devo_secret_name = environment_info["DEVO_SECRET"]
        password = get_secret(devo_secret_name)
    except Exception as e:
        # BUG FIX: was a bare `except:` followed by `raise(Exception)`,
        # which also swallowed KeyboardInterrupt/SystemExit and discarded
        # the original error. Chain the cause so the traceback survives.
        print("Failed to retrieve credentials from secrets")
        raise Exception("Failed to retrieve credentials from secrets") from e

    # Refresh the table's metadata, then recompute its stats.
    try:
        user = environment_info["DEVO_USERNAME"]
        host = environment_info["IMPALA_HOSTNAME"]
        execute_query(f"INVALIDATE METADATA {corporate_store}.{table}", user, host, password)
        execute_query(f"COMPUTE STATS {corporate_store}.{table}", user, host, password)
    except Exception as e:
        # BUG FIX: same bare-except / `raise(Exception)` pattern as above.
        print("Failed to retrieve DEVO data, error during connection or request")
        raise Exception("Failed to retrieve DEVO data, error during connection or request") from e
    return True
|
||||
|
||||
128
python/devo_replicator/data_replicator/list_s3_files_fast.py
Normal file
128
python/devo_replicator/data_replicator/list_s3_files_fast.py
Normal file
@@ -0,0 +1,128 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse, sys
|
||||
from urllib.parse import urlparse
|
||||
import boto3
|
||||
from botocore.config import Config
|
||||
from botocore.exceptions import ClientError, EndpointConnectionError, NoCredentialsError, ReadTimeoutError, ConnectTimeoutError
|
||||
|
||||
def parse_s3_uri(s3_uri: str):
    """Split an ``s3://bucket/key`` URI into ``(bucket, key_prefix)``.

    Raises ValueError when the scheme is wrong or the bucket is missing.
    """
    if not s3_uri.startswith("s3://"):
        raise ValueError("S3 URI must start with 's3://'")
    parsed = urlparse(s3_uri)
    bucket = parsed.netloc
    if not bucket:
        raise ValueError("Missing bucket in S3 URI")
    prefix = parsed.path.lstrip("/")
    return bucket, prefix
|
||||
|
||||
def parse_location(location: str):
    """Accept s3://... OR https://...amazonaws.com/... and return (bucket, prefix)."""
    if location.startswith("s3://"):
        # s3:// form (same contract as parse_s3_uri, inlined here).
        parsed = urlparse(location)
        if not parsed.netloc:
            raise ValueError("Missing bucket in S3 URI")
        return parsed.netloc, parsed.path.lstrip("/")

    if location.startswith(("http://", "https://")):
        parsed = urlparse(location)
        host = parsed.netloc
        path = parsed.path.lstrip("/")
        if ".bucket." in host:
            # Bucket-scoped VPCe host: <bucket>.bucket.vpce-xxxx.s3.<region>.vpce.amazonaws.com
            return host.split(".bucket.", 1)[0], path
        if ".s3." in host and not host.startswith("s3."):
            # Virtual-hosted: <bucket>.s3.<region>...
            return host.split(".s3.", 1)[0], path
        if host.startswith("s3."):
            # Path-style: s3.<region>.../<bucket>/...
            bucket, _, prefix = path.partition("/")
            return bucket, prefix

    raise ValueError(f"Unsupported location: {location}")
|
||||
|
||||
def iter_keys(s3, bucket: str, prefix: str, page_size: int, max_items: int, verbose: bool):
    """Yield object keys under bucket/prefix via list_objects_v2 pagination.

    page_size caps keys per API call; max_items (0 = no limit) caps the
    total via the paginator's MaxItems. When verbose, per-page progress is
    written to stderr so stdout stays clean for the key listing itself.
    """
    # BUG FIX: removed a leftover `print('here')` debug line — it wrote to
    # stdout and corrupted the key listing this script emits there.
    paginator = s3.get_paginator("list_objects_v2")
    kwargs = {"Bucket": bucket, "Prefix": prefix}
    pagination = {"PageSize": page_size}
    if max_items > 0:
        pagination["MaxItems"] = max_items

    total = 0
    page_num = 0
    for page in paginator.paginate(**kwargs, PaginationConfig=pagination):
        page_num += 1
        # `or []` guards against an explicit Contents: None in the page.
        contents = page.get("Contents", []) or []
        if verbose:
            print(f"[page {page_num}] fetched {len(contents)} keys (running total={total + len(contents)})",
                  file=sys.stderr, flush=True)
        for obj in contents:
            yield obj["Key"]
            total += 1
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, build an S3 client, stream keys to stdout."""
    parser = argparse.ArgumentParser(description="List files under an S3 location quickly and safely.")
    parser.add_argument("location", help="s3://bucket/prefix/ OR https://<vpc-endpoint-host>/<prefix>")
    parser.add_argument("--region", default=None, help="AWS region (e.g., eu-central-1)")
    parser.add_argument("--profile", default=None, help="AWS profile to use")
    parser.add_argument("--endpoint-url", default=None,
                        help="Custom S3 endpoint (e.g., https://s3.eu-central-1.vpce.amazonaws.com)")
    parser.add_argument("--force-path-addressing", action="store_true",
                        help="Force path-style addressing (useful with bucket-scoped VPCe hostnames)")
    parser.add_argument("--page-size", type=int, default=1000, help="S3 page size (default 1000)")
    parser.add_argument("--max-items", type=int, default=0, help="Stop after N keys (0 = no limit)")
    parser.add_argument("--connect-timeout", type=float, default=10.0, help="Seconds (default 10)")
    parser.add_argument("--read-timeout", type=float, default=30.0, help="Seconds (default 30)")
    parser.add_argument("--retries", type=int, default=3, help="Max retry attempts (default 3)")
    parser.add_argument("--relative", action="store_true", help="Print keys relative to the prefix")
    parser.add_argument("--verbose", "-v", action="store_true", help="Print progress to stderr")
    args = parser.parse_args()

    bucket, prefix = parse_location(args.location)

    # Session & client with explicit timeouts and optional path addressing.
    session = boto3.Session(profile_name=args.profile) if args.profile else boto3.Session()
    addressing = "path" if args.force_path_addressing else "auto"
    client_config = Config(
        connect_timeout=args.connect_timeout,
        read_timeout=args.read_timeout,
        retries={"max_attempts": args.retries, "mode": "standard"},
        s3={"addressing_style": addressing},
    )
    s3 = session.client("s3", region_name=args.region, endpoint_url=args.endpoint_url,
                        config=client_config)

    # Quick preflight: a zero-key list surfaces auth/endpoint issues fast.
    try:
        s3.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=0)
    except ClientError as e:
        print(f"Preflight failed (auth/permissions/endpoint): {e}", file=sys.stderr)
        sys.exit(1)
    except (EndpointConnectionError, ReadTimeoutError, ConnectTimeoutError) as e:
        print(f"Network/endpoint error: {e}", file=sys.stderr)
        sys.exit(1)

    # Stream keys to stdout; progress (when --verbose) goes to stderr.
    try:
        for key in iter_keys(s3, bucket, prefix, args.page_size, args.max_items, args.verbose):
            if args.relative and prefix and key.startswith(prefix):
                print(key[len(prefix):].lstrip("/"))
            else:
                print(f"s3://{bucket}/{key}")
    except KeyboardInterrupt:
        print("\nInterrupted.", file=sys.stderr)
        sys.exit(130)
    except NoCredentialsError:
        print("No AWS credentials found. Set env vars or use --profile.", file=sys.stderr)
        sys.exit(1)
    except (EndpointConnectionError, ReadTimeoutError, ConnectTimeoutError) as e:
        print(f"Network/timeout listing objects: {e}", file=sys.stderr)
        sys.exit(1)
    except ClientError as e:
        print(f"AWS error: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
# Script entry point: main() parses the CLI args and exits via sys.exit.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user