# mars-elt/python/mrds_common/mrds/utils/utils.py

import re


def parse_uri_with_regex(uri):
    """
    Break an Oracle Object Storage URI into its components.

    The URI is expected to look like
    '/n/{namespace}/b/{bucketname}/o/{object_path}'. The object path is cut
    at its last '/' into a prefix (which keeps the trailing '/') and an
    object name. When the object path contains no '/', the prefix is the
    empty string and the whole path is returned as the object name.

    Parameters:
        uri (str): The URI string to parse.

    Returns:
        tuple: A tuple containing (namespace, bucket_name, prefix, object_name)

    Raises:
        ValueError: If the URI does not match the expected format.
    """
    parsed = re.match(r"^/n/([^/]+)/b/([^/]+)/o/(.*)$", uri)
    if parsed is None:
        raise ValueError("Invalid URI format")

    namespace, bucket_name, object_path = parsed.groups()

    # rpartition cuts at the last '/'. Without any '/', both the head and the
    # separator come back empty, so head + separator is "" (no prefix);
    # otherwise it is the directory part with its trailing '/' restored.
    directory, slash, object_name = object_path.rpartition("/")
    prefix = directory + slash

    return namespace, bucket_name, prefix, object_name


def parse_output_columns(output_columns):
    """
    Sort output column definitions into per-type lists.

    Each entry is read through its "type" and "column_header" keys; depending
    on the type, "value" and "is_key" are read as well. Every column header is
    recorded in the column order, including headers of entries whose type is
    not one of the handled ones (such entries are otherwise skipped).

    Parameters:
        output_columns (iterable): Column definition entries, each indexable
            by the string keys described above.

    Returns:
        tuple: Seven lists, in this order:
            - xpath entries as (value, column_header, is_key)
            - csv_header entries as (column_header, value)
            - static entries as (column_header, value)
            - a_key column headers
            - workflow_key column headers
            - xpath_element_id entries as (value, column_header)
            - all column headers in input order
    """
    xpath_cols, csv_cols, static_cols = [], [], []
    a_key_cols, workflow_key_cols, element_id_cols = [], [], []
    headers_in_order = []

    for spec in output_columns:
        kind = spec["type"]
        header = spec["column_header"]
        headers_in_order.append(header)

        if kind == "xpath":
            xpath_cols.append((spec["value"], header, spec["is_key"]))
            continue

        if kind == "csv_header":
            csv_cols.append((header, spec["value"]))
            continue

        if kind == "static":
            static_cols.append((header, spec["value"]))
            continue

        if kind == "a_key":
            a_key_cols.append(header)
            continue

        if kind == "workflow_key":
            workflow_key_cols.append(header)
            continue

        # TODO - update all xml_position namings to xpath_element_id
        if kind == "xpath_element_id":
            element_id_cols.append((spec["value"], header))

    return (
        xpath_cols,
        csv_cols,
        static_cols,
        a_key_cols,
        workflow_key_cols,
        element_id_cols,
        headers_in_order,
    )