init

2026-03-02 09:47:35 +01:00
commit 2c225d68ac
715 changed files with 130067 additions and 0 deletions
--- a/python/mrds_common/mrds/utils/xml_utils.py
+++ b/python/mrds_common/mrds/utils/xml_utils.py
@@ -0,0 +1,177 @@
+import xmlschema
+import hashlib
+from lxml import etree
+from typing import Dict, List
+
+
+def validate_xml(xml_file, xsd_file):
+    """
+    Check an XML document against an XSD schema.
+
+    The schema is loaded with ``validation="strict"`` and the document is
+    validated against it. Failures are reported through the return value
+    instead of being raised.
+
+    Parameters:
+    - xml_file: The XML source handed to ``XMLSchema.validate``.
+    - xsd_file: The XSD source handed to ``xmlschema.XMLSchema``.
+
+    Returns:
+    - tuple: ``(is_valid, message)`` where ``is_valid`` is a bool and
+      ``message`` is a human-readable description of the outcome.
+    """
+    try:
+        # Loading the schema and validating both happen inside the try so
+        # that a broken XSD is reported the same way as a broken XML file.
+        xmlschema.XMLSchema(xsd_file, validation="strict").validate(xml_file)
+    except xmlschema.validators.exceptions.XMLSchemaValidationError as err:
+        # The document does not conform to the schema.
+        outcome = (False, f"XML validation error: {err!s}")
+    except xmlschema.validators.exceptions.XMLSchemaException as err:
+        # Any other xmlschema-specific failure.
+        outcome = (False, f"XML schema error: {err!s}")
+    except Exception as err:
+        # Anything else (unreadable file, malformed XML, ...).
+        outcome = (False, f"An error occurred during XML validation: {err!s}")
+    else:
+        outcome = (True, "XML file is valid against the provided XSD schema.")
+    return outcome
+
+
+def extract_data(
+    filename,
+    xpath_columns,         # List[(expr, header, is_key)]
+    xml_position_columns,  # List[(expr, header)]
+    namespaces,
+    workflow_context,
+    encoding_type="utf-8",
+):
+    """
+    Parse an XML file with XPath expressions and return the data as rows.
+
+    Row layout: one row is produced per index of the longest non-key XPath
+    result. Each row holds the non-key values first, then the key values
+    (identical on every row), then one value per XML position column.
+
+    XML position columns: for every row the node matched by the FIRST
+    non-key column is used as the starting point. Its ancestors (including
+    the node itself) are walked upwards until one of them is part of the
+    position XPath result. The indexed path of that node (for example
+    ``/root[1]/item[3]``) is hashed with ``_xml_pos_hasher`` using
+    ``workflow_context["a_workflow_history_key"]`` as salt. If no such node
+    exists the cell is an empty string.
+
+    Parameters:
+    - filename (str): The path to the XML file to parse.
+    - xpath_columns (list): A list of tuples, each containing:
+        - XPath expression (str)
+        - CSV column header (str)
+        - Indicator if the field is a key ('Y' or 'N'); entries with any
+          other indicator are ignored.
+    - xml_position_columns (list): A list of ``(XPath expression, header)``
+      tuples describing the position columns.
+    - namespaces (dict): Namespace mapping needed for lxml's xpath()
+    - workflow_context (dict): Must provide ``"a_workflow_history_key"``
+      whenever a position hash has to be computed.
+    - encoding_type (str): Accepted for interface compatibility; it is not
+      used by this function.
+
+    Returns:
+    - dict: ``{"headers": [...], "rows": [[...], ...]}`` with the extracted
+      data. All text values are stripped of surrounding whitespace.
+    """
+
+    parser = etree.XMLParser(remove_blank_text=True)
+    root = etree.parse(filename, parser).getroot()
+
+    # Separate out key vs non-key columns
+    key_cols = [(expr, h) for expr, h, k in xpath_columns if k == "Y"]
+    nonkey_cols = [(expr, h) for expr, h, k in xpath_columns if k == "N"]
+
+    # Evaluate every non-key XPath and keep the matched nodes
+    nonkey_elements = {
+        header: root.xpath(expr, namespaces=namespaces)
+        for expr, header in nonkey_cols
+    }
+
+    # The number of rows is the length of the longest non-key result.
+    row_count = max((len(lst) for lst in nonkey_elements.values()), default=0)
+
+    # Pad every shorter non-key result with None so all can be indexed alike.
+    for lst in nonkey_elements.values():
+        lst.extend([None] * (row_count - len(lst)))
+
+    # Key columns: only the first match counts and it is repeated on all rows.
+    key_values = []
+    for expr, _header in key_cols:
+        nodes = root.xpath(expr, namespaces=namespaces)
+        key_values.append(_node_text(nodes[0]) if nodes else "")
+
+    # XML position columns: keep the matches as a set so the ancestor walk
+    # below does a constant-time membership test instead of scanning a list.
+    position_targets = {}
+    for expr, header in xml_position_columns:
+        matched = root.xpath(expr, namespaces=namespaces)
+        position_targets[header] = (
+            set(matched) if isinstance(matched, list) else matched
+        )
+
+    headers = (
+        [h for _, h in nonkey_cols]
+        + [h for _, h in key_cols]
+        + [h for _, h in xml_position_columns]
+    )
+
+    # Nodes of the first non-key column anchor the position lookup. When
+    # there are no non-key columns row_count is 0 and the loop never runs.
+    anchor_nodes = nonkey_elements[nonkey_cols[0][1]] if nonkey_cols else []
+
+    rows = []
+    for i in range(row_count):
+        # non-key data
+        row = [_node_text(nonkey_elements[header][i]) for _, header in nonkey_cols]
+
+        # key columns
+        row.extend(key_values)
+
+        # xml_position columns
+        anchor = anchor_nodes[i]
+        for _, header in xml_position_columns:
+            if anchor is None:
+                row.append("")
+                continue
+
+            targets = position_targets[header]
+            found = None
+            current = anchor
+            while current is not None:
+                if current in targets:
+                    found = current
+                    break
+                current = current.getparent()
+
+            # Compare against None explicitly: an lxml element without
+            # children is falsy, so a truthiness test would wrongly discard
+            # a matched leaf element.
+            if found is None:
+                row.append("")
+            else:
+                row.append(
+                    _xml_pos_hasher(
+                        _indexed_path(found),
+                        workflow_context["a_workflow_history_key"],
+                    )
+                )
+
+        rows.append(row)
+
+    return {"headers": headers, "rows": rows}
+
+
+def _node_text(node):
+    """Return the stripped text of an XPath result item ("" for None)."""
+    if node is None:
+        return ""
+    if isinstance(node, etree._Element):
+        return (node.text or "").strip()
+    # Attribute / text() results are string-like objects.
+    return str(node).strip()
+
+
+def _indexed_path(elem):
+    """Return the path from the root to *elem* as ``/tag[n]/tag[n]...``."""
+    steps = []
+    node = elem
+    while node is not None:
+        # 1-based position among preceding siblings that share the same tag
+        idx = 1 + sum(
+            1 for s in node.itersiblings(preceding=True) if s.tag == node.tag
+        )
+        steps.append(f"{node.tag}[{idx}]")
+        node = node.getparent()
+    return "/" + "/".join(reversed(steps))
+
+
+def _xml_pos_hasher(input_string, salt, hash_length=15):
+    """
+    Derive a deterministic integer from an XML position string and a salt.
+
+    The text ``"<salt>:<input_string>"`` is hashed with SHA-256, the digest
+    is read as one big integer, and the leading ``hash_length`` decimal
+    digits of that integer are returned.
+
+    Parameters:
+        input_string (str): The string to hash.
+        salt: Value prepended to the input (formatted with ``str``) so the
+            same position yields a different hash for a different salt.
+        hash_length (int): Number of leading decimal digits to keep
+            (default is 15).
+
+    Returns:
+        int: The leading digits of the hash as an integer. No padding is
+        applied, so the result is shorter than ``hash_length`` digits when
+        the full hash itself has fewer decimal digits.
+
+    Raises:
+        ValueError: If ``hash_length`` is zero or negative.
+    """
+    if hash_length <= 0:
+        raise ValueError("Hash length must be a positive integer.")
+
+    # Salt and payload are joined with ':' before hashing.
+    digest = hashlib.sha256(f"{salt}:{input_string}".encode()).digest()
+
+    # Big-endian interpretation of the digest equals int(hexdigest, 16).
+    decimal_digits = str(int.from_bytes(digest, "big"))
+
+    return int(decimal_digits[:hash_length])