This commit is contained in:
Grzegorz Michalski
2026-03-02 09:47:35 +01:00
commit 2c225d68ac
715 changed files with 130067 additions and 0 deletions

View File

@@ -0,0 +1,177 @@
import xmlschema
import hashlib
from lxml import etree
from typing import Dict, List
def validate_xml(xml_file, xsd_file):
    """Check an XML document against an XSD schema.

    An ``xmlschema.XMLSchema`` is built from ``xsd_file`` in strict mode and
    used to validate ``xml_file``. Failures are never raised to the caller;
    they are reported through the returned tuple instead.

    Parameters:
    - xml_file: The XML source handed to ``XMLSchema.validate``.
    - xsd_file: The schema source handed to ``xmlschema.XMLSchema``.

    Returns:
    - tuple: ``(True, message)`` when validation succeeds, otherwise
      ``(False, message)`` where the message names the kind of failure
      (validation error, schema error, or any other exception).
    """
    try:
        # Strict mode: any schema violation raises instead of being collected.
        xmlschema.XMLSchema(xsd_file, validation="strict").validate(xml_file)
    except xmlschema.validators.exceptions.XMLSchemaValidationError as exc:
        outcome = (False, f"XML validation error: {exc!s}")
    except xmlschema.validators.exceptions.XMLSchemaException as exc:
        outcome = (False, f"XML schema error: {exc!s}")
    except Exception as exc:
        # Catch-all boundary: e.g. unreadable files or malformed XML.
        outcome = (False, f"An error occurred during XML validation: {exc!s}")
    else:
        outcome = (True, "XML file is valid against the provided XSD schema.")
    return outcome
def _xml_node_text(node):
    """Return the stripped text of one XPath result item ('' for ``None``)."""
    if node is None:
        return ""
    if isinstance(node, etree._Element):
        return (node.text or "").strip()
    # Non-element results (attribute values, text() nodes, ...) are stringified.
    return str(node).strip()


def _xml_nearest_match(node, candidates):
    """Return ``node`` or its closest ancestor found in ``candidates``, else ``None``."""
    while node is not None:
        if node in candidates:
            return node
        node = node.getparent()
    return None


def _xml_indexed_path(elem):
    """Build an absolute path like ``/root[1]/item[3]`` with 1-based same-tag indices."""
    steps = []
    while elem is not None:
        preceding_same_tag = sum(
            1 for sib in elem.itersiblings(preceding=True) if sib.tag == elem.tag
        )
        steps.append(f"{elem.tag}[{preceding_same_tag + 1}]")
        elem = elem.getparent()
    return "/" + "/".join(reversed(steps))


def extract_data(
    filename,
    xpath_columns,  # List[(expr, header, is_key)]
    xml_position_columns,  # List[(expr, header)]
    namespaces,
    workflow_context,
    encoding_type="utf-8",
):
    """
    Parses an XML file using XPath expressions and extracts tabular data.

    Every non-key XPath is evaluated against the document root; the longest
    result list determines the number of rows and shorter lists are padded
    with empty values. Key XPaths contribute the text of their first match,
    repeated on every row. Each xml_position column holds a hash of the
    indexed path of the nearest ancestor-or-self (of the row's FIRST non-key
    node) that is selected by that column's XPath.

    Note: when there are no non-key columns, no rows are produced.

    Parameters:
    - filename: The XML source passed to ``etree.parse``.
    - xpath_columns (list): A list of tuples, each containing:
        - XPath expression (str)
        - CSV column header (str)
        - Indicator if the field is a key ('Y' or 'N'); tuples with any
          other indicator are ignored.
    - xml_position_columns (list): A list of ``(XPath expression, header)``
      tuples.
    - namespaces (dict): Namespace mapping needed for lxml's xpath()
    - workflow_context: Mapping providing ``"a_workflow_history_key"``, used
      as the salt for position hashes. Only read when a position is found.
    - encoding_type (str): Currently unused; kept for interface
      compatibility.

    Returns:
    - dict: ``{"headers": [...], "rows": [[...], ...]}``. Headers and row
      cells are ordered: non-key columns, key columns, xml_position columns.
    """
    # NOTE(review): the parser relies on lxml's defaults for entity handling;
    # confirm that is acceptable if input files can come from untrusted sources.
    parser = etree.XMLParser(remove_blank_text=True)
    root = etree.parse(filename, parser).getroot()

    # Separate out key vs nonkey columns
    key_cols = [(expr, header) for expr, header, flag in xpath_columns if flag == "Y"]
    nonkey_cols = [(expr, header) for expr, header, flag in xpath_columns if flag == "N"]

    # Evaluate every nonkey XPath and keep the result nodes, keyed by header.
    nonkey_elements = {
        header: root.xpath(expr, namespaces=namespaces)
        for expr, header in nonkey_cols
    }

    # The row count is the length of the longest nonkey result list.
    row_count = max((len(nodes) for nodes in nonkey_elements.values()), default=0)

    # Pad every shorter nonkey list with None so all lists can be indexed by row.
    for nodes in nonkey_elements.values():
        if len(nodes) < row_count:
            nodes.extend([None] * (row_count - len(nodes)))

    # Key columns: text of the first match (or ""), identical on every row.
    key_values = []
    for expr, _header in key_cols:
        nodes = root.xpath(expr, namespaces=namespaces)
        key_values.append(_xml_node_text(nodes[0]) if nodes else "")

    # Candidate nodes for each xml_position column.
    xml_positions = {
        header: root.xpath(expr, namespaces=namespaces)
        for expr, header in xml_position_columns
    }

    headers = (
        [header for _, header in nonkey_cols]
        + [header for _, header in key_cols]
        + [header for _, header in xml_position_columns]
    )

    rows = []
    for i in range(row_count):
        # row_count > 0 implies at least one nonkey column exists, so
        # nonkey_cols[0] is always available inside this loop.
        row = [_xml_node_text(nonkey_elements[header][i]) for _, header in nonkey_cols]
        row.extend(key_values)

        # Positions are always anchored on the first nonkey column's node.
        anchor = nonkey_elements[nonkey_cols[0][1]][i]
        for _, header in xml_position_columns:
            match = _xml_nearest_match(anchor, xml_positions[header])
            # Compare against None explicitly: an lxml element without
            # children is falsy, so a truthiness test would wrongly discard
            # matched leaf elements.
            if match is None:
                row.append("")
            else:
                row.append(
                    _xml_pos_hasher(
                        _xml_indexed_path(match),
                        workflow_context["a_workflow_history_key"],
                    )
                )
        rows.append(row)

    return {"headers": headers, "rows": rows}
def _xml_pos_hasher(input_string, salt, hash_length=15):
    """
    Derive a deterministic integer from an XML position string and a salt.

    The text ``"<salt>:<input_string>"`` is hashed with SHA-256, the digest is
    read as one unsigned integer, and the leading ``hash_length`` decimal
    digits of that integer are returned as an ``int``. No padding is applied,
    so the result has fewer digits when the full value is shorter than
    ``hash_length``.

    Parameters:
    input_string (str): The string to hash.
    salt: Value prefixed to the input (formatted with an f-string) so the
        same position hashes differently under a different salt.
    hash_length (int): Number of leading decimal digits to keep (default 15).

    Returns:
    int: The truncated decimal value of the SHA-256 digest.

    Raises:
    ValueError: If ``hash_length`` is not positive.
    """
    if hash_length <= 0:
        raise ValueError("Hash length must be a positive integer.")

    digest = hashlib.sha256(f"{salt}:{input_string}".encode()).digest()
    # Big-endian interpretation of the raw digest equals int(hexdigest, 16).
    decimal_digits = str(int.from_bytes(digest, "big"))
    return int(decimal_digits[:hash_length])