init
This commit is contained in:
211
python/mrds_common/mrds/processors/base.py
Normal file
211
python/mrds_common/mrds/processors/base.py
Normal file
@@ -0,0 +1,211 @@
|
||||
import logging
|
||||
import os
|
||||
import csv
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from mrds.utils.utils import parse_output_columns
|
||||
|
||||
from mrds.utils import (
|
||||
manage_files,
|
||||
manage_runs,
|
||||
objectstore,
|
||||
static_vars,
|
||||
)
|
||||
|
||||
|
||||
# Pattern for per-task output CSV names: "<output_table>-<task_history_key>.csv".
OUTPUT_FILENAME_TEMPLATE = "{output_table}-{task_history_key}.csv"
# TODO: duplicated alias of static_vars.status_success — move to one shared place.
STATUS_SUCCESS = static_vars.status_success
|
||||
|
||||
|
||||
class TaskProcessor(ABC):
    """Template-method base class for tasks that produce and ship a CSV.

    Subclasses implement ``_extract`` to write the raw CSV to
    ``self.output_filepath``; ``process`` then drives the fixed pipeline:
    extract -> enrich -> upload -> remote-process -> finalize.
    """

    def __init__(self, global_config, task_conf, client, workflow_context):
        """Store collaborators, then run the common and subclass init hooks.

        Args:
            global_config: global settings; this class reads ``tmpdir``,
                ``encoding_type``, ``bucket`` and ``bucket_namespace``.
            task_conf: per-task settings; this class reads ``task_name``,
                ``output_table``, ``output_columns`` and ``ods_prefix``.
            client: object-store client handed to the ``objectstore`` helpers.
            workflow_context: mapping providing ``run_id`` and
                ``a_workflow_history_key``.
        """
        self.global_config = global_config
        self.task_conf = task_conf
        self.client = client
        self.workflow_context = workflow_context
        self._init_common()
        self._post_init()

    def _init_common(self):
        """Register the task run, derive output paths, parse column config."""
        self.a_task_history_key = manage_runs.init_task(
            self.task_conf.task_name,
            self.workflow_context["run_id"],
            self.workflow_context["a_workflow_history_key"],
        )
        logging.info(f"Task initialized with history key: {self.a_task_history_key}")

        # Output CSV lives under the configured temp dir, keyed by this
        # task run's history key so concurrent runs never collide.
        self.output_filename = OUTPUT_FILENAME_TEMPLATE.format(
            output_table=self.task_conf.output_table,
            task_history_key=self.a_task_history_key,
        )
        self.output_filepath = os.path.join(
            self.global_config.tmpdir, self.output_filename
        )

        # Split the configured output_columns into per-kind entry lists plus
        # the desired final column order.
        (
            self.xpath_entries,
            self.csv_entries,
            self.static_entries,
            self.a_key_entries,
            self.workflow_key_entries,
            self.xml_position_entries,
            self.column_order,
        ) = parse_output_columns(self.task_conf.output_columns)

    def _post_init(self):
        """Optional hook for subclasses to override."""

    @abstractmethod
    def _extract(self):
        """Non-optional hook: write the raw CSV to ``self.output_filepath``."""

    def _build_headers(self, original_headers):
        """Compute the enriched header layout.

        Returns a tuple ``(headers, header_to_index, out_indices)`` where
        ``headers`` is the original header list extended with any missing
        static / A-key / workflow-key columns, ``header_to_index`` maps each
        header to its position in ``headers``, and ``out_indices`` lists the
        positions to emit, in ``self.column_order`` order (columns absent
        from ``headers`` are silently dropped).
        """
        headers = list(original_headers)

        # Append enrichment column headers only when not already present.
        for col_name, _ in self.static_entries:
            if col_name not in headers:
                headers.append(col_name)
        for col_name in self.a_key_entries:
            if col_name not in headers:
                headers.append(col_name)
        for col_name in self.workflow_key_entries:
            if col_name not in headers:
                headers.append(col_name)

        # Rearrange headers to the desired order.
        header_to_index = {h: i for i, h in enumerate(headers)}
        out_indices = [
            header_to_index[h] for h in self.column_order if h in header_to_index
        ]
        return headers, header_to_index, out_indices

    def _enrich(self):
        """
        Stream-based enrich: read one row at a time, append static/A-key/workflow-key,
        reorder columns, and write out immediately.

        Writes to a sibling ``.tmp`` file which atomically replaces the
        original on success; on failure the temp file is removed so no
        half-written output is left behind.
        """
        # A-key = task_history_key * 1e9 + 1-based row number, i.e. unique
        # across tasks as long as a task emits fewer than 1e9 rows.
        TASK_HISTORY_MULTIPLIER = 1_000_000_000

        logging.info(f"Enriching CSV file at '{self.output_filepath}'")

        temp_output = self.output_filepath + ".tmp"
        encoding = self.global_config.encoding_type

        try:
            with open(self.output_filepath, newline="", encoding=encoding) as inf, open(
                temp_output, newline="", encoding=encoding, mode="w"
            ) as outf:

                reader = csv.reader(inf)
                writer = csv.writer(outf, quoting=csv.QUOTE_ALL)

                # Read the original header and derive the enriched layout.
                original_headers = next(reader)
                headers, header_to_index, out_indices = self._build_headers(
                    original_headers
                )

                # Write the new header.
                writer.writerow([headers[i] for i in out_indices])

                # Loop invariants, hoisted out of the per-row work.
                base_task_history = (
                    int(self.a_task_history_key) * TASK_HISTORY_MULTIPLIER
                )
                wf_val = self.workflow_context["a_workflow_history_key"]
                row_count = 0

                # Stream each row, enrich in-place, reorder, and write.
                for i, in_row in enumerate(reader, start=1):
                    # Working list in `headers` order; enrichment-only columns
                    # start as None (csv renders None as '').
                    work_row = [None] * len(headers)
                    for j, h in enumerate(original_headers):
                        work_row[header_to_index[h]] = in_row[j]

                    # Fill static columns.
                    for col_name, value in self.static_entries:
                        work_row[header_to_index[col_name]] = value

                    # Fill A-key columns.
                    for col_name in self.a_key_entries:
                        work_row[header_to_index[col_name]] = str(
                            base_task_history + i
                        )

                    # Fill workflow key columns.
                    for col_name in self.workflow_key_entries:
                        work_row[header_to_index[col_name]] = wf_val

                    # Reorder to output order and write.
                    writer.writerow([work_row[j] for j in out_indices])
                    row_count += 1
        except Exception:
            # Don't leave a half-written temp file behind on failure.
            if os.path.exists(temp_output):
                os.remove(temp_output)
            raise

        # Atomically replace the original with the enriched file.
        os.replace(temp_output, self.output_filepath)
        logging.info(
            f"CSV file enriched at '{self.output_filepath}', {row_count} rows generated"
        )

    def _upload(self):
        """Upload the enriched CSV to the object store."""
        logging.info(
            f"Uploading CSV file to '{self.global_config.bucket}/{self.task_conf.ods_prefix}/{self.output_filename}'"
        )
        objectstore.upload_file(
            self.client,
            self.output_filepath,
            self.global_config.bucket_namespace,
            self.global_config.bucket,
            self.task_conf.ods_prefix,
            self.output_filename,
        )
        logging.info(
            f"CSV file uploaded to '{self.global_config.bucket}/{self.task_conf.ods_prefix}/{self.output_filename}'"
        )

    def _process_remote(self):
        """Trigger server-side processing of the uploaded file.

        On failure the uploaded object is deleted so no orphaned file is
        left in the bucket, then the original exception is re-raised.
        """
        logging.info(f"Processing source file '{self.output_filename}' with CT_MRDS.FILE_MANAGER.PROCESS_SOURCE_FILE database function.")
        try:
            manage_files.process_source_file(
                self.task_conf.ods_prefix, self.output_filename
            )
        except Exception:
            # logging.exception keeps the traceback with the failure message.
            logging.exception(
                f"Processing source file '{self.output_filename}' failed. Cleaning up..."
            )
            # NOTE(review): filename is the 2nd positional here but the last
            # positional in upload_file above — confirm against the
            # objectstore.delete_file signature.
            objectstore.delete_file(
                self.client,
                self.output_filename,
                self.global_config.bucket_namespace,
                self.global_config.bucket,
                self.task_conf.ods_prefix,
            )
            logging.error(
                f"CSV file '{self.global_config.bucket}/{self.task_conf.ods_prefix}/{self.output_filename}' deleted."
            )
            raise
        else:
            logging.info(f"Source file '{self.output_filename}' processed")

    def _finalize(self):
        """Mark the task run as successfully completed."""
        manage_runs.finalise_task(self.a_task_history_key, STATUS_SUCCESS)
        logging.info(f"Task '{self.task_conf.task_name}' completed successfully")

    def process(self):
        """Main processor function: run the full pipeline in order."""
        self._extract()
        self._enrich()
        self._upload()
        self._process_remote()
        self._finalize()
|
||||
Reference in New Issue
Block a user