init
This commit is contained in:
52
python/mrds_common/mrds/processors/csv_processor.py
Normal file
52
python/mrds_common/mrds/processors/csv_processor.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import logging
|
||||
import csv
|
||||
import os
|
||||
from .base import TaskProcessor
|
||||
|
||||
|
||||
class CSVTaskProcessor(TaskProcessor):
    """Task processor that filters and renames the columns of a CSV file."""

    def _extract(self):
        """Stream the source CSV into the output file, keeping selected columns.

        Reads ``self.global_config.source_filepath`` row by row, keeps only
        the columns named in ``self.csv_entries`` (an iterable of
        ``(new_name, old_name)`` pairs), and writes them, fully quoted and
        under their new names, to ``self.output_filepath``. Both files use
        ``self.global_config.encoding_type``.

        The result is first written to ``<output_filepath>.tmp`` and moved
        over the destination only after every row has been written, so a
        failed run never leaves a partially written output file. The
        temporary file is removed on failure.

        Raises:
            ValueError: If the input file is empty (has no header row) or
                if a requested header is not present in the input file.
        """
        input_path = self.global_config.source_filepath
        output_path = self.output_filepath
        encoding = self.global_config.encoding_type

        logging.info(f"Reading source CSV file at '{input_path}'")

        # Build the list of headers to keep + their new names
        headers_to_keep = [old for _, old in self.csv_entries]
        headers_rename = [new for new, _ in self.csv_entries]

        temp_output = output_path + ".tmp"
        try:
            # Open both input & output at once for streaming row-by-row
            with open(input_path, newline="", encoding=encoding) as inf, open(
                temp_output, newline="", encoding=encoding, mode="w"
            ) as outf:
                reader = csv.reader(inf)
                writer = csv.writer(outf, quoting=csv.QUOTE_ALL)

                # Read and parse the header; an empty file has none.
                try:
                    headers = next(reader)
                except StopIteration:
                    raise ValueError(
                        f"The input CSV at '{input_path}' is empty"
                    ) from None

                # Map each header name to the index of its FIRST occurrence
                # (same result as list.index, but a single pass).
                position = {}
                for idx, name in enumerate(headers):
                    position.setdefault(name, idx)

                # Check if all specified headers exist in the input file
                missing = [h for h in headers_to_keep if h not in position]
                if missing:
                    raise ValueError(
                        f"The following headers are not in the input CSV: {missing}"
                    )

                # Determine the indices of the headers to keep
                indices = [position[old] for old in headers_to_keep]

                # Write the renamed header
                writer.writerow(headers_rename)

                # Stream through every data row and write out the filtered columns
                for row in reader:
                    writer.writerow([row[i] for i in indices])

            # Atomically replace the old file (both handles are closed here)
            os.replace(temp_output, output_path)
        except BaseException:
            # Do not leave a stale temporary file behind on any failure.
            try:
                os.remove(temp_output)
            except OSError:
                pass
            raise

        logging.info(f"Core data written to CSV file at '{output_path}'")
|
||||
Reference in New Issue
Block a user