Files
mars-elt/python/mrds_common/mrds/processors/csv_processor.py
Grzegorz Michalski 2c225d68ac init
2026-03-02 09:47:35 +01:00

53 lines
1.8 KiB
Python

import logging
import csv
import os
from .base import TaskProcessor
class CSVTaskProcessor(TaskProcessor):
    """Task processor that copies selected, renamed columns of a CSV file.

    The source file is streamed row by row into a temporary file placed next
    to the output path; the temporary file replaces the output file only
    after the whole input has been processed successfully.
    """

    def _extract(self):
        """Write the configured columns of the source CSV to the output CSV.

        Reads ``self.global_config.source_filepath`` with the encoding given
        by ``self.global_config.encoding_type`` and writes the result to
        ``self.output_filepath`` using the same encoding, quoting every
        field. ``self.csv_entries`` is iterated as ``(new_name, old_name)``
        pairs: ``old_name`` selects a column by its header in the input and
        ``new_name`` is the header written to the output.

        Raises:
            ValueError: If the input has no header row, if a configured
                header is absent from the input, or if a data row has fewer
                fields than the selected columns require.
        """
        input_path = self.global_config.source_filepath
        output_path = self.output_filepath
        encoding = self.global_config.encoding_type

        logging.info(f"Reading source CSV file at '{input_path}'")

        # Build the list of headers to keep + their new names
        headers_to_keep = [old for _, old in self.csv_entries]
        headers_rename = [new for new, _ in self.csv_entries]

        # Write to a sibling temp file so a failed run never leaves a
        # half-written file at the real output path.
        temp_output = output_path + ".tmp"
        try:
            # Open both input & output at once for streaming row-by-row
            with open(input_path, newline="", encoding=encoding) as inf, open(
                temp_output, mode="w", newline="", encoding=encoding
            ) as outf:
                reader = csv.reader(inf)
                writer = csv.writer(outf, quoting=csv.QUOTE_ALL)

                # Read and parse the header; an empty file has none, and a
                # bare StopIteration would be a confusing error to surface.
                headers = next(reader, None)
                if headers is None:
                    raise ValueError(
                        f"The input CSV '{input_path}' is empty: "
                        "no header row found"
                    )

                # Check if all specified headers exist in the input file
                missing = [h for h in headers_to_keep if h not in headers]
                if missing:
                    raise ValueError(
                        f"The following headers are not in the input CSV: {missing}"
                    )

                # Determine the indices of the headers to keep
                indices = [headers.index(old) for old in headers_to_keep]

                # Write the renamed header
                writer.writerow(headers_rename)

                # Stream through every data row and write the kept columns
                for row in reader:
                    if not row:
                        # Blank input lines come back from csv.reader as
                        # empty rows; they carry no data, so skip them.
                        continue
                    try:
                        filtered = [row[i] for i in indices]
                    except IndexError as err:
                        raise ValueError(
                            f"Row ending at line {reader.line_num} of "
                            f"'{input_path}' has {len(row)} field(s), fewer "
                            "than required by the selected columns"
                        ) from err
                    writer.writerow(filtered)

            # Both files are closed here; swap the finished temp file into
            # place in a single step.
            os.replace(temp_output, output_path)
        except BaseException:
            # Best-effort cleanup of the temp file. A failure to remove it
            # (including "it was never created") must not hide the original
            # error, which is re-raised below.
            try:
                os.remove(temp_output)
            except OSError:
                pass
            raise

        logging.info(f"Core data written to CSV file at '{output_path}'")