import csv
import logging
import os

from .base import TaskProcessor


class CSVTaskProcessor(TaskProcessor):
    """Task processor that copies selected, renamed columns between CSV files.

    The processor reads ``self.global_config.source_filepath``, keeps only the
    columns named in ``self.csv_entries`` and writes them, under their new
    names, to ``self.output_filepath``.

    NOTE(review): ``global_config``, ``output_filepath`` and ``csv_entries``
    are not defined in this file; presumably they are supplied by
    ``TaskProcessor`` or by its configuration step -- confirm against the
    base class.
    """

    def _extract(self) -> None:
        """Stream the source CSV into the output CSV, filtering columns.

        ``self.csv_entries`` is consumed as an iterable of
        ``(new_name, old_name)`` pairs: ``old_name`` selects a column of the
        input by its header, ``new_name`` is the header written to the output.
        Columns are emitted in the order of ``csv_entries`` and every output
        field is quoted (``csv.QUOTE_ALL``).

        Rows are written to ``<output>.tmp`` first and moved over the final
        path with ``os.replace`` only after the whole input has been
        processed. If anything fails, the temporary file is removed before
        the exception propagates.

        Raises:
            ValueError: if the input file has no header row, or if a
                requested header is not present in the input.
            IndexError: if a data row has fewer fields than the position of
                a selected column.
            OSError: if a file cannot be opened, written or replaced.
        """
        input_path = self.global_config.source_filepath
        output_path = self.output_filepath
        encoding = self.global_config.encoding_type

        logging.info(f"Reading source CSV file at '{input_path}'")

        # Materialise once: the entries are traversed twice below, which would
        # silently yield an empty rename list for a one-shot iterable.
        entries = list(self.csv_entries)
        headers_to_keep = [old for _, old in entries]
        headers_rename = [new for new, _ in entries]

        # os.fspath keeps str paths unchanged and also accepts os.PathLike.
        temp_output = os.fspath(output_path) + ".tmp"

        try:
            # Open both input & output at once for streaming row-by-row.
            with open(input_path, newline="", encoding=encoding) as inf, open(
                temp_output, newline="", encoding=encoding, mode="w"
            ) as outf:
                reader = csv.reader(inf)
                writer = csv.writer(outf, quoting=csv.QUOTE_ALL)

                # Read and parse the header; an empty file has none.
                try:
                    headers = next(reader)
                except StopIteration:
                    raise ValueError(
                        f"The input CSV at '{input_path}' is empty; "
                        "no header row found"
                    ) from None

                # Check that all specified headers exist in the input file.
                available = set(headers)
                missing = [h for h in headers_to_keep if h not in available]
                if missing:
                    raise ValueError(
                        f"The following headers are not in the input CSV: {missing}"
                    )

                # Position of each kept column (first occurrence if the input
                # repeats a header name).
                indices = [headers.index(old) for old in headers_to_keep]

                # Write the renamed header, then stream the filtered rows.
                writer.writerow(headers_rename)
                writer.writerows([row[i] for i in indices] for row in reader)

            # Both files are closed here, so the swap is safe on every
            # platform. Atomically replace the old file.
            os.replace(temp_output, output_path)
        except BaseException:
            # Do not leave a partial "<output>.tmp" behind. The cleanup is
            # best-effort: the temp file may never have been created (e.g. the
            # input could not be opened), and a cleanup failure must not mask
            # the original error, which is re-raised below.
            try:
                os.remove(temp_output)
            except OSError:
                pass
            raise

        logging.info(f"Core data written to CSV file at '{output_path}'")