# mars-elt/python/mrds_common/mrds/utils/csv_utils.py

import csv
import os

# Scale factor used by add_a_key_columns: each generated key is
# int(task_history_key) * TASK_HISTORY_MULTIPLIER + row_number, with rows
# numbered from 1.
# NOTE(review): presumably sized so keys from different task-history keys
# cannot overlap while a file has fewer than 1,000,000,000 rows -- confirm.
TASK_HISTORY_MULTIPLIER = 1_000_000_000


def read_csv_file(csv_filepath, encoding_type="utf-8"):
    """Read a CSV file and split it into a header row and data rows.

    Args:
        csv_filepath: Path of the CSV file to read.
        encoding_type: Text encoding used to decode the file.

    Returns:
        A ``(headers, data_rows)`` tuple. ``headers`` is the first parsed row
        and ``data_rows`` is a list of the remaining rows; every row is a
        list of strings. A file with no rows at all yields ``([], [])``
        instead of raising ``IndexError``.
    """
    # newline="" lets the csv module handle line endings itself, including
    # newlines embedded inside quoted fields.
    with open(csv_filepath, "r", newline="", encoding=encoding_type) as csvfile:
        reader = csv.reader(csvfile)
        # Consume the header separately so an empty file falls back to [].
        headers = next(reader, [])
        data_rows = list(reader)
    return headers, data_rows


def write_data_to_csv_file(csv_filepath, data, encoding_type="utf-8"):
    """Write headers and rows to a CSV file via a temporary sibling file.

    The content is first written to ``<csv_filepath>.tmp`` and then moved over
    ``csv_filepath`` with ``os.replace``, so the destination is only touched
    once the full content has been written. If anything fails along the way,
    the temporary file is removed and the original exception is re-raised.

    Args:
        csv_filepath: Destination path (``str`` or path-like object).
        data: Mapping with a ``"headers"`` entry (one row) and a ``"rows"``
            entry (iterable of rows).
        encoding_type: Text encoding used for the output file.

    Raises:
        KeyError: If ``data`` lacks ``"headers"`` or ``"rows"``.
        OSError: If the file cannot be written or replaced.
    """
    # os.fspath lets pathlib.Path arguments work with the string suffix below.
    temp_csv_filepath = os.fspath(csv_filepath) + ".tmp"
    try:
        with open(
            temp_csv_filepath, "w", newline="", encoding=encoding_type
        ) as csvfile:
            # QUOTE_ALL wraps every field in quotes.
            writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
            writer.writerow(data["headers"])
            writer.writerows(data["rows"])
        os.replace(temp_csv_filepath, csv_filepath)
    except BaseException:
        # Best-effort cleanup so a failed write does not leave a stale .tmp
        # file behind; the original error is what the caller needs to see.
        try:
            os.remove(temp_csv_filepath)
        except OSError:
            pass
        raise


def add_static_columns(data_rows, headers, static_entries):
    """Set constant-valued columns on every row, in place.

    Args:
        data_rows: Rows to modify; each row is mutated directly.
        headers: Header list; missing column names are appended to it.
        static_entries: Iterable of ``(column_header, value)`` pairs.

    For each pair, a column that already exists in ``headers`` has its cell
    overwritten in every row; otherwise the header is appended and the value
    is appended to every row. Nothing is returned.
    """
    for name, constant in static_entries:
        try:
            position = headers.index(name)
        except ValueError:
            # Column is new: extend the header and every row.
            headers.append(name)
            for record in data_rows:
                record.append(constant)
            continue
        # Column already exists: overwrite its cell in every row.
        for record in data_rows:
            record[position] = constant


def add_a_key_columns(data_rows, headers, a_key_entries, task_history_key):
    """Fill key columns with per-row generated keys, in place.

    Args:
        data_rows: Rows to modify; each row is mutated directly.
        headers: Header list; missing column names are appended to it.
        a_key_entries: Iterable of column names that receive generated keys.
        task_history_key: Value convertible with ``int()``; it forms the
            high-order part of every generated key.

    The key for row ``n`` (counting from 1) is
    ``int(task_history_key) * TASK_HISTORY_MULTIPLIER + n``, stored as a
    string. An existing column is overwritten; a missing one is appended to
    ``headers`` and to every row. Nothing is returned.
    """
    for key_column in a_key_entries:
        position = headers.index(key_column) if key_column in headers else None
        if position is None:
            headers.append(key_column)
        for row_number, record in enumerate(data_rows, start=1):
            generated = str(
                int(task_history_key) * TASK_HISTORY_MULTIPLIER + row_number
            )
            if position is None:
                record.append(generated)
            else:
                record[position] = generated


def add_workflow_key_columns(data_rows, headers, workflow_key_entries, workflow_key):
    """Set the given columns to the workflow key on every row, in place.

    Args:
        data_rows: Rows to modify; each row is mutated directly.
        headers: Header list; missing column names are appended to it.
        workflow_key_entries: Iterable of column names to populate.
        workflow_key: Value stored as-is (no conversion) in each listed column.

    An existing column is overwritten in every row; a missing one is appended
    to ``headers`` and to every row. Nothing is returned.
    """
    for name in workflow_key_entries:
        if name in headers:
            # Existing column: overwrite in place.
            position = headers.index(name)
            for record in data_rows:
                record[position] = workflow_key
            continue
        # New column: extend the header and every row.
        headers.append(name)
        for record in data_rows:
            record.append(workflow_key)


def rearrange_columns(headers, data_rows, column_order):
    """Return headers and rows restricted to, and ordered by, ``column_order``.

    Args:
        headers: Current column names.
        data_rows: Rows whose cells line up with ``headers``.
        column_order: Desired column names, in output order.

    Returns:
        A ``(headers, data_rows)`` tuple of new lists; the inputs are not
        modified. Names in ``column_order`` that are absent from ``headers``
        are skipped, and columns not named in ``column_order`` are dropped.
        If a name occurs more than once in ``headers``, its last occurrence
        is the one used.
    """
    # Later duplicates overwrite earlier ones, so the last position wins.
    position_of = {}
    for position, name in enumerate(headers):
        position_of[name] = position

    selected = []
    for name in column_order:
        position = position_of.get(name)
        if position is not None:
            selected.append(position)

    def _pick(sequence):
        # Project a header list or a row onto the selected positions.
        return [sequence[position] for position in selected]

    return _pick(headers), [_pick(row) for row in data_rows]