From b38af8dee5f4f0cd946b2f5c7bfca67cbb16fefb Mon Sep 17 00:00:00 2001 From: Federico Justus Denkena Date: Mon, 9 Jun 2025 20:32:50 +0200 Subject: [PATCH] remove database connection, instead write back to csv file --- cleanup_script.py | 117 +++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 65 deletions(-) diff --git a/cleanup_script.py b/cleanup_script.py index 129ab89..85c6670 100755 --- a/cleanup_script.py +++ b/cleanup_script.py @@ -3,14 +3,12 @@ import argparse import csv import secrets import sys -import time import json import logging import requests import hashlib import io -import psycopg as ps -import psycopg_pool as ps_pool +import datetime from rich import progress from rich.logging import RichHandler from rich.console import Console @@ -18,8 +16,9 @@ from rich.traceback import install install(show_locals=True, locals_max_length=150, locals_max_string=300) class Company: - def __init__(self, data, report): + def __init__(self, data, report, out): self.data = data + self.out = out self.bvdid = data["BvD ID Nummer"] self.name = data["Unternehmensname"] self.gv2020 = None @@ -179,27 +178,31 @@ class Company: def validate(self): #fallback, in case tax wasn't already calculated self.calculate_tax() - if self.st2020 and self.ek2020: - self.report.valid_data += 1 - return True - self.report.invalid_data +=1 - if self.st2021 and self.ek2021: - self.report.valid_data += 1 - return True - self.report.invalid_data +=1 - if self.st2022 and self.ek2022: - self.report.valid_data += 1 - return True - if self.st2024 and self.ek2024: - self.report.valid_data += 1 - return True - self.report.invalid_data +=1 - return False - + if True: + if self.st2020 and self.ek2020: + self.report.valid_data += 1 + else: + self.report.invalid_data +=1 + if self.st2021 and self.ek2021: + self.report.valid_data += 1 + else: + self.report.invalid_data +=1 + if self.st2022 and self.ek2022: + self.report.valid_data += 1 + else: + self.report.invalid_data +=1 + if self.st2023 and self.ek2023: + self.report.valid_data += 1 + else: + self.report.invalid_data +=1 + if self.st2024 and self.ek2024: + self.report.valid_data += 1 + else: + self.report.invalid_data +=1 class dataimport: - def __init__(self, filename, logfile, seek=0): + def __init__(self, filename, logfile, output, seek=0): self.seek = seek self.progress = progress.Progress( *progress.Progress.get_default_columns(), @@ -210,6 +213,7 @@ class dataimport: self.filename = filename FORMAT = "%(message)s" self.logfile = open(logfile, 'a') + self.output = output if self.logfile != "NONE": self.logconsole = Console(file=self.logfile) logging.basicConfig( @@ -225,54 +229,38 @@ class dataimport: show_path=False, show_time=False, level="NOTSET")]) self.log = logging.getLogger("import") - self.total_rows = self.get_total() + self.total_rows = self.get_total(self.filename) self.errors = 0 self.data = {} self.duplicate_database_id = None - self.task = self.progress.add_task(f"Importing {self.filename.split('/')[-1]}", total=self.get_total()) + self.task = self.progress.add_task(f"Importing {self.filename.split('/')[-1]}", total=self.get_total(self.filename)) self.progress.update(self.task, advance=self.seek) - global AUTHTOKEN - AUTHTOKEN = None self.valid_data = 0 self.invalid_data = 0 - #with ps_pool.ConnectionPool(conninfo="postgresql:///bachelorarbeit?sslmode=require&port=5432&host=denkena-consulting.com&passfile=/home/user/bachelorarbeit_importer/pgpass&user=bachelorarbeit_w&hostaddr=94.16.116.86", min_size=4, max_size=10, open=True, ) as pool: - # with pool.connection() as conn: - #self.db_setup() self.importer() - #AUTHTOKEN = self.authtoken - #self.log.info('AUTHTOKEN SET!') - - def db_setup(self, conn): - with conn.cursor() as cur: - cur.execute("CREATE TABLE IF NOT EXISTS test( bvd_id serial PRIMARY KEY)") - pass - def importer(self): with self.progress: - if AUTHTOKEN is not None: - self.authtoken = AUTHTOKEN - self.log.info('AUTHTOKEN obtained!') - else: - pass with open(self.filename, mode='r', encoding='utf-8-sig', newline='') as csv_file: - csv_reader = csv.DictReader(csv_file, delimiter=',') - rownum = 0 - for row in csv_reader: - if rownum < self.seek: + with open(self.output, mode='a+', encoding='utf-8-sig', newline='') as output_csv: + csv_reader = csv.DictReader(csv_file, delimiter=',') + out_names = [] + output_writer = csv.DictWriter(output_csv, fieldnames=out_names) + self.log.warning(self.get_total(self.output)) + if self.get_total(self.output) <= 0: + self.log.warning(f"WRITING HEADER FOR FILE {self.output}!") + output_writer.writeheader() + rownum = 0 + for row in csv_reader: + if rownum < self.seek: + rownum += 1 + continue + for key in csv_reader.fieldnames: + self.data[key] = row[key] + self.comp_import(self.data, output_writer) + self.data = {} rownum += 1 - continue - for key in csv_reader.fieldnames: - self.data[key] = row[key] - self.comp_import(self.data) - #if self.check_duplicate(data): - # self.patch_record(data) - # self.duplicate_database_id = None - #else: - # self.create_record(data) - self.data = {} - rownum += 1 - self.progress.update(self.task, advance=1) + self.progress.update(self.task, advance=1) self.progress.console.rule() self.log.info(f"Rows: {self.total_rows}") self.log.info(f"Valid: {self.valid_data}") @@ -286,24 +274,23 @@ class dataimport: else: self.log.critical("ERROR CALCULATION EXCEPTION") - def get_total(self): - return sum(1 for _ in open(self.filename, mode='r')) - 1 + def get_total(self, file): + return sum(1 for _ in open(file, mode='r')) - 1 - def comp_import(self, data): - current = Company(data, report=self) + def comp_import(self, data, out): + current = Company(data, report=self, out=out) current.validate() - - parser = argparse.ArgumentParser(description='Import data from ORBIS', epilog='Copyright Denkena Consulting') parser.add_argument('filename', nargs="+") parser.add_argument('-l', '--logfile', default="log_importer", nargs="?") +parser.add_argument('-o', '--output', default="export_cleaned.csv", nargs="?") parser.add_argument('-s', '--seek', type=int, default=0) args = parser.parse_args() if len(args.filename) > 1 and args.seek > 0: parser.error("Seek combined with multiple files is a bad idea!") for filename in args.filename: - dataimport(filename, args.logfile, args.seek) + dataimport(filename, args.logfile, args.output, args.seek)