diff --git a/README.md b/README.md index 5aca4b2..7982abb 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,22 @@ # bachelorarbeit_importer +Importiert ORBIS-Daten in ein vordefiniertes CSV-Format. -Importer von ORBIS-Daten zu Postgres 17 \ No newline at end of file +Zur Nutzung der Skripte müssen die in requirements.txt angegebenen Bibliotheken installiert werden. +Dies kann unter Linux mit dem Befehl 'pip install -r requirements.txt' geschehen. Andere Betriebssysteme können abweichende Installationsmöglichkeiten bieten. +Verwendung von cleanup_script.py: + -h: Zeige die Hilfeseite an. + -l [LOGFILE]: Gib eine Datei für das Protokoll an. Wichtige Events werden darin vermerkt. + -o [OUTPUT]: Gib die Datei an, in welche die aufbereiteten Daten geschrieben werden. + -s SEEK: Gib die Position in einer Datei an, ab der der Import starten soll. + [filename]: Gib zum Schluss des Befehls die relevanten Datei(en) an, welche in das neue CSV-Format gebracht werden sollen. +Beispiel: './cleanup_script.py data/st_neu_1.csv data/st_neu_2.csv data/st_neu_3.csv -o data/cleaned_st_neu_verlustausschluss.csv' + +Verwendung von display_script.py: + Die letzte Zeile des Skripts muss je nach Speicherort der bereinigten CSV (aus cleanup_script.py) angepasst werden. Dies ist keine funktionale, sondern nur eine formale Änderung. + Das Skript wurde nur in Spyder getestet. Möglicherweise müssen zusätzliche Bibliotheken installiert und Änderungen am Code vorgenommen werden, damit die Nutzung auch headless möglich ist. Die Kompatibilität wurde mit Spyder IDE 5.4.2 unter Python Version 3.11.2, Qt Version 5.15.8, PyQt5 Version 5.15.9 und dem Linux-Kernel Version 6.14.4-1 getestet. Nur für die Funktionsfähigkeit in der angegebenen Umgebung kann Verantwortung getragen werden. Aller Wahrscheinlichkeit und Erfahrung nach ist die Nutzung in anderen Umgebungen jedoch problemlos möglich. 
+ +Zur Datei orbis_suchmaske_approximation.png: + Diese Datei enthält den Screenshot einer approximativen Suche in ORBIS. Die originale Suche, welche zur Ausgabe der CSV-Quelldateien für diese Suche geführt hat, ist aufgrund wechselnder Verbindungen in das Uni-VPN leider nicht mehr erhalten. Die Approximation soll eine ähnliche Suche darstellen, die originale Suche wurde jedoch ausschließlich im Fließtext der Arbeit dokumentiert. + +Zur Datei table_generator_skript.py: + Dieses Skript ist für die Generierung der Tabelle unter Anlage 1 verantwortlich. Die Funktion wurde unter Python 3.11.2 mit den in requirements.txt angegebenen, installierten Abhängigkeiten getestet. Der Speicherpfad der Datei muss zur fehlerfreien Funktion angepasst werden. diff --git a/cleanup_script.py b/cleanup_script.py index 57cc7f2..844dab0 100755 --- a/cleanup_script.py +++ b/cleanup_script.py @@ -35,12 +35,10 @@ class Company: self.cleaned_data["name"] = data["Unternehmensname"] for year in YEARS: self.clean_complex(year, "vor") - self.clean_complex(year, "nach") self.clean_simple(year, "Eigenkapital") self.clean_simple(year, "Steuern") - def clean_simple(self, year: int, type: str) -> None: """Clean simple data. This means tax and capital.""" try: @@ -52,43 +50,35 @@ class Company: """Get suffix for the simple cleaning process""" return "ek" if type == "Eigenkapital" else "st" - def clean_complex(self, year: int, state: str) -> None: """Clean the complex data. 
This means earnings before/after tax.""" try: - if f"Gewinn/(Verlust) {state} Steuern EUR {year}" in self.data.keys() and self.data[f"Gewinn/(Verlust) {state} Steuern EUR {year}"] != '' and not self.cleaned_data.get(f"gn{year}") and self.cut_negative(state, int(self.data[f"Gewinn/(Verlust) {state} Steuern EUR {year}"])): + if f"Gewinn/(Verlust) {state} Steuern EUR {year}" in self.data.keys() and self.data[f"Gewinn/(Verlust) {state} Steuern EUR {year}"] != '' and not self.cleaned_data.get(f"gn{year}"): self.cleaned_data[f"g{self.get_suffix(state)}{year}"] = int(self.data[f"Gewinn/(Verlust) {state} Steuern EUR {year}"]) - elif f"Gewinn/Verlust {state} Steuern EUR {year}" in self.data.keys() and self.data[f"Gewinn/Verlust {state} Steuern EUR {year}"] != '' and not self.cleaned_data.get(f"gn{year}") and self.cut_negative(state, int(self.data[f"Gewinn/(Verlust) {state} Steuern EUR {year}"])): + elif f"Gewinn/Verlust {state} Steuern EUR {year}" in self.data.keys() and self.data[f"Gewinn/Verlust {state} Steuern EUR {year}"] != '' and not self.cleaned_data.get(f"gn{year}"): self.cleaned_data[f"g{self.get_suffix(state)}{year}"] = int(self.data[f"Gewinn/Verlust {state} Steuern EUR {year}"]) else: self.report.log.debug(f"{self.cleaned_data['name']}:g{self.get_suffix(state)}{year} empty value") except ValueError: self.report.log.debug(f"{self.cleaned_data['name']}: g{self.get_suffix(state)}{year} ValueError") - def cut_negative(self, state: str, gv: int) -> bool: - if state == "vor": - return False if gv < 0 else True - elif state == "nach": - return True - else: - raise Exception("IMPOSSIBLE STATE CN") - def get_suffix(self, state: str) -> str: """Get suffix for the complex cleaning process.""" return "n" if state == "nach" else "v" - def calculate_all_tax(self) -> None: """Calculate tax for all relevant years.""" for year in YEARS: - self.calculate_tax(year) + self.calculate_values(year) - def calculate_tax(self, year: int) -> None: - """Calculate simple tax from provided 
values.""" + def calculate_values(self, year: int) -> None: + """Calculate missing values from provided values. GV - ST = GN""" if not self.cleaned_data.get(f"st{year}") and self.cleaned_data.get(f"gv{year}") != None and self.cleaned_data.get(f"gn{year}") != None: self.cleaned_data[f"st{year}"] = self.cleaned_data.get(f"gv{year}") - self.cleaned_data.get(f"gn{year}") - - + if not self.cleaned_data.get(f"gn{year}") and self.cleaned_data.get(f"gv{year}") != None and self.cleaned_data.get(f"st{year}") != None: + self.cleaned_data[f"gn{year}"] = self.cleaned_data.get(f"gv{year}") - self.cleaned_data.get(f"st{year}") + if not self.cleaned_data.get(f"gv{year}") and self.cleaned_data.get(f"gn{year}") != None and self.cleaned_data.get(f"st{year}") != None: + self.cleaned_data[f"gv{year}"] = self.cleaned_data.get(f"gn{year}") + self.cleaned_data.get(f"st{year}") def reporter(self) -> None: """Simple class to report valid and invalid data to the main import class.""" @@ -97,19 +87,25 @@ class Company: self.report.valid_data += 1 else: self.report.invalid_data +=1 - def calculate_data(self) -> None: """Calculate data relevant to the project.""" for year in YEARS: if self.cleaned_data.get(f"st{year}") and self.cleaned_data.get(f"gv{year}") and self.cleaned_data.get(f"gn{year}") and self.cleaned_data.get(f"ek{year}"): - self.cleaned_data[f"nomtax{year}"] = self.cleaned_data.get(f"st{year}") / self.cleaned_data.get(f"gv{year}") - self.cleaned_data[f"realtax{year}"] = (self.cleaned_data.get(f"st{year}") + (INFLATION_RATES[year] * self.cleaned_data.get(f"gv{year}"))) / self.cleaned_data.get(f"gv{year}") - self.cleaned_data[f"realefftax{year}"] = (self.cleaned_data.get(f"st{year}") + (INFLATION_RATES[year] * self.cleaned_data.get(f"gv{year}")) + (INFLATION_RATES[year] * self.cleaned_data.get(f"ek{year}"))) / self.cleaned_data.get(f"gv{year}") - + roenom = self.cleaned_data.get(f"gv{year}") / self.cleaned_data.get(f"ek{year}") + if roenom < 0 or roenom > 0.5: + continue # Skip 
outlier + self.cleaned_data[f"roenom{year}"] = roenom + self.cleaned_data[f"roereal{year}"] = self.cleaned_data.get(f"roenom{year}") - INFLATION_RATES[year] + self.cleaned_data[f"inflationloss{year}"] = self.cleaned_data.get(f"ek{year}") * INFLATION_RATES[year] + self.cleaned_data[f"roetaxed{year}"] = (self.cleaned_data.get(f"gv{year}") - self.cleaned_data.get(f"st{year}") - self.cleaned_data[f"inflationloss{year}"]) / self.cleaned_data.get(f"ek{year}") + self.cleaned_data[f"realefftax{year}"] = (self.cleaned_data.get(f"roereal{year}") - self.cleaned_data.get(f"roetaxed{year}")) / self.cleaned_data.get(f"roereal{year}") + self.cleaned_data[f"totalefftax{year}"] = (self.cleaned_data.get(f"roenom{year}") - self.cleaned_data.get(f"roetaxed{year}")) / self.cleaned_data.get(f"roereal{year}") def write(self) -> None: """Write the current dataset to CSV""" + #if self.cleaned_data.get("realefftax2014"): + # raise Exception(self.cleaned_data) self.writer.writerow(self.cleaned_data) @@ -198,9 +194,12 @@ class dataimport: fieldnames.append(f"gn{year}") fieldnames.append(f"st{year}") fieldnames.append(f"ek{year}") - fieldnames.append(f"nomtax{year}") - fieldnames.append(f"realtax{year}") + fieldnames.append(f"roenom{year}") + fieldnames.append(f"roereal{year}") + fieldnames.append(f"inflationloss{year}") + fieldnames.append(f"roetaxed{year}") fieldnames.append(f"realefftax{year}") + fieldnames.append(f"totalefftax{year}") return fieldnames def comp_import(self, data: dict, writer) -> None: @@ -212,7 +211,6 @@ class dataimport: current.write() - parser = argparse.ArgumentParser(description='Import data from ORBIS', epilog='Copyright Denkena Consulting') parser.add_argument('filename', nargs="+") parser.add_argument('-l', '--logfile', default="log_importer", nargs="?") diff --git a/display_script.py b/display_script.py index e28785d..e971cae 100755 --- a/display_script.py +++ b/display_script.py @@ -11,31 +11,27 @@ class display: def __init__(self, filename) -> None: """Start 
the actual import process. Seperates process and setup.""" with open(filename, mode='r', encoding='utf-8-sig', newline='') as csv_file: - reader = read_csv(csv_file) + years = np.array([2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]) + styles = ["roenom", "roereal", "roetaxed", "realefftax"] + cols = [f"{style}{year}" for style in styles for year in years] + reader = read_csv(csv_file, usecols=cols, dtype=np.float32, low_memory=True) plt.style.use('_mpl-gallery') - for style in ["nom", "real", "realeff"]: + for style in styles: + style_cols = [f"{style}{year}" for year in years] fig, ax = plt.subplots(figsize=(12, 6)) - ob1 = np.asarray(reader[f"{style}tax2014"].dropna()) - ob2 = np.asarray(reader[f"{style}tax2015"].dropna()) - ob3 = np.asarray(reader[f"{style}tax2016"].dropna()) - ob4 = np.asarray(reader[f"{style}tax2017"].dropna()) - ob5 = np.asarray(reader[f"{style}tax2018"].dropna()) - ob6 = np.asarray(reader[f"{style}tax2019"].dropna()) - ob7 = np.asarray(reader[f"{style}tax2020"].dropna()) - ob8 = np.asarray(reader[f"{style}tax2021"].dropna()) - ob9 = np.asarray(reader[f"{style}tax2022"].dropna()) - ob10 = np.asarray(reader[f"{style}tax2023"].dropna()) - ob11 = np.asarray(reader[f"{style}tax2024"].dropna()) - x = [ob1, ob2, ob3, ob4, ob5, ob6, ob7, ob8, ob9, ob10, ob11] - ax.boxplot(x, positions=[2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024], patch_artist=True, + data = reader[style_cols].dropna(how='all', axis=1) + data_list = [] + data_list = [data[col].dropna().values for col in style_cols if col in data.columns] + ax.boxplot(data_list, positions=years[:len(data_list)], patch_artist=True, showmeans=False, showfliers=False, medianprops={"color": "white", "linewidth": 0.5}, boxprops={"facecolor": "C0", "edgecolor": "white", "linewidth": 0.5}, whiskerprops={"color": "C0", "linewidth": 1.5}, capprops={"color": "C0", "linewidth": 1.5}) - ax.yaxis.set_major_formatter(mtick.PercentFormatter(1,0)) + 
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0)) + plt.figtext(0.5, 0.01, style, ha='center', va='bottom') plt.show() -display(pathlib.Path('/home/user/bachelorarbeit_importer/data/', 'cleaned_st2.csv')) +display(pathlib.Path('/home/user/bachelorarbeit_importer/data/', 'cleaned_st_neu_verlustausschluss.csv')) diff --git a/hvpi_2014_2024.zip b/hvpi_2014_2024.zip new file mode 100644 index 0000000..5ed36a3 Binary files /dev/null and b/hvpi_2014_2024.zip differ diff --git a/orbis_suchmaske_approximation.png b/orbis_suchmaske_approximation.png new file mode 100644 index 0000000..3c9ef88 Binary files /dev/null and b/orbis_suchmaske_approximation.png differ diff --git a/plot_export/realeffectivetaxrate.png b/plot_export/realeffectivetaxrate.png new file mode 100644 index 0000000..12d69ad Binary files /dev/null and b/plot_export/realeffectivetaxrate.png differ diff --git a/plot_export/realefftax_losscut.png b/plot_export/realefftax_losscut.png new file mode 100644 index 0000000..3a5162b Binary files /dev/null and b/plot_export/realefftax_losscut.png differ diff --git a/plot_export/roenominal.png b/plot_export/roenominal.png new file mode 100644 index 0000000..c86875a Binary files /dev/null and b/plot_export/roenominal.png differ diff --git a/plot_export/roenominal_percentiles.png b/plot_export/roenominal_percentiles.png new file mode 100644 index 0000000..19dca26 Binary files /dev/null and b/plot_export/roenominal_percentiles.png differ diff --git a/plot_export/roereal.png b/plot_export/roereal.png new file mode 100644 index 0000000..34aedf5 Binary files /dev/null and b/plot_export/roereal.png differ diff --git a/plot_export/roereal_percentile.png b/plot_export/roereal_percentile.png new file mode 100644 index 0000000..5192931 Binary files /dev/null and b/plot_export/roereal_percentile.png differ diff --git a/plot_export/roetaxed b/plot_export/roetaxed new file mode 100644 index 0000000..5139418 Binary files /dev/null and b/plot_export/roetaxed differ diff --git 
a/plot_export/roetaxed.png b/plot_export/roetaxed.png new file mode 100644 index 0000000..92157c0 Binary files /dev/null and b/plot_export/roetaxed.png differ diff --git a/plot_export/roetaxed_percentile.png b/plot_export/roetaxed_percentile.png new file mode 100644 index 0000000..25a0a28 Binary files /dev/null and b/plot_export/roetaxed_percentile.png differ diff --git a/plot_export/totaleffectivetaxrate.png b/plot_export/totaleffectivetaxrate.png new file mode 100644 index 0000000..e5e4de4 Binary files /dev/null and b/plot_export/totaleffectivetaxrate.png differ diff --git a/table_generator_skript.py b/table_generator_skript.py new file mode 100755 index 0000000..5a6e8fa --- /dev/null +++ b/table_generator_skript.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import pathlib + +# Load the cleaned CSV file +file_path = pathlib.Path('/home/user/bachelorarbeit_importer/data/', 'cleaned_st_neu_verlustausschluss.csv') +df = pd.read_csv(file_path) + +# Select relevant columns for 2022 and drop rows with missing ek2022 or st2022 +df_2022 = df[['name', 'ek2022', 'st2022']].dropna() + +# Sort by Eigenkapital (ek2022) ascending +df_2022.sort_values('ek2022', inplace=True) + +# Define 5 classes based on equity size (in EUR): +bins = [0, 500000, 2000000, 10000000, 50000000, np.inf] +labels = ['< 500K EK', '500K EK < x < 2M EK', '2M < 10M EK', '10M EK < 50M EK', '> 50M EK'] +df_2022['Klasse'] = pd.cut(df_2022['ek2022'], bins=bins, labels=labels) + +# Calculate percentage of companies with negative taxes per class +for klasse, group in df_2022.groupby('Klasse', observed=False): + total_companies = len(group) + negative_tax_companies = len(group[group['st2022'] < 0]) + percentage = (negative_tax_companies / total_companies * 100) if total_companies > 0 else 0 + print(f"Prozent Unternehmen mit negativen Steuern in {klasse}: {percentage:.2f}%")