mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-22 07:22:54 +02:00
493 lines
16 KiB
Plaintext
493 lines
16 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Corporate Intelligence"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## API Research\n",
|
||
"\n",
|
||
"### BundesAPI"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"*Down due to maintenance work from 24.03. - 26.03.*\n",
|
||
"\n",
|
||
"Basically a Bundesanzeiger Scraping Wrapper"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"C:\\Users\\trist\\AppData\\Roaming\\Python\\Python310\\site-packages\\requests\\__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (5.1.0)/charset_normalizer (2.0.12) doesn't match a supported version!\n",
|
||
" warnings.warn(\"urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported \"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"dict_keys(['Aufsichtsrat', 'Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020', 'Jahresabschluss zum Geschäftsjahr vom 01.01.2019 bis zum 31.12.2019', 'Jahresabschluss zum Geschäftsjahr vom 01.01.2018 bis zum 31.12.2018', 'Jahresabschluss zum Geschäftsjahr vom 01.01.2017 bis zum 31.12.2017', 'Jahresabschluss zum Geschäftsjahr vom 01.01.2016 bis zum 31.12.2016', 'Jahresabschluss zum Geschäftsjahr vom 01.01.2015 bis zum 31.12.2015', 'Jahresabschluss zum Geschäftsjahr vom 01.01.2014 bis zum 31.12.2014', 'Jahresabschluss zum Geschäftsjahr vom 01.01.2013 bis zum 31.12.2013', 'Jahresabschluss zum Geschäftsjahr vom 01.01.2012 bis zum 31.12.2012', 'Jahresabschluss zum Geschäftsjahr vom 01.10.2010 bis zum 30.06.2011', 'Jahresabschluss zum Geschäftsjahr vom 01.07.2011 bis zum 31.12.2011', 'Jahresbericht zum 31.3.2006', 'Jahresbericht 30.11.2022'])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from deutschland.bundesanzeiger import Bundesanzeiger\n",
|
||
"ba = Bundesanzeiger()\n",
|
||
"# search term\n",
|
||
"data = ba.get_reports(\"Atos IT-Dienstleistung & Beratung GmbH\")\n",
|
||
"# returns a dictionary with all reports found as fulltext reports\n",
|
||
"print(data.keys())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"dict_keys(['date', 'name', 'company', 'report'])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Note: There can be multiple \"Aufsichtsrat\" entries per company; however, the API returns only one because the keys are overwritten\n",
|
||
"jahresabschluss = data['Jahresabschluss zum Geschäftsjahr vom 01.01.2019 bis zum 31.12.2019']\n",
|
||
"\n",
|
||
"# Note: Although the report includes the entire text it lacks the formatting that would make extracting information a lot easier as the data is wrapped inside a <table> originally\n",
|
||
"with open(\"./jahresabschluss-example.txt\", \"w\") as file:\n",
|
||
" file.write(jahresabschluss['report'])\n",
|
||
"print(jahresabschluss.keys())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"None\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from deutschland.handelsregister import Handelsregister\n",
|
||
"hr = Handelsregister()\n",
|
||
"\n",
|
||
"results = hr.search(keywords=\"BLUECHILLED Verwaltungs GmbH\")\n",
|
||
"print(results)"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Offene Register"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Hint: Visualize schema with tools such as [DBeaver](https://dbeaver.io/)\n",
|
||
"\n",
|
||
"Note: Not up-to-date"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# SQLite export\n",
|
||
"import sqlite3\n",
|
||
"con = sqlite3.connect(\"../data/openregister.db\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"cur = con.cursor()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"[('name',),\n",
|
||
" ('registrations',),\n",
|
||
" ('officer',),\n",
|
||
" ('company',),\n",
|
||
" ('company_fts',),\n",
|
||
" ('company_fts_data',),\n",
|
||
" ('company_fts_idx',),\n",
|
||
" ('company_fts_docsize',),\n",
|
||
" ('company_fts_config',),\n",
|
||
" ('officer_fts',),\n",
|
||
" ('officer_fts_data',),\n",
|
||
" ('officer_fts_idx',),\n",
|
||
" ('officer_fts_docsize',),\n",
|
||
" ('officer_fts_config',),\n",
|
||
" ('name_fts',),\n",
|
||
" ('name_fts_data',),\n",
|
||
" ('name_fts_idx',),\n",
|
||
" ('name_fts_docsize',),\n",
|
||
" ('name_fts_config',)]"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"schema = cur.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n",
|
||
"schema.fetchall()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>company_number</th>\n",
|
||
" <th>current_status</th>\n",
|
||
" <th>jurisdiction_code</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>registered_address</th>\n",
|
||
" <th>retrieved_at</th>\n",
|
||
" <th>register_flag_AD</th>\n",
|
||
" <th>register_flag_CD</th>\n",
|
||
" <th>register_flag_DK</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>native_company_number</th>\n",
|
||
" <th>registered_office</th>\n",
|
||
" <th>registrar</th>\n",
|
||
" <th>register_art</th>\n",
|
||
" <th>register_nummer</th>\n",
|
||
" <th>former_registrar</th>\n",
|
||
" <th>register_flag_</th>\n",
|
||
" <th>register_flag_Note:</th>\n",
|
||
" <th>_registerNummerSuffix</th>\n",
|
||
" <th>register_flag_Status information</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>K1101R_HRB150148</td>\n",
|
||
" <td>currently registered</td>\n",
|
||
" <td>de</td>\n",
|
||
" <td>olly UG (haftungsbeschränkt)</td>\n",
|
||
" <td>Waidmannstraße 1, 22769 Hamburg.</td>\n",
|
||
" <td>2018-11-09T18:03:03Z</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>Hamburg HRB 150148</td>\n",
|
||
" <td>Hamburg</td>\n",
|
||
" <td>Hamburg</td>\n",
|
||
" <td>HRB</td>\n",
|
||
" <td>150148</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>R1101_HRB81092</td>\n",
|
||
" <td>currently registered</td>\n",
|
||
" <td>de</td>\n",
|
||
" <td>BLUECHILLED Verwaltungs GmbH</td>\n",
|
||
" <td>Oststr.</td>\n",
|
||
" <td>2018-07-25T11:14:02Z</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>Düsseldorf HRB 81092</td>\n",
|
||
" <td>Düsseldorf</td>\n",
|
||
" <td>Düsseldorf</td>\n",
|
||
" <td>HRB</td>\n",
|
||
" <td>81092</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>H1101_H1101_HRB18423</td>\n",
|
||
" <td>currently registered</td>\n",
|
||
" <td>de</td>\n",
|
||
" <td>Mittelständische Beteiligungsgesellschaft Brem...</td>\n",
|
||
" <td>Langenstraße 2-4, 28195 Bremen.</td>\n",
|
||
" <td>2018-06-24T21:12:00Z</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>Bremen früher Bremen HRB 18423</td>\n",
|
||
" <td>Bremen</td>\n",
|
||
" <td>Bremen</td>\n",
|
||
" <td>HRB</td>\n",
|
||
" <td>18423</td>\n",
|
||
" <td>Bremen</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>R1101_HRB45109</td>\n",
|
||
" <td>currently registered</td>\n",
|
||
" <td>de</td>\n",
|
||
" <td>Albert Barufe GmbH</td>\n",
|
||
" <td>Hans-Sachs-Straße 11, 40721 Hilden.</td>\n",
|
||
" <td>2018-07-25T11:15:01Z</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>Düsseldorf HRB 45109</td>\n",
|
||
" <td>Hilden</td>\n",
|
||
" <td>Düsseldorf</td>\n",
|
||
" <td>HRB</td>\n",
|
||
" <td>45109</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>5</td>\n",
|
||
" <td>R1101_HRB37996</td>\n",
|
||
" <td>currently registered</td>\n",
|
||
" <td>de</td>\n",
|
||
" <td>ITERGO Informationstechnologie GmbH</td>\n",
|
||
" <td>ERGO-Platz 1, 40477 Düsseldorf.</td>\n",
|
||
" <td>2018-07-25T12:32:08Z</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>Düsseldorf HRB 37996</td>\n",
|
||
" <td>Düsseldorf</td>\n",
|
||
" <td>Düsseldorf</td>\n",
|
||
" <td>HRB</td>\n",
|
||
" <td>37996</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>None</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 25 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id company_number current_status jurisdiction_code \\\n",
|
||
"0 1 K1101R_HRB150148 currently registered de \n",
|
||
"1 2 R1101_HRB81092 currently registered de \n",
|
||
"2 3 H1101_H1101_HRB18423 currently registered de \n",
|
||
"3 4 R1101_HRB45109 currently registered de \n",
|
||
"4 5 R1101_HRB37996 currently registered de \n",
|
||
"\n",
|
||
" name \\\n",
|
||
"0 olly UG (haftungsbeschränkt) \n",
|
||
"1 BLUECHILLED Verwaltungs GmbH \n",
|
||
"2 Mittelständische Beteiligungsgesellschaft Brem... \n",
|
||
"3 Albert Barufe GmbH \n",
|
||
"4 ITERGO Informationstechnologie GmbH \n",
|
||
"\n",
|
||
" registered_address retrieved_at \\\n",
|
||
"0 Waidmannstraße 1, 22769 Hamburg. 2018-11-09T18:03:03Z \n",
|
||
"1 Oststr. 2018-07-25T11:14:02Z \n",
|
||
"2 Langenstraße 2-4, 28195 Bremen. 2018-06-24T21:12:00Z \n",
|
||
"3 Hans-Sachs-Straße 11, 40721 Hilden. 2018-07-25T11:15:01Z \n",
|
||
"4 ERGO-Platz 1, 40477 Düsseldorf. 2018-07-25T12:32:08Z \n",
|
||
"\n",
|
||
" register_flag_AD register_flag_CD register_flag_DK ... \\\n",
|
||
"0 1 1 1 ... \n",
|
||
"1 1 1 1 ... \n",
|
||
"2 1 1 1 ... \n",
|
||
"3 1 1 1 ... \n",
|
||
"4 1 1 1 ... \n",
|
||
"\n",
|
||
" native_company_number registered_office registrar \\\n",
|
||
"0 Hamburg HRB 150148 Hamburg Hamburg \n",
|
||
"1 Düsseldorf HRB 81092 Düsseldorf Düsseldorf \n",
|
||
"2 Bremen früher Bremen HRB 18423 Bremen Bremen \n",
|
||
"3 Düsseldorf HRB 45109 Hilden Düsseldorf \n",
|
||
"4 Düsseldorf HRB 37996 Düsseldorf Düsseldorf \n",
|
||
"\n",
|
||
" register_art register_nummer former_registrar register_flag_ \\\n",
|
||
"0 HRB 150148 None None \n",
|
||
"1 HRB 81092 None None \n",
|
||
"2 HRB 18423 Bremen None \n",
|
||
"3 HRB 45109 None None \n",
|
||
"4 HRB 37996 None None \n",
|
||
"\n",
|
||
" register_flag_Note: _registerNummerSuffix register_flag_Status information \n",
|
||
"0 None None None \n",
|
||
"1 None None None \n",
|
||
"2 None None None \n",
|
||
"3 None None None \n",
|
||
"4 None None None \n",
|
||
"\n",
|
||
"[5 rows x 25 columns]"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"df = pd.read_sql_query(\"SELECT * FROM company LIMIT 100\", con)\n",
|
||
"df.head()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Open Corporates"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import requests\n",
|
||
"\n",
|
||
"BASE_URL = \"https://api.opencorporates.com\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"401"
|
||
]
|
||
},
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"response = requests.get(f\"{BASE_URL}/companies/search\")\n",
|
||
"response.status_code"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
""
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.7"
|
||
},
|
||
"orig_nbformat": 4
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|