mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-22 04:42:54 +02:00
code cleanup, presentation on data extraction
This commit is contained in:
parent
6e31bc62bd
commit
c9c7b0cf7a
@ -12,7 +12,7 @@ repos:
|
||||
- id: check-xml
|
||||
- id: check-ast
|
||||
- id: check-added-large-files
|
||||
args: [--enforce-all]
|
||||
args: [--enforce-all --maxkb=50000]
|
||||
- id: name-tests-test
|
||||
- id: detect-private-key
|
||||
- id: check-case-conflict
|
||||
|
File diff suppressed because one or more lines are too long
@ -49,7 +49,6 @@ class MongoEntryTransformer:
|
||||
@staticmethod
|
||||
def transform_ingoing(news: News) -> dict:
|
||||
transport_object = news.dict()
|
||||
print(transport_object)
|
||||
transport_object["_id"] = news.id
|
||||
del transport_object["id"]
|
||||
return transport_object
|
||||
|
@ -18,7 +18,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -28,7 +28,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -50,7 +50,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -60,6 +60,61 @@
|
||||
"c:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\data\\Unternehmensregister\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 0%| | 0/4192 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Could not process A. Gradmann GmbH & Co. KG\n",
|
||||
"Could not process A + P Baumaschinen GmbH & Co. KG\n",
|
||||
"Could not process A. Sluka-Verwaltungsgesellschaft mit beschränkter Haftung\n",
|
||||
"Could not process A + W. Sahm Bedachungs-GmbH\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 0%| | 1/4192 [00:48<56:17:10, 48.35s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Could not process A + W. Sahm Bedachungs-GmbH\n",
|
||||
"Could not process A + M Verwaltungs-GmbH\n",
|
||||
"Could not process A. Frauenrath Landschaftsbau GmbH & Co. KG.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 0%| | 2/4192 [01:29<51:30:19, 44.25s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Could not process A. Frauenrath Landschaftsbau GmbH & Co. KG.\n",
|
||||
"Could not process a s k - Kunststoffe GmbH\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 0%| | 2/4192 [02:09<75:11:08, 64.60s/it]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "KeyboardInterrupt",
|
||||
"evalue": "",
|
||||
@ -67,8 +122,9 @@
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\notebook.ipynb Cell 5\u001b[0m in \u001b[0;36m3\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W4sZmlsZQ%3D%3D?line=26'>27</a>\u001b[0m options\u001b[39m.\u001b[39madd_experimental_option(\u001b[39m\"\u001b[39m\u001b[39mprefs\u001b[39m\u001b[39m\"\u001b[39m, preferences)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W4sZmlsZQ%3D%3D?line=28'>29</a>\u001b[0m driver \u001b[39m=\u001b[39m webdriver\u001b[39m.\u001b[39mChrome(options\u001b[39m=\u001b[39moptions)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W4sZmlsZQ%3D%3D?line=30'>31</a>\u001b[0m driver\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mhttps://www.unternehmensregister.de/ureg/\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W4sZmlsZQ%3D%3D?line=31'>32</a>\u001b[0m \u001b[39m# Accept Cookies\u001b[39;00m\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W4sZmlsZQ%3D%3D?line=32'>33</a>\u001b[0m driver\u001b[39m.\u001b[39mfind_elements(\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W4sZmlsZQ%3D%3D?line=33'>34</a>\u001b[0m By\u001b[39m.\u001b[39mXPATH, \u001b[39m'\u001b[39m\u001b[39m//button[text()=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mNur technisch notwendige Cookies akzeptieren\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m]\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W4sZmlsZQ%3D%3D?line=34'>35</a>\u001b[0m )[\u001b[39m0\u001b[39m]\u001b[39m.\u001b[39mclick()\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:355\u001b[0m, in \u001b[0;36mWebDriver.get\u001b[1;34m(self, url)\u001b[0m\n\u001b[0;32m 353\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mget\u001b[39m(\u001b[39mself\u001b[39m, url: \u001b[39mstr\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 354\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Loads a web page in the current browser session.\"\"\"\u001b[39;00m\n\u001b[1;32m--> 355\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mexecute(Command\u001b[39m.\u001b[39;49mGET, {\u001b[39m\"\u001b[39;49m\u001b[39murl\u001b[39;49m\u001b[39m\"\u001b[39;49m: url})\n",
|
||||
"\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\notebook.ipynb Cell 5\u001b[0m in \u001b[0;36m1\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W4sZmlsZQ%3D%3D?line=110'>111</a>\u001b[0m \u001b[39mfor\u001b[39;00m i \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(\u001b[39m6\u001b[39m):\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W4sZmlsZQ%3D%3D?line=111'>112</a>\u001b[0m driver\u001b[39m.\u001b[39mback()\n\u001b[1;32m--> <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W4sZmlsZQ%3D%3D?line=112'>113</a>\u001b[0m driver\u001b[39m.\u001b[39;49mfind_element(By\u001b[39m.\u001b[39;49mXPATH, \u001b[39m'\u001b[39;49m\u001b[39m//*[@class=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mfas fa-angle-right\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m]\u001b[39;49m\u001b[39m'\u001b[39;49m)\u001b[39m.\u001b[39;49mclick()\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W4sZmlsZQ%3D%3D?line=113'>114</a>\u001b[0m driver\u001b[39m.\u001b[39mclose()\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W4sZmlsZQ%3D%3D?line=114'>115</a>\u001b[0m \u001b[39mprint\u001b[39m(processed_companies)\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webelement.py:94\u001b[0m, in \u001b[0;36mWebElement.click\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 92\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mclick\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 93\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Clicks the element.\"\"\"\u001b[39;00m\n\u001b[1;32m---> 94\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_execute(Command\u001b[39m.\u001b[39;49mCLICK_ELEMENT)\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webelement.py:395\u001b[0m, in \u001b[0;36mWebElement._execute\u001b[1;34m(self, command, params)\u001b[0m\n\u001b[0;32m 393\u001b[0m params \u001b[39m=\u001b[39m {}\n\u001b[0;32m 394\u001b[0m params[\u001b[39m\"\u001b[39m\u001b[39mid\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_id\n\u001b[1;32m--> 395\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_parent\u001b[39m.\u001b[39;49mexecute(command, params)\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:344\u001b[0m, in \u001b[0;36mWebDriver.execute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m 341\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39msessionId\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m params:\n\u001b[0;32m 342\u001b[0m params[\u001b[39m\"\u001b[39m\u001b[39msessionId\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msession_id\n\u001b[1;32m--> 344\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcommand_executor\u001b[39m.\u001b[39;49mexecute(driver_command, params)\n\u001b[0;32m 345\u001b[0m \u001b[39mif\u001b[39;00m response:\n\u001b[0;32m 346\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror_handler\u001b[39m.\u001b[39mcheck_response(response)\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py:290\u001b[0m, in \u001b[0;36mRemoteConnection.execute\u001b[1;34m(self, command, params)\u001b[0m\n\u001b[0;32m 288\u001b[0m data \u001b[39m=\u001b[39m utils\u001b[39m.\u001b[39mdump_json(params)\n\u001b[0;32m 289\u001b[0m url \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_url\u001b[39m}\u001b[39;00m\u001b[39m{\u001b[39;00mpath\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m--> 290\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_request(command_info[\u001b[39m0\u001b[39;49m], url, body\u001b[39m=\u001b[39;49mdata)\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py:311\u001b[0m, in \u001b[0;36mRemoteConnection._request\u001b[1;34m(self, method, url, body)\u001b[0m\n\u001b[0;32m 308\u001b[0m body \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m 310\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mkeep_alive:\n\u001b[1;32m--> 311\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_conn\u001b[39m.\u001b[39;49mrequest(method, url, body\u001b[39m=\u001b[39;49mbody, headers\u001b[39m=\u001b[39;49mheaders)\n\u001b[0;32m 312\u001b[0m statuscode \u001b[39m=\u001b[39m response\u001b[39m.\u001b[39mstatus\n\u001b[0;32m 313\u001b[0m \u001b[39melse\u001b[39;00m:\n",
|
||||
|
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user