Added longitude/latitude and positional accuracy to the company data (#180)

This commit is contained in:
2023-10-02 17:18:04 +02:00
committed by GitHub
parent c96462532b
commit 05472cc16a
7 changed files with 6019 additions and 5911 deletions

794
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -33,7 +33,6 @@ description = "Data Integration, Extraction, and Visualization using Text Mining
documentation = "https://cuddly-waffle-r416zgy.pages.github.io/"
homepage = "https://cuddly-waffle-r416zgy.pages.github.io/"
keywords = ["deutschland", "economy", "transparenzregister", "dataintegration", "handelsregister"]
maintainers = [
"Philipp Horstenkamp <philipp@horstenkamp.de>",
"Tristan Nolde <contact@trisnol.dev>",
@ -49,8 +48,8 @@ repository = "https://github.com/fhswf/aki_prj23_transparenzregister"
version = "0.1.0"
[tool.poetry.dependencies]
aenum = "^3.1.15"
SQLAlchemy = "^1.4.49"
aenum = "^3.1.15"
cachetools = "^5.3.1"
dash = "^2.13.0"
dash-auth = "^2.0.0"
@ -58,6 +57,7 @@ dash-bootstrap-components = "^1.5.0"
deutschland = {git = "https://github.com/TrisNol/deutschland.git", branch = "hotfix/python-3.11-support"}
loguru = "^0.7.0"
matplotlib = "^3.7.2"
pgeocode = "^0.4.1"
psycopg2-binary = "^2.9.7"
pymongo = "^4.5.0"
python = "^3.11"

View File

@ -1,7 +1,9 @@
"""This module contains the data transfer and refinement functionalities between staging and production DB."""
from datetime import date
from functools import lru_cache
from typing import Any
import pgeocode
import sqlalchemy as sa
from cachetools import LRUCache, cached
from loguru import logger
@ -22,6 +24,8 @@ from aki_prj23_transparenzregister.utils.sql.connector import (
)
from aki_prj23_transparenzregister.utils.string_tools import simplify_string
nomi = pgeocode.Nominatim("de")
class DataInvalidError(ValueError):
"""This error is thrown if a db entry can't be parsed for the production db."""
@ -192,6 +196,26 @@ def get_company_id(
return company_id
@lru_cache(1000)
def get_geocodes(
zip_code: str,
) -> dict[str, float]:
"""Adds additional geo positioning data to locations.
Args:
zip_code: The zipcode where the company is located.
Returns:
latitude, longitude and positional accuracy.
"""
if not zip_code:
return {}
zip_query = nomi.query_postal_code(zip_code)[["latitude", "longitude", "accuracy"]]
if zip_query.isna().any():
return {}
return dict(zip_query[["latitude", "longitude"]], pos_accuracy=zip_query.accuracy)
@logger.catch(level="WARNING", reraise=True)
def add_company(company: dict[str, Any], db: Session) -> None:
"""Add a company with all its data found in the mongodb company entry.
@ -218,6 +242,7 @@ def add_company(company: dict[str, Any], db: Session) -> None:
zip_code=simplify_string(location.get("zip_code")),
street=simplify_string(location.get("street")),
last_update=last_update,
**get_geocodes(location.get("zip_code")), # type: ignore
)
db.add(company_entry)
db.commit()

View File

@ -44,6 +44,10 @@ class Company(Base):
street = sa.Column(sa.String(100), nullable=True)
zip_code = sa.Column(sa.String(5), nullable=True)
city = sa.Column(sa.String(100), nullable=True)
longitude = sa.Column(sa.Float, nullable=True)
latitude = sa.Column(sa.Float, nullable=True)
pos_accuracy = sa.Column(sa.Float, nullable=True)
last_update = sa.Column(sa.Date, nullable=False)
sector = sa.Column(sa.String(100), nullable=True)

View File

@ -144,18 +144,24 @@ def full_db(empty_db: Session, finance_statements: list[dict[str, Any]]) -> Sess
court_id=2,
name="Some Company GmbH",
street="Sesamstr.",
zip_code="12345",
zip_code="58644",
city="TV City",
last_update=datetime.date.fromisoformat("2023-01-01"),
latitude=51.3246,
longitude=7.6968,
pos_accuracy=4.0,
),
entities.Company(
hr="HRB 123",
court_id=1,
name="Other Company GmbH",
street="Sesamstr.",
zip_code="12345",
zip_code="58636",
city="TV City",
last_update=datetime.date.fromisoformat("2023-01-01"),
latitude=51.38,
longitude=7.7032,
pos_accuracy=4.0,
),
entities.Company(
hr="HRB 12",

View File

@ -26,8 +26,11 @@ def test_get_company_data(full_db: Session) -> None:
2: "Third Company GmbH",
},
"company_street": {0: "Sesamstr.", 1: "Sesamstr.", 2: None},
"company_zip_code": {0: "12345", 1: "12345", 2: None},
"company_zip_code": {0: "58644", 1: "58636", 2: None},
"company_city": {0: "TV City", 1: "TV City", 2: None},
"company_longitude": {0: 7.6968, 1: 7.7032, 2: None},
"company_latitude": {0: 51.3246, 1: 51.38, 2: None},
"company_pos_accuracy": {0: 4.0, 1: 4.0, 2: None},
"company_last_update": {
0: "2023-01-01",
1: "2023-01-01",

View File

@ -177,12 +177,12 @@ def test_get_person_id_value_check(
("name", "zip_code", "city", "id"),
[
("Some Company GmbH", "", "", 1),
("Some Company GmbH", "12345", "", 1),
("Some Company GmbH", "12345", "TV City", 1),
("Some Company GmbH", "58644", "", 1),
("Some Company GmbH", "58644", "TV City", 1),
("Some Company GmbH", "", "TV City", 1),
("Other Company GmbH", "", "", 2),
("Other Company GmbH", "12345", "", 2),
("Other Company GmbH", "12345", "TV City", 2),
("Other Company GmbH", "58636", "", 2),
("Other Company GmbH", "58636", "TV City", 2),
("Other Company GmbH", "", "TV City", 2),
("Third Company GmbH", "", "", 3),
],
@ -672,8 +672,11 @@ def test_relationships(documents: list[dict[str, Any]], full_db: Session) -> Non
2: "Third Company GmbH",
},
"street": {0: "Sesamstr.", 1: "Sesamstr.", 2: None},
"zip_code": {0: "12345", 1: "12345", 2: None},
"zip_code": {0: "58644", 1: "58636", 2: None},
"city": {0: "TV City", 1: "TV City", 2: None},
"longitude": {0: 7.6968, 1: 7.7032, 2: None},
"latitude": {0: 51.3246, 1: 51.38, 2: None},
"pos_accuracy": {0: 4.0, 1: 4.0, 2: None},
"last_update": {
0: pd.Timestamp("2023-01-01 00:00:00"),
1: pd.Timestamp("2023-01-01 00:00:00"),
@ -1014,3 +1017,18 @@ def test_add_annual_report_financial_key_error(full_db: Session) -> None:
{"financials": {"something-strange": 123.12}, "auditors": {}},
db=full_db,
)
@pytest.mark.working_on()
@pytest.mark.parametrize(
    ("zip_code", "results"),
    [
        ("44809", {"latitude": 51.4997, "longitude": 7.1944, "pos_accuracy": 4.0}),
        (None, {}),
        ("", {}),
        ("60547", {}),
        ("58590", {}),
    ],
)
def test_get_geocodes(zip_code: str | None, results: dict) -> None:
    """Tests the geo lookup for a resolvable zip code, missing input and zip codes expected to give no result."""
    geocodes = data_transfer.get_geocodes(zip_code)
    assert geocodes == results