# News

## Tagesschau API

In [8]:
import json
import requests
from bs4 import BeautifulSoup


class TagesschauAPI:
    """Small client for the tagesschau.de ``api2`` endpoints used in this notebook.

    Every HTTP call goes through ``_get`` so that a timeout is always applied
    and HTTP error statuses raise instead of being parsed as if they were data.
    """

    # Region ids sent with every news query. The public API documents ids
    # 1..16 (one per German federal state); ``range(1, 16)`` would stop at 15
    # and silently drop the last region.
    REGION_IDS = range(1, 17)

    def __init__(self, timeout: float = 10.0):
        """Store the API root URL and the per-request timeout.

        Args:
            timeout: Seconds handed to ``requests.get`` for every call made
                by this client.
        """
        self.base_url = "https://www.tagesschau.de/api2"
        self.timeout = timeout

    def _get(self, url: str, params=None):
        """GET ``url`` with the configured timeout and raise on HTTP errors.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        response = requests.get(url=url, params=params, timeout=self.timeout)
        response.raise_for_status()
        return response

    def get_news_for_sector(self, sector: str) -> dict:
        """Return the decoded JSON of ``/news/`` for one ressort.

        Args:
            sector: Value sent as the ``ressort`` query parameter
                (e.g. ``"wirtschaft"``).

        Returns:
            The parsed JSON body of the response.
        """
        url = f"{self.base_url}/news/"
        regions = ",".join(str(region_id) for region_id in self.REGION_IDS)
        result = self._get(url, params={"regions": regions, "ressort": sector})
        return result.json()

    def custom_search(self, query: str) -> dict:
        """Return the decoded JSON of ``/search/`` for a free-text query.

        Args:
            query: Text sent as the ``searchText`` query parameter.

        Returns:
            The parsed JSON body of the response.
        """
        url = f"{self.base_url}/search/"
        result = self._get(url, params={"searchText": query})
        return result.json()

    def get_news_details_text(self, url: str) -> str:
        """Download an article page and return its paragraph text as one string.

        Args:
            url: Address of the HTML page to scrape.

        Returns:
            The text of every ``<p>`` element except the first, with newlines
            replaced by spaces, joined by single spaces.
        """
        content = self._get(url)
        soup = BeautifulSoup(content.text, features="html.parser")

        # The first <p> is dropped on purpose, as in the original code.
        # NOTE(review): presumably it is not part of the article body - confirm.
        paragraphs = [elem.text.replace("\n", " ") for elem in soup.find_all("p")][1:]
        return " ".join(paragraphs)


# Shared client instance reused by the cells below.
tagesschau = TagesschauAPI()

# Fetch the "wirtschaft" ressort and keep a raw snapshot of the response on disk.
data = tagesschau.get_news_for_sector(sector="wirtschaft")
with open("./data/temp.json", "w+", encoding="utf-8") as file:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    file.write(json.dumps(data, ensure_ascii=False))

In [9]:
from tqdm import tqdm
import pandas as pd


# One flat record per article in the API response; "text" is scraped from the
# article's web page ("detailsweb"), so this performs one HTTP request per item.
news = [
    {
        "id": article["externalId"],
        "title": article["title"],
        "date": article["date"],
        "text": tagesschau.get_news_details_text(article["detailsweb"]),
    }
    for article in tqdm(data["news"])
]

df = pd.DataFrame(news)
df.head()

100%|██████████| 50/50 [00:15<00:00,  3.33it/s]


Unnamed: 0,id,title,date,text
0,873f8c93-c996-4e08-a077-7f2b182197aa,Netzagentur versteigert Flächen für Offshore-W...,2023-06-15T19:24:04.940+02:00,Mehrere Unternehmen bewerben sich um Flächen ...
1,08a99fad-b0be-4481-ac87-b71a8eeeb95e,Die neue Asien-Strategie von Siemens,2023-06-15T19:23:48.142+02:00,Der Siemens-Konzern hat neue Investitionen in...
2,5cc61bcb-d290-4114-b608-0d5aba426f27,DAX zeigt Stärke,2023-06-15T18:26:25.866+02:00,Der DAX hat die heutige Zinserhöhung erstaunl...
3,836120ce-9602-4296-9b04-8eff4da34be5,Befeuert Beyoncé die schwedische Inflation?,2023-06-15T16:31:26.141+02:00,Die Inflationsrate in Schweden ist zuletzt ni...
4,091386f3-78b5-4180-8d5b-7c5b371aed93,Leitzins in der Eurozone steigt auf vier Prozent,2023-06-15T14:16:40.611+02:00,Die Europäische Zentralbank hat den Leitzins ...


In [10]:
# Persist the article table as a JSON array with one object per row;
# force_ascii=False keeps non-ASCII characters unescaped in the file.
df.to_json(
    path_or_buf="./data/news.json",
    force_ascii=False,
    orient="records",
)

In [60]:
# Free-text search against the API; the raw response is shown as the cell output.
custom_search_data = tagesschau.custom_search(query="Haltern am See")
custom_search_data

{'details': 'https://www.tagesschau.de/api2u/search',
 'searchText': 'Haltern am See',
 'pageSize': 25,
 'resultPage': 0,
 'totalItemCount': 3,
 'searchResults': [{'sophoraId': 'wdr-erneut-rettungsboot-motoren-der-dlrg-in-haltern-geklaut-100',
   'externalId': 'tagesschau_fm-story-WDR-c6323d1e-c262-4414-946c-e063dc4b9791',
   'title': 'Erneut Rettungsboot-Motoren der DLRG in Haltern geklaut',
   'date': '2023-06-14T13:26:00.000+02:00',
   'teaserImage': {'title': 'Diebe klauen Rettungsboot-Motoren der DLRG Haltern',
    'copyright': 'WDR/Markus Holtrichter',
    'alttext': 'Ein Mitglied der DLRG Haltern rudert im Paddelboot.',
    'imageVariants': {'1x1-144': 'https://images.tagesschau.de/image/c77cdc33-f678-4a0e-b411-3e98039c67d3/AAABiLnqOsk/AAABg8tMNQQ/1x1-144.jpg',
     '1x1-256': 'https://images.tagesschau.de/image/c77cdc33-f678-4a0e-b411-3e98039c67d3/AAABiLnqOsk/AAABg8tMOLk/1x1-256.jpg',
     '1x1-432': 'https://images.tagesschau.de/image/c77cdc33-f678-4a0e-b411-3e98039c67d3/AAABi