# Veracity_AI/src/utils/webTextExtractor.py

import requests
from bs4 import BeautifulSoup

class WebTextExtractor:
    """Download a web page and reduce it to a trimmed plain-text excerpt.

    Typical usage is three steps: ``fetch_content()`` downloads the raw HTML,
    ``extract_text()`` strips the markup and trims the result, and
    ``get_text()`` returns the final string.
    """

    def __init__(self, url, timeout=10):
        """Store the target URL; no network access happens here.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up. Passed
                straight to ``requests.get``; ``None`` disables the limit.
        """
        self.url = url
        self.timeout = timeout
        # Raw response body (bytes) once fetch_content() has succeeded.
        self.content = None
        # Extracted plain text once extract_text() has succeeded.
        self.text = None

    def fetch_content(self):
        """Fetch the HTML content of the web page into ``self.content``.

        Raises:
            Exception: If the server answers with any status other than 200.
                Network errors from ``requests`` (including timeouts)
                propagate unchanged.
        """
        # Without a timeout requests would wait indefinitely on a stalled
        # server, so the configured limit is always passed along.
        response = requests.get(self.url, timeout=self.timeout)
        if response.status_code != 200:
            raise Exception(f"Fehler beim Abrufen der Seite: {response.status_code}")
        self.content = response.content

    def extract_text(self):
        """Extract the tag-free text from the fetched HTML into ``self.text``.

        Whitespace runs (including line breaks) are collapsed to single
        spaces, then the text is trimmed by ``resize_article``.

        Raises:
            Exception: If there is no fetched content to parse.
        """
        if not self.content:
            raise Exception("Kein Inhalt zum Parsen. Bitte zuerst fetch_content() aufrufen.")

        soup = BeautifulSoup(self.content, 'html.parser')
        # Collapse superfluous spaces and line breaks before trimming.
        collapsed = ' '.join(soup.get_text().split())
        self.text = self.resize_article(collapsed)

    def get_text(self):
        """Return the extracted text.

        An empty string is a valid result (for example when the page is
        shorter than the number of words skipped at the start), so only a
        missing extraction counts as an error.

        Raises:
            Exception: If ``extract_text()`` has not produced a result yet.
        """
        if self.text is None:
            raise Exception("Kein Text extrahiert. Bitte zuerst extract_text() aufrufen.")
        return self.text

    def resize_article(self, article, skip_words=30, max_words=512):
        """Drop the leading words of an article and cap its length.

        Args:
            article: Text to trim; it is split on any whitespace.
            skip_words: Number of leading words to discard (default 30).
                Expected to be non-negative.
            max_words: Maximum number of words to keep after skipping
                (default 512). Expected to be non-negative.

        Returns:
            The kept words joined by single spaces; an empty string when the
            article has no more than ``skip_words`` words.
        """
        words = article.split()
        # Slicing clamps to the list bounds, so short articles need no
        # explicit length check.
        kept = words[skip_words:skip_words + max_words]
        return ' '.join(kept)