import requests
from bs4 import BeautifulSoup


class WebTextExtractor:
    """Download a web page and reduce it to a trimmed plain-text excerpt.

    Intended call order: ``fetch_content()``, then ``extract_text()``, then
    ``get_text()``.

    Attributes:
        url: Address of the page to download.
        content: Raw response body, or ``None`` until a fetch has succeeded.
        text: Extracted excerpt, or ``None`` until ``extract_text()`` has run.
    """

    def __init__(self, url: str) -> None:
        """Store the target URL; no network access happens here."""
        self.url = url
        self.content = None
        self.text = None

    def fetch_content(self, timeout: float = 10.0) -> None:
        """Fetch the HTML content of the web page into ``self.content``.

        Args:
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` may block indefinitely on a stalled
                connection.

        Raises:
            Exception: If the response status code is not 200.
            requests.RequestException: On connection problems or timeout.
        """
        response = requests.get(self.url, timeout=timeout)
        if response.status_code != 200:
            raise Exception(f"Fehler beim Abrufen der Seite: {response.status_code}")
        self.content = response.content

    def extract_text(self) -> None:
        """Strip the HTML tags from the fetched content and store the excerpt.

        The tag-free text is passed through ``resize_article``, which also
        collapses all runs of whitespace to single spaces. The result is
        stored in ``self.text`` and may be an empty string for short pages.

        Raises:
            Exception: If ``fetch_content()`` has not stored any content yet.
        """
        # Compare against None rather than truthiness: an empty response body
        # is still fetched content and must not be reported as "not fetched".
        if self.content is None:
            raise Exception(
                "Kein Inhalt zum Parsen. Bitte zuerst fetch_content() aufrufen."
            )
        soup = BeautifulSoup(self.content, 'html.parser')
        # resize_article splits on whitespace and re-joins with single spaces,
        # so no separate normalisation pass is needed beforehand.
        self.text = self.resize_article(soup.get_text())

    def get_text(self) -> str:
        """Return the extracted text.

        Returns:
            The excerpt produced by ``extract_text()``; this can be an empty
            string when the page had no words left after trimming.

        Raises:
            Exception: If ``extract_text()`` has not been run yet.
        """
        # An empty excerpt is a valid result; only None means "not extracted".
        if self.text is None:
            raise Exception(
                "Kein Text extrahiert. Bitte zuerst extract_text() aufrufen."
            )
        return self.text

    def resize_article(
        self, article: str, skip_words: int = 30, max_words: int = 512
    ) -> str:
        """Drop the leading words of an article and cap its length.

        Args:
            article: Text to trim; words are separated by any whitespace.
            skip_words: Number of leading words to discard (default 30).
            max_words: Maximum number of words to keep afterwards
                (default 512).

        Returns:
            The kept words joined by single spaces; an empty string when the
            article has no more than ``skip_words`` words.

        Raises:
            ValueError: If ``skip_words`` or ``max_words`` is negative.
        """
        if skip_words < 0 or max_words < 0:
            raise ValueError("skip_words and max_words must be non-negative")
        words = article.split()
        # Slicing clamps to the list bounds, so short articles need no
        # explicit length check.
        return ' '.join(words[skip_words:skip_words + max_words])