55 lines
1.9 KiB
Python
55 lines
1.9 KiB
Python
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
class WebTextExtractor:
    """Download a web page and reduce it to a trimmed plain-text excerpt.

    Intended call order: ``fetch_content()`` -> ``extract_text()`` ->
    ``get_text()``. ``None`` in ``content`` / ``text`` means "step not run
    yet"; an empty value means the step ran but produced nothing.
    """

    def __init__(self, url):
        """Store the target URL; nothing is downloaded at construction time."""
        self.url = url
        # Raw response body (bytes), filled in by fetch_content().
        self.content = None
        # Processed plain text, filled in by extract_text().
        self.text = None

    def fetch_content(self, timeout=10):
        """Fetch the HTML content of the page at ``self.url``.

        Args:
            timeout: Seconds to wait for the server before giving up. Passed
                straight to ``requests.get``; without it a stalled server
                would block the caller indefinitely.

        Raises:
            Exception: If the server answers with a status other than 200.
            requests.exceptions.RequestException: On connection problems or
                when the timeout expires.
        """
        response = requests.get(self.url, timeout=timeout)
        if response.status_code != 200:
            raise Exception(f"Fehler beim Abrufen der Seite: {response.status_code}")
        self.content = response.content

    def extract_text(self):
        """Strip the HTML tags from the fetched content and store the excerpt.

        The result is written to ``self.text`` after being passed through
        ``resize_article`` with its default limits.

        Raises:
            Exception: If ``fetch_content()`` has not stored any content yet.
        """
        # Compare against None rather than truthiness: an empty body from a
        # successful fetch is valid input, not a "not fetched yet" state.
        if self.content is None:
            raise Exception("Kein Inhalt zum Parsen. Bitte zuerst fetch_content() aufrufen.")

        soup = BeautifulSoup(self.content, 'html.parser')
        # resize_article() splits on whitespace and re-joins with single
        # spaces, so it also collapses redundant spaces and line breaks.
        self.text = self.resize_article(soup.get_text())

    def get_text(self):
        """Return the extracted text.

        Returns:
            The text stored by ``extract_text()``. This may be an empty
            string when the page had no words left after trimming.

        Raises:
            Exception: If ``extract_text()`` has not been called yet.
        """
        # An empty string is a legitimate extraction result; only None means
        # that extraction never happened.
        if self.text is None:
            raise Exception("Kein Text extrahiert. Bitte zuerst extract_text() aufrufen.")
        return self.text

    def resize_article(self, article, skip_words=30, max_words=512):
        """Drop the leading words of ``article`` and cap its length.

        Words are whitespace-separated tokens; the result is joined with
        single spaces, so any run of whitespace in the input is collapsed.

        Args:
            article: The text to trim.
            skip_words: Number of leading words to discard (default 30).
            max_words: Maximum number of words to keep after skipping
                (default 512).

        Returns:
            The trimmed text, or an empty string if nothing remains.

        Raises:
            ValueError: If ``skip_words`` or ``max_words`` is negative.
        """
        if skip_words < 0 or max_words < 0:
            # Negative values would silently slice from the end of the list.
            raise ValueError("skip_words and max_words must be non-negative")

        words = article.split()
        # Slicing clamps to the list length, so no explicit min() is needed.
        kept = words[skip_words:skip_words + max_words]
        return ' '.join(kept)
|
|
|