Veracity_AI/src/utils/webTextExtractor.py

55 lines
1.9 KiB
Python

import requests
from bs4 import BeautifulSoup
class WebTextExtractor:
    """Download a web page and reduce it to a bounded plain-text excerpt.

    Intended call order: ``fetch_content()`` -> ``extract_text()`` ->
    ``get_text()``. Each step raises if the previous one has not produced
    a usable (non-empty) result.
    """

    def __init__(self, url: str) -> None:
        """Remember the target URL; no network access happens here."""
        self.url = url
        # Raw response body (bytes) once fetch_content() has succeeded.
        self.content = None
        # Whitespace-normalised, resized text once extract_text() has run.
        self.text = None

    def fetch_content(self, timeout: float = 10) -> None:
        """Fetch the HTML content of ``self.url`` into ``self.content``.

        Args:
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests`` may block indefinitely on
                an unresponsive host.

        Raises:
            Exception: If the response status code is anything other
                than 200.

        Errors raised by ``requests.get`` itself (connection failure,
        timeout, ...) are not caught here and propagate to the caller.
        """
        response = requests.get(self.url, timeout=timeout)
        if response.status_code != 200:
            raise Exception(f"Fehler beim Abrufen der Seite: {response.status_code}")
        self.content = response.content

    def extract_text(self) -> None:
        """Strip HTML tags from the fetched content and store the text.

        The result is whitespace-normalised and then shortened by
        ``resize_article`` before being stored in ``self.text``.

        Raises:
            Exception: If no (or empty) content is available, i.e.
                ``fetch_content()`` has not succeeded beforehand.
        """
        if not self.content:
            raise Exception("Kein Inhalt zum Parsen. Bitte zuerst fetch_content() aufrufen.")
        soup = BeautifulSoup(self.content, 'html.parser')
        # Collapse runs of spaces and line breaks into single spaces.
        normalised = ' '.join(soup.get_text().split())
        self.text = self.resize_article(normalised)

    def get_text(self) -> str:
        """Return the extracted text.

        Raises:
            Exception: If ``self.text`` is unset or empty. Note that an
                empty result (e.g. a page with too few words to survive
                ``resize_article``) is reported the same way as
                "``extract_text()`` was never called".
        """
        if not self.text:
            raise Exception("Kein Text extrahiert. Bitte zuerst extract_text() aufrufen.")
        return self.text

    def resize_article(self, article: str, skip_words: int = 30, max_words: int = 512) -> str:
        """Drop the leading words of ``article`` and cap its length.

        Args:
            article: Text to shorten; words are split on any whitespace.
            skip_words: Number of leading words to discard (default 30).
            max_words: Maximum number of words to keep after skipping
                (default 512).

        Returns:
            The kept words joined by single spaces; an empty string if
            the article has ``skip_words`` words or fewer.

        Raises:
            ValueError: If ``skip_words`` or ``max_words`` is negative.
        """
        if skip_words < 0 or max_words < 0:
            # Negative bounds would silently slice from the end instead.
            raise ValueError("skip_words and max_words must be non-negative")
        words = article.split()
        # Slicing clamps to the list length, so no explicit min() is needed.
        return ' '.join(words[skip_words:skip_words + max_words])