test_model #13
|
@ -0,0 +1,47 @@
|
|||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
class WebTextExtractor:
    """Download a web page and reduce it to whitespace-normalized plain text.

    Intended call order: ``fetch_content()`` downloads the page,
    ``extract_text()`` strips the markup, ``get_text()`` returns the result.
    Calling a step before its predecessor raises ``RuntimeError``.
    """

    def __init__(self, url: str) -> None:
        """Store the target URL; nothing is downloaded at construction time."""
        self.url = url
        # Raw response body (bytes); stays None until fetch_content() succeeds.
        self.content = None
        # Extracted plain text (str); stays None until extract_text() succeeds.
        self.text = None

    def fetch_content(self, *, timeout: float = 10.0) -> None:
        """Fetch the HTML body of ``self.url`` and store it in ``self.content``.

        Args:
            timeout: Seconds passed to ``requests.get`` as its timeout, so a
                stalled server cannot block the caller indefinitely.

        Raises:
            RuntimeError: If the server answers with a status other than 200.
        """
        response = requests.get(self.url, timeout=timeout)
        if response.status_code != 200:
            raise RuntimeError(f"Fehler beim Abrufen der Seite: {response.status_code}")
        self.content = response.content

    def extract_text(self) -> None:
        """Parse ``self.content`` and store its tag-free text in ``self.text``.

        Runs of whitespace (including line breaks) are collapsed to single
        spaces and leading/trailing whitespace is dropped.

        Raises:
            RuntimeError: If no content has been fetched yet.
        """
        # Compare against None rather than truthiness: an empty body is still
        # a fetched body and must not be reported as "fetch_content() missing".
        if self.content is None:
            raise RuntimeError("Kein Inhalt zum Parsen. Bitte zuerst fetch_content() aufrufen.")
        soup = BeautifulSoup(self.content, 'html.parser')
        # split() with no argument splits on any whitespace run, so the join
        # yields single-space-separated text.
        self.text = ' '.join(soup.get_text().split())

    def get_text(self) -> str:
        """Return the extracted text.

        Raises:
            RuntimeError: If ``extract_text()`` has not produced a result yet.
        """
        # An empty string is a valid extraction result (page without text),
        # so only None means "not extracted yet".
        if self.text is None:
            raise RuntimeError("Kein Text extrahiert. Bitte zuerst extract_text() aufrufen.")
        return self.text
|
||||
|
||||
|
||||
# Example run: download one Wikipedia article and print its plain text.
if __name__ == "__main__":
    scraper = WebTextExtractor("https://de.wikipedia.org/wiki/Lineare_Algebra")

    # Order matters: the page must be downloaded before its markup is stripped.
    scraper.fetch_content()
    scraper.extract_text()

    print(scraper.get_text())
|
Reference in New Issue