67 lines
2.1 KiB
Python
67 lines
2.1 KiB
Python
from urllib.parse import urlparse
|
|
|
|
|
|
from utils.webTextExtractor import WebTextExtractor
|
|
|
|
class TextData:
    """Hold one text sample together with its classification result.

    An instance stores the source URL, the text (either passed in or
    extracted from the URL on demand), the prediction label, an optional
    confidence score, and the provider (registrable-looking domain) derived
    from the URL.
    """

    def __init__(self, url: str = "", text: str = "", result: str = "",
                 is_fake_news: bool = False, provider: str = "") -> None:
        """Store the given values; confidence and extractor start unset.

        :param url: Source URL of the text, may be empty.
        :param text: Text content, may be empty and filled in later by
            ``text_from_url``.
        :param result: Prediction label used by ``get_output``.
        :param is_fake_news: Boolean classification flag.
        :param provider: Provider name; overwritten by ``extract_provider``.
        """
        self.url = url
        self.text = text
        self.result = result
        self.is_fake_news = is_fake_news
        self.provider = provider
        # Set by external code once a prediction has been made; None means
        # "no confidence available".
        self.confidence = None
        # Created lazily in text_from_url().
        self._extractor = None

    def set_url(self, url: str) -> None:
        """Replace the URL and drop state derived from the previous one.

        :param url: The new source URL.
        """
        self.url = url
        self.text = ""  # Reset text when URL changes
        self._extractor = None  # Reset extractor when URL changes

    def text_from_url(self) -> bool:
        """Make sure ``self.text`` is populated from ``self.url``.

        If text is already present nothing is fetched. Otherwise a
        ``WebTextExtractor`` is created for the URL and its
        ``fetch_content``, ``extract_text`` and ``get_text`` methods are
        called in that order; the result is stored in ``self.text``.

        :return: False when no URL is set, True otherwise.
        """
        if not self.url:
            return False

        if not self.text:
            print("Extrahiere Text von URL...")
            self._extractor = WebTextExtractor(self.url)
            self._extractor.fetch_content()
            self._extractor.extract_text()
            self.text = self._extractor.get_text()
        return True

    def get_output(self) -> str:
        """Build a one-line, human-readable summary of the prediction.

        The confidence part is appended only when a confidence value has
        been set, so the method always returns a string instead of failing
        or yielding None when ``self.confidence`` is still None.

        :return: ``"Prediction: <result>"``, followed by
            ``" Confidence: <value>"`` (four decimals) when available.
        """
        output = f"Prediction: {self.result}"
        if self.confidence is not None:
            output += f" Confidence: {self.confidence:.4f}"
        return output

    def get_provider(self) -> str:
        """Recompute the provider from the current URL and return it.

        :return: The provider string stored by ``extract_provider``.
        """
        self.extract_provider()
        return self.provider

    def extract_provider(self) -> None:
        """Derive the provider (last two host labels) from ``self.url``.

        The result is stored in ``self.provider``; nothing is returned.
        ``"Unknown"`` is stored when the URL is invalid or its host has
        fewer than two dot-separated labels (e.g. ``localhost``).
        """
        if not self._is_valid_url(self.url):
            self.provider = "Unknown"
            # Stop here: parsing an invalid URL again could raise ValueError
            # or overwrite the "Unknown" marker.
            return

        # hostname (unlike netloc) excludes credentials and port, and is
        # lower-cased by urllib; it is None when no host is present.
        host = urlparse(self.url).hostname or ""
        domain_parts = host.split(".")
        if len(domain_parts) >= 2:
            self.provider = f"{domain_parts[-2]}.{domain_parts[-1]}"
        else:
            self.provider = "Unknown"

    def _is_valid_url(self, url: str) -> bool:
        """Check whether a URL has both a scheme and a network location.

        :param url: The URL to validate.
        :return: True if the URL is valid, False otherwise (including when
            ``urlparse`` rejects it with ValueError).
        """
        try:
            result = urlparse(url)
        except ValueError:
            return False
        return bool(result.scheme and result.netloc)