Veracity_AI/src/models/data.py

67 lines
2.1 KiB
Python

from urllib.parse import urlparse
from utils.webTextExtractor import WebTextExtractor
class TextData:
    """Hold a news text, its source URL and the classification outcome.

    The instance is a plain mutable record: ``url``, ``text``, ``result``,
    ``is_fake_news``, ``provider`` and ``confidence`` are public attributes.
    Text can be supplied directly or fetched lazily from ``url`` via
    :meth:`text_from_url`.
    """

    def __init__(self, url: str = "", text: str = "", result: str = "",
                 is_fake_news: bool = False, provider: str = "") -> None:
        """Store the initial values; nothing is fetched or parsed here.

        :param url: Source URL of the text (may be empty).
        :param text: The text itself (may be empty and fetched later).
        :param result: Prediction label shown by :meth:`get_output`.
        :param is_fake_news: Flag stored as given.
        :param provider: Provider name; overwritten by :meth:`extract_provider`.
        """
        self.url = url
        self.text = text
        self.result = result
        self.is_fake_news = is_fake_news
        self.provider = provider
        # Not assigned anywhere else in this class; get_output() treats None
        # as "no confidence available".
        self.confidence = None
        # Created on demand by text_from_url().
        self._extractor = None

    def set_url(self, url: str) -> None:
        """Replace the URL and drop everything that was derived from the old one.

        :param url: The new source URL.
        """
        self.url = url
        self.text = ""  # Reset text when URL changes
        self._extractor = None  # Reset extractor when URL changes

    def text_from_url(self) -> bool:
        """Make sure ``self.text`` is populated from ``self.url``.

        If text is already present, nothing is fetched. Otherwise a
        ``WebTextExtractor`` is created for the URL and its text is stored.

        :return: ``False`` when no URL is set, ``True`` otherwise.
        """
        if not self.url:
            return False
        if not self.text:
            # User-facing progress message (German: "Extracting text from URL...").
            print("Extrahiere Text von URL...")
            self._extractor = WebTextExtractor(self.url)
            self._extractor.fetch_content()
            self._extractor.extract_text()
            self.text = self._extractor.get_text()
        return True

    def get_output(self):
        """Format the prediction together with its confidence.

        :return: ``"Prediction: <result> Confidence: <confidence>"`` with the
            confidence shown to four decimals, or ``None`` when no confidence
            has been set.
        """
        # Identity check: None is a singleton, and `!= None` can be hijacked
        # by objects that overload comparison operators.
        if self.confidence is None:
            return None
        return f"Prediction: {self.result} Confidence: {self.confidence:.4f}"

    def get_provider(self) -> str:
        """Re-derive the provider from the current URL and return it.

        :return: The provider string stored by :meth:`extract_provider`.
        """
        self.extract_provider()
        return self.provider

    def extract_provider(self) -> None:
        """Derive the provider (last two host labels) from ``self.url``.

        The result is stored in ``self.provider``; nothing is returned.
        ``"Unknown"`` is stored when the URL is invalid or its host has fewer
        than two labels. Port, credentials and letter case of the host are
        ignored, so ``https://user@News.Example.com:8443/x`` yields
        ``"example.com"``.

        NOTE: multi-label public suffixes are not recognised; a host such as
        ``www.bbc.co.uk`` yields ``"co.uk"``.
        """
        if not self._is_valid_url(self.url):
            self.provider = "Unknown"
            # Stop here: parsing an invalid URL again could raise ValueError
            # or report a provider for a URL we just rejected.
            return

        # `hostname` (unlike `netloc`) excludes "user:pw@" and ":port" and is
        # lower-cased; it is None when the authority carries no host at all.
        hostname = urlparse(self.url).hostname or ""
        # Ignore empty labels produced by a trailing or doubled dot.
        labels = [label for label in hostname.split(".") if label]
        if len(labels) >= 2:
            self.provider = f"{labels[-2]}.{labels[-1]}"
        else:
            self.provider = "Unknown"

    def _is_valid_url(self, url: str) -> bool:
        """Check whether *url* has both a scheme and a network location.

        :param url: The URL to validate.
        :return: ``True`` if the URL is valid, ``False`` otherwise (including
            non-string input and URLs that ``urlparse`` rejects).
        """
        if not isinstance(url, str):
            return False
        try:
            parsed = urlparse(url)
        except ValueError:
            return False
        return bool(parsed.scheme and parsed.netloc)