from typing import Optional
from urllib.parse import urlparse

from utils.webTextExtractor import WebTextExtractor


class TextData:
    """Container for a news text and its fake-news classification result.

    The text may be supplied directly or lazily fetched from ``url`` via
    :class:`WebTextExtractor`. ``confidence`` is expected to be assigned
    externally after classification (it is not set by any method here).
    """

    def __init__(self, url: str = "", text: str = "", result: str = "",
                 is_fake_news: bool = False, provider: str = "") -> None:
        self.url = url
        self.text = text
        self.result = result
        self.is_fake_news = is_fake_news
        self.provider = provider
        # Classification confidence; None until set by the caller.
        self.confidence: Optional[float] = None
        # Lazily-created WebTextExtractor, bound to the current URL.
        self._extractor = None

    def set_url(self, url: str) -> None:
        """Set a new URL and invalidate any state derived from the old one."""
        self.url = url
        self.text = ""  # Reset text when URL changes
        self._extractor = None  # Reset extractor when URL changes

    def text_from_url(self) -> bool:
        """Fetch and extract the article text behind ``self.url``.

        :return: False when no URL is set, True otherwise. The text is
            fetched only once — later calls reuse the cached ``self.text``.
        """
        if not self.url:
            return False
        if not self.text:
            print("Extrahiere Text von URL...")
            self._extractor = WebTextExtractor(self.url)
            self._extractor.fetch_content()
            self._extractor.extract_text()
            self.text = self._extractor.get_text()
        return True

    def get_output(self) -> Optional[str]:
        """Return a human-readable result string, or None if no confidence set.

        Previously the "no confidence" path fell off the end of the function
        (implicit None) and the check used ``!= None``; behavior is kept but
        made explicit and idiomatic.
        """
        if self.confidence is None:
            return None
        return f"Prediction: {self.result}" + f" Confidence: {self.confidence:.4f}"

    def get_provider(self) -> str:
        """Extract (if needed) and return the provider domain for the URL."""
        self.extract_provider()
        return self.provider

    def extract_provider(self) -> None:
        """Extract the domain (provider) from ``self.url`` into ``self.provider``.

        Sets ``self.provider`` to "Unknown" when the URL is invalid or its
        host has fewer than two dot-separated parts.
        """
        if not self._is_valid_url(self.url):
            self.provider = "Unknown"
            # BUG FIX: the original fell through here and re-parsed the
            # invalid URL, overwriting the "Unknown" just assigned.
            return
        parsed_url = urlparse(self.url)
        domain_parts = parsed_url.netloc.split('.')
        # Keep only the registrable-looking tail, e.g. "example.com".
        # NOTE(review): multi-part TLDs ("example.co.uk") yield "co.uk";
        # preserved as-is since callers may rely on it.
        self.provider = f"{domain_parts[-2]}.{domain_parts[-1]}" if len(domain_parts) >= 2 else "Unknown"

    def _is_valid_url(self, url: str) -> bool:
        """Return True if *url* has both a scheme and a network location."""
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except ValueError:
            return False