updated url extraction

2024-10-08 08:24:51 +02:00 · 2024-10-08 08:24:51 +02:00 · a2787f4ae9
parent f353498fdf
commit a2787f4ae9
2 changed files with 19 additions and 2 deletions
--- a/docs/ui/ml_docs.md
+++ b/docs/ui/ml_docs.md
@@ -11,7 +11,7 @@ https://api.python.langchain.com/en/latest/llms/langchain_community.llms.ollama.
 1. LLM For our custom LLM
 ```json
 {
-    "model": "phi3.5:latest",
+    "model": "mistral-nemo:12b-instruct-2407-q8_0",
    "apiBase": "https://ai.fabelous.app/v1/ollama/generic",
    "headers": {"Authorization": "Token xxx"},
    "system": "System Message",
--- a/src/utils/webTextExtractor.py
+++ b/src/utils/webTextExtractor.py
@@ -22,6 +22,7 @@ class WebTextExtractor:
            self.text = soup.get_text()
            # Optional: Entferne überflüssige Leerzeichen und Zeilenumbrüche
            self.text = ' '.join(self.text.split())
+            self.text = self.resize_article(self.text)
        else:
            raise Exception("Kein Inhalt zum Parsen. Bitte zuerst fetch_content() aufrufen.")
    
@@ -31,6 +32,22 @@ class WebTextExtractor:
            return self.text 
        else:
            raise Exception("Kein Text extrahiert. Bitte zuerst extract_text() aufrufen.")
+    
+    def resize_article(self, article):
+        """Resizes the article by removing the first 30 words and the last 10%."""
+        # Split the article into a list of words
+        words = article.split()

+        # Remove the first 30 words
+        words = words[30:]

-# Beispielaufruf
+        # Calculate the number of words to remove from the end (10% of the total words)
+        num_to_remove = int(len(words) * 0.1)
+
+        # Remove the last 10% of words
+        words = words[:-num_to_remove]
+
+        # Join the remaining words back into a single string
+        resized_article = ' '.join(words)
+
+        return resized_article