updated url extraction

This commit is contained in:
Falko Victor Habel 2024-10-08 08:24:51 +02:00
parent f353498fdf
commit a2787f4ae9
2 changed files with 19 additions and 2 deletions

View File

@ -11,7 +11,7 @@ https://api.python.langchain.com/en/latest/llms/langchain_community.llms.ollama.
1. LLM For our custom LLM
```json
{
"model": "phi3.5:latest",
"model": "mistral-nemo:12b-instruct-2407-q8_0",
"apiBase": "https://ai.fabelous.app/v1/ollama/generic",
"headers": {"Authorization": "Token xxx"},
"system": "System Message",

View File

@ -22,6 +22,7 @@ class WebTextExtractor:
self.text = soup.get_text()
# Optional: Entferne überflüssige Leerzeichen und Zeilenumbrüche
self.text = ' '.join(self.text.split())
self.text = self.resize_article(self.text)
else:
raise Exception("Kein Inhalt zum Parsen. Bitte zuerst fetch_content() aufrufen.")
@ -31,6 +32,22 @@ class WebTextExtractor:
return self.text
else:
raise Exception("Kein Text extrahiert. Bitte zuerst extract_text() aufrufen.")
def resize_article(self, article):
    """Trim leading and trailing words from an article.

    The text is split on whitespace, the first 30 words are dropped, and
    then the last 10% of the *remaining* words (rounded down) are dropped.
    The surviving words are re-joined with single spaces.

    Args:
        article: The article text to trim.

    Returns:
        The trimmed text. An empty string is returned when the article has
        30 words or fewer.
    """
    # Drop the first 30 words; slicing past the end simply yields [].
    words = article.split()[30:]

    # Number of trailing words to drop: 10% of what is left, rounded down.
    num_to_remove = int(len(words) * 0.1)

    # Slice with an explicit end index. The former ``words[:-num_to_remove]``
    # evaluates to ``words[:0]`` (an empty list) when num_to_remove is 0,
    # which wiped out short articles entirely.
    words = words[:len(words) - num_to_remove]

    return ' '.join(words)