updated url extraction
This commit is contained in:
parent
f353498fdf
commit
a2787f4ae9
|
@ -11,7 +11,7 @@ https://api.python.langchain.com/en/latest/llms/langchain_community.llms.ollama.
|
|||
1. LLM For our custom LLM
|
||||
```json
|
||||
{
|
||||
"model": "phi3.5:latest",
|
||||
"model": "mistral-nemo:12b-instruct-2407-q8_0",
|
||||
"apiBase": "https://ai.fabelous.app/v1/ollama/generic",
|
||||
"headers": {"Authorization": "Token xxx"},
|
||||
"system": "System Message",
|
||||
|
|
|
@ -22,6 +22,7 @@ class WebTextExtractor:
|
|||
self.text = soup.get_text()
|
||||
# Optional: Entferne überflüssige Leerzeichen und Zeilenumbrüche
|
||||
self.text = ' '.join(self.text.split())
|
||||
self.text = self.resize_article(self.text)
|
||||
else:
|
||||
raise Exception("Kein Inhalt zum Parsen. Bitte zuerst fetch_content() aufrufen.")
|
||||
|
||||
|
@ -31,6 +32,22 @@ class WebTextExtractor:
|
|||
return self.text
|
||||
else:
|
||||
raise Exception("Kein Text extrahiert. Bitte zuerst extract_text() aufrufen.")
|
||||
|
||||
def resize_article(self, article):
    """Trim likely boilerplate from both ends of an article.

    Drops the first 30 words, then drops the trailing 10% (rounded down)
    of the words that remain.

    Args:
        article: The article text. It is split on any run of whitespace.

    Returns:
        The remaining words joined by single spaces. An empty string is
        returned when the article has 30 words or fewer.
    """
    # Split into words and discard the first 30 in one step.
    words = article.split()[30:]

    # Number of trailing words to drop: 10% of what is left, rounded down.
    num_to_remove = int(len(words) * 0.1)

    # Slice with an explicit end index. A negative-index slice such as
    # words[:-num_to_remove] collapses to words[:0] (an empty list) when
    # num_to_remove is 0, which would discard short articles entirely.
    words = words[:len(words) - num_to_remove]

    # Join the remaining words back into a single string.
    return ' '.join(words)
|
||||
|
|
Loading…
Reference in New Issue