updated url extraction
This commit is contained in:
parent
f353498fdf
commit
a2787f4ae9
|
@ -11,7 +11,7 @@ https://api.python.langchain.com/en/latest/llms/langchain_community.llms.ollama.
|
||||||
1. LLM For our custom LLM
|
1. LLM For our custom LLM
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "phi3.5:latest",
|
"model": "mistral-nemo:12b-instruct-2407-q8_0",
|
||||||
"apiBase": "https://ai.fabelous.app/v1/ollama/generic",
|
"apiBase": "https://ai.fabelous.app/v1/ollama/generic",
|
||||||
"headers": {"Authorization": "Token xxx"},
|
"headers": {"Authorization": "Token xxx"},
|
||||||
"system": "System Message",
|
"system": "System Message",
|
||||||
|
|
|
@ -22,6 +22,7 @@ class WebTextExtractor:
|
||||||
self.text = soup.get_text()
|
self.text = soup.get_text()
|
||||||
# Optional: Entferne überflüssige Leerzeichen und Zeilenumbrüche
|
# Optional: Entferne überflüssige Leerzeichen und Zeilenumbrüche
|
||||||
self.text = ' '.join(self.text.split())
|
self.text = ' '.join(self.text.split())
|
||||||
|
self.text = self.resize_article(self.text)
|
||||||
else:
|
else:
|
||||||
raise Exception("Kein Inhalt zum Parsen. Bitte zuerst fetch_content() aufrufen.")
|
raise Exception("Kein Inhalt zum Parsen. Bitte zuerst fetch_content() aufrufen.")
|
||||||
|
|
||||||
|
@ -31,6 +32,22 @@ class WebTextExtractor:
|
||||||
return self.text
|
return self.text
|
||||||
else:
|
else:
|
||||||
raise Exception("Kein Text extrahiert. Bitte zuerst extract_text() aufrufen.")
|
raise Exception("Kein Text extrahiert. Bitte zuerst extract_text() aufrufen.")
|
||||||
|
|
||||||
|
def resize_article(self, article, skip_words=30, tail_fraction=0.1):
    """Trim leading and trailing words from an article.

    The text is split on whitespace, the first ``skip_words`` words are
    dropped, and then the last ``tail_fraction`` (rounded down to a whole
    number of words) of what remains is dropped as well.

    Args:
        article: The text to trim.
        skip_words: Number of leading words to remove (default 30).
        tail_fraction: Fraction of the remaining words to remove from the
            end (default 0.1, i.e. the last 10%).

    Returns:
        The remaining words joined by single spaces; an empty string when
        nothing is left.
    """
    # Split into words and drop the leading ones in one step.
    words = article.split()[max(skip_words, 0):]

    # Number of trailing words to remove, rounded down.
    num_to_remove = int(len(words) * tail_fraction)

    # Slice by an explicit keep-count instead of `words[:-num_to_remove]`:
    # with num_to_remove == 0 that expression is `words[:0]`, which would
    # throw away every word of a short article instead of keeping them all.
    keep = max(len(words) - num_to_remove, 0)

    return ' '.join(words[:keep])
|
||||||
|
|
Loading…
Reference in New Issue