From 47ef4bd071e7fe7666796ee101979e6a478962cd Mon Sep 17 00:00:00 2001 From: Falko Habel Date: Sun, 29 Dec 2024 20:52:17 +0100 Subject: [PATCH] initial commit with explanation and release window --- .gitignore | 3 +++ README.md | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- example.py | 5 ++++ 3 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 example.py diff --git a/.gitignore b/.gitignore index 5d381cc..8560245 100644 --- a/.gitignore +++ b/.gitignore @@ -153,6 +153,9 @@ dmypy.json # Cython debug symbols cython_debug/ +# Models +fabelous-albert-uncased + # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore diff --git a/README.md b/README.md index 7f38aa4..8ccbcab 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,76 @@ -# Fabelous-albert-uncased +# Fabelous-Albert-Uncased + +**Fabelous-Albert-Uncased** is a bilingual ALBERT model pretrained on German, English, and code. This uncased model is a Masked Language Model (MLM) and can be fine-tuned for a variety of tasks, including but not limited to: + +- Named Entity Recognition (NER) +- Binary Classification +- Text Completion + +The model is designed for efficiency and is currently compatible only with the `FastTokenizer`, which must be used when loading it. + + +## Features + +### 1. **Bilingual Support** + - Trained on English and German text, enabling seamless bilingual tasks. + +### 2. **Code Understanding** + - Incorporates code in its training data, making it suitable for programming-related NLP tasks. + +### 3. **Uncased** + - Treats text as case-insensitive, which simplifies preprocessing and can generalize better for certain tasks. + +### 4. **Fine-Tuning Ready** + - Easily fine-tune for tasks such as text classification, named entity recognition, and more. 
+ +--- + +## Downloading the Model + +You can download the `fabelous-albert-uncased` model using the following link: + +[Download Fabelous-Albert-Uncased Model](https://gitea.fabelous.app/Fabel/Fabelous-albert-uncased/releases/download/latest/fabelous-albert-uncased.zip) + +### Installation Instructions + +1. Click the link above to download a ZIP file containing the model files. +2. Extract the ZIP file into your desired directory. +3. Load the model in your Python project using the `transformers` library, passing the path of the extracted directory as the `model` argument. + +## Usage Example + +Below is a sample code snippet to demonstrate how to use the `fabelous-albert-uncased` model for a masked language modeling task: + +```python +from transformers import pipeline + +# Load the pipeline with the Fabelous-Albert-Uncased model +unmasker = pipeline('fill-mask', model='fabelous-albert-uncased') + +# Perform masked language modeling +output = unmasker("Hello I'm a [MASK] model.") +print(output) +``` + + + +## Future Enhancements: New Model Announcement + +We are thrilled to announce that a new version of the model is currently under development! The upcoming model will: + +- **Quadruple the Training Size**: With four times more data, expect significantly improved performance across diverse tasks. +- **Cased Version**: In addition to the uncased version, a cased model will be introduced, preserving capitalization for more nuanced language understanding. +- **Extended Language Support**: Support for multiple additional languages beyond English and German. +- **Slow- and FastTokenizer Support**: Support for both tokenizer versions. + +Stay tuned for updates as we prepare to release this enhanced model in the near future. + + +## License + +This model is released under the [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/). 
+ +## Feedback and Support + +If you encounter any issues or have questions, feel free to reach out through the project's [Gitea Issues page](https://gitea.fabelous.app/Fabel/Fabelous-albert-uncased/issues) or contact our support team at support@fabelous.app. -This is a bilingual ALBERT model that has been pretrained on German, English, and code. It is a Masked Language Model (MLM) and can be fine-tuned for various tasks, such as Named Entity Recognition (NER) or binary classification. The model is uncased and is exclusively compatible with the FastTokenizer. \ No newline at end of file diff --git a/example.py b/example.py new file mode 100644 index 0000000..175ff85 --- /dev/null +++ b/example.py @@ -0,0 +1,5 @@ +from transformers import pipeline + + +unmasker = pipeline('fill-mask', model='fabelous-albert-uncased') +print(unmasker("Hello I'm a [MASK] model.")) \ No newline at end of file