Comparing changes

base repository: aajanki/spacy-fi
base: v0.11.0
head repository: aajanki/spacy-fi
compare: master
6 changes: 3 additions & 3 deletions .circleci/config.yml
@@ -1,15 +1,15 @@
version: 2.1

orbs:
python: circleci/python@1.4
python: circleci/python@2.2.0

workflows:
main:
jobs:
- test:
matrix:
parameters:
pyversion: ["3.7", "3.8", "3.9", "3.10"]
pyversion: ["3.9", "3.10", "3.11", "3.12"]

jobs:
test:
@@ -29,6 +29,6 @@ jobs:
pkg-manager: pip
- run:
name: Run tests
command: python -m pytest tests --junit-xml=test-results/report.xml
command: python -m pytest tests/unit --junit-xml=test-results/report.xml
- store_test_results:
path: test-results
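
For reference, the updated CI test step can be reproduced locally; a minimal sketch, assuming pytest and the project requirements are installed in the active environment:

```sh
# Same invocation as the "Run tests" step above
python -m pytest tests/unit --junit-xml=test-results/report.xml
```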
1 change: 1 addition & 0 deletions .gitignore
@@ -6,5 +6,6 @@ models/
training/
packages/
metrics/
notes/
__pycache__/
project.lock
30 changes: 30 additions & 0 deletions Changelog
@@ -1,5 +1,35 @@
Finnish language model for spaCy

Version 0.15.1, 2024-11-14

* Better cleaning of training data
* Redacted person names in email addresses in the word frequency data

Version 0.15.0, 2024-10-19

* Compatible with spaCy 3.8
* Improved spam filter on the MC4 corpus

Version 0.14.0, 2023-10-14

* Compatible with spaCy 3.7
* The noun chunker includes chains of flats and nmods: e.g. "maaliskuun 7. päivänä"
* The parser doesn't try to detect nsubj:outer, dislocated and goeswith
dependencies anymore. There's not enough training data to learn those.
* Tokenize "-kampanja" as ["-", "kampanja"]
* Tokenize "maa-" as ["maa", "-"]
* Tokenize "/kk" as ["/", "kk"]
* Other tokenizer improvements

Version 0.13.0, 2023-07-21

* Compatible with spaCy 3.6

Version 0.12.0, 2023-02-01

* Compatible with spaCy 3.5
* Fixed word occurrence probabilities (they had been broken in several previous versions)

Version 0.11.0, 2022-07-23

* Ported to spaCy 3.4
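
The 0.14.0 tokenizer changes listed above are easy to spot-check; a minimal sketch, assuming the packaged model fi_experimental_web_md is installed:

```python
# Check the tokenization examples from the 0.14.0 changelog entry.
import spacy

nlp = spacy.load("fi_experimental_web_md")
for text in ["-kampanja", "maa-", "/kk"]:
    print(text, "->", [token.text for token in nlp(text)])
# Expected per the changelog:
#   -kampanja -> ['-', 'kampanja']
#   maa-      -> ['maa', '-']
#   /kk       -> ['/', 'kk']
```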
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,4 +1,4 @@
Copyright 2019-2022 Antti Ajanki <antti.ajanki@iki.fi>
Copyright 2019-2023 Antti Ajanki <antti.ajanki@iki.fi>

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
33 changes: 26 additions & 7 deletions README.md
@@ -2,7 +2,7 @@

# Experimental Finnish language model for spaCy

Finnish language model for [spaCy](https://spacy.io/). The model does POS tagging, dependency parsing, word vectors, noun phrase extraction, token frequencies, morphological features, lemmatization and named entity recognition (NER). The lemmatization is based on [Voikko](https://voikko.puimula.org/).
Finnish language model for [spaCy](https://spacy.io/). The model does POS tagging, dependency parsing, word vectors, noun phrase extraction, word occurrence probability estimates, morphological features, lemmatization and named entity recognition (NER). The lemmatization is based on [Voikko](https://voikko.puimula.org/).

The main differences between this model and the [Finnish language model](https://spacy.io/models/fi) in the spaCy core:
* This model includes a different lemmatizer implementation than the one in spaCy core. My model's [lemmatization accuracy](https://github.com/aajanki/finnish-pos-accuracy#results) is considerably better, but the execution speed is slightly lower.
@@ -14,6 +14,8 @@ Need the highest possible accuracy especially for lemmatization? Install this mo

I'm planning to continue to experiment with new ideas on this repository and push the useful features to the spaCy core after testing them here.

The training data consists of web pages collected during 2014–2020, before the rise of AI slop. The data does contain some ordinary spam and poorly machine-translated pages, but I have made some effort to filter out the most conspicuous spam pages.

## Install the Finnish language model

First, install [the libvoikko native library and the Finnish morphology data files](https://voikko.puimula.org/python.html).
@@ -27,6 +29,10 @@ Compatibility with spaCy versions:

| spacy-fi version | Compatible with spaCy versions |
|------------------|--------------------------------|
| 0.15.x | 3.8.x |
| 0.14.0 | 3.7.x |
| 0.13.0 | 3.6.x |
| 0.12.0 | 3.5.x |
| 0.11.0 | 3.4.x |
| 0.10.0 | 3.3.x |
| 0.9.0 | >= 3.2.1 and < 3.3.0 |
@@ -48,6 +54,8 @@ for t in doc:
print(f'{t.lemma_}\t{t.pos_}')
```

The [dependency, part-of-speech and named entity labels](docs/tags.md) are documented on a separate page.

## Updating the model

### Setting up a development environment
@@ -61,13 +69,13 @@ sudo apt install libvoikko1 voikko-fi

python3 -m venv .venv
source .venv/bin/activate
pip install wheel
pip install -r requirements.txt
```

### Training the model

```sh
spacy project assets
spacy project run train-pipeline
```

@@ -85,10 +93,21 @@ Pretrain tok2vec weights:
spacy project run pretrain
```

Plot the pretraining loss:
```sh
python tools/plot_pretrain_loss.py training/pretrain/log.jsonl
```

### Testing

Unit tests:
```
python -m pytest tests/unit
```

Functional tests for a trained model:
```
python -m pytest tests
python -m pytest tests/functional
```

Importing the trained model directly from the file system without
@@ -107,15 +126,15 @@ for t in doc:

### Packaging and publishing

See [packaging.md](packaging.md).
See [packaging.md](docs/packaging.md).

## License

[MIT license](LICENSE)

### License for the training data
### Licenses for the training data

The data sets downloaded by the tools/download_data.sh script are licensed as follows:
The datasets used in training are licensed as follows:
* [UD_Finnish-TDT](https://github.com/UniversalDependencies/UD_Finnish-TDT): Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)
* [TurkuONE](https://github.com/TurkuNLP/turku-one): Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)
* [MC4](https://huggingface.co/datasets/mc4): [ODC-BY](https://opendatacommons.org/licenses/by/1-0/) and [Common Crawl terms of use](https://commoncrawl.org/terms-of-use/)
* [MC4](https://huggingface.co/datasets/allenai/c4): [ODC-BY](https://opendatacommons.org/licenses/by/1-0/) and [Common Crawl terms of use](https://commoncrawl.org/terms-of-use/)
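
The capabilities listed in the README intro are all exposed through spaCy's standard APIs; a minimal sketch, assuming the fi_experimental_web_md package is installed (the example sentence is illustrative):

```python
# POS tags, lemmas, noun chunks, NER and a word occurrence
# probability estimate from one document.
import spacy

nlp = spacy.load("fi_experimental_web_md")
doc = nlp("Helsinki on Suomen pääkaupunki.")
print([(t.text, t.lemma_, t.pos_) for t in doc])     # lemmas and POS tags
print([chunk.text for chunk in doc.noun_chunks])     # noun phrase extraction
print([(ent.text, ent.label_) for ent in doc.ents])  # named entities
print(doc[0].prob)  # log of the word occurrence probability estimate
```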
10 changes: 8 additions & 2 deletions configs/fi-ner.cfg
@@ -3,6 +3,7 @@ train = null
dev = null
vectors = null
init_tok2vec = null
vocab_lookups = null

[system]
gpu_allocator = null
@@ -17,6 +18,7 @@ before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
vectors = {"@vectors":"spacy.Vectors.v1"}

[components]

@@ -78,13 +80,14 @@ seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
patience = 2400
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
before_to_disk = null
annotating_components = []
before_update = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
@@ -126,10 +129,13 @@ ents_per_type = null
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.lookups]
@misc = "spacyfi.read_lookups_from_json.v1"
path = ${paths.vocab_lookups}

[initialize.tokenizer]
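
The new [initialize.lookups] block resolves lookup tables through a function registered in spaCy's misc registry. Below is a hypothetical reader in the same spirit — an illustration of the registry mechanism, not this repository's actual implementation — assuming the lookups are stored as one JSON file per table:

```python
# Hypothetical reader in the style of "spacyfi.read_lookups_from_json.v1".
# The real implementation lives in this repository and may differ.
import json
from pathlib import Path

import spacy
from spacy.lookups import Lookups

@spacy.registry.misc("spacyfi.read_lookups_from_json.v1")
def read_lookups_from_json(path: str) -> Lookups:
    lookups = Lookups()
    for table_file in sorted(Path(path).glob("*.json")):
        with open(table_file, encoding="utf-8") as f:
            # The table name is taken from the file name, e.g. lexeme_prob.json
            lookups.add_table(table_file.stem, json.load(f))
    return lookups
```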
15 changes: 11 additions & 4 deletions configs/fi.cfg
@@ -4,7 +4,7 @@ train = null
dev = null
vectors = null
init_tok2vec = null
vocab = "data/vocab/vocab-data.jsonl"
vocab_lookups = null
attribute_ruler_patterns = "fi/lookups/attribute_ruler_patterns.json"
lemmatizer_lookups = "fi/lookups/lemmatizer"

@@ -21,6 +21,7 @@ before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
vectors = {"@vectors":"spacy.Vectors.v1"}

[components]

@@ -36,6 +37,7 @@ overwrite_lemma = false
[components.morphologizer]
factory = "morphologizer"
extend = false
label_smoothing = 0.0
overwrite = true
scorer = {"@scorers":"spacy.morphologizer_scorer.v1"}

@@ -72,6 +74,7 @@ upstream = "*"

[components.tagger]
factory = "tagger"
label_smoothing = 0.0
neg_prefix = "!"
overwrite = false
scorer = {"@scorers":"spacy.tagger_scorer.v1"}
@@ -137,13 +140,14 @@ seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
patience = 2400
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
before_to_disk = null
annotating_components = []
before_update = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
@@ -222,8 +226,7 @@ learn_rate = 0.001
[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = ${paths.vocab}
lookups = null
vocab_data = null
before_init = null
after_init = null

@@ -241,4 +244,8 @@ path = ${paths.attribute_ruler_patterns}
@misc = "spacyfi.read_lookups_from_json.v1"
path = ${paths.lemmatizer_lookups}

[initialize.lookups]
@misc = "spacyfi.read_lookups_from_json.v1"
path = ${paths.vocab_lookups}

[initialize.tokenizer]
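
The paths declared as null in fi.cfg (train, dev, vocab_lookups, ...) are meant to be supplied at training time. A hypothetical invocation with illustrative paths — the repository's actual commands are wired through `spacy project run`:

```sh
spacy train configs/fi.cfg \
  --paths.train corpus/train.spacy \
  --paths.dev corpus/dev.spacy \
  --paths.vocab_lookups data/vocab-lookups
```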
49 changes: 49 additions & 0 deletions configs/merged.cfg
@@ -0,0 +1,49 @@
[paths]
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
seed = 0

[nlp]
lang = "fi"
pipeline = ["tok2vec","tagger","morphologizer","parser","attribute_ruler","lemmatizer","ner"]

[components]

[components.attribute_ruler]
source = "training/UD_Finnish-TDT/model-best/"

[components.lemmatizer]
source = "training/UD_Finnish-TDT/model-best/"

[components.morphologizer]
source = "training/UD_Finnish-TDT/model-best/"

[components.parser]
source = "training/UD_Finnish-TDT/model-best/"

[components.tagger]
source = "training/UD_Finnish-TDT/model-best/"

[components.tok2vec]
source = "training/UD_Finnish-TDT/model-best/"

[components.ner]
source = "training/turku-one/model-best/"

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
before_init = null
after_init = null

[initialize.components]

[initialize.lookups]
@misc = "spacyfi.read_lookups_from_json.v1"
path = ${paths.vocab_lookups}

[initialize.tokenizer]
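
merged.cfg builds the final pipeline by sourcing the tagger, parser, lemmatizer and related components from the UD_Finnish-TDT model and the ner component from the turku-one model. A hedged sketch of turning such a config into a model directory with spaCy's assemble command, with an illustrative lookups path; the project may instead drive this through `spacy project run`:

```sh
spacy assemble configs/merged.cfg training/merged \
  --paths.vocab_lookups data/vocab-lookups
```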
27 changes: 27 additions & 0 deletions docs/packaging.md
@@ -0,0 +1,27 @@
# Packaging

Remember to change the version in [fi/meta.json](../fi/meta.json)!

Update the Changelog.

```sh
tools/package_model.sh training/merged
```

Optionally, to override the default spaCy compatibility specification,
add a new spec as the second parameter:

```sh
tools/package_model.sh training/merged ">=3.0.0,<3.2.0"
```

## Publishing

```sh
git tag v0.5.0
git push --tags

twine upload --repository spacy-fi-experimental-web-md packages/fi_experimental_web_md-0.5.0/dist/*
```
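
The --repository flag refers to a named server entry in the local ~/.pypirc; a hypothetical configuration, shown for illustration only:

```ini
[distutils]
index-servers =
    spacy-fi-experimental-web-md

[spacy-fi-experimental-web-md]
repository = https://upload.pypi.org/legacy/
```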

Create a [new release](https://github.com/aajanki/spacy-fi/releases/new) on GitHub.