aboutsummaryrefslogtreecommitdiff
path: root/src/main.rs
diff options
context:
space:
mode:
author Alex Auvolat <alex@adnab.me> 2023-10-11 12:15:22 +0200
committer Alex Auvolat <alex@adnab.me> 2023-10-11 12:15:22 +0200
commit d9078f7674c637dd8498ece74ffe6cb7d1e179b9 (patch)
tree 3d8458dfbc65a6ad82805a69fe1b24324039d772 /src/main.rs
parent c7f229c9015f9feb64a616efb9c6f48cde535b92 (diff)
download datagengo-d9078f7674c637dd8498ece74ffe6cb7d1e179b9.tar.gz
datagengo-d9078f7674c637dd8498ece74ffe6cb7d1e179b9.zip
add even more example sentences
Diffstat (limited to 'src/main.rs')
-rw-r--r-- src/main.rs 25
1 files changed, 14 insertions, 11 deletions
diff --git a/src/main.rs b/src/main.rs
index 7ecc195..66f1f51 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -152,7 +152,7 @@ fn main() {
.and_then(|x| Ok(serde_json::from_slice(&x)?))
.expect("failed to decode batches.json");
- add_examples(&mut batches, &ex);
+ add_extra_examples(&mut batches, &ex);
fs::write(
"data/batches.json",
@@ -984,7 +984,7 @@ fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) {
}
}
-fn add_examples(all_batches: &mut [Batch], examples: &[Example]) {
+fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) {
let mut chars = Charset::default();
for (i, batch) in all_batches.iter_mut().enumerate() {
chars = chars.union(&batch.chars);
@@ -1007,17 +1007,20 @@ fn add_examples(all_batches: &mut [Batch], examples: &[Example]) {
batch.extra_examples.clear();
let mut in_batch = Charset::from_iter(batch.examples.iter().map(|x| x.chars.chars().iter().copied()).flatten());
let mut in_extra = Charset::default();
- while batch.extra_examples.len() < 20 {
+ while batch.extra_examples.len() < 40 {
let best = candidates.iter().enumerate()
- .max_by_key(|(_, ex)| (ex.chars.diff(&in_batch).len(), ex.chars.diff(&in_extra).len()));
- if let Some((i, ex)) = best {
- batch.extra_examples.push(ex.clone());
- in_batch = in_batch.union(&ex.chars);
- in_extra = in_extra.union(&ex.chars);
- candidates.remove(i);
- } else {
- break;
+ .map(|(i, ex)| (i, ex, ex.chars.diff(&in_batch).len(), ex.chars.diff(&in_extra).len()))
+ .max_by_key(|(_, _, w1, w2)| (*w1, *w2));
+ if let Some((i, ex, w1, w2)) = best {
+ if w1 > 0 || w2 > 0 || batch.extra_examples.len() < 20 {
+ batch.extra_examples.push(ex.clone());
+ in_batch = in_batch.union(&ex.chars);
+ in_extra = in_extra.union(&ex.chars);
+ candidates.remove(i);
+ continue;
+ }
}
+ break;
}
batch.extra_examples.sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes()));