diff options
author | Alex Auvolat <alex@adnab.me> | 2023-10-11 12:15:22 +0200 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-10-11 12:15:22 +0200 |
commit | d9078f7674c637dd8498ece74ffe6cb7d1e179b9 (patch) | |
tree | 3d8458dfbc65a6ad82805a69fe1b24324039d772 /src | |
parent | c7f229c9015f9feb64a616efb9c6f48cde535b92 (diff) | |
download | datagengo-d9078f7674c637dd8498ece74ffe6cb7d1e179b9.tar.gz datagengo-d9078f7674c637dd8498ece74ffe6cb7d1e179b9.zip |
add even more example sentences
Diffstat (limited to 'src')
-rw-r--r-- | src/main.rs | 25 |
1 file changed, 14 insertions, 11 deletions
```diff
diff --git a/src/main.rs b/src/main.rs
index 7ecc195..66f1f51 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -152,7 +152,7 @@ fn main() {
                 .and_then(|x| Ok(serde_json::from_slice(&x)?))
                 .expect("failed to decode batches.json");
 
-            add_examples(&mut batches, &ex);
+            add_extra_examples(&mut batches, &ex);
 
             fs::write(
                 "data/batches.json",
@@ -984,7 +984,7 @@ fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) {
     }
 }
 
-fn add_examples(all_batches: &mut [Batch], examples: &[Example]) {
+fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) {
     let mut chars = Charset::default();
     for (i, batch) in all_batches.iter_mut().enumerate() {
         chars = chars.union(&batch.chars);
@@ -1007,17 +1007,20 @@ fn add_examples(all_batches: &mut [Batch], examples: &[Example]) {
         batch.extra_examples.clear();
         let mut in_batch = Charset::from_iter(batch.examples.iter().map(|x| x.chars.chars().iter().copied()).flatten());
         let mut in_extra = Charset::default();
-        while batch.extra_examples.len() < 20 {
+        while batch.extra_examples.len() < 40 {
             let best = candidates.iter().enumerate()
-                .max_by_key(|(_, ex)| (ex.chars.diff(&in_batch).len(), ex.chars.diff(&in_extra).len()));
-            if let Some((i, ex)) = best {
-                batch.extra_examples.push(ex.clone());
-                in_batch = in_batch.union(&ex.chars);
-                in_extra = in_extra.union(&ex.chars);
-                candidates.remove(i);
-            } else {
-                break;
+                .map(|(i, ex)| (i, ex, ex.chars.diff(&in_batch).len(), ex.chars.diff(&in_extra).len()))
+                .max_by_key(|(_, _, w1, w2)| (*w1, *w2));
+            if let Some((i, ex, w1, w2)) = best {
+                if w1 > 0 || w2 > 0 || batch.extra_examples.len() < 20 {
+                    batch.extra_examples.push(ex.clone());
+                    in_batch = in_batch.union(&ex.chars);
+                    in_extra = in_extra.union(&ex.chars);
+                    candidates.remove(i);
+                    continue;
+                }
             }
+            break;
         }
         batch.extra_examples.sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes()));
 
```