about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Alex Auvolat <alex@adnab.me> 2023-11-15 19:17:27 +0100
committer: Alex Auvolat <alex@adnab.me> 2023-11-15 19:22:34 +0100
commit: 4b20865335f14d086e5733fd2bd62acf22434e04 (patch)
tree: cb5c36097662243c4cb7c24debc1fc8813181036 /src
parent: 56654ce07fb1319a21e6d5f2bdcc0d024c4db398 (diff)
download: datagengo-4b20865335f14d086e5733fd2bd62acf22434e04.tar.gz
download: datagengo-4b20865335f14d086e5733fd2bd62acf22434e04.zip
Try again to add more diversity to extra examples
Diffstat (limited to 'src')
-rw-r--r-- src/main.rs 50
1 files changed, 29 insertions, 21 deletions
diff --git a/src/main.rs b/src/main.rs
index f78bf81..9209fd7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -992,6 +992,13 @@ fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) {
println!("---- BATCH #{:03} ----", i);
chars = chars.union(&batch.chars);
+ // Count characters in batch in char_seen_count as a lot
+ for ex in batch.examples.iter() {
+ for c in ex.chars.iter() {
+ *char_seen_count.entry(c).or_default() += 5;
+ }
+ }
+
// Take only examples that:
// - contain kanji of this batch
// - only contain kanji of this or previous batches
@@ -1042,53 +1049,54 @@ fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) {
.filter(|x| char_seen_count.get(x).copied().unwrap_or(0) <= cnt),
)
};
- let c1 = fc(1);
- let c2 = fc(2);
- let c4 = fc(4);
- let c7 = fc(7);
+ let c1 = fc(5);
+ let c2 = fc(6);
+ let c3 = fc(7);
+ let c4 = fc(10);
let best = candidates
.iter()
.enumerate()
+ .filter(|(_, ex)| {
+ batch.extra_examples.len() < 20 || ex.chars.diff(&in_batch_extra).len() > 0
+ })
.map(|(i, ex)| {
let weight = (
ex.chars.inter_len(&c0),
ex.chars.inter_len(&c1),
ex.chars.inter_len(&c2),
+ ex.chars.inter_len(&c3),
ex.chars.inter_len(&c4),
- ex.chars.inter_len(&c7),
ex.chars.diff(&in_batch_extra).len(),
);
(i, ex, weight)
})
.max_by_key(|(_, _, w)| *w);
if let Some((i, ex, w)) = best {
- if ex.chars.diff(&in_batch_extra).len() > 0 || batch.extra_examples.len() < 20 {
- println!("{:?}\t{} - {}", w, ex.ja, ex.en);
+ println!("{:?}\t{} - {}", w, ex.ja, ex.en);
- batch.extra_examples.push(ex.clone());
- in_batch = in_batch.union(&ex.chars);
- in_batch_extra = in_batch_extra.union(&ex.chars);
+ batch.extra_examples.push(ex.clone());
+ in_batch = in_batch.union(&ex.chars);
+ in_batch_extra = in_batch_extra.union(&ex.chars);
- for c in ex.chars.iter() {
- *char_seen_count.entry(c).or_default() += 1;
- if batch.chars.contains(c) {
- *batch_char_seen_count.entry(c).or_default() += 1;
- }
+ for c in ex.chars.iter() {
+ *char_seen_count.entry(c).or_default() += 1;
+ if batch.chars.contains(c) {
+ *batch_char_seen_count.entry(c).or_default() += 1;
}
-
- candidates.remove(i);
- continue;
}
+
+ candidates.remove(i);
+ } else {
+ break;
}
- break;
}
batch
.extra_examples
.sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes()));
- for i in 1..10 {
+ for i in 1..20 {
println!(
"Seen {:02}: {}",
i,
@@ -1097,7 +1105,7 @@ fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) {
}
println!(
"Seen more: {}",
- char_seen_count.iter().filter(|(_, v)| **v >= 10).count()
+ char_seen_count.iter().filter(|(_, v)| **v >= 20).count()
);
}
}