Try again to add more diversity to extra examples

author: Alex Auvolat <alex@adnab.me> 2023-11-15 19:17:27 +0100
committer: Alex Auvolat <alex@adnab.me> 2023-11-15 19:22:34 +0100
commit: 4b20865335f14d086e5733fd2bd62acf22434e04 (patch)
tree: cb5c36097662243c4cb7c24debc1fc8813181036 /src
parent: 56654ce07fb1319a21e6d5f2bdcc0d024c4db398 (diff)
download: datagengo-4b20865335f14d086e5733fd2bd62acf22434e04.tar.gz
datagengo-4b20865335f14d086e5733fd2bd62acf22434e04.zip
1 files changed, 29 insertions, 21 deletions
diff --git a/src/main.rs b/src/main.rs
index f78bf81..9209fd7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -992,6 +992,13 @@ fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) {
         println!("---- BATCH #{:03} ----", i);
         chars = chars.union(&batch.chars);
 
+        // Weight each character of this batch's own examples heavily (+5) in char_seen_count
+        for ex in batch.examples.iter() {
+            for c in ex.chars.iter() {
+                *char_seen_count.entry(c).or_default() += 5;
+            }
+        }
+
         // Take only examples that:
         // - contain kanji of this batch
         // - only contain kanji of this or previous batches
@@ -1042,53 +1049,54 @@ fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) {
                         .filter(|x| char_seen_count.get(x).copied().unwrap_or(0) <= cnt),
                 )
             };
-            let c1 = fc(1);
-            let c2 = fc(2);
-            let c4 = fc(4);
-            let c7 = fc(7);
+            let c1 = fc(5);
+            let c2 = fc(6);
+            let c3 = fc(7);
+            let c4 = fc(10);
 
             let best = candidates
                 .iter()
                 .enumerate()
+                .filter(|(_, ex)| {
+                    batch.extra_examples.len() < 20 || ex.chars.diff(&in_batch_extra).len() > 0
+                })
                 .map(|(i, ex)| {
                     let weight = (
                         ex.chars.inter_len(&c0),
                         ex.chars.inter_len(&c1),
                         ex.chars.inter_len(&c2),
+                        ex.chars.inter_len(&c3),
                         ex.chars.inter_len(&c4),
-                        ex.chars.inter_len(&c7),
                         ex.chars.diff(&in_batch_extra).len(),
                     );
                     (i, ex, weight)
                 })
                 .max_by_key(|(_, _, w)| *w);
             if let Some((i, ex, w)) = best {
-                if ex.chars.diff(&in_batch_extra).len() > 0 || batch.extra_examples.len() < 20 {
-                    println!("{:?}\t{} - {}", w, ex.ja, ex.en);
+                println!("{:?}\t{} - {}", w, ex.ja, ex.en);
 
-                    batch.extra_examples.push(ex.clone());
-                    in_batch = in_batch.union(&ex.chars);
-                    in_batch_extra = in_batch_extra.union(&ex.chars);
+                batch.extra_examples.push(ex.clone());
+                in_batch = in_batch.union(&ex.chars);
+                in_batch_extra = in_batch_extra.union(&ex.chars);
 
-                    for c in ex.chars.iter() {
-                        *char_seen_count.entry(c).or_default() += 1;
-                        if batch.chars.contains(c) {
-                            *batch_char_seen_count.entry(c).or_default() += 1;
-                        }
+                for c in ex.chars.iter() {
+                    *char_seen_count.entry(c).or_default() += 1;
+                    if batch.chars.contains(c) {
+                        *batch_char_seen_count.entry(c).or_default() += 1;
                     }
-
-                    candidates.remove(i);
-                    continue;
                 }
+
+                candidates.remove(i);
+            } else {
+                break;
             }
-            break;
         }
 
         batch
             .extra_examples
             .sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes()));
 
-        for i in 1..10 {
+        for i in 1..20 {
             println!(
                 "Seen   {:02}: {}",
                 i,
@@ -1097,7 +1105,7 @@ fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) {
         }
         println!(
             "Seen more: {}",
-            char_seen_count.iter().filter(|(_, v)| **v >= 10).count()
+            char_seen_count.iter().filter(|(_, v)| **v >= 20).count()
         );
     }
 }
author	Alex Auvolat <alex@adnab.me>	2023-11-15 19:17:27 +0100
committer	Alex Auvolat <alex@adnab.me>	2023-11-15 19:22:34 +0100
commit	4b20865335f14d086e5733fd2bd62acf22434e04 (patch)
tree	cb5c36097662243c4cb7c24debc1fc8813181036 /src
parent	56654ce07fb1319a21e6d5f2bdcc0d024c4db398 (diff)
download	datagengo-4b20865335f14d086e5733fd2bd62acf22434e04.tar.gz datagengo-4b20865335f14d086e5733fd2bd62acf22434e04.zip