1 files changed, 0 insertions, 177 deletions
diff --git a/src/main.rs b/src/main.rs
index 252740b..1ad5e77 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -37,8 +37,6 @@ enum Cmd {
     Simplify,
     Cleanup,
     AddVocab,
-    AddExamples,
-    AddFurigana,
     Format,
     Server,
 }
@@ -116,53 +114,6 @@ async fn main() {
 
             save_batches(batches).expect("save_batches");
         }
-        Cmd::AddExamples => {
-            let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
-            let all_kanji = Charset::new(
-                kanji_levels
-                    .iter()
-                    .map(|(_, x)| x.to_string())
-                    .collect::<Vec<_>>()
-                    .join(""),
-            );
-
-            let mut ex = read_examples(&all_kanji).expect("read_examples");
-            ex.retain(|e| (5..=25).contains(&e.ja.chars().count()));
-
-            let mut batches = read_batches().expect("read_batches");
-
-            add_extra_examples(&mut batches, &ex);
-
-            save_batches(batches).expect("save_batches");
-        }
-        Cmd::AddFurigana => {
-            let mut batches = read_batches().expect("read_batches");
-
-            let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
-            let jmdict = roxmltree::Document::parse_with_options(
-                &jmdict,
-                roxmltree::ParsingOptions {
-                    allow_dtd: true,
-                    ..Default::default()
-                },
-            )
-            .expect("parse_jmdict");
-            let jmdict_idx = index_jmdict(&jmdict);
-
-            let overrides = read_furigana_overrides().expect("read_furigana_overrides");
-
-            for batch in batches.iter_mut() {
-                for ex in batch
-                    .examples
-                    .iter_mut()
-                    .chain(batch.extra_examples.iter_mut())
-                {
-                    ex.gen_furigana(&jmdict_idx, &overrides);
-                }
-            }
-
-            save_batches(batches).expect("save_batches");
-        }
         Cmd::Format => {
             let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
             let jmdict = roxmltree::Document::parse_with_options(
@@ -229,8 +180,6 @@ pub struct Batch {
     pub examples: Vec<Example>,
     #[serde(default)]
     pub extra_vocab: Vec<JlptVocab>,
-    #[serde(default)]
-    pub extra_examples: Vec<Example>,
 }
 
 fn gen_batches(
@@ -688,129 +637,3 @@ fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) {
         batch.extra_vocab = vocab;
     }
 }
-
-fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) {
-    let mut chars = Charset::default();
-    let mut char_seen_count: HashMap<char, usize> = HashMap::new();
-
-    for (i, batch) in all_batches.iter_mut().enumerate() {
-        println!("---- BATCH #{:03} ----", i);
-        chars = chars.union(&batch.chars);
-
-        // Count characters in batch in char_seen_count as a lot
-        for ex in batch.examples.iter() {
-            for c in ex.chars.iter() {
-                *char_seen_count.entry(c).or_default() += 5;
-            }
-        }
-
-        // Take only examples that:
-        // - contain kanji of this batch
-        // - only contain kanji of this or previous batches
-        // - are not in the batch's main example sentences
-        let candidates = examples
-            .iter()
-            .filter(|x| x.chars.inter_len(&batch.chars) > 0)
-            .filter(|x| x.chars.diff(&chars).len() == 0)
-            .filter(|x| batch.examples.iter().all(|y| y.ja != x.ja));
-
-        // Take only one candidate sentence for each possible set of represented kanji
-        let mut cand_by_chars = HashMap::new();
-        for c in candidates {
-            cand_by_chars.insert(c.chars.to_string(), c.clone());
-        }
-        let mut candidates = cand_by_chars
-            .into_iter()
-            .map(|(_, ex)| ex)
-            .collect::<Vec<_>>();
-
-        // Sorte candidates in a deterministic random order
-        candidates.sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes()));
-
-        batch.extra_examples.clear();
-
-        let mut batch_char_seen_count: HashMap<char, usize> = HashMap::new();
-        let mut in_batch =
-            Charset::from_iter(batch.examples.iter().map(|x| x.chars.iter()).flatten());
-        let mut in_batch_extra = Charset::default();
-
-        while batch.extra_examples.len() < 40 {
-            let batch_min_seen = batch
-                .chars
-                .iter()
-                .map(|x| batch_char_seen_count.get(&x).copied().unwrap_or(0))
-                .min()
-                .unwrap();
-            // Target chars: chars of the batch that have the less examples
-            let c0 =
-                Charset::from_iter(batch.chars.iter().filter(|x| {
-                    batch_char_seen_count.get(x).copied().unwrap_or(0) == batch_min_seen
-                }));
-            // Target chars: chars that have been seen less than cnt times
-            let fc = |cnt| {
-                Charset::from_iter(
-                    chars
-                        .iter()
-                        .filter(|x| char_seen_count.get(x).copied().unwrap_or(0) <= cnt),
-                )
-            };
-            let c1 = fc(5);
-            let c2 = fc(6);
-            let c3 = fc(7);
-            let c4 = fc(10);
-
-            let best = candidates
-                .iter()
-                .enumerate()
-                .filter(|(_, ex)| {
-                    batch.extra_examples.len() < 20 || ex.chars.diff(&in_batch_extra).len() > 0
-                })
-                .map(|(i, ex)| {
-                    let weight = (
-                        ex.chars.inter_len(&c0),
-                        ex.chars.inter_len(&c1),
-                        ex.chars.inter_len(&c2),
-                        ex.chars.inter_len(&c3),
-                        ex.chars.inter_len(&c4),
-                        ex.chars.diff(&in_batch_extra).len(),
-                    );
-                    (i, ex, weight)
-                })
-                .max_by_key(|(_, _, w)| *w);
-            if let Some((i, ex, w)) = best {
-                println!("{:?}\t{} - {}", w, ex.ja, ex.en);
-
-                batch.extra_examples.push(ex.clone());
-                in_batch = in_batch.union(&ex.chars);
-                in_batch_extra = in_batch_extra.union(&ex.chars);
-
-                for c in ex.chars.iter() {
-                    *char_seen_count.entry(c).or_default() += 1;
-                    if batch.chars.contains(c) {
-                        *batch_char_seen_count.entry(c).or_default() += 1;
-                    }
-                }
-
-                candidates.remove(i);
-            } else {
-                break;
-            }
-        }
-
-        batch
-            .extra_examples
-            .sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes()));
-
-        for i in 1..20 {
-            println!(
-                "Seen   {:02}: {}",
-                i,
-                char_seen_count.iter().filter(|(_, v)| **v == i).count()
-            );
-        }
-        println!(
-            "Seen more: {}",
-            char_seen_count.iter().filter(|(_, v)| **v >= 20).count()
-        );
-    }
-}