refactor; add simplify to remove ONE redundant sentencebatches-v1

author: Alex Auvolat <alex@adnab.me> 2023-09-24 17:26:00 +0200
committer: Alex Auvolat <alex@adnab.me> 2023-09-24 17:26:00 +0200
commit: a9de8d71a0fecbd483cbdc084ba109cb96250aaa (patch)
tree: cfd7c1551e56bf210fd119d54583e6e83f19bb0e
parent: 22bd3b7e415c5a33e8101bc38acaaab22de5b316 (diff)
download: datagengo-a9de8d71a0fecbd483cbdc084ba109cb96250aaa.tar.gz
datagengo-a9de8d71a0fecbd483cbdc084ba109cb96250aaa.zip
3 files changed, 439 insertions, 293 deletions
diff --git a/data/batches.json b/data/batches.json
index b8a3f59..cbcb15e 100644
--- a/data/batches.json
+++ b/data/batches.json
@@ -664,16 +664,6 @@
         ]
       },
       {
-        "ja": "オーストラリアは南アメリカより小さい。",
-        "en": "Australia is smaller than South America.",
-        "expl": "濠太剌利{オーストラリア} は 南 亜米利加{アメリカ} より 小さい",
-        "id": "ID=65144_227791",
-        "chars": [
-          "南",
-          "小"
-        ]
-      },
-      {
         "ja": "月は東から上る。",
         "en": "The moon rises in the east.",
         "expl": "月(つき)[01] は 東(ひがし) から 登る[02]{上る}",
diff --git a/src/charset.rs b/src/charset.rs
new file mode 100644
index 0000000..a79eddb
--- /dev/null
+++ b/src/charset.rs
@@ -0,0 +1,190 @@
+use std::cmp::Ordering;
+
+use serde::{Deserialize, Serialize};
+
+#[derive(Debug, Eq, PartialEq, Hash, Clone, Serialize, Deserialize, Default)]
+pub struct Charset(pub Vec<char>);
+
+impl Charset {
+    pub fn new<S: AsRef<str>>(s: S) -> Self {
+        let mut chars = s.as_ref().chars().collect::<Vec<_>>();
+        chars.sort();
+        chars.dedup();
+        Self(chars)
+    }
+    pub fn from_iter<S: IntoIterator<Item = char>>(s: S) -> Self {
+        let mut chars = s.into_iter().collect::<Vec<_>>();
+        chars.sort();
+        chars.dedup();
+        Self(chars)
+    }
+    pub fn intersects(&self, other: &Self) -> bool {
+        let mut it1 = self.0.iter().peekable();
+        let mut it2 = other.0.iter().peekable();
+        while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
+            match c1.cmp(c2) {
+                Ordering::Equal => return true,
+                Ordering::Less => it1.next(),
+                Ordering::Greater => it2.next(),
+            };
+        }
+        false
+    }
+    pub fn inter_len(&self, other: &Self) -> usize {
+        if other.len() > 20 * self.len() {
+            // alternative path
+            return self
+                .0
+                .iter()
+                .filter(|x| other.0.binary_search(x).is_ok())
+                .count();
+        }
+        let mut it1 = self.0.iter().peekable();
+        let mut it2 = other.0.iter().peekable();
+        let mut ret = 0;
+        while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
+            match c1.cmp(c2) {
+                Ordering::Equal => {
+                    ret += 1;
+                    it1.next();
+                    it2.next();
+                }
+                Ordering::Less => {
+                    it1.next();
+                }
+                Ordering::Greater => {
+                    it2.next();
+                }
+            };
+        }
+        ret
+    }
+    pub fn inter(&self, other: &Self) -> Charset {
+        let mut it1 = self.0.iter().peekable();
+        let mut it2 = other.0.iter().peekable();
+        let mut ret = Vec::new();
+        while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
+            match c1.cmp(c2) {
+                Ordering::Equal => {
+                    ret.push(**c1);
+                    it1.next();
+                    it2.next();
+                }
+                Ordering::Less => {
+                    it1.next();
+                }
+                Ordering::Greater => {
+                    it2.next();
+                }
+            };
+        }
+        Self(ret)
+    }
+    pub fn union(&self, other: &Self) -> Charset {
+        let mut it1 = self.0.iter().peekable();
+        let mut it2 = other.0.iter().peekable();
+        let mut ret = Vec::new();
+        while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
+            match c1.cmp(c2) {
+                Ordering::Equal => {
+                    ret.push(**c1);
+                    it1.next();
+                    it2.next();
+                }
+                Ordering::Less => {
+                    ret.push(**c1);
+                    it1.next();
+                }
+                Ordering::Greater => {
+                    ret.push(**c2);
+                    it2.next();
+                }
+            };
+        }
+        while let Some(c) = it1.peek() {
+            ret.push(**c);
+            it1.next();
+        }
+        while let Some(c) = it2.peek() {
+            ret.push(**c);
+            it2.next();
+        }
+        Self(ret)
+    }
+    pub fn diff(&self, other: &Self) -> Charset {
+        let mut it1 = self.0.iter().peekable();
+        let mut it2 = other.0.iter().peekable();
+        let mut ret = Vec::new();
+        while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
+            match c1.cmp(c2) {
+                Ordering::Equal => {
+                    it1.next();
+                    it2.next();
+                }
+                Ordering::Less => {
+                    ret.push(**c1);
+                    it1.next();
+                }
+                Ordering::Greater => {
+                    it2.next();
+                }
+            };
+        }
+        while let Some(c) = it1.peek() {
+            ret.push(**c);
+            it1.next();
+        }
+        Self(ret)
+    }
+    pub fn len(&self) -> usize {
+        self.0.len()
+    }
+    pub fn is_empty(&self) -> bool {
+        self.0.is_empty()
+    }
+    pub fn chars(&self) -> &[char] {
+        &self.0
+    }
+    pub fn contains(&self, c: char) -> bool {
+        self.0.binary_search(&c).is_ok()
+    }
+    pub fn to_string(&self) -> String {
+        self.0.iter().collect::<String>()
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_charset() {
+        let c1 = Charset::new("azerty");
+        let c2 = Charset::new("uiopqsqdf");
+        let c3 = Charset::new("hello, world");
+
+        assert!(!c1.intersects(&c2));
+        assert!(c1.intersects(&c3));
+        assert!(c2.intersects(&c3));
+
+        assert_eq!(c1.inter_len(&c2), 0);
+        assert_eq!(c1.inter_len(&c3), 2);
+        assert_eq!(c2.inter_len(&c3), 2);
+
+        assert_eq!(c1.inter(&c2), Charset::new(""));
+        assert_eq!(c1.inter(&c3), Charset::new("er"));
+        assert_eq!(c2.inter(&c3), Charset::new("od"));
+
+        assert_eq!(c1.union(&c2), Charset::new("azertyuiopqsdf"));
+        assert_eq!(c1.union(&c3), Charset::new("azertyhello, world"));
+        assert_eq!(c2.union(&c3), Charset::new("uiopqsdfhello, world"));
+
+        assert_eq!(c1.diff(&c2), Charset::new("azerty"));
+        assert_eq!(c1.diff(&c3), Charset::new("azty"));
+        assert_eq!(c2.diff(&c3), Charset::new("uipqsf"));
+
+        assert_eq!(c2.diff(&c1), Charset::new("uiopqsdf"));
+        assert_eq!(c3.diff(&c1), Charset::new("hllo, wold"));
+        assert_eq!(c3.diff(&c2), Charset::new("hell, wrl"));
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 837b7ff..a283891 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,13 +1,15 @@
 use std::collections::HashMap;
 use std::fs;
-use std::cmp::Ordering;
 use std::io::{self, BufRead, Write};
 
 use anyhow::{anyhow, Result};
 use rayon::prelude::*;
-use serde::{Serialize, Deserialize};
+use serde::{Deserialize, Serialize};
 use structopt::StructOpt;
 
+mod charset;
+use charset::Charset;
+
 #[derive(Debug, StructOpt)]
 #[structopt(name = "datagengo", about = "Japanese example practice maker")]
 struct Opt {
@@ -17,11 +19,12 @@ struct Opt {
 
 #[derive(Debug, StructOpt)]
 enum Cmd {
-	ParseKanjidic,
+    ParseKanjidic,
     New {
         #[structopt(default_value = "10")]
         count: usize,
     },
+    Simplify,
     Format,
 }
 
@@ -35,13 +38,17 @@ fn main() {
                 println!("{}: {}", level, chars.to_string());
             }
         }
-        Cmd::New{ count } => {
+        Cmd::New { count } => {
             let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
-            let all_kanji = Charset::new(kanji_levels.iter()
-                .map(|(_, x)| x.to_string())
-                .collect::<Vec<_>>()
-                .join(""));
-            let kanji_levels = kanji_levels.into_iter()
+            let all_kanji = Charset::new(
+                kanji_levels
+                    .iter()
+                    .map(|(_, x)| x.to_string())
+                    .collect::<Vec<_>>()
+                    .join(""),
+            );
+            let kanji_levels = kanji_levels
+                .into_iter()
                 .map(|(l, x)| (l, Charset::new(x)))
                 .collect::<Vec<_>>();
             let mut ex = read_examples(&all_kanji).expect("read_examples");
@@ -58,18 +65,40 @@ fn main() {
                 }
                 batches.push(batch);
             }
-            fs::write("data/batches.json", serde_json::to_string_pretty(&batches).expect("serialize").as_bytes()).expect("save");
+            fs::write(
+                "data/batches.json",
+                serde_json::to_string_pretty(&batches)
+                    .expect("serialize")
+                    .as_bytes(),
+            )
+            .expect("save");
+        }
+        Cmd::Simplify => {
+            let mut batches: Vec<Batch> = fs::read("data/batches.json")
+                .map_err(anyhow::Error::from)
+                .and_then(|x| Ok(serde_json::from_slice(&x)?))
+                .unwrap_or_default();
+            for batch in batches.iter_mut() {
+                simplify_batch(batch);
+            }
+            fs::write(
+                "data/batches.json",
+                serde_json::to_string_pretty(&batches)
+                    .expect("serialize")
+                    .as_bytes(),
+            )
+            .expect("save");
         }
         Cmd::Format => {
-            let jmdict = fs::read_to_string("data/JMdict_e.xml")
-                .expect("read_jmdict");
+            let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
             let jmdict = roxmltree::Document::parse_with_options(
                 &jmdict,
                 roxmltree::ParsingOptions {
                     allow_dtd: true,
                     ..Default::default()
-                })
-                .expect("parse_jmdict");
+                },
+            )
+            .expect("parse_jmdict");
             let jmdict_idx = index_jmdict(&jmdict);
 
             let batches = fs::read("data/batches.json")
@@ -80,7 +109,8 @@ fn main() {
             fs::create_dir_all("public").expect("mkdir public");
             fs::copy("static/style.css", "public/style.css").expect("copy style.css");
 
-            batches.iter()
+            batches
+                .iter()
                 .enumerate()
                 .for_each(|x| format_batch(&jmdict_idx, batches.len(), x));
 
@@ -90,9 +120,17 @@ fn main() {
     }
 }
 
+// =====================================================================
+//                      PARSING DATA FILES
+// =====================================================================
+
 type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>;
 fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
-    let dict = dict.root().children().find(|x| x.has_tag_name("JMdict")).unwrap();
+    let dict = dict
+        .root()
+        .children()
+        .find(|x| x.has_tag_name("JMdict"))
+        .unwrap();
 
     let mut ret: DictIndex<'a> = HashMap::new();
     for x in dict.children().filter(|x| x.has_tag_name("entry")) {
@@ -138,17 +176,19 @@ fn parse_kanjidic() -> Result<Vec<(String, Charset)>> {
             }
         }
         if let Some(lit) = literal {
-            levels.entry((jlpt, grade)).or_insert(String::new()).extend(lit.chars());
+            levels
+                .entry((jlpt, grade))
+                .or_insert(String::new())
+                .extend(lit.chars());
         }
     }
 
     let mut levels = levels.into_iter().collect::<Vec<_>>();
     levels.sort_by_key(|((j, g), _)| match (j, g) {
-            (Some(j), Some(g)) => (10-*j)*20+*g,
-            (Some(j), None) => (10-*j)*20+15,
-            (None, Some(g)) => 1000+*g,
-            (None, None) => 1015,
-
+        (Some(j), Some(g)) => (10 - *j) * 20 + *g,
+        (Some(j), None) => (10 - *j) * 20 + 15,
+        (None, Some(g)) => 1000 + *g,
+        (None, None) => 1015,
     });
 
     let mut ret = Vec::new();
@@ -212,38 +252,87 @@ fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
             }
         }
         if i % 10000 == 0 {
-            eprintln!("read examples: {}/300", i/1000);
+            eprintln!("read examples: {}/300", i / 1000);
         }
     }
 
     Ok(ret)
 }
 
-fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &[Example]) -> Result<Batch> {
-    let prev_chars = Charset::from_iter(previous.iter()
-        .map(|x| x.chars.chars().iter().copied())
-        .flatten());
+// =====================================================================
+//                      BATCH STRUCTURES AND GENERATION
+// =====================================================================
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+struct Example {
+    ja: String,
+    en: String,
+    expl: String,
+    id: Option<String>,
+    chars: Charset,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+struct Batch {
+    level: String,
+    chars: Charset,
+    chars_p1: Charset,
+    chars_p2: Charset,
+    chars_bad: Charset,
+    examples: Vec<Example>,
+}
 
-    let (mut target_i, target_level, mut target_chars) = kanji_levels.iter().enumerate()
+fn gen_batch(
+    previous: &[Batch],
+    kanji_levels: &[(String, Charset)],
+    examples: &[Example],
+) -> Result<Batch> {
+    let prev_chars = Charset::from_iter(
+        previous
+            .iter()
+            .map(|x| x.chars.chars().iter().copied())
+            .flatten(),
+    );
+
+    let (mut target_i, target_level, mut target_chars) = kanji_levels
+        .iter()
+        .enumerate()
         .map(|(i, (l, c))| (i, l, c.diff(&prev_chars)))
         .find(|(_, _, c)| !c.is_empty())
         .ok_or(anyhow!("no more batches to make!"))?;
 
-    let chars_p1 = previous.iter().rev().next()
-        .map(|b| b.chars.clone()).unwrap_or(Charset::default());
-
-    let chars_p2 = previous.iter().rev().skip(1).next()
-        .map(|b| b.chars.clone()).unwrap_or(Charset::default());
+    let chars_p1 = previous
+        .iter()
+        .rev()
+        .next()
+        .map(|b| b.chars.clone())
+        .unwrap_or(Charset::default());
+
+    let chars_p2 = previous
+        .iter()
+        .rev()
+        .skip(1)
+        .next()
+        .map(|b| b.chars.clone())
+        .unwrap_or(Charset::default());
 
     let mut chars_late = Charset::default();
 
-    let mut chars_bad = Charset::from_iter(kanji_levels.iter().skip(target_i+1)
-        .map(|(_, c)| c.chars().iter().copied())
-        .flatten());
-    let mut chars_bad_avoid = Charset::from_iter(kanji_levels.iter().skip(target_i+1)
-        .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10"))
-        .map(|(_, c)| c.chars().iter().copied())
-        .flatten());
+    let mut chars_bad = Charset::from_iter(
+        kanji_levels
+            .iter()
+            .skip(target_i + 1)
+            .map(|(_, c)| c.chars().iter().copied())
+            .flatten(),
+    );
+    let mut chars_bad_avoid = Charset::from_iter(
+        kanji_levels
+            .iter()
+            .skip(target_i + 1)
+            .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10"))
+            .map(|(_, c)| c.chars().iter().copied())
+            .flatten(),
+    );
 
     let mut batch = Batch {
         level: target_level.to_string(),
@@ -295,31 +384,39 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &
                         - 40i32 * ex.chars.inter_len(&chars_bad) as i32
         };
         */
-        let cost = |ex: &Example, ex_tgt_inter: usize| { (
-            - (ex.chars.inter_len(&chars_bad_avoid) as i32),
-            ex_tgt_inter,
-            ex.chars.inter_len(&chars_late),
-            2*ex.chars.inter_len(&chars_p1) + ex.chars.inter_len(&chars_p2),
-            - (ex.ja.chars().count() as i32),
-        ) };
-        let cand_1 = examples.par_iter()
+        let cost = |ex: &Example, ex_tgt_inter: usize| {
+            (
+                -(ex.chars.inter_len(&chars_bad_avoid) as i32),
+                ex_tgt_inter,
+                ex.chars.inter_len(&chars_late),
+                2 * ex.chars.inter_len(&chars_p1) + ex.chars.inter_len(&chars_p2),
+                -(ex.ja.chars().count() as i32),
+            )
+        };
+        let cand_1 = examples
+            .par_iter()
             .map(|ex| (ex, ex.chars.inter_len(&target_chars)))
-            .filter(|(_, ex_tgt_inter)| (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len)
+            .filter(|(_, ex_tgt_inter)| {
+                (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len
+            })
             .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter));
-        let cand  = cand_1.or_else(|| {
-            examples.par_iter()
-            .map(|ex| (ex, ex.chars.inter_len(&target_chars)))
-            .filter(|(_, ex_tgt_inter)| *ex_tgt_inter > 0)
-            .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter))
+        let cand = cand_1.or_else(|| {
+            examples
+                .par_iter()
+                .map(|ex| (ex, ex.chars.inter_len(&target_chars)))
+                .filter(|(_, ex_tgt_inter)| *ex_tgt_inter > 0)
+                .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter))
         });
         if let Some((ex, _)) = cand {
-            eprintln!("* add {} (rep: {}, p1: {}, p2: {}, bad: {})    {}",
+            eprintln!(
+                "* add {} (rep: {}, p1: {}, p2: {}, bad: {})    {}",
                 ex.chars.inter(&target_chars).to_string(),
                 ex.chars.inter(&batch.chars).to_string(),
                 ex.chars.inter(&chars_p1).to_string(),
                 ex.chars.inter(&chars_p2).to_string(),
                 ex.chars.inter(&chars_bad).to_string(),
-                ex.ja);
+                ex.ja
+            );
             batch.chars = batch.chars.union(&ex.chars.inter(&target_chars));
             target_chars = target_chars.diff(&ex.chars);
             chars_late = chars_late.diff(&ex.chars);
@@ -328,7 +425,10 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &
             stalled = false;
         } else {
             if stalled {
-                eprintln!("could not find suitable sentence, stopping batch now (need {})", need);
+                eprintln!(
+                    "could not find suitable sentence, stopping batch now (need {})",
+                    need
+                );
                 break;
             }
             stalled = true;
@@ -342,20 +442,63 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &
     Ok(batch)
 }
 
+fn simplify_batch(batch: &mut Batch) {
+    let mut char_cnt = HashMap::<char, usize>::new();
+    for ex in batch.examples.iter() {
+        for ch in batch.chars.inter(&ex.chars).chars() {
+            *char_cnt.entry(*ch).or_default() += 1;
+        }
+    }
+
+    loop {
+        let i_opt = batch.examples.iter().position(|ex| {
+            batch
+                .chars
+                .inter(&ex.chars)
+                .chars()
+                .iter()
+                .all(|x| char_cnt[x] >= 2)
+        });
+        if let Some(i) = i_opt {
+            println!(
+                "Removing {} [{}]",
+                batch.examples[i].ja,
+                batch.examples[i].chars.to_string()
+            );
+            batch.examples.remove(i);
+        } else {
+            break;
+        }
+    }
+}
+
+// =====================================================================
+//                          FORMATTING TO HTML
+// =====================================================================
+
 fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) {
     format_batch_aux(dict_idx, count, i, batch).expect("format batch");
 }
 
-fn format_batch_aux<'a>(dict_idx: &DictIndex<'a>, count: usize, i: usize, batch: &Batch) -> Result<()> {
+fn format_batch_aux<'a>(
+    dict_idx: &DictIndex<'a>,
+    count: usize,
+    i: usize,
+    batch: &Batch,
+) -> Result<()> {
     let mut f = io::BufWriter::new(fs::File::create(format!("public/{:03}.html", i))?);
-    write!(f, r#"<!DOCTYPE html>
+    write!(
+        f,
+        r#"<!DOCTYPE html>
         <html>
             <head>
                 <meta charset=\"UTF-8\" />
                 <title>Batch #{:03}</title>
                 <link rel="stylesheet" type="text/css" href="style.css" />
             </head>
-            <body>"#, i)?;
+            <body>"#,
+        i
+    )?;
 
     writeln!(f, r#"<p><a href="index.html">index</a>"#)?;
     for j in 0..count {
@@ -368,7 +511,11 @@ fn format_batch_aux<'a>(dict_idx: &DictIndex<'a>, count: usize, i: usize, batch:
     writeln!(f, r#"</p>"#)?;
     writeln!(f, "<p>Level: {}</p>", batch.level)?;
 
-    writeln!(f, r#"<p class="ja chars">【{}】</p>"#, batch.chars.to_string())?;
+    writeln!(
+        f,
+        r#"<p class="ja chars">【{}】</p>"#,
+        batch.chars.to_string()
+    )?;
 
     for ex in batch.examples.iter() {
         writeln!(f, "<hr />")?;
@@ -415,7 +562,11 @@ fn format_batch_aux<'a>(dict_idx: &DictIndex<'a>, count: usize, i: usize, batch:
         }
         writeln!(f, r#"<p class="chars">"#)?;
         for c in ex.chars.inter(&batch.chars).chars().iter() {
-            writeln!(f, r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#, c, c)?;
+            writeln!(
+                f,
+                r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#,
+                c, c
+            )?;
         }
         writeln!(f, r#"</p>"#)?;
         for be in expl_all {
@@ -436,7 +587,8 @@ fn expl_clean_word(w: &str) -> (&str, Option<&str>) {
             ret = s;
         }
     }
-    let p = w.split_once('(')
+    let p = w
+        .split_once('(')
         .and_then(|(_, r)| r.split_once(')'))
         .map(|(p, _)| p);
     (ret, p)
@@ -467,45 +619,63 @@ fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) -
 
 fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> {
     let mut f = io::BufWriter::new(fs::File::create("public/index.html")?);
-    write!(f, r#"<!DOCTYPE html>
+    write!(
+        f,
+        r#"<!DOCTYPE html>
         <html>
             <head>
                 <meta charset=\"UTF-8\" />
                 <title>List of batches</title>
                 <link rel="stylesheet" type="text/css" href="style.css" />
             </head>
-            <body>"#)?;
+            <body>"#
+    )?;
 
     writeln!(f, "<table>")?;
     writeln!(f, "<tr><th>Num</th><th>Level</th><th>Chars</th><th>Examples</th><th>Review</th><th>Ignore</th></tr>")?;
     for (i, batch) in batches.iter().enumerate() {
-        writeln!(f, r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td>{}</td><td>&nbsp;&nbsp;{}</td><td>{}</td><td>{}</td></tr>"#,
-                 i, i,
-                 batch.level,
-                 batch.chars.to_string(),
-                 batch.examples.len(),
-                 batch.chars_p1.union(&batch.chars_p2).to_string(),
-                 batch.chars_bad.to_string())?;
+        writeln!(
+            f,
+            r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td>{}</td><td>&nbsp;&nbsp;{}</td><td>{}</td><td>{}</td></tr>"#,
+            i,
+            i,
+            batch.level,
+            batch.chars.to_string(),
+            batch.examples.len(),
+            batch.chars_p1.union(&batch.chars_p2).to_string(),
+            batch.chars_bad.to_string()
+        )?;
     }
     writeln!(f, r#"</table>"#)?;
 
     writeln!(f, "<hr />")?;
 
-    let all_chars = Charset::from_iter(batches.iter()
-        .map(|x| x.chars.chars().iter().copied())
-        .flatten());
+    let all_chars = Charset::from_iter(
+        batches
+            .iter()
+            .map(|x| x.chars.chars().iter().copied())
+            .flatten(),
+    );
     writeln!(f, "<table>")?;
-    writeln!(f, "<tr><th>Level</th><th>Count</th><th>Chars</th><th>Missing chars</th></tr>")?;
+    writeln!(
+        f,
+        "<tr><th>Level</th><th>Count</th><th>Chars</th><th>Missing chars</th></tr>"
+    )?;
     for (lvl, chars) in kanji_levels.iter() {
         if lvl == "N0+" || lvl == "N0-9" || lvl.ends_with("-10") {
             continue;
         }
         let chars = Charset::new(chars);
         let missing = chars.diff(&all_chars);
-        writeln!(f, "<tr><td>{}</td><td>{}</td><td>{}</td><td>{} ({})</td></tr>",
+        writeln!(
+            f,
+            "<tr><td>{}</td><td>{}</td><td>{}</td><td>{} ({})</td></tr>",
             lvl,
-            chars.len(), chars.to_string(),
-            missing.to_string(), missing.len())?;
+            chars.len(),
+            chars.to_string(),
+            missing.to_string(),
+            missing.len()
+        )?;
     }
     writeln!(f, "</table>")?;
 
@@ -513,207 +683,3 @@ fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<
     f.flush()?;
     Ok(())
 }
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-struct Example {
-    ja: String,
-    en: String,
-    expl: String,
-    id: Option<String>,
-    chars: Charset,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-struct Batch {
-    level: String,
-    chars: Charset,
-    chars_p1: Charset,
-    chars_p2: Charset,
-    chars_bad: Charset,
-    examples: Vec<Example>,
-}
-
-#[derive(Debug, Eq, PartialEq, Hash, Clone, Serialize, Deserialize, Default)]
-struct Charset(Vec<char>);
-
-impl Charset {
-    fn new<S: AsRef<str>>(s: S) -> Self {
-        let mut chars = s.as_ref().chars().collect::<Vec<_>>();
-        chars.sort();
-        chars.dedup();
-        Self(chars)
-    }
-    fn from_iter<S: IntoIterator<Item=char>>(s: S) -> Self {
-        let mut chars = s.into_iter().collect::<Vec<_>>();
-        chars.sort();
-        chars.dedup();
-        Self(chars)
-    }
-    fn intersects(&self, other: &Self) -> bool {
-        let mut it1 = self.0.iter().peekable();
-        let mut it2 = other.0.iter().peekable();
-        while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
-            match c1.cmp(c2) {
-                Ordering::Equal => return true,
-                Ordering::Less => it1.next(),
-                Ordering::Greater => it2.next(),
-            };
-        }
-        false
-    }
-    fn inter_len(&self, other: &Self) -> usize {
-        if other.len() > 20*self.len() {
-            // alternative path
-            return self.0.iter()
-                .filter(|x| other.0.binary_search(x).is_ok())
-                .count();
-        }
-        let mut it1 = self.0.iter().peekable();
-        let mut it2 = other.0.iter().peekable();
-        let mut ret = 0;
-        while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
-            match c1.cmp(c2) {
-                Ordering::Equal => {
-                    ret += 1;
-                    it1.next();
-                    it2.next();
-                }
-                Ordering::Less => {
-                    it1.next();
-                }
-                Ordering::Greater => {
-                    it2.next();
-                }
-            };
-        }
-        ret
-    }
-    fn inter(&self, other: &Self) -> Charset {
-        let mut it1 = self.0.iter().peekable();
-        let mut it2 = other.0.iter().peekable();
-        let mut ret = Vec::new();
-        while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
-            match c1.cmp(c2) {
-                Ordering::Equal => {
-                    ret.push(**c1);
-                    it1.next();
-                    it2.next();
-                }
-                Ordering::Less => {
-                    it1.next();
-                }
-                Ordering::Greater => {
-                    it2.next();
-                }
-            };
-        }
-        Self(ret)
-    }
-    fn union(&self, other: &Self) -> Charset {
-        let mut it1 = self.0.iter().peekable();
-        let mut it2 = other.0.iter().peekable();
-        let mut ret = Vec::new();
-        while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
-            match c1.cmp(c2) {
-                Ordering::Equal => {
-                    ret.push(**c1);
-                    it1.next();
-                    it2.next();
-                }
-                Ordering::Less => {
-                    ret.push(**c1);
-                    it1.next();
-                }
-                Ordering::Greater => {
-                    ret.push(**c2);
-                    it2.next();
-                }
-            };
-        }
-        while let Some(c) = it1.peek() {
-            ret.push(**c);
-            it1.next();
-        }
-        while let Some(c) = it2.peek() {
-            ret.push(**c);
-            it2.next();
-        }
-        Self(ret)
-    }
-    fn diff(&self, other: &Self) -> Charset {
-        let mut it1 = self.0.iter().peekable();
-        let mut it2 = other.0.iter().peekable();
-        let mut ret = Vec::new();
-        while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
-            match c1.cmp(c2) {
-                Ordering::Equal => {
-                    it1.next();
-                    it2.next();
-                }
-                Ordering::Less => {
-                    ret.push(**c1);
-                    it1.next();
-                }
-                Ordering::Greater => {
-                    it2.next();
-                }
-            };
-        }
-        while let Some(c) = it1.peek() {
-            ret.push(**c);
-            it1.next();
-        }
-        Self(ret)
-    }
-    fn len(&self) -> usize {
-        self.0.len()
-    }
-    fn is_empty(&self) -> bool {
-        self.0.is_empty()
-    }
-    fn chars(&self) -> &[char] {
-        &self.0
-    }
-    fn contains(&self, c: char) -> bool {
-        self.0.binary_search(&c).is_ok()
-    }
-    fn to_string(&self) -> String {
-        self.0.iter().collect::<String>()
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use super::*;
-
-    #[test]
-    fn test_charset() {
-        let c1 = Charset::new("azerty");
-        let c2 = Charset::new("uiopqsqdf");
-        let c3 = Charset::new("hello, world");
-
-        assert!(!c1.intersects(&c2));
-        assert!(c1.intersects(&c3));
-        assert!(c2.intersects(&c3));
-
-        assert_eq!(c1.inter_len(&c2), 0);
-        assert_eq!(c1.inter_len(&c3), 2);
-        assert_eq!(c2.inter_len(&c3), 2);
-
-        assert_eq!(c1.inter(&c2), Charset::new(""));
-        assert_eq!(c1.inter(&c3), Charset::new("er"));
-        assert_eq!(c2.inter(&c3), Charset::new("od"));
-
-        assert_eq!(c1.union(&c2), Charset::new("azertyuiopqsdf"));
-        assert_eq!(c1.union(&c3), Charset::new("azertyhello, world"));
-        assert_eq!(c2.union(&c3), Charset::new("uiopqsdfhello, world"));
-
-        assert_eq!(c1.diff(&c2), Charset::new("azerty"));
-        assert_eq!(c1.diff(&c3), Charset::new("azty"));
-        assert_eq!(c2.diff(&c3), Charset::new("uipqsf"));
-
-        assert_eq!(c2.diff(&c1), Charset::new("uiopqsdf"));
-        assert_eq!(c3.diff(&c1), Charset::new("hllo, wold"));
-        assert_eq!(c3.diff(&c2), Charset::new("hell, wrl"));
-    }
-}
author	Alex Auvolat <alex@adnab.me>	2023-09-24 17:26:00 +0200
committer	Alex Auvolat <alex@adnab.me>	2023-09-24 17:26:00 +0200
commit	a9de8d71a0fecbd483cbdc084ba109cb96250aaa (patch)
tree	cfd7c1551e56bf210fd119d54583e6e83f19bb0e
parent	22bd3b7e415c5a33e8101bc38acaaab22de5b316 (diff)
download	datagengo-a9de8d71a0fecbd483cbdc084ba109cb96250aaa.tar.gz datagengo-a9de8d71a0fecbd483cbdc084ba109cb96250aaa.zip