diff options
author | Alex Auvolat <alex@adnab.me> | 2023-09-24 17:26:00 +0200 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-09-24 17:26:00 +0200 |
commit | a9de8d71a0fecbd483cbdc084ba109cb96250aaa (patch) | |
tree | cfd7c1551e56bf210fd119d54583e6e83f19bb0e | |
parent | 22bd3b7e415c5a33e8101bc38acaaab22de5b316 (diff) | |
download | datagengo-a9de8d71a0fecbd483cbdc084ba109cb96250aaa.tar.gz datagengo-a9de8d71a0fecbd483cbdc084ba109cb96250aaa.zip |
refactor; add simplify to remove ONE redundant sentencebatches-v1
-rw-r--r-- | data/batches.json | 10 | ||||
-rw-r--r-- | src/charset.rs | 190 | ||||
-rw-r--r-- | src/main.rs | 532 |
3 files changed, 439 insertions, 293 deletions
diff --git a/data/batches.json b/data/batches.json index b8a3f59..cbcb15e 100644 --- a/data/batches.json +++ b/data/batches.json @@ -664,16 +664,6 @@ ] }, { - "ja": "オーストラリアは南アメリカより小さい。", - "en": "Australia is smaller than South America.", - "expl": "濠太剌利{オーストラリア} は 南 亜米利加{アメリカ} より 小さい", - "id": "ID=65144_227791", - "chars": [ - "南", - "小" - ] - }, - { "ja": "月は東から上る。", "en": "The moon rises in the east.", "expl": "月(つき)[01] は 東(ひがし) から 登る[02]{上る}", diff --git a/src/charset.rs b/src/charset.rs new file mode 100644 index 0000000..a79eddb --- /dev/null +++ b/src/charset.rs @@ -0,0 +1,190 @@ +use std::cmp::Ordering; + +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Eq, PartialEq, Hash, Clone, Serialize, Deserialize, Default)] +pub struct Charset(pub Vec<char>); + +impl Charset { + pub fn new<S: AsRef<str>>(s: S) -> Self { + let mut chars = s.as_ref().chars().collect::<Vec<_>>(); + chars.sort(); + chars.dedup(); + Self(chars) + } + pub fn from_iter<S: IntoIterator<Item = char>>(s: S) -> Self { + let mut chars = s.into_iter().collect::<Vec<_>>(); + chars.sort(); + chars.dedup(); + Self(chars) + } + pub fn intersects(&self, other: &Self) -> bool { + let mut it1 = self.0.iter().peekable(); + let mut it2 = other.0.iter().peekable(); + while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) { + match c1.cmp(c2) { + Ordering::Equal => return true, + Ordering::Less => it1.next(), + Ordering::Greater => it2.next(), + }; + } + false + } + pub fn inter_len(&self, other: &Self) -> usize { + if other.len() > 20 * self.len() { + // alternative path + return self + .0 + .iter() + .filter(|x| other.0.binary_search(x).is_ok()) + .count(); + } + let mut it1 = self.0.iter().peekable(); + let mut it2 = other.0.iter().peekable(); + let mut ret = 0; + while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) { + match c1.cmp(c2) { + Ordering::Equal => { + ret += 1; + it1.next(); + it2.next(); + } + Ordering::Less => { + it1.next(); + } + Ordering::Greater => { + it2.next(); + } + }; + } + ret + } + pub fn inter(&self, other: &Self) -> Charset { + let mut it1 = self.0.iter().peekable(); + let mut it2 = other.0.iter().peekable(); + let mut ret = Vec::new(); + while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) { + match c1.cmp(c2) { + Ordering::Equal => { + ret.push(**c1); + it1.next(); + it2.next(); + } + Ordering::Less => { + it1.next(); + } + Ordering::Greater => { + it2.next(); + } + }; + } + Self(ret) + } + pub fn union(&self, other: &Self) -> Charset { + let mut it1 = self.0.iter().peekable(); + let mut it2 = other.0.iter().peekable(); + let mut ret = Vec::new(); + while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) { + match c1.cmp(c2) { + Ordering::Equal => { + ret.push(**c1); + it1.next(); + it2.next(); + } + Ordering::Less => { + ret.push(**c1); + it1.next(); + } + Ordering::Greater => { + ret.push(**c2); + it2.next(); + } + }; + } + while let Some(c) = it1.peek() { + ret.push(**c); + it1.next(); + } + while let Some(c) = it2.peek() { + ret.push(**c); + it2.next(); + } + Self(ret) + } + pub fn diff(&self, other: &Self) -> Charset { + let mut it1 = self.0.iter().peekable(); + let mut it2 = other.0.iter().peekable(); + let mut ret = Vec::new(); + while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) { + match c1.cmp(c2) { + Ordering::Equal => { + it1.next(); + it2.next(); + } + Ordering::Less => { + ret.push(**c1); + it1.next(); + } + Ordering::Greater => { + it2.next(); + } + }; + } + while let Some(c) = it1.peek() { + ret.push(**c); + it1.next(); + } + Self(ret) + } + pub fn len(&self) -> usize { + self.0.len() + } + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + pub fn chars(&self) -> &[char] { + &self.0 + } + pub fn contains(&self, c: char) -> bool { + self.0.binary_search(&c).is_ok() + } + pub fn to_string(&self) -> String { + self.0.iter().collect::<String>() + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_charset() { + let c1 = Charset::new("azerty"); + let c2 = Charset::new("uiopqsqdf"); + let c3 = Charset::new("hello, world"); + + assert!(!c1.intersects(&c2)); + assert!(c1.intersects(&c3)); + assert!(c2.intersects(&c3)); + + assert_eq!(c1.inter_len(&c2), 0); + assert_eq!(c1.inter_len(&c3), 2); + assert_eq!(c2.inter_len(&c3), 2); + + assert_eq!(c1.inter(&c2), Charset::new("")); + assert_eq!(c1.inter(&c3), Charset::new("er")); + assert_eq!(c2.inter(&c3), Charset::new("od")); + + assert_eq!(c1.union(&c2), Charset::new("azertyuiopqsdf")); + assert_eq!(c1.union(&c3), Charset::new("azertyhello, world")); + assert_eq!(c2.union(&c3), Charset::new("uiopqsdfhello, world")); + + assert_eq!(c1.diff(&c2), Charset::new("azerty")); + assert_eq!(c1.diff(&c3), Charset::new("azty")); + assert_eq!(c2.diff(&c3), Charset::new("uipqsf")); + + assert_eq!(c2.diff(&c1), Charset::new("uiopqsdf")); + assert_eq!(c3.diff(&c1), Charset::new("hllo, wold")); + assert_eq!(c3.diff(&c2), Charset::new("hell, wrl")); + } +} diff --git a/src/main.rs b/src/main.rs index 837b7ff..a283891 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,13 +1,15 @@ use std::collections::HashMap; use std::fs; -use std::cmp::Ordering; use std::io::{self, BufRead, Write}; use anyhow::{anyhow, Result}; use rayon::prelude::*; -use serde::{Serialize, Deserialize}; +use serde::{Deserialize, Serialize}; use structopt::StructOpt; +mod charset; +use charset::Charset; + #[derive(Debug, StructOpt)] #[structopt(name = "datagengo", about = "Japanese example practice maker")] struct Opt { @@ -17,11 +19,12 @@ struct Opt { #[derive(Debug, StructOpt)] enum Cmd { - ParseKanjidic, + ParseKanjidic, New { #[structopt(default_value = "10")] count: usize, }, + Simplify, Format, } @@ -35,13 +38,17 @@ fn main() { println!("{}: {}", level, chars.to_string()); } } - Cmd::New{ count } => { + Cmd::New { count } => { let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); - let all_kanji = Charset::new(kanji_levels.iter() - .map(|(_, x)| x.to_string()) - .collect::<Vec<_>>() - .join("")); - let kanji_levels = kanji_levels.into_iter() + let all_kanji = Charset::new( + kanji_levels + .iter() + .map(|(_, x)| x.to_string()) + .collect::<Vec<_>>() + .join(""), + ); + let kanji_levels = kanji_levels + .into_iter() .map(|(l, x)| (l, Charset::new(x))) .collect::<Vec<_>>(); let mut ex = read_examples(&all_kanji).expect("read_examples"); @@ -58,18 +65,40 @@ fn main() { } batches.push(batch); } - fs::write("data/batches.json", serde_json::to_string_pretty(&batches).expect("serialize").as_bytes()).expect("save"); + fs::write( + "data/batches.json", + serde_json::to_string_pretty(&batches) + .expect("serialize") + .as_bytes(), + ) + .expect("save"); + } + Cmd::Simplify => { + let mut batches: Vec<Batch> = fs::read("data/batches.json") + .map_err(anyhow::Error::from) + .and_then(|x| Ok(serde_json::from_slice(&x)?)) + .unwrap_or_default(); + for batch in batches.iter_mut() { + simplify_batch(batch); + } + fs::write( + "data/batches.json", + serde_json::to_string_pretty(&batches) + .expect("serialize") + .as_bytes(), + ) + .expect("save"); } Cmd::Format => { - let jmdict = fs::read_to_string("data/JMdict_e.xml") - .expect("read_jmdict"); + let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict"); let jmdict = roxmltree::Document::parse_with_options( &jmdict, roxmltree::ParsingOptions { allow_dtd: true, ..Default::default() - }) - .expect("parse_jmdict"); + }, + ) + .expect("parse_jmdict"); let jmdict_idx = index_jmdict(&jmdict); let batches = fs::read("data/batches.json") @@ -80,7 +109,8 @@ fn main() { fs::create_dir_all("public").expect("mkdir public"); fs::copy("static/style.css", "public/style.css").expect("copy style.css"); - batches.iter() + batches + .iter() .enumerate() .for_each(|x| format_batch(&jmdict_idx, batches.len(), x)); @@ -90,9 +120,17 @@ fn main() { } } +// ===================================================================== +// PARSING DATA FILES +// ===================================================================== + type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>; fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> { - let dict = dict.root().children().find(|x| x.has_tag_name("JMdict")).unwrap(); + let dict = dict + .root() + .children() + .find(|x| x.has_tag_name("JMdict")) + .unwrap(); let mut ret: DictIndex<'a> = HashMap::new(); for x in dict.children().filter(|x| x.has_tag_name("entry")) { @@ -138,17 +176,19 @@ fn parse_kanjidic() -> Result<Vec<(String, Charset)>> { } } if let Some(lit) = literal { - levels.entry((jlpt, grade)).or_insert(String::new()).extend(lit.chars()); + levels + .entry((jlpt, grade)) + .or_insert(String::new()) + .extend(lit.chars()); } } let mut levels = levels.into_iter().collect::<Vec<_>>(); levels.sort_by_key(|((j, g), _)| match (j, g) { - (Some(j), Some(g)) => (10-*j)*20+*g, - (Some(j), None) => (10-*j)*20+15, - (None, Some(g)) => 1000+*g, - (None, None) => 1015, - + (Some(j), Some(g)) => (10 - *j) * 20 + *g, + (Some(j), None) => (10 - *j) * 20 + 15, + (None, Some(g)) => 1000 + *g, + (None, None) => 1015, }); let mut ret = Vec::new(); @@ -212,38 +252,87 @@ fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> { } } if i % 10000 == 0 { - eprintln!("read examples: {}/300", i/1000); + eprintln!("read examples: {}/300", i / 1000); } } Ok(ret) } -fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &[Example]) -> Result<Batch> { - let prev_chars = Charset::from_iter(previous.iter() - .map(|x| x.chars.chars().iter().copied()) - .flatten()); +// ===================================================================== +// BATCH STRUCTURES AND GENERATION +// ===================================================================== + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct Example { + ja: String, + en: String, + expl: String, + id: Option<String>, + chars: Charset, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct Batch { + level: String, + chars: Charset, + chars_p1: Charset, + chars_p2: Charset, + chars_bad: Charset, + examples: Vec<Example>, +} - let (mut target_i, target_level, mut target_chars) = kanji_levels.iter().enumerate() +fn gen_batch( + previous: &[Batch], + kanji_levels: &[(String, Charset)], + examples: &[Example], +) -> Result<Batch> { + let prev_chars = Charset::from_iter( + previous + .iter() + .map(|x| x.chars.chars().iter().copied()) + .flatten(), + ); + + let (mut target_i, target_level, mut target_chars) = kanji_levels + .iter() + .enumerate() .map(|(i, (l, c))| (i, l, c.diff(&prev_chars))) .find(|(_, _, c)| !c.is_empty()) .ok_or(anyhow!("no more batches to make!"))?; - let chars_p1 = previous.iter().rev().next() - .map(|b| b.chars.clone()).unwrap_or(Charset::default()); - - let chars_p2 = previous.iter().rev().skip(1).next() - .map(|b| b.chars.clone()).unwrap_or(Charset::default()); + let chars_p1 = previous + .iter() + .rev() + .next() + .map(|b| b.chars.clone()) + .unwrap_or(Charset::default()); + + let chars_p2 = previous + .iter() + .rev() + .skip(1) + .next() + .map(|b| b.chars.clone()) + .unwrap_or(Charset::default()); let mut chars_late = Charset::default(); - let mut chars_bad = Charset::from_iter(kanji_levels.iter().skip(target_i+1) - .map(|(_, c)| c.chars().iter().copied()) - .flatten()); - let mut chars_bad_avoid = Charset::from_iter(kanji_levels.iter().skip(target_i+1) - .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10")) - .map(|(_, c)| c.chars().iter().copied()) - .flatten()); + let mut chars_bad = Charset::from_iter( + kanji_levels + .iter() + .skip(target_i + 1) + .map(|(_, c)| c.chars().iter().copied()) + .flatten(), + ); + let mut chars_bad_avoid = Charset::from_iter( + kanji_levels + .iter() + .skip(target_i + 1) + .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10")) + .map(|(_, c)| c.chars().iter().copied()) + .flatten(), + ); let mut batch = Batch { level: target_level.to_string(), @@ -295,31 +384,39 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: & - 40i32 * ex.chars.inter_len(&chars_bad) as i32 }; */ - let cost = |ex: &Example, ex_tgt_inter: usize| { ( - - (ex.chars.inter_len(&chars_bad_avoid) as i32), - ex_tgt_inter, - ex.chars.inter_len(&chars_late), - 2*ex.chars.inter_len(&chars_p1) + ex.chars.inter_len(&chars_p2), - - (ex.ja.chars().count() as i32), - ) }; - let cand_1 = examples.par_iter() + let cost = |ex: &Example, ex_tgt_inter: usize| { + ( + -(ex.chars.inter_len(&chars_bad_avoid) as i32), + ex_tgt_inter, + ex.chars.inter_len(&chars_late), + 2 * ex.chars.inter_len(&chars_p1) + ex.chars.inter_len(&chars_p2), + -(ex.ja.chars().count() as i32), + ) + }; + let cand_1 = examples + .par_iter() .map(|ex| (ex, ex.chars.inter_len(&target_chars))) - .filter(|(_, ex_tgt_inter)| (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len) + .filter(|(_, ex_tgt_inter)| { + (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len + }) .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter)); - let cand = cand_1.or_else(|| { - examples.par_iter() - .map(|ex| (ex, ex.chars.inter_len(&target_chars))) - .filter(|(_, ex_tgt_inter)| *ex_tgt_inter > 0) - .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter)) + let cand = cand_1.or_else(|| { + examples + .par_iter() + .map(|ex| (ex, ex.chars.inter_len(&target_chars))) + .filter(|(_, ex_tgt_inter)| *ex_tgt_inter > 0) + .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter)) }); if let Some((ex, _)) = cand { - eprintln!("* add {} (rep: {}, p1: {}, p2: {}, bad: {}) {}", + eprintln!( + "* add {} (rep: {}, p1: {}, p2: {}, bad: {}) {}", ex.chars.inter(&target_chars).to_string(), ex.chars.inter(&batch.chars).to_string(), ex.chars.inter(&chars_p1).to_string(), ex.chars.inter(&chars_p2).to_string(), ex.chars.inter(&chars_bad).to_string(), - ex.ja); + ex.ja + ); batch.chars = batch.chars.union(&ex.chars.inter(&target_chars)); target_chars = target_chars.diff(&ex.chars); chars_late = chars_late.diff(&ex.chars); @@ -328,7 +425,10 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: & stalled = false; } else { if stalled { - eprintln!("could not find suitable sentence, stopping batch now (need {})", need); + eprintln!( + "could not find suitable sentence, stopping batch now (need {})", + need + ); break; } stalled = true; @@ -342,20 +442,63 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: & Ok(batch) } +fn simplify_batch(batch: &mut Batch) { + let mut char_cnt = HashMap::<char, usize>::new(); + for ex in batch.examples.iter() { + for ch in batch.chars.inter(&ex.chars).chars() { + *char_cnt.entry(*ch).or_default() += 1; + } + } + + loop { + let i_opt = batch.examples.iter().position(|ex| { + batch + .chars + .inter(&ex.chars) + .chars() + .iter() + .all(|x| char_cnt[x] >= 2) + }); + if let Some(i) = i_opt { + println!( + "Removing {} [{}]", + batch.examples[i].ja, + batch.examples[i].chars.to_string() + ); + batch.examples.remove(i); + } else { + break; + } + } +} + +// ===================================================================== +// FORMATTING TO HTML +// ===================================================================== + fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) { format_batch_aux(dict_idx, count, i, batch).expect("format batch"); } -fn format_batch_aux<'a>(dict_idx: &DictIndex<'a>, count: usize, i: usize, batch: &Batch) -> Result<()> { +fn format_batch_aux<'a>( + dict_idx: &DictIndex<'a>, + count: usize, + i: usize, + batch: &Batch, +) -> Result<()> { let mut f = io::BufWriter::new(fs::File::create(format!("public/{:03}.html", i))?); - write!(f, r#"<!DOCTYPE html> + write!( + f, + r#"<!DOCTYPE html> <html> <head> <meta charset=\"UTF-8\" /> <title>Batch #{:03}</title> <link rel="stylesheet" type="text/css" href="style.css" /> </head> - <body>"#, i)?; + <body>"#, + i + )?; writeln!(f, r#"<p><a href="index.html">index</a>"#)?; for j in 0..count { @@ -368,7 +511,11 @@ fn format_batch_aux<'a>(dict_idx: &DictIndex<'a>, count: usize, i: usize, batch: writeln!(f, r#"</p>"#)?; writeln!(f, "<p>Level: {}</p>", batch.level)?; - writeln!(f, r#"<p class="ja chars">【{}】</p>"#, batch.chars.to_string())?; + writeln!( + f, + r#"<p class="ja chars">【{}】</p>"#, + batch.chars.to_string() + )?; for ex in batch.examples.iter() { writeln!(f, "<hr />")?; @@ -415,7 +562,11 @@ fn format_batch_aux<'a>(dict_idx: &DictIndex<'a>, count: usize, i: usize, batch: } writeln!(f, r#"<p class="chars">"#)?; for c in ex.chars.inter(&batch.chars).chars().iter() { - writeln!(f, r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#, c, c)?; + writeln!( + f, + r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#, + c, c + )?; } writeln!(f, r#"</p>"#)?; for be in expl_all { @@ -436,7 +587,8 @@ fn expl_clean_word(w: &str) -> (&str, Option<&str>) { ret = s; } } - let p = w.split_once('(') + let p = w + .split_once('(') .and_then(|(_, r)| r.split_once(')')) .map(|(p, _)| p); (ret, p) @@ -467,45 +619,63 @@ fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) - fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> { let mut f = io::BufWriter::new(fs::File::create("public/index.html")?); - write!(f, r#"<!DOCTYPE html> + write!( + f, + r#"<!DOCTYPE html> <html> <head> <meta charset=\"UTF-8\" /> <title>List of batches</title> <link rel="stylesheet" type="text/css" href="style.css" /> </head> - <body>"#)?; + <body>"# + )?; writeln!(f, "<table>")?; writeln!(f, "<tr><th>Num</th><th>Level</th><th>Chars</th><th>Examples</th><th>Review</th><th>Ignore</th></tr>")?; for (i, batch) in batches.iter().enumerate() { - writeln!(f, r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td>{}</td><td> {}</td><td>{}</td><td>{}</td></tr>"#, - i, i, - batch.level, - batch.chars.to_string(), - batch.examples.len(), - batch.chars_p1.union(&batch.chars_p2).to_string(), - batch.chars_bad.to_string())?; + writeln!( + f, + r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td>{}</td><td> {}</td><td>{}</td><td>{}</td></tr>"#, + i, + i, + batch.level, + batch.chars.to_string(), + batch.examples.len(), + batch.chars_p1.union(&batch.chars_p2).to_string(), + batch.chars_bad.to_string() + )?; } writeln!(f, r#"</table>"#)?; writeln!(f, "<hr />")?; - let all_chars = Charset::from_iter(batches.iter() - .map(|x| x.chars.chars().iter().copied()) - .flatten()); + let all_chars = Charset::from_iter( + batches + .iter() + .map(|x| x.chars.chars().iter().copied()) + .flatten(), + ); writeln!(f, "<table>")?; - writeln!(f, "<tr><th>Level</th><th>Count</th><th>Chars</th><th>Missing chars</th></tr>")?; + writeln!( + f, + "<tr><th>Level</th><th>Count</th><th>Chars</th><th>Missing chars</th></tr>" + )?; for (lvl, chars) in kanji_levels.iter() { if lvl == "N0+" || lvl == "N0-9" || lvl.ends_with("-10") { continue; } let chars = Charset::new(chars); let missing = chars.diff(&all_chars); - writeln!(f, "<tr><td>{}</td><td>{}</td><td>{}</td><td>{} ({})</td></tr>", + writeln!( + f, + "<tr><td>{}</td><td>{}</td><td>{}</td><td>{} ({})</td></tr>", lvl, - chars.len(), chars.to_string(), - missing.to_string(), missing.len())?; + chars.len(), + chars.to_string(), + missing.to_string(), + missing.len() + )?; } writeln!(f, "</table>")?; @@ -513,207 +683,3 @@ fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result< f.flush()?; Ok(()) } - -#[derive(Debug, Clone, Serialize, Deserialize)] -struct Example { - ja: String, - en: String, - expl: String, - id: Option<String>, - chars: Charset, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -struct Batch { - level: String, - chars: Charset, - chars_p1: Charset, - chars_p2: Charset, - chars_bad: Charset, - examples: Vec<Example>, -} - -#[derive(Debug, Eq, PartialEq, Hash, Clone, Serialize, Deserialize, Default)] -struct Charset(Vec<char>); - -impl Charset { - fn new<S: AsRef<str>>(s: S) -> Self { - let mut chars = s.as_ref().chars().collect::<Vec<_>>(); - chars.sort(); - chars.dedup(); - Self(chars) - } - fn from_iter<S: IntoIterator<Item=char>>(s: S) -> Self { - let mut chars = s.into_iter().collect::<Vec<_>>(); - chars.sort(); - chars.dedup(); - Self(chars) - } - fn intersects(&self, other: &Self) -> bool { - let mut it1 = self.0.iter().peekable(); - let mut it2 = other.0.iter().peekable(); - while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) { - match c1.cmp(c2) { - Ordering::Equal => return true, - Ordering::Less => it1.next(), - Ordering::Greater => it2.next(), - }; - } - false - } - fn inter_len(&self, other: &Self) -> usize { - if other.len() > 20*self.len() { - // alternative path - return self.0.iter() - .filter(|x| other.0.binary_search(x).is_ok()) - .count(); - } - let mut it1 = self.0.iter().peekable(); - let mut it2 = other.0.iter().peekable(); - let mut ret = 0; - while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) { - match c1.cmp(c2) { - Ordering::Equal => { - ret += 1; - it1.next(); - it2.next(); - } - Ordering::Less => { - it1.next(); - } - Ordering::Greater => { - it2.next(); - } - }; - } - ret - } - fn inter(&self, other: &Self) -> Charset { - let mut it1 = self.0.iter().peekable(); - let mut it2 = other.0.iter().peekable(); - let mut ret = Vec::new(); - while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) { - match c1.cmp(c2) { - Ordering::Equal => { - ret.push(**c1); - it1.next(); - it2.next(); - } - Ordering::Less => { - it1.next(); - } - Ordering::Greater => { - it2.next(); - } - }; - } - Self(ret) - } - fn union(&self, other: &Self) -> Charset { - let mut it1 = self.0.iter().peekable(); - let mut it2 = other.0.iter().peekable(); - let mut ret = Vec::new(); - while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) { - match c1.cmp(c2) { - Ordering::Equal => { - ret.push(**c1); - it1.next(); - it2.next(); - } - Ordering::Less => { - ret.push(**c1); - it1.next(); - } - Ordering::Greater => { - ret.push(**c2); - it2.next(); - } - }; - } - while let Some(c) = it1.peek() { - ret.push(**c); - it1.next(); - } - while let Some(c) = it2.peek() { - ret.push(**c); - it2.next(); - } - Self(ret) - } - fn diff(&self, other: &Self) -> Charset { - let mut it1 = self.0.iter().peekable(); - let mut it2 = other.0.iter().peekable(); - let mut ret = Vec::new(); - while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) { - match c1.cmp(c2) { - Ordering::Equal => { - it1.next(); - it2.next(); - } - Ordering::Less => { - ret.push(**c1); - it1.next(); - } - Ordering::Greater => { - it2.next(); - } - }; - } - while let Some(c) = it1.peek() { - ret.push(**c); - it1.next(); - } - Self(ret) - } - fn len(&self) -> usize { - self.0.len() - } - fn is_empty(&self) -> bool { - self.0.is_empty() - } - fn chars(&self) -> &[char] { - &self.0 - } - fn contains(&self, c: char) -> bool { - self.0.binary_search(&c).is_ok() - } - fn to_string(&self) -> String { - self.0.iter().collect::<String>() - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_charset() { - let c1 = Charset::new("azerty"); - let c2 = Charset::new("uiopqsqdf"); - let c3 = Charset::new("hello, world"); - - assert!(!c1.intersects(&c2)); - assert!(c1.intersects(&c3)); - assert!(c2.intersects(&c3)); - - assert_eq!(c1.inter_len(&c2), 0); - assert_eq!(c1.inter_len(&c3), 2); - assert_eq!(c2.inter_len(&c3), 2); - - assert_eq!(c1.inter(&c2), Charset::new("")); - assert_eq!(c1.inter(&c3), Charset::new("er")); - assert_eq!(c2.inter(&c3), Charset::new("od")); - - assert_eq!(c1.union(&c2), Charset::new("azertyuiopqsdf")); - assert_eq!(c1.union(&c3), Charset::new("azertyhello, world")); - assert_eq!(c2.union(&c3), Charset::new("uiopqsdfhello, world")); - - assert_eq!(c1.diff(&c2), Charset::new("azerty")); - assert_eq!(c1.diff(&c3), Charset::new("azty")); - assert_eq!(c2.diff(&c3), Charset::new("uipqsf")); - - assert_eq!(c2.diff(&c1), Charset::new("uiopqsdf")); - assert_eq!(c3.diff(&c1), Charset::new("hllo, wold")); - assert_eq!(c3.diff(&c2), Charset::new("hell, wrl")); - } -} |