diff options
author | Alex Auvolat <alex@adnab.me> | 2023-07-21 09:02:59 +0200 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-07-21 09:02:59 +0200 |
commit | 13997439f8f1440b56c1e7dd449e3444aad28197 (patch) | |
tree | 82c07bbc84191901af8be9d9fbb9f8c009055f51 /src | |
download | datagengo-13997439f8f1440b56c1e7dd449e3444aad28197.tar.gz datagengo-13997439f8f1440b56c1e7dd449e3444aad28197.zip |
first commit
Diffstat (limited to 'src')
-rw-r--r-- | src/main.rs | 235 |
1 files changed, 235 insertions, 0 deletions
diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..f99d236 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,235 @@ +use std::collections::HashMap; +use std::fs; +use std::cmp::Ordering; +use std::io::{self, BufRead}; + +use anyhow::Result; +use structopt::StructOpt; + +#[derive(Debug, StructOpt)] +#[structopt(name = "datagengo", about = "Japanese example practice maker")] +struct Opt { + #[structopt(subcommand)] + cmd: Cmd, +} + +#[derive(Debug, StructOpt)] +enum Cmd { + ParseKanjidic, + New, +} + +fn main() { + let opt = Opt::from_args(); + + match opt.cmd { + Cmd::ParseKanjidic => { + let levels = parse_kanjidic().expect("error"); + for (jlpt, grade, chars) in levels.iter() { + println!("{}.{}: {}", jlpt, grade, chars); + } + }, + Cmd::New => { + let kanji_levels = read_kanji_levels().expect("error"); + let all_kanji = Charset::new(kanji_levels.iter() + .map(|(_, x)| x.to_string()) + .collect::<Vec<_>>() + .join("")); + let ex = read_examples(&all_kanji).expect("error"); + println!("{:#?}", &ex[..10]); + } + } +} + +fn parse_kanjidic() -> Result<Vec<(i32, i32, String)>> { + let file = fs::read_to_string("data/kanjidic2.xml")?; + let xml = roxmltree::Document::parse(&file)?; + let kanjidic = xml.root().first_child().unwrap(); + assert!(kanjidic.has_tag_name("kanjidic2")); + + let mut levels = HashMap::new(); + + for x in kanjidic.children() { + if !x.has_tag_name("character") { + continue; + } + let mut literal = None; + let mut jlpt = None; + let mut grade = None; + for y in x.children() { + if y.has_tag_name("literal") { + literal = y.text(); + } + if y.has_tag_name("misc") { + for z in y.children() { + if z.has_tag_name("grade") { + grade = z.text().and_then(|x| str::parse::<i32>(x).ok()); + } + if z.has_tag_name("jlpt") { + jlpt = z.text().and_then(|x| str::parse::<i32>(x).ok()); + } + } + } + } + if jlpt.is_none() && grade.is_none() { + continue; + } + let level = (jlpt.unwrap_or(0), grade.unwrap_or(0)); + if let Some(lit) = literal { + levels.entry(level).or_insert(String::new()).extend(lit.chars()); + } + } + + let mut levels = levels.into_iter().map(|((j, g), c)| (j, g, c)).collect::<Vec<_>>(); + levels.sort_by_key(|(j, g, _)| (-*j, *g)); + Ok(levels) +} + +fn read_kanji_levels() -> Result<Vec<(String, String)>> { + Ok(fs::read_to_string("data/kanji_levels.txt")? + .lines() + .filter_map(|l| l.split_once(": ")) + .map(|(l, k)| (l.to_string(), k.to_string())) + .collect::<Vec<_>>()) +} + +fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> { + let file = fs::File::open("data/examples.utf")?; + + let mut ret = Vec::new(); + let mut a = "".to_string(); + + for line in io::BufReader::new(file).lines() { + let line = line?; + if line.starts_with("A:") { + a = line; + } else if line.starts_with("B:") { + let s = a.strip_prefix("A: "); + let t = line.strip_prefix("B: "); + if let (Some(a), Some(b)) = (s, t) { + if let Some((ja, eng)) = a.split_once("\t") { + if let Some((eng, id)) = eng.split_once("#") { + ret.push(Example { + ja: ja.to_string(), + en: eng.to_string(), + expl: b.to_string(), + id: Some(id.to_string()), + chars: Charset::new(ja).inter_chars(all_kanji), + }); + } else { + ret.push(Example { + ja: ja.to_string(), + en: eng.to_string(), + expl: b.to_string(), + id: None, + chars: Charset::new(ja).inter_chars(all_kanji), + }); + } + } + } + } + if ret.len() > 100 { + break; + } + } + + Ok(ret) +} + +#[derive(Debug)] +struct Example { + ja: String, + en: String, + expl: String, + id: Option<String>, + chars: Charset, +} + +#[derive(Debug)] +struct Charset(Vec<char>); + +impl Charset { + fn new<S: AsRef<str>>(s: S) -> Self { + let mut chars = s.as_ref().chars().collect::<Vec<_>>(); + chars.sort(); + chars.dedup(); + Self(chars) + } + fn intersects(&self, other: &Self) -> bool { + let mut it1 = self.0.iter().peekable(); + let mut it2 = other.0.iter().peekable(); + while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) { + match c1.cmp(c2) { + Ordering::Equal => return true, + Ordering::Less => it1.next(), + Ordering::Greater => it2.next(), + }; + } + false + } + fn count_inter(&self, other: &Self) -> usize { + let mut it1 = self.0.iter().peekable(); + let mut it2 = other.0.iter().peekable(); + let mut ret = 0; + while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) { + match c1.cmp(c2) { + Ordering::Equal => { + ret += 1; + it1.next(); + it2.next(); + } + Ordering::Less => { + it1.next(); + } + Ordering::Greater => { + it2.next(); + } + }; + } + ret + } + fn inter_chars(&self, other: &Self) -> Charset { + let mut it1 = self.0.iter().peekable(); + let mut it2 = other.0.iter().peekable(); + let mut ret = Vec::new(); + while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) { + match c1.cmp(c2) { + Ordering::Equal => { + ret.push(**c1); + it1.next(); + it2.next(); + } + Ordering::Less => { + it1.next(); + } + Ordering::Greater => { + it2.next(); + } + }; + } + Self(ret) + } + fn chars(&self) -> &[char] { + &self.0 + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_charset() { + let c1 = Charset::new("azerty"); + let c2 = Charset::new("uiopqsqdf"); + let c3 = Charset::new("hello, world"); + + assert!(!c1.intersects(&c2)); + assert!(c1.intersects(&c3)); + assert!(c2.intersects(&c3)); + + assert_eq!(c1.count_inter(&c2), 0); + assert_eq!(c1.count_inter(&c3), 2); + assert_eq!(c2.count_inter(&c3), 2); + } +} |