aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAlex Auvolat <alex@adnab.me>2023-07-21 09:02:59 +0200
committerAlex Auvolat <alex@adnab.me>2023-07-21 09:02:59 +0200
commit13997439f8f1440b56c1e7dd449e3444aad28197 (patch)
tree82c07bbc84191901af8be9d9fbb9f8c009055f51 /src
downloaddatagengo-13997439f8f1440b56c1e7dd449e3444aad28197.tar.gz
datagengo-13997439f8f1440b56c1e7dd449e3444aad28197.zip
first commit
Diffstat (limited to 'src')
-rw-r--r--src/main.rs235
1 files changed, 235 insertions, 0 deletions
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..f99d236
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,235 @@
+use std::collections::HashMap;
+use std::fs;
+use std::cmp::Ordering;
+use std::io::{self, BufRead};
+
+use anyhow::Result;
+use structopt::StructOpt;
+
+#[derive(Debug, StructOpt)]
+#[structopt(name = "datagengo", about = "Japanese example practice maker")]
+struct Opt {
+ #[structopt(subcommand)]
+ cmd: Cmd,
+}
+
+#[derive(Debug, StructOpt)]
+enum Cmd {
+ ParseKanjidic,
+ New,
+}
+
+fn main() {
+ let opt = Opt::from_args();
+
+ match opt.cmd {
+ Cmd::ParseKanjidic => {
+ let levels = parse_kanjidic().expect("error");
+ for (jlpt, grade, chars) in levels.iter() {
+ println!("{}.{}: {}", jlpt, grade, chars);
+ }
+ },
+ Cmd::New => {
+ let kanji_levels = read_kanji_levels().expect("error");
+ let all_kanji = Charset::new(kanji_levels.iter()
+ .map(|(_, x)| x.to_string())
+ .collect::<Vec<_>>()
+ .join(""));
+ let ex = read_examples(&all_kanji).expect("error");
+ println!("{:#?}", &ex[..10]);
+ }
+ }
+}
+
+fn parse_kanjidic() -> Result<Vec<(i32, i32, String)>> {
+ let file = fs::read_to_string("data/kanjidic2.xml")?;
+ let xml = roxmltree::Document::parse(&file)?;
+ let kanjidic = xml.root().first_child().unwrap();
+ assert!(kanjidic.has_tag_name("kanjidic2"));
+
+ let mut levels = HashMap::new();
+
+ for x in kanjidic.children() {
+ if !x.has_tag_name("character") {
+ continue;
+ }
+ let mut literal = None;
+ let mut jlpt = None;
+ let mut grade = None;
+ for y in x.children() {
+ if y.has_tag_name("literal") {
+ literal = y.text();
+ }
+ if y.has_tag_name("misc") {
+ for z in y.children() {
+ if z.has_tag_name("grade") {
+ grade = z.text().and_then(|x| str::parse::<i32>(x).ok());
+ }
+ if z.has_tag_name("jlpt") {
+ jlpt = z.text().and_then(|x| str::parse::<i32>(x).ok());
+ }
+ }
+ }
+ }
+ if jlpt.is_none() && grade.is_none() {
+ continue;
+ }
+ let level = (jlpt.unwrap_or(0), grade.unwrap_or(0));
+ if let Some(lit) = literal {
+ levels.entry(level).or_insert(String::new()).extend(lit.chars());
+ }
+ }
+
+ let mut levels = levels.into_iter().map(|((j, g), c)| (j, g, c)).collect::<Vec<_>>();
+ levels.sort_by_key(|(j, g, _)| (-*j, *g));
+ Ok(levels)
+}
+
+fn read_kanji_levels() -> Result<Vec<(String, String)>> {
+ Ok(fs::read_to_string("data/kanji_levels.txt")?
+ .lines()
+ .filter_map(|l| l.split_once(": "))
+ .map(|(l, k)| (l.to_string(), k.to_string()))
+ .collect::<Vec<_>>())
+}
+
+fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
+ let file = fs::File::open("data/examples.utf")?;
+
+ let mut ret = Vec::new();
+ let mut a = "".to_string();
+
+ for line in io::BufReader::new(file).lines() {
+ let line = line?;
+ if line.starts_with("A:") {
+ a = line;
+ } else if line.starts_with("B:") {
+ let s = a.strip_prefix("A: ");
+ let t = line.strip_prefix("B: ");
+ if let (Some(a), Some(b)) = (s, t) {
+ if let Some((ja, eng)) = a.split_once("\t") {
+ if let Some((eng, id)) = eng.split_once("#") {
+ ret.push(Example {
+ ja: ja.to_string(),
+ en: eng.to_string(),
+ expl: b.to_string(),
+ id: Some(id.to_string()),
+ chars: Charset::new(ja).inter_chars(all_kanji),
+ });
+ } else {
+ ret.push(Example {
+ ja: ja.to_string(),
+ en: eng.to_string(),
+ expl: b.to_string(),
+ id: None,
+ chars: Charset::new(ja).inter_chars(all_kanji),
+ });
+ }
+ }
+ }
+ }
+ if ret.len() > 100 {
+ break;
+ }
+ }
+
+ Ok(ret)
+}
+
+#[derive(Debug)]
+struct Example {
+ ja: String,
+ en: String,
+ expl: String,
+ id: Option<String>,
+ chars: Charset,
+}
+
/// A set of characters stored as a sorted, duplicate-free vector.
///
/// Keeping the characters sorted lets every set operation run as a single
/// linear merge over the two operands.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Charset(Vec<char>);

impl Charset {
    /// Builds the set of distinct characters occurring in `s`.
    fn new<S: AsRef<str>>(s: S) -> Self {
        let mut chars = s.as_ref().chars().collect::<Vec<_>>();
        chars.sort_unstable();
        chars.dedup();
        Self(chars)
    }

    /// Lazily yields, in ascending order, the characters present in both
    /// `self` and `other`.
    ///
    /// This is the one merge loop shared by [`Charset::intersects`],
    /// [`Charset::count_inter`] and [`Charset::inter_chars`].
    fn common<'a>(&'a self, other: &'a Self) -> impl Iterator<Item = char> + 'a {
        let mut lhs = self.0.iter().copied().peekable();
        let mut rhs = other.0.iter().copied().peekable();
        std::iter::from_fn(move || loop {
            // Either side running out ends the intersection.
            let (a, b) = (*lhs.peek()?, *rhs.peek()?);
            match a.cmp(&b) {
                Ordering::Equal => {
                    lhs.next();
                    rhs.next();
                    return Some(a);
                }
                // Advance whichever side is behind.
                Ordering::Less => {
                    lhs.next();
                }
                Ordering::Greater => {
                    rhs.next();
                }
            }
        })
    }

    /// Returns `true` if the two sets share at least one character.
    fn intersects(&self, other: &Self) -> bool {
        self.common(other).next().is_some()
    }

    /// Returns the number of characters the two sets have in common.
    fn count_inter(&self, other: &Self) -> usize {
        self.common(other).count()
    }

    /// Returns the set of characters the two sets have in common.
    fn inter_chars(&self, other: &Self) -> Charset {
        // `common` yields sorted, distinct characters, so the invariant holds.
        Self(self.common(other).collect())
    }

    /// Returns the characters of the set as a sorted slice.
    fn chars(&self) -> &[char] {
        &self.0
    }
}
+
#[cfg(test)]
mod test {
    use super::*;

    /// Exercises the `Charset` set operations on small ASCII inputs,
    /// including the empty set.
    #[test]
    fn test_charset() {
        let c1 = Charset::new("azerty");
        let c2 = Charset::new("uiopqsqdf");
        let c3 = Charset::new("hello, world");
        let empty = Charset::new("");

        assert!(!c1.intersects(&c2));
        assert!(c1.intersects(&c3));
        assert!(c2.intersects(&c3));
        assert!(!c1.intersects(&empty));

        assert_eq!(c1.count_inter(&c2), 0);
        assert_eq!(c1.count_inter(&c3), 2);
        assert_eq!(c2.count_inter(&c3), 2);
        assert_eq!(empty.count_inter(&c3), 0);

        // The intersection itself must be sorted and duplicate-free.
        assert_eq!(c1.inter_chars(&c3).chars(), &['e', 'r'][..]);
        assert_eq!(c2.inter_chars(&c3).chars(), &['d', 'o'][..]);
        assert!(c1.inter_chars(&c2).chars().is_empty());
    }
}