aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Alex Auvolat <alex@adnab.me> 2023-11-27 17:26:59 +0100
committer Alex Auvolat <alex@adnab.me> 2023-11-27 17:26:59 +0100
commit d2a46c25219c21ac4f128da8512302935654d38e (patch)
tree a6d66ac4639e4d68fe57f9e8da72b08ecfb14d9f
parent b15723f33b486124a50408873d30998bb9d31b3b (diff)
download datagengo-d2a46c25219c21ac4f128da8512302935654d38e.tar.gz
datagengo-d2a46c25219c21ac4f128da8512302935654d38e.zip
split code into several files
-rw-r--r-- src/datafiles.rs 321
-rw-r--r-- src/format.rs 349
-rw-r--r-- src/main.rs 664
3 files changed, 676 insertions, 658 deletions
diff --git a/src/datafiles.rs b/src/datafiles.rs
new file mode 100644
index 0000000..629badf
--- /dev/null
+++ b/src/datafiles.rs
@@ -0,0 +1,321 @@
+use std::collections::HashMap;
+use std::fs;
+use std::io::{self, BufRead};
+
+use anyhow::Result;
+use serde::{Deserialize, Serialize};
+
+use crate::charset::Charset;
+
+/// One example sentence, as built by `read_examples` from an `A:`/`B:` line pair.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct Example {
+    /// Text of the `A:` line before the tab.
+    pub ja: String,
+    /// Text of the `A:` line after the tab (up to `#` when one is present).
+    pub en: String,
+    /// Content of the matching `B:` line.
+    pub expl: String,
+    /// Text following `#` on the `A:` line, if any.
+    pub id: Option<String>,
+    /// Characters of `ja` that also belong to the charset given to `read_examples`.
+    pub chars: Charset,
+}
+
+// =====================================================================
+// PARSING DATA FILES
+// =====================================================================
+
+/// Lookup table from the trimmed text of a `keb` element to every `entry`
+/// node that contains it.
+pub type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>;
+
+/// Builds a [`DictIndex`] over all `entry` children of the `JMdict` element.
+///
+/// For each `k_ele` child of an entry, the first `keb` child (if any) is used
+/// as a key and the entry node is appended to that key's list.
+///
+/// # Panics
+///
+/// Panics if the document root has no `JMdict` child, or if a `keb` element
+/// has no text.
+pub fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
+    let jmdict = dict
+        .root()
+        .children()
+        .find(|node| node.has_tag_name("JMdict"))
+        .unwrap();
+
+    let mut index: DictIndex<'a> = HashMap::new();
+    let entries = jmdict.children().filter(|node| node.has_tag_name("entry"));
+    for entry in entries {
+        let kebs = entry
+            .children()
+            .filter(|node| node.has_tag_name("k_ele"))
+            .filter_map(|k_ele| k_ele.children().find(|node| node.has_tag_name("keb")));
+        for keb in kebs {
+            let key = keb.text().unwrap().trim();
+            index.entry(key).or_default().push(entry);
+        }
+    }
+
+    index
+}
+
+/// Reads `data/kanjidic2.xml` and groups its characters into named levels.
+///
+/// Each `character` element contributes its `literal` to a bucket keyed by
+/// its (`jlpt`, `grade`) values from `misc`:
+/// - grades of 6 or less are all stored as 7;
+/// - `jlpt` 4 becomes 5, 3 becomes 4, and 2 becomes 3 when the character is
+///   listed in `data/n3_kanji.txt`.
+///
+/// The buckets are returned in ascending order of the sort key computed
+/// below, each as `(name, charset)`, where the charset excludes every
+/// character already present in an earlier bucket.
+///
+/// # Panics
+///
+/// Panics if the first child of the XML root is not `kanjidic2`, or if a
+/// `literal` does not hold exactly one character.
+pub fn parse_kanjidic() -> Result<Vec<(String, Charset)>> {
+    let n3_kanji = Charset::new(&fs::read_to_string("data/n3_kanji.txt")?.trim());
+
+    let file = fs::read_to_string("data/kanjidic2.xml")?;
+    let xml = roxmltree::Document::parse(&file)?;
+    let kanjidic = xml.root().first_child().unwrap();
+    assert!(kanjidic.has_tag_name("kanjidic2"));
+
+    let mut levels: HashMap<(Option<i32>, Option<i32>), String> = HashMap::new();
+
+    for character in kanjidic.children().filter(|n| n.has_tag_name("character")) {
+        let mut literal = None;
+        let mut jlpt = None;
+        let mut grade = None;
+        for field in character.children() {
+            if field.has_tag_name("literal") {
+                literal = field.text();
+            }
+            if field.has_tag_name("misc") {
+                for item in field.children() {
+                    if item.has_tag_name("jlpt") {
+                        jlpt = item.text().and_then(|t| t.parse::<i32>().ok());
+                    }
+                    if item.has_tag_name("grade") {
+                        grade = item.text().and_then(|t| t.parse::<i32>().ok());
+                    }
+                }
+            }
+        }
+
+        // Grades up to 6 share a single bucket (stored as grade 7).
+        let grade = grade.map(|g| if g <= 6 { 7 } else { g });
+
+        if let Some(lit) = literal {
+            assert_eq!(lit.chars().count(), 1);
+            // Cannot fail: the assertion above guarantees exactly one char.
+            let c = lit.chars().next().unwrap();
+            let jlpt = match jlpt {
+                Some(4) => Some(5),
+                Some(3) => Some(4),
+                Some(2) if n3_kanji.contains(c) => Some(3),
+                other => other,
+            };
+            levels.entry((jlpt, grade)).or_default().push(c);
+        }
+    }
+
+    let mut levels: Vec<_> = levels.into_iter().collect();
+    levels.sort_by_key(|(key, _)| {
+        let (j, g) = *key;
+        let base = match j {
+            Some(j) => (10 - j) * 20,
+            None => 1000,
+        };
+        base + g.unwrap_or(15)
+    });
+
+    let mut ret = Vec::new();
+    let mut seen = Charset::default();
+    for ((j, g), chars) in levels {
+        let prefix = match j {
+            Some(j) => format!("N{}", j),
+            None => "N0".to_string(),
+        };
+        let name = match g {
+            Some(7) => format!("{}a", prefix),
+            Some(8) => format!("{}b", prefix),
+            Some(g) => format!("{}-{}", prefix, g),
+            None => format!("{}+", prefix),
+        };
+        // Keep only characters that no earlier level already claimed.
+        let fresh = Charset::new(chars).diff(&seen);
+        seen = seen.union(&fresh);
+        ret.push((name, fresh));
+    }
+
+    Ok(ret)
+}
+
+/// Reads `data/kanji_levels.txt` and returns one `(level, kanji)` pair per
+/// line of the form `level: kanji`. Lines without `": "` are ignored.
+pub fn read_kanji_levels() -> Result<Vec<(String, String)>> {
+    let text = fs::read_to_string("data/kanji_levels.txt")?;
+    let mut levels = Vec::new();
+    for line in text.lines() {
+        if let Some((level, kanji)) = line.split_once(": ") {
+            levels.push((level.to_string(), kanji.to_string()));
+        }
+    }
+    Ok(levels)
+}
+
+/// Parses `data/examples.utf` into a list of [`Example`]s.
+///
+/// The most recent line starting with `A:` is remembered; each line starting
+/// with `B:` is paired with it. A pair yields an example only when the
+/// prefixes are exactly `A: ` / `B: ` and the `A` content contains a tab.
+/// The text after the tab is split at the first `#` into translation and id.
+///
+/// A progress message is written to stderr every 10000 input lines.
+pub fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
+    let file = fs::File::open("data/examples.utf")?;
+
+    let mut examples = Vec::new();
+    let mut pending_a = String::new();
+
+    for (i, line) in io::BufReader::new(file).lines().enumerate() {
+        let line = line?;
+        if i % 10000 == 0 {
+            eprintln!("read examples: {}/300 (x1000)", i / 1000);
+        }
+
+        if line.starts_with("A:") {
+            pending_a = line;
+            continue;
+        }
+        if !line.starts_with("B:") {
+            continue;
+        }
+
+        let (a, expl) = match (pending_a.strip_prefix("A: "), line.strip_prefix("B: ")) {
+            (Some(a), Some(b)) => (a, b),
+            _ => continue,
+        };
+        let (ja, rest) = match a.split_once('\t') {
+            Some(parts) => parts,
+            None => continue,
+        };
+        let (en, id) = match rest.split_once('#') {
+            Some((en, id)) => (en, Some(id.to_string())),
+            None => (rest, None),
+        };
+        examples.push(Example {
+            ja: ja.to_string(),
+            en: en.to_string(),
+            expl: expl.to_string(),
+            id,
+            chars: Charset::new(ja).inter(all_kanji),
+        });
+    }
+
+    Ok(examples)
+}
+
+/// One vocabulary entry tagged with a level label such as `"N5"`.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub struct JlptVocab {
+    /// Level label (`"N5"` .. `"N1"` when produced by `parse_jlpt_vocab`).
+    pub level: String,
+    /// Character set associated with `kanji`.
+    pub chars: Charset,
+    /// Written form of the word.
+    pub kanji: String,
+    /// Kana reading as found in the source file (may be empty).
+    pub kana: String,
+    /// English text.
+    pub en: String,
+}
+
+impl JlptVocab {
+    /// Serializes the entry as five tab-separated fields in the order
+    /// level, chars, kanji, kana, en (the layout `load_jlpt_vocab` reads).
+    pub fn to_string(&self) -> String {
+        format!(
+            "{level}\t{chars}\t{kanji}\t{kana}\t{en}",
+            level = self.level,
+            chars = self.chars.to_string(),
+            kanji = self.kanji,
+            kana = self.kana,
+            en = self.en,
+        )
+    }
+}
+
+/// Parses the per-level vocabulary files under `data/` (N5 first, then N4
+/// down to N1) and prints every entry to stdout, one tab-separated line each.
+pub fn parse_jlpt_vocab(all_kanji: &Charset) -> Result<()> {
+    // (kana file, english file, level) for the levels stored as two files.
+    const SPLIT_SOURCES: [(&str, &str, &str); 4] = [
+        ("data/n4_vocab_hiragana.txt", "data/n4_vocab_eng.txt", "N4"),
+        ("data/n3_vocab_hiragana.txt", "data/n3_vocab_eng.txt", "N3"),
+        ("data/n2_vocab_hiragana.txt", "data/n2_vocab_eng.txt", "N2"),
+        ("data/n1_vocab_hiragana.txt", "data/n1_vocab_eng.txt", "N1"),
+    ];
+
+    let mut vocab = vec![];
+    vocab.extend(parse_jlpt_vocab_combined(
+        "data/n5_vocab.txt",
+        "N5",
+        all_kanji,
+    )?);
+    for &(kana_file, eng_file, level) in SPLIT_SOURCES.iter() {
+        vocab.extend(parse_jlpt_vocab_split(kana_file, eng_file, level, all_kanji)?);
+    }
+
+    for entry in &vocab {
+        println!("{}", entry.to_string());
+    }
+    Ok(())
+}
+
+/// Parses a vocabulary file whose right-hand field holds the English text,
+/// optionally followed by a newline and the kana reading.
+///
+/// The left-hand field may list several written forms separated by `/`; one
+/// [`JlptVocab`] is produced per form, all sharing the same reading and
+/// English text.
+fn parse_jlpt_vocab_combined(
+    file: &str,
+    level: &str,
+    all_kanji: &Charset,
+) -> Result<Vec<JlptVocab>> {
+    let mut ret = vec![];
+    for (forms, answer) in jlpt_vocab_read_file(file)? {
+        let (eng, kana) = answer
+            .split_once('\n')
+            .map(|(eng, kana)| (eng, kana.trim()))
+            .unwrap_or_else(|| (answer.trim(), ""));
+        ret.extend(forms.split('/').map(|form| JlptVocab {
+            level: level.to_string(),
+            chars: Charset::new(form).inter(all_kanji),
+            kanji: form.to_string(),
+            kana: kana.to_string(),
+            en: eng.to_string(),
+        }));
+    }
+    Ok(ret)
+}
+
+/// Parses a vocabulary level stored as two files: `kana_file` maps written
+/// form to reading, `eng_file` maps a key to English text.
+///
+/// The English text is looked up by the written-form field first and by the
+/// reading second; rows with no match in `eng_file` are dropped. A
+/// written-form field may list several forms separated by `/`, each of which
+/// yields its own [`JlptVocab`].
+fn parse_jlpt_vocab_split(
+    kana_file: &str,
+    eng_file: &str,
+    level: &str,
+    all_kanji: &Charset,
+) -> Result<Vec<JlptVocab>> {
+    let eng_by_key: HashMap<String, String> =
+        jlpt_vocab_read_file(eng_file)?.into_iter().collect();
+
+    let mut ret = vec![];
+    for (forms, kana) in jlpt_vocab_read_file(kana_file)? {
+        let eng = match eng_by_key
+            .get(&forms)
+            .or_else(|| eng_by_key.get(&kana))
+        {
+            Some(eng) => eng,
+            None => continue,
+        };
+        ret.extend(forms.split('/').map(|form| JlptVocab {
+            level: level.to_string(),
+            chars: Charset::new(form).inter(all_kanji),
+            kanji: form.to_string(),
+            kana: kana.to_string(),
+            en: eng.to_string(),
+        }));
+    }
+    Ok(ret)
+}
+
+/// Reads a `left|right` vocabulary file and returns the trimmed pairs.
+///
+/// Before splitting at the first `|`, each line has `<br>` replaced by a
+/// newline and `</span>` / `<span class="...">` tags removed. Lines without
+/// a `|` are skipped.
+fn jlpt_vocab_read_file(file: &str) -> Result<Vec<(String, String)>> {
+    let span_open = regex::Regex::new(r#"<span class="\w+">"#)?;
+
+    let reader = io::BufReader::new(fs::File::open(file)?);
+    let mut pairs = vec![];
+    for raw in reader.lines() {
+        let untagged = raw?.replace("<br>", "\n").replace("</span>", "");
+        let cleaned = span_open.replace_all(&untagged, "");
+        match cleaned.split_once('|') {
+            Some((left, right)) => {
+                pairs.push((left.trim().to_string(), right.trim().to_string()));
+            }
+            None => {}
+        }
+    }
+
+    Ok(pairs)
+}
+
+/// Loads `data/jlpt_vocab.txt`, the tab-separated format written by
+/// `JlptVocab::to_string`.
+///
+/// Each line is split into at most five fields (level, chars, kanji, kana,
+/// en); lines with fewer than five fields are ignored. Any further tabs stay
+/// inside the `en` field.
+pub fn load_jlpt_vocab() -> Result<Vec<JlptVocab>> {
+    let reader = io::BufReader::new(fs::File::open("data/jlpt_vocab.txt")?);
+    let mut ret = vec![];
+    for line in reader.lines() {
+        let line = line?;
+        let fields: Vec<&str> = line.splitn(5, '\t').collect();
+        if let [level, chars, kanji, kana, en] = fields[..] {
+            ret.push(JlptVocab {
+                level: level.to_string(),
+                chars: Charset::new(chars),
+                kanji: kanji.to_string(),
+                kana: kana.to_string(),
+                en: en.to_string(),
+            });
+        }
+    }
+    Ok(ret)
+}
diff --git a/src/format.rs b/src/format.rs
new file mode 100644
index 0000000..1cdde1b
--- /dev/null
+++ b/src/format.rs
@@ -0,0 +1,349 @@
+use std::fs;
+
+use anyhow::Result;
+
+use crate::charset::Charset;
+use crate::*;
+
+// =====================================================================
+// FORMATTING TO HTML
+// =====================================================================
+
+/// Writes the HTML page for one batch, taking the `(index, batch)` pair as a
+/// single tuple argument.
+///
+/// # Panics
+///
+/// Panics with the message `format batch` if writing the page fails.
+pub fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) {
+    let outcome = format_batch_aux(dict_idx, count, i, batch);
+    outcome.expect("format batch");
+}
+
+/// Writes `public/{i:03}.html`, the lesson page for `batch`.
+///
+/// The page contains, in order:
+/// 1. navigation links to the index and to each of the `count` batch pages
+///    (the current one is shown as plain text);
+/// 2. the batch level and the batch characters in order of first appearance
+///    in the examples, each linking to jisho.org;
+/// 3. one section per example: the sentence with characters wrapped in a
+///    `char_cur` / `char_p1` / `char_p2` / `char_bad` span according to the
+///    batch set they belong to, the translation, and a collapsible
+///    explanation built from dictionary entries;
+/// 4. the extra vocabulary, split by whether `batch.level` contains the
+///    entry's level;
+/// 5. the extra examples as a collapsible table.
+///
+/// # Errors
+///
+/// Returns any I/O error raised while creating, writing or flushing the file.
+fn format_batch_aux<'a>(
+    dict_idx: &DictIndex<'a>,
+    count: usize,
+    i: usize,
+    batch: &Batch,
+) -> Result<()> {
+    let mut f = io::BufWriter::new(fs::File::create(format!("public/{:03}.html", i))?);
+    // Raw string: quotes must be written bare, a `\"` would be emitted with
+    // its backslash and break the charset declaration.
+    write!(
+        f,
+        r#"<!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset="UTF-8" />
+ <title>Batch #{:03}</title>
+ <link rel="stylesheet" type="text/css" href="style.css" />
+ </head>
+ <body><div class="batch_page">"#,
+        i
+    )?;
+
+    // Navigation bar.
+    writeln!(f, r#"<p><a href="index.html">index</a>"#)?;
+    for j in 0..count {
+        if j != i {
+            writeln!(f, r#" <a href="{:03}.html">{:03}</a>"#, j, j)?;
+        } else {
+            writeln!(f, " {:03}", j)?;
+        }
+    }
+    writeln!(f, r#"</p>"#)?;
+    writeln!(f, "<p>Level: {}</p>", batch.level)?;
+
+    // Batch characters, listed once each in the order the examples introduce them.
+    write!(f, r#"<p class="ja">"#)?;
+    let mut ex_prev = Charset::default();
+    for ex in batch.examples.iter() {
+        let ex_chars = ex.chars.inter(&batch.chars);
+        for c in ex_chars.diff(&ex_prev).chars().iter() {
+            write!(
+                f,
+                r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#,
+                c, c
+            )?;
+        }
+        ex_prev = ex_prev.union(&ex_chars);
+    }
+    writeln!(f, r#"</p>"#)?;
+
+    for ex in batch.examples.iter() {
+        writeln!(f, "<hr />")?;
+        write!(f, r#"<p class="ja">"#)?;
+        for c in ex.ja.chars() {
+            if batch.chars.contains(c) {
+                write!(f, r#"<span class="char_cur">{}</span>"#, c)?;
+            } else if batch.chars_p1.contains(c) {
+                write!(f, r#"<span class="char_p1">{}</span>"#, c)?;
+            } else if batch.chars_p2.contains(c) {
+                write!(f, r#"<span class="char_p2">{}</span>"#, c)?;
+            } else if batch.chars_bad.contains(c) {
+                write!(f, r#"<span class="char_bad">{}</span>"#, c)?;
+            } else {
+                write!(f, "{}", c)?;
+            }
+        }
+        writeln!(f, "</p>")?;
+        writeln!(f, r#"<p class="en">{}</p>"#, ex.en)?;
+
+        writeln!(f, r#"<details><summary>Explanation</summary>"#)?;
+        // Words touching a batch character are shown first, the others after
+        // the per-example character links.
+        let mut expl_batch = Vec::new();
+        let mut expl_all = Vec::new();
+        for word in ex.expl.split(|c| c == ' ' || c == '~') {
+            let (keb, reb) = expl_clean_word(word);
+            let wchars = Charset::new(keb);
+            if !wchars.intersects(&ex.chars) {
+                continue;
+            }
+            if let Some(ents) = dict_idx.get(keb) {
+                for ent in ents.iter() {
+                    if let Some(s) = dict_str(keb, reb, ent) {
+                        if wchars.intersects(&batch.chars) {
+                            expl_batch.push(s);
+                        } else {
+                            expl_all.push(s);
+                        }
+                    }
+                }
+            }
+        }
+        for be in expl_batch {
+            writeln!(f, r#"<p>{}</p>"#, be)?;
+        }
+        writeln!(f, r#"<p class="chars">"#)?;
+        for c in ex.chars.inter(&batch.chars).chars().iter() {
+            writeln!(
+                f,
+                r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#,
+                c, c
+            )?;
+        }
+        writeln!(f, r#"</p>"#)?;
+        for be in expl_all {
+            writeln!(f, r#"<p>{}</p>"#, be)?;
+        }
+        writeln!(f, r#"</details>"#)?;
+    }
+
+    writeln!(f, "<hr />")?;
+    format_vocab(
+        &mut f,
+        &batch
+            .extra_vocab
+            .iter()
+            .filter(|v| batch.level.contains(&v.level))
+            .collect::<Vec<_>>(),
+        "Extra vocabulary (this level)",
+    )?;
+    format_vocab(
+        &mut f,
+        &batch
+            .extra_vocab
+            .iter()
+            .filter(|v| !batch.level.contains(&v.level))
+            .collect::<Vec<_>>(),
+        "Extra vocabulary (previous levels)",
+    )?;
+
+    writeln!(
+        f,
+        r#"<details><summary>Extra examples (reading practice)</summary><table class="extratable">"#
+    )?;
+    for ex in batch.extra_examples.iter() {
+        let mut expl1 = Vec::new();
+        let mut expl2 = Vec::new();
+        for word in ex.expl.split(|c| c == ' ' || c == '~') {
+            let (keb, reb) = expl_clean_word(word);
+            let wchars = Charset::new(keb);
+            if !wchars.intersects(&ex.chars) {
+                continue;
+            }
+            if let Some(ents) = dict_idx.get(keb) {
+                for ent in ents.iter() {
+                    if let Some(s) = dict_str_short(keb, reb, ent) {
+                        if wchars.intersects(&batch.chars) {
+                            expl1.push(s);
+                        } else {
+                            expl2.push(s);
+                        }
+                    }
+                }
+            }
+        }
+        expl1.extend(expl2.into_iter());
+        let expl = expl1.join("<br />");
+        writeln!(
+            f,
+            r#"<tr><td><details><summary class="tab_large2 font_ja">&nbsp;&nbsp;{}&nbsp;&nbsp;</summary><div style="text-align: center">{}<br />{}</div></details></td></tr>"#,
+            ex.ja, ex.en, expl
+        )?;
+    }
+    writeln!(f, r#"</table></details>"#)?;
+
+    writeln!(f, "<hr />")?;
+    // Raw string so the backslash of the emoticon is emitted verbatim
+    // (`\(` is not a valid escape in a normal string literal).
+    writeln!(f, r#"<p>\(≧▽≦)/</p>"#)?;
+
+    // Close the `batch_page` div opened in the header.
+    write!(f, "</div></body></html>")?;
+    f.flush()?;
+    Ok(())
+}
+
+/// Writes `vocab` as a collapsible HTML table whose summary line is `t`.
+/// Writes nothing at all when `vocab` is empty.
+fn format_vocab(f: &mut impl Write, vocab: &[&JlptVocab], t: &str) -> Result<()> {
+    if vocab.is_empty() {
+        return Ok(());
+    }
+
+    writeln!(
+        f,
+        r#"<details><summary>{}</summary><table class="vocabtable">"#,
+        t
+    )?;
+    for entry in vocab.iter() {
+        writeln!(
+            f,
+            r#"<tr><td>{}</td><td>&nbsp;&nbsp;<span class="tab_large font_ja">{}</span>&nbsp;&nbsp;</td><td>{}</td><td class="font_ja">{}</td></tr>"#,
+            entry.level, entry.kanji, entry.en, entry.kana
+        )?;
+    }
+    writeln!(f, "</table></details>")?;
+    Ok(())
+}
+
+/// Splits one word of an explanation line into `(headword, reading)`.
+///
+/// The headword is the part of `w` before the first `(`, `{` or `[`. The
+/// reading is the text between the first `(` and the next `)`, when both are
+/// present.
+fn expl_clean_word(w: &str) -> (&str, Option<&str>) {
+    let headword = match w.find(|c: char| c == '(' || c == '{' || c == '[') {
+        Some(cut) => &w[..cut],
+        None => w,
+    };
+    let reading = match w.split_once('(') {
+        Some((_, tail)) => tail.split_once(')').map(|(inner, _)| inner),
+        None => None,
+    };
+    (headword, reading)
+}
+
+/// Renders `qkeb` together with the entry's first reading as an HTML span.
+///
+/// Returns `None` when `qreb` is given and differs from the text of the first
+/// `reb` under the entry's first `r_ele`.
+///
+/// # Panics
+///
+/// Panics if the entry has no `r_ele`, that `r_ele` has no `reb`, or the
+/// `reb` has no text.
+fn dict_str_short<'a>(
+    qkeb: &str,
+    qreb: Option<&str>,
+    ent: &roxmltree::Node<'a, 'a>,
+) -> Option<String> {
+    let first_r_ele = ent
+        .children()
+        .find(|node| node.has_tag_name("r_ele"))
+        .unwrap();
+    let reb_node = first_r_ele
+        .children()
+        .find(|node| node.has_tag_name("reb"))
+        .unwrap();
+    let reading = reb_node.text().unwrap().trim();
+
+    match qreb {
+        Some(wanted) if wanted != reading => None,
+        _ => Some(format!(
+            r#"<span class="font_ja">{} 【{}】</span>"#,
+            qkeb, reading
+        )),
+    }
+}
+
+/// Like [`dict_str_short`], followed by the first `gloss` of every `sense`
+/// of the entry, separated by `;`.
+///
+/// Returns `None` under the same condition as [`dict_str_short`].
+///
+/// # Panics
+///
+/// Panics if a selected `gloss` element has no text.
+fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) -> Option<String> {
+    let mut out = dict_str_short(qkeb, qreb, ent)?;
+
+    let first_glosses = ent
+        .children()
+        .filter(|node| node.has_tag_name("sense"))
+        .filter_map(|sense| sense.children().find(|node| node.has_tag_name("gloss")));
+    for gloss in first_glosses {
+        out.push(' ');
+        out.push_str(gloss.text().unwrap().trim());
+        out.push(';');
+    }
+
+    // Drop the separator left after the last gloss.
+    if out.ends_with(';') {
+        out.pop();
+    }
+    Some(out)
+}
+
+/// Writes `public/index.html`.
+///
+/// The page holds a link to `about.html` and two tables:
+/// - one row per batch (number, level, characters, number of examples and
+///   the `chars_p1` / `chars_p2` / `chars_bad` sets);
+/// - one row per entry of `kanji_levels` (skipping `N0+` and levels ending
+///   in `-10`) with its characters and those not covered by any batch.
+///
+/// # Errors
+///
+/// Returns any I/O error raised while creating, writing or flushing the file.
+pub fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> {
+    let mut f = io::BufWriter::new(fs::File::create("public/index.html")?);
+    // Raw string: quotes must be written bare, a `\"` would be emitted with
+    // its backslash and break the charset declaration.
+    write!(
+        f,
+        r#"<!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset="UTF-8" />
+ <title>List of batches</title>
+ <link rel="stylesheet" type="text/css" href="style.css" />
+ </head>
+ <body><div class="index_page">"#
+    )?;
+
+    writeln!(f, r#"<p><a href="about.html">About / How-to</a></p><hr />"#)?;
+
+    writeln!(f, "<table>")?;
+    writeln!(f, "<tr><th>Num</th><th>Level</th><th>Kanji</th><th>Examples</th><th>Lesson-1</th><th>Lesson-2</th><th>Ignore</th></tr>")?;
+    for (i, batch) in batches.iter().enumerate() {
+        writeln!(
+            f,
+            r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td class="font_ja">{}</td><td>&nbsp;&nbsp;{}</td><td class="font_ja">{}</td><td class="font_ja">{}</td><td class="font_ja">{}</td></tr>"#,
+            i,
+            i,
+            batch.level,
+            batch.chars.to_string(),
+            batch.examples.len(),
+            batch.chars_p1.to_string(),
+            batch.chars_p2.to_string(),
+            batch.chars_bad.to_string()
+        )?;
+    }
+    writeln!(f, r#"</table>"#)?;
+
+    writeln!(f, "<hr />")?;
+
+    // Every character taught by at least one batch.
+    let all_chars = Charset::from_iter(
+        batches
+            .iter()
+            .flat_map(|x| x.chars.chars().iter().copied()),
+    );
+    writeln!(f, "<table>")?;
+    writeln!(
+        f,
+        r#"<tr><th>Level</th><th>Count</th><th width="60%">Kanji</th><th>Missing kanji</th></tr>"#
+    )?;
+    for (lvl, chars) in kanji_levels.iter() {
+        if lvl == "N0+" || lvl.ends_with("-10") {
+            continue;
+        }
+        let chars = Charset::new(chars);
+        let missing = chars.diff(&all_chars);
+        writeln!(
+            f,
+            r#"<tr><td>{}</td><td>{}</td><td class="font_ja">{}</td><td><span class="font_ja">{}</span> ({})</td></tr>"#,
+            lvl,
+            chars.len(),
+            chars.to_string(),
+            missing.to_string(),
+            missing.len()
+        )?;
+    }
+    writeln!(f, "</table>")?;
+
+    write!(f, "</div></body></html>")?;
+    f.flush()?;
+    Ok(())
+}
+
+/// Writes `public/about.html`: a link back to the index followed by
+/// `README.md` rendered to HTML with `markdown::to_html`.
+///
+/// # Errors
+///
+/// Returns any I/O error raised while reading `README.md` or while creating,
+/// writing or flushing the output file.
+pub fn format_about() -> Result<()> {
+    let mut f = io::BufWriter::new(fs::File::create("public/about.html")?);
+    // Raw string: quotes must be written bare, a `\"` would be emitted with
+    // its backslash and break the charset declaration.
+    write!(
+        f,
+        r#"<!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset="UTF-8" />
+ <title>Datagengo README</title>
+ <link rel="stylesheet" type="text/css" href="style.css" />
+ </head>
+ <body>"#
+    )?;
+
+    writeln!(f, r#"<div class="about_page">"#)?;
+    writeln!(
+        f,
+        r#"<p><a href="index.html">Back to lessons</a></p><hr />"#
+    )?;
+
+    writeln!(
+        f,
+        "{}",
+        markdown::to_html(&fs::read_to_string("README.md")?)
+    )?;
+
+    writeln!(f, r#"</div></body></html>"#)?;
+
+    // Flush explicitly: a write error during the implicit flush on drop
+    // would otherwise be discarded.
+    f.flush()?;
+    Ok(())
+}
diff --git a/src/main.rs b/src/main.rs
index 4ec20d0..5d6b7d7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,15 +1,19 @@
use std::collections::HashMap;
use std::fs;
-use std::io::{self, BufRead, Write};
+use std::io::{self, Write};
-use anyhow::Result;
+//use anyhow::Result;
use rand::prelude::*;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use structopt::StructOpt;
mod charset;
+mod datafiles;
+mod format;
use charset::Charset;
+use datafiles::*;
+use format::*;
#[derive(Debug, StructOpt)]
#[structopt(name = "datagengo", about = "Japanese example practice maker")]
@@ -195,325 +199,12 @@ fn main() {
}
// =====================================================================
-// PARSING DATA FILES
-// =====================================================================
-
-type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>;
-fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
- let dict = dict
- .root()
- .children()
- .find(|x| x.has_tag_name("JMdict"))
- .unwrap();
-
- let mut ret: DictIndex<'a> = HashMap::new();
- for x in dict.children().filter(|x| x.has_tag_name("entry")) {
- for r in x.children().filter(|x| x.has_tag_name("k_ele")) {
- if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) {
- let txt = keb.text().unwrap().trim();
- ret.entry(txt).or_default().push(x);
- }
- }
- }
-
- ret
-}
-
-fn parse_kanjidic() -> Result<Vec<(String, Charset)>> {
- let n3_kanji = Charset::new(&fs::read_to_string("data/n3_kanji.txt")?.trim());
-
- let file = fs::read_to_string("data/kanjidic2.xml")?;
- let xml = roxmltree::Document::parse(&file)?;
- let kanjidic = xml.root().first_child().unwrap();
- assert!(kanjidic.has_tag_name("kanjidic2"));
-
- let mut levels = HashMap::new();
-
- for x in kanjidic.children() {
- if !x.has_tag_name("character") {
- continue;
- }
- let mut literal = None;
- let mut jlpt = None;
- let mut grade = None;
- for y in x.children() {
- if y.has_tag_name("literal") {
- literal = y.text();
- }
- if y.has_tag_name("misc") {
- for z in y.children() {
- if z.has_tag_name("jlpt") {
- jlpt = z.text().and_then(|x| str::parse::<i32>(x).ok());
- }
- if z.has_tag_name("grade") {
- grade = z.text().and_then(|x| str::parse::<i32>(x).ok());
- }
- }
- }
- }
- match grade {
- Some(i) if i <= 6 => grade = Some(7),
- _ => (),
- }
- if let Some(lit) = literal {
- assert_eq!(lit.chars().count(), 1);
- let jlpt = match jlpt {
- Some(4) => Some(5),
- Some(3) => Some(4),
- Some(2) if n3_kanji.contains(lit.chars().next().unwrap()) => Some(3),
- x => x,
- };
- levels
- .entry((jlpt, grade))
- .or_insert(String::new())
- .extend(lit.chars());
- }
- }
-
- let mut levels = levels.into_iter().collect::<Vec<_>>();
- levels.sort_by_key(|((j, g), _)| match (j, g) {
- (Some(j), Some(g)) => (10 - *j) * 20 + *g,
- (Some(j), None) => (10 - *j) * 20 + 15,
- (None, Some(g)) => 1000 + *g,
- (None, None) => 1015,
- });
-
- let mut ret = Vec::new();
- let mut pc = Charset::default();
- for ((j, g), chars) in levels.into_iter() {
- let name = match (j, g) {
- (Some(j), Some(7)) => format!("N{}a", j),
- (Some(j), Some(8)) => format!("N{}b", j),
- (Some(j), Some(g)) => format!("N{}-{}", j, g),
- (Some(j), None) => format!("N{}+", j),
- (None, Some(7)) => format!("N0a"),
- (None, Some(8)) => format!("N0b"),
- (None, Some(g)) => format!("N0-{}", g),
- (None, None) => format!("N0+"),
- };
- let chars = Charset::new(chars).diff(&pc);
- pc = pc.union(&chars);
- ret.push((name, chars));
- }
-
- Ok(ret)
-}
-
-fn read_kanji_levels() -> Result<Vec<(String, String)>> {
- Ok(fs::read_to_string("data/kanji_levels.txt")?
- .lines()
- .filter_map(|l| l.split_once(": "))
- .map(|(l, k)| (l.to_string(), k.to_string()))
- .collect::<Vec<_>>())
-}
-
-fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
- let file = fs::File::open("data/examples.utf")?;
-
- let mut ret = Vec::new();
- let mut a = "".to_string();
-
- for (i, line) in io::BufReader::new(file).lines().enumerate() {
- let line = line?;
- if line.starts_with("A:") {
- a = line;
- } else if line.starts_with("B:") {
- let s = a.strip_prefix("A: ");
- let t = line.strip_prefix("B: ");
- if let (Some(a), Some(b)) = (s, t) {
- if let Some((ja, eng)) = a.split_once("\t") {
- if let Some((eng, id)) = eng.split_once("#") {
- ret.push(Example {
- ja: ja.to_string(),
- en: eng.to_string(),
- expl: b.to_string(),
- id: Some(id.to_string()),
- chars: Charset::new(ja).inter(all_kanji),
- });
- } else {
- ret.push(Example {
- ja: ja.to_string(),
- en: eng.to_string(),
- expl: b.to_string(),
- id: None,
- chars: Charset::new(ja).inter(all_kanji),
- });
- }
- }
- }
- }
- if i % 10000 == 0 {
- eprintln!("read examples: {}/300 (x1000)", i / 1000);
- }
- }
-
- Ok(ret)
-}
-
-#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
-struct JlptVocab {
- level: String,
- chars: Charset,
- kanji: String,
- kana: String,
- en: String,
-}
-
-impl JlptVocab {
- fn to_string(&self) -> String {
- format!(
- "{}\t{}\t{}\t{}\t{}",
- self.level,
- self.chars.to_string(),
- self.kanji,
- self.kana,
- self.en
- )
- }
-}
-
-fn parse_jlpt_vocab(all_kanji: &Charset) -> Result<()> {
- let mut vocab = vec![];
- vocab.extend(parse_jlpt_vocab_combined(
- "data/n5_vocab.txt",
- "N5",
- all_kanji,
- )?);
- vocab.extend(parse_jlpt_vocab_split(
- "data/n4_vocab_hiragana.txt",
- "data/n4_vocab_eng.txt",
- "N4",
- all_kanji,
- )?);
- vocab.extend(parse_jlpt_vocab_split(
- "data/n3_vocab_hiragana.txt",
- "data/n3_vocab_eng.txt",
- "N3",
- all_kanji,
- )?);
- vocab.extend(parse_jlpt_vocab_split(
- "data/n2_vocab_hiragana.txt",
- "data/n2_vocab_eng.txt",
- "N2",
- all_kanji,
- )?);
- vocab.extend(parse_jlpt_vocab_split(
- "data/n1_vocab_hiragana.txt",
- "data/n1_vocab_eng.txt",
- "N1",
- all_kanji,
- )?);
- for v in vocab.iter() {
- println!("{}", v.to_string());
- }
- Ok(())
-}
-
-fn parse_jlpt_vocab_combined(
- file: &str,
- level: &str,
- all_kanji: &Charset,
-) -> Result<Vec<JlptVocab>> {
- let lines = jlpt_vocab_read_file(file)?;
- let mut ret = vec![];
- for (kanji, answer) in lines {
- let (eng, kana) = match answer.split_once('\n') {
- Some((a, b)) => (a, b.trim()),
- None => (answer.trim(), ""),
- };
- for kanji in kanji.split('/') {
- ret.push(JlptVocab {
- level: level.to_string(),
- chars: Charset::new(kanji).inter(all_kanji),
- kanji: kanji.to_string(),
- kana: kana.to_string(),
- en: eng.to_string(),
- });
- }
- }
- Ok(ret)
-}
-
-fn parse_jlpt_vocab_split(
- kana_file: &str,
- eng_file: &str,
- level: &str,
- all_kanji: &Charset,
-) -> Result<Vec<JlptVocab>> {
- let eng_lines = jlpt_vocab_read_file(eng_file)?
- .into_iter()
- .collect::<HashMap<String, String>>();
-
- let lines = jlpt_vocab_read_file(kana_file)?;
- let mut ret = vec![];
- for (kanji, kana) in lines {
- let eng = eng_lines.get(&kanji).or(eng_lines.get(&kana));
- if let Some(eng) = eng {
- for kanji in kanji.split('/') {
- ret.push(JlptVocab {
- level: level.to_string(),
- chars: Charset::new(kanji).inter(all_kanji),
- kanji: kanji.to_string(),
- kana: kana.to_string(),
- en: eng.to_string(),
- });
- }
- }
- }
- Ok(ret)
-}
-
-fn jlpt_vocab_read_file(file: &str) -> Result<Vec<(String, String)>> {
- let re = regex::Regex::new(r#"<span class="\w+">"#)?;
-
- let file = fs::File::open(file)?;
- let mut ret = vec![];
- for line in io::BufReader::new(file).lines() {
- let line = line?.replace("<br>", "\n").replace("</span>", "");
- let line = re.replace_all(&line, "");
- if let Some((a, b)) = line.split_once('|') {
- ret.push((a.trim().to_string(), b.trim().to_string()));
- }
- }
-
- Ok(ret)
-}
-
-fn load_jlpt_vocab() -> Result<Vec<JlptVocab>> {
- let file = fs::File::open("data/jlpt_vocab.txt")?;
- let mut ret = vec![];
- for line in io::BufReader::new(file).lines() {
- let line = line?;
- let line = line.splitn(5, "\t").collect::<Vec<_>>();
- if line.len() == 5 {
- ret.push(JlptVocab {
- level: line[0].to_string(),
- chars: Charset::new(line[1]),
- kanji: line[2].to_string(),
- kana: line[3].to_string(),
- en: line[4].to_string(),
- });
- }
- }
- Ok(ret)
-}
-
-// =====================================================================
// BATCH STRUCTURES AND GENERATION
// =====================================================================
const CHARS_PER_BATCH: usize = 20;
const MAX_NEW_CHARS_PER_EX: usize = 5;
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
-struct Example {
- ja: String,
- en: String,
- expl: String,
- id: Option<String>,
- chars: Charset,
-}
-
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
struct Batch {
level: String,
@@ -1109,346 +800,3 @@ fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) {
);
}
}
-
-// =====================================================================
-// FORMATTING TO HTML
-// =====================================================================
-
-fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) {
- format_batch_aux(dict_idx, count, i, batch).expect("format batch");
-}
-
-fn format_batch_aux<'a>(
- dict_idx: &DictIndex<'a>,
- count: usize,
- i: usize,
- batch: &Batch,
-) -> Result<()> {
- let mut f = io::BufWriter::new(fs::File::create(format!("public/{:03}.html", i))?);
- write!(
- f,
- r#"<!DOCTYPE html>
- <html>
- <head>
- <meta charset=\"UTF-8\" />
- <title>Batch #{:03}</title>
- <link rel="stylesheet" type="text/css" href="style.css" />
- </head>
- <body><div class="batch_page">"#,
- i
- )?;
-
- writeln!(f, r#"<p><a href="index.html">index</a>"#)?;
- for j in 0..count {
- if j != i {
- writeln!(f, r#" <a href="{:03}.html">{:03}</a>"#, j, j)?;
- } else {
- writeln!(f, " {:03}", j)?;
- }
- }
- writeln!(f, r#"</p>"#)?;
- writeln!(f, "<p>Level: {}</p>", batch.level)?;
-
- write!(f, r#"<p class="ja">"#)?;
- let mut ex_prev = Charset::default();
- for ex in batch.examples.iter() {
- let ex_chars = ex.chars.inter(&batch.chars);
- for c in ex_chars.diff(&ex_prev).chars().iter() {
- write!(
- f,
- r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#,
- c, c
- )?;
- }
- ex_prev = ex_prev.union(&ex_chars);
- }
- writeln!(f, r#"</p>"#)?;
-
- for ex in batch.examples.iter() {
- writeln!(f, "<hr />")?;
- write!(f, r#"<p class="ja">"#)?;
- for c in ex.ja.chars() {
- if batch.chars.contains(c) {
- write!(f, r#"<span class="char_cur">{}</span>"#, c)?;
- } else if batch.chars_p1.contains(c) {
- write!(f, r#"<span class="char_p1">{}</span>"#, c)?;
- } else if batch.chars_p2.contains(c) {
- write!(f, r#"<span class="char_p2">{}</span>"#, c)?;
- } else if batch.chars_bad.contains(c) {
- write!(f, r#"<span class="char_bad">{}</span>"#, c)?;
- } else {
- write!(f, "{}", c)?;
- }
- }
- writeln!(f, "</p>")?;
- writeln!(f, r#"<p class="en">{}</p>"#, ex.en)?;
-
- writeln!(f, r#"<details><summary>Explanation</summary>"#)?;
- let mut expl_batch = Vec::new();
- let mut expl_all = Vec::new();
- for word in ex.expl.split(|c| c == ' ' || c == '~') {
- let (keb, reb) = expl_clean_word(word);
- let wchars = Charset::new(keb);
- if !wchars.intersects(&ex.chars) {
- continue;
- }
- if let Some(ents) = dict_idx.get(keb) {
- for ent in ents.iter() {
- if let Some(s) = dict_str(keb, reb, ent) {
- if wchars.intersects(&batch.chars) {
- expl_batch.push(s);
- } else {
- expl_all.push(s);
- }
- }
- }
- }
- }
- for be in expl_batch {
- writeln!(f, r#"<p>{}</p>"#, be)?;
- }
- writeln!(f, r#"<p class="chars">"#)?;
- for c in ex.chars.inter(&batch.chars).chars().iter() {
- writeln!(
- f,
- r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#,
- c, c
- )?;
- }
- writeln!(f, r#"</p>"#)?;
- for be in expl_all {
- writeln!(f, r#"<p>{}</p>"#, be)?;
- }
- writeln!(f, r#"</details>"#)?;
- }
-
- writeln!(f, "<hr />")?;
- format_vocab(
- &mut f,
- &batch
- .extra_vocab
- .iter()
- .filter(|v| batch.level.contains(&v.level))
- .collect::<Vec<_>>(),
- "Extra vocabulary (this level)",
- )?;
- format_vocab(
- &mut f,
- &batch
- .extra_vocab
- .iter()
- .filter(|v| !batch.level.contains(&v.level))
- .collect::<Vec<_>>(),
- "Extra vocabulary (previous levels)",
- )?;
-
- writeln!(
- f,
- r#"<details><summary>Extra examples (reading practice)</summary><table class="extratable">"#
- )?;
- for ex in batch.extra_examples.iter() {
- let mut expl1 = Vec::new();
- let mut expl2 = Vec::new();
- for word in ex.expl.split(|c| c == ' ' || c == '~') {
- let (keb, reb) = expl_clean_word(word);
- let wchars = Charset::new(keb);
- if !wchars.intersects(&ex.chars) {
- continue;
- }
- if let Some(ents) = dict_idx.get(keb) {
- for ent in ents.iter() {
- if let Some(s) = dict_str_short(keb, reb, ent) {
- if wchars.intersects(&batch.chars) {
- expl1.push(s);
- } else {
- expl2.push(s);
- }
- }
- }
- }
- }
- expl1.extend(expl2.into_iter());
- let expl = expl1.join("<br />");
- writeln!(
- f,
- r#"<tr><td><details><summary class="tab_large2 font_ja">&nbsp;&nbsp;{}&nbsp;&nbsp;</summary><div style="text-align: center">{}<br />{}</div></details></td></tr>"#,
- ex.ja, ex.en, expl
- )?;
- }
- writeln!(f, r#"</table></details>"#)?;
-
- writeln!(f, "<hr />")?;
- writeln!(f, "<p>\(≧▽≦)/</p>")?;
-
- write!(f, "<div></body></html>")?;
- f.flush()?;
- Ok(())
-}
-
-fn format_vocab(f: &mut impl Write, vocab: &[&JlptVocab], t: &str) -> Result<()> {
- if !vocab.is_empty() {
- writeln!(
- f,
- r#"<details><summary>{}</summary><table class="vocabtable">"#,
- t
- )?;
- for v in vocab {
- writeln!(
- f,
- r#"<tr><td>{}</td><td>&nbsp;&nbsp;<span class="tab_large font_ja">{}</span>&nbsp;&nbsp;</td><td>{}</td><td class="font_ja">{}</td></tr>"#,
- v.level, v.kanji, v.en, v.kana
- )?;
- }
- writeln!(f, "</table></details>")?;
- }
- Ok(())
-}
-
-fn expl_clean_word(w: &str) -> (&str, Option<&str>) {
- let mut ret = w;
- for delim in ['(', '{', '['] {
- if let Some((s, _)) = ret.split_once(delim) {
- ret = s;
- }
- }
- let p = w
- .split_once('(')
- .and_then(|(_, r)| r.split_once(')'))
- .map(|(p, _)| p);
- (ret, p)
-}
-
-fn dict_str_short<'a>(
- qkeb: &str,
- qreb: Option<&str>,
- ent: &roxmltree::Node<'a, 'a>,
-) -> Option<String> {
- let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
- let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
- let reb = reb.text().unwrap().trim();
-
- if qreb.map(|x| x != reb).unwrap_or(false) {
- return None;
- }
-
- Some(format!(
- r#"<span class="font_ja">{} 【{}】</span>"#,
- qkeb, reb
- ))
-}
-
-fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) -> Option<String> {
- let mut ret = dict_str_short(qkeb, qreb, ent)?;
-
- for sense in ent.children().filter(|x| x.has_tag_name("sense")) {
- if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) {
- ret.extend(format!(" {};", s.text().unwrap().trim()).chars());
- }
- }
-
- if ret.chars().rev().next() == Some(';') {
- ret.pop();
- }
- Some(ret)
-}
-
-fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> {
- let mut f = io::BufWriter::new(fs::File::create("public/index.html")?);
- write!(
- f,
- r#"<!DOCTYPE html>
- <html>
- <head>
- <meta charset=\"UTF-8\" />
- <title>List of batches</title>
- <link rel="stylesheet" type="text/css" href="style.css" />
- </head>
- <body><div class="index_page">"#
- )?;
-
- writeln!(f, r#"<p><a href="about.html">About / How-to</a></p><hr />"#)?;
-
- writeln!(f, "<table>")?;
- writeln!(f, "<tr><th>Num</th><th>Level</th><th>Kanji</th><th>Examples</th><th>Lesson-1</th><th>Lesson-2</th><th>Ignore</th></tr>")?;
- for (i, batch) in batches.iter().enumerate() {
- writeln!(
- f,
- r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td class="font_ja">{}</td><td>&nbsp;&nbsp;{}</td><td class="font_ja">{}</td><td class="font_ja">{}</td><td class="font_ja">{}</td></tr>"#,
- i,
- i,
- batch.level,
- batch.chars.to_string(),
- batch.examples.len(),
- batch.chars_p1.to_string(),
- batch.chars_p2.to_string(),
- batch.chars_bad.to_string()
- )?;
- }
- writeln!(f, r#"</table>"#)?;
-
- writeln!(f, "<hr />")?;
-
- let all_chars = Charset::from_iter(
- batches
- .iter()
- .map(|x| x.chars.chars().iter().copied())
- .flatten(),
- );
- writeln!(f, "<table>")?;
- writeln!(
- f,
- r#"<tr><th>Level</th><th>Count</th><th width="60%">Kanji</th><th>Missing kanji</th></tr>"#
- )?;
- for (lvl, chars) in kanji_levels.iter() {
- if lvl == "N0+" || lvl.ends_with("-10") {
- continue;
- }
- let chars = Charset::new(chars);
- let missing = chars.diff(&all_chars);
- writeln!(
- f,
- r#"<tr><td>{}</td><td>{}</td><td class="font_ja">{}</td><td><span class="font_ja">{}</span> ({})</td></tr>"#,
- lvl,
- chars.len(),
- chars.to_string(),
- missing.to_string(),
- missing.len()
- )?;
- }
- writeln!(f, "</table>")?;
-
- write!(f, "</div></body></html>")?;
- f.flush()?;
- Ok(())
-}
-
-fn format_about() -> Result<()> {
- let mut f = io::BufWriter::new(fs::File::create("public/about.html")?);
- write!(
- f,
- r#"<!DOCTYPE html>
- <html>
- <head>
- <meta charset=\"UTF-8\" />
- <title>Datagengo README</title>
- <link rel="stylesheet" type="text/css" href="style.css" />
- </head>
- <body>"#
- )?;
-
- writeln!(f, r#"<div class="about_page">"#)?;
- writeln!(
- f,
- r#"<p><a href="index.html">Back to lessons</a></p><hr />"#
- )?;
-
- writeln!(
- f,
- "{}",
- markdown::to_html(&fs::read_to_string("README.md")?)
- )?;
-
- writeln!(f, r#"</div></body></html>"#)?;
-
- Ok(())
-}