about | summary | refs | log | tree | commit | diff
path: root/src/main.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/main.rs')
-rw-r--r-- src/main.rs 664
1 files changed, 6 insertions, 658 deletions
diff --git a/src/main.rs b/src/main.rs
index 4ec20d0..5d6b7d7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,15 +1,19 @@
use std::collections::HashMap;
use std::fs;
-use std::io::{self, BufRead, Write};
+use std::io::{self, Write};
-use anyhow::Result;
+//use anyhow::Result;
use rand::prelude::*;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use structopt::StructOpt;
mod charset;
+mod datafiles;
+mod format;
use charset::Charset;
+use datafiles::*;
+use format::*;
#[derive(Debug, StructOpt)]
#[structopt(name = "datagengo", about = "Japanese example practice maker")]
@@ -195,325 +199,12 @@ fn main() {
}
// =====================================================================
-// PARSING DATA FILES
-// =====================================================================
-
-type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>;
-fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
- let dict = dict
- .root()
- .children()
- .find(|x| x.has_tag_name("JMdict"))
- .unwrap();
-
- let mut ret: DictIndex<'a> = HashMap::new();
- for x in dict.children().filter(|x| x.has_tag_name("entry")) {
- for r in x.children().filter(|x| x.has_tag_name("k_ele")) {
- if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) {
- let txt = keb.text().unwrap().trim();
- ret.entry(txt).or_default().push(x);
- }
- }
- }
-
- ret
-}
-
-fn parse_kanjidic() -> Result<Vec<(String, Charset)>> {
- let n3_kanji = Charset::new(&fs::read_to_string("data/n3_kanji.txt")?.trim());
-
- let file = fs::read_to_string("data/kanjidic2.xml")?;
- let xml = roxmltree::Document::parse(&file)?;
- let kanjidic = xml.root().first_child().unwrap();
- assert!(kanjidic.has_tag_name("kanjidic2"));
-
- let mut levels = HashMap::new();
-
- for x in kanjidic.children() {
- if !x.has_tag_name("character") {
- continue;
- }
- let mut literal = None;
- let mut jlpt = None;
- let mut grade = None;
- for y in x.children() {
- if y.has_tag_name("literal") {
- literal = y.text();
- }
- if y.has_tag_name("misc") {
- for z in y.children() {
- if z.has_tag_name("jlpt") {
- jlpt = z.text().and_then(|x| str::parse::<i32>(x).ok());
- }
- if z.has_tag_name("grade") {
- grade = z.text().and_then(|x| str::parse::<i32>(x).ok());
- }
- }
- }
- }
- match grade {
- Some(i) if i <= 6 => grade = Some(7),
- _ => (),
- }
- if let Some(lit) = literal {
- assert_eq!(lit.chars().count(), 1);
- let jlpt = match jlpt {
- Some(4) => Some(5),
- Some(3) => Some(4),
- Some(2) if n3_kanji.contains(lit.chars().next().unwrap()) => Some(3),
- x => x,
- };
- levels
- .entry((jlpt, grade))
- .or_insert(String::new())
- .extend(lit.chars());
- }
- }
-
- let mut levels = levels.into_iter().collect::<Vec<_>>();
- levels.sort_by_key(|((j, g), _)| match (j, g) {
- (Some(j), Some(g)) => (10 - *j) * 20 + *g,
- (Some(j), None) => (10 - *j) * 20 + 15,
- (None, Some(g)) => 1000 + *g,
- (None, None) => 1015,
- });
-
- let mut ret = Vec::new();
- let mut pc = Charset::default();
- for ((j, g), chars) in levels.into_iter() {
- let name = match (j, g) {
- (Some(j), Some(7)) => format!("N{}a", j),
- (Some(j), Some(8)) => format!("N{}b", j),
- (Some(j), Some(g)) => format!("N{}-{}", j, g),
- (Some(j), None) => format!("N{}+", j),
- (None, Some(7)) => format!("N0a"),
- (None, Some(8)) => format!("N0b"),
- (None, Some(g)) => format!("N0-{}", g),
- (None, None) => format!("N0+"),
- };
- let chars = Charset::new(chars).diff(&pc);
- pc = pc.union(&chars);
- ret.push((name, chars));
- }
-
- Ok(ret)
-}
-
-fn read_kanji_levels() -> Result<Vec<(String, String)>> {
- Ok(fs::read_to_string("data/kanji_levels.txt")?
- .lines()
- .filter_map(|l| l.split_once(": "))
- .map(|(l, k)| (l.to_string(), k.to_string()))
- .collect::<Vec<_>>())
-}
-
-fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
- let file = fs::File::open("data/examples.utf")?;
-
- let mut ret = Vec::new();
- let mut a = "".to_string();
-
- for (i, line) in io::BufReader::new(file).lines().enumerate() {
- let line = line?;
- if line.starts_with("A:") {
- a = line;
- } else if line.starts_with("B:") {
- let s = a.strip_prefix("A: ");
- let t = line.strip_prefix("B: ");
- if let (Some(a), Some(b)) = (s, t) {
- if let Some((ja, eng)) = a.split_once("\t") {
- if let Some((eng, id)) = eng.split_once("#") {
- ret.push(Example {
- ja: ja.to_string(),
- en: eng.to_string(),
- expl: b.to_string(),
- id: Some(id.to_string()),
- chars: Charset::new(ja).inter(all_kanji),
- });
- } else {
- ret.push(Example {
- ja: ja.to_string(),
- en: eng.to_string(),
- expl: b.to_string(),
- id: None,
- chars: Charset::new(ja).inter(all_kanji),
- });
- }
- }
- }
- }
- if i % 10000 == 0 {
- eprintln!("read examples: {}/300 (x1000)", i / 1000);
- }
- }
-
- Ok(ret)
-}
-
-#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
-struct JlptVocab {
- level: String,
- chars: Charset,
- kanji: String,
- kana: String,
- en: String,
-}
-
-impl JlptVocab {
- fn to_string(&self) -> String {
- format!(
- "{}\t{}\t{}\t{}\t{}",
- self.level,
- self.chars.to_string(),
- self.kanji,
- self.kana,
- self.en
- )
- }
-}
-
-fn parse_jlpt_vocab(all_kanji: &Charset) -> Result<()> {
- let mut vocab = vec![];
- vocab.extend(parse_jlpt_vocab_combined(
- "data/n5_vocab.txt",
- "N5",
- all_kanji,
- )?);
- vocab.extend(parse_jlpt_vocab_split(
- "data/n4_vocab_hiragana.txt",
- "data/n4_vocab_eng.txt",
- "N4",
- all_kanji,
- )?);
- vocab.extend(parse_jlpt_vocab_split(
- "data/n3_vocab_hiragana.txt",
- "data/n3_vocab_eng.txt",
- "N3",
- all_kanji,
- )?);
- vocab.extend(parse_jlpt_vocab_split(
- "data/n2_vocab_hiragana.txt",
- "data/n2_vocab_eng.txt",
- "N2",
- all_kanji,
- )?);
- vocab.extend(parse_jlpt_vocab_split(
- "data/n1_vocab_hiragana.txt",
- "data/n1_vocab_eng.txt",
- "N1",
- all_kanji,
- )?);
- for v in vocab.iter() {
- println!("{}", v.to_string());
- }
- Ok(())
-}
-
-fn parse_jlpt_vocab_combined(
- file: &str,
- level: &str,
- all_kanji: &Charset,
-) -> Result<Vec<JlptVocab>> {
- let lines = jlpt_vocab_read_file(file)?;
- let mut ret = vec![];
- for (kanji, answer) in lines {
- let (eng, kana) = match answer.split_once('\n') {
- Some((a, b)) => (a, b.trim()),
- None => (answer.trim(), ""),
- };
- for kanji in kanji.split('/') {
- ret.push(JlptVocab {
- level: level.to_string(),
- chars: Charset::new(kanji).inter(all_kanji),
- kanji: kanji.to_string(),
- kana: kana.to_string(),
- en: eng.to_string(),
- });
- }
- }
- Ok(ret)
-}
-
-fn parse_jlpt_vocab_split(
- kana_file: &str,
- eng_file: &str,
- level: &str,
- all_kanji: &Charset,
-) -> Result<Vec<JlptVocab>> {
- let eng_lines = jlpt_vocab_read_file(eng_file)?
- .into_iter()
- .collect::<HashMap<String, String>>();
-
- let lines = jlpt_vocab_read_file(kana_file)?;
- let mut ret = vec![];
- for (kanji, kana) in lines {
- let eng = eng_lines.get(&kanji).or(eng_lines.get(&kana));
- if let Some(eng) = eng {
- for kanji in kanji.split('/') {
- ret.push(JlptVocab {
- level: level.to_string(),
- chars: Charset::new(kanji).inter(all_kanji),
- kanji: kanji.to_string(),
- kana: kana.to_string(),
- en: eng.to_string(),
- });
- }
- }
- }
- Ok(ret)
-}
-
-fn jlpt_vocab_read_file(file: &str) -> Result<Vec<(String, String)>> {
- let re = regex::Regex::new(r#"<span class="\w+">"#)?;
-
- let file = fs::File::open(file)?;
- let mut ret = vec![];
- for line in io::BufReader::new(file).lines() {
- let line = line?.replace("<br>", "\n").replace("</span>", "");
- let line = re.replace_all(&line, "");
- if let Some((a, b)) = line.split_once('|') {
- ret.push((a.trim().to_string(), b.trim().to_string()));
- }
- }
-
- Ok(ret)
-}
-
-fn load_jlpt_vocab() -> Result<Vec<JlptVocab>> {
- let file = fs::File::open("data/jlpt_vocab.txt")?;
- let mut ret = vec![];
- for line in io::BufReader::new(file).lines() {
- let line = line?;
- let line = line.splitn(5, "\t").collect::<Vec<_>>();
- if line.len() == 5 {
- ret.push(JlptVocab {
- level: line[0].to_string(),
- chars: Charset::new(line[1]),
- kanji: line[2].to_string(),
- kana: line[3].to_string(),
- en: line[4].to_string(),
- });
- }
- }
- Ok(ret)
-}
-
-// =====================================================================
// BATCH STRUCTURES AND GENERATION
// =====================================================================
const CHARS_PER_BATCH: usize = 20;
const MAX_NEW_CHARS_PER_EX: usize = 5;
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
-struct Example {
- ja: String,
- en: String,
- expl: String,
- id: Option<String>,
- chars: Charset,
-}
-
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
struct Batch {
level: String,
@@ -1109,346 +800,3 @@ fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) {
);
}
}
-
-// =====================================================================
-// FORMATTING TO HTML
-// =====================================================================
-
-fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) {
- format_batch_aux(dict_idx, count, i, batch).expect("format batch");
-}
-
-fn format_batch_aux<'a>(
- dict_idx: &DictIndex<'a>,
- count: usize,
- i: usize,
- batch: &Batch,
-) -> Result<()> {
- let mut f = io::BufWriter::new(fs::File::create(format!("public/{:03}.html", i))?);
- write!(
- f,
- r#"<!DOCTYPE html>
- <html>
- <head>
- <meta charset=\"UTF-8\" />
- <title>Batch #{:03}</title>
- <link rel="stylesheet" type="text/css" href="style.css" />
- </head>
- <body><div class="batch_page">"#,
- i
- )?;
-
- writeln!(f, r#"<p><a href="index.html">index</a>"#)?;
- for j in 0..count {
- if j != i {
- writeln!(f, r#" <a href="{:03}.html">{:03}</a>"#, j, j)?;
- } else {
- writeln!(f, " {:03}", j)?;
- }
- }
- writeln!(f, r#"</p>"#)?;
- writeln!(f, "<p>Level: {}</p>", batch.level)?;
-
- write!(f, r#"<p class="ja">"#)?;
- let mut ex_prev = Charset::default();
- for ex in batch.examples.iter() {
- let ex_chars = ex.chars.inter(&batch.chars);
- for c in ex_chars.diff(&ex_prev).chars().iter() {
- write!(
- f,
- r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#,
- c, c
- )?;
- }
- ex_prev = ex_prev.union(&ex_chars);
- }
- writeln!(f, r#"</p>"#)?;
-
- for ex in batch.examples.iter() {
- writeln!(f, "<hr />")?;
- write!(f, r#"<p class="ja">"#)?;
- for c in ex.ja.chars() {
- if batch.chars.contains(c) {
- write!(f, r#"<span class="char_cur">{}</span>"#, c)?;
- } else if batch.chars_p1.contains(c) {
- write!(f, r#"<span class="char_p1">{}</span>"#, c)?;
- } else if batch.chars_p2.contains(c) {
- write!(f, r#"<span class="char_p2">{}</span>"#, c)?;
- } else if batch.chars_bad.contains(c) {
- write!(f, r#"<span class="char_bad">{}</span>"#, c)?;
- } else {
- write!(f, "{}", c)?;
- }
- }
- writeln!(f, "</p>")?;
- writeln!(f, r#"<p class="en">{}</p>"#, ex.en)?;
-
- writeln!(f, r#"<details><summary>Explanation</summary>"#)?;
- let mut expl_batch = Vec::new();
- let mut expl_all = Vec::new();
- for word in ex.expl.split(|c| c == ' ' || c == '~') {
- let (keb, reb) = expl_clean_word(word);
- let wchars = Charset::new(keb);
- if !wchars.intersects(&ex.chars) {
- continue;
- }
- if let Some(ents) = dict_idx.get(keb) {
- for ent in ents.iter() {
- if let Some(s) = dict_str(keb, reb, ent) {
- if wchars.intersects(&batch.chars) {
- expl_batch.push(s);
- } else {
- expl_all.push(s);
- }
- }
- }
- }
- }
- for be in expl_batch {
- writeln!(f, r#"<p>{}</p>"#, be)?;
- }
- writeln!(f, r#"<p class="chars">"#)?;
- for c in ex.chars.inter(&batch.chars).chars().iter() {
- writeln!(
- f,
- r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#,
- c, c
- )?;
- }
- writeln!(f, r#"</p>"#)?;
- for be in expl_all {
- writeln!(f, r#"<p>{}</p>"#, be)?;
- }
- writeln!(f, r#"</details>"#)?;
- }
-
- writeln!(f, "<hr />")?;
- format_vocab(
- &mut f,
- &batch
- .extra_vocab
- .iter()
- .filter(|v| batch.level.contains(&v.level))
- .collect::<Vec<_>>(),
- "Extra vocabulary (this level)",
- )?;
- format_vocab(
- &mut f,
- &batch
- .extra_vocab
- .iter()
- .filter(|v| !batch.level.contains(&v.level))
- .collect::<Vec<_>>(),
- "Extra vocabulary (previous levels)",
- )?;
-
- writeln!(
- f,
- r#"<details><summary>Extra examples (reading practice)</summary><table class="extratable">"#
- )?;
- for ex in batch.extra_examples.iter() {
- let mut expl1 = Vec::new();
- let mut expl2 = Vec::new();
- for word in ex.expl.split(|c| c == ' ' || c == '~') {
- let (keb, reb) = expl_clean_word(word);
- let wchars = Charset::new(keb);
- if !wchars.intersects(&ex.chars) {
- continue;
- }
- if let Some(ents) = dict_idx.get(keb) {
- for ent in ents.iter() {
- if let Some(s) = dict_str_short(keb, reb, ent) {
- if wchars.intersects(&batch.chars) {
- expl1.push(s);
- } else {
- expl2.push(s);
- }
- }
- }
- }
- }
- expl1.extend(expl2.into_iter());
- let expl = expl1.join("<br />");
- writeln!(
- f,
- r#"<tr><td><details><summary class="tab_large2 font_ja">&nbsp;&nbsp;{}&nbsp;&nbsp;</summary><div style="text-align: center">{}<br />{}</div></details></td></tr>"#,
- ex.ja, ex.en, expl
- )?;
- }
- writeln!(f, r#"</table></details>"#)?;
-
- writeln!(f, "<hr />")?;
- writeln!(f, "<p>\(≧▽≦)/</p>")?;
-
- write!(f, "<div></body></html>")?;
- f.flush()?;
- Ok(())
-}
-
-fn format_vocab(f: &mut impl Write, vocab: &[&JlptVocab], t: &str) -> Result<()> {
- if !vocab.is_empty() {
- writeln!(
- f,
- r#"<details><summary>{}</summary><table class="vocabtable">"#,
- t
- )?;
- for v in vocab {
- writeln!(
- f,
- r#"<tr><td>{}</td><td>&nbsp;&nbsp;<span class="tab_large font_ja">{}</span>&nbsp;&nbsp;</td><td>{}</td><td class="font_ja">{}</td></tr>"#,
- v.level, v.kanji, v.en, v.kana
- )?;
- }
- writeln!(f, "</table></details>")?;
- }
- Ok(())
-}
-
-fn expl_clean_word(w: &str) -> (&str, Option<&str>) {
- let mut ret = w;
- for delim in ['(', '{', '['] {
- if let Some((s, _)) = ret.split_once(delim) {
- ret = s;
- }
- }
- let p = w
- .split_once('(')
- .and_then(|(_, r)| r.split_once(')'))
- .map(|(p, _)| p);
- (ret, p)
-}
-
-fn dict_str_short<'a>(
- qkeb: &str,
- qreb: Option<&str>,
- ent: &roxmltree::Node<'a, 'a>,
-) -> Option<String> {
- let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
- let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
- let reb = reb.text().unwrap().trim();
-
- if qreb.map(|x| x != reb).unwrap_or(false) {
- return None;
- }
-
- Some(format!(
- r#"<span class="font_ja">{} 【{}】</span>"#,
- qkeb, reb
- ))
-}
-
-fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) -> Option<String> {
- let mut ret = dict_str_short(qkeb, qreb, ent)?;
-
- for sense in ent.children().filter(|x| x.has_tag_name("sense")) {
- if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) {
- ret.extend(format!(" {};", s.text().unwrap().trim()).chars());
- }
- }
-
- if ret.chars().rev().next() == Some(';') {
- ret.pop();
- }
- Some(ret)
-}
-
-fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> {
- let mut f = io::BufWriter::new(fs::File::create("public/index.html")?);
- write!(
- f,
- r#"<!DOCTYPE html>
- <html>
- <head>
- <meta charset=\"UTF-8\" />
- <title>List of batches</title>
- <link rel="stylesheet" type="text/css" href="style.css" />
- </head>
- <body><div class="index_page">"#
- )?;
-
- writeln!(f, r#"<p><a href="about.html">About / How-to</a></p><hr />"#)?;
-
- writeln!(f, "<table>")?;
- writeln!(f, "<tr><th>Num</th><th>Level</th><th>Kanji</th><th>Examples</th><th>Lesson-1</th><th>Lesson-2</th><th>Ignore</th></tr>")?;
- for (i, batch) in batches.iter().enumerate() {
- writeln!(
- f,
- r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td class="font_ja">{}</td><td>&nbsp;&nbsp;{}</td><td class="font_ja">{}</td><td class="font_ja">{}</td><td class="font_ja">{}</td></tr>"#,
- i,
- i,
- batch.level,
- batch.chars.to_string(),
- batch.examples.len(),
- batch.chars_p1.to_string(),
- batch.chars_p2.to_string(),
- batch.chars_bad.to_string()
- )?;
- }
- writeln!(f, r#"</table>"#)?;
-
- writeln!(f, "<hr />")?;
-
- let all_chars = Charset::from_iter(
- batches
- .iter()
- .map(|x| x.chars.chars().iter().copied())
- .flatten(),
- );
- writeln!(f, "<table>")?;
- writeln!(
- f,
- r#"<tr><th>Level</th><th>Count</th><th width="60%">Kanji</th><th>Missing kanji</th></tr>"#
- )?;
- for (lvl, chars) in kanji_levels.iter() {
- if lvl == "N0+" || lvl.ends_with("-10") {
- continue;
- }
- let chars = Charset::new(chars);
- let missing = chars.diff(&all_chars);
- writeln!(
- f,
- r#"<tr><td>{}</td><td>{}</td><td class="font_ja">{}</td><td><span class="font_ja">{}</span> ({})</td></tr>"#,
- lvl,
- chars.len(),
- chars.to_string(),
- missing.to_string(),
- missing.len()
- )?;
- }
- writeln!(f, "</table>")?;
-
- write!(f, "</div></body></html>")?;
- f.flush()?;
- Ok(())
-}
-
-fn format_about() -> Result<()> {
- let mut f = io::BufWriter::new(fs::File::create("public/about.html")?);
- write!(
- f,
- r#"<!DOCTYPE html>
- <html>
- <head>
- <meta charset=\"UTF-8\" />
- <title>Datagengo README</title>
- <link rel="stylesheet" type="text/css" href="style.css" />
- </head>
- <body>"#
- )?;
-
- writeln!(f, r#"<div class="about_page">"#)?;
- writeln!(
- f,
- r#"<p><a href="index.html">Back to lessons</a></p><hr />"#
- )?;
-
- writeln!(
- f,
- "{}",
- markdown::to_html(&fs::read_to_string("README.md")?)
- )?;
-
- writeln!(f, r#"</div></body></html>"#)?;
-
- Ok(())
-}