aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Alex Auvolat <alex@adnab.me> 2023-09-25 14:43:31 +0200
committer Alex Auvolat <alex@adnab.me> 2023-09-25 14:43:31 +0200
commit bde10f88a32a1c9deb274b81938155ebcc2e9400 (patch)
tree 240991e233a6492bb8e2d72d52274329855d8521 /src
parent 50e66cc177a8146e43c397b73c11312a2a0cf2cd (diff)
download datagengo-bde10f88a32a1c9deb274b81938155ebcc2e9400.tar.gz
datagengo-bde10f88a32a1c9deb274b81938155ebcc2e9400.zip
add vocab per jlpt
Diffstat (limited to 'src')
-rw-r--r-- src/main.rs 104
1 files changed, 104 insertions, 0 deletions
diff --git a/src/main.rs b/src/main.rs
index 335add8..48d8bf9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -21,6 +21,7 @@ struct Opt {
#[derive(Debug, StructOpt)]
enum Cmd {
ParseKanjidic,
+ ParseJlptVocab,
New {
#[structopt(default_value = "10")]
count: usize,
@@ -42,6 +43,14 @@ fn main() {
println!("{}: {}", level, chars.to_string());
}
}
+ Cmd::ParseJlptVocab => {
+ let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
+ let all_kanji = Charset::from_iter(kanji_levels
+ .iter()
+ .map(|(_, c)| c.chars())
+ .flatten());
+ parse_jlpt_vocab(&all_kanji).expect("error");
+ }
Cmd::New { truncate, count } => {
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
let all_kanji = Charset::new(
@@ -288,6 +297,101 @@ fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
Ok(ret)
}
+ /// One vocabulary entry produced by the JLPT vocabulary parsers.
+ #[derive(Debug, Clone)]
+ struct JlptVocab {
+     /// Level label supplied by the caller of the parser (e.g. `"N4"`).
+     level: String,
+     /// Result of `Charset::new(kanji).inter(all_kanji)`; presumably the
+     /// characters of `kanji` that are also in the known-kanji set.
+     chars: Charset,
+     /// One written form of the word (one `/`-separated variant of the source column).
+     kanji: String,
+     /// Kana reading; may be empty when the source entry carries none.
+     kana: String,
+     /// English gloss.
+     en: String,
+ }
+
+ /// Formats an entry as one tab-separated record:
+ /// `level`, `chars`, `kanji`, `kana`, `en` (in that order, no trailing newline).
+ ///
+ /// Implementing `Display` instead of an inherent `to_string` keeps
+ /// `entry.to_string()` working (through the blanket `ToString` impl) while
+ /// also allowing the entry to be used directly in `format!`/`println!`.
+ impl std::fmt::Display for JlptVocab {
+     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+         write!(
+             f,
+             "{}\t{}\t{}\t{}\t{}",
+             self.level,
+             // `Charset` is rendered through its own `to_string`, as before.
+             self.chars.to_string(),
+             self.kanji,
+             self.kana,
+             self.en
+         )
+     }
+ }
+
+ /// Loads every JLPT vocabulary list and prints one line per entry to stdout.
+ ///
+ /// The first list is read from a single combined file, the remaining ones
+ /// from a kana file paired with an English file. Each entry is printed using
+ /// `JlptVocab::to_string`.
+ ///
+ /// # Errors
+ /// Returns the first error reported by any of the underlying parsers.
+ fn parse_jlpt_vocab(all_kanji: &Charset) -> Result<()> {
+     // (kana file, English file, level label) for the lists stored as two files.
+     // NOTE(review): the labels are offset from the file names (e.g. the n4
+     // files are labelled "N3"); kept exactly as written -- confirm intent.
+     let split_sources = [
+         ("data/n4_vocab_hiragana.txt", "data/n4_vocab_eng.txt", "N3"),
+         ("data/n3_vocab_hiragana.txt", "data/n3_vocab_eng.txt", "N2a"),
+         ("data/n2_vocab_hiragana.txt", "data/n2_vocab_eng.txt", "N2b"),
+         ("data/n1_vocab_hiragana.txt", "data/n1_vocab_eng.txt", "N1"),
+     ];
+
+     let mut entries = parse_jlpt_vocab_combined("data/n5_vocab.txt", "N4", all_kanji)?;
+     for &(kana_file, eng_file, level) in split_sources.iter() {
+         entries.extend(parse_jlpt_vocab_split(kana_file, eng_file, level, all_kanji)?);
+     }
+
+     for entry in &entries {
+         println!("{}", entry.to_string());
+     }
+     Ok(())
+ }
+
+ /// Parses a vocabulary file in which each entry's answer column holds the
+ /// English gloss and, after a line break, the kana reading.
+ ///
+ /// * `file` - path handed to `jlpt_vocab_read_file`.
+ /// * `level` - label copied into every produced entry.
+ /// * `all_kanji` - set passed to `Charset::inter` to compute each entry's `chars`.
+ ///
+ /// The word column may list several written forms separated by `/`; one
+ /// `JlptVocab` is produced per form, all sharing the same reading and gloss.
+ /// When the answer has no line break the whole answer is taken as the gloss
+ /// and the reading is left empty.
+ ///
+ /// # Errors
+ /// Propagates any error from `jlpt_vocab_read_file`.
+ fn parse_jlpt_vocab_combined(file: &str, level: &str, all_kanji: &Charset) -> Result<Vec<JlptVocab>> {
+     let mut ret = vec![];
+     for (kanji, answer) in jlpt_vocab_read_file(file)? {
+         // Trim the gloss in both branches: previously whitespace left in
+         // front of the line break ended up in the `en` field.
+         let (eng, kana) = match answer.split_once('\n') {
+             Some((a, b)) => (a.trim(), b.trim()),
+             None => (answer.trim(), ""),
+         };
+         ret.extend(kanji.split('/').map(|variant| JlptVocab {
+             level: level.to_string(),
+             chars: Charset::new(variant).inter(all_kanji),
+             kanji: variant.to_string(),
+             kana: kana.to_string(),
+             en: eng.to_string(),
+         }));
+     }
+     Ok(ret)
+ }
+
+ /// Parses a vocabulary list stored as two files: one mapping each word to
+ /// its kana reading and one mapping words to their English gloss.
+ ///
+ /// * `kana_file` / `eng_file` - paths handed to `jlpt_vocab_read_file`.
+ /// * `level` - label copied into every produced entry.
+ /// * `all_kanji` - set passed to `Charset::inter` to compute each entry's `chars`.
+ ///
+ /// The gloss is looked up by the word column first and by the reading as a
+ /// fallback; words with no gloss under either key are skipped. A word column
+ /// listing several forms separated by `/` yields one entry per form.
+ ///
+ /// # Errors
+ /// Propagates any error from `jlpt_vocab_read_file`.
+ fn parse_jlpt_vocab_split(kana_file: &str, eng_file: &str, level: &str, all_kanji: &Charset) -> Result<Vec<JlptVocab>> {
+     // The English file is read first; on duplicate keys the later line wins.
+     let glosses: HashMap<String, String> = jlpt_vocab_read_file(eng_file)?.into_iter().collect();
+
+     let mut entries = Vec::new();
+     for (word, reading) in jlpt_vocab_read_file(kana_file)? {
+         let gloss = match glosses.get(&word).or_else(|| glosses.get(&reading)) {
+             Some(found) => found,
+             // No translation under either key: leave the word out.
+             None => continue,
+         };
+         entries.extend(word.split('/').map(|variant| JlptVocab {
+             level: level.to_string(),
+             chars: Charset::new(variant).inter(all_kanji),
+             kanji: variant.to_string(),
+             kana: reading.to_string(),
+             en: gloss.to_string(),
+         }));
+     }
+     Ok(entries)
+ }
+
+ /// Reads a `|`-separated vocabulary file into `(left, right)` column pairs.
+ ///
+ /// Each line is cleaned before splitting: `<br>` becomes a newline character,
+ /// `</span>` is removed, and every `<span class="...">` opening tag is
+ /// removed. The line is then split at its first `|`; both halves are trimmed.
+ /// Lines without a `|` are ignored.
+ ///
+ /// # Errors
+ /// Fails if the pattern cannot be compiled, the file cannot be opened, or a
+ /// line cannot be read.
+ fn jlpt_vocab_read_file(file: &str) -> Result<Vec<(String, String)>> {
+     let span_open = regex::Regex::new(r#"<span class="\w+">"#)?;
+
+     let reader = io::BufReader::new(fs::File::open(file)?);
+     let mut pairs = Vec::new();
+     for raw in reader.lines() {
+         let with_breaks = raw?.replace("<br>", "\n");
+         let without_close = with_breaks.replace("</span>", "");
+         let cleaned = span_open.replace_all(&without_close, "");
+         match cleaned.split_once('|') {
+             Some((left, right)) => pairs.push((left.trim().to_string(), right.trim().to_string())),
+             None => {}
+         }
+     }
+
+     Ok(pairs)
+ }
+
// =====================================================================
// BATCH STRUCTURES AND GENERATION
// =====================================================================