about | summary | refs | log | tree | commit | diff
path: root/src/datafiles.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/datafiles.rs')
-rw-r--r-- src/datafiles.rs 42
1 files changed, 34 insertions, 8 deletions
diff --git a/src/datafiles.rs b/src/datafiles.rs
index 7a6a5d5..fc6194f 100644
--- a/src/datafiles.rs
+++ b/src/datafiles.rs
@@ -1,6 +1,7 @@
use std::collections::HashMap;
use std::fs;
use std::io::{self, BufRead};
+use std::sync::Arc;
use anyhow::Result;
use serde::{Deserialize, Serialize};
@@ -22,21 +23,46 @@ pub struct Example {
// PARSING DATA FILES
// =====================================================================
-pub type DictEntry<'a> = roxmltree::Node<'a, 'a>;
-pub type DictIndex<'a> = HashMap<&'a str, Vec<DictEntry<'a>>>;
-pub fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
+pub struct DictEntry {
+ pub reb: String,
+ pub ent_seq: String,
+ pub sense: Box<[String]>,
+}
+
+pub type DictIndex = HashMap<String, Vec<Arc<DictEntry>>>;
+pub fn index_jmdict(dict: &roxmltree::Document) -> DictIndex {
let dict = dict
.root()
.children()
.find(|x| x.has_tag_name("JMdict"))
.unwrap();
- let mut ret: DictIndex<'a> = HashMap::new();
- for x in dict.children().filter(|x| x.has_tag_name("entry")) {
- for r in x.children().filter(|x| x.has_tag_name("k_ele")) {
+ let mut ret: DictIndex = HashMap::new();
+ for ent in dict.children().filter(|x| x.has_tag_name("entry")) {
+ let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
+ let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
+ let reb = reb.text().unwrap().trim().to_string();
+
+ let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap();
+ let ent_seq = ent_seq.text().unwrap().trim().to_string();
+
+ let sense = ent
+ .children()
+ .filter(|x| x.has_tag_name("sense"))
+ .filter_map(|sense| sense.children().find(|x| x.has_tag_name("gloss")))
+ .map(|gloss| gloss.text().unwrap().trim().to_string())
+ .collect::<Vec<_>>()
+ .into_boxed_slice();
+ let parsed_ent = Arc::new(DictEntry {
+ reb,
+ ent_seq,
+ sense,
+ });
+
+ for r in ent.children().filter(|x| x.has_tag_name("k_ele")) {
if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) {
- let txt = keb.text().unwrap().trim();
- ret.entry(txt).or_default().push(x);
+ let txt = keb.text().unwrap().trim().to_string();
+ ret.entry(txt).or_default().push(parsed_ent.clone());
}
}
}