diff options
author | Alex Auvolat <alex@adnab.me> | 2024-03-15 17:36:55 +0100 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2024-03-15 17:36:55 +0100 |
commit | 4cd9081dc3a41594174480c1565fd2427550c50a (patch) | |
tree | 8a09db13b11477eaba40fa1a12347015738a5f8d /src/datafiles.rs | |
parent | f6a778698b8f0b709a7afa693ab42f8a0c314fe7 (diff) | |
download | datagengo-4cd9081dc3a41594174480c1565fd2427550c50a.tar.gz datagengo-4cd9081dc3a41594174480c1565fd2427550c50a.zip |
reduce memory usage & fix scroll to top
Diffstat (limited to 'src/datafiles.rs')
-rw-r--r-- | src/datafiles.rs | 42 |
1 file changed, 34 insertions, 8 deletions
diff --git a/src/datafiles.rs b/src/datafiles.rs index 7a6a5d5..fc6194f 100644 --- a/src/datafiles.rs +++ b/src/datafiles.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use std::fs; use std::io::{self, BufRead}; +use std::sync::Arc; use anyhow::Result; use serde::{Deserialize, Serialize}; @@ -22,21 +23,46 @@ pub struct Example { // PARSING DATA FILES // ===================================================================== -pub type DictEntry<'a> = roxmltree::Node<'a, 'a>; -pub type DictIndex<'a> = HashMap<&'a str, Vec<DictEntry<'a>>>; -pub fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> { +pub struct DictEntry { + pub reb: String, + pub ent_seq: String, + pub sense: Box<[String]>, +} + +pub type DictIndex = HashMap<String, Vec<Arc<DictEntry>>>; +pub fn index_jmdict(dict: &roxmltree::Document) -> DictIndex { let dict = dict .root() .children() .find(|x| x.has_tag_name("JMdict")) .unwrap(); - let mut ret: DictIndex<'a> = HashMap::new(); - for x in dict.children().filter(|x| x.has_tag_name("entry")) { - for r in x.children().filter(|x| x.has_tag_name("k_ele")) { + let mut ret: DictIndex = HashMap::new(); + for ent in dict.children().filter(|x| x.has_tag_name("entry")) { + let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); + let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); + let reb = reb.text().unwrap().trim().to_string(); + + let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap(); + let ent_seq = ent_seq.text().unwrap().trim().to_string(); + + let sense = ent + .children() + .filter(|x| x.has_tag_name("sense")) + .filter_map(|sense| sense.children().find(|x| x.has_tag_name("gloss"))) + .map(|gloss| gloss.text().unwrap().trim().to_string()) + .collect::<Vec<_>>() + .into_boxed_slice(); + let parsed_ent = Arc::new(DictEntry { + reb, + ent_seq, + sense, + }); + + for r in ent.children().filter(|x| x.has_tag_name("k_ele")) { if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) { 
- let txt = keb.text().unwrap().trim(); - ret.entry(txt).or_default().push(x); + let txt = keb.text().unwrap().trim().to_string(); + ret.entry(txt).or_default().push(parsed_ent.clone()); } } } |