aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Alex Auvolat <alex@adnab.me> 2023-07-21 09:02:59 +0200
committer Alex Auvolat <alex@adnab.me> 2023-07-21 09:02:59 +0200
commit 13997439f8f1440b56c1e7dd449e3444aad28197 (patch)
tree 82c07bbc84191901af8be9d9fbb9f8c009055f51
download datagengo-13997439f8f1440b56c1e7dd449e3444aad28197.tar.gz
datagengo-13997439f8f1440b56c1e7dd449e3444aad28197.zip
first commit
-rw-r--r-- .gitignore 5
-rw-r--r-- Cargo.lock 248
-rw-r--r-- Cargo.toml 11
-rw-r--r-- data/kanji_levels.txt 25
-rw-r--r-- src/main.rs 235
5 files changed, 524 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6fd94d6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+/target
+
+kanjidic*.xml
+JMdict*.xml
+examples.utf
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..f5ba8c2
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,248 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "ansi_term"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "anyhow"
+version = "1.0.72"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b13c32d80ecc7ab747b80c3784bce54ee8a7a0cc4fbda9bf4cda2cf6fe90854"
+
+[[package]]
+name = "atty"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "winapi",
+]
+
+[[package]]
+name = "bitflags"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+
+[[package]]
+name = "clap"
+version = "2.34.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c"
+dependencies = [
+ "ansi_term",
+ "atty",
+ "bitflags",
+ "strsim",
+ "textwrap",
+ "unicode-width",
+ "vec_map",
+]
+
+[[package]]
+name = "datagengo"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "roxmltree",
+ "structopt",
+]
+
+[[package]]
+name = "heck"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c"
+dependencies = [
+ "unicode-segmentation",
+]
+
+[[package]]
+name = "hermit-abi"
+version = "0.1.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
+[[package]]
+name = "libc"
+version = "0.2.147"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
+
+[[package]]
+name = "proc-macro-error"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
+dependencies = [
+ "proc-macro-error-attr",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "version_check",
+]
+
+[[package]]
+name = "proc-macro-error-attr"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "version_check",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.66"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5fe8a65d69dd0808184ebb5f836ab526bb259db23c657efa38711b1072ee47f0"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "roxmltree"
+version = "0.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d8f595a457b6b8c6cda66a48503e92ee8d19342f905948f29c383200ec9eb1d8"
+dependencies = [
+ "xmlparser",
+]
+
+[[package]]
+name = "strsim"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
+
+[[package]]
+name = "structopt"
+version = "0.3.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10"
+dependencies = [
+ "clap",
+ "lazy_static",
+ "structopt-derive",
+]
+
+[[package]]
+name = "structopt-derive"
+version = "0.4.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0"
+dependencies = [
+ "heck",
+ "proc-macro-error",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "syn"
+version = "1.0.109"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "textwrap"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
+dependencies = [
+ "unicode-width",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c"
+
+[[package]]
+name = "unicode-segmentation"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
+
+[[package]]
+name = "unicode-width"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
+
+[[package]]
+name = "vec_map"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
+
+[[package]]
+name = "version_check"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
+
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "xmlparser"
+version = "0.13.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..f00cad3
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "datagengo"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+anyhow = "1.0"
+structopt = "0.3"
+roxmltree = "0.18"
diff --git a/data/kanji_levels.txt b/data/kanji_levels.txt
new file mode 100644
index 0000000..da163bc
--- /dev/null
+++ b/data/kanji_levels.txt
@@ -0,0 +1,25 @@
+4.1: 一右雨円下火花学気休金九空月見五口校左三山四子耳七車手十出女小上人水生先千川足大男中天土二日入年白八百本名木目立六
+4.2: 何会外間魚言古午後語行高国今時社週書少食新西前多長店電東道読南買半父分聞母北毎万友来話
+4.3: 安飲駅
+3.1: 音犬字森正青赤早村町田文夕力林
+3.2: 引遠夏家歌画回海楽顔帰牛京強教近兄計元光工広考合黒作姉市思止紙自室弱首秋春場色心親図声切走太体台知地池茶昼朝鳥通弟冬答頭同肉売風歩方妹明門夜野曜用理
+3.3: 悪暗意医員院運屋界開寒漢館起急究去業銀区軽研県仕使始死事持写者主終習集住重所暑乗真進世送族待代題短着注転都度動発病品服物勉味問薬有洋旅
+3.4: 以英建験好菜産試借説低働特飯不別便民料
+3.5: 質貸堂
+3.6: 映私洗
+2.1: 王貝玉糸石草竹虫
+2.2: 羽雲園黄科絵角活丸岩記形原戸交公才細算寺数星晴雪線船組谷直点当内馬麦番米鳴毛
+2.3: 委育泳央横温化荷階感岸期客球級橋局曲苦具君係決血庫湖向幸港号根祭坂皿指歯次式実取守酒受州拾宿助勝商消章植深申神身整昔全想相息速他打対第炭談柱調追定庭鉄登島投湯等童農波配倍箱畑反板悲皮美鼻筆氷表秒負部福平返放命面役油由遊予様葉陽落流両緑礼列練路和
+2.4: 愛案位衣印栄塩億加果課貨改械害各覚完官管観関願器希機季議求泣給漁競共協極訓群軍景芸欠結健固候康香差最材昨刷察札参散残司児治辞失種周祝順初焼照省笑城信臣成清静席積折節戦浅選然争側束続卒孫帯達単置仲兆底的伝徒努灯熱念敗飛必標付夫富府副兵変辺包法望末満未無約勇要浴利陸良量輪類令例冷連労老録
+2.5: 圧囲易移因営永液演応仮価可河過解快格確額刊慣喜基寄規技逆久救旧居許境均禁型経件検険減現限個故効厚構耕航講鉱告混査再妻採際在罪財殺雑賛史師志支枝資似示識授修術述準招象賞常情条状職制勢性政精製税績責接設絶祖総像増造則測損団断築貯張停程適導銅得毒独任燃能破判版犯比費非備評貧婦布武復複仏粉編保報豊暴貿防務夢迷綿輸余容率略留領歴
+2.6: 異胃域宇延灰拡革割巻干看簡危机疑吸供胸勤敬警劇券権呼誤紅降刻骨困砂座済冊詞誌捨若収純処署諸除将承蒸針専泉善層操窓装臓蔵存尊退宅担探暖段値宙著庁頂賃痛展党届難乳認脳拝背晩否批腹並閉片補暮宝訪亡忘棒枚優郵預幼欲翌乱卵裏律論
+2.8: 依偉違鋭越煙汚奥押欧菓介皆較乾換汗環甘含祈喫詰巨叫恐挟況狭偶隅掘靴傾恵迎肩賢軒枯雇互御更硬肯荒郊腰込婚歳咲伺刺脂湿舟柔緒召床昇紹畳触伸寝辛震吹姓占双捜掃燥憎贈替袋濯恥遅畜駐超沈珍泥滴殿塗渡途怒倒凍塔盗筒到逃突曇鈍軟猫悩濃杯泊薄爆肌髪抜般販彼疲被匹怖普浮符膚舞封幅払沸壁捕募抱坊帽忙磨埋眠娘戻与溶踊頼絡粒了涼療涙零齢恋湾腕
+1.2: 汽弓刀矢里
+1.3: 宮詩昭丁帳笛豆羊
+1.4: 井沖芽賀街潟岐旗挙鏡熊郡径功佐崎氏滋鹿唱松倉巣隊典徳奈縄梅博媛票牧養梨
+1.5: 衛益往幹眼紀義興句潔護災桜酸士飼舎謝序証織素属態提統肥弁墓脈
+1.6: 遺沿恩我閣株揮貴郷筋系激穴憲絹厳源己后孝皇鋼穀裁策蚕姿至視磁射尺樹宗就衆従縦縮熟傷障仁垂推寸盛聖誠舌宣染銭創奏誕忠潮腸敵糖討納派俳肺班秘俵奮陛幕密盟模訳覧臨朗
+1.8: 亜哀握扱威尉慰為維緯壱逸稲芋姻陰隠韻渦唄浦影詠疫悦謁閲宴援炎猿縁艶鉛凹旺殴翁憶乙卸穏佳嫁寡暇架禍稼箇華蚊雅餓塊壊怪悔懐戒拐劾慨概涯該垣嚇核殻獲穫郭隔岳掛喝括渇滑褐轄且鎌刈冠勘勧喚堪寛患憾敢棺款歓監緩缶肝艦貫還鑑閑陥頑企伎奇幾忌既棋棄軌輝飢騎鬼亀偽儀宜戯擬欺犠菊吉却脚虐丘及朽窮糾拒拠虚距享凶峡恭狂矯脅響驚仰凝暁錦斤琴緊菌襟謹吟駆駒愚虞遇屈繰桑勲薫刑啓契慶憩掲携渓継茎蛍鶏鯨撃傑倹兼剣圏堅嫌懸拳献謙遣顕幻弦玄孤弧虎誇顧鼓呉娯悟碁侯坑孔巧恒慌抗拘控攻江洪溝甲稿絞綱衡貢購酵項剛拷豪克酷獄墾恨懇昆紺魂唆沙詐鎖債催宰彩栽采砕斎載剤削搾索錯撮擦傘惨桟暫嗣施旨祉紫肢諮賜雌侍慈璽軸執漆疾芝赦斜煮遮蛇邪爵酌釈寂朱殊狩珠趣儒寿需囚愁秀臭襲酬醜充汁渋獣銃叔淑粛塾俊瞬准循旬殉潤盾巡遵庶叙徐償匠升奨宵尚彰抄掌晶沼渉焦症硝礁祥称粧肖衝訟詔詳鐘丈冗剰壌嬢浄譲醸錠嘱飾殖辱侵唇娠審慎振浸紳薪診刃尋甚尽迅陣須酢帥炊睡粋衰遂酔随髄崇枢据杉澄瀬畝是征牲誓請逝斉隻惜斥析籍跡拙摂窃仙扇栓潜旋繊薦践遷鮮漸禅繕塑措疎礎租粗訴阻僧喪壮爽挿曹槽荘葬藻遭霜騒促即俗賊汰堕妥惰駄耐怠泰滞胎逮滝卓択拓沢託濁諾但奪脱棚丹嘆旦淡端胆鍛壇弾痴稚致蓄逐秩窒嫡抽衷鋳弔彫徴懲挑眺聴跳勅朕鎮陳津墜椎塚漬坪釣鶴亭偵貞呈堤帝廷抵締艇訂逓邸摘哲徹撤迭添吐斗奴唐悼搭桃棟痘藤謄踏透陶騰闘憧洞瞳胴峠匿督篤凸屯豚那尼弐虹如尿妊忍寧粘把覇婆廃排輩培媒賠陪伯拍舶迫漠縛鉢伐罰閥伴帆搬畔繁藩範煩頒盤蛮卑妃扉披泌碑罷避尾微眉姫漂描苗浜賓頻敏瓶扶敷腐譜賦赴附侮伏覆噴墳憤紛雰丙併塀幣弊柄癖偏遍舗穂慕簿倣俸奉峰崩泡砲縫胞芳褒邦飽乏傍剖妨房某冒紡肪膨謀僕墨撲朴睦没堀奔翻凡盆摩魔麻膜又抹繭慢漫魅岬妙矛霧婿銘滅免茂妄猛盲網耗黙紋冶弥厄躍柳愉癒諭唯幽悠憂湧猶裕誘雄融誉庸揚揺擁窯謡抑翼羅裸雷酪嵐欄濫藍吏履璃痢離硫隆竜慮虜僚寮猟瞭糧陵倫厘隣瑠塁累励鈴隷霊麗暦劣烈裂廉錬呂炉露廊楼浪漏郎賄惑枠
+1.9: 阿葵茜渥旭梓絢綾鮎杏伊惟亥郁磯允胤卯丑叡瑛苑於伽嘉茄霞魁凱馨叶樺茅侃莞巌嬉毅稀誼鞠橘亨匡喬尭桐欣欽芹衿玖矩栗袈圭慧桂絃胡伍吾梧瑚鯉倖宏弘昂晃浩紘鴻嵯瑳裟哉冴朔笹皐燦爾蒔汐偲紗勺洲峻竣舜駿淳醇曙渚恕庄捷昌梢菖蕉丞穣晋榛秦翠錘瑞嵩雛碩銑惣綜聡蒼黛鯛鷹啄琢只辰巽檀智猪暢脹蝶槻蔦椿紬悌汀禎杜寅酉惇敦凪捺楠乃之巴萩肇鳩隼斐緋柊彦彪彬芙楓蕗碧甫輔朋萌鳳鵬槙柾亦麿巳稔椋孟匁也耶靖佑宥柚祐邑楊耀蓉遥蘭李琉亮凌稜諒遼琳麟伶嶺怜玲蓮禄倭亘侑勁奎崚彗昴晏晨晟暉栞椰毬洸洵滉漱澪燎燿瑶皓眸笙綺綸翔脩茉莉菫詢諄赳迪頌颯黎凜熙
+0.4: 茨岡阪埼栃阜
+0.8: 挨宛闇椅畏萎咽淫臼餌怨臆俺苛牙崖蓋骸柿顎葛釜瓦韓玩畿僅巾串窟稽詣隙桁鍵舷股乞勾喉梗頃痕挫塞柵拶斬嫉腫呪蹴拭尻芯腎裾凄醒戚脊煎羨腺詮膳曽狙遡痩捉袖遜唾堆戴誰綻酎捗潰爪諦溺貼妬賭頓謎鍋匂捻罵箸斑氾汎膝肘蔽蔑蜂貌勃昧枕蜜冥麺餅妖沃侶賂弄麓脇丼傲刹哺喩嗅嘲毀彙恣惧慄憬拉摯曖楷鬱璧瘍箋籠緻羞訃諧貪踪辣錮塡頰𠮟剝
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..f99d236
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,235 @@
+use std::collections::HashMap;
+use std::fs;
+use std::cmp::Ordering;
+use std::io::{self, BufRead};
+
+use anyhow::Result;
+use structopt::StructOpt;
+
+#[derive(Debug, StructOpt)]
+#[structopt(name = "datagengo", about = "Japanese example practice maker")]
+struct Opt {
+ #[structopt(subcommand)]
+ cmd: Cmd,
+}
+
+#[derive(Debug, StructOpt)]
+enum Cmd {
+ ParseKanjidic,
+ New,
+}
+
+fn main() {
+ let opt = Opt::from_args();
+
+ match opt.cmd {
+ Cmd::ParseKanjidic => {
+ let levels = parse_kanjidic().expect("error");
+ for (jlpt, grade, chars) in levels.iter() {
+ println!("{}.{}: {}", jlpt, grade, chars);
+ }
+ },
+ Cmd::New => {
+ let kanji_levels = read_kanji_levels().expect("error");
+ let all_kanji = Charset::new(kanji_levels.iter()
+ .map(|(_, x)| x.to_string())
+ .collect::<Vec<_>>()
+ .join(""));
+ let ex = read_examples(&all_kanji).expect("error");
+ println!("{:#?}", &ex[..10]);
+ }
+ }
+}
+
+fn parse_kanjidic() -> Result<Vec<(i32, i32, String)>> {
+ let file = fs::read_to_string("data/kanjidic2.xml")?;
+ let xml = roxmltree::Document::parse(&file)?;
+ let kanjidic = xml.root().first_child().unwrap();
+ assert!(kanjidic.has_tag_name("kanjidic2"));
+
+ let mut levels = HashMap::new();
+
+ for x in kanjidic.children() {
+ if !x.has_tag_name("character") {
+ continue;
+ }
+ let mut literal = None;
+ let mut jlpt = None;
+ let mut grade = None;
+ for y in x.children() {
+ if y.has_tag_name("literal") {
+ literal = y.text();
+ }
+ if y.has_tag_name("misc") {
+ for z in y.children() {
+ if z.has_tag_name("grade") {
+ grade = z.text().and_then(|x| str::parse::<i32>(x).ok());
+ }
+ if z.has_tag_name("jlpt") {
+ jlpt = z.text().and_then(|x| str::parse::<i32>(x).ok());
+ }
+ }
+ }
+ }
+ if jlpt.is_none() && grade.is_none() {
+ continue;
+ }
+ let level = (jlpt.unwrap_or(0), grade.unwrap_or(0));
+ if let Some(lit) = literal {
+ levels.entry(level).or_insert(String::new()).extend(lit.chars());
+ }
+ }
+
+ let mut levels = levels.into_iter().map(|((j, g), c)| (j, g, c)).collect::<Vec<_>>();
+ levels.sort_by_key(|(j, g, _)| (-*j, *g));
+ Ok(levels)
+}
+
+fn read_kanji_levels() -> Result<Vec<(String, String)>> {
+ Ok(fs::read_to_string("data/kanji_levels.txt")?
+ .lines()
+ .filter_map(|l| l.split_once(": "))
+ .map(|(l, k)| (l.to_string(), k.to_string()))
+ .collect::<Vec<_>>())
+}
+
+fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
+ let file = fs::File::open("data/examples.utf")?;
+
+ let mut ret = Vec::new();
+ let mut a = "".to_string();
+
+ for line in io::BufReader::new(file).lines() {
+ let line = line?;
+ if line.starts_with("A:") {
+ a = line;
+ } else if line.starts_with("B:") {
+ let s = a.strip_prefix("A: ");
+ let t = line.strip_prefix("B: ");
+ if let (Some(a), Some(b)) = (s, t) {
+ if let Some((ja, eng)) = a.split_once("\t") {
+ if let Some((eng, id)) = eng.split_once("#") {
+ ret.push(Example {
+ ja: ja.to_string(),
+ en: eng.to_string(),
+ expl: b.to_string(),
+ id: Some(id.to_string()),
+ chars: Charset::new(ja).inter_chars(all_kanji),
+ });
+ } else {
+ ret.push(Example {
+ ja: ja.to_string(),
+ en: eng.to_string(),
+ expl: b.to_string(),
+ id: None,
+ chars: Charset::new(ja).inter_chars(all_kanji),
+ });
+ }
+ }
+ }
+ }
+ if ret.len() > 100 {
+ break;
+ }
+ }
+
+ Ok(ret)
+}
+
+#[derive(Debug)]
+struct Example {
+ ja: String,
+ en: String,
+ expl: String,
+ id: Option<String>,
+ chars: Charset,
+}
+
/// A set of characters, stored as a sorted vector without duplicates.
///
/// Keeping the vector sorted lets every set operation below run as a single
/// linear merge over the two operands.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Charset(Vec<char>);

impl Charset {
    /// Builds the set of distinct characters occurring in `s`.
    fn new<S: AsRef<str>>(s: S) -> Self {
        let mut chars = s.as_ref().chars().collect::<Vec<_>>();
        chars.sort_unstable();
        chars.dedup();
        Self(chars)
    }

    /// Lazily yields, in ascending order, the characters present in both sets.
    ///
    /// This is the one merge walk shared by `intersects`, `count_inter` and
    /// `inter_chars`; it relies on both vectors being sorted and deduplicated.
    fn common<'a>(&'a self, other: &'a Self) -> impl Iterator<Item = char> + 'a {
        let mut it1 = self.0.iter().copied().peekable();
        let mut it2 = other.0.iter().copied().peekable();
        std::iter::from_fn(move || loop {
            // Either side running out ends the iteration (`?` returns `None`).
            let c1 = *it1.peek()?;
            let c2 = *it2.peek()?;
            match c1.cmp(&c2) {
                Ordering::Equal => {
                    it1.next();
                    it2.next();
                    return Some(c1);
                }
                Ordering::Less => {
                    it1.next();
                }
                Ordering::Greater => {
                    it2.next();
                }
            }
        })
    }

    /// Returns `true` if the two sets share at least one character.
    fn intersects(&self, other: &Self) -> bool {
        self.common(other).next().is_some()
    }

    /// Returns the number of characters the two sets have in common.
    fn count_inter(&self, other: &Self) -> usize {
        self.common(other).count()
    }

    /// Returns the intersection of the two sets as a new `Charset`.
    fn inter_chars(&self, other: &Self) -> Charset {
        // `common` yields ascending, distinct characters, so the invariant holds.
        Self(self.common(other).collect())
    }

    /// Returns the characters of the set as a sorted slice.
    fn chars(&self) -> &[char] {
        &self.0
    }
}
+
#[cfg(test)]
mod test {
    use super::*;

    /// `intersects` and `count_inter` on overlapping and disjoint sets.
    #[test]
    fn test_charset() {
        let c1 = Charset::new("azerty");
        let c2 = Charset::new("uiopqsqdf");
        let c3 = Charset::new("hello, world");

        assert!(!c1.intersects(&c2));
        assert!(c1.intersects(&c3));
        assert!(c2.intersects(&c3));

        assert_eq!(c1.count_inter(&c2), 0);
        assert_eq!(c1.count_inter(&c3), 2);
        assert_eq!(c2.count_inter(&c3), 2);
    }

    /// `new` sorts and removes duplicates.
    #[test]
    fn test_charset_new_sorts_and_dedups() {
        assert_eq!(Charset::new("banana").chars(), &['a', 'b', 'n'][..]);
        assert!(Charset::new("").chars().is_empty());
    }

    /// `inter_chars` returns exactly the shared characters, in sorted order.
    #[test]
    fn test_charset_inter_chars() {
        let c1 = Charset::new("azerty");
        let c2 = Charset::new("uiopqsqdf");
        let c3 = Charset::new("hello, world");

        assert!(c1.inter_chars(&c2).chars().is_empty());
        assert_eq!(c1.inter_chars(&c3).chars(), &['e', 'r'][..]);
        assert_eq!(c2.inter_chars(&c3).chars(), &['d', 'o'][..]);
        // Intersection is symmetric.
        assert_eq!(c3.inter_chars(&c1).chars(), c1.inter_chars(&c3).chars());
    }

    /// Every operation treats the empty set as having nothing in common.
    #[test]
    fn test_charset_empty() {
        let empty = Charset::new("");
        let c1 = Charset::new("azerty");

        assert!(!empty.intersects(&c1));
        assert!(!c1.intersects(&empty));
        assert_eq!(empty.count_inter(&c1), 0);
        assert!(c1.inter_chars(&empty).chars().is_empty());
    }
}