Commit 09c4ae63 authored by szr712's avatar szr712

修改语文数据集,增加中小学课文

parent c2754b41
...@@ -15,14 +15,6 @@ import io ...@@ -15,14 +15,6 @@ import io
def wenzi2pinyin(text):
    """Return the pinyin of *text* as one concatenated string.

    Each item produced by ``lazy_pinyin(text, style=Style.NORMAL)`` is joined
    with no separator.

    Args:
        text: The text to convert; it is passed to ``lazy_pinyin`` unchanged.

    Returns:
        A single ``str`` made of the ``Style.NORMAL`` pinyin items in order.
    """
    # No per-syllable tone list is built here: its result was never used, and
    # deriving it via int(py[-1]) behind str.isdigit() could raise ValueError
    # on characters such as "²" that isdigit() accepts but int() rejects.
    return "".join(lazy_pinyin(text, style=Style.NORMAL))
def yield_tokens(file_path): def yield_tokens(file_path):
with io.open(file_path, encoding = 'utf-8') as f: with io.open(file_path, encoding = 'utf-8') as f:
for line in f: for line in f:
......
...@@ -29,7 +29,7 @@ import itertools ...@@ -29,7 +29,7 @@ import itertools
def wenzi2pinyin(text): def wenzi2pinyin(text):
pinyin_list = lazy_pinyin(text, style=Style.TONE3) pinyin_list = lazy_pinyin(text, style=Style.TONE3)
# print(pinyin_list) # print(pinyin_list)
tones_list = [int(py[-1]) if py[-1].isdigit() tones_list = [int(py[-1]) if py[-1]<="9" and py[-1]>="0"
else 0 for py in pinyin_list] else 0 for py in pinyin_list]
pinyin_list = lazy_pinyin(text, style=Style.NORMAL) pinyin_list = lazy_pinyin(text, style=Style.NORMAL)
......
...@@ -45,6 +45,6 @@ if __name__=="__main__": ...@@ -45,6 +45,6 @@ if __name__=="__main__":
# with open("./data/voc/yunmu.txt","r",encoding="utf-8") as f: # with open("./data/voc/yunmu.txt","r",encoding="utf-8") as f:
# yunmus=f.readlines() # yunmus=f.readlines()
# yunmus=[a.strip() for a in yunmus] # yunmus=[a.strip() for a in yunmus]
convert_pinyin("dev_hanzi.txt","./data/Chinese/dev","./data/Chinese/dev","dev_pinyin_split.txt") # convert_pinyin("dev_hanzi.txt","./data/Chinese/dev","./data/Chinese/dev","dev_pinyin_split.txt")
# for file in os.listdir(hanzi_dir): for file in os.listdir(hanzi_dir):
# convert_pinyin(file,hanzi_dir,pinyin_dir,file) convert_pinyin(file,hanzi_dir,pinyin_dir,file)
\ No newline at end of file \ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment