indiejoseph committed
Commit 2cabcd4
0 Parent(s)

first commit

Files changed (5)
  1. app.py +31 -0
  2. flagged/log.csv +2 -0
  3. requirements.txt +4 -0
  4. translation_pipeline.py +162 -0
  5. translator.py +286 -0
app.py ADDED
@@ -0,0 +1,31 @@
+ import gradio as gr
+ from transformers import BertTokenizerFast
+ from optimum.onnxruntime import ORTModelForSeq2SeqLM
+ from translation_pipeline import TranslationPipeline
+ from translator import Translator
+
+ model_id = "indiejoseph/bart-translation-zh-yue-onnx"
+ tokenizer = BertTokenizerFast.from_pretrained(model_id)
+ model = ORTModelForSeq2SeqLM.from_pretrained(model_id, use_cache=False)
+
+ pipe = TranslationPipeline(model=model, tokenizer=tokenizer)
+ translator = Translator(pipe, batch_size=2, max_length=1024)
+
+
+ def translate(zh: str):
+     return translator([zh])[0]
+
+
+ demo = gr.Interface(
+     fn=translate,
+     inputs=[
+         gr.Textbox(label="官話", type="text"),
+     ],
+     outputs=[
+         gr.Textbox(label="廣東話", type="text"),
+     ],
+     examples=[["瞧瞧你说的是人话吗?"], ["余文乐关掉潮店被嘲临走还坑人"]],
+ )
+
+ if __name__ == "__main__":
+     demo.launch(show_api=False)
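
Editor's note: a quick way to sanity-check the new Space without serving the UI is to import the translator object that app.py builds (model download permitting) and run the interface examples through it directly; a minimal sketch, assuming the Hub model is reachable:

    from app import translator

    examples = ["瞧瞧你说的是人话吗?", "余文乐关掉潮店被嘲临走还坑人"]
    for src, tgt in zip(examples, translator(examples)):
        print(src, "->", tgt)

Importing app only constructs the interface; demo.launch() stays behind the __main__ guard, so nothing is served.
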
flagged/log.csv ADDED
@@ -0,0 +1,2 @@
+ 官話,廣東話,flag,username,timestamp
+ 瞧瞧你说的是人话吗?,睇吓你講緊人話咩?,,,2023-12-13 22:31:14.104913
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ gradio
+ datasets
+ transformers
+ optimum[onnxruntime]
translation_pipeline.py ADDED
@@ -0,0 +1,162 @@
+ from transformers import TranslationPipeline
+ from transformers.pipelines.text2text_generation import ReturnType
+ from transformers import BartForConditionalGeneration, BertTokenizer
+ import re
+
+
+ # Simplified-Chinese character variants to block at generation time (they get a
+ # -inf sequence bias below) so the decoder sticks to Traditional/Cantonese forms.
+ hans_chars = set(
+ "万与丑专业丛东丝丢两严丧个丬丰临为丽举么义乌乐乔习乡书买乱争于亏云亘亚产亩亲亵亸亿仅从仑仓仪们价众优伙会伛伞伟传伤伥伦伧伪伫体余佣佥侠侣侥侦侧侨侩侪侬俣俦俨俩俪俭债倾偬偻偾偿傥傧储傩儿兑兖党兰关兴兹养兽冁内冈册写军农冢冯冲决况冻净凄凉凌减凑凛几凤凫凭凯击凼凿刍划刘则刚创删别刬刭刽刿剀剂剐剑剥剧劝办务劢动励劲劳势勋勐勚匀匦匮区医华协单卖卢卤卧卫却卺厂厅历厉压厌厍厕厢厣厦厨厩厮县参叆叇双发变叙叠叶号叹叽吁后吓吕吗吣吨听启吴呒呓呕呖呗员呙呛呜咏咔咙咛咝咤咴咸哌响哑哒哓哔哕哗哙哜哝哟唛唝唠唡唢唣唤唿啧啬啭啮啰啴啸喷喽喾嗫呵嗳嘘嘤嘱噜噼嚣嚯团园囱围囵国图圆圣圹场坂坏块坚坛坜坝坞坟坠垄垅垆垒垦垧垩垫垭垯垱垲垴埘埙埚埝埯堑堕塆墙壮声壳壶壸处备复够头夸夹夺奁奂奋奖奥妆妇妈妩妪妫姗姜娄娅娆娇娈娱娲娴婳婴婵婶媪嫒嫔嫱嬷孙学孪宁宝实宠审宪宫宽宾寝对寻导寿将尔尘尧尴尸尽层屃屉届属屡屦屿岁岂岖岗岘岙岚岛岭岳岽岿峃峄峡峣峤峥峦崂崃崄崭嵘嵚嵛嵝嵴巅巩巯币帅师帏帐帘帜带帧帮帱帻帼幂幞干并广庄庆庐庑库应庙庞废庼廪开异弃张弥弪弯弹强归当录彟彦彻径徕御忆忏忧忾怀态怂怃怄怅怆怜总怼怿恋恳恶恸恹恺恻恼恽悦悫悬悭悯惊惧惨惩惫惬惭惮惯愍愠愤愦愿慑慭憷懑懒懔戆戋戏戗战戬户扎扑扦执扩扪扫扬扰抚抛抟抠抡抢护报担拟拢拣拥拦拧拨择挂挚挛挜挝挞挟挠挡挢挣挤挥挦捞损捡换捣据捻掳掴掷掸掺掼揸揽揿搀搁搂搅携摄摅摆摇摈摊撄撑撵撷撸撺擞攒敌敛数斋斓斗斩断无旧时旷旸昙昼昽显晋晒晓晔晕晖暂暧札术朴机杀杂权条来杨杩杰极构枞枢枣枥枧枨枪枫枭柜柠柽栀栅标栈栉栊栋栌栎栏树栖样栾桊桠桡桢档桤桥桦桧桨桩梦梼梾检棂椁椟椠椤椭楼榄榇榈榉槚槛槟槠横樯樱橥橱橹橼檐檩欢欤欧歼殁殇残殒殓殚殡殴毁毂毕毙毡毵氇气氢氩氲汇汉污汤汹沓沟没沣沤沥沦沧沨沩沪沵泞泪泶泷泸泺泻泼泽泾洁洒洼浃浅浆浇浈浉浊测浍济浏浐浑浒浓浔浕涂涌涛涝涞涟涠涡涢涣涤润涧涨涩淀渊渌渍渎渐渑渔渖渗温游湾湿溃溅溆溇滗滚滞滟滠满滢滤滥滦滨滩滪漤潆潇潋潍潜潴澜濑濒灏灭灯灵灾灿炀炉炖炜炝点炼炽烁烂烃烛烟烦烧烨烩烫烬热焕焖焘煅煳熘爱爷牍牦牵牺犊犟状犷犸犹狈狍狝狞独狭狮狯狰狱狲猃猎猕猡猪猫猬献獭玑玙玚玛玮环现玱玺珉珏珐珑珰珲琎琏琐琼瑶瑷璇璎瓒瓮瓯电画畅畲畴疖疗疟疠疡疬疮疯疱疴痈痉痒痖痨痪痫痴瘅瘆瘗瘘瘪瘫瘾瘿癞癣癫癯皑皱皲盏盐监盖盗盘眍眦眬着睁睐睑瞒瞩矫矶矾矿砀码砖砗砚砜砺砻砾础硁硅硕硖硗硙硚确硷碍碛碜碱碹磙礼祎祢祯祷祸禀禄禅离秃秆种积称秽秾稆税稣稳穑穷窃窍窑窜窝窥窦窭竖竞笃笋笔笕笺笼笾筑筚筛筜筝筹签简箓箦箧箨箩箪箫篑篓篮篱簖籁籴类籼粜粝粤粪粮糁糇紧絷纟纠纡红纣纤纥约级纨纩纪纫纬纭纮纯纰纱纲纳纴纵纶纷纸纹纺纻纼纽纾线绀绁绂练组绅细织终绉绊绋绌绍绎经绐绑绒结绔绕绖绗绘给绚绛络绝绞统绠绡绢绣绤绥绦继绨绩绪绫绬续绮绯绰绱绲绳维绵绶绷绸绹绺绻综绽绾绿缀缁缂缃缄缅缆缇缈缉缊缋缌缍缎缏缐缑缒缓缔缕编缗缘缙缚缛缜缝缞缟缠缡缢缣缤缥缦缧缨缩缪缫缬缭缮缯缰缱缲缳缴缵罂网罗罚罢罴羁羟羡翘翙翚耢耧耸耻聂聋职聍联聩聪肃肠肤肷肾肿胀胁胆胜胧胨胪胫胶脉脍脏脐脑脓脔脚脱脶脸腊腌腘腭腻腼腽腾膑臜舆舣舰舱舻艰艳艹艺节芈芗芜芦苁苇苈苋苌苍苎苏苘苹茎茏茑茔茕茧荆荐荙荚荛荜荞荟荠荡荣荤荥荦荧荨荩荪荫荬荭荮药莅莜莱莲莳莴莶获莸莹莺莼萚萝萤营萦萧萨葱蒇蒉蒋蒌蓝蓟蓠蓣蓥蓦蔷蔹蔺蔼蕲蕴薮藁藓虏虑虚虫虬虮虽虾虿蚀蚁蚂蚕蚝蚬蛊蛎蛏蛮蛰蛱蛲蛳蛴蜕蜗蜡蝇蝈蝉蝎蝼蝾螀螨蟏衅衔补衬衮袄袅袆袜袭袯装裆裈裢裣裤裥褛褴襁襕见观觃规觅视觇览觉觊觋觌觍觎觏觐觑觞触觯詟誉誊讠计订讣认讥讦讧讨让讪讫训议讯记讱讲讳讴讵讶讷许讹论讻讼讽设访诀证诂诃评诅识诇诈诉诊诋诌词诎诏诐译诒诓诔试诖诗诘诙诚诛诜话诞诟诠诡询诣诤该详诧诨诩诪诫诬语诮误诰诱诲诳说诵诶请诸诹诺读诼诽课诿谀谁谂调谄谅谆谇谈谊谋谌谍谎谏谐谑谒谓谔谕谖谗谘谙谚谛谜谝谞谟谠谡谢谣谤谥谦谧谨谩谪谫谬谭谮谯谰谱谲谳谴谵谶谷豮贝贞负贠贡财责贤败账货质贩贪贫贬购贮贯贰贱贲贳贴贵贶贷贸费贺贻贼贽贾贿赀赁赂赃资赅赆赇赈赉赊赋赌赍赎赏赐赑赒赓赔赕赖赗赘赙赚赛赜赝赞赟赠赡赢赣赪赵赶趋趱趸跃跄跖跞践跶跷跸跹跻踊踌踪踬踯蹑蹒蹰蹿躏躜躯车轧轨轩轪轫转轭轮软轰轱轲轳轴轵轶轷轸轹轺轻轼载轾轿辀辁辂较辄辅辆辇辈辉辊辋辌辍辎辏辐辑辒输辔辕辖辗辘辙辚辞辩辫边辽达迁过迈运还这进远违连迟迩迳迹适选逊递逦逻遗遥邓邝邬邮邹邺邻郁郄郏郐郑郓郦郧郸酝酦酱酽酾酿释里鉅鉴銮錾钆钇针钉钊钋钌钍钎钏钐钑钒钓钔钕钖钗钘钙钚钛钝钞钟钠钡钢钣钤钥钦钧钨钩钪钫钬钭钮钯钰钱钲钳钴钵钶钷钸钹钺钻钼钽钾钿铀铁铂铃铄铅铆铈铉铊铋铍铎铏铐铑铒铕铗铘铙铚铛铜铝铞铟铠铡铢铣铤铥铦铧铨铪铫铬铭铮铯铰铱铲铳铴铵银铷铸铹铺铻铼铽链铿销锁锂锃锄锅锆锇锈锉锊锋锌锍锎锏锐锑锒锓锔锕锖锗错锚锜锞锟锠锡锢锣锤锥锦锨锩锫锬锭键锯锰锱锲锳锴锵锶锷锸锹锺锻锼锽锾锿镀镁镂镃镆镇镈镉镊镌镍镎镏镐镑镒镕镖镗镙镚镛镜镝镞镟镠镡镢镣镤镥镦镧镨镩镪镫镬镭镮镯镰镱镲镳镴镶长门闩闪闫闬闭问闯闰闱闲闳间闵闶闷闸闹闺闻闼闽闾闿阀阁阂阃阄阅阆阇阈阉阊阋阌阍阎阏阐阑阒阓阔阕阖阗阘阙阚阛队阳阴阵阶际陆陇陈陉陕陧陨险随隐隶隽难雏雠雳雾霁霉霭靓静靥鞑鞒鞯鞴韦韧韨韩韪韫韬韵页顶顷顸项顺须顼顽顾顿颀颁颂颃预颅领颇颈颉颊颋颌颍颎颏颐频颒颓颔颕颖颗题颙颚颛颜额颞颟颠颡颢颣颤颥颦颧风飏飐飑飒飓飔飕飖飗飘飙飚飞飨餍饤饥饦饧饨饩饪饫饬饭饮饯饰饱饲饳饴饵饶饷饸饹饺饻饼饽饾饿馀馁馂馃馄馅馆馇馈馉馊馋馌馍馎馏馐馑馒馓馔馕马驭驮驯驰驱驲驳驴驵驶驷驸驹驺驻驼驽驾驿骀骁骂骃骄骅骆骇骈骉骊骋验骍骎骏骐骑骒骓骔骕骖骗骘骙骚骛骜骝骞骟骠骡骢骣骤骥骦骧髅髋髌鬓魇魉鱼鱽鱾鱿鲀鲁鲂鲄鲅鲆鲇鲈鲉鲊鲋鲌鲍鲎鲏鲐鲑鲒鲓鲔鲕鲖鲗鲘鲙鲚鲛鲜鲝鲞鲟鲠鲡鲢鲣鲤鲥鲦鲧鲨鲩鲪鲫鲬鲭鲮鲯鲰鲱鲲鲳鲴鲵鲶鲷鲸鲹鲺鲻鲼鲽鲾鲿鳀鳁鳂鳃鳄鳅鳆鳇鳈鳉鳊鳋鳌鳍鳎鳏鳐鳑鳒鳓鳔鳕鳖鳗鳘鳙鳛鳜鳝鳞鳟鳠鳡鳢鳣鸟鸠鸡鸢鸣鸤鸥鸦鸧鸨鸩鸪鸫鸬鸭鸮鸯鸰鸱鸲鸳鸴鸵鸶鸷鸸鸹鸺鸻鸼鸽鸾鸿鹀鹁鹂鹃鹄鹅鹆鹇鹈鹉鹊鹋鹌鹍鹎鹏鹐鹑鹒鹓鹔鹕鹖鹗鹘鹚鹛鹜鹝鹞鹟鹠鹡鹢鹣鹤鹥鹦鹧鹨鹩鹪鹫鹬鹭鹯鹰鹱鹲鹳鹴鹾麦麸黄黉黡黩黪黾鼋鼌鼍鼗鼹齄齐齑齿龀龁龂龃龄龅龆龇龈龉龊龋龌龙龚龛龟志制咨只里系范松没尝尝闹面准钟别闲干尽脏拼"
+ )
+ # Cantonese words/usages that need these characters, so they are removed from
+ # the blocked set again: 茶几, 杰撻撻, 岳高頭, 划, 扑, 霉, 撑, 余 (surname),
+ # 朴 (surname), 疱疹, 涂 (surname), 海璇, 卧底
+ hans_chars = hans_chars - set("晒吓咔制斗响面污准揸志游系拼扎捻谷佣着檐伙只御里姜干后凌涌松噼呵札云咸丑几杰郁岳划扑霉撑痴鉅硅余朴疱涂璇卧")
+
+
+ def fix_chinese_text_generation_space(text):
+     output_text = text
+     # drop the space the decoder inserts between a CJK character / CJK punctuation
+     # mark and a neighbouring non-alphanumeric character
+     output_text = re.sub(
+         r'([\u3401-\u9FFF+——!,。?、~@#¥%…&*():;《)《》“”()»〔〕\-!$^*()_+|~=`{}\[\]:";\'<>?,.\/\\])\s([^0-9a-zA-Z])',
+         r"\1\2",
+         output_text,
+     )
+     output_text = re.sub(
+         r'([^0-9a-zA-Z])\s([\u3401-\u9FFF+——!,。?、~@#¥%…&*():;《)《》“”()»〔〕\-!$^*()_+|~=`{}\[\]:";\'<>?,.\/\\])',
+         r"\1\2",
+         output_text,
+     )
+     # drop the space between a CJK character / CJK punctuation mark and an
+     # adjacent ASCII alphanumeric character
+     output_text = re.sub(
+         r'([\u3401-\u9FFF+——!,。?、~@#¥%…&*():;《)《》“”()»〔〕\-!$^*()_+|~=`{}\[\]:";\'<>?,.\/\\])\s([a-zA-Z0-9])',
+         r"\1\2",
+         output_text,
+     )
+     output_text = re.sub(
+         r'([a-zA-Z0-9])\s([\u3401-\u9FFF+——!,。?、~@#¥%…&*():;《)《》“”()»〔〕\-!$^*()_+|~=`{}\[\]:";\'<>?,.\/\\])',
+         r"\1\2",
+         output_text,
+     )
+     output_text = re.sub(r"$\s([0-9])", r"$\1", output_text)
+     output_text = re.sub(",", ",", output_text)
+     output_text = re.sub(
+         r"([0-9]),([0-9])", r"\1,\2", output_text
+     )  # fix comma in numbers
+     # fix multiple commas
+     output_text = re.sub(r"\s?[,]+\s?", ",", output_text)
+     output_text = re.sub(r"\s?[、]+\s?", "、", output_text)
+     # fix period
+     output_text = re.sub(r"\s?[。]+\s?", "。", output_text)
+     # fix ...
+     output_text = re.sub(r"\s?\.{3,}\s?", "...", output_text)
+     # fix exclamation mark
+     output_text = re.sub(r"\s?[!!]+\s?", "!", output_text)
+     # fix question mark
+     output_text = re.sub(r"\s?[??]+\s?", "?", output_text)
+     # fix colon
+     output_text = re.sub(r"\s?[::]+\s?", ":", output_text)
+     # fix quotation mark
+     output_text = re.sub(r'\s?(["“”\']+)\s?', r"\1", output_text)
+     # fix semicolon
+     output_text = re.sub(r"\s?[;;]+\s?", ";", output_text)
+     # fix dots
+     output_text = re.sub(r"\s?([~●.…]+)\s?", r"\1", output_text)
+     output_text = re.sub(r"\s?\[…\]\s?", "", output_text)
+     output_text = re.sub(r"\s?\[\.\.\.\]\s?", "", output_text)
+     output_text = re.sub(r"\s?\.{3,}\s?", "...", output_text)
+     # fix slash
+     output_text = re.sub(r"\s?[//]+\s?", "/", output_text)
+     # fix dollar sign
+     output_text = re.sub(r"\s?[$$]+\s?", "$", output_text)
+     # fix @
+     output_text = re.sub(r"\s?([@@]+)\s?", "@", output_text)
+     # fix brackets
+     output_text = re.sub(r"\s?([\[\(<〖【「『()』」】〗>\)\]]+)\s?", r"\1", output_text)
+
+     return output_text
+
+
+ class TranslationPipeline(TranslationPipeline):
+     def __init__(
+         self,
+         model,
+         tokenizer,
+         device=None,
+         max_length=512,
+         src_lang=None,
+         tgt_lang=None,
+         num_beams=2,
+         do_sample=True,
+         repetition_penalty=1.5,
+         length_penalty=1.0,
+         sequence_bias=None,
+     ):
+         self.model = model
+         self.tokenizer = tokenizer
+
+         def get_tokens_as_tuple(word):
+             return tuple(self.tokenizer([word], add_special_tokens=False).input_ids[0])
+
+         # unless the caller supplies a bias, map every blocked simplified
+         # character to a -inf sequence bias so it cannot be generated
+         sequence_bias = (
+             dict(
+                 zip(
+                     [get_tokens_as_tuple(char) for char in hans_chars],
+                     [float("-inf")] * len(hans_chars),
+                 )
+             )
+             if sequence_bias is None
+             else sequence_bias
+         )
+
+         super().__init__(
+             self.model,
+             self.tokenizer,
+             device=device,
+             max_length=max_length,
+             src_lang=src_lang,
+             tgt_lang=tgt_lang,
+             num_beams=num_beams,
+             do_sample=do_sample,
+             repetition_penalty=repetition_penalty,
+             length_penalty=length_penalty,
+             sequence_bias=sequence_bias,
+         )
+
+     def _load_tokenizer(self):
+         return BertTokenizer.from_pretrained(self.model_name_or_path)
+
+     def _load_model(self):
+         return BartForConditionalGeneration.from_pretrained(self.model_name_or_path)
+
+     def postprocess(
+         self,
+         model_outputs,
+         return_type=ReturnType.TEXT,
+         clean_up_tokenization_spaces=True,
+     ):
+         records = super().postprocess(
+             model_outputs,
+             return_type=return_type,
+             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+         )
+         for rec in records:
+             translation_text = fix_chinese_text_generation_space(
+                 rec["translation_text"].strip()
+             )
+
+             rec["translation_text"] = translation_text
+         return records
+
+
+ if __name__ == "__main__":
+     from transformers import BertTokenizerFast
+     from optimum.onnxruntime import ORTModelForSeq2SeqLM
+
+     model_id = "indiejoseph/bart-translation-zh-yue-onnx"
+
+     tokenizer = BertTokenizerFast.from_pretrained(model_id)
+     model = ORTModelForSeq2SeqLM.from_pretrained(model_id, use_cache=False)
+     pipe = TranslationPipeline(model=model, tokenizer=tokenizer)
+
+     print(
+         pipe(
+             "近年成为许多港人热门移居地的英国中部城巿诺定咸(又译诺丁汉,Nottingham),多年来一直面对财政困境,市议会周三(11月29日)宣布破产,是继英国第二大城市伯明翰今年9月宣布破产后,近期「爆煲」的另一个英国主要城市。诺定咸除了维持法例规定必须提供的服务外,巿政府将暂停所有非必要的公共开支。",
+             max_length=300,
+         )
+     )
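
Editor's note: the space-fixing helper above can be tried on its own, without downloading any model weights, to see what the post-processing step does; a minimal sketch with an illustrative spaced-out string:

    from translation_pipeline import fix_chinese_text_generation_space

    raw = "诺定咸 ( 又译 诺丁汉 , Nottingham ) 宣布 破产 。"
    print(fix_chinese_text_generation_space(raw))
    # spaces between CJK characters, around punctuation and next to the Latin word
    # should collapse, roughly: 诺定咸(又译诺丁汉,Nottingham)宣布破产。

Importing the module still needs transformers from requirements.txt, but nothing is fetched from the Hub.
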
translator.py ADDED
@@ -0,0 +1,286 @@
+ from collections.abc import Callable
+ from typing import List, Tuple, Union
+ from datasets import Dataset
+ import re
+ import pickle
+ import unicodedata
+ import os
+ from transformers.pipelines.pt_utils import KeyDataset
+
+
+ class Translator:
+     def __init__(
+         self,
+         pipe: Callable,
+         max_length: int = 500,
+         batch_size: int = 16,
+         save_every_step=100,
+         text_key="text",
+         save_filename=None,
+     ):
+         self.pipe = pipe
+         # prefer the model's configured max_length when the pipe exposes one;
+         # plain callables (e.g. the fake pipe below) fall back to the argument
+         self.max_length = (
+             pipe.model.config.max_length
+             if hasattr(pipe, "model") and hasattr(pipe.model.config, "max_length")
+             else max_length
+         )
+         self.batch_size = batch_size
+         self.save_every_step = save_every_step
+         self.save_filename = save_filename
+         self.text_key = text_key
+
+     def _is_chinese(self, text: str) -> bool:
+         return (
+             re.search(
+                 r"[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002ebef\U00030000-\U000323af\ufa0e\ufa0f\ufa11\ufa13\ufa14\ufa1f\ufa21\ufa23\ufa24\ufa27\ufa28\ufa29\u3006\u3007][\ufe00-\ufe0f\U000e0100-\U000e01ef]?",
+                 text,
+             )
+             is not None
+         )
+
+     def _split_sentences(self, text: str) -> List[str]:
+         if len(text) <= self.max_length:
+             return [text]
+
+         delimiter = set()
+         delimiter.update("。!?;…!?")
+         sent_list = []
+         sent = text
+
+         while len(sent) > self.max_length:
+             # find the index of the last delimiter at or before max_length
+             for i in range(self.max_length, 0, -1):
+                 if sent[i] in delimiter:
+                     sent_list.append(sent[0 : i + 1])
+                     sent = sent[i + 1 :]
+                     break
+
+         if len(sent) > 0:
+             sent_list.append(sent)
+
+         return sent_list
+
+     def _preprocess(self, text: str) -> Tuple[List[str], str]:
+         lines = text.split("\n")
+         sentences = []
+         # escape literal braces so the template can later be filled with str.format()
+         template = text.replace("{", "{{").replace("}", "}}")
+         chunk_index = 0
+
+         for line in lines:
+             sentence = line.strip()
+             if len(sentence) > 0 and self._is_chinese(sentence):
+                 chunks = self._split_sentences(sentence)
+
+                 for chunk in chunks:
+                     sentences.append(chunk)
+                     chunk = chunk.replace("{", "{{").replace("}", "}}")
+                     template = template.replace(chunk, "{%d}" % chunk_index, 1)
+                     chunk_index += 1
+
+         return sentences, template
+
+     def _postprocess(
+         self, template: str, src_sentences: List[str], translations: List[str]
+     ) -> str:
+         # when a translated sentence only changed the width/casing of its
+         # alphanumeric spans (English words, digits, punctuation), copy those
+         # spans back from the source sentence, then refill the template
+         processed = []
+         alphanumeric_regex = re.compile(
+             "([a-zA-Za-zA-Z0-9\d+'\",,(\()\)::;;“”。\.\??\!!‘’]+)"
+         )
+
+         def hash_text(text: List[str]) -> str:
+             text = "|".join(text)
+             puncts_map = str.maketrans(",;:()。?!“”‘’", ",;:().?!\"\"''")
+             text = text.translate(puncts_map)
+             return unicodedata.normalize("NFKC", text).lower()
+
+         for i, p in enumerate(translations):
+             src_sentence = src_sentences[i]
+             # p = re.sub(',', ',', p) # replace all commas
+             # p = re.sub(';', ';', p) # replace semi-colon
+             # p = re.sub(':', ':', p) # replace colon
+             # p = re.sub('\(', '(', p) # replace round basket
+             # p = re.sub('\)', ')', p) # replace round basket
+             # p = re.sub(r'([\d]),([\d])', r'\1,\2', p)
+
+             src_matches = re.findall(alphanumeric_regex, src_sentence)
+             translated_matches = re.findall(alphanumeric_regex, p)
+
+             # lengths do not match, or nothing matched at all
+             if (
+                 len(src_matches) != len(translated_matches)
+                 or len(src_matches) == 0
+                 or len(translated_matches) == 0
+             ):
+                 processed.append(p)
+                 continue
+
+             # normalize full-width to half-width and lower case
+             src_hashes = hash_text(src_matches)
+             translated_hashes = hash_text(translated_matches)
+
+             if src_hashes != translated_hashes:
+                 processed.append(p)
+                 continue
+
+             # replace all matches
+             for j in range(len(src_matches)):
+                 p = p.replace(translated_matches[j], src_matches[j], 1)
+
+             processed.append(p)
+
+         output = template.format(*processed)
+
+         return output
+
+     def _save(self, translations):
+         with open("{}.pkl".format(self.save_filename), "wb") as f:
+             pickle.dump(translations, f)
+
+     def __call__(self, inputs: Union[List[str], Dataset]) -> List[str]:
+         templates = []
+         sentences = []
+         sentence_indices = []
+         outputs = []
+
+         if isinstance(inputs, Dataset):
+             ds = inputs
+         else:
+             if isinstance(inputs, str):
+                 inputs = [inputs]
+             ds = Dataset.from_list([{"text": text} for text in inputs])
+
+         for i, text_input in enumerate(ds):
+             chunks, template = self._preprocess(text_input["text"])
+             templates.append(template)
+             sentence_indices.append([])
+
+             for chunk in chunks:
+                 sentences.append(chunk)
+                 sentence_indices[len(sentence_indices) - 1].append(len(sentences) - 1)
+
+         resume_from_file = (
+             "{}.pkl".format(self.save_filename)
+             if os.path.isfile("{}.pkl".format(self.save_filename))
+             else None
+         )
+         translations = (
+             []
+             if resume_from_file is None
+             else pickle.load(open(resume_from_file, "rb"))
+         )
+
+         print("translations:", len(translations))
+         print("dataset:", len(ds))
+
+         if resume_from_file is not None:
+             print(
+                 "Resuming from {} ({} records)".format(
+                     resume_from_file, len(translations)
+                 )
+             )
+
+         ds = Dataset.from_list(
+             [{"text": text} for text in sentences[len(translations) :]]
+         )
+         total_records = len(ds)
+
+         if total_records > 0:
+             step = 0
+             for out in self.pipe(
+                 KeyDataset(ds, self.text_key), batch_size=self.batch_size
+             ):
+                 translations.append(out[0])
+
+                 # export generated results every n steps
+                 if (
+                     step != 0
+                     and self.save_filename is not None
+                     and step % self.save_every_step == 0
+                 ):
+                     self._save(translations)
+
+                 step += 1
+
+         if self.save_filename is not None and total_records > 0:
+             self._save(translations)
+
+         for i, template in enumerate(templates):
+             try:
+                 src_sentences = [sentences[index] for index in sentence_indices[i]]
+                 translated_sentences = [
+                     translations[index]["translation_text"]
+                     for index in sentence_indices[i]
+                 ]
+                 output = self._postprocess(
+                     template, src_sentences, translated_sentences
+                 )
+                 outputs.append(output)
+             except Exception as error:
+                 print(error)
+                 print(template)
+                 # print(template, sentence_indices[i], len(translations))
+
+         return outputs
+
+
+ def fake_pipe(text: List[str], batch_size: int):
+     # stand-in pipeline for the self-test below: it echoes each input back,
+     # with a few deliberate mutations that exercise Translator._postprocess
+     for i in range(len(text)):
+         if "Acetaminophen" in text[i]:
+             # test case error: corrupt the casing of an English word
+             yield [
+                 {"translation_text": text[i].replace("Acetaminophen", "ACEtaminophen")}
+             ]
+             continue
+         if "123" in text[i]:
+             yield [{"translation_text": text[i].replace("123", "123")}]
+             continue
+         if "abc" in text[i]:
+             yield [{"translation_text": text[i].replace("abc", "ABC")}]
+             continue
+         yield [{"translation_text": text[i]}]
+
+
+ if __name__ == "__main__":
+     # self-test with the fake pipe: after post-processing, the reassembled
+     # outputs must be identical to the inputs
+     translator = Translator(fake_pipe, max_length=60)
+
+     text1 = "对于编写聊天机器人的脚本,你可以采用不同的方法,包括使用基于规则的系统、自然语言处理(NLP)技术和机器学习模型。下面是一个简单的例子,展示如何使用基于规则的方法来构建一个简单的聊天机器人:"
+     text2 = """对于编写聊天机器人的脚本,你可以采用不同的方法,包括使用基于规则的系统、自然语言处理(NLP)技术和机器学习模型。下面是一个简单的例子,展示如何使用基于规则的方法来构建一个简单的聊天机器人:
+
+ ```
+ # 设置用于匹配输入的关键字,并定义相应的回答数据字典。
+ keywords = {'你好': '你好!很高兴见到你。',
+             '再见': '再见!有机会再聊。',
+             '你叫什么': '我是一个聊天机器人。',
+             '你是谁': '我是一个基于人工智能技术制作的聊天机器人。'}
+
+ # 定义用于处理用户输入的函数。
+ def chatbot(input_text):
+     # 遍历关键字数据字典,匹配用户的输入。
+     for key in keywords:
+         if key in input_text:
+             # 如果匹配到了关键字,返回相应的回答。
+             return keywords[key]
+     # 如果没有找到匹配的关键字,返回默认回答。
+     return "对不起,我不知道你在说什么。"
+
+ # 运行聊天机器人。
+ while True:
+     # 获取用户输入。
+     user_input = input('用户: ')
+     # 如果用户输入“再见”,退出程序。
+     if user_input == '再见':
+         break
+     # 处理用户输入,并打印回答。
+     print('机器人: ' + chatbot(user_input))
+ ```
+
+ 这是一个非常简单的例子。对于实用的聊天机器人,可能需要使用更复杂的 NLP 技术和机器学习模型,以更好地理解和回答用户的问题。"""
+     text3 = "布洛芬(Ibuprofen)同撲熱息痛(Acetaminophen)係兩種常見嘅非處方藥,用於緩解疼痛、發燒同關節痛。"
+     text4 = "123 abc def's"
+     outputs = translator([text1, text2, text3])
+
+     # print('Output: ', outputs[0], '\nInput: ', text1)
+
+     text2_lines = text2.split("\n")
+     for i, text in enumerate(outputs[1].split("\n")):
+         # find the first line that differs
+         if text != text2_lines[i]:
+             print("Output: ", text, "\nInput: ", text2_lines[i])
+             break
+
+     assert outputs[0] == text1
+     assert outputs[1] == text2
+     assert outputs[2] == text3
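
Editor's sketch (not part of the commit): Translator also accepts a datasets.Dataset and can checkpoint long runs through save_filename/save_every_step, which the self-test above does not exercise. A minimal sketch, assuming the Hub model is reachable; "checkpoint" is a hypothetical filename and the two sentences are only illustrative:

    from datasets import Dataset
    from transformers import BertTokenizerFast
    from optimum.onnxruntime import ORTModelForSeq2SeqLM
    from translation_pipeline import TranslationPipeline
    from translator import Translator

    model_id = "indiejoseph/bart-translation-zh-yue-onnx"
    tokenizer = BertTokenizerFast.from_pretrained(model_id)
    model = ORTModelForSeq2SeqLM.from_pretrained(model_id, use_cache=False)
    pipe = TranslationPipeline(model=model, tokenizer=tokenizer)

    # the "text" column matches Translator's default text_key
    ds = Dataset.from_list(
        [
            {"text": "他们明天去诺丁汉开会。"},
            {"text": "这个项目下周交付,大家辛苦了。"},
        ]
    )

    translator = Translator(pipe, batch_size=2, save_every_step=50, save_filename="checkpoint")
    for line in translator(ds):
        print(line)

Partial results are pickled to checkpoint.pkl as translation progresses; re-running the same call with the same inputs resumes from that file instead of starting over.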