From eacef5925d340534951337b7c3cd5d40075a356a Mon Sep 17 00:00:00 2001 From: shibing624 Date: Sat, 3 Feb 2024 20:57:28 +0800 Subject: [PATCH] update confusion --- examples/kenlm/my_custom_confusion.txt | 2 +- examples/kenlm/use_custom_confusion.py | 2 +- examples/macbert/model_correction_pipeline_demo.py | 7 ++++--- pycorrector/confusion_corrector.py | 2 +- requirements-dev.txt | 3 ++- requirements.txt | 3 +-- 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/examples/kenlm/my_custom_confusion.txt b/examples/kenlm/my_custom_confusion.txt index cf6188e3..b73939ea 100644 --- a/examples/kenlm/my_custom_confusion.txt +++ b/examples/kenlm/my_custom_confusion.txt @@ -11,4 +11,4 @@ iphonex iphoneX happt happen shylock shylock 份额 份额 -天俺门 天安门 \ No newline at end of file +天氨门 天安门 \ No newline at end of file diff --git a/examples/kenlm/use_custom_confusion.py b/examples/kenlm/use_custom_confusion.py index 29ce492d..61b05a2e 100644 --- a/examples/kenlm/use_custom_confusion.py +++ b/examples/kenlm/use_custom_confusion.py @@ -19,7 +19,7 @@ '上述承诺内容系本人真实意思表示', # 正常 '大家一哄而伞怎么回事', # 成语 '交通银行的份额没有减少', # 误杀 - '我爱北京天俺门', # 漏召回 + '北京天氨门,我爱北京天氨门', # 漏召回 ] m = Corrector() print(m.correct_batch(error_sentences)) diff --git a/examples/macbert/model_correction_pipeline_demo.py b/examples/macbert/model_correction_pipeline_demo.py index dde4c4c6..ab981a27 100644 --- a/examples/macbert/model_correction_pipeline_demo.py +++ b/examples/macbert/model_correction_pipeline_demo.py @@ -6,8 +6,7 @@ import sys sys.path.append("../..") -from pycorrector import MacBertCorrector -from pycorrector import ConfusionCorrector +from pycorrector import MacBertCorrector, ConfusionCorrector if __name__ == '__main__': error_sentences = [ @@ -30,11 +29,13 @@ '因为爸爸在看录音机,所以我没得看', '不过在许多传统国家,女人向未得到平等', '我想喝小明同学。', # 漏召回 + '北京天氨门,我爱北京天氨门', # 漏召回 ] model1 = MacBertCorrector() # add confusion corrector for post process - confusion_dict = {"喝小明同学": "喝小茗同学", "老人让坐": "老人让座", "平净": "平静", "分知": "分支"} + confusion_dict = {"喝小明同学": "喝小茗同学", "老人让坐": "老人让座", "平净": "平静", "分知": "分支", + "天氨门": "天安门"} model2 = ConfusionCorrector(custom_confusion_path_or_dict=confusion_dict) for line in error_sentences: r1 = model1.correct(line) diff --git a/pycorrector/confusion_corrector.py b/pycorrector/confusion_corrector.py index 1867b858..7c1e99e4 100644 --- a/pycorrector/confusion_corrector.py +++ b/pycorrector/confusion_corrector.py @@ -58,7 +58,7 @@ def correct(self, sentence: str): # 自定义混淆集加入疑似错误词典 for err, truth in self.custom_confusion.items(): for i in re.finditer(err, sentence): - start,end = i.span() + start, end = i.span() corrected_sentence = corrected_sentence[:start] + truth + corrected_sentence[end:] details.append((err, truth, start)) return {'source': sentence, 'target': corrected_sentence, 'errors': details} diff --git a/requirements-dev.txt b/requirements-dev.txt index df4b98d4..0bb077ea 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -13,4 +13,5 @@ datasets tensorboardX paddlenlp paddlepaddle -pytest \ No newline at end of file +pytest +kenlm \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4c7c46eb..5c9d9ecc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,5 +6,4 @@ datasets numpy pandas six -loguru -kenlm \ No newline at end of file +loguru \ No newline at end of file