Match custom confusion entries at every occurrence, as literal text.

Confusion entries are plain strings, so they are escaped before being handed
to the regex engine. ConfusionCorrector.correct also translates offsets found
in the source sentence into offsets in the partially corrected sentence, so a
replacement whose length differs from the matched text cannot misplace the
replacements that follow it.

diff --git a/pycorrector/confusion_corrector.py b/pycorrector/confusion_corrector.py
index 585e14ad..1867b858 100644
--- a/pycorrector/confusion_corrector.py
+++ b/pycorrector/confusion_corrector.py
@@ -5,6 +5,7 @@
 功能:1)补充纠错对,提升召回率;2)对误杀加白,提升准确率
 """
 import os
+import re
 from typing import List
 
 from loguru import logger
@@ -56,10 +57,18 @@ def correct(self, sentence: str):
         details = []
         # 自定义混淆集加入疑似错误词典
+        # (source_start, length_delta) of every replacement already applied; used to
+        # translate offsets found in `sentence` into offsets in `corrected_sentence`.
+        applied = []
         for err, truth in self.custom_confusion.items():
-            idx = sentence.find(err)
-            if idx > -1:
-                corrected_sentence = sentence[:idx] + truth + sentence[(idx + len(err)):]
-                details.append((err, truth, idx))
+            # Entries are literal text: escape them so regex metacharacters match verbatim.
+            for match in re.finditer(re.escape(err), sentence):
+                start, end = match.span()
+                # Replacements applied further left may have changed the target's length.
+                shift = sum(delta for pos, delta in applied if pos < start)
+                corrected_sentence = (corrected_sentence[:start + shift] + truth
+                                      + corrected_sentence[end + shift:])
+                applied.append((start, len(truth) - (end - start)))
+                details.append((err, truth, start))
         return {'source': sentence, 'target': corrected_sentence, 'errors': details}
 
     def correct_batch(self, sentences: List[str]):
diff --git a/pycorrector/detector.py b/pycorrector/detector.py
index 8dca07c0..7513e308 100644
--- a/pycorrector/detector.py
+++ b/pycorrector/detector.py
@@ -4,6 +4,7 @@
 @description: error word detector
 """
 import os
+import re
 from codecs import open
 
 import numpy as np
@@ -396,9 +397,9 @@ def _detect(self, sentence, start_idx=0, **kwargs):
         self.check_detector_initialized()
         # 1. 自定义混淆集加入疑似错误词典
         for confuse in self.custom_confusion:
-            idx = sentence.find(confuse)
-            if idx > -1:
-                maybe_err = [confuse, idx + start_idx, idx + len(confuse) + start_idx, ErrorType.confusion]
+            # Entries are literal text: escape them so regex metacharacters match verbatim.
+            for match in re.finditer(re.escape(confuse), sentence):
+                maybe_err = [confuse, match.start() + start_idx, match.end() + start_idx, ErrorType.confusion]
                 self._add_maybe_error_item(maybe_err, maybe_errors)
 
         # 2. 专名错误检测