Python基础:英文拼写检查器
本文使用Python实现一个简单的英文拼写检查器。
程序的主要功能如下:
- 从用户输入或指定文件读取文本内容
- 文本预处理:忽略非英文内容
- 逐一检查单词拼写(检测单词是否存在于单词库EnglishWords.txt中)
- 针对错误单词,可由用户选择忽略、问号标记、添加到词典、推荐正确单词等选项
- 使用字符串相似度(difflib.SequenceMatcher 计算的匹配比率)作为指标推荐正确单词
- 生成并输出统计数据
- 将统计数据和修订后的文本存储到指定文件中
- 菜单美化:添加边框
- 错误处理:防止程序因用户输入或文件读写等异常原因崩溃
# --- RyanXin SpellChecker ---
import os
import string
import time
from difflib import SequenceMatcher


class SpellChecker():
    '''Interactive command-line spell checker for English text.

    Creating an instance runs the whole interactive session: it loads
    "EnglishWords.txt" and the optional "userDictionary.txt", then loops
    over menu -> word checking -> result report until the user quits, and
    finally writes the user dictionary back to "userDictionary.txt".

    Attributes set during a session:
        wordList: set of lower-cased words read from "EnglishWords.txt".
        dictionary: set of lower-cased words added by the user.
        texts: the input string; after checkingWords() a list with one
            processed (letters only, lower-cased) entry per input token.
        original_length, total_number, correct_number, incorrect_number,
        added_number, accepted_number: counters for the summary.
        start_time, end_time: time.perf_counter() readings around the check.
    '''

    # Only ASCII letters count as English; str.isalpha() would also accept
    # letters from other scripts (CJK characters, accented letters, ...).
    _ENGLISH_LETTERS = frozenset(string.ascii_letters)

    def __init__(self):
        '''start the process of SpellChecker'''
        # initialize English words list
        if not os.path.exists("EnglishWords.txt"):  # file check
            print("* Error: can not find the file \"EnglishWords.txt\".")
            return
        # use the hash set rather than list to speed up matching process
        self.wordList = self._loadWords("EnglishWords.txt")

        # initialize user dictionary which was kept in the file
        if os.path.exists("userDictionary.txt"):
            self.dictionary = self._loadWords("userDictionary.txt")
        else:
            self.dictionary = set()

        # main loop starts from here
        print("Welcome to use Spell Checker!", end='')
        try:
            while True:
                if self.displayMenu() == -1:
                    break  # quit
                self.checkingWords()
                self.generateResults()
                if not self._askYesNo("Return to the main menu? (y/n): "):
                    break  # quit
        finally:
            # store user dictionary even when the session is aborted
            # (e.g. Ctrl-C), so words added by the user are not lost
            with open("userDictionary.txt", "w") as f:
                f.writelines(w + '\n' for w in sorted(self.dictionary))

    @staticmethod
    def _loadWords(path):
        '''Return the set of non-empty, stripped, lower-cased lines of path.

        Entries are lower-cased because every checked word is lower-cased
        before lookup; stripping also removes a trailing '\\r' from files
        with Windows line endings.
        '''
        with open(path, "r") as f:
            return {line.strip().lower() for line in f if line.strip()}

    def _askChoice(self, prompt, choices):
        '''Prompt until the stripped answer is one of choices; return it.'''
        while True:
            answer = input(prompt).strip()
            if answer in choices:
                return answer
            print("* Invalid input. Please choose again.")

    def _askYesNo(self, prompt):
        '''Prompt until the user answers yes/y/no/n; return True for yes.'''
        while True:
            answer = input(prompt).strip().lower()
            if answer in ('yes', 'y', 'no', 'n'):
                return answer[0] == 'y'

    def displayMenu(self):
        '''Main Menu: Work mode selection.

        Stores the text to check in self.texts.
        Returns 0 when a text has been obtained, -1 when the user quits.
        '''
        while True:
            print()
            # display main menu
            self.formatDisplay("1. Spell check a sentence;\n2. Spell check a file;\n0. Quit.",
                               "Work mode selection menu.")
            mode = self._askChoice("What do you want to do: ", ('1', '2', '0'))

            if mode == '0':  # quit
                return -1

            if mode == '1':  # Spell check a sentence
                self.texts = input("\nPlease enter a sentence: ").strip()
                if self.texts:  # check empty string
                    return 0
                print("* The sentence cannot be empty.")
                continue  # return to main menu

            # mode == '2': Spell check a file
            filename = input("\nPlease enter a filename: ").strip()
            if not filename:  # check empty string
                print("* The file name cannot be empty.")
                continue
            if not os.path.exists(filename):  # filename validation
                print("* Error: The file does not exist.")
                continue
            try:
                with open(filename, 'r') as f:
                    self.texts = f.read()
            except (OSError, ValueError):
                # OSError: unreadable path; ValueError covers decoding
                # errors of non-text files (UnicodeDecodeError)
                print("* Error: unexpected error occurred when reading your file.")
                continue
            return 0

    def checkingWords(self):
        '''word checking process

        Splits self.texts on whitespace, reduces every token to its
        lower-cased English letters (stored back into self.texts) and
        checks it against the word list and the user dictionary.
        '''
        self.start_time = time.perf_counter()
        self.texts = self.texts.split()  # split original input
        self.original_length = len(self.texts)
        self.total_number = 0      # non-empty English words
        self.correct_number = 0    # words spelt correctly
        self.incorrect_number = 0  # words spelt incorrectly
        self.added_number = 0      # words added to the user dictionary
        self.accepted_number = 0   # words changed by accepting a suggestion

        # iterate with every word
        for i, raw in enumerate(self.texts):
            # keep English letters only and convert to lower case
            word = ''.join(ch for ch in raw if ch in self._ENGLISH_LETTERS).lower()
            self.texts[i] = word  # store processed word
            progress = "({}/{})".format(i + 1, self.original_length)

            if not word:  # skip empty word
                print("\nSkipped non-alpha word \"{}\". {}".format(raw, progress))
                continue

            self.total_number += 1
            print("\nchecking word: \"{}\". {}".format(word, progress))
            # words in wordlist or user dictionary will be treated as correct
            if word in self.wordList or word in self.dictionary:
                self.correct_number += 1
                print('OK')
            else:  # deal with incorrect word
                self.handleIncorrectWord(i, word)
        self.end_time = time.perf_counter()

    def _markWord(self, i, word):
        '''mark word with question marks and update the stored texts'''
        marked = '?' + word + '?'
        self.texts[i] = marked
        print("\"" + word + "\" has been marked as \"" + marked + "\".")

    def _suggestWord(self, word):
        '''Return the known word most similar to word, or '' if none.

        Similarity is difflib.SequenceMatcher(None, word, w).ratio(); only
        candidates with a ratio above 0 are considered.
        '''
        best_word, best_score = '', 0
        # match with every word in word list and user dictionary
        for candidate in self.wordList | self.dictionary:
            score = SequenceMatcher(None, word, candidate).ratio()
            if score > best_score:
                best_word, best_score = candidate, score
        return best_word

    def handleIncorrectWord(self, i, word):
        '''deal with incorrect word

        i is the index of the word in self.texts, word its processed form.
        Asks the user to ignore, mark, add to the dictionary, or get a
        suggestion, and updates the counters and self.texts accordingly.
        '''
        print("Encountered an incorrect word.")
        self.formatDisplay("--> " + word + "\n\n1. Ignore;\n2. Mark;\n3. Add to dictionary;\n4. Suggest likely correct spelling.")
        handle_method = self._askChoice(
            "Please choose the way to handle the incorrect word: ",
            ('1', '2', '3', '4'))

        if handle_method == '1':  # ignore
            self.incorrect_number += 1
            print("\"" + word + "\" has been ignored.")
        elif handle_method == '2':  # mark
            self.incorrect_number += 1
            self._markWord(i, word)
        elif handle_method == '3':  # add to dictionary
            self.correct_number += 1
            self.dictionary.add(word)
            self.added_number += 1
            print("\"" + word + "\" has been added into the dictionary and will be treated as correct word in the future.")
        else:  # give suggestion
            suggestion = self._suggestWord(word)
            if not suggestion:
                # nothing similar at all: accepting an empty suggestion
                # would erase the word, so just mark it for the user
                self.incorrect_number += 1
                print("- No suggestion is available.")
                self._markWord(i, word)
                return

            # ask user accept or not
            print("- A possible suggestion is \"" + suggestion + "\".")
            if self._askYesNo("Accept this suggestion? (y/n): "):
                # accept the suggestion
                self.correct_number += 1
                self.accepted_number += 1
                # modify word and update original texts
                self.texts[i] = suggestion
                print("\"" + word + "\" has been changed to \"" + suggestion + "\".")
            else:
                # reject the suggestion
                self.incorrect_number += 1
                print("The suggestion has been rejected.")
                # if reject, the word will be marked for user to further check
                self._markWord(i, word)

    def generateResults(self):
        '''Display the summary statistics and save them with the checked text.

        Keeps asking for a file name until the result file has been
        written successfully.
        '''
        # generate summary statistics
        statistics = "\n".join([
            "- Original words number: {};".format(self.original_length),
            "- Total number of English words: {};".format(self.total_number),
            "- Words spelt correctly: {};".format(self.correct_number),
            "- Words spelt incorrectly: {};".format(self.incorrect_number),
            "- Words added to the dictionary: {};".format(self.added_number),
            "- Suggested words accepted: {};".format(self.accepted_number),
            # asctime() without argument formats the current local time;
            # [4:] drops the leading weekday
            "- Spellcheck at: {};".format(time.asctime()[4:]),
            "- The amount of time elapsed: {:.2f}s.".format(self.end_time - self.start_time),
        ])
        print()
        self.formatDisplay(statistics, "Summary Statistics")

        # generate file with summary statistics and checked texts
        while True:
            # input() stays outside the try block so that Ctrl-C / EOF
            # are not mistaken for a file error and retried forever
            result_file_name = input("Name of new file with results: ").strip()
            if not result_file_name:  # check empty string
                print("* The file name cannot be empty.")
                continue
            try:
                with open(result_file_name, 'w') as f:
                    f.write("Spellcheck Statistics\n")
                    f.write(statistics)
                    f.write('\n\n')
                    f.write(' '.join(w for w in self.texts if w))  # skip empty words
                    f.write('\n')
            except (OSError, ValueError):
                # possible error: invalid file name characters, etc.
                print("* Error: unexpected error occurred when writing into the file. Please try again.")
                continue
            print("File \"" + result_file_name + "\" has been successfully created.\n")
            break

    def formatDisplay(self, contents, title=None, width=40):
        '''beautify and display contents with borders

        contents: text to show, lines separated by '\\n' (left aligned).
        title: optional centered heading followed by a dividing line.
        width: minimum inner width of the box; the box grows when the
            title or a content line is longer, so the border stays aligned.
        '''
        rows = contents.split('\n')
        width = max([width, len(title or '')] + [len(row) for row in rows])

        # the top border
        print("╔" + "═" * width + "╗")
        # display title area (center)
        if title:
            left_distance = (width - len(title)) // 2
            right_distance = width - len(title) - left_distance
            print("║" + " " * left_distance + title + " " * right_distance + "║")
            print("║" + " " + "─" * (width - 2) + " " + "║")  # Dividing line
        # display all contents (left align)
        for row in rows:
            print("║" + row.ljust(width) + "║")
        # the bottom border
        print("╚" + "═" * width + "╝")


if __name__ == '__main__':
    SpellChecker()
import os
import string
import time
from difflib import SequenceMatcher


class SpellChecker():
    '''Interactive command-line spell checker for English text.

    Creating an instance runs the whole interactive session: it loads
    "EnglishWords.txt" and the optional "userDictionary.txt", then loops
    over menu -> word checking -> result report until the user quits, and
    finally writes the user dictionary back to "userDictionary.txt".

    Unlike a plain letters-only check, this version keeps the original
    tokens in self.texts and restores the non-letter characters around a
    word when it is marked or replaced.

    Attributes set during a session:
        wordList: set of lower-cased words read from "EnglishWords.txt".
        dictionary: set of lower-cased words added by the user.
        texts: the input string; after checkingWords() the list of
            whitespace-separated tokens (possibly marked or corrected).
        left_sides, right_sides: non-letter characters before/after the
            word most recently handled by processWord().
        original_length, total_number, correct_number, incorrect_number,
        added_number, accepted_number: counters for the summary.
        start_time, end_time: time.perf_counter() readings around the check.
    '''

    # Only ASCII letters count as English; str.isalpha() would also accept
    # letters from other scripts (CJK characters, accented letters, ...).
    _ENGLISH_LETTERS = frozenset(string.ascii_letters)

    def __init__(self):
        '''start the process of SpellChecker'''
        # initialize English words list
        if not os.path.exists("EnglishWords.txt"):  # file check
            print("* Error: can not find the file \"EnglishWords.txt\".")
            return
        # use the hash set rather than list to speed up matching process
        self.wordList = self._loadWords("EnglishWords.txt")

        # initialize user dictionary which was kept in the file
        if os.path.exists("userDictionary.txt"):
            self.dictionary = self._loadWords("userDictionary.txt")
        else:
            self.dictionary = set()

        # main loop starts from here
        print("Welcome to use Spell Checker!", end='')
        try:
            while True:
                if self.displayMenu() == -1:
                    break  # quit
                self.checkingWords()
                self.generateResults()
                if not self._askYesNo("Return to the main menu? (y/n): "):
                    break  # quit
        finally:
            # store user dictionary even when the session is aborted
            # (e.g. Ctrl-C), so words added by the user are not lost
            with open("userDictionary.txt", "w") as f:
                f.writelines(w + '\n' for w in sorted(self.dictionary))

    @staticmethod
    def _loadWords(path):
        '''Return the set of non-empty, stripped, lower-cased lines of path.

        Entries are lower-cased because every checked word is lower-cased
        before lookup; stripping also removes a trailing '\\r' from files
        with Windows line endings.
        '''
        with open(path, "r") as f:
            return {line.strip().lower() for line in f if line.strip()}

    def _askChoice(self, prompt, choices):
        '''Prompt until the stripped answer is one of choices; return it.'''
        while True:
            answer = input(prompt).strip()
            if answer in choices:
                return answer
            print("* Invalid input. Please choose again.")

    def _askYesNo(self, prompt):
        '''Prompt until the user answers yes/y/no/n; return True for yes.'''
        while True:
            answer = input(prompt).strip().lower()
            if answer in ('yes', 'y', 'no', 'n'):
                return answer[0] == 'y'

    def displayMenu(self):
        '''Main Menu: Work mode selection.

        Stores the text to check in self.texts.
        Returns 0 when a text has been obtained, -1 when the user quits.
        '''
        while True:
            print()
            # display main menu
            self.formatDisplay("1. Spell check a sentence;\n2. Spell check a file;\n0. Quit.",
                               "Work mode selection menu.")
            mode = self._askChoice("What do you want to do: ", ('1', '2', '0'))

            if mode == '0':  # quit
                return -1

            if mode == '1':  # Spell check a sentence
                self.texts = input("\nPlease enter a sentence: ").strip()
                if self.texts:  # check empty string
                    return 0
                print("* The sentence cannot be empty.")
                continue  # return to main menu

            # mode == '2': Spell check a file
            filename = input("\nPlease enter a filename: ").strip()
            if not filename:  # check empty string
                print("* The file name cannot be empty.")
                continue
            if not os.path.exists(filename):  # filename validation
                print("* Error: The file does not exist.")
                continue
            try:
                with open(filename, 'r') as f:
                    self.texts = f.read()
            except (OSError, ValueError):
                # OSError: unreadable path; ValueError covers decoding
                # errors of non-text files (UnicodeDecodeError)
                print("* Error: unexpected error occurred when reading your file.")
                continue
            return 0

    def checkingWords(self):
        '''word checking process

        Splits self.texts on whitespace and checks the English letters of
        every token against the word list and the user dictionary. The
        tokens themselves stay untouched unless the user marks or
        corrects them.
        '''
        self.start_time = time.perf_counter()
        self.texts = self.texts.split()  # split original input
        self.original_length = len(self.texts)
        self.total_number = 0      # non-empty English words
        self.correct_number = 0    # words spelt correctly
        self.incorrect_number = 0  # words spelt incorrectly
        self.added_number = 0      # words added to the user dictionary
        self.accepted_number = 0   # words changed by accepting a suggestion

        for i, raw in enumerate(self.texts):  # iterate with every word
            word = self.processWord(raw)
            progress = "({}/{})".format(i + 1, self.original_length)

            if not word:  # skip empty word
                print("\nSkipped non-alpha word \"{}\". {}".format(raw, progress))
                continue

            self.total_number += 1
            print("\nchecking word: \"{}\". {}".format(word, progress))
            # words in wordlist or user dictionary will be treated as correct
            if word in self.wordList or word in self.dictionary:
                self.correct_number += 1
                print('OK')
            else:  # deal with incorrect word
                self.handleIncorrectWord(i, word)
        self.end_time = time.perf_counter()

    def processWord(self, word):
        '''extract pure English word

        Returns the English letters of word, lower-cased, or '' when word
        contains no English letter. For a non-empty result the characters
        before the first and after the last letter are backed up in
        self.left_sides / self.right_sides; characters between letters
        are dropped.
        '''
        letter_positions = [pos for pos, ch in enumerate(word)
                            if ch in self._ENGLISH_LETTERS]
        if not letter_positions:  # nothing to check
            return ''

        # back up non-alpha characters in both sides of the word
        self.left_sides = word[:letter_positions[0]]
        self.right_sides = word[letter_positions[-1] + 1:]
        return ''.join(word[pos] for pos in letter_positions).lower()

    def _markWord(self, i, word):
        '''mark word and restore non-alpha characters'''
        marked = '?' + word + '?'
        self.texts[i] = self.left_sides + marked + self.right_sides
        print("\"" + word + "\" has been marked as \"" + marked + "\".")

    def _suggestWord(self, word):
        '''Return the known word most similar to word, or '' if none.

        Similarity is difflib.SequenceMatcher(None, word, w).ratio(); only
        candidates with a ratio above 0 are considered.
        '''
        best_word, best_score = '', 0
        # match with every word in word list and user dictionary
        for candidate in self.wordList | self.dictionary:
            score = SequenceMatcher(None, word, candidate).ratio()
            if score > best_score:
                best_word, best_score = candidate, score
        return best_word

    def handleIncorrectWord(self, i, word):
        '''deal with incorrect word

        i is the index of the token in self.texts, word its processed
        form as returned by processWord() (which also recorded the
        surrounding non-letter characters). Asks the user to ignore,
        mark, add to the dictionary, or get a suggestion, and updates the
        counters and self.texts accordingly.
        '''
        print("Encountered an incorrect word.")
        self.formatDisplay("--> " + word + "\n\n1. Ignore;\n2. Mark;\n3. Add to dictionary;\n4. Suggest likely correct spelling.")
        handle_method = self._askChoice(
            "Please choose the way to handle the incorrect word: ",
            ('1', '2', '3', '4'))

        if handle_method == '1':  # ignore
            self.incorrect_number += 1
            print("\"" + word + "\" has been ignored.")
        elif handle_method == '2':  # mark
            self.incorrect_number += 1
            self._markWord(i, word)
        elif handle_method == '3':  # add to dictionary
            self.correct_number += 1
            self.dictionary.add(word)
            self.added_number += 1
            print("\"" + word + "\" has been added into the dictionary and will be treated as correct word in the future.")
        else:  # give suggestion
            suggestion = self._suggestWord(word)
            if not suggestion:
                # nothing similar at all: accepting an empty suggestion
                # would erase the word, so just mark it for the user
                self.incorrect_number += 1
                print("- No suggestion is available.")
                self._markWord(i, word)
                return

            # ask user accept or not
            print("- A possible suggestion is \"" + suggestion + "\".")
            if self._askYesNo("Accept this suggestion? (y/n): "):
                # accept the suggestion
                self.correct_number += 1
                self.accepted_number += 1
                # modify word and restore non-alpha characters
                self.texts[i] = self.left_sides + suggestion + self.right_sides
                print("\"" + word + "\" has been changed to \"" + suggestion + "\".")
            else:
                # reject the suggestion; mark the word for further checking
                self.incorrect_number += 1
                print("The suggestion has been rejected.")
                self._markWord(i, word)

    def generateResults(self):
        '''Display the summary statistics and save them with the checked text.

        Keeps asking for a file name until the result file has been
        written successfully.
        '''
        # generate summary statistics
        statistics = "\n".join([
            "- Original words number: {};".format(self.original_length),
            "- Total number of English words: {};".format(self.total_number),
            "- Words spelt correctly: {};".format(self.correct_number),
            "- Words spelt incorrectly: {};".format(self.incorrect_number),
            "- Words added to the dictionary: {};".format(self.added_number),
            "- Suggested words accepted: {};".format(self.accepted_number),
            # asctime() without argument formats the current local time;
            # [4:] drops the leading weekday
            "- Spellcheck at: {};".format(time.asctime()[4:]),
            "- The amount of time elapsed: {:.2f}s.".format(self.end_time - self.start_time),
        ])
        print()
        self.formatDisplay(statistics, "Summary Statistics")

        # generate file with summary statistics and checked texts
        while True:
            # input() stays outside the try block so that Ctrl-C / EOF
            # are not mistaken for a file error and retried forever
            result_file_name = input("Name of new file with results: ").strip()
            if not result_file_name:  # check empty string
                print("* The file name cannot be empty.")
                continue
            try:
                with open(result_file_name, 'w') as f:
                    f.write("Spellcheck Statistics\n")
                    f.write(statistics)
                    f.write('\n\n')
                    f.write(' '.join(self.texts))
                    f.write('\n')
            except (OSError, ValueError):
                # possible error: invalid file name characters, etc.
                print("* Error: unexpected error occurred when writing into the file. Please try again.")
                continue
            print("File \"" + result_file_name + "\" has been successfully created.\n")
            break

    def formatDisplay(self, contents, title=None, width=40):
        '''beautify and display contents with borders

        contents: text to show, lines separated by '\\n' (left aligned).
        title: optional centered heading followed by a dividing line.
        width: minimum inner width of the box; the box grows when the
            title or a content line is longer, so the border stays aligned.
        '''
        rows = contents.split('\n')
        width = max([width, len(title or '')] + [len(row) for row in rows])

        # the top border
        print("╔" + "═" * width + "╗")
        # display title area (center)
        if title:
            left_distance = (width - len(title)) // 2
            right_distance = width - len(title) - left_distance
            print("║" + " " * left_distance + title + " " * right_distance + "║")
            print("║" + " " + "─" * (width - 2) + " " + "║")  # Dividing line
        # display all contents (left align)
        for row in rows:
            print("║" + row.ljust(width) + "║")
        # the bottom border
        print("╚" + "═" * width + "╝")


if __name__ == '__main__':
    SpellChecker()