题目描述
莎士比亚发表了大量作品。给定一个包含莎士比亚作品的文件夹,实现一个命令行工具:输入一个单词,显示该单词在每个文件中出现的次数以及具体行号。
这本质上是实现一个简化版的 grep + 统计功能。
示例输出
$ word_search "love" ./shakespeare/
romeo_and_juliet.txt:
Count: 42
Lines: 5, 23, 45, 67, ...
hamlet.txt:
Count: 18
Lines: 12, 89, 156, ...
解题思路
- 目录遍历:递归遍历指定目录下的所有文本文件
- 逐行扫描:读取每个文件,逐行查找目标单词
- 结果收集:记录每个文件中单词出现次数和行号
Python 实现
import os
import re
from collections import defaultdict
def search_word_in_folder(folder_path: str, target_word: str,
case_sensitive: bool = False) -> dict:
"""
在指定文件夹中搜索单词
返回: {文件路径: {'count': 次数, 'lines': [行号列表]}}
"""
results = {}
# 编译正则表达式(单词边界匹配)
flags = 0 if case_sensitive else re.IGNORECASE
pattern = re.compile(r'\b' + re.escape(target_word) + r'\b', flags)
for root, dirs, files in os.walk(folder_path):
for filename in files:
if not filename.endswith('.txt'):
continue
filepath = os.path.join(root, filename)
file_result = search_word_in_file(filepath, pattern)
if file_result['count'] > 0:
results[filepath] = file_result
return results
def search_word_in_file(filepath: str, pattern) -> dict:
"""在单个文件中搜索单词"""
count = 0
lines = []
try:
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
for line_num, line in enumerate(f, 1):
matches = pattern.findall(line)
if matches:
count += len(matches)
lines.append(line_num)
except IOError as e:
print(f"Error reading {filepath}: {e}")
return {'count': count, 'lines': lines}
def print_results(results: dict, target_word: str):
"""格式化打印结果"""
if not results:
print(f"Word '{target_word}' not found in any file.")
return
total_count = 0
for filepath, data in sorted(results.items()):
print(f"\n{os.path.basename(filepath)}:")
print(f" Count: {data['count']}")
print(f" Lines: {', '.join(map(str, data['lines'][:20]))}", end='')
if len(data['lines']) > 20:
print(f" ... and {len(data['lines']) - 20} more")
else:
print()
total_count += data['count']
print(f"\n--- Total occurrences: {total_count} ---")
# 命令行入口
if __name__ == '__main__':
import sys
if len(sys.argv) < 3:
print("Usage: python word_search.py <word> <folder_path>")
sys.exit(1)
word = sys.argv[1]
folder = sys.argv[2]
results = search_word_in_folder(folder, word)
print_results(results, word)
进阶功能
1. 构建索引加速查询
如果需要多次查询,可以预先构建倒排索引:
from collections import defaultdict
def build_index(folder_path: str) -> dict:
"""
构建倒排索引
返回: {单词: {文件: [行号列表]}}
"""
index = defaultdict(lambda: defaultdict(list))
for root, dirs, files in os.walk(folder_path):
for filename in files:
if not filename.endswith('.txt'):
continue
filepath = os.path.join(root, filename)
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
for line_num, line in enumerate(f, 1):
words = re.findall(r'\b\w+\b', line.lower())
for word in words:
index[word][filepath].append(line_num)
return index
def search_index(index: dict, word: str) -> dict:
"""在索引中搜索"""
word = word.lower()
if word not in index:
return {}
return {
filepath: {'count': len(lines), 'lines': lines}
for filepath, lines in index[word].items()
}
2. 支持正则表达式搜索
def search_regex(folder_path: str, regex_pattern: str) -> dict:
"""支持正则表达式搜索"""
pattern = re.compile(regex_pattern, re.IGNORECASE)
results = {}
for root, dirs, files in os.walk(folder_path):
for filename in files:
if not filename.endswith('.txt'):
continue
filepath = os.path.join(root, filename)
file_result = search_word_in_file(filepath, pattern)
if file_result['count'] > 0:
results[filepath] = file_result
return results
复杂度分析
- 时间复杂度:O(N × L),N 是文件总数,L 是平均文件行数
- 空间复杂度:O(M) 存储结果,M 是匹配的行数
需要面试辅助服务?联系我们
- 📧 Email: [email protected]
- 📱 Phone: +86 17863968105
需要面试真题? 立刻联系微信 Coding0201,获得真题。