Bloomberg 面试题：在莎士比亚作品文件夹中统计单词出现次数与行号

题目描述

莎士比亚发表了大量作品。给定一个包含莎士比亚作品的文件夹，实现一个命令行工具：输入一个单词，显示该单词在每个文件中出现的次数以及具体行号。

这本质上是实现一个简化版的 grep + 统计功能。

示例输出

$ word_search "love" ./shakespeare/

romeo_and_juliet.txt:
  Count: 42
  Lines: 5, 23, 45, 67, ...

hamlet.txt:
  Count: 18
  Lines: 12, 89, 156, ...

解题思路

目录遍历：递归遍历指定目录下的所有文本文件
逐行扫描：读取每个文件，逐行查找目标单词
结果收集：记录每个文件中单词出现次数和行号

Python 实现

import os
import re
from collections import defaultdict

def search_word_in_folder(folder_path: str, target_word: str, 
                          case_sensitive: bool = False) -> dict:
    """
    在指定文件夹中搜索单词
    返回: {文件路径: {'count': 次数, 'lines': [行号列表]}}
    """
    results = {}
    
    # 编译正则表达式（单词边界匹配）
    flags = 0 if case_sensitive else re.IGNORECASE
    pattern = re.compile(r'\b' + re.escape(target_word) + r'\b', flags)
    
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            if not filename.endswith('.txt'):
                continue
            
            filepath = os.path.join(root, filename)
            file_result = search_word_in_file(filepath, pattern)
            
            if file_result['count'] > 0:
                results[filepath] = file_result
    
    return results

def search_word_in_file(filepath: str, pattern) -> dict:
    """在单个文件中搜索单词"""
    count = 0
    lines = []
    
    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            for line_num, line in enumerate(f, 1):
                matches = pattern.findall(line)
                if matches:
                    count += len(matches)
                    lines.append(line_num)
    except IOError as e:
        print(f"Error reading {filepath}: {e}")
    
    return {'count': count, 'lines': lines}

def print_results(results: dict, target_word: str):
    """格式化打印结果"""
    if not results:
        print(f"Word '{target_word}' not found in any file.")
        return
    
    total_count = 0
    for filepath, data in sorted(results.items()):
        print(f"\n{os.path.basename(filepath)}:")
        print(f"  Count: {data['count']}")
        print(f"  Lines: {', '.join(map(str, data['lines'][:20]))}", end='')
        if len(data['lines']) > 20:
            print(f" ... and {len(data['lines']) - 20} more")
        else:
            print()
        total_count += data['count']
    
    print(f"\n--- Total occurrences: {total_count} ---")

# 命令行入口
if __name__ == '__main__':
    import sys
    
    if len(sys.argv) < 3:
        print("Usage: python word_search.py <word> <folder_path>")
        sys.exit(1)
    
    word = sys.argv[1]
    folder = sys.argv[2]
    
    results = search_word_in_folder(folder, word)
    print_results(results, word)

进阶功能

1. 构建索引加速查询

如果需要多次查询，可以预先构建倒排索引：

from collections import defaultdict

def build_index(folder_path: str) -> dict:
    """
    构建倒排索引
    返回: {单词: {文件: [行号列表]}}
    """
    index = defaultdict(lambda: defaultdict(list))
    
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            if not filename.endswith('.txt'):
                continue
            
            filepath = os.path.join(root, filename)
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                for line_num, line in enumerate(f, 1):
                    words = re.findall(r'\b\w+\b', line.lower())
                    for word in words:
                        index[word][filepath].append(line_num)
    
    return index

def search_index(index: dict, word: str) -> dict:
    """在索引中搜索"""
    word = word.lower()
    if word not in index:
        return {}
    
    return {
        filepath: {'count': len(lines), 'lines': lines}
        for filepath, lines in index[word].items()
    }

2. 支持正则表达式搜索

def search_regex(folder_path: str, regex_pattern: str) -> dict:
    """支持正则表达式搜索"""
    pattern = re.compile(regex_pattern, re.IGNORECASE)
    results = {}
    
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            if not filename.endswith('.txt'):
                continue
            
            filepath = os.path.join(root, filename)
            file_result = search_word_in_file(filepath, pattern)
            
            if file_result['count'] > 0:
                results[filepath] = file_result
    
    return results

复杂度分析

时间复杂度：O(N × L)，N 是文件总数，L 是平均文件行数
空间复杂度：O(M) 存储结果，M 是匹配的行数

需要面试辅助服务？联系我们

📧 Email: [email protected]
📱 Phone: +86 17863968105

需要面试真题？立刻联系微信 Coding0201，获得真题。

联系方式

Email: [email protected] Telegram: @OAVOProxy