简单的搜索引擎:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace TextSearchSummarizer
{
// Search mode enumeration: selects which matching strategy the summarizer applies to each keyword.
public enum SearchMode
{
ExactMatch, // Exact match
ContainsMatch, // Contains (substring) match
FuzzyMatch, // Fuzzy match
RegexMatch, // Regular-expression match
SemanticMatch // Semantic match (based on keyword relevance)
}
// A single search hit together with the context window cut out around it.
public class SearchResultItem
{
// The text recorded for this hit (keyword, fuzzy-matched word, regex match value or sentence).
public string MatchedText { get; set; } = string.Empty;
// Whitespace-normalised window of the source text surrounding the hit.
public string Context { get; set; } = string.Empty;
// Context with the matched text wrapped in the configured highlight markers
// (equal to Context when highlighting is disabled).
public string HighlightedContext { get; set; } = string.Empty;
// Start offset of the hit in the full text.
public int StartPosition { get; set; }
// End offset of the hit in the full text (start + match length).
public int EndPosition { get; set; }
// Score assigned by the search mode that produced the hit.
public double RelevanceScore { get; set; }
// Keyword or pattern that produced the hit; left empty by the exact/contains search.
public string MatchedKeyword { get; set; } = string.Empty;
// Length of Context in characters.
public int ContextLength { get; set; }
// 1-based line number of the hit; stays 0 unless line numbers are enabled in the config.
public int LineNumber { get; set; }
}
/// <summary>
/// Result of a search-and-summarize run: the summary text, the hits it was built
/// from, and a few statistics about the run.
/// </summary>
public class SummaryResult
{
    public string Summary { get; set; } = string.Empty;
    public List<SearchResultItem> MatchedItems { get; set; } = new();
    public Dictionary<string, int> KeywordFrequency { get; set; } = new();
    public int TotalMatches { get; set; }
    public double AverageRelevanceScore { get; set; }
    public TimeSpan ProcessingTime { get; set; }
    public string HighlightedSummary { get; set; } = string.Empty;

    /// <summary>
    /// Writes the statistics, the keyword frequencies (most frequent first) and the
    /// plain (non-highlighted) summary to the console.
    /// </summary>
    public void DisplaySummary()
    {
        Console.WriteLine("=== 搜索摘要结果 ===");
        Console.WriteLine($"总匹配数: {TotalMatches}");
        Console.WriteLine($"平均相关度: {AverageRelevanceScore:F2}");
        Console.WriteLine($"处理时间: {ProcessingTime.TotalMilliseconds:F0} ms");
        Console.WriteLine($"摘要长度: {Summary.Length} 字符\n");

        Console.WriteLine("关键词频率:");
        var byFrequency =
            from entry in KeywordFrequency
            orderby entry.Value descending
            select entry;
        foreach (var entry in byFrequency)
        {
            Console.WriteLine($" {entry.Key}: {entry.Value} 次");
        }

        Console.WriteLine("\n摘要内容:");
        Console.WriteLine(Summary);
    }
}
// Search configuration: options controlling matching, filtering, highlighting and summary size.
public class SearchConfig
{
public SearchMode SearchMode { get; set; } = SearchMode.ContainsMatch;
public int ContextLength { get; set; } = 200; // Context length (half is taken on each side of a hit)
public int MaxResults { get; set; } = 100; // Maximum number of results
public int MaxSummaryLength { get; set; } = 1000; // Maximum summary length
public bool IgnoreCase { get; set; } = true; // Ignore case
public bool RemoveDuplicates { get; set; } = true; // Remove duplicates
public double MinRelevanceScore { get; set; } = 0.0; // Minimum relevance threshold
public bool HighlightKeywords { get; set; } = true; // Highlight keywords
public string HighlightStart { get; set; } = "**"; // Highlight start marker
public string HighlightEnd { get; set; } = "**"; // Highlight end marker
public bool SortByRelevance { get; set; } = true; // Sort by relevance
public bool IncludeLineNumbers { get; set; } = false; // Include line numbers
}
// 主要的文本搜索摘要生成器
public class TextSearchSummarizer
{
private readonly SearchConfig _config;
/// <summary>
/// Creates a summarizer. When <paramref name="config"/> is null a default
/// <see cref="SearchConfig"/> is used.
/// </summary>
public TextSearchSummarizer(SearchConfig? config = null) => _config = config ?? new SearchConfig();
/// <summary>
/// Main entry point: searches <paramref name="text"/> for every keyword, filters and
/// sorts the hits, builds a summary and fills in the statistics.
/// </summary>
/// <param name="text">Text to search. Null or empty yields an empty result.</param>
/// <param name="keywords">Keywords (or regex patterns in regex mode). Null or empty yields an empty result.</param>
/// <returns>The populated <see cref="SummaryResult"/>.</returns>
/// <exception cref="InvalidOperationException">Wraps any exception raised while searching or summarizing.</exception>
public async Task<SummaryResult> SearchAndSummarizeAsync(string text, string[] keywords)
{
    // A Stopwatch is monotonic; DateTime.Now differences can jump with clock or DST adjustments.
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();
    var result = new SummaryResult();
    try
    {
        if (string.IsNullOrEmpty(text) || keywords == null || keywords.Length == 0)
        {
            result.ProcessingTime = stopwatch.Elapsed;
            return result;
        }

        // 1. Run the search.
        var searchResults = await SearchTextAsync(text, keywords);

        // 2. Filter and sort the hits.
        var filteredResults = FilterAndSortResults(searchResults);

        // 3. Build the summary.
        var summary = GenerateSummary(filteredResults);

        // 4. Compute statistics.
        var keywordFreq = CalculateKeywordFrequency(filteredResults);

        // 5. Build the highlighted variant of the summary.
        var highlightedSummary = _config.HighlightKeywords
            ? HighlightKeywords(summary, keywords)
            : summary;

        // Populate the result.
        result.Summary = summary;
        result.HighlightedSummary = highlightedSummary;
        result.MatchedItems = filteredResults;
        result.KeywordFrequency = keywordFreq;
        result.TotalMatches = filteredResults.Count;
        result.AverageRelevanceScore = filteredResults.Count > 0
            ? filteredResults.Average(r => r.RelevanceScore)
            : 0;
        result.ProcessingTime = stopwatch.Elapsed;
        return result;
    }
    catch (Exception ex)
    {
        throw new InvalidOperationException($"搜索和摘要生成失败: {ex.Message}", ex);
    }
}
/// <summary>
/// Runs the configured search mode for every non-blank keyword on a thread-pool
/// task and returns all hits in keyword order.
/// </summary>
private async Task<List<SearchResultItem>> SearchTextAsync(string text, string[] keywords)
{
    List<SearchResultItem> RunSearch()
    {
        var collected = new List<SearchResultItem>();
        var lines = text.Split(new[] { '\r', '\n' }, StringSplitOptions.None);
        var lineStarts = GetLineStartPositions(text);

        foreach (var term in keywords)
        {
            if (string.IsNullOrWhiteSpace(term))
            {
                continue;
            }

            List<SearchResultItem> hits;
            switch (_config.SearchMode)
            {
                case SearchMode.ExactMatch:
                    hits = SearchExactMatch(text, term, lines, lineStarts);
                    break;
                case SearchMode.FuzzyMatch:
                    hits = SearchFuzzyMatch(text, term, lines, lineStarts);
                    break;
                case SearchMode.RegexMatch:
                    hits = SearchRegexMatch(text, term, lines, lineStarts);
                    break;
                case SearchMode.SemanticMatch:
                    hits = SearchSemanticMatch(text, term, lines, lineStarts);
                    break;
                case SearchMode.ContainsMatch:
                default:
                    // Unknown modes fall back to the contains search.
                    hits = SearchContainsMatch(text, term, lines, lineStarts);
                    break;
            }

            collected.AddRange(hits);
        }

        return collected;
    }

    return await Task.Run(() => RunSearch());
}
/// <summary>
/// Returns the character offset at which each line starts: 0 for the first line,
/// then the offset right after every '\n'.
/// </summary>
private int[] GetLineStartPositions(string text)
{
    var starts = new List<int> { 0 };
    int newline = text.IndexOf('\n');
    while (newline >= 0)
    {
        starts.Add(newline + 1);
        newline = text.IndexOf('\n', newline + 1);
    }
    return starts.ToArray();
}
/// <summary>
/// Finds every occurrence of <paramref name="keyword"/> in <paramref name="text"/>
/// using ordinal comparison (case-insensitive when configured). The scan resumes one
/// character after each hit, so overlapping occurrences are reported. Every hit is
/// scored 1.0.
/// </summary>
private List<SearchResultItem> SearchExactMatch(string text, string keyword, string[] lines, int[] lineStartPositions)
{
    var mode = _config.IgnoreCase ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal;
    var hits = new List<SearchResultItem>();

    for (int cursor = 0; cursor < text.Length;)
    {
        int at = text.IndexOf(keyword, cursor, mode);
        if (at == -1)
        {
            break;
        }

        var hit = CreateSearchResultItem(text, keyword, at, at + keyword.Length, lines, lineStartPositions);
        hit.RelevanceScore = 1.0; // exact matches get the top score
        hits.Add(hit);

        cursor = at + 1;
    }

    return hits;
}
/// <summary>
/// Contains-match search. Currently identical to the exact-match search, to which it delegates.
/// </summary>
private List<SearchResultItem> SearchContainsMatch(string text, string keyword, string[] lines, int[] lineStartPositions)
    => SearchExactMatch(text, keyword, lines, lineStartPositions);
/// <summary>
/// Fuzzy search: splits <paramref name="text"/> into words on whitespace and basic
/// punctuation and reports every word whose similarity to <paramref name="keyword"/>
/// exceeds 0.7.
/// </summary>
/// <remarks>
/// Words are tokenised in a single pass that tracks each word's offset, so the
/// reported position always belongs to the word that was scored. When
/// <c>IgnoreCase</c> is set, both sides are case-folded before the similarity is
/// computed.
/// </remarks>
/// <param name="text">Full text to scan.</param>
/// <param name="keyword">Keyword to compare each word against.</param>
/// <param name="lines">Passed through to <c>CreateSearchResultItem</c>.</param>
/// <param name="lineStartPositions">Line start offsets, passed through to <c>CreateSearchResultItem</c>.</param>
/// <returns>Hits in text order, scored with their similarity.</returns>
private List<SearchResultItem> SearchFuzzyMatch(string text, string keyword, string[] lines, int[] lineStartPositions)
{
    const double similarityThreshold = 0.7;
    var delimiters = new[] { ' ', '\t', '\r', '\n', '.', ',', '!', '?', ';', ':' };

    var results = new List<SearchResultItem>();
    var probe = _config.IgnoreCase ? keyword.ToLowerInvariant() : keyword;

    int position = 0;
    while (position < text.Length)
    {
        if (Array.IndexOf(delimiters, text[position]) >= 0)
        {
            position++;
            continue;
        }

        // Consume one word and remember where it started.
        int wordStart = position;
        while (position < text.Length && Array.IndexOf(delimiters, text[position]) < 0)
        {
            position++;
        }

        var word = text.Substring(wordStart, position - wordStart);
        var candidate = _config.IgnoreCase ? word.ToLowerInvariant() : word;
        var similarity = CalculateStringSimilarity(probe, candidate);
        if (similarity > similarityThreshold)
        {
            var item = CreateSearchResultItem(text, word, wordStart, position, lines, lineStartPositions);
            item.RelevanceScore = similarity;
            item.MatchedKeyword = keyword;
            results.Add(item);
        }
    }

    return results;
}
/// <summary>
/// Regex search: treats <paramref name="pattern"/> as a regular expression and
/// reports every non-empty match, each scored 0.8.
/// </summary>
/// <param name="text">Full text to scan.</param>
/// <param name="pattern">Regular-expression pattern.</param>
/// <param name="lines">Passed through to <c>CreateSearchResultItem</c>.</param>
/// <param name="lineStartPositions">Line start offsets, passed through to <c>CreateSearchResultItem</c>.</param>
/// <exception cref="InvalidOperationException">The pattern is not a valid regular expression.</exception>
private List<SearchResultItem> SearchRegexMatch(string text, string pattern, string[] lines, int[] lineStartPositions)
{
    var results = new List<SearchResultItem>();
    try
    {
        var regexOptions = _config.IgnoreCase ? RegexOptions.IgnoreCase : RegexOptions.None;
        var regex = new Regex(pattern, regexOptions);

        foreach (Match match in regex.Matches(text))
        {
            // Patterns such as "\w*" also match the empty string; an empty hit carries
            // no information and cannot be highlighted, so it is skipped.
            if (match.Length == 0)
            {
                continue;
            }

            var item = CreateSearchResultItem(text, match.Value, match.Index, match.Index + match.Length, lines, lineStartPositions);
            item.RelevanceScore = 0.8; // fixed score for regex matches
            item.MatchedKeyword = pattern;
            results.Add(item);
        }
    }
    catch (ArgumentException ex)
    {
        // Keep the original exception as InnerException so the parse details are not lost.
        throw new InvalidOperationException($"正则表达式无效: {pattern}, 错误: {ex.Message}", ex);
    }

    return results;
}
/// <summary>
/// Semantic search (keyword relevance): scores each sentence of <paramref name="text"/>
/// against <paramref name="keyword"/> and keeps the sentences whose relevance is above 0.3.
/// </summary>
private List<SearchResultItem> SearchSemanticMatch(string text, string keyword, string[] lines, int[] lineStartPositions)
{
    var hits = new List<SearchResultItem>();

    foreach (var (sentenceText, from, to) in SplitIntoSentences(text))
    {
        double score = CalculateSemanticRelevance(keyword, sentenceText);
        if (!(score > 0.3)) // relevance threshold
        {
            continue;
        }

        var hit = CreateSearchResultItem(text, sentenceText, from, to, lines, lineStartPositions);
        hit.RelevanceScore = score;
        hit.MatchedKeyword = keyword;
        hits.Add(hit);
    }

    return hits;
}
/// <summary>
/// Builds a result item for a hit at [<paramref name="startPos"/>, <paramref name="endPos"/>):
/// cuts a context window of half the configured context length on each side, normalises
/// its whitespace, optionally highlights the matched text and optionally resolves the
/// 1-based line number.
/// </summary>
private SearchResultItem CreateSearchResultItem(string fullText, string matchedText, int startPos, int endPos, string[] lines, int[] lineStartPositions)
{
    int halfWindow = _config.ContextLength / 2;
    int windowStart = Math.Max(0, startPos - halfWindow);
    int windowEnd = Math.Min(fullText.Length, endPos + halfWindow);

    // Extract the window, then collapse whitespace.
    string snippet = CleanContext(fullText.Substring(windowStart, windowEnd - windowStart));

    // Highlight the matched text inside the window when enabled.
    string highlighted = snippet;
    if (_config.HighlightKeywords)
    {
        highlighted = HighlightInContext(snippet, matchedText);
    }

    // Line number = count of leading line starts that are at or before the hit.
    int lineNumber = 0;
    if (_config.IncludeLineNumbers)
    {
        while (lineNumber < lineStartPositions.Length && lineStartPositions[lineNumber] <= startPos)
        {
            lineNumber++;
        }
    }

    var item = new SearchResultItem();
    item.MatchedText = matchedText;
    item.Context = snippet;
    item.HighlightedContext = highlighted;
    item.StartPosition = startPos;
    item.EndPosition = endPos;
    item.ContextLength = snippet.Length;
    item.LineNumber = lineNumber;
    return item;
}
/// <summary>
/// Applies the configured post-processing in order: drops hits below the minimum
/// relevance, removes duplicates, sorts by relevance (then by context length, both
/// descending) and caps the number of results.
/// </summary>
private List<SearchResultItem> FilterAndSortResults(List<SearchResultItem> results)
{
    // Relevance threshold.
    var kept = new List<SearchResultItem>();
    foreach (var candidate in results)
    {
        if (candidate.RelevanceScore >= _config.MinRelevanceScore)
        {
            kept.Add(candidate);
        }
    }

    // De-duplication.
    if (_config.RemoveDuplicates)
    {
        kept = RemoveDuplicateResults(kept);
    }

    // Ordering.
    if (_config.SortByRelevance)
    {
        var ordered =
            from item in kept
            orderby item.RelevanceScore descending, item.ContextLength descending
            select item;
        kept = ordered.ToList();
    }

    // Result cap.
    return kept.Count > _config.MaxResults
        ? kept.Take(_config.MaxResults).ToList()
        : kept;
}
/// <summary>
/// Concatenates the distinct contexts of <paramref name="results"/>, separated by a
/// blank line, until the configured maximum summary length is reached. The context
/// that crosses the limit is truncated and suffixed with "...".
/// </summary>
private string GenerateSummary(List<SearchResultItem> results)
{
    if (results.Count == 0)
    {
        return "未找到相关内容。";
    }

    var builder = new StringBuilder();
    var emitted = new HashSet<string>();
    int consumed = 0;

    foreach (var item in results)
    {
        if (consumed >= _config.MaxSummaryLength)
        {
            break;
        }

        string snippet = item.Context.Trim();
        if (!emitted.Add(snippet))
        {
            continue; // this context is already part of the summary
        }

        string piece = consumed + snippet.Length > _config.MaxSummaryLength
            ? snippet.Substring(0, _config.MaxSummaryLength - consumed) + "..."
            : snippet;

        if (builder.Length > 0)
        {
            builder.AppendLine().AppendLine();
        }

        // Optional line-number prefix.
        if (_config.IncludeLineNumbers && item.LineNumber > 0)
        {
            builder.Append($"[行 {item.LineNumber}] ");
        }

        builder.Append(piece);
        consumed += piece.Length + 4; // +4 accounts for the separating line breaks
    }

    return builder.ToString().Trim();
}
/// <summary>
/// Counts hits per keyword. The key is the lower-cased matched keyword, or the
/// lower-cased matched text when no keyword was recorded on the hit.
/// </summary>
private Dictionary<string, int> CalculateKeywordFrequency(List<SearchResultItem> results)
{
    var counts = new Dictionary<string, int>();

    foreach (var item in results)
    {
        string label = string.IsNullOrEmpty(item.MatchedKeyword) ? item.MatchedText : item.MatchedKeyword;
        label = label.ToLower();

        counts.TryGetValue(label, out int seenSoFar);
        counts[label] = seenSoFar + 1;
    }

    return counts;
}
/// <summary>
/// Wraps every occurrence of the given keywords in <paramref name="text"/> with the
/// configured highlight markers.
/// </summary>
/// <remarks>
/// Whole-word matching is enforced only on the side of a keyword that starts or ends
/// with an ASCII letter, digit or underscore. The regex <c>\b</c> anchor treats CJK
/// characters as word characters, so it never matches a Chinese keyword embedded in
/// Chinese text. All keywords are applied in a single pass, longest first, so
/// overlapping keywords are not wrapped twice. Keywords are always treated as
/// literal text.
/// </remarks>
/// <param name="text">Text to decorate. Null or empty is returned unchanged.</param>
/// <param name="keywords">Keywords to highlight. Blank entries are ignored.</param>
/// <returns>The text with highlight markers inserted.</returns>
private string HighlightKeywords(string text, string[] keywords)
{
    if (string.IsNullOrEmpty(text) || keywords == null)
    {
        return text;
    }

    static bool IsAsciiWordChar(char c) => c < 128 && (char.IsLetterOrDigit(c) || c == '_');

    var alternatives = keywords
        .Where(k => !string.IsNullOrWhiteSpace(k))
        .Distinct()
        .OrderByDescending(k => k.Length) // longest first so the alternation prefers the widest keyword
        .Select(k =>
        {
            var left = IsAsciiWordChar(k[0]) ? "(?<![A-Za-z0-9_])" : string.Empty;
            var right = IsAsciiWordChar(k[k.Length - 1]) ? "(?![A-Za-z0-9_])" : string.Empty;
            return "(?:" + left + Regex.Escape(k) + right + ")";
        })
        .ToList();

    if (alternatives.Count == 0)
    {
        return text;
    }

    var regexOptions = _config.IgnoreCase ? RegexOptions.IgnoreCase : RegexOptions.None;

    // A MatchEvaluator is used so that '$' in the markers is never interpreted as a substitution.
    return Regex.Replace(
        text,
        string.Join("|", alternatives),
        match => $"{_config.HighlightStart}{match.Value}{_config.HighlightEnd}",
        regexOptions);
}
/// <summary>
/// Wraps every occurrence of <paramref name="matchedText"/> inside
/// <paramref name="context"/> with the configured highlight markers.
/// </summary>
/// <remarks>
/// The characters actually present in the context are kept, so a case-insensitive
/// match does not rewrite the original casing. An empty
/// <paramref name="matchedText"/> leaves the context unchanged instead of throwing.
/// </remarks>
/// <param name="context">Context window to decorate.</param>
/// <param name="matchedText">Text to look for (ordinal; case-insensitive when configured).</param>
/// <returns>The decorated context, or the original context when nothing matched.</returns>
private string HighlightInContext(string context, string matchedText)
{
    if (string.IsNullOrEmpty(context) || string.IsNullOrEmpty(matchedText))
    {
        return context;
    }

    var comparison = _config.IgnoreCase ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal;
    var builder = new StringBuilder(context.Length + 16);
    int cursor = 0;

    while (cursor <= context.Length)
    {
        int hit = context.IndexOf(matchedText, cursor, comparison);
        if (hit < 0)
        {
            break;
        }

        builder.Append(context, cursor, hit - cursor)
               .Append(_config.HighlightStart)
               .Append(context, hit, matchedText.Length) // original casing from the context
               .Append(_config.HighlightEnd);
        cursor = hit + matchedText.Length;
    }

    if (cursor == 0)
    {
        return context; // no occurrence found
    }

    builder.Append(context, cursor, context.Length - cursor);
    return builder.ToString();
}
// Helper methods

/// <summary>
/// Collapses every run of whitespace in <paramref name="context"/> to a single space
/// and trims both ends.
/// </summary>
private string CleanContext(string context)
{
    var collapsed = Regex.Replace(context, @"\s+", " ");
    return collapsed.Trim();
}
/// <summary>
/// Returns a similarity in [0, 1] derived from the Levenshtein distance:
/// (longerLength - distance) / longerLength. Identical strings score 1.0.
/// </summary>
private double CalculateStringSimilarity(string str1, string str2)
{
    if (string.Equals(str1, str2))
    {
        return 1.0;
    }

    bool firstIsLonger = str1.Length > str2.Length;
    string longer = firstIsLonger ? str1 : str2;
    string shorter = firstIsLonger ? str2 : str1;

    int longerLength = longer.Length;
    if (longerLength == 0)
    {
        return 1.0;
    }

    int distance = CalculateLevenshteinDistance(longer, shorter);
    return (longerLength - distance) / (double)longerLength;
}
/// <summary>
/// Computes the Levenshtein (edit) distance between <paramref name="str1"/> and
/// <paramref name="str2"/> with unit costs for insertion, deletion and substitution.
/// </summary>
private int CalculateLevenshteinDistance(string str1, string str2)
{
    int rows = str1.Length;
    int cols = str2.Length;

    // Only the previous and the current row of the DP table are needed.
    var previous = new int[cols + 1];
    var current = new int[cols + 1];
    for (int c = 0; c <= cols; c++)
    {
        previous[c] = c;
    }

    for (int r = 1; r <= rows; r++)
    {
        current[0] = r;
        for (int c = 1; c <= cols; c++)
        {
            int substitution = previous[c - 1] + (str1[r - 1] == str2[c - 1] ? 0 : 1);
            int deletion = previous[c] + 1;
            int insertion = current[c - 1] + 1;
            current[c] = Math.Min(substitution, Math.Min(deletion, insertion));
        }

        (previous, current) = (current, previous);
    }

    return previous[cols];
}
/// <summary>
/// Keeps the first hit for each distinct context (compared after trimming and
/// lower-casing) and preserves the original order.
/// </summary>
private List<SearchResultItem> RemoveDuplicateResults(List<SearchResultItem> results)
{
    var seenContexts = new HashSet<string>();

    // HashSet.Add returns false for a key that is already present, which filters repeats.
    return results
        .Where(item => seenContexts.Add(item.Context.Trim().ToLower()))
        .ToList();
}
/// <summary>
/// Returns the character offset of the <paramref name="wordIndex"/>-th word of
/// <paramref name="text"/>, or -1 when there is no such word or it differs from
/// <paramref name="word"/>.
/// </summary>
/// <remarks>
/// Words are delimited by whitespace and basic punctuation, the same delimiter set
/// the fuzzy search uses to number its words, so the index refers to the same token.
/// Comparison is ordinal.
/// </remarks>
/// <param name="text">Full text.</param>
/// <param name="word">Expected word at that index.</param>
/// <param name="wordIndex">Zero-based index of the word among the tokens of the text.</param>
/// <returns>Zero-based offset of the word, or -1.</returns>
private int FindWordPosition(string text, string word, int wordIndex)
{
    if (string.IsNullOrEmpty(text) || string.IsNullOrEmpty(word) || wordIndex < 0)
    {
        return -1;
    }

    var delimiters = new[] { ' ', '\t', '\r', '\n', '.', ',', '!', '?', ';', ':' };
    int tokenIndex = -1;
    int position = 0;

    while (position < text.Length)
    {
        if (Array.IndexOf(delimiters, text[position]) >= 0)
        {
            position++;
            continue;
        }

        int tokenStart = position;
        while (position < text.Length && Array.IndexOf(delimiters, text[position]) < 0)
        {
            position++;
        }

        tokenIndex++;
        if (tokenIndex == wordIndex)
        {
            int tokenLength = position - tokenStart;
            bool sameWord = tokenLength == word.Length
                && string.CompareOrdinal(text, tokenStart, word, 0, tokenLength) == 0;
            return sameWord ? tokenStart : -1;
        }
    }

    return -1;
}
/// <summary>
/// Splits <paramref name="text"/> into sentences at '.', '!' and '?' as well as the
/// CJK terminators '。', '!' and '?'. Sentences of 10 characters or fewer (after
/// trimming) are dropped.
/// </summary>
/// <remarks>
/// The reported positions delimit the trimmed sentence, i.e.
/// <c>text.Substring(StartPosition, EndPosition - StartPosition)</c> equals <c>Text</c>.
/// </remarks>
/// <param name="text">Text to split. Null or empty yields an empty array.</param>
/// <returns>Sentences in text order with their start (inclusive) and end (exclusive) offsets.</returns>
private (string Text, int StartPosition, int EndPosition)[] SplitIntoSentences(string text)
{
    const int minSentenceLength = 10; // sentences must be longer than this

    var sentences = new List<(string, int, int)>();
    if (string.IsNullOrEmpty(text))
    {
        return sentences.ToArray();
    }

    // ASCII terminators plus ideographic full stop, full-width '!' and full-width '?'.
    var sentenceEnders = new HashSet<char> { '.', '!', '?', '\u3002', '\uFF01', '\uFF1F' };

    // Adds text[from, toExclusive) after shrinking the range past surrounding whitespace.
    void AddSentence(int from, int toExclusive)
    {
        while (from < toExclusive && char.IsWhiteSpace(text[from]))
        {
            from++;
        }
        while (toExclusive > from && char.IsWhiteSpace(text[toExclusive - 1]))
        {
            toExclusive--;
        }
        if (toExclusive - from > minSentenceLength)
        {
            sentences.Add((text.Substring(from, toExclusive - from), from, toExclusive));
        }
    }

    int start = 0;
    for (int i = 0; i < text.Length; i++)
    {
        if (sentenceEnders.Contains(text[i]))
        {
            AddSentence(start, i + 1);
            start = i + 1;
        }
    }

    // Trailing text without a terminator, if any.
    if (start < text.Length)
    {
        AddSentence(start, text.Length);
    }

    return sentences.ToArray();
}
/// <summary>
/// Simplified semantic relevance: the fraction of the space-separated words of
/// <paramref name="keyword"/> that also occur as words in <paramref name="text"/>
/// (case-insensitive).
/// </summary>
/// <param name="keyword">One or more words separated by spaces.</param>
/// <param name="text">Text whose words are split on whitespace and basic punctuation.</param>
/// <returns>A value in [0, 1]; 0 when the keyword has no words or the text is empty.</returns>
private double CalculateSemanticRelevance(string keyword, string text)
{
    if (string.IsNullOrWhiteSpace(keyword) || string.IsNullOrEmpty(text))
    {
        return 0.0;
    }

    var keywordWords = keyword.ToLowerInvariant().Split(' ', StringSplitOptions.RemoveEmptyEntries);
    if (keywordWords.Length == 0)
    {
        return 0.0; // avoids 0/0 (NaN)
    }

    // A set gives O(1) membership checks per keyword word.
    var textWords = new HashSet<string>(
        text.ToLowerInvariant().Split(
            new[] { ' ', '\t', '\r', '\n', '.', ',', '!', '?', ';', ':' },
            StringSplitOptions.RemoveEmptyEntries),
        StringComparer.Ordinal);

    var matchCount = keywordWords.Count(kw => textWords.Contains(kw));
    return matchCount / (double)keywordWords.Length;
}
}
// 示例程序
class Program
{
// Demo entry point: runs three sample searches (contains, fuzzy, regex) over a
// built-in Chinese sample text, prints each summary, then starts the interactive mode.
// The same SearchConfig instance is reused; only its SearchMode is changed between examples.
static async Task Main(string[] args)
{
Console.WriteLine("=== 智能文本搜索摘要生成器 ===\n");
// Sample text
var sampleText = @"
人工智能是计算机科学的一个分支,它致力于理解智能的实质,并生产出一种新的能以人类智能相似的方式作出反应的智能机器。
机器学习是人工智能的一个重要分支,它通过算法使机器能够从数据中学习并做出决策或预测。深度学习是机器学习的一个子集,
使用多层神经网络来模拟人脑的工作方式。
自然语言处理是人工智能的另一个重要领域,它使计算机能够理解、解释和生成人类语言。计算机视觉则让机器能够识别和理解图像和视频内容。
目前,人工智能技术在各个领域都有广泛应用,包括医疗诊断、自动驾驶、智能客服、推荐系统等。随着技术的不断发展,
人工智能将在未来发挥更重要的作用,改变我们的生活和工作方式。
然而,人工智能的发展也带来了一些挑战,如就业影响、隐私保护、算法偏见等问题需要我们认真考虑和解决。
我们需要在推动技术发展的同时,确保人工智能的发展符合人类的利益和价值观。
";
// Configure the search parameters
var config = new SearchConfig
{
SearchMode = SearchMode.ContainsMatch,
ContextLength = 150,
MaxResults = 10,
MaxSummaryLength = 500,
IgnoreCase = true,
RemoveDuplicates = true,
HighlightKeywords = true,
HighlightStart = "【",
HighlightEnd = "】",
SortByRelevance = true,
IncludeLineNumbers = false
};
var summarizer = new TextSearchSummarizer(config);
// Example 1: basic keyword search
Console.WriteLine("示例1: 基本关键词搜索");
var keywords1 = new[] { "人工智能", "机器学习", "深度学习" };
var result1 = await summarizer.SearchAndSummarizeAsync(sampleText, keywords1);
result1.DisplaySummary();
Console.WriteLine("\n" + new string('=', 60) + "\n");
// Example 2: fuzzy-match search
Console.WriteLine("示例2: 模糊匹配搜索");
config.SearchMode = SearchMode.FuzzyMatch;
var summarizer2 = new TextSearchSummarizer(config);
var keywords2 = new[] { "技术", "应用", "发展" };
var result2 = await summarizer2.SearchAndSummarizeAsync(sampleText, keywords2);
result2.DisplaySummary();
Console.WriteLine("\n" + new string('=', 60) + "\n");
// Example 3: regular-expression search
Console.WriteLine("示例3: 正则表达式搜索");
config.SearchMode = SearchMode.RegexMatch;
var summarizer3 = new TextSearchSummarizer(config);
var keywords3 = new[] { @"智能\w*", @"\w*学习" }; // matches words starting with "智能" and words ending with "学习"
var result3 = await summarizer3.SearchAndSummarizeAsync(sampleText, keywords3);
result3.DisplaySummary();
// Interactive mode
await InteractiveMode();
}
/// <summary>
/// Interactive console loop: reads a text (or a path to a text file), a list of
/// keywords and a search mode, then prints the summary. Ends on empty input, end of
/// input, or "exit" (case-insensitive).
/// </summary>
/// <remarks>
/// File-read failures are reported and the loop continues instead of terminating
/// the program. Keywords may be separated by the ASCII comma or the full-width
/// comma (,).
/// </remarks>
static async Task InteractiveMode()
{
    Console.WriteLine("\n=== 交互模式 ===");
    Console.WriteLine("输入 'exit' 退出程序");

    var config = new SearchConfig
    {
        HighlightStart = "**",
        HighlightEnd = "**"
    };

    while (true)
    {
        Console.Write("\n请输入要搜索的文本内容(或文件路径): ");
        var textInput = Console.ReadLine();
        if (string.IsNullOrEmpty(textInput)
            || string.Equals(textInput.Trim(), "exit", StringComparison.OrdinalIgnoreCase))
        {
            break;
        }

        // Treat the input as a file path when such a file exists.
        string searchText;
        if (File.Exists(textInput))
        {
            try
            {
                searchText = await File.ReadAllTextAsync(textInput, Encoding.UTF8);
            }
            catch (Exception ex) when (ex is IOException || ex is UnauthorizedAccessException)
            {
                // The file can disappear or be locked between the existence check and the read.
                Console.WriteLine($"处理失败: {ex.Message}");
                continue;
            }
            Console.WriteLine($"已读取文件: {Path.GetFileName(textInput)}");
        }
        else
        {
            searchText = textInput;
        }

        Console.Write("请输入关键词(用逗号分隔): ");
        var keywordInput = Console.ReadLine();
        var keywords = (keywordInput ?? string.Empty)
            .Split(new[] { ',', '\uFF0C' }, StringSplitOptions.RemoveEmptyEntries)
            .Select(k => k.Trim())
            .Where(k => k.Length > 0)
            .ToArray();
        if (keywords.Length == 0)
        {
            Console.WriteLine("关键词不能为空。");
            continue;
        }

        Console.Write("选择搜索模式 (1=精确 2=包含 3=模糊 4=正则): ");
        var modeInput = Console.ReadLine();
        config.SearchMode = modeInput switch
        {
            "1" => SearchMode.ExactMatch,
            "2" => SearchMode.ContainsMatch,
            "3" => SearchMode.FuzzyMatch,
            "4" => SearchMode.RegexMatch,
            _ => SearchMode.ContainsMatch
        };

        try
        {
            var summarizer = new TextSearchSummarizer(config);
            var result = await summarizer.SearchAndSummarizeAsync(searchText, keywords);

            Console.WriteLine("\n" + new string('-', 50));
            result.DisplaySummary();

            if (!string.IsNullOrEmpty(result.HighlightedSummary))
            {
                Console.WriteLine("\n高亮摘要:");
                Console.WriteLine(result.HighlightedSummary);
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine($"处理失败: {ex.Message}");
        }
    }
}
}
}
具体用法:
下面是一个功能强大的智能文本搜索摘要生成器,支持多种搜索策略和智能摘要组织。
🔍 核心功能特性
5种搜索模式
- 精确匹配 – 完全匹配关键词
- 包含匹配 – 文本包含关键词
- 模糊匹配 – 基于相似度的匹配
- 正则表达式 – 支持复杂的模式匹配
- 语义匹配 – 基于语义相关性的匹配
智能摘要生成
- 自动提取相关上下文
- 去除重复内容
- 按相关度排序
- 控制摘要长度
- 关键词高亮显示
🚀 使用示例
基本用法:
var config = new SearchConfig
{
SearchMode = SearchMode.ContainsMatch,
ContextLength = 200, // 上下文长度
MaxSummaryLength = 1000, // 最大摘要长度
HighlightKeywords = true // 关键词高亮
};
var summarizer = new TextSearchSummarizer(config);
var keywords = new[] { "人工智能", "机器学习", "深度学习" };
var result = await summarizer.SearchAndSummarizeAsync(largeText, keywords);
result.DisplaySummary();
高级配置:
var config = new SearchConfig
{
SearchMode = SearchMode.FuzzyMatch, // 模糊匹配
ContextLength = 300, // 更长的上下文
MaxResults = 20, // 最大结果数
MinRelevanceScore = 0.5, // 最小相关度阈值
RemoveDuplicates = true, // 自动去重
HighlightStart = "**", // 自定义高亮标记
HighlightEnd = "**",
SortByRelevance = true, // 按相关度排序
IncludeLineNumbers = true // 包含行号信息
};
📊 输出结果示例
运行后会得到详细的分析结果:
=== 搜索摘要结果 ===
总匹配数: 8
平均相关度: 0.85
处理时间: 125 ms
摘要长度: 456 字符
关键词频率:
人工智能: 5 次
机器学习: 3 次
深度学习: 2 次
摘要内容:
**人工智能**是计算机科学的一个分支,它致力于理解智能的实质...
**机器学习**是**人工智能**的一个重要分支,它通过算法使机器能够从数据中学习...
🎯 特色功能
智能上下文提取
- 自动提取关键词周围的相关内容
- 可配置上下文长度
- 智能句子边界检测
相关性评分
- 精确匹配:1.0分
- 模糊匹配:基于字符串相似度
- 语义匹配:基于关键词共现
去重和优化
- 自动移除重复的上下文
- 按相关度和内容质量排序
- 长度优化和截断
📁 实际应用场景
1. 文档分析
// 分析技术文档
var keywords = new[] { "API", "接口", "认证", "权限" };
var result = await summarizer.SearchAndSummarizeAsync(technicalDoc, keywords);
2. 新闻摘要
// 提取新闻要点
var keywords = new[] { "经济", "增长", "政策", "影响" };
var result = await summarizer.SearchAndSummarizeAsync(newsArticle, keywords);
3. 学术研究
// 研究论文关键信息提取
config.SearchMode = SearchMode.SemanticMatch;
var keywords = new[] { "实验结果", "数据分析", "结论" };
var result = await summarizer.SearchAndSummarizeAsync(researchPaper, keywords);
4. 正则表达式搜索
// 查找特定模式
config.SearchMode = SearchMode.RegexMatch;
var patterns = new[] { @"\d{4}-\d{2}-\d{2}", @"[A-Z]\w+ing" }; // 日期和-ing结尾词
var result = await summarizer.SearchAndSummarizeAsync(text, patterns);
🔧 交互式模式
程序还包含交互式命令行界面:
- 支持直接输入文本或文件路径
- 实时选择搜索模式
- 动态配置参数
- 即时查看结果
⚡ 性能特点
- 异步处理:搜索通过 Task.Run 在后台线程执行,不阻塞调用线程(各关键词仍按顺序依次搜索)
- 内存优化:智能的上下文提取避免全文复制
- 性能监控:提供详细的处理时间统计
- 可配置性:丰富的参数控制搜索行为
这个解决方案完全不需要任何付费组件,可以直接用于生产环境,适合各种文本分析和信息提取任务!
带敏感词过滤的版本:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;namespace destlive.developer.server.Helpers
{
// 搜索模式枚举
public enum SearchMode
{
ExactMatch, // 精确匹配
ContainsMatch, // 包含匹配
FuzzyMatch, // 模糊匹配
RegexMatch, // 正则表达式匹配
SemanticMatch // 语义匹配(基于关键词相关性)
}

// Builds "whole word" regex patterns that also work inside CJK text.
// The classic \b anchor treats CJK ideographs as word characters, so a term
// written between two ideographs can never match with \b. Here a boundary is
// only enforced against word characters outside the listed CJK/kana/Hangul blocks.
internal static class WordBoundaryPattern
{
    // \w minus the CJK, kana and Hangul blocks (character class subtraction).
    private const string NonCjkWordCharClass =
        @"[\w-[\p{IsCJKUnifiedIdeographs}\p{IsCJKUnifiedIdeographsExtensionA}\p{IsHiragana}\p{IsKatakana}\p{IsHangulSyllables}]]";

    private static readonly Regex NonCjkWordChar =
        new Regex("^" + NonCjkWordCharClass + "$", RegexOptions.CultureInvariant);

    // Wraps corePattern (an already valid regex for `term`) with look-arounds.
    // A look-around is added on a side only when the term's character on that
    // side is itself a non-CJK word character; terms that begin or end with a
    // symbol or an ideograph get no boundary on that side.
    public static string Wrap(string term, string corePattern)
    {
        if (string.IsNullOrEmpty(term))
        {
            return corePattern;
        }

        var prefix = NeedsBoundary(term[0]) ? $"(?<!{NonCjkWordCharClass})" : string.Empty;
        var suffix = NeedsBoundary(term[term.Length - 1]) ? $"(?!{NonCjkWordCharClass})" : string.Empty;
        return $"{prefix}(?:{corePattern}){suffix}";
    }

    private static bool NeedsBoundary(char c) => NonCjkWordChar.IsMatch(c.ToString());
}

// Strategy used to replace a detected sensitive word.
public enum SensitiveWordReplaceStrategy
{
    FullReplace,    // replace the whole word with FullReplaceText (e.g. ***)
    LengthReplace,  // one mask character per character of the matched word
    PartialReplace, // keep the first and last character, mask the middle
    CustomReplace   // replace the whole word with CustomReplaceChar
}

// Configuration for sensitive-word filtering.
public class SensitiveWordConfig
{
    public string[] SensitiveWords { get; set; } = Array.Empty<string>();
    public SensitiveWordReplaceStrategy ReplaceStrategy { get; set; } = SensitiveWordReplaceStrategy.FullReplace;
    public string CustomReplaceChar { get; set; } = "*";
    public string FullReplaceText { get; set; } = "***";
    public bool IgnoreCase { get; set; } = true;
    public bool EnableVariantDetection { get; set; } = false; // detect variants (digits/symbols standing in for letters)
    public bool FilterInSearch { get; set; } = true;  // filter the text before searching
    public bool FilterInSummary { get; set; } = true; // filter the text placed in the filtered summary
}

// Detects and replaces configured sensitive words.
// Detection and replacement share one regex per word so they always agree.
public class SensitiveWordFilter
{
    private const char DefaultMaskChar = '*';

    private readonly SensitiveWordConfig _config;
    private readonly Dictionary<string, string> _variantMap; // substitute character -> letter it stands for
    private readonly Dictionary<char, List<string>> _substitutesByLetter; // letter -> substitute characters

    public SensitiveWordFilter(SensitiveWordConfig config)
    {
        _config = config ?? throw new ArgumentNullException(nameof(config));
        _variantMap = InitializeVariantMap();
        _substitutesByLetter = _variantMap
            .GroupBy(pair => pair.Value[0])
            .ToDictionary(group => group.Key, group => group.Select(pair => pair.Key).ToList());
    }

    // Initializes the substitute-character table used for variant detection.
    private Dictionary<string, string> InitializeVariantMap()
    {
        return new Dictionary<string, string>
        {
            { "0", "o" }, { "1", "i" }, { "3", "e" }, { "4", "a" }, { "5", "s" },
            { "6", "g" }, { "7", "t" }, { "8", "b" }, { "@", "a" }, { "$", "s" },
            { "!", "i" }, { "#", "h" }, { "&", "a" }, { "+", "t" }
        };
    }

    // Regex options derived from the configuration at call time.
    private RegexOptions Options =>
        RegexOptions.CultureInvariant | (_config.IgnoreCase ? RegexOptions.IgnoreCase : RegexOptions.None);

    // Mask character for LengthReplace/PartialReplace; falls back to '*' when
    // CustomReplaceChar is null or empty instead of indexing into it.
    private char MaskChar =>
        string.IsNullOrEmpty(_config.CustomReplaceChar) ? DefaultMaskChar : _config.CustomReplaceChar[0];

    // Configured words, skipping null/blank entries (a null array is treated as empty).
    private IEnumerable<string> ActiveWords() =>
        (_config.SensitiveWords ?? Array.Empty<string>()).Where(word => !string.IsNullOrWhiteSpace(word));

    // Builds the regex for one sensitive word. With variant detection enabled,
    // every letter that has substitutes becomes an alternation of the letter and
    // its substitutes, so any combination of substitutions matches (e.g. 3xtr3m3).
    private string BuildPattern(string sensitiveWord)
    {
        var core = new StringBuilder();
        foreach (var ch in sensitiveWord)
        {
            var literal = Regex.Escape(ch.ToString());
            if (_config.EnableVariantDetection
                && _substitutesByLetter.TryGetValue(char.ToLowerInvariant(ch), out var substitutes))
            {
                core.Append("(?:").Append(literal);
                foreach (var substitute in substitutes)
                {
                    core.Append('|').Append(Regex.Escape(substitute));
                }
                core.Append(')');
            }
            else
            {
                core.Append(literal);
            }
        }

        return WordBoundaryPattern.Wrap(sensitiveWord, core.ToString());
    }

    // Returns text with every configured sensitive word (and, if enabled, its
    // variants) replaced according to the configured strategy.
    public string FilterText(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return text;
        }

        var filteredText = text;
        foreach (var sensitiveWord in ActiveWords())
        {
            filteredText = ReplaceSensitiveWord(filteredText, sensitiveWord);
        }

        return filteredText;
    }

    // Replaces all occurrences of a single sensitive word.
    private string ReplaceSensitiveWord(string text, string sensitiveWord)
    {
        return Regex.Replace(text, BuildPattern(sensitiveWord), match => GetReplacement(match.Value), Options);
    }

    // Replacement text for one matched occurrence.
    private string GetReplacement(string matched)
    {
        return _config.ReplaceStrategy switch
        {
            SensitiveWordReplaceStrategy.FullReplace => _config.FullReplaceText ?? string.Empty,
            SensitiveWordReplaceStrategy.LengthReplace => new string(MaskChar, matched.Length),
            SensitiveWordReplaceStrategy.PartialReplace => GetPartialReplacement(matched),
            SensitiveWordReplaceStrategy.CustomReplace => _config.CustomReplaceChar ?? string.Empty,
            _ => _config.FullReplaceText ?? string.Empty
        };
    }

    // Keeps the first and last character and masks the middle; words of one or
    // two characters are masked completely.
    private string GetPartialReplacement(string word)
    {
        if (word.Length <= 2)
        {
            return new string(MaskChar, word.Length);
        }

        var middle = new string(MaskChar, word.Length - 2);
        return $"{word[0]}{middle}{word[word.Length - 1]}";
    }

    // Returns true when the text contains at least one configured sensitive word.
    public bool ContainsSensitiveWords(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return false;
        }

        return ActiveWords().Any(word => Regex.IsMatch(text, BuildPattern(word), Options));
    }

    // Returns the distinct sensitive words found in the text. An occurrence that
    // equals the configured word is reported with the configured spelling; a
    // variant occurrence is reported as it appears in the text.
    public List<string> GetDetectedSensitiveWords(string text)
    {
        var detectedWords = new List<string>();
        if (string.IsNullOrEmpty(text))
        {
            return detectedWords;
        }

        var comparison = _config.IgnoreCase ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal;
        var comparer = _config.IgnoreCase ? StringComparer.OrdinalIgnoreCase : StringComparer.Ordinal;

        foreach (var sensitiveWord in ActiveWords())
        {
            foreach (Match match in Regex.Matches(text, BuildPattern(sensitiveWord), Options))
            {
                detectedWords.Add(string.Equals(match.Value, sensitiveWord, comparison)
                    ? sensitiveWord
                    : match.Value);
            }
        }

        return detectedWords.Distinct(comparer).ToList();
    }
}

// A single search hit.
public class SearchResultItem
{
    public string MatchedText { get; set; } = string.Empty;
    public string Context { get; set; } = string.Empty;
    public string HighlightedContext { get; set; } = string.Empty;
    public string FilteredContext { get; set; } = string.Empty; // context after sensitive-word filtering
    public int StartPosition { get; set; }
    public int EndPosition { get; set; }
    public double RelevanceScore { get; set; }
    public string MatchedKeyword { get; set; } = string.Empty;
    public int ContextLength { get; set; }
    public int LineNumber { get; set; }
    public bool ContainsSensitiveWords { get; set; } // whether the context contains sensitive words
    public List<string> DetectedSensitiveWords { get; set; } = new(); // sensitive words found in the context
}

// Result of a search-and-summarize run.
public class SummaryResult
{
    public string Summary { get; set; } = string.Empty;
    public string FilteredSummary { get; set; } = string.Empty; // summary after sensitive-word filtering
    public List<SearchResultItem> MatchedItems { get; set; } = new();
    public Dictionary<string, int> KeywordFrequency { get; set; } = new();
    public Dictionary<string, int> SensitiveWordFrequency { get; set; } = new(); // sensitive-word counts
    public int TotalMatches { get; set; }
    public int TotalSensitiveWordsDetected { get; set; } // sum of SensitiveWordFrequency values
    public double AverageRelevanceScore { get; set; }
    public TimeSpan ProcessingTime { get; set; }
    public string HighlightedSummary { get; set; } = string.Empty;
    public List<string> AllDetectedSensitiveWords { get; set; } = new(); // distinct sensitive words over all items

    // Writes the statistics and the filtered summary to the console.
    public void DisplaySummary()
    {
        Console.WriteLine($"=== 搜索摘要结果 ===");
        Console.WriteLine($"总匹配数: {TotalMatches}");
        Console.WriteLine($"检测到敏感词: {TotalSensitiveWordsDetected} 个");
        Console.WriteLine($"平均相关度: {AverageRelevanceScore:F2}");
        Console.WriteLine($"处理时间: {ProcessingTime.TotalMilliseconds:F0} ms");
        Console.WriteLine($"摘要长度: {Summary.Length} 字符");
        Console.WriteLine($"过滤后摘要长度: {FilteredSummary.Length} 字符\n");

        if (SensitiveWordFrequency.Any())
        {
            Console.WriteLine($"检测到的敏感词频率:");
            foreach (var kv in SensitiveWordFrequency.OrderByDescending(x => x.Value))
            {
                Console.WriteLine($" {kv.Key}: {kv.Value} 次");
            }
            Console.WriteLine();
        }

        Console.WriteLine($"关键词频率:");
        foreach (var kv in KeywordFrequency.OrderByDescending(x => x.Value))
        {
            Console.WriteLine($" {kv.Key}: {kv.Value} 次");
        }

        Console.WriteLine($"\n过滤后摘要内容:");
        Console.WriteLine($"{FilteredSummary}");
    }
}

// Search configuration.
public class SearchConfig
{
    public SearchMode SearchMode { get; set; } = SearchMode.ContainsMatch;
    public int ContextLength { get; set; } = 200;         // characters of context around a hit
    public int MaxResults { get; set; } = 100;            // maximum number of results
    public int MaxSummaryLength { get; set; } = 1000;     // maximum summary length
    public bool IgnoreCase { get; set; } = true;          // case-insensitive matching
    public bool RemoveDuplicates { get; set; } = true;    // drop results with identical context
    public double MinRelevanceScore { get; set; } = 0.0;  // minimum relevance threshold
    public bool HighlightKeywords { get; set; } = true;   // highlight keywords
    public string HighlightStart { get; set; } = "**";    // highlight opening marker
    public string HighlightEnd { get; set; } = "**";      // highlight closing marker
    public bool SortByRelevance { get; set; } = true;     // sort by relevance
    public bool IncludeLineNumbers { get; set; } = false; // include line numbers

    // Sensitive-word filtering
    public SensitiveWordConfig SensitiveWordConfig { get; set; } = new();
    public bool EnableSensitiveWordFilter { get; set; } = false; // whether filtering is enabled
}

// Searches a text for keywords and builds a summary from the surrounding context.
public class TextSearchSummarizer
{
    private const double FuzzySimilarityThreshold = 0.7;
    private const double SemanticRelevanceThreshold = 0.3;
    private const int MinSentenceLength = 10;

    private static readonly char[] WordDelimiters = { ' ', '\t', '\r', '\n', '.', ',', '!', '?', ';', ':' };

    // ASCII terminators plus the full-width ones used in CJK text.
    private static readonly char[] SentenceEnders = { '.', '!', '?', '。', '!', '?' };

    private readonly SearchConfig _config;
    private readonly SensitiveWordFilter? _sensitiveWordFilter;

    public TextSearchSummarizer(SearchConfig? config = null)
    {
        _config = config ?? new SearchConfig();

        // Create the filter only when filtering is enabled and words are configured.
        if (_config.EnableSensitiveWordFilter
            && _config.SensitiveWordConfig?.SensitiveWords is { Length: > 0 })
        {
            _sensitiveWordFilter = new SensitiveWordFilter(_config.SensitiveWordConfig);
        }
    }

    private RegexOptions KeywordRegexOptions =>
        RegexOptions.CultureInvariant | (_config.IgnoreCase ? RegexOptions.IgnoreCase : RegexOptions.None);

    // Runs the search and produces the summary plus statistics.
    // Returns an empty result when text or keywords are null/empty.
    // Any failure is rethrown as InvalidOperationException with the cause attached.
    public async Task<SummaryResult> SearchAndSummarizeAsync(string text, string[] keywords)
    {
        // Stopwatch is monotonic; wall-clock subtraction can be skewed by clock changes.
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        var result = new SummaryResult();

        try
        {
            if (string.IsNullOrEmpty(text) || keywords == null || keywords.Length == 0)
            {
                return result;
            }

            // Optionally filter sensitive words before searching.
            var searchText = text;
            if (_sensitiveWordFilter != null && _config.SensitiveWordConfig.FilterInSearch)
            {
                searchText = _sensitiveWordFilter.FilterText(text);
            }

            // 1. Search
            var searchResults = await SearchTextAsync(searchText, keywords);

            // 2. Filter and sort
            var filteredResults = FilterAndSortResults(searchResults);

            // 3. Summaries
            var (originalSummary, filteredSummary) = GenerateSummary(filteredResults);

            // 4. Statistics
            var keywordFreq = CalculateKeywordFrequency(filteredResults);
            var sensitiveWordFreq = CalculateSensitiveWordFrequency(filteredResults);

            // 5. Highlighted summary
            var highlightedSummary = _config.HighlightKeywords
                ? HighlightKeywords(filteredSummary, keywords)
                : filteredSummary;

            // 6. All detected sensitive words
            var allSensitiveWords = filteredResults
                .SelectMany(r => r.DetectedSensitiveWords)
                .Distinct()
                .ToList();

            result.Summary = originalSummary;
            result.FilteredSummary = filteredSummary;
            result.HighlightedSummary = highlightedSummary;
            result.MatchedItems = filteredResults;
            result.KeywordFrequency = keywordFreq;
            result.SensitiveWordFrequency = sensitiveWordFreq;
            result.TotalMatches = filteredResults.Count;
            result.TotalSensitiveWordsDetected = sensitiveWordFreq.Values.Sum();
            result.AllDetectedSensitiveWords = allSensitiveWords;
            result.AverageRelevanceScore = filteredResults.Any()
                ? filteredResults.Average(r => r.RelevanceScore)
                : 0;
            result.ProcessingTime = stopwatch.Elapsed;

            return result;
        }
        catch (Exception ex)
        {
            throw new InvalidOperationException($"搜索和摘要生成失败: {ex.Message}", ex);
        }
    }

    // Runs the configured search mode for every non-blank keyword on a worker thread.
    private async Task<List<SearchResultItem>> SearchTextAsync(string text, string[] keywords)
    {
        return await Task.Run(() =>
        {
            var results = new List<SearchResultItem>();
            var lineStartPositions = GetLineStartPositions(text);

            foreach (var keyword in keywords)
            {
                if (string.IsNullOrWhiteSpace(keyword))
                {
                    continue;
                }

                var keywordResults = _config.SearchMode switch
                {
                    SearchMode.ExactMatch => SearchExactMatch(text, keyword, lineStartPositions),
                    SearchMode.ContainsMatch => SearchContainsMatch(text, keyword, lineStartPositions),
                    SearchMode.FuzzyMatch => SearchFuzzyMatch(text, keyword, lineStartPositions),
                    SearchMode.RegexMatch => SearchRegexMatch(text, keyword, lineStartPositions),
                    SearchMode.SemanticMatch => SearchSemanticMatch(text, keyword, lineStartPositions),
                    _ => SearchContainsMatch(text, keyword, lineStartPositions)
                };

                results.AddRange(keywordResults);
            }

            return results;
        });
    }

    // Returns the index of the first character of every line (ascending; first entry is 0).
    private static int[] GetLineStartPositions(string text)
    {
        var positions = new List<int> { 0 };
        for (int i = 0; i < text.Length; i++)
        {
            if (text[i] == '\n')
            {
                positions.Add(i + 1);
            }
        }
        return positions.ToArray();
    }

    // Finds every (possibly overlapping) occurrence of the keyword.
    private List<SearchResultItem> SearchExactMatch(string text, string keyword, int[] lineStartPositions)
    {
        var results = new List<SearchResultItem>();
        var comparison = _config.IgnoreCase ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal;

        int startIndex = 0;
        while (startIndex < text.Length)
        {
            int foundIndex = text.IndexOf(keyword, startIndex, comparison);
            if (foundIndex == -1)
            {
                break;
            }

            // Record the text as it actually appears (its casing may differ from the keyword).
            var matchedText = text.Substring(foundIndex, keyword.Length);
            var item = CreateSearchResultItem(text, matchedText, foundIndex, foundIndex + keyword.Length, lineStartPositions);
            item.RelevanceScore = 1.0; // exact matches score highest
            item.MatchedKeyword = keyword;
            results.Add(item);

            startIndex = foundIndex + 1;
        }

        return results;
    }

    // Substring search; currently identical to exact matching.
    private List<SearchResultItem> SearchContainsMatch(string text, string keyword, int[] lineStartPositions)
    {
        return SearchExactMatch(text, keyword, lineStartPositions);
    }

    // Splits text on the given delimiters and yields each token with its start index,
    // so callers never have to search for the token's position again.
    private static IEnumerable<(string Word, int Position)> Tokenize(string text, char[] delimiters)
    {
        int tokenStart = -1;
        for (int i = 0; i <= text.Length; i++)
        {
            bool atDelimiter = i == text.Length || Array.IndexOf(delimiters, text[i]) >= 0;
            if (!atDelimiter)
            {
                if (tokenStart < 0)
                {
                    tokenStart = i;
                }
                continue;
            }

            if (tokenStart >= 0)
            {
                yield return (text.Substring(tokenStart, i - tokenStart), tokenStart);
                tokenStart = -1;
            }
        }
    }

    // Matches words whose edit-distance similarity to the keyword exceeds the threshold.
    private List<SearchResultItem> SearchFuzzyMatch(string text, string keyword, int[] lineStartPositions)
    {
        var results = new List<SearchResultItem>();
        var needle = _config.IgnoreCase ? keyword.ToLowerInvariant() : keyword;

        foreach (var (word, position) in Tokenize(text, WordDelimiters))
        {
            var candidate = _config.IgnoreCase ? word.ToLowerInvariant() : word;
            var similarity = CalculateStringSimilarity(needle, candidate);
            if (similarity <= FuzzySimilarityThreshold)
            {
                continue;
            }

            var item = CreateSearchResultItem(text, word, position, position + word.Length, lineStartPositions);
            item.RelevanceScore = similarity;
            item.MatchedKeyword = keyword;
            results.Add(item);
        }

        return results;
    }

    // Treats the keyword as a regular expression.
    // Throws InvalidOperationException when the pattern cannot be parsed.
    private List<SearchResultItem> SearchRegexMatch(string text, string pattern, int[] lineStartPositions)
    {
        Regex regex;
        try
        {
            regex = new Regex(pattern, _config.IgnoreCase ? RegexOptions.IgnoreCase : RegexOptions.None);
        }
        catch (ArgumentException ex)
        {
            throw new InvalidOperationException($"正则表达式无效: {pattern}, 错误: {ex.Message}", ex);
        }

        var results = new List<SearchResultItem>();
        foreach (Match match in regex.Matches(text))
        {
            // Patterns such as "a*" produce empty matches, which carry no text to report.
            if (match.Length == 0)
            {
                continue;
            }

            var item = CreateSearchResultItem(text, match.Value, match.Index, match.Index + match.Length, lineStartPositions);
            item.RelevanceScore = 0.8; // fixed score for regex hits
            item.MatchedKeyword = pattern;
            results.Add(item);
        }

        return results;
    }

    // Scores whole sentences by the share of keyword words they contain.
    private List<SearchResultItem> SearchSemanticMatch(string text, string keyword, int[] lineStartPositions)
    {
        var results = new List<SearchResultItem>();

        foreach (var sentence in SplitIntoSentences(text))
        {
            var relevance = CalculateSemanticRelevance(keyword, sentence.Text);
            if (relevance <= SemanticRelevanceThreshold)
            {
                continue;
            }

            var item = CreateSearchResultItem(text, sentence.Text, sentence.StartPosition, sentence.EndPosition, lineStartPositions);
            item.RelevanceScore = relevance;
            item.MatchedKeyword = keyword;
            results.Add(item);
        }

        return results;
    }

    // Builds a result item: extracts and cleans the context, highlights the match,
    // runs sensitive-word detection and resolves the line number.
    private SearchResultItem CreateSearchResultItem(string fullText, string matchedText, int startPos, int endPos, int[] lineStartPositions)
    {
        var contextStart = Math.Max(0, startPos - _config.ContextLength / 2);
        var contextEnd = Math.Min(fullText.Length, endPos + _config.ContextLength / 2);
        var context = CleanContext(fullText.Substring(contextStart, contextEnd - contextStart));

        var highlightedContext = _config.HighlightKeywords
            ? HighlightInContext(context, matchedText)
            : context;

        var filteredContext = context;
        var detectedSensitiveWords = new List<string>();
        if (_sensitiveWordFilter != null)
        {
            detectedSensitiveWords = _sensitiveWordFilter.GetDetectedSensitiveWords(context);
            if (detectedSensitiveWords.Count > 0)
            {
                filteredContext = _sensitiveWordFilter.FilterText(context);
            }
        }

        int lineNumber = 0;
        if (_config.IncludeLineNumbers)
        {
            // Line starts are sorted; on a miss the complement is the count of
            // line starts at or before startPos, i.e. the 1-based line number.
            int index = Array.BinarySearch(lineStartPositions, startPos);
            lineNumber = index >= 0 ? index + 1 : ~index;
        }

        return new SearchResultItem
        {
            MatchedText = matchedText,
            Context = context,
            HighlightedContext = highlightedContext,
            FilteredContext = filteredContext,
            StartPosition = startPos,
            EndPosition = endPos,
            ContextLength = context.Length,
            LineNumber = lineNumber,
            ContainsSensitiveWords = detectedSensitiveWords.Count > 0,
            DetectedSensitiveWords = detectedSensitiveWords
        };
    }

    // Applies the relevance threshold, de-duplication, ordering and result cap.
    private List<SearchResultItem> FilterAndSortResults(List<SearchResultItem> results)
    {
        var filteredResults = results
            .Where(r => r.RelevanceScore >= _config.MinRelevanceScore)
            .ToList();

        if (_config.RemoveDuplicates)
        {
            filteredResults = RemoveDuplicateResults(filteredResults);
        }

        if (_config.SortByRelevance)
        {
            filteredResults = filteredResults
                .OrderByDescending(r => r.RelevanceScore)
                .ThenByDescending(r => r.ContextLength)
                .ToList();
        }

        if (filteredResults.Count > _config.MaxResults)
        {
            filteredResults = filteredResults.Take(_config.MaxResults).ToList();
        }

        return filteredResults;
    }

    // Builds the original summary and the filtered summary side by side.
    // The filtered summary uses the filtered contexts only when a filter is
    // active and FilterInSummary is set; otherwise it mirrors the original.
    private (string originalSummary, string filteredSummary) GenerateSummary(List<SearchResultItem> results)
    {
        if (!results.Any())
        {
            return ("未找到相关内容。", "未找到相关内容。");
        }

        var filterSummary = _sensitiveWordFilter != null && _config.SensitiveWordConfig.FilterInSummary;
        var originalSummary = new StringBuilder();
        var filteredSummary = new StringBuilder();
        var usedContexts = new HashSet<string>();
        int currentLength = 0;

        foreach (var result in results)
        {
            if (currentLength >= _config.MaxSummaryLength)
            {
                break;
            }

            var originalContext = result.Context.Trim();
            if (!usedContexts.Add(originalContext))
            {
                continue; // this context is already in the summary
            }

            var originalAddition = originalContext;
            var filteredAddition = filterSummary ? result.FilteredContext.Trim() : originalContext;

            // Truncate the entry that would overflow the length budget.
            if (currentLength + originalAddition.Length > _config.MaxSummaryLength)
            {
                int keep = _config.MaxSummaryLength - currentLength;
                originalAddition = originalAddition.Substring(0, keep) + "...";
                if (filteredAddition.Length > keep)
                {
                    filteredAddition = filteredAddition.Substring(0, keep) + "...";
                }
            }

            if (originalSummary.Length > 0)
            {
                originalSummary.AppendLine().AppendLine();
                filteredSummary.AppendLine().AppendLine();
            }

            if (_config.IncludeLineNumbers && result.LineNumber > 0)
            {
                var lineInfo = $"[行 {result.LineNumber}] ";
                originalSummary.Append(lineInfo);
                filteredSummary.Append(lineInfo);
            }

            // Mark entries whose text was actually filtered.
            if (filterSummary && result.ContainsSensitiveWords)
            {
                filteredSummary.Append("[已过滤] ");
            }

            originalSummary.Append(originalAddition);
            filteredSummary.Append(filteredAddition);

            currentLength += originalAddition.Length + 4; // +4 for the separating line breaks
        }

        return (originalSummary.ToString().Trim(), filteredSummary.ToString().Trim());
    }

    // Counts, per lower-cased sensitive word, the number of results it was detected in.
    private Dictionary<string, int> CalculateSensitiveWordFrequency(List<SearchResultItem> results)
    {
        var frequency = new Dictionary<string, int>();

        foreach (var word in results.SelectMany(r => r.DetectedSensitiveWords).Select(w => w.ToLowerInvariant()))
        {
            frequency[word] = frequency.GetValueOrDefault(word, 0) + 1;
        }

        return frequency;
    }

    // Counts results per lower-cased keyword (falls back to the matched text
    // when a result carries no keyword).
    private Dictionary<string, int> CalculateKeywordFrequency(List<SearchResultItem> results)
    {
        var frequency = new Dictionary<string, int>();

        foreach (var result in results)
        {
            var keyword = string.IsNullOrEmpty(result.MatchedKeyword) ? result.MatchedText : result.MatchedKeyword;
            keyword = keyword.ToLowerInvariant();
            frequency[keyword] = frequency.GetValueOrDefault(keyword, 0) + 1;
        }

        return frequency;
    }

    // Wraps whole-word occurrences of each keyword in the highlight markers,
    // preserving the casing found in the text.
    private string HighlightKeywords(string text, string[] keywords)
    {
        var result = text;

        foreach (var keyword in keywords)
        {
            if (string.IsNullOrWhiteSpace(keyword))
            {
                continue;
            }

            var pattern = WordBoundaryPattern.Wrap(keyword, Regex.Escape(keyword));
            result = Regex.Replace(
                result,
                pattern,
                match => $"{_config.HighlightStart}{match.Value}{_config.HighlightEnd}",
                KeywordRegexOptions);
        }

        return result;
    }

    // Wraps every occurrence of matchedText inside the context in the highlight
    // markers, keeping the casing found in the context.
    private string HighlightInContext(string context, string matchedText)
    {
        if (string.IsNullOrEmpty(matchedText))
        {
            return context;
        }

        return Regex.Replace(
            context,
            Regex.Escape(matchedText),
            match => $"{_config.HighlightStart}{match.Value}{_config.HighlightEnd}",
            KeywordRegexOptions);
    }

    // Collapses every whitespace run into a single space and trims the ends.
    private static string CleanContext(string context)
    {
        return Regex.Replace(context, @"\s+", " ").Trim();
    }

    // Similarity in [0, 1]: 1 - editDistance / length of the longer string.
    private static double CalculateStringSimilarity(string str1, string str2)
    {
        if (str1 == str2)
        {
            return 1.0;
        }

        var longer = str1.Length > str2.Length ? str1 : str2;
        var shorter = str1.Length > str2.Length ? str2 : str1;
        if (longer.Length == 0)
        {
            return 1.0;
        }

        var editDistance = CalculateLevenshteinDistance(longer, shorter);
        return (longer.Length - editDistance) / (double)longer.Length;
    }

    // Classic dynamic-programming Levenshtein edit distance.
    private static int CalculateLevenshteinDistance(string str1, string str2)
    {
        var matrix = new int[str1.Length + 1, str2.Length + 1];

        for (int i = 0; i <= str1.Length; i++)
        {
            matrix[i, 0] = i;
        }
        for (int j = 0; j <= str2.Length; j++)
        {
            matrix[0, j] = j;
        }

        for (int i = 1; i <= str1.Length; i++)
        {
            for (int j = 1; j <= str2.Length; j++)
            {
                var cost = str1[i - 1] == str2[j - 1] ? 0 : 1;
                matrix[i, j] = Math.Min(
                    Math.Min(
                        matrix[i - 1, j] + 1,     // deletion
                        matrix[i, j - 1] + 1),    // insertion
                    matrix[i - 1, j - 1] + cost); // substitution
            }
        }

        return matrix[str1.Length, str2.Length];
    }

    // Keeps the first result for each distinct (trimmed, case-insensitive) context.
    private static List<SearchResultItem> RemoveDuplicateResults(List<SearchResultItem> results)
    {
        var seenContexts = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        return results.Where(result => seenContexts.Add(result.Context.Trim())).ToList();
    }

    // Splits text into sentences (terminator included) with their positions in the
    // text; sentences of MinSentenceLength characters or fewer are dropped.
    private static (string Text, int StartPosition, int EndPosition)[] SplitIntoSentences(string text)
    {
        var sentences = new List<(string, int, int)>();
        int start = 0;

        for (int i = 0; i < text.Length; i++)
        {
            if (Array.IndexOf(SentenceEnders, text[i]) < 0)
            {
                continue;
            }

            var sentence = text.Substring(start, i - start + 1).Trim();
            if (sentence.Length > MinSentenceLength)
            {
                sentences.Add((sentence, start, i + 1));
            }
            start = i + 1;
        }

        // Trailing text without a terminator.
        if (start < text.Length)
        {
            var lastSentence = text.Substring(start).Trim();
            if (lastSentence.Length > MinSentenceLength)
            {
                sentences.Add((lastSentence, start, text.Length));
            }
        }

        return sentences.ToArray();
    }

    // Simplified relevance: the fraction of the keyword's space-separated words
    // that occur as whole words in the text.
    private static double CalculateSemanticRelevance(string keyword, string text)
    {
        var keywordWords = keyword.ToLowerInvariant().Split(' ', StringSplitOptions.RemoveEmptyEntries);
        if (keywordWords.Length == 0)
        {
            return 0; // avoids 0/0 = NaN
        }

        var textWords = new HashSet<string>(
            text.ToLowerInvariant().Split(WordDelimiters, StringSplitOptions.RemoveEmptyEntries));

        var matchCount = keywordWords.Count(textWords.Contains);
        return matchCount / (double)keywordWords.Length;
    }
}

// Sample program
class Program
{
    static async Task Main(string[] args)
    {
        Console.WriteLine("=== 智能文本搜索摘要生成器(支持敏感词过滤)===\n");

        // Sample text (contains some simulated sensitive content)
        var sampleText = @" 人工智能是计算机科学的一个分支,它致力于理解智能的实质,并生产出一种新的能以人类智能相似的方式作出反应的智能机器。 机器学习是人工智能的一个重要分支,它通过算法使机器能够从数据中学习并做出决策或预测。深度学习是机器学习的一个子集, 使用多层神经网络来模拟人脑的工作方式。 自然语言处理是人工智能的另一个重要领域,它使计算机能够理解、解释和生成人类语言。计算机视觉则让机器能够识别和理解图像和视频内容。 目前,人工智能技术在各个领域都有广泛应用,包括医疗诊断、自动驾驶、智能客服、推荐系统等。随着技术的不断发展, 人工智能将在未来发挥更重要的作用,改变我们的生活和工作方式。 然而,人工智能的发展也带来了一些挑战,如就业影响、隐私保护、算法偏见等问题需要我们认真考虑和解决。 我们需要在推动技术发展的同时,确保人工智能的发展符合人类的利益和价值观。某些不当言论和极端观点需要被过滤。 ";

        // Sample sensitive words
        var sensitiveWords = new[] { "极端", "不当", "偏见", "挑战" };

        var sensitiveConfig = new SensitiveWordConfig
        {
            SensitiveWords = sensitiveWords,
            ReplaceStrategy = SensitiveWordReplaceStrategy.FullReplace,
            FullReplaceText = "***",
            IgnoreCase = true,
            EnableVariantDetection = false,
            FilterInSearch = false, // search the unfiltered text so matches are preserved
            FilterInSummary = true  // filter when building the summary
        };

        var config = new SearchConfig
        {
            SearchMode = SearchMode.ContainsMatch,
            ContextLength = 150,
            MaxResults = 10,
            MaxSummaryLength = 800,
            IgnoreCase = true,
            RemoveDuplicates = true,
            HighlightKeywords = true,
            HighlightStart = "【",
            HighlightEnd = "】",
            SortByRelevance = true,
            IncludeLineNumbers = false,
            EnableSensitiveWordFilter = true,
            SensitiveWordConfig = sensitiveConfig
        };

        var summarizer = new TextSearchSummarizer(config);

        // Example 1: keyword search with sensitive-word filtering
        Console.WriteLine("示例1: 带敏感词过滤的关键词搜索");
        var keywords1 = new[] { "人工智能", "发展", "技术" };
        var result1 = await summarizer.SearchAndSummarizeAsync(sampleText, keywords1);
        result1.DisplaySummary();
        Console.WriteLine("\n" + new string('=', 60) + "\n");

        // Example 2: length-replace strategy
        Console.WriteLine("示例2: 长度替换策略");
        sensitiveConfig.ReplaceStrategy = SensitiveWordReplaceStrategy.LengthReplace;
        var summarizer2 = new TextSearchSummarizer(config);
        var result2 = await summarizer2.SearchAndSummarizeAsync(sampleText, keywords1);
        result2.DisplaySummary();
        Console.WriteLine("\n" + new string('=', 60) + "\n");

        // Example 3: partial-replace strategy
        Console.WriteLine("示例3: 部分替换策略");
        sensitiveConfig.ReplaceStrategy = SensitiveWordReplaceStrategy.PartialReplace;
        var summarizer3 = new TextSearchSummarizer(config);
        var result3 = await summarizer3.SearchAndSummarizeAsync(sampleText, keywords1);
        result3.DisplaySummary();
        Console.WriteLine("\n" + new string('=', 60) + "\n");

        // Example 4: variant detection
        Console.WriteLine("示例4: 启用变体检测");
        var testTextWithVariants = sampleText + "\n测试内容: 3xtr3m3 观点和 b1as 问题。";
        var variantSensitiveWords = new[] { "extreme", "bias" };
        sensitiveConfig.SensitiveWords = variantSensitiveWords;
        sensitiveConfig.EnableVariantDetection = true;
        sensitiveConfig.ReplaceStrategy = SensitiveWordReplaceStrategy.FullReplace;
        var summarizer4 = new TextSearchSummarizer(config);
        var result4 = await summarizer4.SearchAndSummarizeAsync(testTextWithVariants, new[] { "测试", "内容" });
        result4.DisplaySummary();

        // Interactive mode
        await InteractiveMode();
    }

    // Console loop: reads a text (or file path), keywords and filter options,
    // then prints the summary. Ends on empty input or "exit".
    static async Task InteractiveMode()
    {
        Console.WriteLine("\n=== 交互模式(支持敏感词过滤)===");
        Console.WriteLine("输入 'exit' 退出程序");

        var config = new SearchConfig { HighlightStart = "**", HighlightEnd = "**" };

        while (true)
        {
            Console.Write("\n请输入要搜索的文本内容(或文件路径): ");
            var textInput = Console.ReadLine();
            if (string.IsNullOrEmpty(textInput) || textInput.Equals("exit", StringComparison.OrdinalIgnoreCase))
            {
                break;
            }

            // Treat the input as a file path when such a file exists.
            // Types are fully qualified because the file has no using for System.IO.
            string searchText;
            if (System.IO.File.Exists(textInput))
            {
                searchText = await System.IO.File.ReadAllTextAsync(textInput, Encoding.UTF8);
                Console.WriteLine($"已读取文件: {System.IO.Path.GetFileName(textInput)}");
            }
            else
            {
                searchText = textInput;
            }

            Console.Write("请输入关键词(用逗号分隔): ");
            var keywordInput = Console.ReadLine();
            if (string.IsNullOrEmpty(keywordInput))
            {
                Console.WriteLine("关键词不能为空。");
                continue;
            }

            var keywords = keywordInput.Split(',', StringSplitOptions.RemoveEmptyEntries)
                .Select(k => k.Trim())
                .ToArray();

            // Start every iteration with filtering off so a previous iteration's
            // setting cannot leak into this one.
            config.EnableSensitiveWordFilter = false;

            Console.Write("是否启用敏感词过滤? (y/N): ");
            var enableFilter = string.Equals(Console.ReadLine(), "y", StringComparison.OrdinalIgnoreCase);

            if (enableFilter)
            {
                Console.Write("请输入敏感词(用逗号分隔): ");
                var sensitiveInput = Console.ReadLine();

                if (!string.IsNullOrEmpty(sensitiveInput))
                {
                    var sensitiveWords = sensitiveInput.Split(',', StringSplitOptions.RemoveEmptyEntries)
                        .Select(s => s.Trim())
                        .ToArray();

                    Console.Write("选择替换策略 (1=完全替换*** 2=长度替换*** 3=部分替换 4=自定义): ");
                    var strategyInput = Console.ReadLine();
                    var strategy = strategyInput switch
                    {
                        "1" => SensitiveWordReplaceStrategy.FullReplace,
                        "2" => SensitiveWordReplaceStrategy.LengthReplace,
                        "3" => SensitiveWordReplaceStrategy.PartialReplace,
                        "4" => SensitiveWordReplaceStrategy.CustomReplace,
                        _ => SensitiveWordReplaceStrategy.FullReplace
                    };

                    var customChar = "***";
                    if (strategy == SensitiveWordReplaceStrategy.CustomReplace)
                    {
                        Console.Write("请输入自定义替换字符: ");
                        var customInput = Console.ReadLine();
                        // Keep the default when the user just presses Enter.
                        customChar = string.IsNullOrEmpty(customInput) ? "***" : customInput;
                    }

                    config.EnableSensitiveWordFilter = true;
                    config.SensitiveWordConfig = new SensitiveWordConfig
                    {
                        SensitiveWords = sensitiveWords,
                        ReplaceStrategy = strategy,
                        CustomReplaceChar = customChar,
                        FullReplaceText = customChar,
                        IgnoreCase = true,
                        FilterInSearch = false,
                        FilterInSummary = true
                    };
                }
            }

            Console.Write("选择搜索模式 (1=精确 2=包含 3=模糊 4=正则): ");
            var modeInput = Console.ReadLine();
            config.SearchMode = modeInput switch
            {
                "1" => SearchMode.ExactMatch,
                "2" => SearchMode.ContainsMatch,
                "3" => SearchMode.FuzzyMatch,
                "4" => SearchMode.RegexMatch,
                _ => SearchMode.ContainsMatch
            };

            try
            {
                var summarizer = new TextSearchSummarizer(config);
                var result = await summarizer.SearchAndSummarizeAsync(searchText, keywords);

                Console.WriteLine("\n" + new string('-', 50));
                result.DisplaySummary();

                if (config.EnableSensitiveWordFilter && result.TotalSensitiveWordsDetected > 0)
                {
                    Console.WriteLine($"\n原始摘要(包含敏感词):");
                    Console.WriteLine(result.Summary);
                    Console.WriteLine($"\n过滤后高亮摘要:");
                    Console.WriteLine(result.HighlightedSummary);
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine($"处理失败: {ex.Message}");
            }
        }
    }
}
}
我来为您的文本搜索摘要生成器添加敏感词过滤功能,支持多种替换策略。

我已经完成了敏感词过滤功能的集成。这个增强版的文本搜索摘要生成器现在支持完整的敏感词检测和替换功能。
🔒 敏感词过滤功能
4种替换策略
- 完全替换(`FullReplace`)– 替换为固定文本,如 "***"
- 长度替换(`LengthReplace`)– 根据敏感词长度用星号替换
- 部分替换(`PartialReplace`)– 保留首尾字符,中间用星号
- 自定义替换(`CustomReplace`)– 使用自定义字符替换

使用示例
// Configure sensitive-word filtering.
SensitiveWordConfig sensitiveConfig = new()
{
    SensitiveWords = new[] { "敏感词1", "不当言论", "极端" },
    ReplaceStrategy = SensitiveWordReplaceStrategy.FullReplace,
    FullReplaceText = "***",
    EnableVariantDetection = true, // detect variants (e.g. 3xtr3m3)
    FilterInSummary = true         // filter in the summary
};

// Enable the filter on the search configuration.
SearchConfig config = new()
{
    EnableSensitiveWordFilter = true,
    SensitiveWordConfig = sensitiveConfig
};

// `text` and `keywords` are supplied by the caller.
TextSearchSummarizer summarizer = new(config);
SummaryResult result = await summarizer.SearchAndSummarizeAsync(text, keywords);
输出效果对比
原始文本: “这是一些极端观点和不当言论的例子”
过滤后: “这是一些***观点和***的例子”
不同策略效果:
- 完全替换: 极端 → ***
- 长度替换: 极端 → **
- 部分替换: 不当言论 → 不**论(两个字及以下的词,如“极端”,会被整体替换为 **)
📊 增强的结果统计
现在的搜索结果包含详细的敏感词统计:
- 检测到的敏感词总数
- 各敏感词的频率分布
- 原始摘要和过滤后摘要的对比
- 每个搜索结果项的敏感词标记
⚡ 高级特性
变体检测
自动检测常见的敏感词变体:
- 数字替换字母(如:3xtr3m3 → extreme)
- 符号替换(如:@ss → ass)
- 大小写混合
灵活的过滤时机
- `FilterInSearch`: 在搜索阶段过滤(可能影响匹配结果)
- `FilterInSummary`: 在摘要生成时过滤(推荐方式)

交互式配置
程序支持运行时动态配置敏感词和替换策略,适合不同场景的实时调整。
这个解决方案为内容审核和合规性检查提供了强大的工具,同时保持了原有搜索功能的完整性和性能。