最近项目中遇到一个需求:通过计算两个csv文件的克拉默系数,来判断文件的相似度,进而判断是否存在抄袭。简单说下我们的业务背景:我们是竞赛平台,参赛选手通过分析大量数据生成一个csv文件,第一列为uuid,第二列为目标答案。根据组织的比赛不同,第二列的取值范围也不同,有的时候可能是0/1,有的时候可能是1/2/3/4,有的时候还可能是Y/N。
代码如下:
package com.lsl.cramer;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
import java.util.UUID;
/**
 * Immutable holder for one contestant's submitted answer CSV (fixed two-column layout:
 * an id column followed by an answer column).
 *
 * <p>The constructor takes defensive, order-preserving copies of the supplied collections
 * and the getters return unmodifiable views, so neither the caller that built the instance
 * nor a caller of a getter can alter the stored answers afterwards.
 */
public class ContestAnswerData {
    /** Contestant name / file label, used when printing results. */
    private final String playerIdentifier;
    /** Name of the uuid column (the business-level target id). */
    private final String uuidColumnName;
    /** Name of the answer column. */
    private final String answerColumnName;
    /** Core data: key = uuid / target id, value = the contestant's answer. */
    private final Map<String, String> answerMap;
    /** Distinct answer values of this contestant, used for the same-contest format check. */
    private final Set<String> answerValueSet;

    /**
     * Creates a snapshot of one contestant's answers.
     *
     * @param playerIdentifier contestant name / file label
     * @param uuidColumnName   name of the uuid column
     * @param answerColumnName name of the answer column
     * @param answerMap        uuid-to-answer mapping; copied, iteration order is kept
     * @param answerValueSet   distinct answer values; copied, iteration order is kept
     * @throws NullPointerException if {@code answerMap} or {@code answerValueSet} is null
     */
    public ContestAnswerData(String playerIdentifier, String uuidColumnName, String answerColumnName,
                             Map<String, String> answerMap, Set<String> answerValueSet) {
        Objects.requireNonNull(answerMap, "answerMap");
        Objects.requireNonNull(answerValueSet, "answerValueSet");
        this.playerIdentifier = playerIdentifier;
        this.uuidColumnName = uuidColumnName;
        this.answerColumnName = answerColumnName;
        // Linked copies keep the source iteration order while detaching from the caller's collections.
        this.answerMap = Collections.unmodifiableMap(new LinkedHashMap<>(answerMap));
        this.answerValueSet = Collections.unmodifiableSet(new LinkedHashSet<>(answerValueSet));
    }

    /** @return the contestant name / file label */
    public String getPlayerIdentifier() {
        return playerIdentifier;
    }

    /** @return the name of the uuid column */
    public String getUuidColumnName() {
        return uuidColumnName;
    }

    /** @return the name of the answer column */
    public String getAnswerColumnName() {
        return answerColumnName;
    }

    /** @return an unmodifiable view of the uuid-to-answer mapping */
    public Map<String, String> getAnswerMap() {
        return answerMap;
    }

    /** @return an unmodifiable view of the distinct answer values */
    public Set<String> getAnswerValueSet() {
        return answerValueSet;
    }

    /** @return the number of stored answers (one per distinct uuid) */
    public int getValidAnswerCount() {
        return answerMap.size();
    }

    /**
     * Prints 100 random {@code uuid,answer} lines (answer in 0..4) to standard output;
     * a throwaway generator for sample CSV content.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Random random = new Random();
        for (int i = 0; i < 100; i++) {
            System.out.println(UUID.randomUUID().toString() + "," + random.nextInt(5));
        }
    }
}
package com.lsl.cramer;
/**
 * Unchecked exception raised when a contest answer CSV cannot be read or fails
 * a format check.
 */
public class ContestAnswerCsvException extends RuntimeException {

    /**
     * Creates an exception carrying only a description of the problem.
     *
     * @param message human-readable description of what went wrong
     */
    public ContestAnswerCsvException(final String message) {
        super(message);
    }

    /**
     * Creates an exception that wraps the underlying failure.
     *
     * @param message human-readable description of what went wrong
     * @param cause   the original exception that triggered this one
     */
    public ContestAnswerCsvException(final String message, final Throwable cause) {
        super(message, cause);
    }
}
package com.lsl.cramer;
import com.opencsv.CSVReader;
import com.opencsv.exceptions.CsvException;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;
/**
 * Computes Cramér's V between two contestants' answer CSV files as a similarity signal
 * for plagiarism screening.
 *
 * <p>Works on the fixed two-column layout (id, answer) and discovers the answer categories
 * from the data, so no value range has to be configured up front.
 *
 * <p>Note: Cramér's V measures statistical association, not literal agreement. Two answer
 * columns that are a consistent relabelling of each other (e.g. every 0 swapped with 1)
 * also yield a coefficient close to 1.
 */
public class ContestAnswerCramerCalculator {
    // ===================== Configurable settings =====================
    /** Expected uuid column name. Currently unused because header validation is disabled. */
    private static final String UUID_COLUMN_NAME = "uuid";
    /** Expected answer column name. Currently unused because header validation is disabled. */
    private static final String ANSWER_COLUMN_NAME = "answer";
    /** A coefficient at or above this value is reported as high plagiarism risk. */
    private static final double PLAGIARISM_RISK_THRESHOLD = 0.9;
    /** Cells whose expected frequency is below this value trigger a reliability warning. */
    private static final double MIN_EXPECTED_FREQUENCY = 1.0;
    /** Category label used for blank answers and, under MISSING_AS_NULL, for absent uuids. */
    private static final String MISSING_ANSWER = "NULL";
    /** The union of both value sets may be at most this factor times the larger individual set. */
    private static final double VALUE_SET_TOLERANCE_FACTOR = 1.2;
    /** Risk label reported when the coefficient reaches the threshold. */
    private static final String RISK_LEVEL_HIGH = "高风险抄袭";
    /** Risk label reported otherwise. */
    private static final String RISK_LEVEL_NORMAL = "正常";

    /** How to treat uuids that appear in only one of the two files. */
    public enum MissingValueStrategy {
        /** Drop uuids that exist on one side only (recommended; both files should cover the same ids). */
        EXCLUDE_MISSING,
        /** Keep every uuid and treat the absent side as the separate answer category "NULL". */
        MISSING_AS_NULL
    }
    // =================================================================

    /**
     * Reads a contestant's answer CSV and wraps it in a {@link ContestAnswerData}.
     *
     * <p>The first row is always skipped as a header; its column names are not validated.
     * Rows with a blank uuid are ignored, blank answers become the category "NULL", and
     * when a uuid repeats the last row wins.
     *
     * @param filePath         path of the CSV file
     * @param playerIdentifier contestant label used in output
     * @return the parsed answer data
     * @throws ContestAnswerCsvException if the file is empty, a row does not have exactly
     *                                   two columns, no usable row remains, or reading fails
     */
    public ContestAnswerData readAnswerCsv(String filePath, String playerIdentifier) {
        try (CSVReader reader = new CSVReader(new FileReader(filePath))) {
            List<String[]> allRows = reader.readAll();

            // 1. Basic shape checks.
            if (allRows.isEmpty()) {
                throw new ContestAnswerCsvException("CSV文件为空:" + filePath);
            }
            if (allRows.get(0).length != 2) {
                throw new ContestAnswerCsvException("CSV文件列数必须为2,当前列数:" + allRows.get(0).length + ",文件:" + filePath);
            }

            // 2. Header validation is intentionally disabled: row 0 is skipped below
            //    without comparing its cells to UUID_COLUMN_NAME / ANSWER_COLUMN_NAME.

            // 3. Read data rows, handling duplicate uuids and blank values.
            Map<String, String> answerMap = new LinkedHashMap<>();
            for (int i = 1; i < allRows.size(); i++) {
                String[] row = allRows.get(i);
                if (row.length != 2) {
                    throw new ContestAnswerCsvException("CSV第" + (i + 1) + "行列数与表头不一致,文件:" + filePath);
                }
                String uuid = row[0].trim();
                String answer = row[1].trim();
                // A row without a uuid cannot be matched against the other file.
                if (uuid.isEmpty()) {
                    continue;
                }
                // Blank answers are kept as their own category instead of being dropped.
                if (answer.isEmpty()) {
                    answer = MISSING_ANSWER;
                }
                // Last occurrence of a uuid wins.
                answerMap.put(uuid, answer);
            }

            // 4. There must be at least one usable row.
            if (answerMap.isEmpty()) {
                throw new ContestAnswerCsvException("CSV文件无有效答案数据,文件:" + filePath);
            }

            // 5. Distinct answer values, used later for the same-contest check.
            Set<String> answerValueSet = new HashSet<>(answerMap.values());
            // The column names stored here are fixed placeholder literals, not the file's header cells.
            return new ContestAnswerData(playerIdentifier, "uuidColumn", "answerColumn", answerMap, answerValueSet);
        } catch (IOException | CsvException e) {
            throw new ContestAnswerCsvException("读取CSV答案文件失败:" + filePath, e);
        }
    }

    /**
     * Checks that two contestants' answers plausibly belong to the same contest:
     * matching column names and roughly the same set of answer values.
     *
     * @param data1 answers of contestant 1
     * @param data2 answers of contestant 2
     * @throws ContestAnswerCsvException if the column names differ or the value sets diverge too much
     */
    public void validateSameContestFormat(ContestAnswerData data1, ContestAnswerData data2) {
        if (!data1.getUuidColumnName().equalsIgnoreCase(data2.getUuidColumnName())) {
            throw new ContestAnswerCsvException("两个选手的uuid列名不一致,不属于同一竞赛");
        }
        if (!data1.getAnswerColumnName().equalsIgnoreCase(data2.getAnswerColumnName())) {
            throw new ContestAnswerCsvException("两个选手的答案列名不一致,不属于同一竞赛");
        }

        Set<String> allAnswerValues = new HashSet<>(data1.getAnswerValueSet());
        allAnswerValues.addAll(data2.getAnswerValueSet());
        // A small difference is tolerated (e.g. one contestant has a stray value);
        // a stricter deployment could require the two sets to be equal.
        int largerSetSize = Math.max(data1.getAnswerValueSet().size(), data2.getAnswerValueSet().size());
        if (allAnswerValues.size() > largerSetSize * VALUE_SET_TOLERANCE_FACTOR) {
            throw new ContestAnswerCsvException("两个选手的答案取值范围差异过大,不属于同一竞赛,无法对比");
        }
    }

    /**
     * Aligns two contestants' answers by uuid.
     *
     * <p>Presence of a uuid is decided by key lookup, not by comparing against the "NULL"
     * label, so a uuid that both files contain is kept under {@code EXCLUDE_MISSING} even
     * when one of its answers was blank (and therefore stored as "NULL").
     *
     * @param data1    answers of contestant 1
     * @param data2    answers of contestant 2
     * @param strategy how to treat uuids that appear on one side only
     * @return aligned data in first-seen uuid order: key = uuid, value = [answer 1, answer 2]
     * @throws ContestAnswerCsvException if no uuid remains after alignment
     */
    public Map<String, List<String>> alignAnswerData(ContestAnswerData data1, ContestAnswerData data2, MissingValueStrategy strategy) {
        Map<String, String> answerMap1 = data1.getAnswerMap();
        Map<String, String> answerMap2 = data2.getAnswerMap();

        // Union of uuids; a linked set keeps the iteration order of the source maps.
        Set<String> allUuids = new LinkedHashSet<>(answerMap1.keySet());
        allUuids.addAll(answerMap2.keySet());

        Map<String, List<String>> alignedData = new LinkedHashMap<>();
        for (String uuid : allUuids) {
            boolean presentInBoth = answerMap1.containsKey(uuid) && answerMap2.containsKey(uuid);
            if (strategy == MissingValueStrategy.EXCLUDE_MISSING && !presentInBoth) {
                continue;
            }
            String answer1 = answerMap1.getOrDefault(uuid, MISSING_ANSWER);
            String answer2 = answerMap2.getOrDefault(uuid, MISSING_ANSWER);
            alignedData.put(uuid, Arrays.asList(answer1, answer2));
        }

        if (alignedData.isEmpty()) {
            throw new ContestAnswerCsvException("数据对齐后无有效对比样本,两个选手无共同的uuid/目标id");
        }
        return alignedData;
    }

    /**
     * Builds the two-way contingency table of answer pairs.
     * Row categories are contestant 1's answers, column categories contestant 2's,
     * each in order of first appearance.
     *
     * @param alignedData aligned answers as produced by {@link #alignAnswerData}
     * @return the frequency matrix together with its row/column categories and sample count
     */
    public ContingencyTable buildContingencyTable(Map<String, List<String>> alignedData) {
        // Assign each distinct answer a stable index the first time it is seen.
        Map<String, Integer> rowIndexByAnswer = new LinkedHashMap<>();
        Map<String, Integer> colIndexByAnswer = new LinkedHashMap<>();
        for (List<String> answerPair : alignedData.values()) {
            rowIndexByAnswer.putIfAbsent(answerPair.get(0), rowIndexByAnswer.size());
            colIndexByAnswer.putIfAbsent(answerPair.get(1), colIndexByAnswer.size());
        }

        int[][] frequencyMatrix = new int[rowIndexByAnswer.size()][colIndexByAnswer.size()];
        for (List<String> answerPair : alignedData.values()) {
            int rowIdx = rowIndexByAnswer.get(answerPair.get(0));
            int colIdx = colIndexByAnswer.get(answerPair.get(1));
            frequencyMatrix[rowIdx][colIdx]++;
        }

        return new ContingencyTable(
                frequencyMatrix,
                new ArrayList<>(rowIndexByAnswer.keySet()),
                new ArrayList<>(colIndexByAnswer.keySet()),
                alignedData.size());
    }

    /**
     * Computes Pearson's chi-square statistic for a contingency table and prints a warning
     * when any cell's expected frequency is below {@code MIN_EXPECTED_FREQUENCY}.
     *
     * @param table the contingency table
     * @return the chi-square value; 0 for a table without rows, columns or samples
     */
    public double calculatePearsonChiSquare(ContingencyTable table) {
        int[][] frequencyMatrix = table.getFrequencyMatrix();
        int rowCount = frequencyMatrix.length;
        int colCount = rowCount == 0 ? 0 : frequencyMatrix[0].length;
        int totalSample = table.getTotalSample();
        if (rowCount == 0 || colCount == 0 || totalSample <= 0) {
            return 0.0;
        }

        // Marginal totals, gathered in one pass.
        int[] rowSum = new int[rowCount];
        int[] colSum = new int[colCount];
        for (int i = 0; i < rowCount; i++) {
            for (int j = 0; j < colCount; j++) {
                rowSum[i] += frequencyMatrix[i][j];
                colSum[j] += frequencyMatrix[i][j];
            }
        }

        double chiSquare = 0.0;
        int lowFrequencyCount = 0;
        int totalCellCount = rowCount * colCount;
        for (int i = 0; i < rowCount; i++) {
            for (int j = 0; j < colCount; j++) {
                // Multiply in double: the int product of two marginals overflows for large samples.
                double expectedFrequency = (double) rowSum[i] * colSum[j] / totalSample;
                if (expectedFrequency < MIN_EXPECTED_FREQUENCY) {
                    lowFrequencyCount++;
                }
                // An empty row or column has expected frequency 0; it contributes nothing
                // and must not be divided by.
                if (expectedFrequency > 0.0) {
                    double deviation = frequencyMatrix[i][j] - expectedFrequency;
                    chiSquare += deviation * deviation / expectedFrequency;
                }
            }
        }

        if (lowFrequencyCount > 0) {
            double lowFrequencyRatio = (double) lowFrequencyCount / totalCellCount;
            System.out.printf("⚠️ 警告:列联表中%d个单元格理论频数低于%.1f,占比%.1f%%,卡方检验结果可靠性可能受影响%n",
                    lowFrequencyCount, MIN_EXPECTED_FREQUENCY, lowFrequencyRatio * 100);
        }
        return chiSquare;
    }

    /**
     * Computes Cramér's V as {@code sqrt(chiSquare / (n * min(rows - 1, cols - 1)))}.
     *
     * @param chiSquare Pearson's chi-square value for the table
     * @param table     the contingency table
     * @return the coefficient in [0, 1]; 0 when either side has a single category
     *         or the table has no samples
     */
    public double calculateCramerCoefficient(double chiSquare, ContingencyTable table) {
        int totalSample = table.getTotalSample();
        int categoryCount1 = table.getPlayer1Categories().size();
        int categoryCount2 = table.getPlayer2Categories().size();

        // With a single category on either side the coefficient is undefined; report 0.
        if (categoryCount1 <= 1 || categoryCount2 <= 1) {
            System.out.println("⚠️ 警告:答案仅1个有效类别,克拉默系数无统计意义,返回0");
            return 0.0;
        }
        if (totalSample <= 0) {
            return 0.0;
        }

        int minDegreeOfFreedom = Math.min(categoryCount1 - 1, categoryCount2 - 1);
        double coefficient = Math.sqrt(chiSquare / ((double) totalSample * minDegreeOfFreedom));
        // Floating-point rounding can push a perfect association marginally above 1.
        return Math.min(1.0, coefficient);
    }

    /**
     * Runs the whole pipeline: read both CSVs, validate, align, build the contingency table,
     * compute chi-square and Cramér's V, classify the risk and print a report.
     *
     * @param filePath1         CSV path of contestant 1
     * @param player1Identifier label of contestant 1
     * @param filePath2         CSV path of contestant 2
     * @param player2Identifier label of contestant 2
     * @param strategy          how to treat uuids that appear on one side only
     * @return the result with the coefficient, risk level and supporting statistics
     * @throws ContestAnswerCsvException if reading, validation or alignment fails
     */
    public CramerCalculationResult calculateFullProcess(String filePath1, String player1Identifier,
                                                        String filePath2, String player2Identifier,
                                                        MissingValueStrategy strategy) {
        // 1. Read both answer files.
        ContestAnswerData data1 = readAnswerCsv(filePath1, player1Identifier);
        ContestAnswerData data2 = readAnswerCsv(filePath2, player2Identifier);
        System.out.printf("✅ 成功读取两个选手的答案文件%n");
        System.out.printf(" %s:%d条有效答案,取值范围:%s%n",
                player1Identifier, data1.getValidAnswerCount(), data1.getAnswerValueSet());
        System.out.printf(" %s:%d条有效答案,取值范围:%s%n",
                player2Identifier, data2.getValidAnswerCount(), data2.getAnswerValueSet());

        // 2. Same-contest format check.
        validateSameContestFormat(data1, data2);
        System.out.println("✅ 两个选手的答案符合同一竞赛格式要求,可正常对比");

        // 3. Align by uuid.
        Map<String, List<String>> alignedData = alignAnswerData(data1, data2, strategy);
        System.out.printf("✅ 数据对齐完成,有效对比样本数:%d%n", alignedData.size());

        // 4. Contingency table.
        ContingencyTable table = buildContingencyTable(alignedData);
        System.out.printf("✅ 列联表构建完成,%s答案类别数:%d,%s答案类别数:%d%n",
                player1Identifier, table.getPlayer1Categories().size(),
                player2Identifier, table.getPlayer2Categories().size());

        // 5. Pearson chi-square.
        double chiSquare = calculatePearsonChiSquare(table);
        System.out.printf("✅ 皮尔逊卡方值计算完成:%.2f%n", chiSquare);

        // 6. Cramér's V.
        double cramerCoefficient = calculateCramerCoefficient(chiSquare, table);
        System.out.printf("✅ 克拉默系数计算完成:%.4f%n", cramerCoefficient);

        // 7. Risk classification.
        boolean highRisk = cramerCoefficient >= PLAGIARISM_RISK_THRESHOLD;
        String riskLevel = highRisk ? RISK_LEVEL_HIGH : RISK_LEVEL_NORMAL;
        if (highRisk) {
            System.out.println("🚨 检测到高风险抄袭行为,请立即人工复核!");
        }

        // 8. Package and print the result.
        CramerCalculationResult result = new CramerCalculationResult(
                data1, data2, alignedData.size(), table,
                chiSquare, cramerCoefficient, riskLevel, PLAGIARISM_RISK_THRESHOLD
        );
        printResult(result);
        return result;
    }

    /**
     * Prints the summary figures followed by the contingency table with row and column totals.
     *
     * @param result the computed result
     */
    private void printResult(CramerCalculationResult result) {
        String separator = "------------------------------------------------------------";
        String player1Label = result.getData1().getPlayerIdentifier();
        String player2Label = result.getData2().getPlayerIdentifier();

        System.out.println("\n==================== 竞赛答案相似度对比结果 ====================");
        System.out.printf("对比选手1:%s%n", player1Label);
        System.out.printf("对比选手2:%s%n", player2Label);
        System.out.printf("有效对比样本数:%d%n", result.getTotalSample());
        System.out.printf("皮尔逊卡方值:%.2f%n", result.getChiSquare());
        System.out.printf("克拉默系数(相似度):%.4f%n", result.getCramerCoefficient());
        System.out.printf("抄袭风险阈值:≥%.2f%n", result.getRiskThreshold());
        System.out.printf("最终风险评估:%s%n", result.getRiskLevel());
        System.out.println("\n答案交叉统计详情:");
        System.out.println(separator);

        ContingencyTable table = result.getTable();
        List<String> player1Categories = table.getPlayer1Categories();
        List<String> player2Categories = table.getPlayer2Categories();
        int[][] frequencyMatrix = table.getFrequencyMatrix();

        // Header row: corner label, one column per contestant-2 category, then the row-total label.
        System.out.printf("%-15s", player1Label + " \\ " + player2Label);
        for (String category : player2Categories) {
            System.out.printf("%-10s", category);
        }
        System.out.println("行合计");
        System.out.println(separator);

        // One line per contestant-1 category; column totals are accumulated on the way.
        int[] colSums = new int[player2Categories.size()];
        for (int i = 0; i < player1Categories.size(); i++) {
            System.out.printf("%-15s", player1Categories.get(i));
            int rowSum = 0;
            for (int j = 0; j < player2Categories.size(); j++) {
                int count = frequencyMatrix[i][j];
                System.out.printf("%-10d", count);
                rowSum += count;
                colSums[j] += count;
            }
            System.out.println(rowSum);
        }

        // Footer row: column totals and the grand total.
        System.out.println(separator);
        System.out.printf("%-15s", "列合计");
        for (int colSum : colSums) {
            System.out.printf("%-10d", colSum);
        }
        System.out.println(result.getTotalSample());
        System.out.println("============================================================");
    }

    // ==================== Nested value classes ====================

    /**
     * Contingency table: frequency matrix plus the category labels of its rows and columns.
     * The matrix and lists are stored and returned as given, without copying.
     */
    public static class ContingencyTable {
        private final int[][] frequencyMatrix;
        private final List<String> player1Categories;
        private final List<String> player2Categories;
        private final int totalSample;

        /**
         * @param frequencyMatrix   counts indexed as [row category][column category]
         * @param player1Categories row labels (contestant 1's answers)
         * @param player2Categories column labels (contestant 2's answers)
         * @param totalSample       number of answer pairs counted
         */
        public ContingencyTable(int[][] frequencyMatrix, List<String> player1Categories,
                                List<String> player2Categories, int totalSample) {
            this.frequencyMatrix = frequencyMatrix;
            this.player1Categories = player1Categories;
            this.player2Categories = player2Categories;
            this.totalSample = totalSample;
        }

        /** @return counts indexed as [row category][column category] */
        public int[][] getFrequencyMatrix() {
            return frequencyMatrix;
        }

        /** @return row labels (contestant 1's answers) */
        public List<String> getPlayer1Categories() {
            return player1Categories;
        }

        /** @return column labels (contestant 2's answers) */
        public List<String> getPlayer2Categories() {
            return player2Categories;
        }

        /** @return number of answer pairs counted */
        public int getTotalSample() {
            return totalSample;
        }
    }

    /**
     * Final outcome of a comparison, for use by the calling business logic.
     */
    public static class CramerCalculationResult {
        private final ContestAnswerData data1;
        private final ContestAnswerData data2;
        private final int totalSample;
        private final ContingencyTable table;
        private final double chiSquare;
        private final double cramerCoefficient;
        private final String riskLevel;
        private final double riskThreshold;

        /**
         * @param data1             answers of contestant 1
         * @param data2             answers of contestant 2
         * @param totalSample       number of aligned answer pairs
         * @param table             contingency table of the aligned answers
         * @param chiSquare         Pearson's chi-square value
         * @param cramerCoefficient Cramér's V
         * @param riskLevel         risk label derived from the coefficient
         * @param riskThreshold     threshold that was applied
         */
        public CramerCalculationResult(ContestAnswerData data1, ContestAnswerData data2, int totalSample,
                                       ContingencyTable table, double chiSquare, double cramerCoefficient,
                                       String riskLevel, double riskThreshold) {
            this.data1 = data1;
            this.data2 = data2;
            this.totalSample = totalSample;
            this.table = table;
            this.chiSquare = chiSquare;
            this.cramerCoefficient = cramerCoefficient;
            this.riskLevel = riskLevel;
            this.riskThreshold = riskThreshold;
        }

        /** @return answers of contestant 1 */
        public ContestAnswerData getData1() {
            return data1;
        }

        /** @return answers of contestant 2 */
        public ContestAnswerData getData2() {
            return data2;
        }

        /** @return number of aligned answer pairs */
        public int getTotalSample() {
            return totalSample;
        }

        /** @return contingency table of the aligned answers */
        public ContingencyTable getTable() {
            return table;
        }

        /** @return Pearson's chi-square value */
        public double getChiSquare() {
            return chiSquare;
        }

        /** @return Cramér's V */
        public double getCramerCoefficient() {
            return cramerCoefficient;
        }

        /** @return risk label derived from the coefficient */
        public String getRiskLevel() {
            return riskLevel;
        }

        /** @return threshold that was applied */
        public double getRiskThreshold() {
            return riskThreshold;
        }
    }
}
package com.lsl.cramer;
/**
 * Demo entry point showing how to compare two contestants' answer files with
 * {@link ContestAnswerCramerCalculator}.
 */
public class ContestAnswerCramerMain {
    private static final String DEFAULT_FILE_PATH_1 = "D:\\temp\\aaa.csv";
    private static final String DEFAULT_PLAYER_1 = "北京分行-选手A";
    private static final String DEFAULT_FILE_PATH_2 = "D:\\temp\\bbb.csv";
    private static final String DEFAULT_PLAYER_2 = "北京分行-选手B";

    /**
     * Runs one comparison and prints a business alert when the result is high risk.
     *
     * @param args optional overrides, in order: CSV path of contestant 1, CSV path of
     *             contestant 2, label of contestant 1, label of contestant 2; any argument
     *             that is absent or blank falls back to the built-in default
     */
    public static void main(String[] args) {
        String filePath1 = argOrDefault(args, 0, DEFAULT_FILE_PATH_1);
        String filePath2 = argOrDefault(args, 1, DEFAULT_FILE_PATH_2);
        String player1Identifier = argOrDefault(args, 2, DEFAULT_PLAYER_1);
        String player2Identifier = argOrDefault(args, 3, DEFAULT_PLAYER_2);

        ContestAnswerCramerCalculator calculator = new ContestAnswerCramerCalculator();
        // Use the recommended strategy: drop uuids that exist in only one of the files.
        ContestAnswerCramerCalculator.CramerCalculationResult result = calculator.calculateFullProcess(
                filePath1, player1Identifier,
                filePath2, player2Identifier,
                ContestAnswerCramerCalculator.MissingValueStrategy.EXCLUDE_MISSING
        );

        // Callers can act on the result here: persist it, raise an alert, request manual review.
        if ("高风险抄袭".equals(result.getRiskLevel())) {
            System.out.println("\n🚨 业务告警:检测到高风险抄袭行为,已触发人工复核流程!");
        }
    }

    /** Returns {@code args[index]} when it exists and is not blank, otherwise {@code fallback}. */
    private static String argOrDefault(String[] args, int index, String fallback) {
        if (args == null || index >= args.length || args[index] == null || args[index].trim().isEmpty()) {
            return fallback;
        }
        return args[index];
    }
}
1359

被折叠的 条评论
为什么被折叠?



