371 lines
13 KiB
TypeScript
371 lines
13 KiB
TypeScript
import React, { useState } from 'react';
|
||
import {
|
||
FileText,
|
||
Upload,
|
||
Download,
|
||
Loader2,
|
||
CheckCircle,
|
||
AlertCircle,
|
||
Info,
|
||
Trash2,
|
||
ArrowLeft
|
||
} from 'lucide-react';
|
||
import { useNavigate } from 'react-router-dom';
|
||
import { invoke } from '@tauri-apps/api/core';
|
||
import { open, save } from '@tauri-apps/plugin-dialog';
|
||
import { listen } from '@tauri-apps/api/event';
|
||
import { useNotifications } from '../../components/NotificationSystem';
|
||
|
||
/**
 * Payload carried by each `data-cleaning-progress` event that the page
 * listens for while a cleaning run is in flight.
 */
type DataCleaningProgress = {
  /** Shown as the left-hand side of the "current / total" counter. */
  current: number;
  /** Shown as the right-hand side of the "current / total" counter. */
  total: number;
  /** Used directly as the progress bar's CSS width in percent. */
  percentage: number;
  /** Free-text status line rendered under the "processing" caption. */
  status: string;
};
|
||
|
||
/**
 * Value the `clean_jsonl_data` command resolves with.
 *
 * Field names are snake_case because they are read verbatim from the
 * command's response; do not rename them on the frontend alone.
 */
type DataCleaningResult = {
  /** Selects between the green "completed" card and the red "failed" card. */
  success: boolean;
  /** Message shown in the result card and, on failure, in the notification. */
  message: string;
  /** Record count reported as "original data". */
  original_count: number;
  /** Record count reported as "removed". */
  removed_count: number;
  /** Record count reported as "final result". */
  final_count: number;
  /** Output path echoed back in the result card. */
  output_file: string;
};
|
||
|
||
/** Dialog filter shared by every file picker on this page: `.jsonl` files only. */
const JSONL_FILTERS = [{ name: 'JSONL Files', extensions: ['jsonl'] }];

/**
 * Converts a value caught from a rejected `invoke` call into display text.
 *
 * Tauri rejects with whatever the command's error type serializes to, which
 * is commonly a plain string rather than an `Error` instance, so both shapes
 * are accepted. Anything else falls back to the generic "unknown error" text.
 */
const describeInvokeError = (err: unknown): string => {
  if (err instanceof Error) {
    return err.message;
  }
  if (typeof err === 'string' && err.length > 0) {
    return err;
  }
  return '未知错误';
};

/**
 * Detail page for the "AI image retrieval / data cleaning" tool.
 *
 * Lets the user pick a JSONL file with all records, a JSONL file with the
 * records to remove, and an output path, then runs the `clean_jsonl_data`
 * backend command while displaying `data-cleaning-progress` events.
 *
 * Follows the project's Tauri development conventions and UI/UX standards.
 */
const DataCleaningTool: React.FC = () => {
  const navigate = useNavigate();
  const [allDataFile, setAllDataFile] = useState<string>('');
  const [removeDataFile, setRemoveDataFile] = useState<string>('');
  const [outputFile, setOutputFile] = useState<string>('');
  const [isProcessing, setIsProcessing] = useState(false);
  const [progress, setProgress] = useState<DataCleaningProgress | null>(null);
  const [result, setResult] = useState<DataCleaningResult | null>(null);

  const { success, error } = useNotifications();

  // True while at least one of the three required paths has not been chosen.
  const missingFile = !allDataFile || !removeDataFile || !outputFile;

  /**
   * Opens a single-selection "open file" dialog restricted to JSONL files and
   * hands the chosen path to `onPicked`. Cancelling the dialog is a no-op.
   */
  const pickInputFile = async (
    onPicked: (path: string) => void,
    failureDetail: string
  ): Promise<void> => {
    try {
      const selected = await open({ multiple: false, filters: JSONL_FILTERS });
      if (selected && typeof selected === 'string') {
        onPicked(selected);
      }
    } catch {
      error('文件选择失败', failureDetail);
    }
  };

  // Pick the file that holds all records.
  const selectAllDataFile = () => pickInputFile(setAllDataFile, '无法选择全部数据文件');

  // Pick the file that holds the records to remove.
  const selectRemoveDataFile = () =>
    pickInputFile(setRemoveDataFile, '无法选择要去除的数据文件');

  // Pick where the cleaned output is written.
  const selectOutputFile = async () => {
    try {
      const selected = await save({
        filters: JSONL_FILTERS,
        defaultPath: 'cleaned_data.jsonl'
      });
      if (selected) {
        setOutputFile(selected);
      }
    } catch {
      error('文件选择失败', '无法选择输出文件');
    }
  };

  /**
   * Runs the cleaning command and reports the outcome through the
   * notification system and the result card.
   */
  const startDataCleaning = async () => {
    if (missingFile) {
      error('参数错误', '请选择所有必需的文件');
      return;
    }

    setIsProcessing(true);
    setProgress(null);
    setResult(null);

    // Declared outside `try` so `finally` can always detach the listener,
    // including when `invoke` rejects.
    let unlisten: (() => void) | undefined;

    try {
      // Subscribe to progress events before starting the command.
      unlisten = await listen<DataCleaningProgress>('data-cleaning-progress', (event) => {
        setProgress(event.payload);
      });

      // Call the backend command.
      const cleaningResult = await invoke<DataCleaningResult>('clean_jsonl_data', {
        allDataFile,
        removeDataFile,
        outputFile
      });

      setResult(cleaningResult);

      if (cleaningResult.success) {
        success(
          '数据清洗完成',
          `原始数据: ${cleaningResult.original_count} 条,` +
            `去除重复: ${cleaningResult.removed_count} 条,` +
            `最终结果: ${cleaningResult.final_count} 条`
        );
      } else {
        error('数据清洗失败', cleaningResult.message);
      }
    } catch (err) {
      error('数据清洗失败', describeInvokeError(err));
    } finally {
      // Remove the event listener on every exit path to avoid stacking
      // listeners across failed runs.
      if (unlisten) {
        unlisten();
      }
      setIsProcessing(false);
    }
  };

  // Reset the form to its initial state.
  const resetForm = () => {
    setAllDataFile('');
    setRemoveDataFile('');
    setOutputFile('');
    setProgress(null);
    setResult(null);
  };

  return (
    <div className="space-y-6">
      {/* Page title and back button */}
      <div className="flex items-center gap-4">
        <button
          onClick={() => navigate('/tools')}
          className="flex items-center gap-2 px-3 py-2 text-gray-600 hover:text-gray-900 hover:bg-gray-100 rounded-lg transition-colors"
        >
          <ArrowLeft className="w-4 h-4" />
        </button>
        <div className="flex items-center gap-3">
          <div className="w-10 h-10 bg-gradient-to-br from-purple-500 to-purple-600 rounded-lg flex items-center justify-center shadow-sm">
            <FileText className="w-5 h-5 text-white" />
          </div>
          <div>
            <h1 className="text-2xl font-bold text-gray-900">AI检索图片/数据清洗</h1>
            <p className="text-gray-600">JSONL格式数据去重处理工具</p>
          </div>
        </div>
      </div>

      {/* Main tool body */}
      <div className="bg-white rounded-xl shadow-sm border border-gray-200 overflow-hidden">
        <div className="p-6 border-b border-gray-200">
          <div className="flex items-center gap-3">
            <FileText className="w-6 h-6 text-purple-600" />
            <div>
              <h2 className="text-lg font-semibold text-gray-900">数据清洗工具</h2>
              <p className="text-sm text-gray-600">高效处理JSONL格式数据,支持大文件批量去重</p>
            </div>
          </div>
        </div>

        <div className="p-6 space-y-6">
          {/* Usage instructions */}
          <div className="bg-blue-50 border border-blue-200 rounded-lg p-4">
            <div className="flex items-start gap-3">
              <Info className="w-5 h-5 text-blue-600 mt-0.5 flex-shrink-0" />
              <div className="text-sm text-blue-800">
                <p className="font-medium mb-2">使用说明:</p>
                <ul className="space-y-1 list-disc list-inside">
                  <li>选择包含所有数据的JSONL文件(全部数据)</li>
                  <li>选择包含要去除数据的JSONL文件(要去除的数据)</li>
                  <li>系统将根据URI字段进行匹配,去除重复项</li>
                  <li>处理结果将保存到指定的输出文件</li>
                </ul>
              </div>
            </div>
          </div>

          {/* File selection area */}
          <div className="grid grid-cols-1 md:grid-cols-2 gap-6">
            {/* All-data file */}
            <div className="space-y-2">
              <label className="block text-sm font-medium text-gray-700">
                全部数据文件 <span className="text-red-500">*</span>
              </label>
              <div className="flex gap-2">
                <button
                  onClick={selectAllDataFile}
                  disabled={isProcessing}
                  className="flex items-center gap-2 px-4 py-2 bg-gray-100 hover:bg-gray-200 disabled:bg-gray-50 disabled:text-gray-400 text-gray-700 rounded-lg border border-gray-300 transition-colors"
                >
                  <Upload className="w-4 h-4" />
                  选择文件
                </button>
              </div>
              {allDataFile && (
                <p className="text-xs text-gray-600 break-all bg-gray-50 p-2 rounded">
                  {allDataFile}
                </p>
              )}
            </div>

            {/* File with records to remove */}
            <div className="space-y-2">
              <label className="block text-sm font-medium text-gray-700">
                要去除的数据文件 <span className="text-red-500">*</span>
              </label>
              <div className="flex gap-2">
                <button
                  onClick={selectRemoveDataFile}
                  disabled={isProcessing}
                  className="flex items-center gap-2 px-4 py-2 bg-gray-100 hover:bg-gray-200 disabled:bg-gray-50 disabled:text-gray-400 text-gray-700 rounded-lg border border-gray-300 transition-colors"
                >
                  <Upload className="w-4 h-4" />
                  选择文件
                </button>
              </div>
              {removeDataFile && (
                <p className="text-xs text-gray-600 break-all bg-gray-50 p-2 rounded">
                  {removeDataFile}
                </p>
              )}
            </div>
          </div>

          {/* Output file selection */}
          <div className="space-y-2">
            <label className="block text-sm font-medium text-gray-700">
              输出文件 <span className="text-red-500">*</span>
            </label>
            <div className="flex gap-2">
              <button
                onClick={selectOutputFile}
                disabled={isProcessing}
                className="flex items-center gap-2 px-4 py-2 bg-gray-100 hover:bg-gray-200 disabled:bg-gray-50 disabled:text-gray-400 text-gray-700 rounded-lg border border-gray-300 transition-colors"
              >
                <Download className="w-4 h-4" />
                选择保存位置
              </button>
            </div>
            {outputFile && (
              <p className="text-xs text-gray-600 break-all bg-gray-50 p-2 rounded">
                {outputFile}
              </p>
            )}
          </div>

          {/* Progress display: only while a run is active, so the spinner and
              "processing" caption do not linger next to the final result. */}
          {isProcessing && progress && (
            <div className="bg-gray-50 border border-gray-200 rounded-lg p-4">
              <div className="flex items-center gap-3 mb-3">
                <Loader2 className="w-5 h-5 text-purple-600 animate-spin" />
                <div>
                  <p className="text-sm font-medium text-gray-900">正在处理数据...</p>
                  <p className="text-xs text-gray-600">{progress.status}</p>
                </div>
              </div>
              <div className="w-full bg-gray-200 rounded-full h-2">
                <div
                  className="bg-purple-600 h-2 rounded-full transition-all duration-300"
                  style={{ width: `${progress.percentage}%` }}
                ></div>
              </div>
              <p className="text-xs text-gray-600 mt-2">
                {progress.current} / {progress.total} ({progress.percentage.toFixed(1)}%)
              </p>
            </div>
          )}

          {/* Result display */}
          {result && (
            <div
              className={`border rounded-lg p-4 ${
                result.success ? 'bg-green-50 border-green-200' : 'bg-red-50 border-red-200'
              }`}
            >
              <div className="flex items-start gap-3">
                {result.success ? (
                  <CheckCircle className="w-5 h-5 text-green-600 mt-0.5 flex-shrink-0" />
                ) : (
                  <AlertCircle className="w-5 h-5 text-red-600 mt-0.5 flex-shrink-0" />
                )}
                <div className="flex-1">
                  <p
                    className={`text-sm font-medium ${
                      result.success ? 'text-green-800' : 'text-red-800'
                    }`}
                  >
                    {result.success ? '数据清洗完成' : '数据清洗失败'}
                  </p>
                  <p
                    className={`text-sm mt-1 ${
                      result.success ? 'text-green-700' : 'text-red-700'
                    }`}
                  >
                    {result.message}
                  </p>
                  {result.success && (
                    <div className="mt-2 text-xs text-green-700 space-y-1">
                      <p>原始数据: {result.original_count} 条</p>
                      <p>去除重复: {result.removed_count} 条</p>
                      <p>最终结果: {result.final_count} 条</p>
                      <p>输出文件: {result.output_file}</p>
                    </div>
                  )}
                </div>
              </div>
            </div>
          )}

          {/* Action buttons */}
          <div className="flex gap-3 pt-4 border-t border-gray-200">
            <button
              onClick={startDataCleaning}
              disabled={missingFile || isProcessing}
              className="flex items-center gap-2 px-6 py-2 bg-purple-600 hover:bg-purple-700 disabled:bg-gray-300 disabled:cursor-not-allowed text-white rounded-lg font-medium transition-colors"
            >
              {isProcessing ? (
                <Loader2 className="w-4 h-4 animate-spin" />
              ) : (
                <FileText className="w-4 h-4" />
              )}
              {isProcessing ? '处理中...' : '开始处理'}
            </button>

            <button
              onClick={resetForm}
              disabled={isProcessing}
              className="flex items-center gap-2 px-6 py-2 bg-gray-100 hover:bg-gray-200 disabled:bg-gray-50 disabled:text-gray-400 text-gray-700 rounded-lg font-medium transition-colors"
            >
              <Trash2 className="w-4 h-4" />
              重置
            </button>
          </div>
        </div>
      </div>
    </div>
  );
};
|
||
|
||
export default DataCleaningTool;
|