fix: markdown解析bug

This commit is contained in:
imeepos 2025-07-22 16:09:31 +08:00
parent 725514af2d
commit dde679fd6e
3 changed files with 59 additions and 13 deletions

View File

@ -63,6 +63,8 @@ pub struct Position {
pub column: usize,
/// 字符偏移量从0开始
pub offset: usize,
/// 字节偏移量从0开始
pub byte_offset: usize,
}
/// 范围信息
@ -154,13 +156,14 @@ impl MarkdownParser {
})
}
/// 计算文本中的位置信息
fn calculate_position(&self, text: &str, offset: usize) -> Position {
/// 根据字节偏移计算位置信息
fn calculate_position_from_byte_offset(&self, text: &str, byte_offset: usize) -> Position {
let mut line = 0;
let mut column = 0;
let mut char_offset = 0;
for (i, ch) in text.char_indices() {
if i >= offset {
for (byte_idx, ch) in text.char_indices() {
if byte_idx >= byte_offset {
break;
}
if ch == '\n' {
@ -169,15 +172,52 @@ impl MarkdownParser {
} else {
column += 1;
}
char_offset += 1;
}
Position {
line,
column,
offset,
offset: char_offset,
byte_offset,
}
}
/// Computes the position reached after consuming `char_offset` characters of `text`.
///
/// Walks the text one character at a time, tracking the zero-based line, the
/// column within that line, and the byte offset just past the last consumed
/// character. If `text` holds fewer than `char_offset` characters the walk
/// stops at the end of the text; the returned `offset` is still the requested
/// `char_offset`.
fn calculate_position_from_char_offset(&self, text: &str, char_offset: usize) -> Position {
    // Accumulator is (line, column, bytes consumed so far).
    let (line, column, byte_offset) = text.chars().take(char_offset).fold(
        (0, 0, 0),
        |(line, column, bytes), ch| {
            let bytes = bytes + ch.len_utf8();
            if ch == '\n' {
                // A newline starts the next line and resets the column.
                (line + 1, 0, bytes)
            } else {
                (line, column + 1, bytes)
            }
        },
    );

    Position {
        line,
        column,
        offset: char_offset,
        byte_offset,
    }
}
/// Computes position information for `offset` within `text` (kept for backward compatibility).
///
/// The work is delegated to `calculate_position_from_byte_offset`.
fn calculate_position(&self, text: &str, offset: usize) -> Position {
    // For backward compatibility, assume the value passed in is a byte offset.
    let byte_offset = offset;
    self.calculate_position_from_byte_offset(text, byte_offset)
}
/// 解析Markdown文本
pub fn parse(&mut self, text: &str) -> Result<MarkdownParseResult> {
let start_time = std::time::Instant::now();
@ -222,8 +262,8 @@ impl MarkdownParser {
node_type: MarkdownNodeType::Document,
content: source_text.to_string(),
range: Range {
start: Position { line: 0, column: 0, offset: 0 },
end: self.calculate_position(source_text, source_text.len()),
start: self.calculate_position_from_byte_offset(source_text, 0),
end: self.calculate_position_from_byte_offset(source_text, source_text.len()),
},
children: Vec::new(),
attributes: HashMap::new(),

View File

@ -99,13 +99,13 @@ export const EnhancedMarkdownRenderer: React.FC<EnhancedMarkdownRendererProps> =
return null;
}
// 计算节点在原始文本中的字符偏移位置
const nodeStartOffset = node.range?.start?.offset || 0;
const nodeEndOffset = node.range?.end?.offset || 0;
// 计算节点在原始文本中的字节偏移位置与grounding数据的字节偏移匹配
const nodeStartOffset = node.range?.start?.byte_offset || 0;
const nodeEndOffset = node.range?.end?.byte_offset || 0;
// 查找与当前节点位置重叠的grounding支持信息
const relatedSupports = groundingMetadata.grounding_supports.filter(support => {
// 这里是字符串二进制的offset
// grounding数据使用字节偏移
const segmentStart = support.segment.startIndex;
const segmentEnd = support.segment.endIndex;
@ -113,7 +113,11 @@ export const EnhancedMarkdownRenderer: React.FC<EnhancedMarkdownRendererProps> =
// 检查节点范围与grounding片段是否有重叠
return hasOverlap;
});
console.log({
relatedSupports,
nodeStartOffset,
nodeEndOffset
})
if (relatedSupports.length > 0) {
// 获取相关的来源信息
const relatedSources = relatedSupports.flatMap(support =>

View File

@ -37,6 +37,8 @@ export interface Position {
column: number;
/** 字符偏移量从0开始 */
offset: number;
/** 字节偏移量从0开始 */
byte_offset: number;
}
/**