feat(csv-loader): strip comments before parsing CSV

Pre-filter comment lines from the content before passing it to
`csv-parse`. This prevents quote parsing errors when comment lines
contain double quotes and simplifies the record filtering logic.
This commit is contained in:
hypercross 2026-04-19 14:40:41 +08:00
parent 5a1627c6f1
commit 9a2db5edb6
2 changed files with 43 additions and 18 deletions

View File

@ -40,6 +40,24 @@ describe("parseCsv - basic parsing", () => {
expect(result.data[1]).toEqual({ id: 2, count: 3, price: 4.5 });
});
// Regression case: a leading comment line that contains double quotes and
// non-ASCII text must be dropped, leaving the header row ("id"), the schema
// row ("string"), and three data rows to be parsed normally.
it("should parse CSV with non-ASCII characters and comments", () => {
  const commentLine = '# id: unique intent state ID (e.g. "仙人掌怪-boost")';
  const expectedIds = ["仙人掌怪-boost", "仙人掌怪-defend", "仙人掌怪-attack"];
  const csv = [commentLine, "id", "string", ...expectedIds].join("\n");

  const { data } = parseCsv(csv, { emitTypes: false });

  // One parsed record per data row, in input order.
  expect(data).toHaveLength(3);
  expectedIds.forEach((id, index) => {
    expect(data[index]).toEqual({ id });
  });
});
it("should parse CSV with string literal columns (unquoted in CSV)", () => {
const csv = [
"name,status",

View File

@ -789,34 +789,41 @@ export function parseCsv(
const refBaseDir = options.refBaseDir;
const defaultPrimaryKey = options.defaultPrimaryKey ?? "id";
const records = parse(content, {
// Pre-strip comment lines from content before passing to csv-parse,
// to avoid quote parsing errors in comment lines containing double quotes.
const reverseReferences: ReverseReferenceDeclaration[] = [];
let filteredContent = content;
if (comment) {
const lines = content.split(/\r?\n/);
const nonCommentLines: string[] = [];
for (const line of lines) {
const trimmed = line.trim();
if (trimmed.startsWith(comment)) {
const decl = parseReverseReferenceDeclaration(trimmed, comment);
if (decl) {
reverseReferences.push(decl);
}
// Skip comment lines
} else {
nonCommentLines.push(line);
}
}
filteredContent = nonCommentLines.join("\n");
}
const records = parse(filteredContent, {
delimiter,
quote,
escape,
bom,
// Don't let csv-parse skip comments itself: comment lines were already stripped
// from the content above (collecting reverse reference declarations) before parsing.
comment: undefined,
trim,
skip_empty_lines: true,
relax_column_count: true,
});
// Filter out comment lines from all records, collecting reverse reference declarations
const reverseReferences: ReverseReferenceDeclaration[] = [];
const filteredRecords: string[][] = [];
for (const row of records) {
const firstCell = (row[0] ?? "").trim();
if (comment && firstCell.startsWith(comment)) {
const decl = parseReverseReferenceDeclaration(firstCell, comment);
if (decl) {
reverseReferences.push(decl);
}
// Skip comment lines (whether or not they're reverse ref declarations)
continue;
}
filteredRecords.push(row);
}
// Comment lines were already filtered out before parsing
const filteredRecords = records;
if (filteredRecords.length < 2) {
throw new Error("CSV must have at least 2 rows: headers and schemas");