From 9a2db5edb67470ef197974270f0c91c4ea1f425d Mon Sep 17 00:00:00 2001 From: hypercross Date: Sun, 19 Apr 2026 14:40:41 +0800 Subject: [PATCH] feat(csv-loader): strip comments before parsing CSV Pre-filter comment lines from the content before passing it to `csv-parse`. This prevents quote parsing errors when comment lines contain double quotes and simplifies the record filtering logic. --- src/csv-loader/loader.test.ts | 18 +++++++++++++++ src/csv-loader/loader.ts | 43 ++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 18 deletions(-) diff --git a/src/csv-loader/loader.test.ts b/src/csv-loader/loader.test.ts index 5cd256c..906c9d7 100644 --- a/src/csv-loader/loader.test.ts +++ b/src/csv-loader/loader.test.ts @@ -40,6 +40,24 @@ describe("parseCsv - basic parsing", () => { expect(result.data[1]).toEqual({ id: 2, count: 3, price: 4.5 }); }); + it("should parse CSV with non-ASCII characters and comments", () => { + const csv = [ + '# id: unique intent state ID (e.g. "仙人掌怪-boost")', + "id", + "string", + "仙人掌怪-boost", + "仙人掌怪-defend", + "仙人掌怪-attack", + ].join("\n"); + + const result = parseCsv(csv, { emitTypes: false }); + + expect(result.data).toHaveLength(3); + expect(result.data[0]).toEqual({ id: "仙人掌怪-boost" }); + expect(result.data[1]).toEqual({ id: "仙人掌怪-defend" }); + expect(result.data[2]).toEqual({ id: "仙人掌怪-attack" }); + }); + it("should parse CSV with string literal columns (unquoted in CSV)", () => { const csv = [ "name,status", diff --git a/src/csv-loader/loader.ts b/src/csv-loader/loader.ts index 3eb6018..f13292a 100644 --- a/src/csv-loader/loader.ts +++ b/src/csv-loader/loader.ts @@ -789,34 +789,41 @@ export function parseCsv( const refBaseDir = options.refBaseDir; const defaultPrimaryKey = options.defaultPrimaryKey ?? "id"; - const records = parse(content, { + // Pre-strip comment lines from content before passing to csv-parse, + // to avoid quote parsing errors in comment lines containing double quotes. + const reverseReferences: ReverseReferenceDeclaration[] = []; + let filteredContent = content; + if (comment) { + const lines = content.split(/\r?\n/); + const nonCommentLines: string[] = []; + for (const line of lines) { + const trimmed = line.trim(); + if (trimmed.startsWith(comment)) { + const decl = parseReverseReferenceDeclaration(trimmed, comment); + if (decl) { + reverseReferences.push(decl); + } + // Skip comment lines + } else { + nonCommentLines.push(line); + } + } + filteredContent = nonCommentLines.join("\n"); + } + + const records = parse(filteredContent, { delimiter, quote, escape, bom, - // Don't let csv-parse skip comments; we need to parse them for reverse references. - // Comment lines are filtered out manually below using the configured comment character. comment: undefined, trim, skip_empty_lines: true, relax_column_count: true, }); - // Filter out comment lines from all records, collecting reverse reference declarations - const reverseReferences: ReverseReferenceDeclaration[] = []; - const filteredRecords: string[][] = []; - for (const row of records) { - const firstCell = (row[0] ?? "").trim(); - if (comment && firstCell.startsWith(comment)) { - const decl = parseReverseReferenceDeclaration(firstCell, comment); - if (decl) { - reverseReferences.push(decl); - } - // Skip comment lines (whether or not they're reverse ref declarations) - continue; - } - filteredRecords.push(row); - } + // Comment lines were already filtered out before parsing + const filteredRecords = records; if (filteredRecords.length < 2) { throw new Error("CSV must have at least 2 rows: headers and schemas");