From 9a2db5edb67470ef197974270f0c91c4ea1f425d Mon Sep 17 00:00:00 2001
From: hypercross <helios.tnak@gmail.com>
Date: Sun, 19 Apr 2026 14:40:41 +0800
Subject: [PATCH] feat(csv-loader): strip comments before parsing CSV

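Pre-filter comment lines from the content before passing it to
`csv-parse`. This prevents quote parsing errors when a comment line
contains double quotes, and it replaces the post-parse record filter.
Reverse reference declarations are now collected from the comment
lines during this pre-filter pass.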
---
 src/csv-loader/loader.test.ts | 18 +++++++++++++++
 src/csv-loader/loader.ts      | 43 ++++++++++++++++++++---------------
 2 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/src/csv-loader/loader.test.ts b/src/csv-loader/loader.test.ts
index 5cd256c..906c9d7 100644
--- a/src/csv-loader/loader.test.ts
+++ b/src/csv-loader/loader.test.ts
@@ -40,6 +40,24 @@ describe("parseCsv - basic parsing", () => {
     expect(result.data[1]).toEqual({ id: 2, count: 3, price: 4.5 });
   });
 
+  it("should parse CSV with non-ASCII characters and comments", () => {
+    const csv = [
+      '# id: unique intent state ID (e.g. "仙人掌怪-boost")',
+      "id",
+      "string",
+      "仙人掌怪-boost",
+      "仙人掌怪-defend",
+      "仙人掌怪-attack",
+    ].join("\n");
+
+    const result = parseCsv(csv, { emitTypes: false });
+
+    expect(result.data).toHaveLength(3);
+    expect(result.data[0]).toEqual({ id: "仙人掌怪-boost" });
+    expect(result.data[1]).toEqual({ id: "仙人掌怪-defend" });
+    expect(result.data[2]).toEqual({ id: "仙人掌怪-attack" });
+  });
+
   it("should parse CSV with string literal columns (unquoted in CSV)", () => {
     const csv = [
       "name,status",
diff --git a/src/csv-loader/loader.ts b/src/csv-loader/loader.ts
index 3eb6018..f13292a 100644
--- a/src/csv-loader/loader.ts
+++ b/src/csv-loader/loader.ts
@@ -789,34 +789,41 @@ export function parseCsv(
   const refBaseDir = options.refBaseDir;
   const defaultPrimaryKey = options.defaultPrimaryKey ?? "id";
 
-  const records = parse(content, {
+  // Pre-strip comment lines from content before passing to csv-parse,
+  // to avoid quote parsing errors in comment lines containing double quotes.
+  const reverseReferences: ReverseReferenceDeclaration[] = [];
+  let filteredContent = content;
+  if (comment) {
+    const lines = content.split(/\r?\n/);
+    const nonCommentLines: string[] = [];
+    for (const line of lines) {
+      const trimmed = line.trim();
+      if (trimmed.startsWith(comment)) {
+        const decl = parseReverseReferenceDeclaration(trimmed, comment);
+        if (decl) {
+          reverseReferences.push(decl);
+        }
+        // Skip comment lines
+      } else {
+        nonCommentLines.push(line);
+      }
+    }
+    filteredContent = nonCommentLines.join("\n");
+  }
+
+  const records = parse(filteredContent, {
     delimiter,
     quote,
     escape,
     bom,
-    // Don't let csv-parse skip comments; we need to parse them for reverse references.
-    // Comment lines are filtered out manually below using the configured comment character.
     comment: undefined,
     trim,
     skip_empty_lines: true,
     relax_column_count: true,
   });
 
-  // Filter out comment lines from all records, collecting reverse reference declarations
-  const reverseReferences: ReverseReferenceDeclaration[] = [];
-  const filteredRecords: string[][] = [];
-  for (const row of records) {
-    const firstCell = (row[0] ?? "").trim();
-    if (comment && firstCell.startsWith(comment)) {
-      const decl = parseReverseReferenceDeclaration(firstCell, comment);
-      if (decl) {
-        reverseReferences.push(decl);
-      }
-      // Skip comment lines (whether or not they're reverse ref declarations)
-      continue;
-    }
-    filteredRecords.push(row);
-  }
+  // Comment lines were already filtered out before parsing
+  const filteredRecords = records;
 
   if (filteredRecords.length < 2) {
     throw new Error("CSV must have at least 2 rows: headers and schemas");