update input data formats

RaymondTang2003 · RaymondTang2003 · commit f8e79bb9462d · 2025-09-19T19:18:49.000+08:00
diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@
 
 - [ ] Main functionss
   - [x] Support both local deployment or API calls for LLM, VLM, and Embedding models.
-  - [ ] Support diverse input formats: HTML, CSV, MARKDOWN, ...
+  - [x] Support diverse input formats: HTML, CSV, MARKDOWN, ...
   - [ ] Support Image input.
   - [ ] Expand the table extraction module to support table types beyond problem definition.
 
diff --git a/README.zh-CN.md b/README.zh-CN.md
@@ -18,7 +18,7 @@
 
 - [ ] 主要功能
   - [x] 支持LLM、VLM及嵌入模型的本地部署与API调用
-  - [ ] 支持多样化输入格式：HTML、CSV、MARKDOWN等
+  - [x] 支持多样化输入格式：HTML、CSV、MARKDOWN等
   - [ ] 支持图像输入
   - [ ] 扩展表格提取模块，支持问题定义之外的表格类型
 
diff --git a/main.py b/main.py
@@ -97,6 +97,31 @@ def benchmark(
     ##### 读入所有 Table 文件列表
     table_files = sorted(glob.glob(table_dir + "/*"))
 
+    # 处理不同格式的输入，即都转换为 Excel 格式
+    new_table_files = []
+    for table_file in table_files:
+        last_dot_idx = os.path.basename(table_file).rfind('.')
+        new_table_file = os.path.join(log_dir, os.path.basename(table_file)[:last_dot_idx] + '.xlsx')
+
+        if table_file.endswith(".xlsx"):
+            pass
+        elif table_file.endswith(".csv"):
+            df = pd.read_csv(new_table_file)
+            df.to_excel(new_table_file, index=False, engine='openpyxl')
+        elif table_file.endswith(".html"):
+            html_content = open(table_file).read()
+            html2workbook(html_content).save(new_table_file)
+        elif table_file.endswith(".md"):
+            markdown_content = open(table_file).read()
+            table = extract_markdown_tables(markdown_content)
+            with pd.ExcelWriter(new_table_file, engine='openpyxl') as writer:
+                sheet_name = f'sheet'
+                df = pd.DataFrame(table[1:], columns=table[0])
+                df.to_excel(writer, sheet_name=sheet_name, index=False)
+        
+        new_table_files.append(new_table_file)
+    table_files = new_table_files
+
     ##### 读取已经处理了的 QA Pair
     output_data = []
     qid_set = set()
diff --git a/utils/sheet_utils.py b/utils/sheet_utils.py
@@ -45,6 +45,40 @@ def sheet2markdown(sheet):
         
     return markdown_table
 
+
+def extract_markdown_tables(content):
+    """
+    Extract the first pipe-delimited table found in Markdown text.
+
+    Args:
+        content (str): Markdown text to scan.
+    Returns:
+        list[list[str]] | str: Rows of the first table (header row first), or
+            "" when no table is found. Empty cells are kept as "" so that
+            every row stays aligned with the header columns.
+    """
+    # A table is a header line, a separator line (|---|:-:|) and >= 1 data
+    # rows. The last row may end at end-of-input instead of a newline.
+    table_pattern = (
+        r'^[ \t]*(\|.+\|)[ \t]*\n'
+        r'[ \t]*\|[-:| ]+\|[ \t]*\n'
+        r'((?:[ \t]*\|.+\|[ \t]*(?:\n|\Z))+)'
+    )
+    match = re.search(table_pattern, content, re.MULTILINE)
+    if match is None:
+        return ""
+
+    def split_row(row):
+        # Drop only the two outer pipes; inner empty cells keep their slot.
+        return [cell.strip() for cell in row.strip()[1:-1].split('|')]
+
+    table_data = [split_row(match.group(1))]
+    for row in match.group(2).strip().split('\n'):
+        cells = split_row(row)
+        if any(cells):  # skip rows that contain no text at all
+            table_data.append(cells)
+    return table_data
+
 # 将HTMl表格转换为Excel表格
 def html2workbook(html_content):
     # 使用 BeautifulSoup 解析 HTML