Skip to content

Commit f8e79bb

Browse files
update input data formats
1 parent bed6264 commit f8e79bb

4 files changed

Lines changed: 61 additions & 2 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
- [ ] Main functions
2020
- [x] Support both local deployment and API calls for LLM, VLM, and Embedding models.
21-
- [ ] Support diverse input formats: HTML, CSV, MARKDOWN, ...
21+
- [x] Support diverse input formats: HTML, CSV, MARKDOWN, ...
2222
- [ ] Support Image input.
2323
- [ ] Expand the table extraction module to support table types beyond problem definition.
2424

README.zh-CN.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
- [ ] 主要功能
2020
- [x] 支持LLM、VLM及嵌入模型的本地部署与API调用
21-
- [ ] 支持多样化输入格式:HTML、CSV、MARKDOWN等
21+
- [x] 支持多样化输入格式:HTML、CSV、MARKDOWN等
2222
- [ ] 支持图像输入
2323
- [ ] 扩展表格提取模块,支持问题定义之外的表格类型
2424

main.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,31 @@ def benchmark(
9797
##### 读入所有 Table 文件列表
9898
table_files = sorted(glob.glob(table_dir + "/*"))
9999

100+
# 处理不同格式的输入,即都转换为 Excel 格式
101+
new_table_files = []
102+
for table_file in table_files:
103+
last_dot_idx = os.path.basename(table_file).rfind('.')
104+
new_table_file = os.path.join(log_dir, os.path.basename(table_file)[:last_dot_idx] + '.xlsx')
105+
106+
if table_file.endswith(".xlsx"):
107+
pass
108+
elif table_file.endswith(".csv"):
109+
df = pd.read_csv(new_table_file)
110+
df.to_excel(new_table_file, index=False, engine='openpyxl')
111+
elif table_file.endswith(".html"):
112+
html_content = open(table_file).read()
113+
html2workbook(html_content).save(new_table_file)
114+
elif table_file.endswith(".md"):
115+
markdown_content = open(table_file).read()
116+
table = extract_markdown_tables(markdown_content)
117+
with pd.ExcelWriter(new_table_file, engine='openpyxl') as writer:
118+
sheet_name = f'sheet'
119+
df = pd.DataFrame(table[1:], columns=table[0])
120+
df.to_excel(writer, sheet_name=sheet_name, index=False)
121+
122+
new_table_files.append(new_table_file)
123+
table_files = new_table_files
124+
100125
##### 读取已经处理了的 QA Pair
101126
output_data = []
102127
qid_set = set()

utils/sheet_utils.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,40 @@ def sheet2markdown(sheet):
4545

4646
return markdown_table
4747

48+
49+
def _split_markdown_row(row):
    """Split one Markdown table row into stripped cells, keeping empty cells."""
    row = row.strip()
    # Remove only the outer delimiter pipes; dropping every empty fragment
    # (as a plain "filter out blanks" would) shifts the remaining cells left
    # and misaligns them with the header columns.
    if row.startswith('|'):
        row = row[1:]
    if row.endswith('|'):
        row = row[:-1]
    return [cell.strip() for cell in row.split('|')]


def extract_markdown_tables(content):
    """
    Extract the first pipe table found in a Markdown string.

    A table is a header row, a separator row made of ``-``, ``:``, ``|`` and
    spaces, and at least one data row; every row must start and end with
    ``|``. Escaped pipes (``\\|``) inside cells are not handled.

    Args:
        content (str): Markdown text, with ``\\n`` or ``\\r\\n`` line endings.

    Returns:
        list | str: The first table as a list of rows, header row first.
        Each row is a list of stripped cell strings with exactly as many
        entries as the header (short rows are padded with ``""``, surplus
        cells are dropped). Returns ``""`` when no table is found; the
        empty-string sentinel is kept for backward compatibility.
    """
    if not content:
        return ""

    # Windows line endings would otherwise leave a '\r' between the closing
    # pipe and the '\n', so no row would ever match.
    content = content.replace('\r\n', '\n')

    table_pattern = (
        r'(\|.+\|)[ \t]*\n'                 # header row
        r'\|[-:| ]+\|[ \t]*\n'              # separator row
        r'((?:\|.+\|[ \t]*(?:\n|\Z))+)'     # data rows; last one may lack '\n'
    )
    match = re.search(table_pattern, content)
    if match is None:
        return ""

    headers = _split_markdown_row(match.group(1))
    width = len(headers)

    table_data = [headers]
    for row in match.group(2).strip().split('\n'):
        cells = _split_markdown_row(row)
        if not any(cells):  # skip rows that contain no text at all
            continue
        # Force every row to the header width so the result can be loaded
        # directly into a rectangular structure such as a DataFrame.
        table_data.append((cells + [''] * width)[:width])

    return table_data
81+
4882
# Convert an HTML table into an Excel workbook
4983
def html2workbook(html_content):
5084
# 使用 BeautifulSoup 解析 HTML

0 commit comments

Comments
 (0)