diff --git a/src/main/java/com/torchv/infra/unstructured/UnstructuredParser.java b/src/main/java/com/torchv/infra/unstructured/UnstructuredParser.java
index e4cf1cd03f6bd88963fefbdda6700f999e6ba521..53d453c654b787a364b1d3d924d54f869a5c7032 100644
--- a/src/main/java/com/torchv/infra/unstructured/UnstructuredParser.java
+++ b/src/main/java/com/torchv/infra/unstructured/UnstructuredParser.java
@@ -96,6 +96,22 @@ public class UnstructuredParser {
         }
         return toMarkdown(file.getAbsolutePath());
     }
+
+    /**
+     * Converts a document supplied as an input stream into a Markdown string.
+     *
+     * @param inputStream input stream carrying the document content
+     * @param fileName    file name, used only to detect the document format
+     * @return the converted Markdown string
+     * @throws UnsupportedOperationException if the file name does not have a supported extension
+     */
+    public static String toMarkdown(InputStream inputStream, String fileName) {
+        // Detect the format from the file name; only Word extensions are accepted here.
+        if (!isWordExtension(fileName)) {
+            throw new UnsupportedOperationException("暂不支持的文件格式: " + fileName);
+        }
+        return UnstructuredWord.toMarkdown(inputStream, fileName);
+    }
 
     /**
      * 解析文档为Markdown格式(保留表格的HTML结构)
@@ -160,7 +176,7 @@ public class UnstructuredParser {
      */
     public static DocumentResult toStructuredResult(InputStream inputStream, String fileName) {
         // 根据文件名检测格式
-        if (isWordDocument(fileName)) {
+        if (isWordExtension(fileName)) {
             return UnstructuredWord.toStructuredResult(inputStream, fileName);
         }
         // TODO: 添加其他格式的支持
diff --git a/src/main/java/com/torchv/infra/unstructured/util/UnstructuredUtils.java b/src/main/java/com/torchv/infra/unstructured/util/UnstructuredUtils.java
index 722cd681d6c5828fe30e099361eabd575b74f27a..ef269d538aeaf22fe49f57cae1b1fcda50d434ef 100644
--- a/src/main/java/com/torchv/infra/unstructured/util/UnstructuredUtils.java
+++ b/src/main/java/com/torchv/infra/unstructured/util/UnstructuredUtils.java
@@ -84,8 +84,22 @@ public class UnstructuredUtils {
             return false;
         }
 
-        String fileName = file.getName().toLowerCase();
-        return fileName.endsWith(".docx") || fileName.endsWith(".doc");
+        return isWordExtension(file.getName());
+    }
+
+    /**
+     * Checks whether a file name carries a Word document extension.
+     *
+     * @param fileName file name to inspect; may be {@code null}
+     * @return {@code true} if the extension is docx or doc (compared in lower case), {@code false} otherwise
+     */
+    public static boolean isWordExtension(String fileName) {
+        // A missing name cannot identify a Word document; answer false rather than throw an NPE.
+        if (fileName == null) {
+            return false;
+        }
+        String extension = getFileExtension(fileName.toLowerCase());
+        return "docx".equals(extension) || "doc".equals(extension);
     }
 
     /**
diff --git a/src/test/java/com/torchv/infra/unstructured/WordParserTest.java b/src/test/java/com/torchv/infra/unstructured/WordParserTest.java
index b58502725b53d90fb746ec8671e53e6ca0510a4b..6f9348acbbc1a9a23d398113e9b9d1d967d46df3 100644
--- a/src/test/java/com/torchv/infra/unstructured/WordParserTest.java
+++ b/src/test/java/com/torchv/infra/unstructured/WordParserTest.java
@@ -17,12 +17,17 @@
 
 package com.torchv.infra.unstructured;
 
+import cn.hutool.core.io.FileUtil;
 import com.torchv.infra.unstructured.core.DocumentResult;
 import lombok.extern.slf4j.Slf4j;
 import org.junit.Test;
 
+import java.io.BufferedInputStream;
+import java.io.File;
 import java.util.List;
 
+import static org.junit.Assert.assertNotNull;
+
 /**
  * @author xiaoymin@foxmail.com
  * 2025/7/19 23:11
@@ -86,4 +91,39 @@ public class WordParserTest {
 
         // 获取结构化结果,提供更多控制
     }
+
+    /**
+     * Verifies that a .doc file supplied as an input stream is converted to Markdown.
+     *
+     * @throws Exception if the sample document cannot be read or its stream cannot be closed
+     */
+    @Test
+    public void test_parse_4() throws Exception {
+        String filePath = "src/test/resources/docs/test.doc";
+        String name = FileUtil.getName(filePath);
+        // The test opened the stream, so the test closes it (try-with-resources).
+        try (BufferedInputStream inputStream = FileUtil.getInputStream(new File(filePath))) {
+            String content = UnstructuredParser.toMarkdown(inputStream, name);
+            assertNotNull(content);
+            log.info(content);
+        }
+    }
+
+    /**
+     * Verifies that a .docx file supplied as an input stream yields a structured result.
+     *
+     * @throws Exception if the sample document cannot be read or its stream cannot be closed
+     */
+    @Test
+    public void test_structured_result_by_stream() throws Exception {
+        String filePath = "src/test/resources/docs/test.docx";
+        String name = FileUtil.getName(filePath);
+        try (BufferedInputStream inputStream = FileUtil.getInputStream(new File(filePath))) {
+            DocumentResult structuredResult = UnstructuredParser.toStructuredResult(inputStream, name);
+            // Assert before dereferencing so a null result fails the test instead of raising an NPE.
+            assertNotNull(structuredResult);
+            log.info(structuredResult.getContent());
+        }
+    }
+
 }