|
1 | 1 | # Copyright (C) 2024 Intel Corporation |
2 | 2 | # SPDX-License-Identifier: Apache-2.0 |
3 | 3 |
|
4 | | -import asyncio |
5 | 4 | import base64 |
6 | 5 | import os |
7 | 6 | import subprocess |
@@ -55,7 +54,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k |
55 | 54 | return inputs |
56 | 55 |
|
57 | 56 |
|
58 | | -def read_pdf(file): |
| 57 | +def read_pdf(file: str): |
59 | 58 | from langchain.document_loaders import PyPDFLoader |
60 | 59 |
|
61 | 60 | loader = PyPDFLoader(file) |
@@ -101,29 +100,50 @@ def video2audio( |
101 | 100 | return audio_base64 |
102 | 101 |
|
103 | 102 |
|
async def read_text_from_file(file: "UploadFile"):
    """Extract the text content of an uploaded document.

    The parser is selected from the upload's ``content-type`` header. Plain
    text, PDF and DOCX are supported; ``application/octet-stream`` is handled
    as DOCX.

    Args:
        file: The uploaded file object. It must expose ``headers`` and the
            awaitable ``seek``/``read`` methods.

    Returns:
        A list of text chunks for plain-text uploads, a list of page texts for
        PDF uploads, the extracted text as a single string for DOCX uploads,
        or ``None`` when the content type is missing or unsupported.
    """
    docx_types = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/octet-stream",
    )

    # .get() so an upload without a content-type header is rejected like any
    # other unsupported type instead of raising KeyError.
    ctype = file.headers.get("content-type")
    if ctype not in ("text/plain", "application/pdf", *docx_types):
        return None

    # Rewind first: the upload may already have been consumed by the caller,
    # in which case a plain read() would yield empty content.
    await file.seek(0)
    raw = await file.read()

    if ctype == "text/plain":
        # Imported lazily so unsupported uploads never pay for the import.
        from langchain.text_splitter import CharacterTextSplitter

        # Split text to multiple documents
        return CharacterTextSplitter().split_text(raw.decode("utf-8"))

    import aiofiles.tempfile

    # The PDF and DOCX parsers need a path on disk. The context manager closes
    # (and thereby removes) the temp file on exit, including on errors, so no
    # explicit close() is needed.
    async with aiofiles.tempfile.NamedTemporaryFile() as tmp:
        await tmp.write(raw)
        # Make sure the bytes are on disk before a parser opens the path.
        await tmp.flush()

        if ctype == "application/pdf":
            return [doc.page_content for doc in read_pdf(tmp.name)]

        # Only the DOCX-like content types remain at this point.
        import docx2txt

        return docx2txt.process(tmp.name)
129 | 149 |
|
@@ -188,25 +208,14 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File( |
188 | 208 | file_summaries = [] |
189 | 209 | if files: |
190 | 210 | for file in files: |
191 | | - # Fix concurrency issue with the same file name |
192 | | - # https://github.com/opea-project/GenAIExamples/issues/1279 |
193 | | - uid = str(uuid.uuid4()) |
194 | | - file_path = f"/tmp/{uid}" |
195 | | - |
196 | 211 | if data_type is not None and data_type in ["audio", "video"]: |
197 | 212 | raise ValueError( |
198 | 213 | "Audio and Video file uploads are not supported in docsum with curl request, \ |
199 | 214 | please use the UI or pass base64 string of the content directly." |
200 | 215 | ) |
201 | 216 |
|
202 | 217 | else: |
203 | | - import aiofiles |
204 | | - |
205 | | - async with aiofiles.open(file_path, "wb") as f: |
206 | | - await f.write(await file.read()) |
207 | | - |
208 | | - docs = read_text_from_file(file, file_path) |
209 | | - os.remove(file_path) |
| 218 | + docs = await read_text_from_file(file) |
210 | 219 |
|
211 | 220 | if isinstance(docs, list): |
212 | 221 | file_summaries.extend(docs) |
|
0 commit comments