From 517bbc281a9e9046bbe372be18a91a2428751080 Mon Sep 17 00:00:00 2001
From: jyong <718720800@qq.com>
Date: Mon, 4 Nov 2024 15:31:20 +0800
Subject: [PATCH] fix SSRF in the docx file extractor's external image handling

---
 api/core/helper/ssrf_proxy.py            | 29 ++++++++++++------------
 api/core/rag/extractor/word_extractor.py |  1 -
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/api/core/helper/ssrf_proxy.py b/api/core/helper/ssrf_proxy.py
index 6793e41978..b13aaab04b 100644
--- a/api/core/helper/ssrf_proxy.py
+++ b/api/core/helper/ssrf_proxy.py
@@ -31,24 +31,25 @@ def make_request(method, url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs):
         allow_redirects = kwargs.pop("allow_redirects")
         if "follow_redirects" not in kwargs:
             kwargs["follow_redirects"] = allow_redirects
-
+    stream = kwargs.pop("stream", False)
     retries = 0
     while retries <= max_retries:
         try:
-            if SSRF_PROXY_ALL_URL:
-                with httpx.Client(proxy=SSRF_PROXY_ALL_URL) as client:
-                    response = client.request(method=method, url=url, **kwargs)
-            elif proxy_mounts:
-                with httpx.Client(mounts=proxy_mounts) as client:
-                    response = client.request(method=method, url=url, **kwargs)
-            else:
-                with httpx.Client() as client:
-                    response = client.request(method=method, url=url, **kwargs)
+            client_args = {"proxy": SSRF_PROXY_ALL_URL} if SSRF_PROXY_ALL_URL else {}
+            if proxy_mounts:
+                client_args["mounts"] = proxy_mounts
 
-            if response.status_code not in STATUS_FORCELIST:
-                return response
-            else:
-                logging.warning(f"Received status code {response.status_code} for URL {url} which is in the force list")
+            with httpx.Client(**client_args) as client:
+                response = client.request(method=method, url=url, **kwargs)
+
+                if response.status_code not in STATUS_FORCELIST:
+                    if stream:
+                        return response.iter_bytes()
+                    return response
+                else:
+                    logging.warning(
+                        f"Received status code {response.status_code} for URL {url} which is in the force list"
+                    )
 
         except httpx.RequestError as e:
             logging.warning(f"Request to URL {url} failed on attempt {retries + 1}: {e}")
diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py
index d4434ea28f..0a76036d87 100644
--- a/api/core/rag/extractor/word_extractor.py
+++ b/api/core/rag/extractor/word_extractor.py
@@ -81,7 +81,6 @@ class WordExtractor(BaseExtractor):
         os.makedirs(image_folder, exist_ok=True)
         image_count = 0
         image_map = {}
-
         for rel in doc.part.rels.values():
             if "image" in rel.target_ref:
                 image_count += 1