From 517bbc281a9e9046bbe372be18a91a2428751080 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Mon, 4 Nov 2024 15:31:20 +0800 Subject: [PATCH] fix the ssrf of docx file extractor external images --- api/core/helper/ssrf_proxy.py | 29 ++++++++++++------------ api/core/rag/extractor/word_extractor.py | 1 - 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/api/core/helper/ssrf_proxy.py b/api/core/helper/ssrf_proxy.py index 6793e41978..b13aaab04b 100644 --- a/api/core/helper/ssrf_proxy.py +++ b/api/core/helper/ssrf_proxy.py @@ -31,24 +31,25 @@ def make_request(method, url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs): allow_redirects = kwargs.pop("allow_redirects") if "follow_redirects" not in kwargs: kwargs["follow_redirects"] = allow_redirects - + stream = kwargs.pop("stream", False) retries = 0 while retries <= max_retries: try: - if SSRF_PROXY_ALL_URL: - with httpx.Client(proxy=SSRF_PROXY_ALL_URL) as client: - response = client.request(method=method, url=url, **kwargs) - elif proxy_mounts: - with httpx.Client(mounts=proxy_mounts) as client: - response = client.request(method=method, url=url, **kwargs) - else: - with httpx.Client() as client: - response = client.request(method=method, url=url, **kwargs) + client_args = {"proxy": SSRF_PROXY_ALL_URL} if SSRF_PROXY_ALL_URL else {} + if proxy_mounts: + client_args["mounts"] = proxy_mounts - if response.status_code not in STATUS_FORCELIST: - return response - else: - logging.warning(f"Received status code {response.status_code} for URL {url} which is in the force list") + with httpx.Client(**client_args) as client: + response = client.request(method=method, url=url, **kwargs) + + if response.status_code not in STATUS_FORCELIST: + if stream: + return response.iter_bytes() + return response + else: + logging.warning( + f"Received status code {response.status_code} for URL {url} which is in the force list" + ) except httpx.RequestError as e: logging.warning(f"Request to URL {url} failed on attempt {retries + 1}: {e}") diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py index d4434ea28f..0a76036d87 100644 --- a/api/core/rag/extractor/word_extractor.py +++ b/api/core/rag/extractor/word_extractor.py @@ -81,7 +81,6 @@ class WordExtractor(BaseExtractor): os.makedirs(image_folder, exist_ok=True) image_count = 0 image_map = {} - for rel in doc.part.rels.values(): if "image" in rel.target_ref: image_count += 1