From 48e4b71dd10eb94c83a5ff641358a3d8b046461e Mon Sep 17 00:00:00 2001
From: strawberry <strawberry@puppygock.gay>
Date: Sat, 10 Feb 2024 13:29:12 -0500
Subject: [PATCH] replace hardcoded 300kb spider size limit with a config
 option (1MB default)

Modern websites are sadly massive, so the hardcoded 300kB (300_000 bytes) limit is pretty low. A configurable limit defaulting to 1MB (1_000_000 bytes) should be enough.

Signed-off-by: strawberry <strawberry@puppygock.gay>
---
 conduwuit-example.toml         |  3 +++
 src/api/client_server/media.rs |  5 ++---
 src/config/mod.rs              | 12 ++++++++++--
 src/service/globals/mod.rs     |  4 ++++
 4 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/conduwuit-example.toml b/conduwuit-example.toml
index 0f02a2004..376c19d1e 100644
--- a/conduwuit-example.toml
+++ b/conduwuit-example.toml
@@ -180,6 +180,9 @@ url_preview_domain_explicit_allowlist = []
 # Setting this to "*" will allow all URL previews. Please note that this opens up significant attack surface to your server, you are expected to be aware of the risks by doing so.
 url_preview_url_contains_allowlist = []
 
+# Maximum number of bytes of a response body to read when spidering a URL for a preview; the rest of the body is not processed once this is exceeded. Defaults to 1MB (1_000_000 bytes)
+url_preview_max_spider_size = 1_000_000
+
 
 
 ### Misc
diff --git a/src/api/client_server/media.rs b/src/api/client_server/media.rs
index d3e22c28d..3e42e6727 100644
--- a/src/api/client_server/media.rs
+++ b/src/api/client_server/media.rs
@@ -342,14 +342,13 @@ async fn download_image(client: &reqwest::Client, url: &str) -> Result<UrlPrevie
 }
 
 async fn download_html(client: &reqwest::Client, url: &str) -> Result<UrlPreviewData> {
-    let max_download_size = 300_000; // TODO: is this bytes? kilobytes? megabytes?
-
     let mut response = client.get(url).send().await?;
 
     let mut bytes: Vec<u8> = Vec::new();
     while let Some(chunk) = response.chunk().await? {
         bytes.extend_from_slice(&chunk);
-        if bytes.len() > max_download_size {
+        if bytes.len() > services().globals.url_preview_max_spider_size() {
+            debug!("Response body from URL {} exceeds url_preview_max_spider_size ({}), not processing the rest of the response body and assuming our necessary data is in this range.", url, services().globals.url_preview_max_spider_size());
             break;
         }
     }
diff --git a/src/config/mod.rs b/src/config/mod.rs
index ddb1c654c..6f6e24e04 100644
--- a/src/config/mod.rs
+++ b/src/config/mod.rs
@@ -136,12 +136,12 @@ pub struct Config {
 
     #[serde(default = "Vec::new")]
     pub url_preview_domain_contains_allowlist: Vec<String>,
-
     #[serde(default = "Vec::new")]
     pub url_preview_domain_explicit_allowlist: Vec<String>,
-
     #[serde(default = "Vec::new")]
     pub url_preview_url_contains_allowlist: Vec<String>,
+    #[serde(default = "default_url_preview_max_spider_size")]
+    pub url_preview_max_spider_size: usize,
 
     #[serde(default = "RegexSet::empty")]
     #[serde(with = "serde_regex")]
@@ -370,6 +370,10 @@ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                 "URL preview URL contains allowlist",
                 &self.url_preview_url_contains_allowlist.join(", "),
             ),
+            (
+                "URL preview maximum spider size",
+                &self.url_preview_max_spider_size.to_string(),
+            ),
         ];
 
         let mut msg: String = "Active config values:\n\n".to_owned();
@@ -495,3 +499,7 @@ fn default_ip_range_denylist() -> Vec<String> {
         "fec0::/10".to_owned(),
     ]
 }
+
+fn default_url_preview_max_spider_size() -> usize {
+    1_000_000 // 1MB
+}
diff --git a/src/service/globals/mod.rs b/src/service/globals/mod.rs
index b3f0557fa..0944cedc5 100644
--- a/src/service/globals/mod.rs
+++ b/src/service/globals/mod.rs
@@ -412,6 +412,10 @@ pub fn url_preview_url_contains_allowlist(&self) -> &Vec<String> {
         &self.config.url_preview_url_contains_allowlist
     }
 
+    pub fn url_preview_max_spider_size(&self) -> usize {
+        self.config.url_preview_max_spider_size
+    }
+
     pub fn forbidden_room_names(&self) -> &RegexSet {
         &self.config.forbidden_room_names
     }
-- 
GitLab