From 47c3a70e6ed10f4db991d23ce19de22264db80ae Mon Sep 17 00:00:00 2001
From: postie <postie@noreply.codeberg.org>
Date: Thu, 12 Mar 2026 21:32:29 +0000
Subject: [PATCH] LinkPreview: cURL, response charset, size limit

---
 Digitigrade/LinkPreview.php | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/Digitigrade/LinkPreview.php b/Digitigrade/LinkPreview.php
index 706c3c7..f23b1c0 100644
--- a/Digitigrade/LinkPreview.php
+++ b/Digitigrade/LinkPreview.php
@@ -29,13 +29,38 @@ class LinkPreview implements \JsonSerializable {
             return false;
         }
 
-        // TODO: Respect Content-Type header from response, maybe set an Accept header in the request?
         // I was able to find a URL serving SHIFT-JIS, but they do not specify a charset in the Content-Type header.
         //    https://www2d.biglobe.ne.jp/~msyk/charcode/cp932/Windows-31J-charset.html
-        $c = @file_get_contents($this->url);
+        $ch = curl_init($this->url);
+        curl_setopt($ch, CURLOPT_HTTPHEADER, [
+            'Accept: text/html,application/xhtml+xml,application/xml',
+        ]);
+        curl_setopt($ch, CURLOPT_RANGE, "0-256000"); // request first 256KB of the page
+        curl_setopt($ch, CURLOPT_MAXFILESIZE, 8 * 1024 * 1024); // don't download more than 8MiB of HTML
+        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
+        $response_charset = null;
+        curl_setopt($ch, CURLOPT_HEADERFUNCTION, function($curl, $header) use (&$response_charset) {
+            $len = strlen($header); // cURL requires the callback to return the number of bytes it handled
+            $header = explode(':', $header, 2);
+            if (count($header) != 2)
+                return $len; // status line or blank separator line, not a "name: value" header
+            // Header names are case-insensitive, and the charset parameter value may be a quoted
+            // string, e.g. content-type: text/html; charset="utf-8" -- strip the quotes.
+            if (strtolower(trim($header[0])) == 'content-type') {
+                $vals = explode(';', $header[1]);
+                foreach($vals as $val) {
+                    $val = strtolower(trim($val));
+                    if(str_starts_with($val,"charset=")) {
+                        $response_charset = trim(substr($val, 8), " \t\"'") ?: null; // empty value => no override
+                    }
+                }
+            }
+            return $len;
+        });
+        $c = curl_exec($ch);
         if (!$c)
             return false;
-        $d = \Dom\HTMLDocument::createFromString($c, LIBXML_NOERROR);
+        $d = \Dom\HTMLDocument::createFromString($c, LIBXML_NOERROR, $response_charset);
 
         $url = $this->url;
         $siteName = hostname_from_uri($this->url);
-- 
2.47.3

