Added OCR and OpenAPI tools

This commit is contained in:
2025-06-09 16:27:14 +02:00
parent 20fca31ced
commit 67197c5c48
12 changed files with 402 additions and 2 deletions

View File

@ -81,3 +81,8 @@ VITE_REVERB_APP_KEY="${REVERB_APP_KEY}"
VITE_REVERB_HOST="${REVERB_HOST}"
VITE_REVERB_PORT="${REVERB_PORT}"
VITE_REVERB_SCHEME="${REVERB_SCHEME}"
# AI LLM
LLM_HOST_URL="https://openai.com/api"
LLM_CHAT_MODEL="gpt-4o"
LLM_VISION_MODEL="gpt-4o-vision-preview"

View File

@ -54,6 +54,8 @@ RUN apk update && apk add --no-cache \
openssl \
linux-headers \
supervisor \
tesseract-ocr \
ffmpeg \
&& rm -rf /tmp/* /var/cache/apk/*
RUN docker-php-ext-configure zip && docker-php-ext-install zip

View File

@ -0,0 +1,10 @@
<?php
namespace App\AIPrompt;
interface IAIPrompt
{
public function generate(string $model, string $prompt, array $images = [], ?string $outputFormat = "json", ?string $systemMessage = null, bool $keepAlive = true, bool $shouldThink = false): string;
//public function chat(string $model, string $prompt, array $images = []): string;
}
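Consumers depend only on this contract, so swapping in a test double is straightforward. A minimal sketch (FakeAIPrompt and its canned response are hypothetical, not part of this commit):

<?php
namespace App\AIPrompt;

// Hypothetical stub for tests: returns a canned answer instead of calling an API.
class FakeAIPrompt implements IAIPrompt
{
    public function __construct(private string $cannedResponse = '{"answer": "ok"}') {}

    public function generate(string $model, string $prompt, array $images = [], ?string $outputFormat = "json", ?string $systemMessage = null, bool $keepAlive = true, bool $shouldThink = false): string
    {
        return $this->cannedResponse;
    }
}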

View File

@ -0,0 +1,129 @@
<?php
namespace App\AIPrompt;
/**
* Use an Ollama-compatible HTTP API (POST /api/generate) to get answers from a model.
*/
class OpenAPIPrompt implements IAIPrompt
{
private string $host;
public function __construct(?string $host = null) {
$this->host = $host ?? config('llm.host');
}
private function getHeaders(): array
{
return [
'Content-Type' => 'application/json'
];
}
/**
* Call the OpenAI API with the given endpoint and body.
* @param string $endpoint
* @param string $body
* @throws \Exception
* @return string
*/
private function callAPI(string $endpoint, string $body): string
{
$url = $this->host . $endpoint;
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, $this->getHeaders());
curl_setopt($ch, CURLOPT_POST, true);
curl_setopt($ch, CURLOPT_POSTFIELDS, $body);
$response = curl_exec($ch);
if ($response === false) {
$error = curl_error($ch);
curl_close($ch);
throw new \Exception("Error calling the LLM API: $error");
}
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);
if ($httpCode !== 200) {
throw new \Exception("Error calling the LLM API: HTTP $httpCode - $response");
}
return $response;
}
/**
* Call the generate endpoint (POST /api/generate) to produce a response to a prompt.
* @param string $model
* @param string $prompt
* @param array $images Paths to local image files; they are base64-encoded before sending.
* @param string|null $outputFormat "json" or a JSON schema string
* @param string|null $systemMessage
* @param bool $keepAlive
* @param bool $shouldThink
* @return string The "response" field of the API reply, or an empty string.
*/
public function generate(string $model, string $prompt, array $images = [], ?string $outputFormat = null, ?string $systemMessage = null, bool $keepAlive = true, bool $shouldThink = false): string
{
/*
API reference (Ollama-style): Generate a completion
POST /api/generate
Generates a response for a given prompt with a provided model. This is a streaming
endpoint by default, so there will be a series of responses; the final response object
includes statistics and additional data about the request.
Parameters:
model: (required) the model name
prompt: the prompt to generate a response for
suffix: the text after the model response
images: (optional) a list of base64-encoded images (for multimodal models such as llava)
think: (for thinking models) whether the model should think before responding
Advanced parameters (optional):
format: the format to return a response in; either "json" or a JSON schema
options: additional model parameters listed in the Modelfile documentation, such as temperature
system: system message (overrides what is defined in the Modelfile)
template: the prompt template to use (overrides what is defined in the Modelfile)
stream: if false, the response is returned as a single object rather than a stream of objects
raw: if true, no formatting is applied to the prompt; useful when passing a fully templated prompt
keep_alive: how long the model stays loaded in memory after the request (default: 5m)
context: (deprecated) the context returned from a previous /generate request, usable as short conversational memory
Structured outputs: provide a JSON schema in the format parameter and the model will
generate a response matching that schema. JSON mode: set format to "json" to force a
valid JSON object. In both cases it is important to instruct the model to use JSON in
the prompt; otherwise the model may generate large amounts of whitespace.
*/
// Transform local image files to base64
foreach ($images as &$image) {
if (file_exists($image)) {
$image = base64_encode(file_get_contents($image));
}
}
unset($image); // break the reference left by the foreach
$body = [
'model' => $model,
'prompt' => $prompt,
'images' => $images,
'think' => $shouldThink,
'stream' => false,
];
if ($systemMessage !== null) {
$body['system'] = $systemMessage;
}
if ($outputFormat !== null) {
// "json" passes through as-is; anything else is treated as a JSON schema string
$body['format'] = $outputFormat === 'json' ? 'json' : json_decode($outputFormat);
}
if (!$keepAlive) {
$body['keep_alive'] = "0m";
}
$body = json_encode($body);
$response = $this->callAPI('/api/generate', $body);
$decodedResponse = json_decode($response, true);
if (json_last_error() !== JSON_ERROR_NONE) {
throw new \Exception("Error decoding JSON response: " . json_last_error_msg());
}
return $decodedResponse['response'] ?? '';
}
}
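A minimal usage sketch of the class above (the host URL, model name, and question are illustrative; the schema mirrors the one used by the video descriptor later in this commit):

// Hypothetical call: request a structured answer via a JSON schema.
$llm = new OpenAPIPrompt('http://localhost:11434'); // assumed Ollama-style host
$schema = '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}';
$raw = $llm->generate(
    'gpt-4o',                          // any model name the host serves
    'What is the capital of France?',
    outputFormat: $schema,             // sent as the "format" field of the request
    systemMessage: 'Answer directly.',
    keepAlive: false,                  // unload the model right after the request
);
$answer = json_decode($raw, true)['answer'] ?? null;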

View File

@ -0,0 +1,12 @@
<?php
namespace App\Browser\Jobs\InstagramRepost;
class OCRLLMReelDescriptor extends \App\FileTools\VideoDescriptor\OCRLLMVideoDescriptor
{
public const DESCRIPTION_PROMPT = "Describe the Instagram reel based on the screenshots. Each screenshot has a timestamp of when in the video the screenshot was taken, an OCR result and a description of the screenshot by an LLM. Do not specify that it is a reel, just try to describe the video and most importantly the joke behind it if there is one. The description must have a maximum of 500 words.\n";
}

View File

@ -0,0 +1,14 @@
<?php
namespace App\FileTools\OCR;
interface IImageOCR
{
/**
* Perform OCR on the given file.
*
* @param string $filePath The path to the file to be processed.
* @return string The extracted text from the file.
*/
public function performOCR(string $filePath): string;
}

View File

@ -0,0 +1,15 @@
<?php
namespace App\FileTools\OCR;
use thiagoalessio\TesseractOCR\TesseractOCR;
class TesseractImageOCR implements IImageOCR
{
/**
* @inheritDoc
*/
public function performOCR(string $filePath): string {
$tesseract = new TesseractOCR($filePath);
return $tesseract->run();
}
}
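The wrapper shells out to the tesseract binary installed in the Dockerfile above. A hedged sketch of a variant that pins the recognition language (lang() is a real method of the thiagoalessio/tesseract_ocr wrapper; the class name and the 'eng' choice are assumptions):

<?php
namespace App\FileTools\OCR;
use thiagoalessio\TesseractOCR\TesseractOCR;

// Hypothetical variant: force English instead of Tesseract's default language.
class EnglishTesseractImageOCR implements IImageOCR
{
    public function performOCR(string $filePath): string
    {
        return (new TesseractOCR($filePath))
            ->lang('eng') // requires the corresponding tesseract language data files
            ->run();
    }
}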

View File

@ -0,0 +1,14 @@
<?php
namespace App\FileTools\VideoDescriptor;
interface IVideoDescriptor
{
/**
* Get the video description.
*
* @param string $filePath The path to the video file.
* @return string The description of the video.
*/
public function getDescription(string $filePath): ?string;
}

View File

@ -0,0 +1,117 @@
<?php
namespace App\FileTools\VideoDescriptor;
use App\AIPrompt\IAIPrompt;
use App\AIPrompt\OpenAPIPrompt;
use App\FileTools\OCR\IImageOCR;
use App\FileTools\OCR\TesseractImageOCR;
use Illuminate\Support\Facades\Log;
class OCRLLMVideoDescriptor implements IVideoDescriptor
{
private IImageOCR $ocr;
private IAIPrompt $llm; // LLM That can visualize images and generate descriptions
public const DESCRIPTION_PROMPT = "Describe the video based on the screenshots. Each screenshot has a timestamp of when in the video the screenshot was taken, an OCR result and a description of the screenshot by an LLM. Do not specify that it is a video, just describe the video. The description must have a maximum of 500 words.\n";
public function __construct() {
$this->ocr = new TesseractImageOCR();
$this->llm = new OpenAPIPrompt();
}
public function getDescription(string $filePath): ?string
{
/*
1. Cut videos in screenshots
2. Use OCR to extract text from screenshots
3. Use LLM to generate a description of the screenshot
4. Combine the descriptions of all screenshots into a single description
5. Ask an LLM to describe the video
*/
// Step 1: Cut video into screenshots
$screenshots = $this->cutVideoIntoScreenshots($filePath);
if (empty($screenshots)) {
throw new \Exception("No screenshots were generated from the video {$filePath}.");
}
// Step 2 & 3: Use OCR to extract text and LLM to get description from screenshots
$descriptions = [];
foreach ($screenshots as $screenshot) {
$descriptions[$screenshot] = [];
$ocrDescription = $this->ocr->performOCR($screenshot);
$ocrDescription = empty($ocrDescription) ? 'No text found' : $ocrDescription;
$descriptions[$screenshot]['ocr'] = $ocrDescription;
$llmDescription = $this->llm->generate(
config('llm.models.vision.name'),
"Describe the content of this screenshot from a video. Do not specify that it is a screenshot, just describe the content.",
images: [$screenshot],
outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
systemMessage: "The user will ask something. Give your direct answer to that.",
keepAlive: $screenshot != end($screenshots), // Keep alive for all but the last screenshot
shouldThink: config('llm.models.vision.shouldThink')
);
$descriptions[$screenshot]['text'] = json_decode($llmDescription, true)['answer'] ?? 'No description generated';
}
// HERE COULD BE SOME INTERMEDIATE PROCESSING OF DESCRIPTIONS
// Step 4: Combine the descriptions of all screenshots into a single description
$combinedDescription = '';
$screenshotCount = 0;
foreach ($descriptions as $screenshot => $description) {
$screenshotCount++;
$combinedDescription .= "Screenshot: {$screenshotCount}\n";
$combinedDescription .= "Timestamp: {$screenshotCount}s\n"; // TODO Cut the video in smaller parts when the video is short
$combinedDescription .= "OCR: {$description['ocr']}\n";
$combinedDescription .= "LLM Description: {$description['text']}\n\n";
}
$combinedDescription = trim($combinedDescription);
// Step 5: Ask an LLM to describe the video based on the combined descriptions
$llmDescription = $this->llm->generate(
config('llm.models.chat.name'),
static::DESCRIPTION_PROMPT . $combinedDescription, // static:: so subclasses like OCRLLMReelDescriptor can override the prompt
outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
systemMessage: "The user will ask something. Give your direct answer to that.",
keepAlive: true,
shouldThink: config('llm.models.chat.shouldThink')
);
$llmDescription = json_decode($llmDescription, true)['answer'] ?? null;
if (empty($llmDescription)) {
$llmDescription = null;
}
return $llmDescription;
}
/**
* Cut the video into screenshots.
* Uses ffmpeg to extract one frame per second into a temporary directory.
* @param string $filePath
* @return string[] Paths to the generated screenshots, in chronological order.
*/
private function cutVideoIntoScreenshots(string $filePath): array
{
// Use a unique directory per run so screenshots from previous videos are not picked up
$tempDir = sys_get_temp_dir() . '/video_screenshots_' . uniqid();
if (!is_dir($tempDir)) {
mkdir($tempDir, 0777, true);
}
Log::info("Cutting video into screenshots: $filePath");
$outputPattern = $tempDir . '/screenshot_%d.png';
$command = "ffmpeg -i " . escapeshellarg($filePath) . " -vf fps=1 " . escapeshellarg($outputPattern);
exec($command, $output, $exitCode);
if ($exitCode !== 0) {
throw new \Exception("ffmpeg failed with exit code $exitCode for {$filePath}.");
}
// Collect screenshots in chronological order (plain glob() sorts screenshot_10 before screenshot_2)
$screenshots = glob($tempDir . '/screenshot_*.png');
natsort($screenshots);
return array_values($screenshots);
}
}
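End to end, the pipeline is: ffmpeg screenshots, then OCR plus a vision model per frame, then one chat-model summary. A minimal usage sketch (the file path is illustrative):

// Hypothetical end-to-end call for a downloaded video.
$descriptor = new OCRLLMVideoDescriptor();
$description = $descriptor->getDescription('/tmp/reel.mp4');
if ($description === null) {
    Log::warning('The LLM did not return a usable description.');
}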

View File

@ -19,6 +19,7 @@
"laravel/telescope": "^5.5",
"laravel/tinker": "^2.9",
"norkunas/youtube-dl-php": "dev-master",
"thiagoalessio/tesseract_ocr": "^2.13",
"tightenco/ziggy": "^2.0"
},
"require-dev": {

composer.lock generated
View File

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "9a964008040d9ce219547515fe65dd86",
"content-hash": "20c0488746a861aecc1187374ca0aa7f",
"packages": [
{
"name": "brick/math",
@ -7038,6 +7038,55 @@
],
"time": "2025-01-17T11:39:41+00:00"
},
{
"name": "thiagoalessio/tesseract_ocr",
"version": "2.13.0",
"source": {
"type": "git",
"url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git",
"reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
"reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
"shasum": ""
},
"require": {
"php": "^5.3 || ^7.0 || ^8.0"
},
"require-dev": {
"phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
},
"type": "library",
"autoload": {
"psr-4": {
"thiagoalessio\\TesseractOCR\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "thiagoalessio",
"email": "thiagoalessio@me.com"
}
],
"description": "A wrapper to work with Tesseract OCR inside PHP.",
"keywords": [
"OCR",
"Tesseract",
"text recognition"
],
"support": {
"irc": "irc://irc.freenode.net/tesseract-ocr-for-php",
"issues": "https://github.com/thiagoalessio/tesseract-ocr-for-php/issues",
"source": "https://github.com/thiagoalessio/tesseract-ocr-for-php"
},
"time": "2023-10-05T21:14:48+00:00"
},
{
"name": "tightenco/ziggy",
"version": "v2.5.2",
@ -9726,7 +9775,7 @@
"prefer-stable": true,
"prefer-lowest": false,
"platform": {
"php": "^8.3"
"php": "8.3.*"
},
"platform-dev": [],
"plugin-api-version": "2.6.0"

config/llm.php Normal file
View File

@ -0,0 +1,32 @@
<?php
return [
/**
* Host for the LLM API.
* This should be the base URL of the API endpoint you are using.
*/
'host' => env('LLM_HOST_URL', null),
/**
* Models configuration.
*/
'models' => [
/**
* Great for chatting, can have reasoning capabilities.
* This model is typically used for conversational or thinking AI tasks.
*/
'chat' => [
'name' => env('LLM_CHAT_MODEL', null),
'shouldThink' => env('LLM_CHAT_MODEL_THINK', false),
],
/**
* Great for analyzing images, can have reasoning capabilities.
* This model is typically used for tasks that require understanding and interpreting images.
*/
'vision' => [
'name' => env('LLM_VISION_MODEL', null),
'shouldThink' => env('LLM_VISION_MODEL_THINK', false),
],
]
];
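For reference, these values are read through Laravel's config helper, as the video descriptor above does. A short sketch (the fallback model name is hypothetical):

// Hypothetical read of the chat model config with an explicit fallback.
$model = config('llm.models.chat.name') ?? 'gpt-4o';
$think = (bool) config('llm.models.chat.shouldThink', false);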