Added OCR and OpenAPI tools

2025-06-09 16:27:14 +02:00
parent 20fca31ced
commit 67197c5c48
12 changed files with 402 additions and 2 deletions
--- a/app/FileTools/VideoDescriptor/OCRLLMVideoDescriptor.php
+++ b/app/FileTools/VideoDescriptor/OCRLLMVideoDescriptor.php
@@ -0,0 +1,117 @@
+<?php
+
+namespace App\FileTools\VideoDescriptor;
+
+use App\AIPrompt\IAIPrompt;
+use App\AIPrompt\OpenAPIPrompt;
+use App\FileTools\OCR\IImageOCR;
+use App\FileTools\OCR\TesseractImageOCR;
+use Log;
+
+/**
+ * Builds a textual description of a video.
+ *
+ * Pipeline:
+ *  1. ffmpeg extracts one frame per second into a private temporary directory.
+ *  2. Every frame is run through OCR and through a vision LLM.
+ *  3. The per-frame results are concatenated into one prompt.
+ *  4. A chat LLM turns that prompt into the final description.
+ *
+ * The temporary frames are always removed again, whether or not the run succeeds.
+ */
+class OCRLLMVideoDescriptor implements IVideoDescriptor
+{
+    private IImageOCR $ocr;
+    private IAIPrompt $llm; // LLM that can look at images and generate descriptions
+
+    public const DESCRIPTION_PROMPT = "Describe the video based on the screenshots. Each screenshot has a timestamp of when in the video the screenshot was taken, an OCR result and a description of the screenshot by an LLM. Do not specify that it is a video, just describe the video. The description must have a maximum of 500 words.\n";
+
+    /** JSON schema both LLM calls are asked to answer with: an object holding a single string "answer". */
+    private const ANSWER_SCHEMA = '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}';
+
+    /** System message shared by both LLM calls. */
+    private const SYSTEM_MESSAGE = "The user will ask something. Give your direct answer to that.";
+
+    public function __construct()
+    {
+        $this->ocr = new TesseractImageOCR();
+        $this->llm = new OpenAPIPrompt();
+    }
+
+    /**
+     * Describe the video stored at $filePath.
+     *
+     * @param string $filePath Path of the video file handed to ffmpeg.
+     * @return string|null The generated description, or null when the LLM returned no usable answer.
+     * @throws \Exception When ffmpeg produced no screenshots or the temporary directory cannot be created.
+     */
+    public function getDescription(string $filePath): ?string
+    {
+        // A fresh directory per call keeps frames of earlier or concurrent runs out of the result.
+        $workDir = $this->createWorkDir();
+
+        try {
+            // Step 1: Cut video into screenshots
+            $screenshots = $this->cutVideoIntoScreenshots($filePath, $workDir);
+
+            if (empty($screenshots)) {
+                throw new \Exception("No screenshots were generated from the video {$filePath}.");
+            }
+
+            // Step 2 & 3: OCR text and LLM description for every screenshot
+            $descriptions = $this->describeScreenshots($screenshots);
+        } finally {
+            // The image files are only needed while the per-frame calls run.
+            $this->removeWorkDir($workDir);
+        }
+
+        // HERE COULD BE SOME INTERMEDIATE PROCESSING OF DESCRIPTIONS
+
+        // Step 4: Combine the descriptions of all screenshots into a single description
+        $combinedDescription = $this->combineDescriptions($descriptions);
+
+        // Step 5: Ask an LLM to describe the video based on the combined descriptions
+        $response = $this->llm->generate(
+            config('llm.models.chat.name'),
+            self::DESCRIPTION_PROMPT . $combinedDescription,
+            outputFormat: self::ANSWER_SCHEMA,
+            systemMessage: self::SYSTEM_MESSAGE,
+            keepAlive: true,
+            shouldThink: config('llm.models.chat.shouldThink')
+        );
+
+        $answer = $this->extractAnswer($response);
+        if ($answer === null || trim($answer) === '') {
+            return null;
+        }
+
+        return $answer;
+    }
+
+    /**
+     * Run OCR and the vision LLM on every screenshot, in the given order.
+     *
+     * @param array $screenshots List of image paths, ordered by frame number.
+     * @return array List of ['ocr' => string, 'text' => string] entries, one per screenshot.
+     */
+    private function describeScreenshots(array $screenshots): array
+    {
+        $lastIndex = array_key_last($screenshots);
+        $descriptions = [];
+
+        foreach ($screenshots as $index => $screenshot) {
+            // Whitespace-only OCR output (including form feeds) counts as "no text".
+            $ocrText = trim((string) $this->ocr->performOCR($screenshot), " \t\n\r\0\x0B\f");
+
+            $response = $this->llm->generate(
+                config('llm.models.vision.name'),
+                "Describe the content of this screenshot from a video. Do not specify that it is a screenshot, just describe the content.",
+                images: [$screenshot],
+                outputFormat: self::ANSWER_SCHEMA,
+                systemMessage: self::SYSTEM_MESSAGE,
+                keepAlive: $index !== $lastIndex, // Keep alive for all but the last screenshot
+                shouldThink: config('llm.models.vision.shouldThink')
+            );
+
+            $descriptions[] = [
+                'ocr' => $ocrText === '' ? 'No text found' : $ocrText,
+                'text' => $this->extractAnswer($response) ?? 'No description generated',
+            ];
+        }
+
+        return $descriptions;
+    }
+
+    /**
+     * Render the per-screenshot results as the text block appended to DESCRIPTION_PROMPT.
+     *
+     * @param array $descriptions Result of describeScreenshots().
+     * @return string One section per screenshot, separated by blank lines.
+     */
+    private function combineDescriptions(array $descriptions): string
+    {
+        $sections = [];
+        foreach ($descriptions as $index => $description) {
+            $number = $index + 1;
+            // TODO Cut the video in smaller parts when the video is short
+            // NOTE(review): frames are sampled at 1 fps, so the n-th frame is labelled "n s";
+            // confirm whether the first frame should rather be labelled 0s.
+            $sections[] = "Screenshot: {$number}\n"
+                . "Timestamp: {$number}s\n"
+                . "OCR: {$description['ocr']}\n"
+                . "LLM Description: {$description['text']}";
+        }
+
+        return trim(implode("\n\n", $sections));
+    }
+
+    /**
+     * Pull the "answer" string out of an LLM response formatted according to ANSWER_SCHEMA.
+     *
+     * @param mixed $response Raw value returned by the LLM client.
+     * @return string|null The answer, or null when the response is not a JSON object with a string "answer".
+     */
+    private function extractAnswer(mixed $response): ?string
+    {
+        if (!is_string($response)) {
+            return null;
+        }
+
+        $decoded = json_decode($response, true);
+        $answer = is_array($decoded) ? ($decoded['answer'] ?? null) : null;
+
+        return is_string($answer) ? $answer : null;
+    }
+
+    /**
+     * Create an empty, uniquely named directory below the system temp directory.
+     *
+     * @return string Path of the created directory.
+     * @throws \RuntimeException When the directory cannot be created.
+     */
+    private function createWorkDir(): string
+    {
+        $workDir = sys_get_temp_dir() . '/video_screenshots_' . bin2hex(random_bytes(8));
+        if (!mkdir($workDir, 0700, true) && !is_dir($workDir)) {
+            throw new \RuntimeException("Unable to create temporary directory {$workDir}.");
+        }
+
+        return $workDir;
+    }
+
+    /**
+     * Delete the files inside $workDir and the directory itself.
+     * Cleanup is best effort: a failure is logged and does not replace the result
+     * (or the exception) of the actual run.
+     *
+     * @param string $workDir Directory created by createWorkDir().
+     * @return void
+     */
+    private function removeWorkDir(string $workDir): void
+    {
+        try {
+            foreach (glob($workDir . '/*') ?: [] as $file) {
+                if (is_file($file)) {
+                    unlink($file);
+                }
+            }
+            if (is_dir($workDir)) {
+                rmdir($workDir);
+            }
+        } catch (\Throwable $e) {
+            Log::warning("Could not remove temporary directory {$workDir}: {$e->getMessage()}");
+        }
+    }
+
+    /**
+     * Cut the video into screenshots.
+     * Uses ffmpeg to write one PNG per second of video into $outputDir.
+     *
+     * @param string $filePath  Path of the video file.
+     * @param string $outputDir Existing directory that receives the screenshot_<n>.png files.
+     * @return array Paths of the generated screenshots in frame order; empty when none were written.
+     */
+    private function cutVideoIntoScreenshots(string $filePath, string $outputDir): array
+    {
+        Log::info("Cutting video into screenshots: $filePath");
+
+        $outputPattern = $outputDir . '/screenshot_%d.png';
+        // -nostdin / -y: never wait for interactive input; 2>&1: capture ffmpeg's diagnostics.
+        $command = "ffmpeg -nostdin -y -i " . escapeshellarg($filePath)
+            . " -vf fps=1 " . escapeshellarg($outputPattern) . " 2>&1";
+
+        $output = [];
+        $exitCode = 0;
+        exec($command, $output, $exitCode);
+
+        if ($exitCode !== 0) {
+            // Frames written before the failure are still used; the caller rejects an empty result.
+            Log::warning(
+                "ffmpeg exited with code {$exitCode} for {$filePath}: " . implode("\n", array_slice($output, -5))
+            );
+        }
+
+        // glob() orders names alphabetically (screenshot_10 before screenshot_2);
+        // a natural sort restores the frame order.
+        $screenshots = glob($outputDir . '/screenshot_*.png') ?: [];
+        sort($screenshots, SORT_NATURAL);
+
+        return $screenshots;
+    }
+}