LLM reel caption and video description + Refactor in services

2025-06-30 16:14:29 +02:00
parent 21abbcdff5
commit 228d67a48d
20 changed files with 575 additions and 151 deletions
--- a/app/Services/FileTools/VideoDescriptor/LLMFullVideoDescriptor.php
+++ b/app/Services/FileTools/VideoDescriptor/LLMFullVideoDescriptor.php
@@ -0,0 +1,64 @@
+<?php
+
+namespace App\Services\FileTools\VideoDescriptor;
+
+use App\Services\AIPrompt\OpenAPIPrompt;
+use App\Services\FileTools\OCR\IImageOCR;
+
+class LLMFullVideoDescriptor extends AbstractLLMVideoDescriptor implements IVideoDescriptor
+{
+    /** Instruction sent to the vision model; the per-screenshot timestamp/OCR listing is appended to it. */
+    public const DESCRIPTION_PROMPT = "Describe the video based on the screenshots. Each screenshot has a timestamp of when in the video the screenshot was taken. Do not specify that it is a video, just describe the video. Do not describe the screenshots one by one, try to make sense out of all the screenshots, what could be the video about ? What caption is attached to the video ? is it a meme ? If yes, what is the joke ? Be the most descriptive without exceeding 5000 words.\n";
+
+    /**
+     * @param IImageOCR     $ocr OCR engine run on every screenshot.
+     * @param OpenAPIPrompt $llm Client used to query the vision model.
+     */
+    public function __construct(public IImageOCR $ocr, public OpenAPIPrompt $llm) {
+    }
+
+    /**
+     * Describes a video by sending its screenshots, their timestamps and their OCR text to the vision model.
+     *
+     * @param string $filePath Path of the video to describe.
+     * @return string|null The model's "answer" field, or null when it is missing, not a string, or blank.
+     * @throws \RuntimeException When no screenshot could be generated from the video.
+     */
+    public function getDescription(string $filePath): ?string
+    {
+        // Step 1: cut the video into screenshots (helper is not defined in this class; presumably inherited).
+        $screenshots = $this->cutVideoIntoScreenshots($filePath);
+        if (empty($screenshots)) {
+            throw new \RuntimeException("No screenshots were generated from the video {$filePath}.");
+        }
+
+        // Step 2: build a textual listing (index, timestamp, OCR text) to accompany the images.
+        $images = [];
+        $sections = [];
+        foreach ($screenshots as $values) {
+            $images[] = $values['screenshot'];
+            $index = count($images);
+            // Cast + trim, then compare with '': empty() would wrongly report a frame showing "0" as textless.
+            $ocrText = trim((string) $this->ocr->performOCR($values['screenshot']));
+            $ocrText = $ocrText === '' ? 'No text found' : $ocrText;
+            // TODO Cut the video in smaller parts when the video is short
+            $sections[] = "Screenshot: {$index}\nTimestamp: {$values['timestamp']}s\nOCR: {$ocrText}";
+        }
+
+        // Step 3: ask the vision model for one description covering all screenshots.
+        $rawResponse = $this->llm->generate(
+            config('llm.models.vision.name'),
+            static::DESCRIPTION_PROMPT . implode("\n\n", $sections),
+            images: $images,
+            outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
+            systemMessage: "The user will ask something. Give your direct answer to that.",
+            keepAlive: true,
+            shouldThink: config('llm.models.vision.shouldThink'),
+        );
+
+        // Step 4: extract the answer; invalid JSON, a missing key or a non-string value yield null, not a TypeError.
+        $decoded = is_string($rawResponse) ? json_decode($rawResponse, true) : null;
+        $answer = is_array($decoded) ? ($decoded['answer'] ?? null) : null;
+        return is_string($answer) && trim($answer) !== '' ? $answer : null;
+    }
+}