LLM reel caption and video description + Refactor in services
This commit is contained in:
10
app/Services/AIPrompt/IAIPrompt.php
Normal file
10
app/Services/AIPrompt/IAIPrompt.php
Normal file
@ -0,0 +1,10 @@
|
||||
<?php

namespace App\Services\AIPrompt;

/**
 * Contract for services that send a prompt to an LLM and return its raw answer.
 */
interface IAIPrompt
{
    /**
     * Generate a completion for the given prompt.
     *
     * @param string      $model         Name of the model to query.
     * @param string      $prompt        The user prompt.
     * @param array       $images        Optional list of image file paths (for multimodal models).
     * @param string|null $outputFormat  "json" for JSON mode, a JSON-schema string for structured
     *                                   output, or null for free-form text.
     * @param string|null $systemMessage Optional system message overriding the model default.
     * @param bool        $keepAlive     Whether the model should stay loaded after the call.
     * @param bool        $shouldThink   Whether a "thinking" model should reason before answering.
     * @return string The raw model response.
     */
    public function generate(string $model, string $prompt, array $images = [], ?string $outputFormat = "json", ?string $systemMessage = null, bool $keepAlive = true, bool $shouldThink = false): string;
}
|
137
app/Services/AIPrompt/OpenAPIPrompt.php
Normal file
137
app/Services/AIPrompt/OpenAPIPrompt.php
Normal file
@ -0,0 +1,137 @@
|
||||
<?php

namespace App\Services\AIPrompt;

use Uri;

/**
 * Prompts an LLM over an Ollama-compatible HTTP API.
 *
 * NOTE(review): despite the class name, the endpoint and parameters used
 * ("/api/generate", "keep_alive", "think") belong to the Ollama API, not the
 * OpenAI API — confirm whether the class should be renamed.
 */
class OpenAPIPrompt implements IAIPrompt
{
    /** Base URL of the LLM API, without a trailing endpoint path. */
    private string $host;

    /** Optional bearer token; when null, no Authorization header is sent. */
    private ?string $token = null;

    public function __construct(?string $host = null)
    {
        $this->host = $host ?? config('llm.api.host');
        if (config('llm.api.token')) {
            $this->token = config('llm.api.token');
        }
    }

    /**
     * Build the HTTP headers for an API call.
     * The Authorization header is only included when a token is configured,
     * so an empty/malformed "Authorization: " header is never sent.
     */
    private function getHeaders(): array
    {
        $headers = ['Content-Type: application/json'];
        if ($this->token !== null) {
            $headers[] = 'Authorization: Bearer ' . $this->token;
        }
        return $headers;
    }

    /**
     * POST a JSON body to the given endpoint and return the raw response body.
     *
     * @param string $endpoint Endpoint path, e.g. "/api/generate".
     * @param string $body     JSON-encoded request body.
     * @throws \Exception When the transport fails or the API answers non-200.
     * @return string Raw response body.
     */
    private function callAPI(string $endpoint, string $body): string
    {
        $url = $this->host . $endpoint;

        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $this->getHeaders());
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $body);
        $response = curl_exec($ch);
        $curlError = curl_error($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        // curl_exec() returns false on transport errors (DNS, timeout, refused
        // connection, ...) — the HTTP-code check alone would miss those.
        if ($response === false) {
            throw new \Exception("Error calling OpenAI API: {$curlError}");
        }
        if ($httpCode !== 200) {
            throw new \Exception("Error calling OpenAI API: HTTP $httpCode - $response");
        }
        return $response;
    }

    /**
     * Call the /api/generate endpoint to produce a completion for a prompt.
     *
     * Streaming is disabled so a single response object comes back. Structured
     * output is requested through the "format" field: the literal string "json"
     * enables JSON mode, any other value is treated as a JSON schema. When
     * using JSON output, the prompt itself should also instruct the model to
     * answer in JSON, otherwise it may emit large amounts of whitespace.
     *
     * @param string      $model         (required) Model name.
     * @param string      $prompt        The prompt to generate a response for.
     * @param array       $images        Image file paths (converted to base64 in place)
     *                                   or already base64-encoded images.
     * @param string|null $outputFormat  "json", a JSON-schema string, or null for free text.
     * @param string|null $systemMessage System message overriding the Modelfile default.
     * @param bool        $keepAlive     When false, the model is unloaded right after the
     *                                   call (keep_alive "0m" instead of the default 5m).
     * @param bool        $shouldThink   Whether a thinking model should reason first.
     * @throws \Exception On transport/HTTP failure or an undecodable response.
     * @return string The "response" field of the API answer, or '' when absent.
     */
    public function generate(string $model, string $prompt, array $images = [], ?string $outputFormat = null, ?string $systemMessage = null, bool $keepAlive = true, bool $shouldThink = false): string
    {
        // The API expects images as base64 strings; paths are converted in
        // place, anything that is not an existing file is assumed to be
        // encoded already.
        foreach ($images as &$image) {
            if (file_exists($image)) {
                $image = base64_encode(file_get_contents($image));
            }
        }
        unset($image); // break the reference left dangling by the foreach

        $body = [
            'model' => $model,
            'prompt' => $prompt,
            'images' => $images,
            'think' => $shouldThink,
            'stream' => false,
        ];

        if ($systemMessage !== null) {
            $body['system'] = $systemMessage;
        }
        if ($outputFormat !== null) {
            // "json" must be sent verbatim (json_decode("json") would yield
            // null and silently drop the format); anything else is expected to
            // be a JSON schema and is decoded so it is embedded as an object,
            // not as an escaped string.
            $body['format'] = $outputFormat === 'json' ? 'json' : json_decode($outputFormat);
            if ($body['format'] === null) {
                throw new \Exception("Invalid JSON schema given as output format: {$outputFormat}");
            }
        }
        if (!$keepAlive) {
            $body['keep_alive'] = "0m";
        }

        $body = json_encode($body);

        $response = $this->callAPI('/api/generate', $body);
        $decodedResponse = json_decode($response, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            throw new \Exception("Error decoding JSON response: " . json_last_error_msg());
        }
        return $decodedResponse['response'] ?? '';
    }
}
|
14
app/Services/FileTools/OCR/IImageOCR.php
Normal file
14
app/Services/FileTools/OCR/IImageOCR.php
Normal file
@ -0,0 +1,14 @@
|
||||
<?php

namespace App\Services\FileTools\OCR;

/**
 * Contract for services able to extract text from an image file.
 */
interface IImageOCR
{
    /**
     * Run optical character recognition on an image.
     *
     * @param string $filePath Path of the image file to analyse.
     * @return string The text recognised in the image.
     */
    public function performOCR(string $filePath): string;
}
|
15
app/Services/FileTools/OCR/TesseractImageOCR.php
Normal file
15
app/Services/FileTools/OCR/TesseractImageOCR.php
Normal file
@ -0,0 +1,15 @@
|
||||
<?php

namespace App\Services\FileTools\OCR;

use thiagoalessio\TesseractOCR\TesseractOCR;

/**
 * OCR implementation backed by the Tesseract engine.
 */
class TesseractImageOCR implements IImageOCR
{
    /**
     * @inheritDoc
     */
    public function performOCR(string $filePath): string
    {
        return (new TesseractOCR($filePath))->run();
    }
}
|
@ -0,0 +1,59 @@
|
||||
<?php

namespace App\Services\FileTools\VideoDescriptor;

use App\Services\FileTools\VideoDescriptor\IVideoDescriptor;
use Log;

/**
 * Base class for video descriptors that feed video screenshots to an LLM.
 * Provides the shared ffmpeg-based frame extraction step.
 */
abstract class AbstractLLMVideoDescriptor implements IVideoDescriptor
{
    /** Maximum number of screenshots extracted from a video. */
    public const MAX_FRAMES = 5;

    abstract public function getDescription(string $filePath): ?string;

    /**
     * Cut the video into screenshots.
     *
     * Uses ffprobe to measure the duration, then ffmpeg to grab frames at
     * regular intervals into a shared temporary directory (cleared on every
     * call, so concurrent use is not safe — TODO confirm this is single-use).
     *
     * @param string $filePath Path of the video file.
     * @return array List of ["screenshot" => path, "timestamp" => seconds]
     *               entries in chronological order; empty when the video
     *               duration could not be determined.
     */
    protected function cutVideoIntoScreenshots(string $filePath): array
    {
        $tempDir = sys_get_temp_dir() . '/video_screenshots';
        if (!is_dir($tempDir)) {
            mkdir($tempDir, 0777, true);
        } else {
            // Clear leftovers from a previous run so stale frames are never
            // attributed to this video.
            array_map('unlink', glob($tempDir . '/*'));
        }

        Log::info("Cutting video into screenshots: $filePath");

        $videoDuration = shell_exec("ffprobe -v error -show_entries format=duration -of csv=p=0 " . escapeshellarg($filePath));
        // shell_exec() returns null when the command produced no output and
        // false when the pipe could not be established — check both, not
        // only null.
        if (!is_string($videoDuration) || trim($videoDuration) === '') {
            Log::error("Failed to get video duration for file: $filePath");
            return [];
        }
        $videoDuration = floatval($videoDuration);

        // max(1, ...) guards against a zero interval — and the resulting
        // division by zero below — for zero/near-zero-duration videos.
        $framesInterval = max(1, ceil($videoDuration / self::MAX_FRAMES));
        $fps = 1 / $framesInterval; // frame-grab rate for the screenshots

        // Zero-padded index so that glob()'s alphabetical ordering matches
        // the chronological order of the frames (screenshot_10 would
        // otherwise sort before screenshot_2).
        $outputPattern = $tempDir . '/screenshot_%04d.png';
        $command = "ffmpeg -i " . escapeshellarg($filePath) . " -vf fps={$fps} " . escapeshellarg($outputPattern);
        exec($command);

        // Collect the screenshots, pairing each with its timestamp in the video.
        $screenshots = glob($tempDir . '/screenshot_*.png');
        $array = [];
        foreach ($screenshots as $screenshot) {
            $array[] = [
                "screenshot" => $screenshot,
                "timestamp" => floor(sizeof($array) * $framesInterval),
            ];
        }
        return $array;
    }
}
|
14
app/Services/FileTools/VideoDescriptor/IVideoDescriptor.php
Normal file
14
app/Services/FileTools/VideoDescriptor/IVideoDescriptor.php
Normal file
@ -0,0 +1,14 @@
|
||||
<?php

namespace App\Services\FileTools\VideoDescriptor;

/**
 * Contract for services that produce a textual description of a video file.
 */
interface IVideoDescriptor
{
    /**
     * Describe the content of a video.
     *
     * @param string $filePath Path of the video file to describe.
     * @return string|null The description, or null when none could be produced.
     */
    public function getDescription(string $filePath): ?string;
}
|
@ -0,0 +1,64 @@
|
||||
<?php

namespace App\Services\FileTools\VideoDescriptor;

use App\Services\AIPrompt\OpenAPIPrompt;
use App\Services\FileTools\OCR\IImageOCR;

/**
 * Describes a video by sending ALL of its screenshots to a multimodal LLM in a
 * single request (one-shot), as opposed to the per-frame pipeline of
 * OCRLLMVideoDescriptor.
 */
class LLMFullVideoDescriptor extends AbstractLLMVideoDescriptor implements IVideoDescriptor
{
    // Typo fixed: "capion" -> "caption", so the model is not confused.
    public const DESCRIPTION_PROMPT = "Describe the video based on the screenshots. Each screenshot has a timestamp of when in the video the screenshot was taken. Do not specify that it is a video, just describe the video. Do not describe the screenshots one by one, try to make sense out of all the screenshots, what could be the video about ? What caption is attached to the video ? is it a meme ? If yes, what is the joke ? Be the most descriptive without exceeding 5000 words.\n";

    public function __construct(public IImageOCR $ocr, public OpenAPIPrompt $llm) {
    }

    /**
     * @inheritDoc
     *
     * Pipeline:
     *  1. Cut the video into screenshots.
     *  2. Build a textual context (frame number, timestamp, OCR text) per frame.
     *  3. Ask a vision LLM for a single description of the whole video.
     *
     * @throws \Exception When no screenshot could be extracted from the video.
     */
    public function getDescription(string $filePath): ?string
    {
        // Step 1: cut video into screenshots
        $screenshots = $this->cutVideoIntoScreenshots($filePath);

        if (empty($screenshots)) {
            throw new \Exception("No screenshots were generated from the video {$filePath}.");
        }

        // Step 2: combine per-frame metadata (index, timestamp, OCR text)
        // into a single text block for the prompt
        $combinedDescription = '';
        $screenshotCount = 0;
        foreach ($screenshots as $values) {
            $screenshot = $values['screenshot'];
            $timestamp = $values['timestamp'];

            $screenshotCount++;
            $combinedDescription .= "Screenshot: {$screenshotCount}\n";
            $combinedDescription .= "Timestamp: {$timestamp}s\n"; // TODO Cut the video in smaller parts when the video is short
            $ocrDescription = $this->ocr->performOCR($screenshot);
            $ocrDescription = empty($ocrDescription) ? 'No text found' : $ocrDescription;
            $combinedDescription .= "OCR: {$ocrDescription}\n";
            $combinedDescription .= "\n";
        }
        $combinedDescription = trim($combinedDescription);

        // Step 3: ask the vision LLM to describe the video from all the
        // screenshots at once, constrained to a {"answer": ...} JSON object
        $llmDescription = $this->llm->generate(
            config('llm.models.vision.name'),
            static::DESCRIPTION_PROMPT . $combinedDescription,
            images: array_map(static fn ($screenshot) => $screenshot["screenshot"], $screenshots),
            outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
            systemMessage: "The user will ask something. Give your direct answer to that.",
            keepAlive: true,
            shouldThink: config('llm.models.vision.shouldThink')
        );

        // The structured output wraps the description in an "answer" field;
        // normalise a missing or empty answer to null.
        $llmDescription = json_decode($llmDescription, true)['answer'] ?? null;
        if (empty($llmDescription)) {
            $llmDescription = null;
        }

        return $llmDescription;
    }
}
|
144
app/Services/FileTools/VideoDescriptor/OCRLLMVideoDescriptor.php
Normal file
144
app/Services/FileTools/VideoDescriptor/OCRLLMVideoDescriptor.php
Normal file
@ -0,0 +1,144 @@
|
||||
<?php

namespace App\Services\FileTools\VideoDescriptor;

use App\Services\AIPrompt\OpenAPIPrompt;
use App\Services\FileTools\OCR\IImageOCR;

/**
 * Describes a video frame by frame: each screenshot is OCR'ed and described
 * individually by a vision LLM, then a chat LLM synthesises a single
 * description of the whole video from the per-frame analyses.
 */
class OCRLLMVideoDescriptor extends AbstractLLMVideoDescriptor implements IVideoDescriptor
{
    public const DESCRIPTION_PROMPT = "Analyze this Video sequence. You are given information for each individual screenshot/analysis from the video:";

    public function __construct(public IImageOCR $ocr, public OpenAPIPrompt $llm) {
    }

    /**
     * @inheritDoc
     *
     * Pipeline:
     *  1. Cut the video into screenshots.
     *  2. Use OCR to extract text from each screenshot.
     *  3. Use a vision LLM to describe each screenshot.
     *  4. Combine all per-frame analyses into a single text block.
     *  5. Ask a chat LLM to synthesise the final video description.
     *
     * @throws \Exception When no screenshot could be extracted from the video.
     */
    public function getDescription(string $filePath): ?string
    {
        // Step 1: cut video into screenshots
        $screenshots = $this->cutVideoIntoScreenshots($filePath);

        if (empty($screenshots)) {
            throw new \Exception("No screenshots were generated from the video {$filePath}.");
        }

        // Steps 2 & 3: OCR text + vision-LLM description for every screenshot,
        // keyed by screenshot path
        $descriptions = [];
        foreach ($screenshots as $values) {
            $screenshot = $values['screenshot'];

            $descriptions[$screenshot] = [];

            $ocrDescription = $this->ocr->performOCR($screenshot);
            $ocrDescription = empty($ocrDescription) ? 'No text found' : $ocrDescription;
            $descriptions[$screenshot]['ocr'] = $ocrDescription;

            $llmDescription = $this->llm->generate(
                config('llm.models.vision.name'),
                "Describe this image in detail, breaking it down into distinct parts as follows:

1. **Scene Description:** Describe the overall setting and environment of the image (e.g., forest clearing, futuristic city street, medieval castle interior).
2. **Main Subject/Character(s):** Detail what is happening with the primary character or subject present in the frame.
3. **Text Description (if any):** If there are visible text elements (like words, letters, captions), describe them exactly as they appear and note their location relative to other elements. This includes any emojis used in captions, describing their visual appearance and likely meaning.
4. **Summary:** Briefly summarize the key content of the image for clarity.
5. **Joke:** If the image is part of a meme or humorous content, describe the joke or humorous element present in the image. Do not include this part if you are not sure to understand the joke/meme.

Format your response strictly using numbered lines corresponding to these five points (1., 2., 3., 4., 5.). Do not use markdown formatting or extra text outside these lines; simply list them sequentially as plain text output.",
                images: [$screenshot],
                outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
                systemMessage: "You are an image understanding AI specialized in describing visual scenes accurately and concisely. Your task is solely to describe the content of the provided image based on what you can visually perceive.

Please analyze the image carefully and provide a description focusing purely on the visible information without generating any text about concepts, interpretations, or future actions beyond the immediate scene. Describe everything that is clearly depicted.",
                // Keep the model loaded for all but the last screenshot.
                // Compare the whole entry: the previous comparison of the path
                // string against end()'s array entry was always true, so the
                // model was never unloaded.
                keepAlive: $values !== end($screenshots),
                shouldThink: config('llm.models.vision.shouldThink')
            );
            $descriptions[$screenshot]['text'] = json_decode($llmDescription, true)['answer'] ?? 'No description generated';
        }

        // HERE COULD BE SOME INTERMEDIATE PROCESSING OF DESCRIPTIONS

        // Step 4: combine the per-frame analyses into a single description
        $combinedDescription = '';
        $screenshotCount = 0;
        foreach ($screenshots as $values) {
            $screenshot = $values['screenshot'];
            $timestamp = $values['timestamp'];

            $screenshotCount++;
            $description = $descriptions[$screenshot] ?? [];

            $combinedDescription .= "Screenshot: {$screenshotCount}\n";
            $combinedDescription .= "Timestamp: {$timestamp}s\n"; // TODO Cut the video in smaller parts when the video is short
            $combinedDescription .= "OCR: {$description['ocr']}\n";
            $combinedDescription .= "LLM Description: {$description['text']}\n";
            $combinedDescription .= "\n";
        }
        $combinedDescription = trim($combinedDescription);

        // Step 5: ask the chat LLM to synthesise the video description from
        // the combined per-frame analyses
        $llmDescription = $this->llm->generate(
            config('llm.models.chat.name'),
            static::DESCRIPTION_PROMPT . $combinedDescription . "\n\nBased only on these frame analyses, please provide:

A single, concise description that captures the main action or theme occurring in the reel across all frames.
Identify and describe any joke or humorous element present in the video if you can discern one.


Important Considerations

Remember that most videos are of poor quality; frame descriptions might be inaccurate, vague, or contradictory due to blurriness or fast cuts.
Your task is synthesis: focus on the overall impression and sequence, not perfecting each individual piece of information. Some details mentioned in one analysis may simply be incorrect or misidentified from another perspective.


Analyze all provided frames (separated by --- for clarity) to understand what's happening. Then, synthesize this understanding into point 1 above and identify the joke if present as per point 2.",
            outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
            systemMessage: "You are an expert social media content analyst specializing in interpreting Instagram Reels. Your primary function is to generate a comprehensive description and identify any underlying humor or joke in a given video sequence. You will be provided with individual frame analyses, each containing:

Screenshot Number: The sequential number of the frame.
Timestamp: When that specific frame occurs within the reel.
OCR Text Result: Raw text extracted from the image content using OCR (Optical Character Recognition), which may contain errors or misinterpretations (\"may appear\" descriptions).
LLM Description of Screenshot: A textual interpretation of what's visible in the frame, based on previous LLM processing.


Please note:

The individual frame analyses can be inconsistent due to low video quality (e.g., blurriness) or rapid scene changes where details are hard to distinguish.
Your task is not to perfect each frame description but to understand the overall sequence and likely narrative, focusing on identifying any joke, irony, absurdity, or humorous transformation occurring across these frames.


Your response should be structured as follows:

Overall Video Description: Provide a concise summary of what happens in the reel based on the combined information from all the provided screenshots.
Humor/Joke Identification (If Applicable): If you can discern any joke or humorous element, explicitly state it and explain how the sequence of frames contributes to this.


Instructions for Synthesis:

Focus on identifying recurring elements, main subject(s), consistent actions/actions that seem unlikely (potential contradiction).
Look for patterns where details change rapidly or absurdly.
Prioritize information from descriptions over relying solely on OCR text if the description seems more plausible. Ignore minor inconsistencies between frames unless they clearly contradict a central theme or joke premise.
Be ready to point out where the humor lies, which might involve unexpected changes, wordplay captured by OCR errors in the context of the visual action described, absurdity, or irony.",
            keepAlive: true,
            shouldThink: config('llm.models.chat.shouldThink')
        );

        // Normalise a missing or empty "answer" field to null.
        $llmDescription = json_decode($llmDescription, true)['answer'] ?? null;
        if (empty($llmDescription)) {
            $llmDescription = null;
        }

        return $llmDescription;
    }
}
|
Reference in New Issue
Block a user