DatBrowser/app/Services/FileTools/VideoDescriptor/OCRLLMVideoDescriptor.php

<?php

namespace App\Services\FileTools\VideoDescriptor;

use App\Services\AIPrompt\OpenAPIPrompt;
use App\Services\FileTools\OCR\IImageOCR;

/**
 * Builds a textual description of a video from its screenshots.
 *
 * Each screenshot is run through OCR and a vision LLM; the per-frame results are
 * then concatenated and handed to a chat LLM that synthesizes one description
 * for the whole video.
 */
class OCRLLMVideoDescriptor extends AbstractLLMVideoDescriptor implements IVideoDescriptor
{
    public const DESCRIPTION_PROMPT = "Analyze this Video sequence. You are given information for each individual screenshot/analysis from the video:";

    /**
     * @param IImageOCR     $ocr OCR engine applied to every screenshot.
     * @param OpenAPIPrompt $llm LLM client used for both the per-frame and the final prompt.
     */
    public function __construct(public IImageOCR $ocr, public OpenAPIPrompt $llm)
    {
    }

    /**
     * Generates a description of the video located at $filePath.
     *
     * Pipeline:
     *   1. Cut the video into screenshots.
     *   2. Extract text from each screenshot with OCR.
     *   3. Ask the vision LLM to describe each screenshot.
     *   4. Combine all per-frame results into a single text block.
     *   5. Ask the chat LLM to describe the video from that block.
     *
     * @param string $filePath Path of the video, forwarded to cutVideoIntoScreenshots()
     *                         (not defined in this class; presumably inherited from the parent).
     *
     * @return string|null The synthesized description, or null when the chat LLM response
     *                     does not contain a non-blank string `answer`.
     *
     * @throws \Exception When no screenshots could be produced from the video.
     */
    public function getDescription(string $filePath): ?string
    {
        // Step 1: Cut video into screenshots
        $screenshots = $this->cutVideoIntoScreenshots($filePath);

        if (empty($screenshots)) {
            throw new \Exception("No screenshots were generated from the video {$filePath}.");
        }

        // Both LLM calls are asked to answer with a JSON object holding a single string `answer`.
        $answerSchema = '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}';

        // Pulls the `answer` field out of a raw LLM response. Anything that is not valid JSON
        // with a non-blank string `answer` yields null, so callers can pick their own fallback.
        $extractAnswer = static function (mixed $rawResponse): ?string {
            if (!is_string($rawResponse) || $rawResponse === '') {
                return null;
            }

            $decoded = json_decode($rawResponse, true);
            $answer = is_array($decoded) ? ($decoded['answer'] ?? null) : null;

            return is_string($answer) && trim($answer) !== '' ? $answer : null;
        };

        // Step 2 & 3: Use OCR to extract text and LLM to get description from screenshots
        $lastIndex = array_key_last($screenshots);
        $frames = [];
        foreach ($screenshots as $index => $values) {
            $screenshot = $values['screenshot'];

            // Explicit blank check instead of empty(): a frame whose only text is "0" is still text.
            $ocrText = trim((string) $this->ocr->performOCR($screenshot));
            if ($ocrText === '') {
                $ocrText = 'No text found';
            }

            $rawFrameResponse = $this->llm->generate(
                config('llm.models.vision.name'),
                "Describe this image in detail, breaking it down into distinct parts as follows:

1.  **Scene Description:** Describe the overall setting and environment of the image (e.g., forest clearing, futuristic city street, medieval castle interior).
2.  **Main Subject/Character(s):** Detail what is happening with the primary character or subject present in the frame.
3.  **Text Description (if any):** If there are visible text elements (like words, letters, captions), describe them exactly as they appear and note their location relative to other elements. This includes any emojis used in captions, describing their visual appearance and likely meaning.
4.  **Summary:** Briefly summarize the key content of the image for clarity.
5.  **Joke:** If the image is part of a meme or humorous content, describe the joke or humorous element present in the image. Do not include this part if you are not sure to understand the joke/meme.

Format your response strictly using numbered lines corresponding to these five points (1., 2., 3., 4., 5.). Do not use markdown formatting or extra text outside these lines; simply list them sequentially as plain text output.",
                images: [$screenshot],
                outputFormat: $answerSchema,
                systemMessage: "You are an image understanding AI specialized in describing visual scenes accurately and concisely. Your task is solely to describe the content of the provided image based on what you can visually perceive.

Please analyze the image carefully and provide a description focusing purely on the visible information without generating any text about concepts, interpretations, or future actions beyond the immediate scene. Describe everything that is clearly depicted.",
                // Keep alive for all but the last screenshot. The loop key is compared
                // because each element of $screenshots is an array, not a path string.
                keepAlive: $index !== $lastIndex,
                shouldThink: config('llm.models.vision.shouldThink')
            );

            $frames[] = [
                'timestamp' => $values['timestamp'],
                'ocr' => $ocrText,
                'text' => $extractAnswer($rawFrameResponse) ?? 'No description generated',
            ];
        }

        // HERE COULD BE SOME INTERMEDIATE PROCESSING OF DESCRIPTIONS

        // Step 4: Combine the descriptions of all screenshots into a single description
        $combinedDescription = '';
        $screenshotCount = 0;
        foreach ($frames as $frame) {
            $screenshotCount++;

            $combinedDescription .= "Screenshot: {$screenshotCount}\n";
            $combinedDescription .= "Timestamp: {$frame['timestamp']}s\n"; // TODO Cut the video in smaller parts when the video is short
            $combinedDescription .= "OCR: {$frame['ocr']}\n";
            $combinedDescription .= "LLM Description: {$frame['text']}\n";
            $combinedDescription .= "\n";
        }
        $combinedDescription = trim($combinedDescription);

        // Step 5: Ask an LLM to describe the video based on the combined descriptions.
        // A blank line separates the introductory prompt from the first frame entry.
        $rawVideoResponse = $this->llm->generate(
            config('llm.models.chat.name'),
            static::DESCRIPTION_PROMPT . "\n\n" . $combinedDescription . "\n\nYou are analyzing an Instagram Reel (a short-form video). You have received multiple frames from this reel. For each frame:

1.  A **screenshot number** is given (e.g., `Screenshot : 3`).
2.  The approximate **timestamp in seconds** within the video where that frame occurs.
3.  An **OCR result** which contains text extracted directly from an image of this frame, potentially including OCR errors or unusual characters.
4.  A description provided by another LLM for that specific frame (the `LLM Description`).

Your task is to synthesize a single, coherent video description summarizing the entire reel (`the whole thing`). Use all the information (screenshot number, timestamp, OCR, and llm_description) but be aware that individual descriptions may be inaccurate due to poor image quality or interpretation errors. Look for consistency across multiple frames.

Analyze the sequence of events, character(s), setting, style (e.g., fast cuts, slow-motion), narrative structure (if any), humor, and joke elements throughout the video based on these frame-by-frame inputs. Pay special attention to identifying if there's an underlying joke or humorous concept running through the reel.

Based on your analysis, write a concise description (`the whole thing`) that captures the essence of this Instagram Reel. Format your output strictly as JSON with only the `answer` field containing this synthesized summary.",
            outputFormat: $answerSchema,
            systemMessage: "You are an AI assistant specialized in analyzing video content, particularly short-form videos like Instagram Reels. Your task is to synthesize a single description for the entire video based on sequential information provided from its screenshots and associated text data (OCR results).

Your response must strictly follow this JSON format:
{\"answer\": \"<your final synthesized video description here as a string>\"}

## Rules
1.  Analyze all provided inputs: screenshot number, timestamp, OCR result snippet, and LLM description for each frame.
2.  The core goal is to produce one concise, coherent, and engaging video description that captures the essence of the entire reel (\"the whole thing\").
3.  Individual frame descriptions can be inaccurate or contradictory (e.g., object changes drastically between frames). Prioritize consistency across multiple frames unless strongly contradicted by a clear majority.
4.  Do not generate separate JSON objects for each screenshot; only produce one final `answer` string summarizing the video as a whole at the end of your reasoning.
5.  Pay special attention to identifying any underlying joke, humor, or satirical element present in the reel based on the collective information.

## Output Constraints
-   Your response **MUST** be ONLY valid JSON conforming to the structure: {\"type\": \"object\", \"properties\": {\"answer\": {\"type\": \"string\"}}, \"required\": [\"answer\"]}.
-   Only fill the `answer` field. Do not include any other text or explanations outside this JSON structure.
-   The `answer` string should be a comprehensive description of the video, suitable for representing it to another user on a platform like Instagram/YouTube Shorts.",
            keepAlive: true,
            shouldThink: config('llm.models.chat.shouldThink')
        );

        return $extractAnswer($rawVideoResponse);
    }
}