DatBrowser/app/Services/FileTools/VideoDescriptor/OCRLLMVideoDescriptor.php

<?php

namespace App\Services\FileTools\VideoDescriptor;

use App\Services\AIPrompt\OpenAPIPrompt;
use App\Services\FileTools\OCR\IImageOCR;
use App\Services\FileTools\Transcription\IAudioTranscriptor;

/**
 * Describes a video by combining OCR, a vision LLM and an audio transcription.
 *
 * The video is cut into screenshots; each screenshot is run through OCR and a
 * vision model, the audio track (when one is extracted) is transcribed, and a
 * chat model is finally asked to synthesize all of it into one description.
 */
class OCRLLMVideoDescriptor extends AbstractLLMVideoDescriptor implements IVideoDescriptor
{
    /** Lead-in placed in front of the combined per-screenshot data in the final chat prompt. */
    public const DESCRIPTION_PROMPT = "Analyze this Video sequence. You are given information for each individual screenshot/analysis from the video:";

    /**
     * @param IImageOCR          $ocr               Used to extract text from each screenshot.
     * @param OpenAPIPrompt      $llm               Used for both the per-screenshot vision call and the final chat call.
     * @param IAudioTranscriptor $audioTranscriptor Used to transcribe the extracted audio track.
     */
    public function __construct(
        public IImageOCR $ocr,
        public OpenAPIPrompt $llm,
        public IAudioTranscriptor $audioTranscriptor,
    ) {
    }

    /**
     * Build a textual description of the video located at $filePath.
     *
     * Steps:
     *   1. Cut the video into screenshots (fails fast when none are produced).
     *   2. Extract and transcribe the audio track, when there is one.
     *   3. Run OCR and the vision LLM on every screenshot.
     *   4. Combine the transcription and all per-screenshot results into one text.
     *   5. Ask the chat LLM to describe the video from that combined text.
     *
     * @param string $filePath Path of the video to describe.
     *
     * @return string|null The synthesized description, or null when the chat LLM
     *                     did not return a usable "answer".
     *
     * @throws \Exception When no screenshots could be generated from the video.
     */
    public function getDescription(string $filePath): ?string
    {
        // JSON schema handed to both LLM calls: an object with a single string "answer".
        $answerSchema = '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}';

        // Pulls the "answer" string out of a raw LLM response. Returns null when the
        // response is not a string, is not valid JSON, or has no non-blank string answer.
        $extractAnswer = static function (mixed $raw): ?string {
            if (!is_string($raw) || $raw === '') {
                return null;
            }

            $decoded = json_decode($raw, true);
            $answer = is_array($decoded) ? ($decoded['answer'] ?? null) : null;

            return is_string($answer) && trim($answer) !== '' ? $answer : null;
        };

        // Step 1: Cut video into screenshots. Checked immediately so that no audio
        // extraction/transcription work is done for a video we cannot describe anyway.
        $screenshots = $this->cutVideoIntoScreenshots($filePath);
        if (empty($screenshots)) {
            throw new \Exception("No screenshots were generated from the video {$filePath}.");
        }

        // Step 2: Audio transcription (only when an audio track was extracted)
        $audio = $this->extractAudioFromVideo($filePath);
        $audioTranscription = null;
        if (isset($audio)) {
            $audioTranscription = $this->audioTranscriptor->transcribe($audio);
            dump($audioTranscription); // DEBUG
        }

        // Step 3: Use OCR to extract text and LLM to get description from screenshots
        $lastKey = array_key_last($screenshots);
        $descriptions = [];
        foreach ($screenshots as $key => $values) {
            $screenshot = $values['screenshot'];

            $ocrDescription = $this->ocr->performOCR($screenshot);
            $ocrDescription = empty($ocrDescription) ? 'No text found' : $ocrDescription;
            dump($ocrDescription); // DEBUG

            $llmDescription = $this->llm->generate(
                config('llm.models.vision.name'),
                "Describe this image in detail, breaking it down into distinct parts as follows:

1.  **Scene Description:** Describe the overall setting and environment of the image (e.g., forest clearing, futuristic city street, medieval castle interior).
2.  **Main Subject/Character(s):** Detail what is happening with the primary character or subject present in the frame.
3.  **Text Description (if any):** If there are visible text elements (like words, letters, captions), describe them exactly as they appear and note their location relative to other elements. This includes any emojis used in captions, describing their visual appearance and likely meaning.
4.  **Summary:** Briefly summarize the key content of the image for clarity.
5.  **Joke:** If the image is part of a meme or humorous content, describe the joke or humorous element present in the image. Do not include this part if you are not sure to understand the joke/meme.

Format your response strictly using numbered lines corresponding to these five points (1., 2., 3., 4., 5.). Do not use markdown formatting or extra text outside these lines; simply list them sequentially as plain text output.",
                images: [$screenshot],
                outputFormat: $answerSchema,
                systemMessage: "You are an image understanding AI specialized in describing visual scenes accurately and concisely. Your task is solely to describe the content of the provided image based on what you can visually perceive.

Please analyze the image carefully and provide a description focusing purely on the visible information without generating any text about concepts, interpretations, or future actions beyond the immediate scene. Describe everything that is clearly depicted.",
                // Keep alive for all but the last screenshot. Keys are compared because
                // end($screenshots) yields the whole entry array, which never equals the
                // screenshot path and would therefore leave keepAlive permanently true.
                keepAlive: $key !== $lastKey,
                shouldThink: config('llm.models.vision.shouldThink')
            );
            dump($llmDescription); // DEBUG

            // Stored under the entry's own key so two entries can never overwrite each other.
            $descriptions[$key] = [
                'ocr' => $ocrDescription,
                'text' => $extractAnswer($llmDescription) ?? 'No description generated',
            ];
        }

        // HERE COULD BE SOME INTERMEDIATE PROCESSING OF DESCRIPTIONS

        // Step 4: Combine the descriptions of all screenshots into a single description
        $combinedDescription = '';

        // Full-video information: only announce a transcription when there is one,
        // so the chat model is not shown an empty "Audio Transcription:" header.
        if (isset($audioTranscription) && $audioTranscription !== '') {
            $combinedDescription .= "Audio Transcription: {$audioTranscription}\n";
            $combinedDescription .= "\n";
        }

        // Per-screenshot information, numbered from 1 in iteration order
        $screenshotCount = 0;
        foreach ($screenshots as $key => $values) {
            $timestamp = $values['timestamp'];
            $description = $descriptions[$key];

            $screenshotCount++;

            $combinedDescription .= "Screenshot: {$screenshotCount}\n";
            $combinedDescription .= "Timestamp: {$timestamp}s\n"; // TODO Cut the video in smaller parts when the video is short
            $combinedDescription .= "OCR: {$description['ocr']}\n";
            $combinedDescription .= "LLM Description: {$description['text']}\n";
            $combinedDescription .= "\n";
        }
        $combinedDescription = trim($combinedDescription);

        dump($combinedDescription); // DEBUG

        // Step 5: Ask an LLM to describe the video based on the combined descriptions
        $llmDescription = $this->llm->generate(
            config('llm.models.chat.name'),
            // Blank line so the data does not run straight on from the prompt's trailing colon.
            static::DESCRIPTION_PROMPT . "\n\n" . $combinedDescription,
            outputFormat: $answerSchema,
            systemMessage: "You are an expert social media content analyst specializing in Instagram Reels. Your task is to synthesize descriptions and OCR findings from multiple screenshots of a single video reel into a single, concise, and accurate overall description of the video's content, style, and potential humor.

Your input will consist of:
1.  An audio transcription of the entire video.
2.  Multiple entries containing:
    -   Screenshot number (e.g., \"Screenshot: 1\")
    -   Timestamp (in seconds) indicating its position in the reel
    -   Raw OCR text from that specific screenshot, which may contain errors or unusual characters but should be interpreted for content relevance.
    -   A description of the image content generated by an LLM for that screenshot.

The descriptions provided by the LLM for individual screenshots are often inconsistent with adjacent frames and might not capture subtle humor accurately. The raw OCR text can sometimes provide direct quotes relevant to the context, even if misspelled or partially recognized.

Your response must be in **exactly** the following JSON format:
```json
{
  \"answer\": \"{your synthesized description here}\"
}
```
Please follow these instructions carefully:

     Analyze All Data: Consider both the audio transcription and all the screenshot data (OCR text and descriptions) together.
     Synthesize Coherently: Create a single, flowing narrative that describes the main subject(s), actions, setting, transitions, sound/music, and overall style of the video reel based on the most consistent or contextually supported information across its frames.
     Handle Inconsistencies: Assume that individual screenshot analyses might contain errors (especially with OCR) or be limited in scope. Do not rely solely on one frame's description contradicting another unless strongly supported by context and multiple data points converge to a different understanding or the inconsistency is clearly part of a joke requiring literal interpretation.
     Focus on Repeated Elements: Pay close attention to subjects, actions, objects, text content (especially from OCR), sounds/words mentioned in the transcription, and visual styles that repeat across multiple frames, as this indicates continuity or recurring themes/humor.
     Identify Joke/Humor: Actively look for elements within the combined data that suggest a joke, satire, absurdity, irony, sarcasm, clever wordplay (from OCR/transcription), or unexpected humor. This includes inconsistent descriptions if they are clearly intended as part of a gag, visual puns, audio-visual mismatches mentioned in the transcription, or any content designed for comedic effect.
     Prioritize Core Content: Base your description primarily on the core subject and action within the reel (as identified repeatedly across frames). Use details from individual screenshots to flesh out specific moments only if they fit this narrative context.
     Filter Minor Details: Ignore highly variable or insignificant details that appear inconsistent unless they are clearly integral to the joke or overall theme (e.g., slight variations in background color might be acceptable, but a consistent change is important).
     Output Requirement: Your response must contain only valid JSON with an object having exactly one property answer of type string. Do not output any other text, explanations, lists, or code outside this JSON structure.",
            keepAlive: true,
            shouldThink: config('llm.models.chat.shouldThink')
        );

        // Null when the chat model returned nothing usable.
        $llmDescription = $extractAnswer($llmDescription);

        dump($llmDescription); // DEBUG

        return $llmDescription;
    }
}