DatBrowser/app/Services/FileTools/VideoDescriptor/LLMFullVideoDescriptor.php

<?php

namespace App\Services\FileTools\VideoDescriptor;

use App\Services\AIPrompt\OpenAPIPrompt;
use App\Services\FileTools\OCR\IImageOCR;

/**
 * Describes a whole video in a single LLM call.
 *
 * The video is cut into timestamped screenshots, OCR is run on each screenshot,
 * and the screenshots plus the OCR transcript are sent together to the vision
 * model configured under `llm.models.vision`.
 */
class LLMFullVideoDescriptor extends AbstractLLMVideoDescriptor implements IVideoDescriptor
{
    /** Instruction sent to the model; the per-screenshot transcript is appended to it. */
    public const DESCRIPTION_PROMPT = "Describe the video based on the screenshots. Each screenshot has a timestamp of when in the video the screenshot was taken. Do not specify that it is a video, just describe the video. Do not describe the screenshots one by one, try to make sense out of all the screenshots, what could be the video about ? What caption is attached to the video ? is it a meme ? If yes, what is the joke ? Be the most descriptive without exceeding 5000 words.\n";

    /**
     * @param IImageOCR     $ocr OCR engine used to extract the text visible on each screenshot.
     * @param OpenAPIPrompt $llm Client used to query the vision model.
     */
    public function __construct(public IImageOCR $ocr, public OpenAPIPrompt $llm)
    {
    }

    /**
     * Builds a textual description of the video located at $filePath.
     *
     * @param string $filePath Path of the video to describe.
     *
     * @return string|null The model's answer, or null when the response is missing,
     *                     is not valid JSON, or carries no non-empty string "answer".
     *
     * @throws \Exception When no screenshot could be produced from the video.
     */
    public function getDescription(string $filePath): ?string
    {
        // Step 1: cut the video into screenshots. The helper is not defined in this
        // class (presumably inherited from AbstractLLMVideoDescriptor); each entry
        // exposes a 'screenshot' and a 'timestamp' key.
        $screenshots = $this->cutVideoIntoScreenshots($filePath);

        if (empty($screenshots)) {
            throw new \Exception("No screenshots were generated from the video {$filePath}.");
        }

        // Step 2: build one transcript entry per screenshot (index, timestamp, OCR text).
        $combinedDescription = '';
        $screenshotCount = 0;
        foreach ($screenshots as $values) {
            $screenshot = $values['screenshot'];
            $timestamp = $values['timestamp'];

            $screenshotCount++;
            $combinedDescription .= "Screenshot: {$screenshotCount}\n";
            $combinedDescription .= "Timestamp: {$timestamp}s\n"; // TODO Cut the video in smaller parts when the video is short

            // Perform OCR on the screenshot. The emptiness check is explicit because
            // empty() would also throw away a legitimate "0" read from the image.
            $ocrDescription = $this->ocr->performOCR($screenshot);
            if ($ocrDescription === null || $ocrDescription === false || $ocrDescription === '') {
                $ocrDescription = 'No text found';
            }
            $combinedDescription .= "OCR: {$ocrDescription}\n";
            $combinedDescription .= "\n";
        }
        $combinedDescription = trim($combinedDescription);

        // Step 3: ask the vision model to describe the video from the screenshots
        // and the transcript, constraining the output to a JSON object {"answer": string}.
        $rawResponse = $this->llm->generate(
            config('llm.models.vision.name'),
            static::DESCRIPTION_PROMPT . $combinedDescription,
            // array_map (rather than array_column) keeps the original array keys.
            images: array_map(static fn ($entry) => $entry['screenshot'], $screenshots),
            outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
            systemMessage: "The user will ask something. Give your direct answer to that.",
            keepAlive: true,
            shouldThink: config('llm.models.vision.shouldThink')
        );

        // Nothing usable came back; avoid handing a non-string to json_decode().
        if (!is_string($rawResponse) || $rawResponse === '') {
            return null;
        }

        $decoded = json_decode($rawResponse, true);
        $answer = is_array($decoded) ? ($decoded['answer'] ?? null) : null;

        // Only a non-empty string satisfies the ?string contract; anything else
        // (invalid JSON, missing key, array/number answer) is reported as "no description".
        if (!is_string($answer) || $answer === '') {
            return null;
        }

        return $answer;
    }
}