Files
DatBrowser/app/Services/FileTools/VideoDescriptor/LLMFullVideoDescriptor.php

65 lines
3.0 KiB
PHP

<?php
namespace App\Services\FileTools\VideoDescriptor;
use App\Services\AIPrompt\OpenAPIPrompt;
use App\Services\FileTools\OCR\IImageOCR;
/**
 * Describes a video by sending all of its screenshots, together with each
 * screenshot's timestamp and OCR text, to the configured vision model in a
 * single request.
 */
class LLMFullVideoDescriptor extends AbstractLLMVideoDescriptor implements IVideoDescriptor
{
    /** Instruction sent to the LLM; the per-screenshot context is appended to it. */
    public const DESCRIPTION_PROMPT = "Describe the video based on the screenshots. Each screenshot has a timestamp of when in the video the screenshot was taken. Do not specify that it is a video, just describe the video. Do not describe the screenshots one by one, try to make sense out of all the screenshots, what could be the video about ? What caption is attached to the video ? is it a meme ? If yes, what is the joke ? Be the most descriptive without exceeding 5000 words.\n";

    /**
     * @param IImageOCR     $ocr OCR engine run on every screenshot.
     * @param OpenAPIPrompt $llm Client used to query the model.
     */
    public function __construct(public IImageOCR $ocr, public OpenAPIPrompt $llm)
    {
    }

    /**
     * Generate a description of the video located at $filePath.
     *
     * Steps:
     *   1. Cut the video into screenshots.
     *   2. Build a text context (index, timestamp, OCR text) for each screenshot.
     *   3. Ask the LLM to describe the video from the screenshots and that context.
     *   4. Extract the "answer" field from the JSON response.
     *
     * @param string $filePath Path of the video, passed to cutVideoIntoScreenshots().
     *
     * @return string|null The "answer" returned by the LLM, or null when the response
     *                     cannot be decoded or holds no non-blank string answer.
     *
     * @throws \RuntimeException When no screenshots were generated from the video.
     */
    public function getDescription(string $filePath): ?string
    {
        // Step 1: cut the video into screenshots (inherited helper).
        $screenshots = $this->cutVideoIntoScreenshots($filePath);
        if (empty($screenshots)) {
            throw new \RuntimeException("No screenshots were generated from the video {$filePath}.");
        }

        // Step 2: build one context block per screenshot and collect the images
        // in the same pass so both stay in the same order.
        // TODO Cut the video in smaller parts when the video is short
        $images = [];
        $contextBlocks = [];
        foreach ($screenshots as $frame) {
            $image = $frame['screenshot'];
            $timestamp = $frame['timestamp'];
            $images[] = $image;

            // Compare against '' instead of using empty(): empty() would discard
            // a legitimate OCR result of "0" and would accept whitespace-only text.
            $ocrText = trim((string) $this->ocr->performOCR($image));
            if ($ocrText === '') {
                $ocrText = 'No text found';
            }

            $index = count($images);
            $contextBlocks[] = "Screenshot: {$index}\nTimestamp: {$timestamp}s\nOCR: {$ocrText}\n";
        }
        // Blocks end with a newline, so joining on "\n" leaves a blank line between them.
        $context = trim(implode("\n", $contextBlocks));

        // Step 3: ask the LLM to describe the video from the screenshots and their context.
        $rawResponse = $this->llm->generate(
            config('llm.models.vision.name'),
            static::DESCRIPTION_PROMPT . $context,
            images: $images,
            outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
            systemMessage: "The user will ask something. Give your direct answer to that.",
            keepAlive: true,
            shouldThink: config('llm.models.vision.shouldThink'),
        );

        // Step 4: extract the answer. Invalid JSON, a non-object payload or a
        // non-string answer all yield null rather than violating the ?string return type.
        $decoded = json_decode((string) $rawResponse, true);
        $answer = is_array($decoded) ? ($decoded['answer'] ?? null) : null;
        if (!is_string($answer) || trim($answer) === '') {
            return null;
        }

        return $answer;
    }
}