65 lines
3.0 KiB
PHP
65 lines
3.0 KiB
PHP
<?php
|
|
|
|
namespace App\Services\FileTools\VideoDescriptor;
|
|
|
|
use App\Services\AIPrompt\OpenAPIPrompt;
|
|
use App\Services\FileTools\OCR\IImageOCR;
|
|
|
|
/**
 * Describes a video by cutting it into screenshots, running OCR on each
 * screenshot, and asking a vision LLM for one description of the whole video.
 */
class LLMFullVideoDescriptor extends AbstractLLMVideoDescriptor implements IVideoDescriptor
{
    /**
     * Instruction sent to the LLM; the per-screenshot context is appended to it.
     * It is read through `static::` so a subclass may override it.
     *
     * NOTE(review): "capion" looks like a typo for "caption". The text is kept
     * byte-identical here because it is sent to the model at runtime; fix it
     * when the prompt is next revised.
     */
    public const DESCRIPTION_PROMPT = "Describe the video based on the screenshots. Each screenshot has a timestamp of when in the video the screenshot was taken. Do not specify that it is a video, just describe the video. Do not describe the screenshots one by one, try to make sense out of all the screenshots, what could be the video about ? What capion is attached to the video ? is it a meme ? If yes, what is the joke ? Be the most descriptive without exceeding 5000 words.\n";

    /**
     * @param IImageOCR     $ocr OCR engine applied to every screenshot.
     * @param OpenAPIPrompt $llm LLM client used to generate the description.
     */
    public function __construct(public IImageOCR $ocr, public OpenAPIPrompt $llm)
    {
    }

    /**
     * Builds a description of the video located at $filePath.
     *
     * Steps:
     *   1. Cut the video into screenshots.
     *   2. Run OCR on every screenshot and collect index, timestamp and text.
     *   3. Ask the LLM to describe the video from the screenshots and that text.
     *
     * @param string $filePath Path handed to cutVideoIntoScreenshots().
     *
     * @return string|null The "answer" field of the LLM's JSON reply, or null when
     *                     the reply is not valid JSON, has no "answer", or the
     *                     answer is not a non-empty string.
     *
     * @throws \RuntimeException When no screenshot could be generated.
     */
    public function getDescription(string $filePath): ?string
    {
        // Step 1: cut the video into screenshots.
        // Each entry is read below through its 'screenshot' and 'timestamp' keys.
        $screenshots = $this->cutVideoIntoScreenshots($filePath);

        if (empty($screenshots)) {
            // \RuntimeException extends \Exception, so existing catch blocks still match.
            throw new \RuntimeException("No screenshots were generated from the video {$filePath}.");
        }

        // Step 2: build the textual context (index, timestamp, OCR text) for each screenshot.
        // TODO Cut the video in smaller parts when the video is short
        $combinedDescription = '';
        $screenshotCount = 0;
        foreach ($screenshots as $values) {
            $screenshot = $values['screenshot'];
            $timestamp = $values['timestamp'];

            $screenshotCount++;

            // Perform OCR on the screenshot; fall back to a placeholder when nothing is read.
            $ocrDescription = $this->ocr->performOCR($screenshot);
            $ocrDescription = empty($ocrDescription) ? 'No text found' : $ocrDescription;

            $combinedDescription .= "Screenshot: {$screenshotCount}\n";
            $combinedDescription .= "Timestamp: {$timestamp}s\n";
            $combinedDescription .= "OCR: {$ocrDescription}\n";
            $combinedDescription .= "\n";
        }
        $combinedDescription = trim($combinedDescription);

        // Step 3: ask the LLM to describe the video from the screenshots and their context.
        $rawResponse = $this->llm->generate(
            config('llm.models.vision.name'),
            static::DESCRIPTION_PROMPT . $combinedDescription,
            // array_map (not array_column) keeps the original array keys.
            images: array_map(static fn ($entry) => $entry['screenshot'], $screenshots),
            outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
            systemMessage: "The user will ask something. Give your direct answer to that.",
            keepAlive: true,
            shouldThink: config('llm.models.vision.shouldThink'),
        );

        // The reply is expected to be a JSON object with an "answer" string, but the
        // model may return something else. Anything that is not a non-empty string is
        // mapped to null instead of triggering a TypeError on the ?string return type.
        $decoded = is_string($rawResponse) ? json_decode($rawResponse, true) : null;
        $answer = is_array($decoded) ? ($decoded['answer'] ?? null) : null;

        return is_string($answer) && $answer !== '' ? $answer : null;
    }
}
|