118 lines
5.0 KiB
PHP
118 lines
5.0 KiB
PHP
<?php
|
|
|
|
namespace App\FileTools\VideoDescriptor;
|
|
|
|
use App\AIPrompt\IAIPrompt;
|
|
use App\AIPrompt\OpenAPIPrompt;
|
|
use App\FileTools\OCR\IImageOCR;
|
|
use App\FileTools\OCR\TesseractImageOCR;
|
|
use Log;
|
|
|
|
/**
 * Builds a textual description of a video from its frames.
 *
 * The video is cut into screenshots with ffmpeg, each screenshot is run
 * through OCR and a vision LLM, and the per-screenshot results are then
 * summarised by a chat LLM into a single description.
 */
class OCRLLMVideoDescriptor implements IVideoDescriptor
{
    /** OCR engine used to extract text from each screenshot. */
    private IImageOCR $ocr;

    /** LLM client; called with the vision model per screenshot and the chat model for the summary. */
    private IAIPrompt $llm;

    public const DESCRIPTION_PROMPT = "Describe the video based on the screenshots. Each screenshot has a timestamp of when in the video the screenshot was taken, an OCR result and a description of the screenshot by an LLM. Do not specify that it is a video, just describe the video. The description must have a maximum of 500 words.\n";

    /** JSON schema passed as outputFormat to every LLM call: an object with a single string "answer". */
    private const ANSWER_FORMAT = '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}';

    /** System message sent with every LLM call. */
    private const SYSTEM_MESSAGE = "The user will ask something. Give your direct answer to that.";

    public function __construct()
    {
        $this->ocr = new TesseractImageOCR();
        $this->llm = new OpenAPIPrompt();
    }

    /**
     * Describe the video stored at the given path.
     *
     * Steps:
     *   1. Cut the video into screenshots.
     *   2. Use OCR to extract text from each screenshot.
     *   3. Use an LLM to describe each screenshot.
     *   4. Combine the per-screenshot results into a single text.
     *   5. Ask an LLM to describe the video based on that text.
     *
     * @param string $filePath Path of the video file.
     * @return string|null The generated description, or null when the LLM response
     *                     contains no usable "answer".
     * @throws \Exception When no screenshot could be generated from the video.
     */
    public function getDescription(string $filePath): ?string
    {
        // Step 1: Cut video into screenshots
        $screenshots = $this->cutVideoIntoScreenshots($filePath);

        if (empty($screenshots)) {
            throw new \Exception("No screenshots were generated from the video {$filePath}.");
        }

        // Step 2 & 3: the screenshot files are only needed here, so they are removed
        // as soon as this step finishes, whether it succeeded or threw.
        try {
            $descriptions = $this->describeScreenshots($screenshots);
        } finally {
            $this->removeScreenshotDirectory(dirname($screenshots[0]));
        }

        // HERE COULD BE SOME INTERMEDIATE PROCESSING OF DESCRIPTIONS

        // Step 4: Combine the descriptions of all screenshots into a single description
        $combinedDescription = $this->combineDescriptions($descriptions);

        // Step 5: Ask an LLM to describe the video based on the combined descriptions
        $llmDescription = $this->llm->generate(
            config('llm.models.chat.name'),
            self::DESCRIPTION_PROMPT . $combinedDescription,
            outputFormat: self::ANSWER_FORMAT,
            systemMessage: self::SYSTEM_MESSAGE,
            keepAlive: true,
            shouldThink: config('llm.models.chat.shouldThink'),
        );

        $llmDescription = json_decode($llmDescription, true)['answer'] ?? null;
        if (empty($llmDescription)) {
            $llmDescription = null;
        }

        return $llmDescription;
    }

    /**
     * Run OCR and the vision LLM on every screenshot.
     *
     * @param string[] $screenshots Non-empty list of screenshot paths in playback order.
     * @return array<string, array{ocr: mixed, text: mixed}> Results keyed by screenshot path,
     *         in the same order as the input.
     */
    private function describeScreenshots(array $screenshots): array
    {
        // Determined once up front instead of calling end() on every iteration.
        $lastScreenshot = $screenshots[array_key_last($screenshots)];

        $descriptions = [];
        foreach ($screenshots as $screenshot) {
            $ocrDescription = $this->ocr->performOCR($screenshot);

            $llmDescription = $this->llm->generate(
                config('llm.models.vision.name'),
                "Describe the content of this screenshot from a video. Do not specify that it is a screenshot, just describe the content.",
                images: [$screenshot],
                outputFormat: self::ANSWER_FORMAT,
                systemMessage: self::SYSTEM_MESSAGE,
                keepAlive: $screenshot !== $lastScreenshot, // Keep alive for all but the last screenshot
                shouldThink: config('llm.models.vision.shouldThink'),
            );

            $descriptions[$screenshot] = [
                'ocr' => empty($ocrDescription) ? 'No text found' : $ocrDescription,
                'text' => json_decode($llmDescription, true)['answer'] ?? 'No description generated',
            ];
        }

        return $descriptions;
    }

    /**
     * Render the per-screenshot results as the text block that is appended to
     * DESCRIPTION_PROMPT.
     *
     * @param array<string, array{ocr: mixed, text: mixed}> $descriptions Results in playback order.
     * @return string One "Screenshot / Timestamp / OCR / LLM Description" section per
     *                screenshot, separated by blank lines, without surrounding whitespace.
     */
    private function combineDescriptions(array $descriptions): string
    {
        $combinedDescription = '';
        $screenshotCount = 0;

        foreach ($descriptions as $description) {
            $screenshotCount++;
            $combinedDescription .= "Screenshot: {$screenshotCount}\n";
            // The screenshots are taken at one frame per second, so the running
            // number doubles as the timestamp in seconds.
            $combinedDescription .= "Timestamp: {$screenshotCount}s\n"; // TODO Cut the video in smaller parts when the video is short
            $combinedDescription .= "OCR: {$description['ocr']}\n";
            $combinedDescription .= "LLM Description: {$description['text']}\n\n";
        }

        return trim($combinedDescription);
    }

    /**
     * Cut the video into screenshots.
     *
     * Uses ffmpeg (`-vf fps=1`) to write PNG screenshots into a temporary
     * directory that is created for this call only, so screenshots of other
     * videos (earlier or concurrent runs) can never end up in the result.
     * The caller is responsible for removing the directory once the returned
     * files are no longer needed; when no screenshot was produced the directory
     * is removed here.
     *
     * @param string $filePath Path of the video file.
     * @return string[] Screenshot paths in frame order; empty when ffmpeg produced none.
     * @throws \RuntimeException When the temporary directory cannot be created.
     */
    private function cutVideoIntoScreenshots(string $filePath): array
    {
        $tempDir = sys_get_temp_dir() . '/video_screenshots_' . bin2hex(random_bytes(8));
        if (!mkdir($tempDir, 0777, true) && !is_dir($tempDir)) {
            throw new \RuntimeException("Could not create temporary directory {$tempDir}.");
        }

        Log::info("Cutting video into screenshots: $filePath");

        $outputPattern = $tempDir . '/screenshot_%d.png';
        $command = "ffmpeg -i " . escapeshellarg($filePath) . " -vf fps=1 " . escapeshellarg($outputPattern) . ' 2>&1';

        $output = [];
        $exitCode = 0;
        exec($command, $output, $exitCode);

        if ($exitCode !== 0) {
            // Not fatal on its own: frames written before the failure are still used.
            // A run that produced nothing is reported by the caller.
            Log::warning(
                "ffmpeg exited with code {$exitCode} for {$filePath}: "
                . implode("\n", array_slice($output, -5))
            );
        }

        // Collect all screenshots. glob() sorts alphabetically, which would put
        // screenshot_10.png before screenshot_2.png, so restore the frame order
        // with a natural sort.
        $screenshots = glob($tempDir . '/screenshot_*.png') ?: [];
        sort($screenshots, SORT_NATURAL);

        if ($screenshots === []) {
            $this->removeScreenshotDirectory($tempDir);
        }

        return $screenshots;
    }

    /**
     * Delete a screenshot directory together with the files directly inside it.
     *
     * Entries that are not regular files are left in place, in which case the
     * directory itself is kept as well.
     *
     * @param string $directory Directory created by cutVideoIntoScreenshots().
     */
    private function removeScreenshotDirectory(string $directory): void
    {
        if (!is_dir($directory)) {
            return;
        }

        foreach (glob($directory . '/*') ?: [] as $file) {
            if (is_file($file)) {
                unlink($file);
            }
        }

        // rmdir() only works on an empty directory; skip it when something is left.
        if ((glob($directory . '/*') ?: []) === []) {
            rmdir($directory);
        }
    }
}
|