LLM reel caption and video description + Refactor in services
This commit is contained in:
@ -0,0 +1,64 @@
|
||||
<?php
|
||||
|
||||
namespace App\Services\FileTools\VideoDescriptor;
|
||||
|
||||
use App\Services\AIPrompt\OpenAPIPrompt;
|
||||
use App\Services\FileTools\OCR\IImageOCR;
|
||||
|
||||
class LLMFullVideoDescriptor extends AbstractLLMVideoDescriptor implements IVideoDescriptor
|
||||
{
|
||||
public const DESCRIPTION_PROMPT = "Describe the video based on the screenshots. Each screenshot has a timestamp of when in the video the screenshot was taken. Do not specify that it is a video, just describe the video. Do not describe the screenshots one by one, try to make sense out of all the screenshots, what could be the video about ? What capion is attached to the video ? is it a meme ? If yes, what is the joke ? Be the most descriptive without exceeding 5000 words.\n";
|
||||
|
||||
public function __construct(public IImageOCR $ocr, public OpenAPIPrompt $llm) {
|
||||
}
|
||||
|
||||
public function getDescription(string $filePath): ?string
|
||||
{
|
||||
/*
|
||||
1. Cut videos in screenshots
|
||||
2. Ask an LLM to describe the video with all the screenshots
|
||||
*/
|
||||
|
||||
// Step 1: Cut video into screenshots
|
||||
$screenshots = $this->cutVideoIntoScreenshots($filePath);
|
||||
|
||||
if (empty($screenshots)) {
|
||||
throw new \Exception("No screenshots were generated from the video {$filePath}.");
|
||||
}
|
||||
|
||||
// Step 4: Combine the descriptions of all screenshots into a single description
|
||||
$combinedDescription = '';
|
||||
$screenshotCount = 0;
|
||||
foreach ($screenshots as $values) {
|
||||
$screenshot = $values['screenshot'];
|
||||
$timestamp = $values['timestamp'];
|
||||
|
||||
$screenshotCount++;
|
||||
$combinedDescription .= "Screenshot: {$screenshotCount}\n";
|
||||
$combinedDescription .= "Timestamp: {$timestamp}s\n"; // TODO Cut the video in smaller parts when the video is short
|
||||
$ocrDescription = $this->ocr->performOCR($screenshot);
|
||||
$ocrDescription = empty($ocrDescription) ? 'No text found' : $ocrDescription;
|
||||
$combinedDescription .= "OCR: {$ocrDescription}\n"; // Perform OCR on the screenshot
|
||||
$combinedDescription .= "\n";
|
||||
}
|
||||
$combinedDescription = trim($combinedDescription);
|
||||
|
||||
// Step 5: Ask an LLM to describe the video based on the combined descriptions
|
||||
$llmDescription = $this->llm->generate(
|
||||
config('llm.models.vision.name'),
|
||||
static::DESCRIPTION_PROMPT . $combinedDescription,
|
||||
images: array_map(function ($screenshot) {return $screenshot["screenshot"];}, $screenshots), // Pass the screenshots to the LLM
|
||||
outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
|
||||
systemMessage: "The user will ask something. Give your direct answer to that.",
|
||||
keepAlive: true,
|
||||
shouldThink: config('llm.models.vision.shouldThink')
|
||||
);
|
||||
|
||||
$llmDescription = json_decode($llmDescription, true)['answer'] ?? null;
|
||||
if (empty($llmDescription)) {
|
||||
$llmDescription = null;
|
||||
}
|
||||
|
||||
return $llmDescription;
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user