Added video transcription

2025-07-04 12:36:03 +02:00
parent a00c5ba4b8
commit f0e52147e4
8 changed files with 321 additions and 41 deletions


@@ -56,4 +56,33 @@ abstract class AbstractLLMVideoDescriptor implements IVideoDescriptor
        }
        return $array;
    }

    /**
     * Extract audio from the video file.
     * Uses ffmpeg to pull the audio track out of the video file and
     * saves it in a temporary directory as an MP3 file.
     * Returns null if the audio extraction fails.
     * @param string $filePath
     * @return string|null
     */
    protected function extractAudioFromVideo(string $filePath): ?string
    {
        $tempDir = sys_get_temp_dir() . '/video_audio';
        if (!is_dir($tempDir)) {
            mkdir($tempDir, 0777, true);
        }
        else {
            // Clear the directory if it already exists
            array_map('unlink', glob($tempDir . '/*'));
        }

        $outputFile = $tempDir . '/audio.mp3';
        $command = "ffmpeg -i " . escapeshellarg($filePath) . " " . escapeshellarg($outputFile);
        exec($command);

        if (file_exists($outputFile)) {
            return $outputFile;
        }
        return null;
    }
}
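The committed helper runs ffmpeg with default options and only checks whether the output file exists afterwards. A slightly more defensive variant is sketched below; the `-vn`/`-y` flags, the stderr redirect, and the exit-code check are additions of this sketch, not part of the commit:

```php
// Sketch only: skip the video stream (-vn), overwrite stale output (-y),
// and treat a non-zero ffmpeg exit code as a failed extraction.
$command = sprintf(
    'ffmpeg -y -i %s -vn %s 2>&1',
    escapeshellarg($filePath),
    escapeshellarg($outputFile)
);
exec($command, $ffmpegOutput, $exitCode);

if ($exitCode !== 0 || !file_exists($outputFile)) {
    return null; // e.g. the video has no audio stream
}
return $outputFile;
```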


@@ -4,12 +4,14 @@ namespace App\Services\FileTools\VideoDescriptor;
use App\Services\AIPrompt\OpenAPIPrompt;
use App\Services\FileTools\OCR\IImageOCR;
use App\Services\FileTools\Transcription\IAudioTranscriptor;

class OCRLLMVideoDescriptor extends AbstractLLMVideoDescriptor implements IVideoDescriptor
{
    public const DESCRIPTION_PROMPT = "Analyze this Video sequence. You are given information for each individual screenshot/analysis from the video:";

    public function __construct(public IImageOCR $ocr, public OpenAPIPrompt $llm) {
    public function __construct(public IImageOCR $ocr, public OpenAPIPrompt $llm, public IAudioTranscriptor $audioTranscriptor)
    {
    }

    public function getDescription(string $filePath): ?string
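The constructor now asks for an `IAudioTranscriptor` via a promoted property. Assuming the project wires its services through Laravel's container (the `config()` helpers elsewhere in the diff suggest a Laravel app), the binding could look roughly like the sketch below; the provider and implementation class names are hypothetical:

```php
namespace App\Providers;

use App\Services\FileTools\Transcription\IAudioTranscriptor;
use Illuminate\Support\ServiceProvider;

class TranscriptionServiceProvider extends ServiceProvider
{
    public function register(): void
    {
        // WhisperAudioTranscriptor is an assumed implementation name,
        // not taken from this commit.
        $this->app->bind(
            IAudioTranscriptor::class,
            \App\Services\FileTools\Transcription\WhisperAudioTranscriptor::class
        );
    }
}
```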
@@ -24,6 +26,14 @@ class OCRLLMVideoDescriptor extends AbstractLLMVideoDescriptor implements IVideo
        // Step 1: Cut video into screenshots
        $screenshots = $this->cutVideoIntoScreenshots($filePath);
        $audio = $this->extractAudioFromVideo($filePath);

        // Audio transcription
        $audioTranscription = null;
        if (isset($audio)) {
            $audioTranscription = $this->audioTranscriptor->transcribe($audio);
            dump($audioTranscription); // DEBUG
        }
        if (empty($screenshots)) {
            throw new \Exception("No screenshots were generated from the video {$filePath}.");
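The concrete `IAudioTranscriptor` behind `transcribe()` is not part of this diff; only the call is visible. As a rough sketch of what such an implementation might look like against a Whisper-style `/v1/audio/transcriptions` endpoint, with the class name, method signature, endpoint, and config keys all assumed rather than taken from this repository:

```php
namespace App\Services\FileTools\Transcription;

use Illuminate\Support\Facades\Http;

// Hypothetical implementation; the interface signature is assumed from the call site.
class WhisperAudioTranscriptor implements IAudioTranscriptor
{
    public function transcribe(string $filePath): ?string
    {
        // Upload the extracted MP3 as multipart form data.
        $response = Http::attach('file', file_get_contents($filePath), basename($filePath))
            ->post(config('llm.transcription.url'), [
                'model' => config('llm.models.transcription.name'),
            ]);

        if (!$response->successful()) {
            return null;
        }

        // Whisper-compatible endpoints return {"text": "..."}
        return $response->json('text');
    }
}
```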
@@ -69,6 +79,17 @@ Please analyze the image carefully and provide a description focusing purely on
        // Step 4: Combine the descriptions of all screenshots into a single description
        $combinedDescription = '';

        // Add full video information
        // Audio transcription
        if (isset($audio)) {
            $combinedDescription .= "Audio Transcription: {$audioTranscription}\n";
        }
        if (!empty($combinedDescription)) {
            $combinedDescription .= "\n";
        }

        // Add screenshot descriptions
        $screenshotCount = 0;
        foreach ($screenshots as $values) {
            $screenshot = $values['screenshot'];
@@ -85,38 +106,41 @@ Please analyze the image carefully and provide a description focusing purely on
        }
        $combinedDescription = trim($combinedDescription);
        dump($combinedDescription); // DEBUG
        // Step 5: Ask an LLM to describe the video based on the combined descriptions
        $llmDescription = $this->llm->generate(
            config('llm.models.chat.name'),
            static::DESCRIPTION_PROMPT . $combinedDescription . "\n\nYou are analyzing an Instagram Reel (a short-form video). You have received multiple frames from this reel. For each frame:
1. A **screenshot number** is given (e.g., `Screenshot : 3`).
2. The approximate **timestamp in seconds** within the video where that frame occurs.
3. An **OCR result** which contains text extracted directly from an image of this frame, potentially including OCR errors or unusual characters.
4. A description provided by another LLM for that specific frame (the `LLM Description`).
Your task is to synthesize a single, coherent video description summarizing the entire reel (`the whole thing`). Use all the information (screenshot number, timestamp, OCR, and llm_description) but be aware that individual descriptions may be inaccurate due to poor image quality or interpretation errors. Look for consistency across multiple frames.
Analyze the sequence of events, character(s), setting, style (e.g., fast cuts, slow-motion), narrative structure (if any), humor, and joke elements throughout the video based on these frame-by-frame inputs. Pay special attention to identifying if there's an underlying joke or humorous concept running through the reel.
Based on your analysis, write a concise description (`the whole thing`) that captures the essence of this Instagram Reel. Format your output strictly as JSON with only the `answer` field containing this synthesized summary.",
            static::DESCRIPTION_PROMPT . $combinedDescription,
            outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
            systemMessage: "You are an AI assistant specialized in analyzing video content, particularly short-form videos like Instagram Reels. Your task is to synthesize a single description for the entire video based on sequential information provided from its screenshots and associated text data (OCR results).
            systemMessage: "You are an expert social media content analyst specializing in Instagram Reels. Your task is to synthesize descriptions and OCR findings from multiple screenshots of a single video reel into a single, concise, and accurate overall description of the video's content, style, and potential humor.
Your response must strictly follow this JSON format:
{\"answer\": \"<your final synthesized video description here as a string>\"}
Your input will consist of:
1. An audio transcription of the entire video.
2. Multiple entries containing:
- Screenshot number (e.g., \"Screenshot: 1\")
- Timestamp (in seconds) indicating its position in the reel
- Raw OCR text from that specific screenshot, which may contain errors or unusual characters but should be interpreted for content relevance.
- A description of the image content generated by an LLM for that screenshot.
## Rules
1. Analyze all provided inputs: screenshot number, timestamp, OCR result snippet, and LLM description for each frame.
2. The core goal is to produce one concise, coherent, and engaging video description that captures the essence of the entire reel (\"the whole thing\").
3. Individual frame descriptions can be inaccurate or contradictory (e.g., object changes drastically between frames). Prioritize consistency across multiple frames unless strongly contradicted by a clear majority.
4. Do not generate separate JSON objects for each screenshot; only produce one final `answer` string summarizing the video as a whole at the end of your reasoning.
5. Pay special attention to identifying any underlying joke, humor, or satirical element present in the reel based on the collective information.
The descriptions provided by the LLM for individual screenshots are often inconsistent with adjacent frames and might not capture subtle humor accurately. The raw OCR text can sometimes provide direct quotes relevant to the context, even if misspelled or partially recognized.
## Output Constraints
- Your response **MUST** be ONLY valid JSON conforming to the structure: {\"type\": \"object\", \"properties\": {\"answer\": {\"type\": \"string\"}}, \"required\": [\"answer\"]}.
- Only fill the `answer` field. Do not include any other text or explanations outside this JSON structure.
- The `answer` string should be a comprehensive description of the video, suitable for representing it to another user on a platform like Instagram/YouTube Shorts.",
Your response must be in **exactly** the following JSON format:
```json
{
\"answer\": \"{your synthesized description here}\"
}
```
Please follow these instructions carefully:
Analyze All Data: Consider both the audio transcription and all the screenshot data (OCR text and descriptions) together.
Synthesize Coherently: Create a single, flowing narrative that describes the main subject(s), actions, setting, transitions, sound/music, and overall style of the video reel based on the most consistent or contextually supported information across its frames.
Handle Inconsistencies: Assume that individual screenshot analyses might contain errors (especially with OCR) or be limited in scope. Do not rely solely on one frame's description contradicting another unless strongly supported by context and multiple data points converge to a different understanding or the inconsistency is clearly part of a joke requiring literal interpretation.
Focus on Repeated Elements: Pay close attention to subjects, actions, objects, text content (especially from OCR), sounds/words mentioned in the transcription, and visual styles that repeat across multiple frames, as this indicates continuity or recurring themes/humor.
Identify Joke/Humor: Actively look for elements within the combined data that suggest a joke, satire, absurdity, irony, sarcasm, clever wordplay (from OCR/transcription), or unexpected humor. This includes inconsistent descriptions if they are clearly intended as part of a gag, visual puns, audio-visual mismatches mentioned in the transcription, or any content designed for comedic effect.
Prioritize Core Content: Base your description primarily on the core subject and action within the reel (as identified repeatedly across frames). Use details from individual screenshots to flesh out specific moments only if they fit this narrative context.
Filter Minor Details: Ignore highly variable or insignificant details that appear inconsistent unless they are clearly integral to the joke or overall theme (e.g., slight variations in background color might be acceptable, but a consistent change is important).
Output Requirement: Your response must contain only valid JSON with an object having exactly one property answer of type string. Do not output any other text, explanations, lists, or code outside this JSON structure.",
            keepAlive: true,
            shouldThink: config('llm.models.chat.shouldThink')
        );
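The `generate()` call constrains the model to a JSON object with a single `answer` field via `outputFormat`. Assuming it hands back the raw JSON string produced by the model (its actual return type is not visible in this diff), pulling the answer out could be as simple as:

```php
// Sketch under the assumption that $llmDescription is the model's raw JSON output.
$decoded = json_decode($llmDescription, true);
$answer = is_array($decoded) ? ($decoded['answer'] ?? null) : null;
```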