    $this->ocr = new TesseractImageOCR();
    $this->llm = new OpenAPIPrompt();
}

public function getDescription(string $filePath): ?string
{
    /*
     1. Cut the video into screenshots
     2. Use OCR to extract text from each screenshot
     3. Use an LLM to generate a description of each screenshot
     4. Combine the descriptions of all screenshots into a single description
     5. Ask an LLM to describe the video
    */

    // Step 1: Cut the video into screenshots
    $screenshots = $this->cutVideoIntoScreenshots($filePath);
    if (empty($screenshots)) {
        throw new \Exception("No screenshots were generated from the video {$filePath}.");
    }

    // Steps 2 & 3: Use OCR to extract text and the vision LLM to describe each screenshot
    $descriptions = [];
    foreach ($screenshots as $screenshot) {
        $descriptions[$screenshot] = [];

        $ocrDescription = $this->ocr->performOCR($screenshot);
        $ocrDescription = empty($ocrDescription) ? 'No text found' : $ocrDescription;
        $descriptions[$screenshot]['ocr'] = $ocrDescription;

        $llmDescription = $this->llm->generate(
            config('llm.models.vision.name'),
            "Describe the content of this screenshot from a video. Do not specify that it is a screenshot, just describe the content.",
            images: [$screenshot],
            outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
            systemMessage: "The user will ask something. Give your direct answer to that.",
            keepAlive: $screenshot != end($screenshots), // Keep the model loaded for all but the last screenshot
            shouldThink: config('llm.models.vision.shouldThink')
        );
        $descriptions[$screenshot]['text'] = json_decode($llmDescription, true)['answer'] ?? 'No description generated';
    }

    // HERE COULD BE SOME INTERMEDIATE PROCESSING OF DESCRIPTIONS

    // Step 4: Combine the descriptions of all screenshots into a single description
    $combinedDescription = '';
    $screenshotCount = 0;
    foreach ($descriptions as $screenshot => $description) {
        $screenshotCount++;
        $combinedDescription .= "Screenshot: {$screenshotCount}\n";
        $combinedDescription .= "Timestamp: {$screenshotCount}s\n"; // TODO: cut the video into smaller parts when the video is short
        $combinedDescription .= "OCR: {$description['ocr']}\n";
        $combinedDescription .= "LLM Description: {$description['text']}\n\n";
    }
    $combinedDescription = trim($combinedDescription);

    // Step 5: Ask an LLM to describe the video based on the combined descriptions
    $llmDescription = $this->llm->generate(
        config('llm.models.chat.name'),
        self::DESCRIPTION_PROMPT . $combinedDescription,
        outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
        systemMessage: "The user will ask something. Give your direct answer to that.",
        keepAlive: true,
        shouldThink: config('llm.models.chat.shouldThink')
    );
    $llmDescription = json_decode($llmDescription, true)['answer'] ?? null;
    if (empty($llmDescription)) {
        $llmDescription = null;
    }

    return $llmDescription;
}

/**
 * Cut the video into screenshots.
 * Uses ffmpeg to extract one frame per second from the video.
 * The screenshots are saved in a temporary directory.
 *
 * @param string $filePath
 * @return array List of paths to the generated screenshots
 */
private function cutVideoIntoScreenshots(string $filePath): array
{
    $tempDir = sys_get_temp_dir() . '/video_screenshots';
    if (!is_dir($tempDir)) {
        mkdir($tempDir, 0777, true);
    }

    Log::info("Cutting video into screenshots: $filePath");

    // Extract one frame per second (fps=1) as numbered PNG files
    $outputPattern = $tempDir . '/screenshot_%d.png';
    $command = "ffmpeg -i " . escapeshellarg($filePath) . " -vf fps=1 " . escapeshellarg($outputPattern);
    exec($command);

    // Collect all screenshots
    $screenshots = glob($tempDir . '/screenshot_*.png');

    return $screenshots;
}
}
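
// --- Usage sketch (illustrative, not part of the original snippet) ---
// A minimal example of how the method above could be called. The wrapping
// class name "VideoFileHandler" and the example path are assumptions; only
// getDescription(string $filePath): ?string is defined in this snippet.
//
// $handler = new VideoFileHandler();
// $description = $handler->getDescription('/path/to/video.mp4');
//
// if ($description === null) {
//     // The LLM returned no usable answer for this video.
//     Log::warning('Could not generate a description for the video.');
// }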