cutVideoIntoScreenshots($filePath);

        if (empty($screenshots)) {
            throw new \Exception("No screenshots were generated from the video {$filePath}.");
        }

        // Step 4: Combine the OCR results of all screenshots into a single description
        $combinedDescription = '';
        $screenshotCount = 0;

        foreach ($screenshots as $values) {
            $screenshot = $values['screenshot'];
            $timestamp = $values['timestamp'];
            $screenshotCount++;

            $combinedDescription .= "Screenshot: {$screenshotCount}\n";
            $combinedDescription .= "Timestamp: {$timestamp}s\n";

            // TODO: Cut the video into smaller parts when the video is short

            // Perform OCR on the screenshot
            $ocrDescription = $this->ocr->performOCR($screenshot);
            $ocrDescription = empty($ocrDescription) ? 'No text found' : $ocrDescription;
            $combinedDescription .= "OCR: {$ocrDescription}\n";

            $combinedDescription .= "\n";
        }

        $combinedDescription = trim($combinedDescription);

        // Step 5: Ask an LLM to describe the video based on the combined descriptions
        $llmDescription = $this->llm->generate(
            config('llm.models.vision.name'),
            static::DESCRIPTION_PROMPT . $combinedDescription,
            // Pass the screenshots to the LLM as images
            images: array_map(fn ($screenshot) => $screenshot['screenshot'], $screenshots),
            outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
            systemMessage: "The user will ask something. Give your direct answer to that.",
            keepAlive: true,
            shouldThink: config('llm.models.vision.shouldThink')
        );

        $llmDescription = json_decode($llmDescription, true)['answer'] ?? null;

        // Treat an empty answer as no description
        if (empty($llmDescription)) {
            $llmDescription = null;
        }

        return $llmDescription;
    }
}
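
/*
 * For reference, a minimal sketch of what a cutVideoIntoScreenshots() helper
 * could look like. This is an assumption, not the project's implementation:
 * it presumes ffmpeg is available on the PATH, and that each returned entry
 * must expose the 'screenshot' and 'timestamp' keys consumed by the loop
 * above. The extraction interval and output directory are illustrative.
 *
 * private function cutVideoIntoScreenshots(string $filePath, int $intervalSeconds = 10): array
 * {
 *     $outputDir = sys_get_temp_dir() . '/screenshots_' . uniqid();
 *     mkdir($outputDir, 0755, true);
 *
 *     // Extract one frame every $intervalSeconds seconds as numbered JPEGs
 *     $command = sprintf(
 *         'ffmpeg -i %s -vf fps=1/%d %s 2>&1',
 *         escapeshellarg($filePath),
 *         $intervalSeconds,
 *         escapeshellarg($outputDir . '/frame_%04d.jpg')
 *     );
 *     exec($command);
 *
 *     // Build the [screenshot, timestamp] pairs expected by the caller
 *     $screenshots = [];
 *     foreach (glob($outputDir . '/frame_*.jpg') as $index => $frame) {
 *         $screenshots[] = [
 *             'screenshot' => $frame,
 *             'timestamp'  => $index * $intervalSeconds,
 *         ];
 *     }
 *
 *     return $screenshots;
 * }
 */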