cutVideoIntoScreenshots($filePath); if (empty($screenshots)) { throw new \Exception("No screenshots were generated from the video {$filePath}."); } // Step 2 & 3: Use OCR to extract text and LLM to get description from screenshots $descriptions = []; foreach ($screenshots as $values) { $screenshot = $values['screenshot']; $timestamp = $values['timestamp']; $descriptions[$screenshot] = []; $ocrDescription = $this->ocr->performOCR($screenshot); $ocrDescription = empty($ocrDescription) ? 'No text found' : $ocrDescription; $descriptions[$screenshot]['ocr'] = $ocrDescription; dump($ocrDescription); // DEBUG $llmDescription = $this->llm->generate( config('llm.models.vision.name'), "Describe this image in detail, breaking it down into distinct parts as follows: 1. **Scene Description:** Describe the overall setting and environment of the image (e.g., forest clearing, futuristic city street, medieval castle interior). 2. **Main Subject/Character(s):** Detail what is happening with the primary character or subject present in the frame. 3. **Text Description (if any):** If there are visible text elements (like words, letters, captions), describe them exactly as they appear and note their location relative to other elements. This includes any emojis used in captions, describing their visual appearance and likely meaning. 4. **Summary:** Briefly summarize the key content of the image for clarity. 5. **Joke:** If the image is part of a meme or humorous content, describe the joke or humorous element present in the image. Do not include this part if you are not sure to understand the joke/meme. Format your response strictly using numbered lines corresponding to these four points (1., 2., 3., 4., 5.). Do not use markdown formatting or extra text outside these lines; simply list them sequentially as plain text output.", images: [$screenshot], outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}', systemMessage: "You are an image understanding AI specialized in describing visual scenes accurately and concisely. Your task is solely to describe the content of the provided image based on what you can visually perceive. Please analyze the image carefully and provide a description focusing purely on the visible information without generating any text about concepts, interpretations, or future actions beyond the immediate scene. Describe everything that is clearly depicted.", keepAlive: $screenshot != end($screenshots), // Keep alive for all but the last screenshot shouldThink: config('llm.models.vision.shouldThink') ); dump($llmDescription); // DEBUG $descriptions[$screenshot]['text'] = json_decode($llmDescription, true)['answer'] ?? 'No description generated'; } // HERE COULD BE SOME INTERMEDIATE PROCESSING OF DESCRIPTIONS // Step 4: Combine the descriptions of all screenshots into a single description $combinedDescription = ''; $screenshotCount = 0; foreach ($screenshots as $values) { $screenshot = $values['screenshot']; $timestamp = $values['timestamp']; $screenshotCount++; $description = $descriptions[$screenshot] ?? []; $combinedDescription .= "Screenshot: {$screenshotCount}\n"; $combinedDescription .= "Timestamp: {$timestamp}s\n"; // TODO Cut the video in smaller parts when the video is short $combinedDescription .= "OCR: {$description['ocr']}\n"; $combinedDescription .= "LLM Description: {$description['text']}\n"; $combinedDescription .= "\n"; } $combinedDescription = trim($combinedDescription); // Step 5: Ask an LLM to describe the video based on the combined descriptions $llmDescription = $this->llm->generate( config('llm.models.chat.name'), static::DESCRIPTION_PROMPT . $combinedDescription . "\n\nBased only on these frame analyses, please provide: A single, concise description that captures the main action or theme occurring in the reel across all frames. Identify and describe any joke or humorous element present in the video if you can discern one. Important Considerations Remember that most videos are of poor quality; frame descriptions might be inaccurate, vague, or contradictory due to blurriness or fast cuts. Your task is synthesis: focus on the overall impression and sequence, not perfecting each individual piece of information. Some details mentioned in one analysis may simply be incorrect or misidentified from another perspective. Analyze all provided frames (separated by --- for clarity) to understand what's happening. Then, synthesize this understanding into point 1 above and identify the joke if present as per point 2.", outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}', systemMessage: "You are an expert social media content analyst specializing in interpreting Instagram Reels. Your primary function is to generate a comprehensive description and identify any underlying humor or joke in a given video sequence. You will be provided with individual frame analyses, each containing: Screenshot Number: The sequential number of the frame. Timestamp: When that specific frame occurs within the reel. OCR Text Result: Raw text extracted from the image content using OCR (Optical Character Recognition), which may contain errors or misinterpretations (\"may appear\" descriptions). LLM Description of Screenshot: A textual interpretation of what's visible in the frame, based on previous LLM processing. Please note: The individual frame analyses can be inconsistent due to low video quality (e.g., blurriness) or rapid scene changes where details are hard to distinguish. Your task is not to perfect each frame description but to understand the overall sequence and likely narrative, focusing on identifying any joke, irony, absurdity, or humorous transformation occurring across these frames. Your response should be structured as follows: Overall Video Description: Provide a concise summary of what happens in the reel based on the combined information from all the provided screenshots. Humor/Joke Identification (If Applicable): If you can discern any joke or humorous element, explicitly state it and explain how the sequence of frames contributes to this. Instructions for Synthesis: Focus on identifying recurring elements, main subject(s), consistent actions/actions that seem unlikely (potential contradiction). Look for patterns where details change rapidly or absurdly. Prioritize information from descriptions over relying solely on OCR text if the description seems more plausible. Ignore minor inconsistencies between frames unless they clearly contradict a central theme or joke premise. Be ready to point out where the humor lies, which might involve unexpected changes, wordplay captured by OCR errors in the context of the visual action described, absurdity, or irony.", keepAlive: true, shouldThink: config('llm.models.chat.shouldThink') ); $llmDescription = json_decode($llmDescription, true)['answer'] ?? null; if (empty($llmDescription)) { $llmDescription = null; } dump($llmDescription); // DEBUG return $llmDescription; } }