diff --git a/.env.example b/.env.example
index 5ac32c7..338e0a4 100644
--- a/.env.example
+++ b/.env.example
@@ -81,3 +81,8 @@ VITE_REVERB_APP_KEY="${REVERB_APP_KEY}"
 VITE_REVERB_HOST="${REVERB_HOST}"
 VITE_REVERB_PORT="${REVERB_PORT}"
 VITE_REVERB_SCHEME="${REVERB_SCHEME}"
+
+# AI LLM
+LLM_HOST_URL="https://openai.com/api"
+LLM_CHAT_MODEL="gpt-4o"
+LLM_VISION_MODEL="gpt-4o-vision-preview"
diff --git a/Dockerfile b/Dockerfile
index fe562f2..f5b0d1b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -54,6 +54,8 @@ RUN apk update && apk add --no-cache \
     openssl \
     linux-headers \
     supervisor \
+    tesseract-ocr \
+    ffmpeg \
     && rm -rf /tmp/* /var/cache/apk/*
 
 RUN docker-php-ext-configure zip && docker-php-ext-install zip
diff --git a/app/AIPrompt/IAIPrompt.php b/app/AIPrompt/IAIPrompt.php
new file mode 100644
index 0000000..ebc7706
--- /dev/null
+++ b/app/AIPrompt/IAIPrompt.php
@@ -0,0 +1,10 @@
+<?php
+
+namespace App\AIPrompt;
+
+interface IAIPrompt
+{
+    public function generate(string $model, string $prompt, array $images = [],
+        ?string $outputFormat = null, ?string $systemMessage = null,
+        bool $keepAlive = true, bool $shouldThink = false): string;
+}
diff --git a/app/AIPrompt/OpenAPIPrompt.php b/app/AIPrompt/OpenAPIPrompt.php
new file mode 100644
--- /dev/null
+++ b/app/AIPrompt/OpenAPIPrompt.php
@@ -0,0 +1,137 @@
+<?php
+
+namespace App\AIPrompt;
+
+class OpenAPIPrompt implements IAIPrompt
+{
+    private string $host;
+
+    public function __construct(?string $host = null)
+    {
+        $this->host = $host ?? config('llm.host');
+    }
+
+    private function getHeaders(): array
+    {
+        return [
+            'Content-Type' => 'application/json'
+        ];
+    }
+
+    /**
+     * POST a JSON body to the configured LLM host and return the raw response.
+     * @param string $endpoint
+     * @param string $body
+     * @throws \Exception
+     * @return string
+     */
+    private function callAPI(string $endpoint, string $body): string
+    {
+        $url = $this->host . $endpoint;
+
+        $ch = curl_init($url);
+        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
+        curl_setopt($ch, CURLOPT_HTTPHEADER, $this->getHeaders());
+        curl_setopt($ch, CURLOPT_POST, true);
+        curl_setopt($ch, CURLOPT_POSTFIELDS, $body);
+        $response = curl_exec($ch);
+        $curlError = curl_error($ch);
+        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
+        curl_close($ch);
+        if ($response === false) {
+            throw new \Exception("Error calling LLM API: $curlError");
+        }
+        if ($httpCode !== 200) {
+            throw new \Exception("Error calling LLM API: HTTP $httpCode - $response");
+        }
+        return $response;
+    }
+
+    /**
+     * Call the /api/generate endpoint to generate a response to a prompt.
+     * @param string $model
+     * @param string $prompt
+     * @param array $images
+     * @param string|null $outputFormat
+     * @param string|null $systemMessage
+     * @param bool $keepAlive
+     * @param bool $shouldThink
+     * @return string
+     */
+    public function generate(string $model, string $prompt, array $images = [], ?string $outputFormat = null, ?string $systemMessage = null, bool $keepAlive = true, bool $shouldThink = false): string
+    {
+        /*
+        Generate a completion
+
+        POST /api/generate
+
+        Generate a response for a given prompt with a provided model. This is a streaming endpoint, so there will be a series of responses. The final response object will include statistics and additional data from the request.
+
+        Parameters
+
+        model: (required) the model name
+        prompt: the prompt to generate a response for
+        suffix: the text after the model response
+        images: (optional) a list of base64-encoded images (for multimodal models such as llava)
+        think: (for thinking models) should the model think before responding?
+
+        Advanced parameters (optional):
+
+        format: the format to return a response in. Format can be json or a JSON schema
+        options: additional model parameters listed in the documentation for the Modelfile, such as temperature
+        system: system message (overrides what is defined in the Modelfile)
+        template: the prompt template to use (overrides what is defined in the Modelfile)
+        stream: if false the response will be returned as a single response object, rather than a stream of objects
+        raw: if true no formatting will be applied to the prompt. You may choose to use the raw parameter if you are specifying a full templated prompt in your request to the API
+        keep_alive: controls how long the model will stay loaded in memory following the request (default: 5m)
+        context (deprecated): the context parameter returned from a previous request to /generate; this can be used to keep a short conversational memory
+
+        Structured outputs
+
+        Structured outputs are supported by providing a JSON schema in the format parameter. The model will generate a response that matches the schema.
+
+        JSON mode
+
+        Enable JSON mode by setting the format parameter to json. This will structure the response as a valid JSON object.
+
+        Important
+
+        It's important to instruct the model to use JSON in the prompt. Otherwise, the model may generate large amounts of whitespace.
+        */
+
+        // Transform the image paths into base64-encoded file contents
+        foreach ($images as &$image) {
+            if (file_exists($image)) {
+                $image = base64_encode(file_get_contents($image));
+            }
+        }
+        unset($image);
+
+        $body = [
+            'model' => $model,
+            'prompt' => $prompt,
+            'images' => $images,
+            'think' => $shouldThink,
+            'stream' => false,
+        ];
+
+        if ($systemMessage !== null) {
+            $body['system'] = $systemMessage;
+        }
+        if ($outputFormat !== null) {
+            $body['format'] = json_decode($outputFormat);
+        }
+        if (!$keepAlive) {
+            $body['keep_alive'] = "0m";
+        }
+
+        $body = json_encode($body);
+
+        $response = $this->callAPI('/api/generate', $body);
+        $decodedResponse = json_decode($response, true);
+        if (json_last_error() !== JSON_ERROR_NONE) {
+            throw new \Exception("Error decoding JSON response: " . json_last_error_msg());
+        }
+        return $decodedResponse['response'] ?? '';
+    }
+}
diff --git a/app/Browser/Jobs/InstagramRepost/OCRLLMReelDescriptor.php b/app/Browser/Jobs/InstagramRepost/OCRLLMReelDescriptor.php
new file mode 100644
index 0000000..791c125
--- /dev/null
+++ b/app/Browser/Jobs/InstagramRepost/OCRLLMReelDescriptor.php
@@ -0,0 +1,12 @@
+<?php
+namespace App\Browser\Jobs\InstagramRepost;
+
+use App\FileTools\VideoDescriptor\OCRLLMVideoDescriptor;
+
+class OCRLLMReelDescriptor
+{
+    public function describe(string $filePath): ?string
+    {
+        return (new OCRLLMVideoDescriptor())->getDescription($filePath);
+    }
+}
diff --git a/app/FileTools/VideoDescriptor/IVideoDescriptor.php b/app/FileTools/VideoDescriptor/IVideoDescriptor.php
new file mode 100644
index 0000000..7d4e1e2
--- /dev/null
+++ b/app/FileTools/VideoDescriptor/IVideoDescriptor.php
@@ -0,0 +1,14 @@
+<?php
+
+namespace App\FileTools\VideoDescriptor;
+
+interface IVideoDescriptor
+{
+    /**
+     * Generate a natural-language description of a video file.
+     *
+     * @param string $filePath
+     * @return string|null
+     */
+    public function getDescription(string $filePath): ?string;
+}
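For context, here is a minimal usage sketch of the client above. It is not part of the diff: the prompt text and JSON schema are illustrative, and it assumes the host configured in config/llm.php (added at the end of this diff) speaks the `/api/generate` protocol documented in the class:

```php
// Hypothetical usage sketch, not part of this diff.
use App\AIPrompt\OpenAPIPrompt;

$llm = new OpenAPIPrompt();

// Ask for structured output by passing a JSON schema as the format.
$schema = json_encode([
    'type' => 'object',
    'properties' => ['answer' => ['type' => 'string']],
    'required' => ['answer'],
]);

$raw = $llm->generate(
    config('llm.models.chat.name'),
    'In one sentence, what does OCR stand for?',
    outputFormat: $schema,
    systemMessage: 'The user will ask something. Give your direct answer to that.',
    keepAlive: false, // unload the model after this call
    shouldThink: config('llm.models.chat.shouldThink')
);

// generate() returns the raw "response" string; decode it against the schema.
$answer = json_decode($raw, true)['answer'] ?? null;
```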
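The descriptor below instantiates a `TesseractImageOCR` helper whose file does not appear in this diff. As a point of reference only, a minimal sketch of such a wrapper over the newly added thiagoalessio/tesseract_ocr package could look like the following; the class name, namespace, and `performOCR` method are assumptions inferred from how the descriptor uses it:

```php
<?php
// Hypothetical sketch of the OCR wrapper used by the descriptor below;
// not part of this diff. Assumes the thiagoalessio/tesseract_ocr package
// added in composer.json and the tesseract-ocr binary installed in the
// Dockerfile above.
namespace App\FileTools\ImageOCR;

use thiagoalessio\TesseractOCR\TesseractOCR;

class TesseractImageOCR
{
    // Extract any text visible in the image; returns '' when nothing is found.
    public function performOCR(string $imagePath): string
    {
        return trim((new TesseractOCR($imagePath))->run());
    }
}
```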
diff --git a/app/FileTools/VideoDescriptor/OCRLLMVideoDescriptor.php b/app/FileTools/VideoDescriptor/OCRLLMVideoDescriptor.php
new file mode 100644
--- /dev/null
+++ b/app/FileTools/VideoDescriptor/OCRLLMVideoDescriptor.php
@@ -0,0 +1,120 @@
+<?php
+
+namespace App\FileTools\VideoDescriptor;
+
+use App\AIPrompt\OpenAPIPrompt;
+use App\FileTools\ImageOCR\TesseractImageOCR;
+use Illuminate\Support\Facades\Log;
+
+class OCRLLMVideoDescriptor implements IVideoDescriptor
+{
+    private const DESCRIPTION_PROMPT = "Below are per-second notes on a video: the text found on screen (OCR) and a visual description of each screenshot. Write a single description of what the video shows.\n\n";
+
+    private TesseractImageOCR $ocr;
+    private OpenAPIPrompt $llm;
+
+    public function __construct()
+    {
+        $this->ocr = new TesseractImageOCR();
+        $this->llm = new OpenAPIPrompt();
+    }
+
+    public function getDescription(string $filePath): ?string
+    {
+        /*
+        1. Cut the video into screenshots
+        2. Use OCR to extract text from each screenshot
+        3. Use an LLM to generate a description of each screenshot
+        4. Combine the descriptions of all screenshots into a single description
+        5. Ask an LLM to describe the video
+        */
+
+        // Step 1: Cut video into screenshots
+        $screenshots = $this->cutVideoIntoScreenshots($filePath);
+
+        if (empty($screenshots)) {
+            throw new \Exception("No screenshots were generated from the video {$filePath}.");
+        }
+
+        // Steps 2 & 3: Use OCR to extract text and the LLM to describe each screenshot
+        $descriptions = [];
+        foreach ($screenshots as $screenshot) {
+            $descriptions[$screenshot] = [];
+
+            $ocrDescription = $this->ocr->performOCR($screenshot);
+            $ocrDescription = empty($ocrDescription) ? 'No text found' : $ocrDescription;
+            $descriptions[$screenshot]['ocr'] = $ocrDescription;
+
+            $llmDescription = $this->llm->generate(
+                config('llm.models.vision.name'),
+                "Describe the content of this screenshot from a video. Do not specify that it is a screenshot, just describe the content.",
+                images: [$screenshot],
+                outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
+                systemMessage: "The user will ask something. Give your direct answer to that.",
+                keepAlive: $screenshot !== end($screenshots), // Keep the model loaded for all but the last screenshot
+                shouldThink: config('llm.models.vision.shouldThink')
+            );
+            $descriptions[$screenshot]['text'] = json_decode($llmDescription, true)['answer'] ?? 'No description generated';
+        }
+
+        // HERE COULD BE SOME INTERMEDIATE PROCESSING OF DESCRIPTIONS
+
+        // Step 4: Combine the descriptions of all screenshots into a single description
+        $combinedDescription = '';
+        $screenshotCount = 0;
+        foreach ($descriptions as $screenshot => $description) {
+            $screenshotCount++;
+            $combinedDescription .= "Screenshot: {$screenshotCount}\n";
+            $combinedDescription .= "Timestamp: {$screenshotCount}s\n"; // TODO Cut the video in smaller parts when the video is short
+            $combinedDescription .= "OCR: {$description['ocr']}\n";
+            $combinedDescription .= "LLM Description: {$description['text']}\n\n";
+        }
+        $combinedDescription = trim($combinedDescription);
+
+        // Step 5: Ask an LLM to describe the video based on the combined descriptions
+        $llmDescription = $this->llm->generate(
+            config('llm.models.chat.name'),
+            self::DESCRIPTION_PROMPT . $combinedDescription,
+            outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
+            systemMessage: "The user will ask something. Give your direct answer to that.",
+            keepAlive: true,
+            shouldThink: config('llm.models.chat.shouldThink')
+        );
+
+        $llmDescription = json_decode($llmDescription, true)['answer'] ?? null;
+        if (empty($llmDescription)) {
+            $llmDescription = null;
+        }
+
+        return $llmDescription;
+    }
+
+    /**
+     * Cut the video into screenshots.
+     * Uses ffmpeg to extract one frame per second of video.
+     * The screenshots are saved in a temporary directory.
+     * @param string $filePath
+     * @return array
+     */
+    private function cutVideoIntoScreenshots(string $filePath): array
+    {
+        // Use a unique directory per run so screenshots left over from a
+        // previous video are never picked up by the glob below.
+        $tempDir = sys_get_temp_dir() . '/video_screenshots_' . uniqid();
+        if (!is_dir($tempDir)) {
+            mkdir($tempDir, 0777, true);
+        }
+
+        Log::info("Cutting video into screenshots: $filePath");
+
+        // Zero-pad the frame number so the alphabetical order of glob()
+        // matches the chronological order of the frames.
+        $outputPattern = $tempDir . '/screenshot_%04d.png';
+        $command = "ffmpeg -i " . escapeshellarg($filePath) . " -vf fps=1 " . escapeshellarg($outputPattern);
+        exec($command);
+
+        // Collect all screenshots
+        $screenshots = glob($tempDir . '/screenshot_*.png');
+        return $screenshots;
+    }
+}
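Taken together, the flow per video is: ffmpeg cuts one frame per second, each frame goes through Tesseract and the vision model, and the chat model condenses the per-frame notes into one description. A hedged end-to-end sketch (the reel path is illustrative, not from this diff):

```php
// Hypothetical usage of the pipeline above; the file path is illustrative.
use App\FileTools\VideoDescriptor\OCRLLMVideoDescriptor;
use Illuminate\Support\Facades\Log;

$descriptor = new OCRLLMVideoDescriptor();

// One /api/generate call per screenshot (vision model), then one final
// call to the chat model over the combined OCR + vision notes.
$description = $descriptor->getDescription(storage_path('app/reels/reel.mp4'));

if ($description === null) {
    // The chat model returned no usable "answer" field.
    Log::warning('Could not describe reel');
}
```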
diff --git a/composer.json b/composer.json
index c18a427..9b9a941 100644
--- a/composer.json
+++ b/composer.json
@@ -19,6 +19,7 @@
         "laravel/telescope": "^5.5",
         "laravel/tinker": "^2.9",
         "norkunas/youtube-dl-php": "dev-master",
+        "thiagoalessio/tesseract_ocr": "^2.13",
         "tightenco/ziggy": "^2.0"
     },
     "require-dev": {
diff --git a/composer.lock b/composer.lock
index 8b27c8c..984563d 100644
--- a/composer.lock
+++ b/composer.lock
@@ -4,7 +4,7 @@
         "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
         "This file is @generated automatically"
     ],
-    "content-hash": "9a964008040d9ce219547515fe65dd86",
+    "content-hash": "20c0488746a861aecc1187374ca0aa7f",
     "packages": [
         {
             "name": "brick/math",
@@ -7038,6 +7038,55 @@
             ],
             "time": "2025-01-17T11:39:41+00:00"
         },
+        {
+            "name": "thiagoalessio/tesseract_ocr",
+            "version": "2.13.0",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git",
+                "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
+                "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
+                "shasum": ""
+            },
+            "require": {
+                "php": "^5.3 || ^7.0 || ^8.0"
+            },
+            "require-dev": {
+                "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
+            },
+            "type": "library",
+            "autoload": {
+                "psr-4": {
+                    "thiagoalessio\\TesseractOCR\\": "src/"
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "MIT"
+            ],
+            "authors": [
+                {
+                    "name": "thiagoalessio",
+                    "email": "thiagoalessio@me.com"
+                }
+            ],
+            "description": "A wrapper to work with Tesseract OCR inside PHP.",
+            "keywords": [
+                "OCR",
+                "Tesseract",
+                "text recognition"
+            ],
+            "support": {
+                "irc": "irc://irc.freenode.net/tesseract-ocr-for-php",
+                "issues": "https://github.com/thiagoalessio/tesseract-ocr-for-php/issues",
+                "source": "https://github.com/thiagoalessio/tesseract-ocr-for-php"
+            },
+            "time": "2023-10-05T21:14:48+00:00"
+        },
         {
             "name": "tightenco/ziggy",
             "version": "v2.5.2",
@@ -9726,7 +9775,7 @@
     "prefer-stable": true,
     "prefer-lowest": false,
     "platform": {
-        "php": "^8.3"
+        "php": "8.3.*"
     },
     "platform-dev": [],
     "plugin-api-version": "2.6.0"
diff --git a/config/llm.php b/config/llm.php
new file mode 100644
index 0000000..e4193ce
--- /dev/null
+++ b/config/llm.php
@@ -0,0 +1,32 @@
+<?php
+
+return [
+    /**
+     * The host URL of the LLM API.
+     * Used by OpenAPIPrompt when no host is passed explicitly.
+     */
+    'host' => env('LLM_HOST_URL', null),
+
+    /**
+     * Models configuration.
+     */
+    'models' => [
+        /**
+         * Great for chatting, can have reasoning capabilities.
+         * This model is typically used for conversational or thinking AI tasks.
+         */
+        'chat' => [
+            'name' => env('LLM_CHAT_MODEL', null),
+            'shouldThink' => env('LLM_CHAT_MODEL_THINK', false),
+        ],
+
+        /**
+         * Great for analyzing images, can have reasoning capabilities.
+         * This model is typically used for tasks that require understanding and interpreting images.
+         */
+        'vision' => [
+            'name' => env('LLM_VISION_MODEL', null),
+            'shouldThink' => env('LLM_VISION_MODEL_THINK', false),
+        ],
+    ]
+];
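For completeness, here is a sketch of the new environment block covering every key that config/llm.php reads, including the two think flags that the `.env.example` hunk above does not set. The values are illustrative assumptions: the `/api/generate` parameters documented in OpenAPIPrompt (format, system, keep_alive, think) match Ollama's generate API, so a local Ollama host and model names are shown here, not taken from the diff:

```dotenv
# AI LLM — illustrative values, not part of this diff. Point LLM_HOST_URL at a
# server that implements the /api/generate endpoint used by OpenAPIPrompt.
LLM_HOST_URL="http://localhost:11434"
LLM_CHAT_MODEL="llama3"
LLM_VISION_MODEL="llava"
LLM_CHAT_MODEL_THINK=false
LLM_VISION_MODEL_THINK=false
```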