Added OCR and OpenAPI tools
This commit is contained in:
@ -81,3 +81,8 @@ VITE_REVERB_APP_KEY="${REVERB_APP_KEY}"
|
||||
VITE_REVERB_HOST="${REVERB_HOST}"
|
||||
VITE_REVERB_PORT="${REVERB_PORT}"
|
||||
VITE_REVERB_SCHEME="${REVERB_SCHEME}"
|
||||
|
||||
# AI LLM
|
||||
LLM_HOST_URL="https://openai.com/api"
|
||||
LLM_CHAT_MODEL="gpt-4o"
|
||||
LLM_VISION_MODEL="gpt-4o-vision-preview"
|
||||
|
@ -54,6 +54,8 @@ RUN apk update && apk add --no-cache \
|
||||
openssl \
|
||||
linux-headers \
|
||||
supervisor \
|
||||
tesseract-ocr \
|
||||
ffmpeg \
|
||||
&& rm -rf /tmp/* /var/cache/apk/*
|
||||
|
||||
RUN docker-php-ext-configure zip && docker-php-ext-install zip
|
||||
|
10
app/AIPrompt/IAIPrompt.php
Normal file
10
app/AIPrompt/IAIPrompt.php
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
namespace App\AIPrompt;
|
||||
|
||||
interface IAIPrompt
|
||||
{
|
||||
public function generate(string $model, string $prompt, array $images = [], string $outputFormat = "json", string $systemMessage = null, bool $keepAlive = true, bool $shouldThink = false): string;
|
||||
|
||||
//public function chat(string $model, string $prompt, array $images = []): string;
|
||||
}
|
129
app/AIPrompt/OpenAPIPrompt.php
Normal file
129
app/AIPrompt/OpenAPIPrompt.php
Normal file
@ -0,0 +1,129 @@
|
||||
<?php
|
||||
|
||||
namespace App\AIPrompt;
|
||||
|
||||
/**
|
||||
* Use OpenAI API to get answers from a model.
|
||||
*/
|
||||
class OpenAPIPrompt implements IAIPrompt
|
||||
{
|
||||
private string $host;
|
||||
|
||||
public function __construct(string $host = null) {
|
||||
$this->host = $host ?? config('llm.host');
|
||||
}
|
||||
|
||||
private function getHeaders(): array
|
||||
{
|
||||
return [
|
||||
'Content-Type' => 'application/json'
|
||||
];
|
||||
}
|
||||
|
||||
/**
|
||||
* Call the OpenAI API with the given endpoint and body.
|
||||
* @param string $endpoint
|
||||
* @param string $body
|
||||
* @throws \Exception
|
||||
* @return string
|
||||
*/
|
||||
private function callAPI(string $endpoint, string $body): string
|
||||
{
|
||||
$url = $this->host . $endpoint;
|
||||
|
||||
$ch = curl_init($url);
|
||||
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
|
||||
curl_setopt($ch, CURLOPT_HTTPHEADER, $this->getHeaders());
|
||||
curl_setopt($ch, CURLOPT_POST, true);
|
||||
curl_setopt($ch, CURLOPT_POSTFIELDS, $body);
|
||||
$response = curl_exec($ch);
|
||||
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
|
||||
curl_close($ch);
|
||||
if ($httpCode !== 200) {
|
||||
throw new \Exception("Error calling OpenAI API: HTTP $httpCode - $response");
|
||||
}
|
||||
return $response;
|
||||
}
|
||||
|
||||
/**
|
||||
* Call the OpenAI API generate endpoint. to generate a response to a prompt.
|
||||
* @param string $model
|
||||
* @param string $prompt
|
||||
* @param array $images
|
||||
* @return void
|
||||
*/
|
||||
public function generate(string $model, string $prompt, array $images = [], string $outputFormat = null, string $systemMessage = null, bool $keepAlive = true, bool $shouldThink = false): string
|
||||
{
|
||||
/*
|
||||
Generate a completion
|
||||
|
||||
POST /api/generate
|
||||
|
||||
Generate a response for a given prompt with a provided model. This is a streaming endpoint, so there will be a series of responses. The final response object will include statistics and additional data from the request.
|
||||
Parameters
|
||||
|
||||
model: (required) the model name
|
||||
prompt: the prompt to generate a response for
|
||||
suffix: the text after the model response
|
||||
images: (optional) a list of base64-encoded images (for multimodal models such as llava)
|
||||
think: (for thinking models) should the model think before responding?
|
||||
|
||||
Advanced parameters (optional):
|
||||
|
||||
format: the format to return a response in. Format can be json or a JSON schema
|
||||
options: additional model parameters listed in the documentation for the Modelfile such as temperature
|
||||
system: system message to (overrides what is defined in the Modelfile)
|
||||
template: the prompt template to use (overrides what is defined in the Modelfile)
|
||||
stream: if false the response will be returned as a single response object, rather than a stream of objects
|
||||
raw: if true no formatting will be applied to the prompt. You may choose to use the raw parameter if you are specifying a full templated prompt in your request to the API
|
||||
keep_alive: controls how long the model will stay loaded into memory following the request (default: 5m)
|
||||
context (deprecated): the context parameter returned from a previous request to /generate, this can be used to keep a short conversational memory
|
||||
|
||||
Structured outputs
|
||||
|
||||
Structured outputs are supported by providing a JSON schema in the format parameter. The model will generate a response that matches the schema. See the structured outputs example below.
|
||||
JSON mode
|
||||
|
||||
Enable JSON mode by setting the format parameter to json. This will structure the response as a valid JSON object. See the JSON mode example below.
|
||||
|
||||
Important
|
||||
|
||||
It's important to instruct the model to use JSON in the prompt. Otherwise, the model may generate large amounts whitespace.
|
||||
*/
|
||||
|
||||
// Transform the images to base64
|
||||
foreach ($images as &$image) {
|
||||
if (file_exists($image)) {
|
||||
$image = base64_encode(file_get_contents($image));
|
||||
}
|
||||
}
|
||||
|
||||
$body = [
|
||||
'model' => $model,
|
||||
'prompt' => $prompt,
|
||||
'images' => $images,
|
||||
'think' => $shouldThink,
|
||||
'stream' => false,
|
||||
];
|
||||
|
||||
if ($systemMessage !== null) {
|
||||
$body['system'] = $systemMessage;
|
||||
}
|
||||
if ($outputFormat !== null) {
|
||||
$body['format'] = json_decode($outputFormat);
|
||||
}
|
||||
if (!$keepAlive) {
|
||||
$body['keep_alive'] = "0m";
|
||||
}
|
||||
|
||||
$body = json_encode($body);
|
||||
|
||||
dump($body);
|
||||
$response = $this->callAPI('/api/generate', $body);
|
||||
$decodedResponse = json_decode($response, true);
|
||||
if (json_last_error() !== JSON_ERROR_NONE) {
|
||||
throw new \Exception("Error decoding JSON response: " . json_last_error_msg());
|
||||
}
|
||||
return $decodedResponse['response'] ?? '';
|
||||
}
|
||||
}
|
12
app/Browser/Jobs/InstagramRepost/OCRLLMReelDescriptor.php
Normal file
12
app/Browser/Jobs/InstagramRepost/OCRLLMReelDescriptor.php
Normal file
@ -0,0 +1,12 @@
|
||||
<?php
|
||||
|
||||
namespace App\Browser\Jobs\InstagramRepost;
|
||||
|
||||
class OCRLLMReelDescriptor extends \App\FileTools\VideoDescriptor\OCRLLMVideoDescriptor
|
||||
{
|
||||
public const DESCRIPTION_PROMPT = "Describe the Instagram reel based on the screenshots. Each screenshot has a timestamp of when in the video the screenshot was taken, an OCR result and a description of the screenshot by an LLM. Do not specify that it is a reel, just try to describe the video and most importantly the joke behind it if there is one. The description must have a maximum of 500 words.\n";
|
||||
|
||||
public function __construct() {
|
||||
parent::__construct();
|
||||
}
|
||||
}
|
14
app/FileTools/OCR/IImageOCR.php
Normal file
14
app/FileTools/OCR/IImageOCR.php
Normal file
@ -0,0 +1,14 @@
|
||||
<?php
|
||||
|
||||
namespace App\FileTools\OCR;
|
||||
|
||||
interface IImageOCR
|
||||
{
|
||||
/**
|
||||
* Perform OCR on the given file.
|
||||
*
|
||||
* @param string $filePath The path to the file to be processed.
|
||||
* @return string The extracted text from the file.
|
||||
*/
|
||||
public function performOCR(string $filePath): string;
|
||||
}
|
15
app/FileTools/OCR/TesseractImageOCR.php
Normal file
15
app/FileTools/OCR/TesseractImageOCR.php
Normal file
@ -0,0 +1,15 @@
|
||||
<?php
|
||||
|
||||
namespace App\FileTools\OCR;
|
||||
use thiagoalessio\TesseractOCR\TesseractOCR;
|
||||
|
||||
class TesseractImageOCR implements IImageOCR
|
||||
{
|
||||
/**
|
||||
* @inheritDoc
|
||||
*/
|
||||
public function performOCR(string $filePath): string {
|
||||
$tesseract = new TesseractOCR($filePath);
|
||||
return $tesseract->run();
|
||||
}
|
||||
}
|
14
app/FileTools/VideoDescriptor/IVideoDescriptor.php
Normal file
14
app/FileTools/VideoDescriptor/IVideoDescriptor.php
Normal file
@ -0,0 +1,14 @@
|
||||
<?php
|
||||
|
||||
namespace App\FileTools\VideoDescriptor;
|
||||
|
||||
interface IVideoDescriptor
|
||||
{
|
||||
/**
|
||||
* Get the video description.
|
||||
*
|
||||
* @param string $filePath The path to the video file.
|
||||
* @return string The description of the video.
|
||||
*/
|
||||
public function getDescription(string $filePath): ?string;
|
||||
}
|
117
app/FileTools/VideoDescriptor/OCRLLMVideoDescriptor.php
Normal file
117
app/FileTools/VideoDescriptor/OCRLLMVideoDescriptor.php
Normal file
@ -0,0 +1,117 @@
|
||||
<?php
|
||||
|
||||
namespace App\FileTools\VideoDescriptor;
|
||||
|
||||
use App\AIPrompt\IAIPrompt;
|
||||
use App\AIPrompt\OpenAPIPrompt;
|
||||
use App\FileTools\OCR\IImageOCR;
|
||||
use App\FileTools\OCR\TesseractImageOCR;
|
||||
use Log;
|
||||
|
||||
class OCRLLMVideoDescriptor implements IVideoDescriptor
|
||||
{
|
||||
private IImageOCR $ocr;
|
||||
private IAIPrompt $llm; // LLM That can visualize images and generate descriptions
|
||||
|
||||
public const DESCRIPTION_PROMPT = "Describe the video based on the screenshots. Each screenshot has a timestamp of when in the video the screenshot was taken, an OCR result and a description of the screenshot by an LLM. Do not specify that it is a video, just describe the video. The description must have a maximum of 500 words.\n";
|
||||
|
||||
public function __construct() {
|
||||
$this->ocr = new TesseractImageOCR();
|
||||
$this->llm = new OpenAPIPrompt();
|
||||
}
|
||||
|
||||
public function getDescription(string $filePath): ?string
|
||||
{
|
||||
/*
|
||||
1. Cut videos in screenshots
|
||||
2. Use OCR to extract text from screenshots
|
||||
3. Use LLM to generate a description of the screenshot
|
||||
4. Combine the descriptions of all screenshots into a single description
|
||||
5. Ask an LLM to describe the video
|
||||
*/
|
||||
|
||||
// Step 1: Cut video into screenshots
|
||||
$screenshots = $this->cutVideoIntoScreenshots($filePath);
|
||||
|
||||
if (empty($screenshots)) {
|
||||
throw new \Exception("No screenshots were generated from the video {$filePath}.");
|
||||
}
|
||||
|
||||
// Step 2 & 3: Use OCR to extract text and LLM to get description from screenshots
|
||||
$descriptions = [];
|
||||
foreach ($screenshots as $screenshot) {
|
||||
$descriptions[$screenshot] = [];
|
||||
|
||||
$ocrDescription = $this->ocr->performOCR($screenshot);
|
||||
$ocrDescription = empty($ocrDescription) ? 'No text found' : $ocrDescription;
|
||||
$descriptions[$screenshot]['ocr'] = $ocrDescription;
|
||||
|
||||
$llmDescription = $this->llm->generate(
|
||||
config('llm.models.vision.name'),
|
||||
"Describe the content of this screenshot from a video. Do not specify that it is a screenshot, just describe the content.",
|
||||
images: [$screenshot],
|
||||
outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
|
||||
systemMessage: "The user will ask something. Give your direct answer to that.",
|
||||
keepAlive: $screenshot != end($screenshots), // Keep alive for all but the last screenshot
|
||||
shouldThink: config('llm.models.vision.shouldThink')
|
||||
);
|
||||
$descriptions[$screenshot]['text'] = json_decode($llmDescription, true)['answer'] ?? 'No description generated';
|
||||
}
|
||||
|
||||
// HERE COULD BE SOME INTERMEDIATE PROCESSING OF DESCRIPTIONS
|
||||
|
||||
// Step 4: Combine the descriptions of all screenshots into a single description
|
||||
$combinedDescription = '';
|
||||
$screenshotCount = 0;
|
||||
foreach ($descriptions as $screenshot => $description) {
|
||||
$screenshotCount++;
|
||||
$combinedDescription .= "Screenshot: {$screenshotCount}\n";
|
||||
$combinedDescription .= "Timestamp: {$screenshotCount}s\n"; // TODO Cut the video in smaller parts when the video is short
|
||||
$combinedDescription .= "OCR: {$description['ocr']}\n";
|
||||
$combinedDescription .= "LLM Description: {$description['text']}\n\n";
|
||||
}
|
||||
$combinedDescription = trim($combinedDescription);
|
||||
|
||||
// Step 5: Ask an LLM to describe the video based on the combined descriptions
|
||||
$llmDescription = $this->llm->generate(
|
||||
config('llm.models.chat.name'),
|
||||
self::DESCRIPTION_PROMPT . $combinedDescription,
|
||||
outputFormat: '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}',
|
||||
systemMessage: "The user will ask something. Give your direct answer to that.",
|
||||
keepAlive: true,
|
||||
shouldThink: config('llm.models.chat.shouldThink')
|
||||
);
|
||||
|
||||
$llmDescription = json_decode($llmDescription, true)['answer'] ?? null;
|
||||
if (empty($llmDescription)) {
|
||||
$llmDescription = null;
|
||||
}
|
||||
|
||||
return $llmDescription;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cut the video into screenshots.
|
||||
* Using ffmpeg to cut the video into screenshots at regular intervals.
|
||||
* The screenshots will be saved in a temporary directory.
|
||||
* @param string $filePath
|
||||
* @return void
|
||||
*/
|
||||
private function cutVideoIntoScreenshots(string $filePath): array
|
||||
{
|
||||
$tempDir = sys_get_temp_dir() . '/video_screenshots';
|
||||
if (!is_dir($tempDir)) {
|
||||
mkdir($tempDir, 0777, true);
|
||||
}
|
||||
|
||||
Log::info("Cutting video into screenshots: $filePath");
|
||||
|
||||
$outputPattern = $tempDir . '/screenshot_%d.png';
|
||||
$command = "ffmpeg -i " . escapeshellarg($filePath) . " -vf fps=1 " . escapeshellarg($outputPattern);
|
||||
exec($command);
|
||||
|
||||
// Collect all screenshots
|
||||
$screenshots = glob($tempDir . '/screenshot_*.png');
|
||||
return $screenshots;
|
||||
}
|
||||
}
|
@ -19,6 +19,7 @@
|
||||
"laravel/telescope": "^5.5",
|
||||
"laravel/tinker": "^2.9",
|
||||
"norkunas/youtube-dl-php": "dev-master",
|
||||
"thiagoalessio/tesseract_ocr": "^2.13",
|
||||
"tightenco/ziggy": "^2.0"
|
||||
},
|
||||
"require-dev": {
|
||||
|
53
composer.lock
generated
53
composer.lock
generated
@ -4,7 +4,7 @@
|
||||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
||||
"This file is @generated automatically"
|
||||
],
|
||||
"content-hash": "9a964008040d9ce219547515fe65dd86",
|
||||
"content-hash": "20c0488746a861aecc1187374ca0aa7f",
|
||||
"packages": [
|
||||
{
|
||||
"name": "brick/math",
|
||||
@ -7038,6 +7038,55 @@
|
||||
],
|
||||
"time": "2025-01-17T11:39:41+00:00"
|
||||
},
|
||||
{
|
||||
"name": "thiagoalessio/tesseract_ocr",
|
||||
"version": "2.13.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git",
|
||||
"reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
|
||||
"reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"php": "^5.3 || ^7.0 || ^8.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
|
||||
},
|
||||
"type": "library",
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"thiagoalessio\\TesseractOCR\\": "src/"
|
||||
}
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"MIT"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "thiagoalessio",
|
||||
"email": "thiagoalessio@me.com"
|
||||
}
|
||||
],
|
||||
"description": "A wrapper to work with Tesseract OCR inside PHP.",
|
||||
"keywords": [
|
||||
"OCR",
|
||||
"Tesseract",
|
||||
"text recognition"
|
||||
],
|
||||
"support": {
|
||||
"irc": "irc://irc.freenode.net/tesseract-ocr-for-php",
|
||||
"issues": "https://github.com/thiagoalessio/tesseract-ocr-for-php/issues",
|
||||
"source": "https://github.com/thiagoalessio/tesseract-ocr-for-php"
|
||||
},
|
||||
"time": "2023-10-05T21:14:48+00:00"
|
||||
},
|
||||
{
|
||||
"name": "tightenco/ziggy",
|
||||
"version": "v2.5.2",
|
||||
@ -9726,7 +9775,7 @@
|
||||
"prefer-stable": true,
|
||||
"prefer-lowest": false,
|
||||
"platform": {
|
||||
"php": "^8.3"
|
||||
"php": "8.3.*"
|
||||
},
|
||||
"platform-dev": [],
|
||||
"plugin-api-version": "2.6.0"
|
||||
|
32
config/llm.php
Normal file
32
config/llm.php
Normal file
@ -0,0 +1,32 @@
|
||||
<?php
|
||||
|
||||
return [
|
||||
/**
|
||||
* Host for the OpenAI API.
|
||||
* This should be the base URL of the OpenAI API you are using.
|
||||
*/
|
||||
'host' => env('LLM_HOST_URL', null),
|
||||
|
||||
/**
|
||||
* Models configuration.
|
||||
*/
|
||||
'models' => [
|
||||
/**
|
||||
* Great for chatting, can have reasoning capabilities.
|
||||
* This model is typically used for conversational or thinking AI tasks.
|
||||
*/
|
||||
'chat' => [
|
||||
'name' => env('LLM_CHAT_MODEL', null),
|
||||
'shouldThink' => env('LLM_CHAT_MODEL_THINK', false),
|
||||
],
|
||||
|
||||
/**
|
||||
* Great for analyzing images, can have reasoning capabilities.
|
||||
* This model is typically used for tasks that require understanding and interpreting images.
|
||||
*/
|
||||
'vision' => [
|
||||
'name' => env('LLM_VISION_MODEL', null),
|
||||
'shouldThink' => env('LLM_VISION_MODEL_THINK', false),
|
||||
],
|
||||
]
|
||||
];
|
Reference in New Issue
Block a user