Added OCR and OpenAPI tools

This commit is contained in:
2025-06-09 16:27:14 +02:00
parent 20fca31ced
commit 67197c5c48
12 changed files with 402 additions and 2 deletions

View File

@ -0,0 +1,10 @@
<?php
namespace App\AIPrompt;
/**
 * Contract for a client that sends a single prompt to a language model and
 * returns the model's textual answer.
 */
interface IAIPrompt
{
    /**
     * Generate one completion for the given prompt.
     *
     * @param string      $model         Name of the model to query.
     * @param string      $prompt        Prompt text sent to the model.
     * @param array       $images        Optional images accompanying the prompt (for multimodal models).
     * @param string      $outputFormat  Requested response format; defaults to "json".
     * @param string|null $systemMessage Optional system message; null means none is sent.
     * @param bool        $keepAlive     Whether the model should stay loaded after the request.
     * @param bool        $shouldThink   Whether a thinking-capable model should reason before answering.
     * @return string The text produced by the model.
     */
    public function generate(
        string $model,
        string $prompt,
        array $images = [],
        string $outputFormat = "json",
        ?string $systemMessage = null,
        bool $keepAlive = true,
        bool $shouldThink = false
    ): string;

    // TODO: chat(string $model, string $prompt, array $images = []): string is not part of the contract yet.
}

View File

@ -0,0 +1,129 @@
<?php
namespace App\AIPrompt;
/**
 * LLM prompt client that POSTs JSON to an Ollama-style `/api/generate`
 * endpoint and returns the generated text.
 *
 * The base URL is taken from the constructor argument or, when omitted, from
 * the `llm.host` configuration entry.
 */
class OpenAPIPrompt implements IAIPrompt
{
    private string $host;

    /**
     * @param string|null $host Base URL of the API; falls back to config('llm.host') when null.
     */
    public function __construct(?string $host = null)
    {
        $this->host = $host ?? config('llm.host');
    }

    /**
     * HTTP headers sent with every request.
     *
     * cURL expects a list of complete "Name: value" lines; array keys are
     * ignored, so an associative array would not produce a valid header.
     *
     * @return list<string>
     */
    private function getHeaders(): array
    {
        return [
            'Content-Type: application/json',
        ];
    }

    /**
     * POST a JSON body to the given endpoint and return the raw response body.
     *
     * @param string $endpoint Path appended to the configured host (e.g. "/api/generate").
     * @param string $body     Already-encoded JSON request body.
     * @throws \Exception When the transfer fails or the server does not answer with HTTP 200.
     * @return string
     */
    private function callAPI(string $endpoint, string $body): string
    {
        $ch = curl_init($this->host . $endpoint);
        if ($ch === false) {
            throw new \Exception("Error calling OpenAI API: unable to initialise cURL");
        }

        try {
            curl_setopt_array($ch, [
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_HTTPHEADER => $this->getHeaders(),
                CURLOPT_POST => true,
                CURLOPT_POSTFIELDS => $body,
            ]);

            $response = curl_exec($ch);
            if ($response === false) {
                // Transport-level failure (DNS, refused connection, ...): surface cURL's own message.
                throw new \Exception("Error calling OpenAI API: " . curl_error($ch));
            }
            $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        } finally {
            curl_close($ch);
        }

        if ($httpCode !== 200) {
            throw new \Exception("Error calling OpenAI API: HTTP $httpCode - $response");
        }

        return $response;
    }

    /**
     * Turn an image entry into the base64 payload expected by the API.
     *
     * Entries that point to a readable file are read and encoded; anything
     * else is passed through unchanged (assumed to be base64 already).
     *
     * @throws \Exception When an existing file cannot be read.
     */
    private function encodeImage(string $image): string
    {
        if (!is_file($image)) {
            return $image;
        }

        $contents = file_get_contents($image);
        if ($contents === false) {
            throw new \Exception("Unable to read image file: {$image}");
        }

        return base64_encode($contents);
    }

    /**
     * Resolve the `format` request field.
     *
     * A string containing valid JSON (a JSON schema) is sent as a decoded
     * structure; any other string, such as the plain "json" mode keyword, is
     * sent verbatim instead of being collapsed to null by a failed decode.
     *
     * @return mixed
     */
    private function resolveFormat(string $outputFormat)
    {
        $schema = json_decode($outputFormat);
        if (json_last_error() !== JSON_ERROR_NONE || $schema === null) {
            return $outputFormat;
        }

        return $schema;
    }

    /**
     * Call the `/api/generate` endpoint to get a single, non-streamed answer to a prompt.
     *
     * Request fields set by this method:
     *  - model, prompt: passed through as given
     *  - images: list of base64-encoded images (file paths are read and encoded)
     *  - think: whether a thinking model should reason before responding
     *  - stream: always false, so one response object is returned
     *  - system: only sent when $systemMessage is not null
     *  - format: only sent when $outputFormat is not null ("json" or a JSON schema)
     *  - keep_alive: set to "0m" when $keepAlive is false, otherwise left to the server default
     *
     * When requesting JSON output the prompt itself should also instruct the
     * model to answer in JSON, otherwise it may emit large amounts of whitespace.
     *
     * @param string      $model
     * @param string      $prompt
     * @param array       $images        File paths or base64-encoded images.
     * @param string|null $outputFormat  "json", a JSON schema as a JSON string, or null for free text.
     * @param string|null $systemMessage
     * @param bool        $keepAlive
     * @param bool        $shouldThink
     * @throws \Exception When the request fails or the response is not valid JSON.
     * @return string The `response` field of the API answer, or '' when it is absent.
     */
    public function generate(string $model, string $prompt, array $images = [], ?string $outputFormat = null, ?string $systemMessage = null, bool $keepAlive = true, bool $shouldThink = false): string
    {
        $body = [
            'model' => $model,
            'prompt' => $prompt,
            // array_values keeps the payload a JSON list even if the caller passed keyed entries.
            'images' => array_values(array_map(fn ($image) => $this->encodeImage($image), $images)),
            'think' => $shouldThink,
            'stream' => false,
        ];

        if ($systemMessage !== null) {
            $body['system'] = $systemMessage;
        }
        if ($outputFormat !== null) {
            $body['format'] = $this->resolveFormat($outputFormat);
        }
        if (!$keepAlive) {
            $body['keep_alive'] = "0m";
        }

        $payload = json_encode($body);
        if ($payload === false) {
            throw new \Exception("Error encoding JSON request: " . json_last_error_msg());
        }

        $response = $this->callAPI('/api/generate', $payload);

        $decodedResponse = json_decode($response, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            throw new \Exception("Error decoding JSON response: " . json_last_error_msg());
        }

        if (!is_array($decodedResponse)) {
            return '';
        }

        return $decodedResponse['response'] ?? '';
    }
}

View File

@ -0,0 +1,12 @@
<?php
namespace App\Browser\Jobs\InstagramRepost;
/**
 * Video descriptor variant carrying a prompt worded for Instagram reels.
 *
 * Construction is inherited unchanged from the parent descriptor. The prompt
 * override only takes effect where the parent resolves DESCRIPTION_PROMPT with
 * late static binding (static::).
 */
class OCRLLMReelDescriptor extends \App\FileTools\VideoDescriptor\OCRLLMVideoDescriptor
{
    /**
     * Prompt prefix for the final summarisation request, tailored to reels.
     */
    public const DESCRIPTION_PROMPT = "Describe the Instagram reel based on the screenshots. Each screenshot has a timestamp of when in the video the screenshot was taken, an OCR result and a description of the screenshot by an LLM. Do not specify that it is a reel, just try to describe the video and most importantly the joke behind it if there is one. The description must have a maximum of 500 words.\n";
}

View File

@ -0,0 +1,14 @@
<?php
namespace App\FileTools\OCR;
/**
 * Contract for components that extract text from an image file.
 */
interface IImageOCR
{
    /**
     * Run optical character recognition on a file.
     *
     * @param string $filePath Path of the file to process.
     * @return string Text extracted from the file.
     */
    public function performOCR(string $filePath): string;
}

View File

@ -0,0 +1,15 @@
<?php
namespace App\FileTools\OCR;
use thiagoalessio\TesseractOCR\TesseractOCR;
/**
 * IImageOCR implementation that delegates to the TesseractOCR wrapper.
 */
class TesseractImageOCR implements IImageOCR
{
    /**
     * Run Tesseract on the given file and return whatever text it reports.
     *
     * @inheritDoc
     */
    public function performOCR(string $filePath): string
    {
        return (new TesseractOCR($filePath))->run();
    }
}

View File

@ -0,0 +1,14 @@
<?php
namespace App\FileTools\VideoDescriptor;
/**
 * Contract for components that produce a textual description of a video file.
 */
interface IVideoDescriptor
{
    /**
     * Describe the video stored at the given path.
     *
     * @param string $filePath Path of the video file.
     * @return string|null The description, or null when none is available.
     */
    public function getDescription(string $filePath): ?string;
}

View File

@ -0,0 +1,117 @@
<?php
namespace App\FileTools\VideoDescriptor;
use App\AIPrompt\IAIPrompt;
use App\AIPrompt\OpenAPIPrompt;
use App\FileTools\OCR\IImageOCR;
use App\FileTools\OCR\TesseractImageOCR;
use Log;
/**
 * Describes a video by sampling it into screenshots, running OCR and a vision
 * LLM on every screenshot, and asking a chat LLM to summarise the combined
 * per-screenshot notes.
 *
 * Subclasses can change the wording of the final request by overriding
 * DESCRIPTION_PROMPT.
 */
class OCRLLMVideoDescriptor implements IVideoDescriptor
{
    private IImageOCR $ocr;
    private IAIPrompt $llm; // LLM that can look at images and generate descriptions

    public const DESCRIPTION_PROMPT = "Describe the video based on the screenshots. Each screenshot has a timestamp of when in the video the screenshot was taken, an OCR result and a description of the screenshot by an LLM. Do not specify that it is a video, just describe the video. The description must have a maximum of 500 words.\n";

    /** JSON schema forcing the model to answer as {"answer": "..."}. */
    private const ANSWER_SCHEMA = '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}';

    /** System message shared by the per-screenshot and the final request. */
    private const SYSTEM_MESSAGE = "The user will ask something. Give your direct answer to that.";

    /**
     * @param IImageOCR|null $ocr OCR engine; defaults to TesseractImageOCR.
     * @param IAIPrompt|null $llm LLM client; defaults to OpenAPIPrompt.
     */
    public function __construct(?IImageOCR $ocr = null, ?IAIPrompt $llm = null)
    {
        $this->ocr = $ocr ?? new TesseractImageOCR();
        $this->llm = $llm ?? new OpenAPIPrompt();
    }

    /**
     * Produce a description of the video.
     *
     * Steps:
     *  1. Cut the video into screenshots.
     *  2. Extract text from each screenshot with OCR.
     *  3. Ask the vision LLM to describe each screenshot.
     *  4. Combine all per-screenshot notes into one text.
     *  5. Ask the chat LLM to describe the video from that text.
     *
     * @param string $filePath Path of the video file.
     * @throws \Exception When no screenshot could be produced from the video.
     * @return string|null The description, or null when the model returned no usable answer.
     */
    public function getDescription(string $filePath): ?string
    {
        // Each run gets its own directory so frames of other (or concurrent) runs are never picked up.
        $workDir = $this->createWorkDir();

        try {
            $screenshots = $this->cutVideoIntoScreenshots($filePath, $workDir);
            if (empty($screenshots)) {
                throw new \Exception("No screenshots were generated from the video {$filePath}.");
            }

            $descriptions = $this->describeScreenshots($screenshots);

            // HERE COULD BE SOME INTERMEDIATE PROCESSING OF DESCRIPTIONS

            $combinedDescription = $this->combineDescriptions($descriptions);

            $llmAnswer = $this->llm->generate(
                config('llm.models.chat.name'),
                // static:: so that a subclass overriding DESCRIPTION_PROMPT is honoured.
                static::DESCRIPTION_PROMPT . $combinedDescription,
                outputFormat: self::ANSWER_SCHEMA,
                systemMessage: self::SYSTEM_MESSAGE,
                keepAlive: true,
                shouldThink: (bool) config('llm.models.chat.shouldThink'),
            );

            $summary = json_decode($llmAnswer, true)['answer'] ?? null;

            return is_string($summary) && $summary !== '' ? $summary : null;
        } finally {
            $this->removeWorkDir($workDir);
        }
    }

    /**
     * Run OCR and the vision LLM on every screenshot.
     *
     * @param list<string> $screenshots Screenshot paths in playback order.
     * @return list<array{ocr: string, text: string}> One entry per screenshot, same order.
     */
    private function describeScreenshots(array $screenshots): array
    {
        $lastIndex = array_key_last($screenshots);
        $descriptions = [];

        foreach ($screenshots as $index => $screenshot) {
            $ocrText = $this->ocr->performOCR($screenshot);

            $llmAnswer = $this->llm->generate(
                config('llm.models.vision.name'),
                "Describe the content of this screenshot from a video. Do not specify that it is a screenshot, just describe the content.",
                images: [$screenshot],
                outputFormat: self::ANSWER_SCHEMA,
                systemMessage: self::SYSTEM_MESSAGE,
                keepAlive: $index !== $lastIndex, // Keep alive for all but the last screenshot
                shouldThink: (bool) config('llm.models.vision.shouldThink'),
            );

            $descriptions[] = [
                // Compare against '' rather than empty() so a literal "0" still counts as text.
                'ocr' => trim($ocrText) === '' ? 'No text found' : $ocrText,
                'text' => json_decode($llmAnswer, true)['answer'] ?? 'No description generated',
            ];
        }

        return $descriptions;
    }

    /**
     * Render the per-screenshot notes as the text block appended to the prompt.
     *
     * @param list<array{ocr: string, text: string}> $descriptions
     */
    private function combineDescriptions(array $descriptions): string
    {
        $combinedDescription = '';
        $screenshotCount = 0;

        foreach ($descriptions as $description) {
            $screenshotCount++;
            $combinedDescription .= "Screenshot: {$screenshotCount}\n";
            $combinedDescription .= "Timestamp: {$screenshotCount}s\n"; // TODO Cut the video in smaller parts when the video is short
            $combinedDescription .= "OCR: {$description['ocr']}\n";
            $combinedDescription .= "LLM Description: {$description['text']}\n\n";
        }

        return trim($combinedDescription);
    }

    /**
     * Create a private, uniquely named temporary directory for one run.
     *
     * @throws \RuntimeException When the directory cannot be created.
     */
    private function createWorkDir(): string
    {
        $dir = sys_get_temp_dir() . '/video_screenshots_' . bin2hex(random_bytes(8));
        if (!mkdir($dir, 0700, true) && !is_dir($dir)) {
            throw new \RuntimeException("Unable to create temporary directory {$dir}.");
        }

        return $dir;
    }

    /**
     * Delete the files of a run's temporary directory and the directory itself.
     */
    private function removeWorkDir(string $dir): void
    {
        foreach (glob($dir . '/*') ?: [] as $file) {
            if (is_file($file)) {
                unlink($file);
            }
        }
        if (is_dir($dir)) {
            rmdir($dir);
        }
    }

    /**
     * Cut the video into screenshots.
     * Uses ffmpeg to extract one frame per second (fps=1) into $outputDir.
     *
     * @param string $filePath  Path of the video file.
     * @param string $outputDir Existing directory receiving the screenshot files.
     * @return list<string> Screenshot paths in frame order; empty when ffmpeg produced nothing.
     */
    private function cutVideoIntoScreenshots(string $filePath, string $outputDir): array
    {
        Log::info("Cutting video into screenshots: $filePath");

        $outputPattern = $outputDir . '/screenshot_%d.png';
        $command = "ffmpeg -i " . escapeshellarg($filePath) . " -vf fps=1 " . escapeshellarg($outputPattern) . " 2>&1";

        $output = [];
        $exitCode = 0;
        exec($command, $output, $exitCode);
        if ($exitCode !== 0) {
            // Not fatal by itself: the caller fails only when no frame was written at all.
            Log::warning("ffmpeg exited with code {$exitCode} for {$filePath}: " . implode("\n", array_slice($output, -5)));
        }

        // Collect all screenshots. glob() sorts alphabetically (screenshot_10 before screenshot_2),
        // so re-sort naturally to restore frame order.
        $screenshots = glob($outputDir . '/screenshot_*.png') ?: [];
        sort($screenshots, SORT_NATURAL);

        return $screenshots;
    }
}