
AI Inference

Run chat completions, text generation, embeddings, audio, and media generation through the Strongly AI gateway.

Overview

The AI Inference resource provides:

  • Chat completions with multi-turn conversation support
  • Text completions from a prompt
  • Streaming responses for real-time token delivery
  • Text embeddings for similarity search and RAG applications
  • Text-to-speech (TTS) audio generation
  • Speech-to-text (STT) transcription and translation
  • Image, video, and music generation
  • Unified access to all configured AI models

All inference methods are accessed through client.ai.inference.

Chat Completion

Generate a response from a conversation history:

from strongly import Strongly

client = Strongly()

response = client.ai.inference.chat_completion(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is gradient descent?"},
    ],
)

print(response.content)
print(f"Tokens used: {response.usage.total_tokens}")

Multi-Turn Conversations

Pass the full conversation history to maintain context:

from strongly import Strongly

client = Strongly()

messages = [
    {"role": "system", "content": "You are a Python expert."},
    {"role": "user", "content": "What are decorators?"},
]

response = client.ai.inference.chat_completion(
    model="gpt-4o-mini",
    messages=messages,
)
print(f"Assistant: {response.content}")

# Continue the conversation
messages.append({"role": "assistant", "content": response.content})
messages.append({"role": "user", "content": "Show me an example."})

response = client.ai.inference.chat_completion(
    model="gpt-4o-mini",
    messages=messages,
)
print(f"Assistant: {response.content}")

Generation Parameters

Control the output with temperature, token limits, and stop sequences:

from strongly import Strongly

client = Strongly()

response = client.ai.inference.chat_completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "List three benefits of unit testing."}],
    temperature=0.3,       # Lower = more deterministic
    max_tokens=256,        # Cap the response length
    top_p=0.9,             # Nucleus sampling
    stop=["\n\n", "---"],  # Stop generating at these sequences
)

print(response.content)

Streaming

Stream tokens as they are generated for a responsive user experience:

from strongly import Strongly

client = Strongly()

print("Response: ", end="", flush=True)
for chunk in client.ai.inference.chat_completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Write a haiku about machine learning."}],
    stream=True,
):
    print(chunk.content, end="", flush=True)
print()

Collecting a Streamed Response

from strongly import Strongly

client = Strongly()

chunks = []
for chunk in client.ai.inference.chat_completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Explain recursion briefly."}],
    stream=True,
):
    chunks.append(chunk.content)

full_response = "".join(chunks)
print(full_response)

Text Completion

Generate text from a single prompt (no conversation history):

from strongly import Strongly

client = Strongly()

response = client.ai.inference.completion(
    model="gpt-4o-mini",
    prompt="Explain the difference between a list and a tuple in Python:",
    max_tokens=200,
    temperature=0.5,
)

print(response.content)
print(f"Tokens: {response.usage.total_tokens}")

Streaming Text Completion

from strongly import Strongly

client = Strongly()

for chunk in client.ai.inference.completion(
    model="gpt-4o-mini",
    prompt="Write a short poem about data pipelines:",
    stream=True,
):
    print(chunk.content, end="", flush=True)
print()

Embeddings

Generate vector embeddings for similarity search, clustering, and RAG applications:

from strongly import Strongly

client = Strongly()

response = client.ai.inference.embedding(
    model="text-embedding-ada-002",
    input="Machine learning is a subset of artificial intelligence.",
)

print(f"Embedding dimension: {len(response.embeddings[0])}")

Batch Embeddings

Pass a list of strings to embed multiple texts in a single call:

from strongly import Strongly

client = Strongly()

texts = [
    "Machine learning uses statistical methods to learn from data.",
    "Deep learning is based on artificial neural networks.",
    "The weather forecast calls for rain tomorrow.",
]

response = client.ai.inference.embedding(
    model="text-embedding-ada-002",
    input=texts,
)

print(f"Generated {len(response.embeddings)} embeddings")
print(f"Dimension: {len(response.embeddings[0])}")

Similarity Search Example

import math
from strongly import Strongly

client = Strongly()

def cosine_similarity(a, b):
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)

texts = [
    "Machine learning is a subset of artificial intelligence.",
    "Deep learning uses neural networks with many layers.",
    "The weather today is sunny and warm.",
]

response = client.ai.inference.embedding(
    model="text-embedding-ada-002",
    input=texts,
)

sim_01 = cosine_similarity(response.embeddings[0], response.embeddings[1])
sim_02 = cosine_similarity(response.embeddings[0], response.embeddings[2])

print(f"ML vs Deep Learning: {sim_01:.4f}") # High similarity
print(f"ML vs Weather: {sim_02:.4f}") # Low similarity

Text-to-Speech

Generate spoken audio from text:

from strongly import Strongly

client = Strongly()

# Generate speech from text
response = client.ai.inference.speech(
    model="your-tts-model-id",
    input="Hello, welcome to Strongly AI!",
    voice="alloy",
    response_format="mp3",
    speed=1.0,
)

# Save the audio file
with open("output.mp3", "wb") as f:
    f.write(response.content)

print(f"Audio format: {response.content_type}")
print(f"Audio size: {len(response.content)} bytes")

Speech-to-Text (Transcription)

Transcribe audio files to text:

from strongly import Strongly

client = Strongly()

# Transcribe audio file
with open("recording.mp3", "rb") as audio_file:
    response = client.ai.inference.transcription(
        model="your-stt-model-id",
        file=audio_file,
        filename="recording.mp3",
        language="en",
    )

print(f"Transcript: {response.text}")
if response.segments:
    for seg in response.segments:
        print(f"  [{seg.start:.1f}s - {seg.end:.1f}s] {seg.text}")

Audio Translation

Translate audio from any supported language to English:

from strongly import Strongly

client = Strongly()

# Translate audio to English
with open("french_audio.mp3", "rb") as audio_file:
    response = client.ai.inference.translation(
        model="your-stt-model-id",
        file=audio_file,
    )

print(f"English translation: {response.text}")

Image Generation

Generate images from text prompts:

from strongly import Strongly

client = Strongly()

# Generate an image
response = client.ai.inference.image_generation(
    model="your-image-model-id",
    prompt="A serene mountain landscape at sunset",
    n=1,
    size="1024x1024",
    quality="standard",
)

for image in response.data:
    if image.url:
        print(f"Image URL: {image.url}")
    if image.revised_prompt:
        print(f"Revised prompt: {image.revised_prompt}")

Video Generation

Generate videos from text prompts. Video generation is asynchronous and returns a job for tracking:

from strongly import Strongly

client = Strongly()

# Submit a video generation job
job = client.ai.inference.video_generation(
    model="your-video-model-id",
    prompt="A drone flying over a tropical forest",
    duration=5,
    resolution="1080p",
)
print(f"Job submitted: {job.job_id}")

# Poll for completion
import time

while job.status not in ("completed", "failed", "cancelled"):
    time.sleep(10)
    job = client.ai.inference.generation_status(job_id=job.job_id)
    print(f"Status: {job.status}, Progress: {job.progress}%")

if job.status == "completed":
    print(f"Result: {job.result}")

Music Generation

Generate music from text prompts:

from strongly import Strongly

client = Strongly()

# Generate music
job = client.ai.inference.music_generation(
    model="your-music-model-id",
    prompt="Upbeat electronic music with a catchy melody",
    duration=30,
)
print(f"Job submitted: {job.job_id}")

Cancel a Generation Job

Cancel a running generation job:

from strongly import Strongly

client = Strongly()

# Cancel a running job by its ID (e.g., the job_id returned when the job was submitted)
cancelled = client.ai.inference.cancel_generation(job_id="your-job-id")
print(f"Cancelled: {cancelled.status}")

Content Moderation

Check text content against moderation categories:

from strongly import Strongly

client = Strongly()

# Moderate content
response = client.ai.inference.moderation(
    model="your-moderation-model-id",
    input="Some text to check for policy violations.",
)

for result in response.results:
    print(f"Flagged: {result.flagged}")
    for category, flagged in result.categories.items():
        if flagged:
            print(f"  {category}: score={result.category_scores[category]:.4f}")

Batch Moderation

Pass a list to moderate multiple texts in one call:

from strongly import Strongly

client = Strongly()

texts = [
    "This is a normal message.",
    "Some other text to check.",
]

response = client.ai.inference.moderation(
    model="your-moderation-model-id",
    input=texts,
)

for i, result in enumerate(response.results):
    print(f"Text {i}: flagged={result.flagged}")

Document Reranking

Rerank documents by relevance to a query:

from strongly import Strongly

client = Strongly()

# Rerank documents by relevance
response = client.ai.inference.rerank(
    model="your-rerank-model-id",
    query="What is machine learning?",
    documents=[
        "Machine learning is a subset of artificial intelligence.",
        "The weather today is sunny and warm.",
        "Deep learning uses neural networks to learn from data.",
    ],
    top_n=2,
)

for result in response.results:
    print(f"Index: {result.index}, Score: {result.relevance_score:.4f}")
    if result.document:
        print(f"  Text: {result.document.get('text', '')}")

Parameters

chat_completion

def chat_completion(
    self,
    *,
    model: str,
    messages: List[Union[Dict[str, Any], ChatMessage]],
    stream: bool = False,
    max_tokens: Optional[int] = None,
    temperature: float = 0.7,
    top_p: float = 1.0,
    stop: Optional[Union[str, List[str]]] = None,
    **kwargs: Any,
) -> Union[ChatCompletion, Iterator[StreamChunk]]
| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| model | str | required | Model identifier (e.g., "gpt-4o-mini") |
| messages | List[Union[Dict, ChatMessage]] | required | Conversation messages with role and content |
| stream | bool | False | Return an iterator of StreamChunk objects |
| max_tokens | int | None | Maximum tokens to generate |
| temperature | float | 0.7 | Sampling temperature (0.0 to 2.0) |
| top_p | float | 1.0 | Nucleus sampling threshold |
| stop | str \| List[str] | None | Stop sequence(s) |
| **kwargs | Any | — | Additional model-specific parameters |

completion

def completion(
    self,
    *,
    model: str,
    prompt: str,
    stream: bool = False,
    max_tokens: Optional[int] = None,
    temperature: float = 0.7,
    **kwargs: Any,
) -> Union[Completion, Iterator[StreamChunk]]
ParameterTypeDefaultDescription
modelstrrequiredModel identifier
promptstrrequiredText prompt
streamboolFalseReturn an iterator of StreamChunk objects
max_tokensintNoneMaximum tokens to generate
temperaturefloat0.7Sampling temperature
**kwargsAnyAdditional model-specific parameters

embedding

def embedding(
    self,
    *,
    model: str,
    input: Union[str, List[str]],
    **kwargs: Any,
) -> EmbeddingResponse
ParameterTypeDefaultDescription
modelstrrequiredEmbedding model identifier (e.g., "text-embedding-ada-002")
inputstr | List[str]requiredText or list of texts to embed
**kwargsAnyAdditional model-specific parameters

speech

def speech(
    self,
    *,
    model: str,
    input: str,
    voice: str = "alloy",
    response_format: str = "mp3",
    speed: float = 1.0,
    **kwargs: Any,
) -> SpeechResponse
ParameterTypeDefaultDescription
modelstrrequiredModel ID for TTS
inputstrrequiredText to convert to speech
voicestr"alloy"Voice to use
response_formatstr"mp3"Audio format (mp3, wav, opus, flac)
speedfloat1.0Speech speed (0.25 to 4.0)
**kwargsAnyAdditional model-specific parameters

transcription

def transcription(
    self,
    *,
    model: str,
    file: Union[bytes, BinaryIO],
    filename: str = "audio.mp3",
    language: Optional[str] = None,
    prompt: Optional[str] = None,
    response_format: str = "json",
    temperature: float = 0.0,
    **kwargs: Any,
) -> TranscriptionResponse
ParameterTypeDefaultDescription
modelstrrequiredModel ID for STT
filebytes | BinaryIOrequiredAudio file to transcribe
filenamestr"audio.mp3"Name of the audio file
languagestrNoneLanguage code (e.g., "en", "es")
promptstrNoneOptional context prompt
response_formatstr"json"Response format
temperaturefloat0.0Sampling temperature
**kwargsAnyAdditional model-specific parameters

translation

def translation(
    self,
    *,
    model: str,
    file: Union[bytes, BinaryIO],
    filename: str = "audio.mp3",
    **kwargs: Any,
) -> TranscriptionResponse
ParameterTypeDefaultDescription
modelstrrequiredModel ID for STT
filebytes | BinaryIOrequiredAudio file to translate
filenamestr"audio.mp3"Name of the audio file
**kwargsAnyAdditional model-specific parameters

image_generation

def image_generation(
    self,
    *,
    model: str,
    prompt: str,
    n: int = 1,
    size: str = "1024x1024",
    quality: str = "standard",
    response_format: str = "url",
    **kwargs: Any,
) -> ImageGenerationResponse
ParameterTypeDefaultDescription
modelstrrequiredModel ID for image generation
promptstrrequiredText description of the image
nint1Number of images to generate
sizestr"1024x1024"Image size
qualitystr"standard"Quality level (standard, hd)
response_formatstr"url"Response format (url, b64_json)
**kwargsAnyAdditional model-specific parameters

video_generation

def video_generation(
    self,
    *,
    model: str,
    prompt: str,
    duration: Optional[int] = None,
    resolution: Optional[str] = None,
    **kwargs: Any,
) -> GenerationJob
ParameterTypeDefaultDescription
modelstrrequiredModel ID for video generation
promptstrrequiredText description of the video
durationintNoneDuration in seconds
resolutionstrNoneVideo resolution (e.g., "1080p")
**kwargsAnyAdditional model-specific parameters

music_generation

def music_generation(
    self,
    *,
    model: str,
    prompt: str,
    duration: Optional[int] = None,
    **kwargs: Any,
) -> GenerationJob
ParameterTypeDefaultDescription
modelstrrequiredModel ID for music generation
promptstrrequiredText description of the music
durationintNoneDuration in seconds
**kwargsAnyAdditional model-specific parameters

generation_status

def generation_status(
    self,
    *,
    job_id: str,
) -> GenerationJob
ParameterTypeDefaultDescription
job_idstrrequiredJob ID to check status for

cancel_generation

def cancel_generation(
    self,
    *,
    job_id: str,
) -> GenerationJob
ParameterTypeDefaultDescription
job_idstrrequiredJob ID to cancel

moderation

def moderation(
    self,
    *,
    model: str,
    input: Union[str, List[str]],
    **kwargs: Any,
) -> ModerationResponse
ParameterTypeDefaultDescription
modelstrrequiredModeration model identifier
inputstr | List[str]requiredText or list of texts to moderate
**kwargsAnyAdditional model-specific parameters

rerank

def rerank(
    self,
    *,
    model: str,
    query: str,
    documents: List[Union[str, Dict[str, Any]]],
    top_n: Optional[int] = None,
    **kwargs: Any,
) -> RerankResponse
ParameterTypeDefaultDescription
modelstrrequiredRerank model identifier
querystrrequiredQuery to rank documents against
documentsList[str | dict]requiredDocuments to rerank
top_nintNoneReturn only top N results
**kwargsAnyAdditional model-specific parameters

Response Models

ChatCompletion

FieldTypeDescription
idstrUnique completion identifier
modelstrModel used for generation
choicesList[ChatCompletionChoice]List of generated choices
usageTokenUsageToken usage breakdown
createdintUnix timestamp of creation

Property: content -- Returns the first choice's message content (choices[0].message.content).

Completion

FieldTypeDescription
idstrUnique completion identifier
modelstrModel used for generation
choicesList[ChatCompletionChoice]List of generated choices
usageTokenUsageToken usage breakdown
createdintUnix timestamp of creation

Property: content -- Returns the first choice's message content.

ChatCompletionChoice

FieldTypeDescription
indexintChoice index
messageChatMessageGenerated message
finish_reasonstrReason generation stopped (stop, length, etc.)

ChatMessage

FieldTypeDescription
rolestrMessage role (system, user, assistant, function)
contentstrMessage text content
namestrOptional sender name
function_calldictFunction call details (if applicable)
tool_callslistTool call details (if applicable)

TokenUsage

FieldTypeDescription
prompt_tokensintTokens in the prompt
completion_tokensintTokens in the generated response
total_tokensintTotal tokens consumed

StreamChunk

FieldTypeDescription
idstrChunk identifier
modelstrModel used for generation
contentstrGenerated text fragment
finish_reasonstrSet on the final chunk
rolestrMessage role
indexintChoice index

EmbeddingResponse

FieldTypeDescription
modelstrEmbedding model used
dataList[EmbeddingData]List of embedding objects
usageTokenUsageToken usage breakdown

Property: embeddings -- Returns a flat List[List[float]] of embedding vectors.

EmbeddingData

FieldTypeDescription
embeddingList[float]The embedding vector
indexintPosition in the input list

SpeechResponse

FieldTypeDescription
contentbytesRaw audio bytes
content_typestrMIME type of the audio (e.g., audio/mpeg)

TranscriptionResponse

FieldTypeDescription
textstrTranscribed or translated text
languagestrDetected language code
durationfloatAudio duration in seconds
segmentsList[TranscriptionSegment]Timestamped segments (if available)

TranscriptionSegment

FieldTypeDescription
idintSegment index
startfloatStart time in seconds
endfloatEnd time in seconds
textstrSegment text

ImageGenerationResponse

FieldTypeDescription
createdintUnix timestamp of creation
dataList[ImageData]List of generated images

ImageData

FieldTypeDescription
urlstrURL of the generated image (if response_format="url")
b64_jsonstrBase64-encoded image data (if response_format="b64_json")
revised_promptstrRevised prompt used for generation (if applicable)

GenerationJob

FieldTypeDescription
job_idstrUnique job identifier
statusstrJob status (pending, processing, completed, failed, cancelled)
createdintUnix timestamp of creation
progressintProgress percentage (0-100)
resultAnyGeneration result (available when completed)
modelstrModel ID used
providerstrProvider used for generation

ModerationResponse

FieldTypeDescription
idstrModeration request identifier
modelstrModel used for moderation
resultsList[ModerationResult]Moderation results for each input

ModerationResult

FieldTypeDescription
flaggedboolWhether the content was flagged
categoriesDict[str, bool]Category flags (e.g., {"hate": false, "violence": true})
category_scoresDict[str, float]Category confidence scores

RerankResponse

FieldTypeDescription
idstrRerank request identifier
modelstrModel used for reranking
resultsList[RerankResult]Reranked documents
metaDictOptional metadata

RerankResult

FieldTypeDescription
indexintOriginal document index
relevance_scorefloatRelevance score
documentDictDocument content (if returned)

Complete Example

import math
from strongly import Strongly

def main():
    client = Strongly()

    # --- Chat Completion ---
    print("--- Chat Completion ---")
    response = client.ai.inference.chat_completion(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a data science tutor."},
            {"role": "user", "content": "Explain overfitting in one paragraph."},
        ],
        temperature=0.5,
        max_tokens=200,
    )
    print(response.content)
    print(f"Tokens used: {response.usage.total_tokens}\n")

    # --- Streaming Chat ---
    print("--- Streaming Chat ---")
    print("Response: ", end="", flush=True)
    for chunk in client.ai.inference.chat_completion(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "Write a haiku about neural networks."}],
        stream=True,
    ):
        print(chunk.content, end="", flush=True)
    print("\n")

    # --- Text Completion ---
    print("--- Text Completion ---")
    response = client.ai.inference.completion(
        model="gpt-4o-mini",
        prompt="Three benefits of automated testing:",
        max_tokens=150,
        temperature=0.3,
    )
    print(response.content)
    print(f"Tokens used: {response.usage.total_tokens}\n")

    # --- Embeddings ---
    print("--- Embeddings ---")
    texts = [
        "Machine learning is a subset of artificial intelligence.",
        "Deep learning uses neural networks with many layers.",
        "The weather today is sunny and warm.",
    ]

    response = client.ai.inference.embedding(
        model="text-embedding-ada-002",
        input=texts,
    )

    print(f"Generated {len(response.embeddings)} embeddings")
    print(f"Dimension: {len(response.embeddings[0])}")

    def cosine_similarity(a, b):
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = math.sqrt(sum(x * x for x in a))
        norm_b = math.sqrt(sum(x * x for x in b))
        return dot / (norm_a * norm_b)

    sim_01 = cosine_similarity(response.embeddings[0], response.embeddings[1])
    sim_02 = cosine_similarity(response.embeddings[0], response.embeddings[2])
    print(f"ML vs Deep Learning: {sim_01:.4f}")
    print(f"ML vs Weather: {sim_02:.4f}")

    # --- Multi-Turn Conversation ---
    print("\n--- Multi-Turn Conversation ---")
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "What is the difference between SQL and NoSQL?"},
    ]

    response = client.ai.inference.chat_completion(
        model="gpt-4o-mini",
        messages=messages,
    )
    print(f"Assistant: {response.content[:200]}...")

    messages.append({"role": "assistant", "content": response.content})
    messages.append({"role": "user", "content": "When should I choose NoSQL?"})

    response = client.ai.inference.chat_completion(
        model="gpt-4o-mini",
        messages=messages,
    )
    print(f"Assistant: {response.content[:200]}...")

if __name__ == "__main__":
    main()