AI Inference
Run chat completions, text generation, embeddings, audio, and media generation through the Strongly AI gateway.
Overview
The AI Inference resource provides:
- Chat completions with multi-turn conversation support
- Text completions from a prompt
- Streaming responses for real-time token delivery
- Text embeddings for similarity search and RAG applications
- Text-to-speech (TTS) audio generation
- Speech-to-text (STT) transcription and translation
- Image, video, and music generation
- Unified access to all configured AI models
All inference methods are accessed through client.ai.inference.
Chat Completion
Generate a response from a conversation history:
from strongly import Strongly
client = Strongly()
response = client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is gradient descent?"},
],
)
print(response.content)
print(f"Tokens used: {response.usage.total_tokens}")
Multi-Turn Conversations
Pass the full conversation history to maintain context:
from strongly import Strongly
client = Strongly()
messages = [
{"role": "system", "content": "You are a Python expert."},
{"role": "user", "content": "What are decorators?"},
]
response = client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=messages,
)
print(f"Assistant: {response.content}")
# Continue the conversation
messages.append({"role": "assistant", "content": response.content})
messages.append({"role": "user", "content": "Show me an example."})
response = client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=messages,
)
print(f"Assistant: {response.content}")
Generation Parameters
Control the output with temperature, token limits, and stop sequences:
from strongly import Strongly
client = Strongly()
response = client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=[{"role": "user", "content": "List three benefits of unit testing."}],
temperature=0.3, # Lower = more deterministic
max_tokens=256, # Cap the response length
top_p=0.9, # Nucleus sampling
stop=["\n\n", "---"], # Stop generating at these sequences
)
print(response.content)
Streaming
Stream tokens as they are generated for a responsive user experience:
from strongly import Strongly
client = Strongly()
print("Response: ", end="", flush=True)
for chunk in client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=[{"role": "user", "content": "Write a haiku about machine learning."}],
stream=True,
):
print(chunk.content, end="", flush=True)
print()
Collecting a Streamed Response
from strongly import Strongly
client = Strongly()
chunks = []
for chunk in client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=[{"role": "user", "content": "Explain recursion briefly."}],
stream=True,
):
chunks.append(chunk.content)
full_response = "".join(chunks)
print(full_response)
Text Completion
Generate text from a single prompt (no conversation history):
from strongly import Strongly
client = Strongly()
response = client.ai.inference.completion(
model="gpt-4o-mini",
prompt="Explain the difference between a list and a tuple in Python:",
max_tokens=200,
temperature=0.5,
)
print(response.content)
print(f"Tokens: {response.usage.total_tokens}")
Streaming Text Completion
from strongly import Strongly
client = Strongly()
for chunk in client.ai.inference.completion(
model="gpt-4o-mini",
prompt="Write a short poem about data pipelines:",
stream=True,
):
print(chunk.content, end="", flush=True)
print()
Embeddings
Generate vector embeddings for similarity search, clustering, and RAG applications:
from strongly import Strongly
client = Strongly()
response = client.ai.inference.embedding(
model="text-embedding-ada-002",
input="Machine learning is a subset of artificial intelligence.",
)
print(f"Embedding dimension: {len(response.embeddings[0])}")
Batch Embeddings
Pass a list of strings to embed multiple texts in a single call:
from strongly import Strongly
client = Strongly()
texts = [
"Machine learning uses statistical methods to learn from data.",
"Deep learning is based on artificial neural networks.",
"The weather forecast calls for rain tomorrow.",
]
response = client.ai.inference.embedding(
model="text-embedding-ada-002",
input=texts,
)
print(f"Generated {len(response.embeddings)} embeddings")
print(f"Dimension: {len(response.embeddings[0])}")
Similarity Search Example
import math
from strongly import Strongly
client = Strongly()
def cosine_similarity(a, b):
dot = sum(x * y for x, y in zip(a, b))
norm_a = math.sqrt(sum(x * x for x in a))
norm_b = math.sqrt(sum(x * x for x in b))
return dot / (norm_a * norm_b)
texts = [
"Machine learning is a subset of artificial intelligence.",
"Deep learning uses neural networks with many layers.",
"The weather today is sunny and warm.",
]
response = client.ai.inference.embedding(
model="text-embedding-ada-002",
input=texts,
)
sim_01 = cosine_similarity(response.embeddings[0], response.embeddings[1])
sim_02 = cosine_similarity(response.embeddings[0], response.embeddings[2])
print(f"ML vs Deep Learning: {sim_01:.4f}") # High similarity
print(f"ML vs Weather: {sim_02:.4f}") # Low similarity
Text-to-Speech
Generate spoken audio from text:
from strongly import Strongly
client = Strongly()
# Generate speech from text
response = client.ai.inference.speech(
model="your-tts-model-id",
input="Hello, welcome to Strongly AI!",
voice="alloy",
response_format="mp3",
speed=1.0,
)
# Save the audio file
with open("output.mp3", "wb") as f:
f.write(response.content)
print(f"Audio format: {response.content_type}")
print(f"Audio size: {len(response.content)} bytes")
Speech-to-Text (Transcription)
Transcribe audio files to text:
from strongly import Strongly
client = Strongly()
# Transcribe audio file
with open("recording.mp3", "rb") as audio_file:
response = client.ai.inference.transcription(
model="your-stt-model-id",
file=audio_file,
filename="recording.mp3",
language="en",
)
print(f"Transcript: {response.text}")
if response.segments:
for seg in response.segments:
print(f" [{seg.start:.1f}s - {seg.end:.1f}s] {seg.text}")
Audio Translation
Translate audio from any supported language to English:
from strongly import Strongly
client = Strongly()
# Translate audio to English
with open("french_audio.mp3", "rb") as audio_file:
response = client.ai.inference.translation(
model="your-stt-model-id",
file=audio_file,
)
print(f"English translation: {response.text}")
Image Generation
Generate images from text prompts:
from strongly import Strongly
client = Strongly()
# Generate an image
response = client.ai.inference.image_generation(
model="your-image-model-id",
prompt="A serene mountain landscape at sunset",
n=1,
size="1024x1024",
quality="standard",
)
for image in response.data:
if image.url:
print(f"Image URL: {image.url}")
if image.revised_prompt:
print(f"Revised prompt: {image.revised_prompt}")
Video Generation
Generate videos from text prompts. Video generation is asynchronous and returns a job for tracking:
from strongly import Strongly
client = Strongly()
# Submit a video generation job
job = client.ai.inference.video_generation(
model="your-video-model-id",
prompt="A drone flying over a tropical forest",
duration=5,
resolution="1080p",
)
print(f"Job submitted: {job.job_id}")
# Poll for completion
import time
while job.status not in ("completed", "failed", "cancelled"):
time.sleep(10)
job = client.ai.inference.generation_status(job_id=job.job_id)
print(f"Status: {job.status}, Progress: {job.progress}%")
if job.status == "completed":
print(f"Result: {job.result}")
Music Generation
Generate music from text prompts:
from strongly import Strongly
client = Strongly()
# Generate music
job = client.ai.inference.music_generation(
model="your-music-model-id",
prompt="Upbeat electronic music with a catchy melody",
duration=30,
)
print(f"Job submitted: {job.job_id}")
Cancel a Generation Job
Cancel a running generation job using the `job_id` returned by a previously submitted job (e.g., from `video_generation` or `music_generation`):
from strongly import Strongly
client = Strongly()
# Cancel a running job
cancelled = client.ai.inference.cancel_generation(job_id=job.job_id)
print(f"Cancelled: {cancelled.status}")
Content Moderation
Check text content against moderation categories:
from strongly import Strongly
client = Strongly()
# Moderate content
response = client.ai.inference.moderation(
model="your-moderation-model-id",
input="Some text to check for policy violations.",
)
for result in response.results:
print(f"Flagged: {result.flagged}")
for category, flagged in result.categories.items():
if flagged:
print(f" {category}: score={result.category_scores[category]:.4f}")
Batch Moderation
Pass a list to moderate multiple texts in one call:
from strongly import Strongly
client = Strongly()
texts = [
"This is a normal message.",
"Some other text to check.",
]
response = client.ai.inference.moderation(
model="your-moderation-model-id",
input=texts,
)
for i, result in enumerate(response.results):
print(f"Text {i}: flagged={result.flagged}")
Document Reranking
Rerank documents by relevance to a query:
from strongly import Strongly
client = Strongly()
# Rerank documents by relevance
response = client.ai.inference.rerank(
model="your-rerank-model-id",
query="What is machine learning?",
documents=[
"Machine learning is a subset of artificial intelligence.",
"The weather today is sunny and warm.",
"Deep learning uses neural networks to learn from data.",
],
top_n=2,
)
for result in response.results:
print(f"Index: {result.index}, Score: {result.relevance_score:.4f}")
if result.document:
print(f" Text: {result.document.get('text', '')}")
Parameters
chat_completion
def chat_completion(
self,
*,
model: str,
messages: List[Union[Dict[str, Any], ChatMessage]],
stream: bool = False,
max_tokens: Optional[int] = None,
temperature: float = 0.7,
top_p: float = 1.0,
stop: Optional[Union[str, List[str]]] = None,
**kwargs: Any,
) -> Union[ChatCompletion, Iterator[StreamChunk]]
| Parameter | Type | Default | Description |
|---|---|---|---|
model | str | required | Model identifier (e.g., "gpt-4o-mini") |
messages | List[Union[Dict, ChatMessage]] | required | Conversation messages with role and content |
stream | bool | False | Return an iterator of StreamChunk objects |
max_tokens | int | None | Maximum tokens to generate |
temperature | float | 0.7 | Sampling temperature (0.0 to 2.0) |
top_p | float | 1.0 | Nucleus sampling threshold |
stop | Union[str, List[str]] | None | Stop sequence(s) |
**kwargs | Any | — | Additional model-specific parameters |
completion
def completion(
self,
*,
model: str,
prompt: str,
stream: bool = False,
max_tokens: Optional[int] = None,
temperature: float = 0.7,
**kwargs: Any,
) -> Union[Completion, Iterator[StreamChunk]]
| Parameter | Type | Default | Description |
|---|---|---|---|
model | str | required | Model identifier |
prompt | str | required | Text prompt |
stream | bool | False | Return an iterator of StreamChunk objects |
max_tokens | int | None | Maximum tokens to generate |
temperature | float | 0.7 | Sampling temperature |
**kwargs | Any | — | Additional model-specific parameters |
embedding
def embedding(
self,
*,
model: str,
input: Union[str, List[str]],
**kwargs: Any,
) -> EmbeddingResponse
| Parameter | Type | Default | Description |
|---|---|---|---|
model | str | required | Embedding model identifier (e.g., "text-embedding-ada-002") |
input | Union[str, List[str]] | required | Text or list of texts to embed |
**kwargs | Any | — | Additional model-specific parameters |
speech
def speech(
self,
*,
model: str,
input: str,
voice: str = "alloy",
response_format: str = "mp3",
speed: float = 1.0,
**kwargs: Any,
) -> SpeechResponse
| Parameter | Type | Default | Description |
|---|---|---|---|
model | str | required | Model ID for TTS |
input | str | required | Text to convert to speech |
voice | str | "alloy" | Voice to use |
response_format | str | "mp3" | Audio format (mp3, wav, opus, flac) |
speed | float | 1.0 | Speech speed (0.25 to 4.0) |
**kwargs | Any | — | Additional model-specific parameters |
transcription
def transcription(
self,
*,
model: str,
file: Union[bytes, BinaryIO],
filename: str = "audio.mp3",
language: Optional[str] = None,
prompt: Optional[str] = None,
response_format: str = "json",
temperature: float = 0.0,
**kwargs: Any,
) -> TranscriptionResponse
| Parameter | Type | Default | Description |
|---|---|---|---|
model | str | required | Model ID for STT |
file | Union[bytes, BinaryIO] | required | Audio file to transcribe |
filename | str | "audio.mp3" | Name of the audio file |
language | str | None | Language code (e.g., "en", "es") |
prompt | str | None | Optional context prompt |
response_format | str | "json" | Response format |
temperature | float | 0.0 | Sampling temperature |
**kwargs | Any | — | Additional model-specific parameters |
translation
def translation(
self,
*,
model: str,
file: Union[bytes, BinaryIO],
filename: str = "audio.mp3",
**kwargs: Any,
) -> TranscriptionResponse
| Parameter | Type | Default | Description |
|---|---|---|---|
model | str | required | Model ID for STT |
file | Union[bytes, BinaryIO] | required | Audio file to translate |
filename | str | "audio.mp3" | Name of the audio file |
**kwargs | Any | — | Additional model-specific parameters |
image_generation
def image_generation(
self,
*,
model: str,
prompt: str,
n: int = 1,
size: str = "1024x1024",
quality: str = "standard",
response_format: str = "url",
**kwargs: Any,
) -> ImageGenerationResponse
| Parameter | Type | Default | Description |
|---|---|---|---|
model | str | required | Model ID for image generation |
prompt | str | required | Text description of the image |
n | int | 1 | Number of images to generate |
size | str | "1024x1024" | Image size |
quality | str | "standard" | Quality level (standard, hd) |
response_format | str | "url" | Response format (url, b64_json) |
**kwargs | Any | — | Additional model-specific parameters |
video_generation
def video_generation(
self,
*,
model: str,
prompt: str,
duration: Optional[int] = None,
resolution: Optional[str] = None,
**kwargs: Any,
) -> GenerationJob
| Parameter | Type | Default | Description |
|---|---|---|---|
model | str | required | Model ID for video generation |
prompt | str | required | Text description of the video |
duration | int | None | Duration in seconds |
resolution | str | None | Video resolution (e.g., "1080p") |
**kwargs | Any | — | Additional model-specific parameters |
music_generation
def music_generation(
self,
*,
model: str,
prompt: str,
duration: Optional[int] = None,
**kwargs: Any,
) -> GenerationJob
| Parameter | Type | Default | Description |
|---|---|---|---|
model | str | required | Model ID for music generation |
prompt | str | required | Text description of the music |
duration | int | None | Duration in seconds |
**kwargs | Any | — | Additional model-specific parameters |
generation_status
def generation_status(
self,
*,
job_id: str,
) -> GenerationJob
| Parameter | Type | Default | Description |
|---|---|---|---|
job_id | str | required | Job ID to check status for |
cancel_generation
def cancel_generation(
self,
*,
job_id: str,
) -> GenerationJob
| Parameter | Type | Default | Description |
|---|---|---|---|
job_id | str | required | Job ID to cancel |
moderation
def moderation(
self,
*,
model: str,
input: Union[str, List[str]],
**kwargs: Any,
) -> ModerationResponse
| Parameter | Type | Default | Description |
|---|---|---|---|
model | str | required | Moderation model identifier |
input | Union[str, List[str]] | required | Text or list of texts to moderate |
**kwargs | Any | — | Additional model-specific parameters |
rerank
def rerank(
self,
*,
model: str,
query: str,
documents: List[Union[str, Dict[str, Any]]],
top_n: Optional[int] = None,
**kwargs: Any,
) -> RerankResponse
| Parameter | Type | Default | Description |
|---|---|---|---|
model | str | required | Rerank model identifier |
query | str | required | Query to rank documents against |
documents | List[Union[str, dict]] | required | Documents to rerank |
top_n | int | None | Return only top N results |
**kwargs | Any | — | Additional model-specific parameters |
Response Models
ChatCompletion
| Field | Type | Description |
|---|---|---|
id | str | Unique completion identifier |
model | str | Model used for generation |
choices | List[ChatCompletionChoice] | List of generated choices |
usage | TokenUsage | Token usage breakdown |
created | int | Unix timestamp of creation |
Property: content -- Returns the first choice's message content (choices[0].message.content).
Completion
| Field | Type | Description |
|---|---|---|
id | str | Unique completion identifier |
model | str | Model used for generation |
choices | List[ChatCompletionChoice] | List of generated choices |
usage | TokenUsage | Token usage breakdown |
created | int | Unix timestamp of creation |
Property: content -- Returns the first choice's message content.
ChatCompletionChoice
| Field | Type | Description |
|---|---|---|
index | int | Choice index |
message | ChatMessage | Generated message |
finish_reason | str | Reason generation stopped (stop, length, etc.) |
ChatMessage
| Field | Type | Description |
|---|---|---|
role | str | Message role (system, user, assistant, function) |
content | str | Message text content |
name | str | Optional sender name |
function_call | dict | Function call details (if applicable) |
tool_calls | list | Tool call details (if applicable) |
TokenUsage
| Field | Type | Description |
|---|---|---|
prompt_tokens | int | Tokens in the prompt |
completion_tokens | int | Tokens in the generated response |
total_tokens | int | Total tokens consumed |
StreamChunk
| Field | Type | Description |
|---|---|---|
id | str | Chunk identifier |
model | str | Model used for generation |
content | str | Generated text fragment |
finish_reason | str | Set on the final chunk |
role | str | Message role |
index | int | Choice index |
EmbeddingResponse
| Field | Type | Description |
|---|---|---|
model | str | Embedding model used |
data | List[EmbeddingData] | List of embedding objects |
usage | TokenUsage | Token usage breakdown |
Property: embeddings -- Returns a flat List[List[float]] of embedding vectors.
EmbeddingData
| Field | Type | Description |
|---|---|---|
embedding | List[float] | The embedding vector |
index | int | Position in the input list |
SpeechResponse
| Field | Type | Description |
|---|---|---|
content | bytes | Raw audio bytes |
content_type | str | MIME type of the audio (e.g., audio/mpeg) |
TranscriptionResponse
| Field | Type | Description |
|---|---|---|
text | str | Transcribed or translated text |
language | str | Detected language code |
duration | float | Audio duration in seconds |
segments | List[TranscriptionSegment] | Timestamped segments (if available) |
TranscriptionSegment
| Field | Type | Description |
|---|---|---|
id | int | Segment index |
start | float | Start time in seconds |
end | float | End time in seconds |
text | str | Segment text |
ImageGenerationResponse
| Field | Type | Description |
|---|---|---|
created | int | Unix timestamp of creation |
data | List[ImageData] | List of generated images |
ImageData
| Field | Type | Description |
|---|---|---|
url | str | URL of the generated image (if response_format="url") |
b64_json | str | Base64-encoded image data (if response_format="b64_json") |
revised_prompt | str | Revised prompt used for generation (if applicable) |
GenerationJob
| Field | Type | Description |
|---|---|---|
job_id | str | Unique job identifier |
status | str | Job status (pending, processing, completed, failed, cancelled) |
created | int | Unix timestamp of creation |
progress | int | Progress percentage (0-100) |
result | Any | Generation result (available when completed) |
model | str | Model ID used |
provider | str | Provider used for generation |
ModerationResponse
| Field | Type | Description |
|---|---|---|
id | str | Moderation request identifier |
model | str | Model used for moderation |
results | List[ModerationResult] | Moderation results for each input |
ModerationResult
| Field | Type | Description |
|---|---|---|
flagged | bool | Whether the content was flagged |
categories | Dict[str, bool] | Category flags (e.g., {"hate": false, "violence": true}) |
category_scores | Dict[str, float] | Category confidence scores |
RerankResponse
| Field | Type | Description |
|---|---|---|
id | str | Rerank request identifier |
model | str | Model used for reranking |
results | List[RerankResult] | Reranked documents |
meta | Dict | Optional metadata |
RerankResult
| Field | Type | Description |
|---|---|---|
index | int | Original document index |
relevance_score | float | Relevance score |
document | Dict | Document content (if returned) |
Complete Example
import math
from strongly import Strongly
def main():
client = Strongly()
# --- Chat Completion ---
print("--- Chat Completion ---")
response = client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "You are a data science tutor."},
{"role": "user", "content": "Explain overfitting in one paragraph."},
],
temperature=0.5,
max_tokens=200,
)
print(response.content)
print(f"Tokens used: {response.usage.total_tokens}\n")
# --- Streaming Chat ---
print("--- Streaming Chat ---")
print("Response: ", end="", flush=True)
for chunk in client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=[{"role": "user", "content": "Write a haiku about neural networks."}],
stream=True,
):
print(chunk.content, end="", flush=True)
print("\n")
# --- Text Completion ---
print("--- Text Completion ---")
response = client.ai.inference.completion(
model="gpt-4o-mini",
prompt="Three benefits of automated testing:",
max_tokens=150,
temperature=0.3,
)
print(response.content)
print(f"Tokens used: {response.usage.total_tokens}\n")
# --- Embeddings ---
print("--- Embeddings ---")
texts = [
"Machine learning is a subset of artificial intelligence.",
"Deep learning uses neural networks with many layers.",
"The weather today is sunny and warm.",
]
response = client.ai.inference.embedding(
model="text-embedding-ada-002",
input=texts,
)
print(f"Generated {len(response.embeddings)} embeddings")
print(f"Dimension: {len(response.embeddings[0])}")
def cosine_similarity(a, b):
dot = sum(x * y for x, y in zip(a, b))
norm_a = math.sqrt(sum(x * x for x in a))
norm_b = math.sqrt(sum(x * x for x in b))
return dot / (norm_a * norm_b)
sim_01 = cosine_similarity(response.embeddings[0], response.embeddings[1])
sim_02 = cosine_similarity(response.embeddings[0], response.embeddings[2])
print(f"ML vs Deep Learning: {sim_01:.4f}")
print(f"ML vs Weather: {sim_02:.4f}")
# --- Multi-Turn Conversation ---
print("\n--- Multi-Turn Conversation ---")
messages = [
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "What is the difference between SQL and NoSQL?"},
]
response = client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=messages,
)
print(f"Assistant: {response.content[:200]}...")
messages.append({"role": "assistant", "content": response.content})
messages.append({"role": "user", "content": "When should I choose NoSQL?"})
response = client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=messages,
)
print(f"Assistant: {response.content[:200]}...")
if __name__ == "__main__":
main()