Skip to main content

AI Inference

Run chat completions, text generation, and embeddings through the Strongly AI gateway.

Overview

The AI Inference resource provides:

  • Chat completions with multi-turn conversation support
  • Text completions from a prompt
  • Streaming responses for real-time token delivery
  • Text embeddings for similarity search and RAG applications
  • Unified access to all configured AI models

All inference methods are accessed through client.ai.inference.

Chat Completion

Generate a response from a conversation history:

from strongly import Strongly

client = Strongly()

# Send a two-message conversation (system + user) and read the reply.
response = client.ai.inference.chat_completion(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is gradient descent?"},
    ],
)

print(response.content)
print(f"Tokens used: {response.usage.total_tokens}")

Multi-Turn Conversations

Pass the full conversation history to maintain context:

from strongly import Strongly

client = Strongly()

# Conversation state is just the list of messages; the client is stateless.
messages = [
    {"role": "system", "content": "You are a Python expert."},
    {"role": "user", "content": "What are decorators?"},
]

response = client.ai.inference.chat_completion(
    model="gpt-4o-mini",
    messages=messages,
)
print(f"Assistant: {response.content}")

# Continue the conversation: append the assistant's reply, then the next
# user turn, and resend the full history so the model keeps context.
messages.append({"role": "assistant", "content": response.content})
messages.append({"role": "user", "content": "Show me an example."})

response = client.ai.inference.chat_completion(
    model="gpt-4o-mini",
    messages=messages,
)
print(f"Assistant: {response.content}")

Generation Parameters

Control the output with temperature, token limits, and stop sequences:

from strongly import Strongly

client = Strongly()

response = client.ai.inference.chat_completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "List three benefits of unit testing."}],
    temperature=0.3,       # Lower = more deterministic
    max_tokens=256,        # Cap the response length
    top_p=0.9,             # Nucleus sampling
    stop=["\n\n", "---"],  # Stop generating at these sequences
)

print(response.content)

Streaming

Stream tokens as they are generated for a responsive user experience:

from strongly import Strongly

client = Strongly()

# With stream=True the call returns an iterator of StreamChunk objects;
# print each fragment as it arrives for a responsive UI.
print("Response: ", end="", flush=True)
for chunk in client.ai.inference.chat_completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Write a haiku about machine learning."}],
    stream=True,
):
    print(chunk.content, end="", flush=True)
print()

Collecting a Streamed Response

from strongly import Strongly

client = Strongly()

# Accumulate streamed fragments, then join once at the end
# (str.join avoids quadratic string concatenation).
chunks = []
for chunk in client.ai.inference.chat_completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Explain recursion briefly."}],
    stream=True,
):
    chunks.append(chunk.content)

full_response = "".join(chunks)
print(full_response)

Text Completion

Generate text from a single prompt (no conversation history):

from strongly import Strongly

client = Strongly()

# Single-prompt completion: no conversation history, just a prompt string.
response = client.ai.inference.completion(
    model="gpt-4o-mini",
    prompt="Explain the difference between a list and a tuple in Python:",
    max_tokens=200,
    temperature=0.5,
)

print(response.content)
print(f"Tokens: {response.usage.total_tokens}")

Streaming Text Completion

from strongly import Strongly

client = Strongly()

# completion() also supports stream=True, yielding StreamChunk objects.
for chunk in client.ai.inference.completion(
    model="gpt-4o-mini",
    prompt="Write a short poem about data pipelines:",
    stream=True,
):
    print(chunk.content, end="", flush=True)
print()

Embeddings

Generate vector embeddings for similarity search, clustering, and RAG applications:

from strongly import Strongly

client = Strongly()

# Embed a single string; response.embeddings is a list of float vectors.
response = client.ai.inference.embedding(
    model="text-embedding-ada-002",
    input="Machine learning is a subset of artificial intelligence.",
)

print(f"Embedding dimension: {len(response.embeddings[0])}")

Batch Embeddings

Pass a list of strings to embed multiple texts in a single call:

from strongly import Strongly

client = Strongly()

texts = [
    "Machine learning uses statistical methods to learn from data.",
    "Deep learning is based on artificial neural networks.",
    "The weather forecast calls for rain tomorrow.",
]

# Passing a list embeds all texts in one request; one vector per input,
# in the same order as the input list.
response = client.ai.inference.embedding(
    model="text-embedding-ada-002",
    input=texts,
)

print(f"Generated {len(response.embeddings)} embeddings")
print(f"Dimension: {len(response.embeddings[0])}")

Similarity Search Example

import math
from strongly import Strongly

client = Strongly()

def cosine_similarity(a, b):
    """Return the cosine similarity of two equal-length numeric vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)

texts = [
    "Machine learning is a subset of artificial intelligence.",
    "Deep learning uses neural networks with many layers.",
    "The weather today is sunny and warm.",
]

response = client.ai.inference.embedding(
    model="text-embedding-ada-002",
    input=texts,
)

# Related texts score close to 1.0; unrelated texts score near 0.
sim_01 = cosine_similarity(response.embeddings[0], response.embeddings[1])
sim_02 = cosine_similarity(response.embeddings[0], response.embeddings[2])

print(f"ML vs Deep Learning: {sim_01:.4f}")  # High similarity
print(f"ML vs Weather: {sim_02:.4f}")  # Low similarity

Parameters

chat_completion

def chat_completion(
    self,
    *,
    model: str,
    messages: List[Union[Dict[str, Any], ChatMessage]],
    stream: bool = False,
    max_tokens: Optional[int] = None,
    temperature: float = 0.7,
    top_p: float = 1.0,
    stop: Optional[Union[str, List[str]]] = None,
    **kwargs: Any,
) -> Union[ChatCompletion, Iterator[StreamChunk]]

| Parameter | Type | Default | Description |
|---|---|---|---|
| model | str | required | Model identifier (e.g., "gpt-4o-mini") |
| messages | List[Union[Dict, ChatMessage]] | required | Conversation messages with role and content |
| stream | bool | False | Return an iterator of StreamChunk objects |
| max_tokens | int | None | Maximum tokens to generate |
| temperature | float | 0.7 | Sampling temperature (0.0 to 2.0) |
| top_p | float | 1.0 | Nucleus sampling threshold |
| stop | str \| List[str] | None | Stop sequence(s) |
| **kwargs | Any | — | Additional model-specific parameters |

completion

def completion(
    self,
    *,
    model: str,
    prompt: str,
    stream: bool = False,
    max_tokens: Optional[int] = None,
    temperature: float = 0.7,
    **kwargs: Any,
) -> Union[Completion, Iterator[StreamChunk]]

| Parameter | Type | Default | Description |
|---|---|---|---|
| model | str | required | Model identifier |
| prompt | str | required | Text prompt |
| stream | bool | False | Return an iterator of StreamChunk objects |
| max_tokens | int | None | Maximum tokens to generate |
| temperature | float | 0.7 | Sampling temperature |
| **kwargs | Any | — | Additional model-specific parameters |

embedding

def embedding(
    self,
    *,
    model: str,
    input: Union[str, List[str]],
    **kwargs: Any,
) -> EmbeddingResponse

| Parameter | Type | Default | Description |
|---|---|---|---|
| model | str | required | Embedding model identifier (e.g., "text-embedding-ada-002") |
| input | str \| List[str] | required | Text or list of texts to embed |
| **kwargs | Any | — | Additional model-specific parameters |

Response Models

ChatCompletion

| Field | Type | Description |
|---|---|---|
| id | str | Unique completion identifier |
| model | str | Model used for generation |
| choices | List[ChatCompletionChoice] | List of generated choices |
| usage | TokenUsage | Token usage breakdown |
| created | int | Unix timestamp of creation |

Property: content -- Returns the first choice's message content (choices[0].message.content).

Completion

| Field | Type | Description |
|---|---|---|
| id | str | Unique completion identifier |
| model | str | Model used for generation |
| choices | List[ChatCompletionChoice] | List of generated choices |
| usage | TokenUsage | Token usage breakdown |
| created | int | Unix timestamp of creation |

Property: content -- Returns the first choice's message content.

ChatCompletionChoice

| Field | Type | Description |
|---|---|---|
| index | int | Choice index |
| message | ChatMessage | Generated message |
| finish_reason | str | Reason generation stopped (stop, length, etc.) |

ChatMessage

| Field | Type | Description |
|---|---|---|
| role | str | Message role (system, user, assistant, function) |
| content | str | Message text content |
| name | str | Optional sender name |
| function_call | dict | Function call details (if applicable) |
| tool_calls | list | Tool call details (if applicable) |

TokenUsage

| Field | Type | Description |
|---|---|---|
| prompt_tokens | int | Tokens in the prompt |
| completion_tokens | int | Tokens in the generated response |
| total_tokens | int | Total tokens consumed |

StreamChunk

| Field | Type | Description |
|---|---|---|
| id | str | Chunk identifier |
| model | str | Model used for generation |
| content | str | Generated text fragment |
| finish_reason | str | Set on the final chunk |
| role | str | Message role |
| index | int | Choice index |

EmbeddingResponse

| Field | Type | Description |
|---|---|---|
| model | str | Embedding model used |
| data | List[EmbeddingData] | List of embedding objects |
| usage | TokenUsage | Token usage breakdown |

Property: embeddings -- Returns a flat List[List[float]] of embedding vectors.

EmbeddingData

| Field | Type | Description |
|---|---|---|
| embedding | List[float] | The embedding vector |
| index | int | Position in the input list |

Complete Example

import math
from strongly import Strongly

def main():
    """Walk through every AI Inference feature: chat, streaming,
    text completion, embeddings, and multi-turn conversation."""
    client = Strongly()

    # --- Chat Completion ---
    print("--- Chat Completion ---")
    response = client.ai.inference.chat_completion(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a data science tutor."},
            {"role": "user", "content": "Explain overfitting in one paragraph."},
        ],
        temperature=0.5,
        max_tokens=200,
    )
    print(response.content)
    print(f"Tokens used: {response.usage.total_tokens}\n")

    # --- Streaming Chat ---
    print("--- Streaming Chat ---")
    print("Response: ", end="", flush=True)
    for chunk in client.ai.inference.chat_completion(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "Write a haiku about neural networks."}],
        stream=True,
    ):
        print(chunk.content, end="", flush=True)
    print("\n")

    # --- Text Completion ---
    print("--- Text Completion ---")
    response = client.ai.inference.completion(
        model="gpt-4o-mini",
        prompt="Three benefits of automated testing:",
        max_tokens=150,
        temperature=0.3,
    )
    print(response.content)
    print(f"Tokens used: {response.usage.total_tokens}\n")

    # --- Embeddings ---
    print("--- Embeddings ---")
    texts = [
        "Machine learning is a subset of artificial intelligence.",
        "Deep learning uses neural networks with many layers.",
        "The weather today is sunny and warm.",
    ]

    response = client.ai.inference.embedding(
        model="text-embedding-ada-002",
        input=texts,
    )

    print(f"Generated {len(response.embeddings)} embeddings")
    print(f"Dimension: {len(response.embeddings[0])}")

    def cosine_similarity(a, b):
        """Cosine similarity of two equal-length numeric vectors."""
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = math.sqrt(sum(x * x for x in a))
        norm_b = math.sqrt(sum(x * x for x in b))
        return dot / (norm_a * norm_b)

    # Related texts score near 1.0; unrelated texts near 0.
    sim_01 = cosine_similarity(response.embeddings[0], response.embeddings[1])
    sim_02 = cosine_similarity(response.embeddings[0], response.embeddings[2])
    print(f"ML vs Deep Learning: {sim_01:.4f}")
    print(f"ML vs Weather: {sim_02:.4f}")

    # --- Multi-Turn Conversation ---
    print("\n--- Multi-Turn Conversation ---")
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "What is the difference between SQL and NoSQL?"},
    ]

    response = client.ai.inference.chat_completion(
        model="gpt-4o-mini",
        messages=messages,
    )
    print(f"Assistant: {response.content[:200]}...")

    # Append the reply and the follow-up question, then resend the history.
    messages.append({"role": "assistant", "content": response.content})
    messages.append({"role": "user", "content": "When should I choose NoSQL?"})

    response = client.ai.inference.chat_completion(
        model="gpt-4o-mini",
        messages=messages,
    )
    print(f"Assistant: {response.content[:200]}...")

if __name__ == "__main__":
    main()