Skip to main content

Data Forge

Generate synthetic fine-tuning datasets from your documents using a teacher LLM.

Overview

The Data Forge resource provides:

  • Project management for organizing dataset generation
  • Document upload via presigned S3 URLs
  • Document parsing and chunking pipeline
  • Q&A pair generation using any AI model in your gateway
  • Pair review, editing, and bulk curation
  • Dataset export in ChatML or Alpaca JSONL format
  • Project analytics and quality metrics

All Data Forge methods are accessed through client.data_forge.


Creating a Project

from strongly import Strongly

client = Strongly()

# create_project accepts a plain dict payload.
project = client.data_forge.create_project({
    "name": "Customer Support FAQ",
    "description": "Generate training data from support documentation",
})

print(f"Project created: {project.name} (ID: {project.id})")

Using the Model Class

from strongly import Strongly
from strongly._models.data_forge import DataForgeProjectCreate

client = Strongly()

# Equivalent to the dict form, but using the typed request model.
project = client.data_forge.create_project(
    DataForgeProjectCreate(
        name="Customer Support FAQ",
        description="Generate training data from support docs",
    )
)

Listing and Retrieving Projects

from strongly import Strongly

client = Strongly()

# List all projects (paginated)
for project in client.data_forge.list_projects():
    # Separator restored: name/status ran together in the extracted docs.
    print(f"{project.name} — {project.status}")
    if project.stats:
        print(f" Documents: {project.stats.get('total_documents', 0)}")
        print(f" Pairs: {project.stats.get('total_pairs', 0)}")

# Get a specific project
project = client.data_forge.retrieve_project("proj_abc123")
print(f"Name: {project.name}")
print(f"Chunks: {project.stats.get('total_chunks', 0)}")

Updating and Deleting Projects

from strongly import Strongly

client = Strongly()

# Update
project = client.data_forge.update_project(
    "proj_abc123",
    {"name": "Updated Name", "description": "New description"},
)
print(f"Updated: {project.name}")

# Delete (removes all documents, chunks, pairs, and S3 data)
result = client.data_forge.delete_project("proj_abc123")
print(result)

Uploading Documents

Documents are uploaded in two steps: get a presigned S3 URL, then register the document.

import os

import httpx
from strongly import Strongly

client = Strongly()
project_id = "proj_abc123"

# Step 1: Get presigned upload URL
upload_info = client.data_forge.get_upload_url(
    project_id,
    filename="product-manual.pdf",
    content_type="application/pdf",
)

# Step 2: Upload directly to S3 (PUT to the presigned URL; the API
# server never receives the file bytes)
with open("product-manual.pdf", "rb") as f:
    httpx.put(
        upload_info["upload_url"],
        content=f.read(),
        headers={"Content-Type": "application/pdf"},
    )

# Step 3: Register the uploaded document using the s3_key returned
# alongside the presigned URL
doc = client.data_forge.register_document(
    project_id,
    filename="product-manual.pdf",
    content_type="application/pdf",
    file_size=os.path.getsize("product-manual.pdf"),
    s3_key=upload_info["s3_key"],
)

print(f"Document registered: {doc.name} (ID: {doc.id})")

Listing and Deleting Documents

from strongly import Strongly

client = Strongly()

# List documents in a project
docs = client.data_forge.list_documents("proj_abc123")
for doc in docs:
    # Separator restored: name/status ran together in the extracted docs.
    print(f"{doc.name} — {doc.parsing_status} ({doc.chunk_count} chunks)")

# Delete a document
client.data_forge.delete_document("proj_abc123", "doc_xyz789")

Parsing Documents

Start parsing uploaded documents into text chunks:

from strongly import Strongly

client = Strongly()

# Starts an asynchronous parse job for the project's uploaded documents;
# per-document progress is exposed via each document's parsing_status
# field (see list_documents()).
result = client.data_forge.parse_documents("proj_abc123")
print(f"Parse job started: {result}")

Viewing Chunks

from strongly import Strongly

client = Strongly()

# List chunks with pagination
chunks = client.data_forge.list_chunks(
    "proj_abc123",
    page=1,
    page_size=20,
)
print(f"Total chunks: {chunks.get('total', 0)}")
for chunk in chunks.get("chunks", []):
    print(f" [{chunk.get('position')}] {chunk.get('heading', 'No heading')}")
    print(f" {chunk.get('content', '')[:100]}...")

# Filter by document
chunks = client.data_forge.list_chunks(
    "proj_abc123",
    document_id="doc_xyz789",
)

Editing Chunks

from strongly import Strongly

client = Strongly()

# Edit chunk content before generation
client.data_forge.update_chunk(
    "proj_abc123",
    "chunk_001",
    content="Updated chunk text content...",
)

# Exclude a chunk from generation
client.data_forge.update_chunk(
    "proj_abc123",
    "chunk_002",
    excluded=True,
)

Generating Training Pairs

Basic Generation

from strongly import Strongly

client = Strongly()

# Start a generation run with default settings; only model_id is required.
generation = client.data_forge.start_generation(
    "proj_abc123",
    model_id="gpt-4o",
)

print(f"Generation started: {generation.id}")
print(f"Status: {generation.status}")

Advanced Configuration

from strongly import Strongly

client = Strongly()

generation = client.data_forge.start_generation(
    "proj_abc123",
    model_id="gpt-4o",
    generation_type="qa",
    temperature=0.8,
    system_prompt="Generate Q&A pairs focused on technical troubleshooting.",
    num_pairs=500,
    config={
        "pairs_per_chunk": 5,
        "difficulty_distribution": {"easy": 0.2, "medium": 0.5, "hard": 0.3},
        "style_template": "troubleshooting",
    },
)

print(f"Generation ID: {generation.id}")
print(f"Config: {generation.config}")

Monitoring Generation Progress

import time

from strongly import Strongly

client = Strongly()

generation = client.data_forge.start_generation(
    "proj_abc123",
    model_id="gpt-4o",
)
gen_id = generation.id

# Poll until the job reaches a terminal state.
while True:
    gen = client.data_forge.retrieve_generation(gen_id)
    print(f"Status: {gen.status}, Progress: {gen.progress}%")

    if gen.status in ("completed", "failed", "cancelled"):
        break

    time.sleep(10)

if gen.status == "completed":
    print(f"Pairs generated: {gen.results.get('pairs_generated', 0)}")
    print(f"Avg quality: {gen.results.get('avg_quality_score', 0):.0%}")
else:
    print(f"Generation {gen.status}: {gen.error}")

Cancelling a Generation

from strongly import Strongly

client = Strongly()

# Cancels an in-flight generation run; generation IDs come from
# start_generation() or list_generations().
client.data_forge.cancel_generation("gen_abc123")
print("Generation cancelled")

Generation History and Logs

Listing Generations

from strongly import Strongly

client = Strongly()

generations = client.data_forge.list_generations("proj_abc123")
for gen in generations:
    # Separator restored: status/model ran together in the extracted docs.
    print(f"{gen.status} — {gen.config.get('model_id', 'unknown')}")
    if gen.results:
        print(f" Pairs: {gen.results.get('pairs_generated', 0)}")
        print(f" Quality: {gen.results.get('avg_quality_score', 0):.0%}")

Viewing Logs

from strongly import Strongly

client = Strongly()

# tail limits the response to the last N log lines.
logs = client.data_forge.generation_logs("gen_abc123", tail=50)
for log in logs.get("logs", []):
    print(f"[{log['timestamp']}] [{log['level']}] {log['message']}")

Reviewing Pairs

Listing Pairs

from strongly import Strongly

client = Strongly()

# List all pairs
pairs = client.data_forge.list_pairs("proj_abc123", page=1, page_size=20)
print(f"Total pairs: {pairs.get('total', 0)}")

for pair in pairs.get("pairs", []):
    print(f"Q: {pair.get('question', '')[:80]}")
    print(f"A: {pair.get('answer', '')[:80]}")
    print(f"Quality: {pair.get('qualityScore', 0):.0%} | Status: {pair.get('status')}")
    print()

# Filter by review status
pending = client.data_forge.list_pairs(
    "proj_abc123",
    status="pending",
)

# Filter by generation run
from_gen = client.data_forge.list_pairs(
    "proj_abc123",
    generation_id="gen_abc123",
)

# Search across questions and answers
results = client.data_forge.list_pairs(
    "proj_abc123",
    search="password reset",
)

Updating Pairs

from strongly import Strongly

client = Strongly()

# Accept a pair
client.data_forge.update_pair("pair_001", status="accepted")

# Reject a pair
client.data_forge.update_pair("pair_002", status="rejected")

# Edit content and accept in a single call
client.data_forge.update_pair(
    "pair_003",
    question="How do I reset my password?",
    answer="Navigate to Settings > Security > Change Password.",
    status="accepted",
    reviewer_notes="Simplified the answer",
)

Bulk Actions

from strongly import Strongly

client = Strongly()

# Accept specific pairs
client.data_forge.bulk_action_pairs(
    "proj_abc123",
    action="accept",
    pair_ids=["pair_001", "pair_002", "pair_003"],
)

# Reject pairs matching a filter
client.data_forge.bulk_action_pairs(
    "proj_abc123",
    action="reject",
    filters={"quality_score_lt": 0.5},
)

# Reset all pairs back to pending
client.data_forge.bulk_action_pairs(
    "proj_abc123",
    action="reset",
)

Exporting Datasets

Export as ChatML

from strongly import Strongly

client = Strongly()

export = client.data_forge.export_dataset(
    "proj_abc123",
    format="chatml",
    include_system_prompt=True,
    min_quality_score=0.8,
)

print(f"Version: {export.version}")
print(f"Pairs exported: {export.pair_count}")
print(f"File size: {export.file_size} bytes")
print(f"Download URL: {export.download_url}")

Export as Alpaca

from strongly import Strongly

client = Strongly()

export = client.data_forge.export_dataset(
    "proj_abc123",
    format="alpaca",
)

print(f"Download: {export.download_url}")

Retrieving Past Exports

from strongly import Strongly

client = Strongly()

# Get download URL for a specific export version; past versions are
# tracked on the project's `versions` field.
export = client.data_forge.get_export("proj_abc123", version=1)
print(f"Version {export.version}: {export.download_url}")

Analytics

from strongly import Strongly

client = Strongly()

# Aggregate project statistics: document/chunk/pair counts, quality
# distribution, and export count.
analytics = client.data_forge.analytics("proj_abc123")

print(f"Project: {analytics.project_name}")
print(f"Documents: {analytics.documents}")
print(f"Chunks: {analytics.chunks}")
print(f"Pairs: {analytics.pairs}")
print(f"Quality Distribution: {analytics.quality_distribution}")
print(f"Exports: {analytics.exports_count}")

Discovering Available Models

List AI models available for use as teacher models:

from strongly import Strongly

client = Strongly()

models = client.data_forge.available_models()
for model in models:
    # Separator restored: id/name ran together in the extracted docs.
    print(f"{model['id']} — {model.get('name', '')} ({model.get('provider', '')})")

Method Reference

| Method | Description | Returns |
| --- | --- | --- |
| `list_projects(*, status=None, search=None, limit=50)` | List projects | `SyncPaginator[DataForgeProject]` |
| `create_project(body)` | Create a project | `DataForgeProject` |
| `retrieve_project(project_id)` | Get project by ID | `DataForgeProject` |
| `update_project(project_id, body)` | Update a project | `DataForgeProject` |
| `delete_project(project_id)` | Delete a project | `dict` |
| `get_upload_url(project_id, *, filename, content_type)` | Get presigned upload URL | `dict` |
| `register_document(project_id, *, filename, content_type, file_size, s3_key)` | Register uploaded document | `DataForgeDocument` |
| `list_documents(project_id)` | List project documents | `List[DataForgeDocument]` |
| `delete_document(project_id, document_id)` | Delete a document | `dict` |
| `list_chunks(project_id, *, document_id=None, page=1, page_size=50)` | List parsed chunks | `dict` |
| `update_chunk(project_id, chunk_id, *, content=None, **kwargs)` | Update a chunk | `dict` |
| `parse_documents(project_id)` | Start parse job | `dict` |
| `start_generation(project_id, *, model_id, generation_type="qa", temperature=0.7, system_prompt=None, num_pairs=None, max_tokens=None, chunk_ids=None, config=None, **kwargs)` | Start generation job | `DataForgeGeneration` |
| `cancel_generation(generation_id)` | Cancel a generation | `dict` |
| `list_generations(project_id)` | List generation runs | `List[DataForgeGeneration]` |
| `retrieve_generation(generation_id)` | Get generation by ID | `DataForgeGeneration` |
| `generation_logs(generation_id, *, tail=200)` | Get generation logs | `dict` |
| `list_pairs(project_id, *, status=None, generation_id=None, search=None, page=1, page_size=50)` | List pairs | `dict` |
| `update_pair(pair_id, *, status=None, question=None, answer=None, reviewer_notes=None, **kwargs)` | Update a pair | `dict` |
| `bulk_action_pairs(project_id, *, action, pair_ids=None, filters=None)` | Bulk action on pairs | `dict` |
| `export_dataset(project_id, *, format="chatml", include_system_prompt=True, min_quality_score=None)` | Export dataset | `DataForgeExport` |
| `get_export(project_id, version)` | Get export download URL | `DataForgeExport` |
| `analytics(project_id)` | Get project analytics | `DataForgeAnalytics` |
| `available_models()` | List available teacher models | `List[dict]` |

Response Models

DataForgeProject Fields

| Field | Type | Description |
| --- | --- | --- |
| `id` | `str` | Unique identifier |
| `project_id` | `str` | Project ID |
| `name` | `str` | Project name |
| `description` | `str` | Project description |
| `status` | `str` | Project status |
| `user_id` | `str` | Owner user ID |
| `organization_id` | `str` | Organization ID |
| `s3_prefix` | `str` | S3 storage prefix |
| `stats` | `dict` | Aggregate stats (total_documents, total_chunks, total_pairs, accepted_pairs, rejected_pairs, pending_pairs, avg_quality_score) |
| `default_config` | `dict` | Default generation config |
| `current_version` | `int` | Current export version |
| `versions` | `list` | Export version history |
| `created_at` | `str` | Creation timestamp |
| `updated_at` | `str` | Last update timestamp |

DataForgeDocument Fields

| Field | Type | Description |
| --- | --- | --- |
| `id` | `str` | Unique identifier |
| `document_id` | `str` | Document ID |
| `project_id` | `str` | Parent project ID |
| `name` | `str` | Filename |
| `mime_type` | `str` | MIME type |
| `file_size` | `int` | File size in bytes |
| `s3_key` | `str` | S3 object key |
| `parsing_status` | `str` | Status: pending, processing, completed, failed |
| `parsing_error` | `str` | Error message if parsing failed |
| `parsed_metadata` | `dict` | Parsing metadata (page_count, word_count, language, headings) |
| `chunk_count` | `int` | Number of chunks created |

DataForgeGeneration Fields

| Field | Type | Description |
| --- | --- | --- |
| `id` | `str` | Unique identifier |
| `generation_id` | `str` | Generation ID |
| `project_id` | `str` | Parent project ID |
| `config` | `dict` | Generation configuration |
| `status` | `str` | Status: pending, parsing, generating, validating, completed, failed, cancelled |
| `progress` | `float` | Completion percentage (0-100) |
| `job_name` | `str` | K8s job name |
| `results` | `dict` | Results (chunks_processed, pairs_generated, pairs_valid, avg_quality_score, tokens_used) |
| `error` | `dict` | Error details if failed |
| `started_at` | `str` | Job start timestamp |
| `completed_at` | `str` | Job completion timestamp |

DataForgeExport Fields

| Field | Type | Description |
| --- | --- | --- |
| `version` | `int` | Export version number |
| `format` | `str` | Export format (chatml or alpaca) |
| `s3_key` | `str` | S3 key for the exported file |
| `pair_count` | `int` | Number of pairs in the export |
| `file_size` | `int` | File size in bytes |
| `download_url` | `str` | Presigned download URL |
| `include_system_prompt` | `bool` | Whether system prompt is included |
| `min_quality_score` | `float` | Quality score filter applied |

DataForgeAnalytics Fields

| Field | Type | Description |
| --- | --- | --- |
| `project_id` | `str` | Project ID |
| `project_name` | `str` | Project name |
| `documents` | `dict` | Document statistics |
| `chunks` | `dict` | Chunk statistics |
| `pairs` | `dict` | Pair statistics (total, accepted, rejected, pending, avg_quality_score) |
| `quality_distribution` | `list` | Quality score distribution |
| `generations` | `dict` | Generation run statistics |
| `exports_count` | `int` | Number of exports |

Complete Example

import os
import time

import httpx
from strongly import Strongly


def main():
    """End-to-end Data Forge workflow.

    Creates a project, uploads and parses documents, generates Q&A pairs
    with a teacher model, bulk-reviews the results, exports a ChatML
    dataset, and prints project analytics.
    """
    client = Strongly()

    # --- Create project ---
    print("=== Creating Project ===")
    project = client.data_forge.create_project({
        "name": "Product Docs Training Data",
        "description": "Generate Q&A pairs from product documentation",
    })
    project_id = project.id
    print(f"Project ID: {project_id}")

    # --- Upload documents ---
    print("\n=== Uploading Documents ===")
    files = ["user-guide.pdf", "api-reference.md", "faq.txt"]
    for filename in files:
        if not os.path.exists(filename):
            continue

        # Get presigned URL
        upload_info = client.data_forge.get_upload_url(
            project_id,
            filename=filename,
            content_type="application/octet-stream",
        )

        # Upload to S3
        with open(filename, "rb") as f:
            httpx.put(upload_info["upload_url"], content=f.read())

        # Register
        doc = client.data_forge.register_document(
            project_id,
            filename=filename,
            content_type="application/octet-stream",
            file_size=os.path.getsize(filename),
            s3_key=upload_info["s3_key"],
        )
        print(f" Uploaded: {doc.name}")

    # --- Parse documents ---
    print("\n=== Parsing Documents ===")
    client.data_forge.parse_documents(project_id)
    print("Parse job started")

    # Wait for parsing (fixed delay for demo purposes; poll document
    # parsing_status in production)
    time.sleep(30)

    # Check chunks
    chunks = client.data_forge.list_chunks(project_id)
    print(f"Chunks created: {chunks.get('total', 0)}")

    # --- Discover available models ---
    print("\n=== Available Models ===")
    models = client.data_forge.available_models()
    for m in models[:5]:
        print(f" {m['id']} ({m.get('provider', '')})")

    # --- Generate training pairs ---
    print("\n=== Starting Generation ===")
    generation = client.data_forge.start_generation(
        project_id,
        model_id="gpt-4o",
        temperature=0.7,
        config={
            "pairs_per_chunk": 3,
            "difficulty_distribution": {"easy": 0.3, "medium": 0.5, "hard": 0.2},
        },
    )
    gen_id = generation.id
    print(f"Generation ID: {gen_id}")

    # Monitor progress until a terminal state is reached
    while True:
        gen = client.data_forge.retrieve_generation(gen_id)
        print(f" Status: {gen.status}, Progress: {gen.progress}%")

        if gen.status in ("completed", "failed", "cancelled"):
            break
        time.sleep(15)

    if gen.status == "completed":
        print(f"\nGeneration complete!")
        print(f" Pairs generated: {gen.results.get('pairs_generated', 0)}")
        print(f" Avg quality: {gen.results.get('avg_quality_score', 0):.0%}")
    else:
        print(f"Generation {gen.status}: {gen.error}")
        return

    # --- Review pairs ---
    print("\n=== Reviewing Pairs ===")
    pairs = client.data_forge.list_pairs(project_id, page_size=10)
    print(f"Total pairs: {pairs.get('total', 0)}")

    # Auto-accept high quality pairs
    client.data_forge.bulk_action_pairs(
        project_id,
        action="accept",
        filters={"quality_score_gte": 0.9},
    )
    print("Accepted all pairs with quality >= 90%")

    # --- Export dataset ---
    print("\n=== Exporting Dataset ===")
    export = client.data_forge.export_dataset(
        project_id,
        format="chatml",
        include_system_prompt=True,
        min_quality_score=0.7,
    )
    print(f"Export v{export.version}: {export.pair_count} pairs")
    print(f"Download: {export.download_url}")

    # --- Analytics ---
    print("\n=== Project Analytics ===")
    analytics = client.data_forge.analytics(project_id)
    print(f"Documents: {analytics.documents}")
    print(f"Pairs: {analytics.pairs}")
    print(f"Quality distribution: {analytics.quality_distribution}")


if __name__ == "__main__":
    main()