Data Forge
Generate synthetic fine-tuning datasets from your documents using a teacher LLM.
Overview
The Data Forge resource provides:
- Project management for organizing dataset generation
- Document upload via presigned S3 URLs
- Document parsing and chunking pipeline
- Q&A pair generation using any AI model in your gateway
- Pair review, editing, and bulk curation
- Dataset export in ChatML or Alpaca JSONL format
- Project analytics and quality metrics
All Data Forge methods are accessed through client.data_forge.
Creating a Project
from strongly import Strongly
client = Strongly()
project = client.data_forge.create_project({
"name": "Customer Support FAQ",
"description": "Generate training data from support documentation",
})
print(f"Project created: {project.name} (ID: {project.id})")
Using the Model Class
from strongly import Strongly
from strongly._models.data_forge import DataForgeProjectCreate
client = Strongly()
project = client.data_forge.create_project(
DataForgeProjectCreate(
name="Customer Support FAQ",
description="Generate training data from support docs",
)
)
Listing and Retrieving Projects
from strongly import Strongly
client = Strongly()
# List all projects (paginated)
for project in client.data_forge.list_projects():
print(f"{project.name} — {project.status}")
if project.stats:
print(f" Documents: {project.stats.get('total_documents', 0)}")
print(f" Pairs: {project.stats.get('total_pairs', 0)}")
# Get a specific project
project = client.data_forge.retrieve_project("proj_abc123")
print(f"Name: {project.name}")
print(f"Chunks: {project.stats.get('total_chunks', 0)}")
Updating and Deleting Projects
from strongly import Strongly
client = Strongly()
# Update
project = client.data_forge.update_project(
"proj_abc123",
{"name": "Updated Name", "description": "New description"},
)
print(f"Updated: {project.name}")
# Delete (removes all documents, chunks, pairs, and S3 data)
result = client.data_forge.delete_project("proj_abc123")
print(result)
Uploading Documents
Documents are uploaded in three steps: request a presigned S3 URL, upload the file directly to S3, then register the document.
import httpx
from strongly import Strongly
client = Strongly()
project_id = "proj_abc123"
# Step 1: Get presigned upload URL
upload_info = client.data_forge.get_upload_url(
project_id,
filename="product-manual.pdf",
content_type="application/pdf",
)
# Step 2: Upload directly to S3
with open("product-manual.pdf", "rb") as f:
httpx.put(
upload_info["upload_url"],
content=f.read(),
headers={"Content-Type": "application/pdf"},
)
# Step 3: Register the uploaded document
import os
doc = client.data_forge.register_document(
project_id,
filename="product-manual.pdf",
content_type="application/pdf",
file_size=os.path.getsize("product-manual.pdf"),
s3_key=upload_info["s3_key"],
)
print(f"Document registered: {doc.name} (ID: {doc.id})")
Listing and Deleting Documents
from strongly import Strongly
client = Strongly()
# List documents in a project
docs = client.data_forge.list_documents("proj_abc123")
for doc in docs:
print(f"{doc.name} — {doc.parsing_status} ({doc.chunk_count} chunks)")
# Delete a document
client.data_forge.delete_document("proj_abc123", "doc_xyz789")
Parsing Documents
Start parsing uploaded documents into text chunks:
from strongly import Strongly
client = Strongly()
result = client.data_forge.parse_documents("proj_abc123")
print(f"Parse job started: {result}")
Viewing Chunks
from strongly import Strongly
client = Strongly()
# List chunks with pagination
chunks = client.data_forge.list_chunks(
"proj_abc123",
page=1,
page_size=20,
)
print(f"Total chunks: {chunks.get('total', 0)}")
for chunk in chunks.get("chunks", []):
print(f" [{chunk.get('position')}] {chunk.get('heading', 'No heading')}")
print(f" {chunk.get('content', '')[:100]}...")
# Filter by document
chunks = client.data_forge.list_chunks(
"proj_abc123",
document_id="doc_xyz789",
)
Editing Chunks
from strongly import Strongly
client = Strongly()
# Edit chunk content before generation
client.data_forge.update_chunk(
"proj_abc123",
"chunk_001",
content="Updated chunk text content...",
)
# Exclude a chunk from generation
client.data_forge.update_chunk(
"proj_abc123",
"chunk_002",
excluded=True,
)
Generating Training Pairs
Basic Generation
from strongly import Strongly
client = Strongly()
generation = client.data_forge.start_generation(
"proj_abc123",
model_id="gpt-4o",
)
print(f"Generation started: {generation.id}")
print(f"Status: {generation.status}")
Advanced Configuration
from strongly import Strongly
client = Strongly()
generation = client.data_forge.start_generation(
"proj_abc123",
model_id="gpt-4o",
generation_type="qa",
temperature=0.8,
system_prompt="Generate Q&A pairs focused on technical troubleshooting.",
num_pairs=500,
config={
"pairs_per_chunk": 5,
"difficulty_distribution": {"easy": 0.2, "medium": 0.5, "hard": 0.3},
"style_template": "troubleshooting",
},
)
print(f"Generation ID: {generation.id}")
print(f"Config: {generation.config}")
Monitoring Generation Progress
from strongly import Strongly
import time
client = Strongly()
generation = client.data_forge.start_generation(
"proj_abc123",
model_id="gpt-4o",
)
gen_id = generation.id
while True:
gen = client.data_forge.retrieve_generation(gen_id)
print(f"Status: {gen.status}, Progress: {gen.progress}%")
if gen.status in ("completed", "failed", "cancelled"):
break
time.sleep(10)
if gen.status == "completed":
print(f"Pairs generated: {gen.results.get('pairs_generated', 0)}")
print(f"Avg quality: {gen.results.get('avg_quality_score', 0):.0%}")
else:
print(f"Generation {gen.status}: {gen.error}")
Cancelling a Generation
from strongly import Strongly
client = Strongly()
client.data_forge.cancel_generation("gen_abc123")
print("Generation cancelled")
Generation History and Logs
Listing Generations
from strongly import Strongly
client = Strongly()
generations = client.data_forge.list_generations("proj_abc123")
for gen in generations:
print(f"{gen.status} — {gen.config.get('model_id', 'unknown')}")
if gen.results:
print(f" Pairs: {gen.results.get('pairs_generated', 0)}")
print(f" Quality: {gen.results.get('avg_quality_score', 0):.0%}")
Viewing Logs
from strongly import Strongly
client = Strongly()
logs = client.data_forge.generation_logs("gen_abc123", tail=50)
for log in logs.get("logs", []):
print(f"[{log['timestamp']}] [{log['level']}] {log['message']}")
Reviewing Pairs
Listing Pairs
from strongly import Strongly
client = Strongly()
# List all pairs
pairs = client.data_forge.list_pairs("proj_abc123", page=1, page_size=20)
print(f"Total pairs: {pairs.get('total', 0)}")
for pair in pairs.get("pairs", []):
print(f"Q: {pair.get('question', '')[:80]}")
print(f"A: {pair.get('answer', '')[:80]}")
print(f"Quality: {pair.get('qualityScore', 0):.0%} | Status: {pair.get('status')}")
print()
# Filter by review status
pending = client.data_forge.list_pairs(
"proj_abc123",
status="pending",
)
# Filter by generation run
from_gen = client.data_forge.list_pairs(
"proj_abc123",
generation_id="gen_abc123",
)
# Search across questions and answers
results = client.data_forge.list_pairs(
"proj_abc123",
search="password reset",
)
Updating Pairs
from strongly import Strongly
client = Strongly()
# Accept a pair
client.data_forge.update_pair("pair_001", status="accepted")
# Reject a pair
client.data_forge.update_pair("pair_002", status="rejected")
# Edit content and accept
client.data_forge.update_pair(
"pair_003",
question="How do I reset my password?",
answer="Navigate to Settings > Security > Change Password.",
status="accepted",
reviewer_notes="Simplified the answer",
)
Bulk Actions
from strongly import Strongly
client = Strongly()
# Accept specific pairs
client.data_forge.bulk_action_pairs(
"proj_abc123",
action="accept",
pair_ids=["pair_001", "pair_002", "pair_003"],
)
# Reject pairs matching a filter
client.data_forge.bulk_action_pairs(
"proj_abc123",
action="reject",
filters={"quality_score_lt": 0.5},
)
# Reset all pairs back to pending
client.data_forge.bulk_action_pairs(
"proj_abc123",
action="reset",
)
Exporting Datasets
Export as ChatML
from strongly import Strongly
client = Strongly()
export = client.data_forge.export_dataset(
"proj_abc123",
format="chatml",
include_system_prompt=True,
min_quality_score=0.8,
)
print(f"Version: {export.version}")
print(f"Pairs exported: {export.pair_count}")
print(f"File size: {export.file_size} bytes")
print(f"Download URL: {export.download_url}")
Export as Alpaca
from strongly import Strongly
client = Strongly()
export = client.data_forge.export_dataset(
"proj_abc123",
format="alpaca",
)
print(f"Download: {export.download_url}")
Retrieving Past Exports
from strongly import Strongly
client = Strongly()
# Get download URL for a specific export version
export = client.data_forge.get_export("proj_abc123", version=1)
print(f"Version {export.version}: {export.download_url}")
Analytics
from strongly import Strongly
client = Strongly()
analytics = client.data_forge.analytics("proj_abc123")
print(f"Project: {analytics.project_name}")
print(f"Documents: {analytics.documents}")
print(f"Chunks: {analytics.chunks}")
print(f"Pairs: {analytics.pairs}")
print(f"Quality Distribution: {analytics.quality_distribution}")
print(f"Exports: {analytics.exports_count}")
Discovering Available Models
List AI models available for use as teacher models:
from strongly import Strongly
client = Strongly()
models = client.data_forge.available_models()
for model in models:
print(f"{model['id']} — {model.get('name', '')} ({model.get('provider', '')})")
Method Reference
| Method | Description | Returns |
|---|---|---|
list_projects(*, status=None, search=None, limit=50) | List projects | SyncPaginator[DataForgeProject] |
create_project(body) | Create a project | DataForgeProject |
retrieve_project(project_id) | Get project by ID | DataForgeProject |
update_project(project_id, body) | Update a project | DataForgeProject |
delete_project(project_id) | Delete a project | dict |
get_upload_url(project_id, *, filename, content_type) | Get presigned upload URL | dict |
register_document(project_id, *, filename, content_type, file_size, s3_key) | Register uploaded document | DataForgeDocument |
list_documents(project_id) | List project documents | List[DataForgeDocument] |
delete_document(project_id, document_id) | Delete a document | dict |
list_chunks(project_id, *, document_id=None, page=1, page_size=50) | List parsed chunks | dict |
update_chunk(project_id, chunk_id, *, content=None, **kwargs) | Update a chunk | dict |
parse_documents(project_id) | Start parse job | dict |
start_generation(project_id, *, model_id, generation_type="qa", temperature=0.7, system_prompt=None, num_pairs=None, max_tokens=None, chunk_ids=None, config=None, **kwargs) | Start generation job | DataForgeGeneration |
cancel_generation(generation_id) | Cancel a generation | dict |
list_generations(project_id) | List generation runs | List[DataForgeGeneration] |
retrieve_generation(generation_id) | Get generation by ID | DataForgeGeneration |
generation_logs(generation_id, *, tail=200) | Get generation logs | dict |
list_pairs(project_id, *, status=None, generation_id=None, search=None, page=1, page_size=50) | List pairs | dict |
update_pair(pair_id, *, status=None, question=None, answer=None, reviewer_notes=None, **kwargs) | Update a pair | dict |
bulk_action_pairs(project_id, *, action, pair_ids=None, filters=None) | Bulk action on pairs | dict |
export_dataset(project_id, *, format="chatml", include_system_prompt=True, min_quality_score=None) | Export dataset | DataForgeExport |
get_export(project_id, version) | Get export download URL | DataForgeExport |
analytics(project_id) | Get project analytics | DataForgeAnalytics |
available_models() | List available teacher models | List[dict] |
Response Models
DataForgeProject Fields
| Field | Type | Description |
|---|---|---|
id | str | Unique identifier |
project_id | str | Project ID |
name | str | Project name |
description | str | Project description |
status | str | Project status |
user_id | str | Owner user ID |
organization_id | str | Organization ID |
s3_prefix | str | S3 storage prefix |
stats | dict | Aggregate stats (total_documents, total_chunks, total_pairs, accepted_pairs, rejected_pairs, pending_pairs, avg_quality_score) |
default_config | dict | Default generation config |
current_version | int | Current export version |
versions | list | Export version history |
created_at | str | Creation timestamp |
updated_at | str | Last update timestamp |
DataForgeDocument Fields
| Field | Type | Description |
|---|---|---|
id | str | Unique identifier |
document_id | str | Document ID |
project_id | str | Parent project ID |
name | str | Filename |
mime_type | str | MIME type |
file_size | int | File size in bytes |
s3_key | str | S3 object key |
parsing_status | str | Status: pending, processing, completed, failed |
parsing_error | str | Error message if parsing failed |
parsed_metadata | dict | Parsing metadata (page_count, word_count, language, headings) |
chunk_count | int | Number of chunks created |
DataForgeGeneration Fields
| Field | Type | Description |
|---|---|---|
id | str | Unique identifier |
generation_id | str | Generation ID |
project_id | str | Parent project ID |
config | dict | Generation configuration |
status | str | Status: pending, parsing, generating, validating, completed, failed, cancelled |
progress | float | Completion percentage (0-100) |
job_name | str | K8s job name |
results | dict | Results (chunks_processed, pairs_generated, pairs_valid, avg_quality_score, tokens_used) |
error | dict | Error details if failed |
started_at | str | Job start timestamp |
completed_at | str | Job completion timestamp |
DataForgeExport Fields
| Field | Type | Description |
|---|---|---|
version | int | Export version number |
format | str | Export format (chatml or alpaca) |
s3_key | str | S3 key for the exported file |
pair_count | int | Number of pairs in the export |
file_size | int | File size in bytes |
download_url | str | Presigned download URL |
include_system_prompt | bool | Whether system prompt is included |
min_quality_score | float | Quality score filter applied |
DataForgeAnalytics Fields
| Field | Type | Description |
|---|---|---|
project_id | str | Project ID |
project_name | str | Project name |
documents | dict | Document statistics |
chunks | dict | Chunk statistics |
pairs | dict | Pair statistics (total, accepted, rejected, pending, avg_quality_score) |
quality_distribution | list | Quality score distribution |
generations | dict | Generation run statistics |
exports_count | int | Number of exports |
Complete Example
from strongly import Strongly
import httpx
import os
import time
def main():
    """End-to-end Data Forge walkthrough.

    Creates a project, uploads and parses local documents, generates Q&A
    pairs with a teacher model, bulk-accepts high-quality pairs, exports a
    ChatML dataset, and prints project analytics.

    Requires a configured Strongly client (API credentials from the
    environment) and network access; the listed files are read from the
    current working directory if present.
    """
    client = Strongly()
    # --- Create project ---
    print("=== Creating Project ===")
    project = client.data_forge.create_project({
        "name": "Product Docs Training Data",
        "description": "Generate Q&A pairs from product documentation",
    })
    project_id = project.id
    print(f"Project ID: {project_id}")
    # --- Upload documents ---
    print("\n=== Uploading Documents ===")
    files = ["user-guide.pdf", "api-reference.md", "faq.txt"]
    for filename in files:
        # Skip files that are not present locally rather than failing.
        if not os.path.exists(filename):
            continue
        # Step 1: get a presigned S3 URL for this file.
        upload_info = client.data_forge.get_upload_url(
            project_id,
            filename=filename,
            content_type="application/octet-stream",
        )
        # Step 2: upload the raw bytes directly to S3 (bypasses the API).
        with open(filename, "rb") as f:
            httpx.put(upload_info["upload_url"], content=f.read())
        # Step 3: register the uploaded object so Data Forge tracks it.
        doc = client.data_forge.register_document(
            project_id,
            filename=filename,
            content_type="application/octet-stream",
            file_size=os.path.getsize(filename),
            s3_key=upload_info["s3_key"],
        )
        print(f" Uploaded: {doc.name}")
    # --- Parse documents ---
    print("\n=== Parsing Documents ===")
    client.data_forge.parse_documents(project_id)
    print("Parse job started")
    # Simple fixed delay for the demo; in production, poll each document's
    # parsing_status instead of sleeping a fixed amount.
    time.sleep(30)
    # Confirm chunks were produced by the parse job.
    chunks = client.data_forge.list_chunks(project_id)
    print(f"Chunks created: {chunks.get('total', 0)}")
    # --- Discover available models ---
    print("\n=== Available Models ===")
    models = client.data_forge.available_models()
    # Show only the first few models to keep output short.
    for m in models[:5]:
        print(f" {m['id']} ({m.get('provider', '')})")
    # --- Generate training pairs ---
    print("\n=== Starting Generation ===")
    generation = client.data_forge.start_generation(
        project_id,
        model_id="gpt-4o",
        temperature=0.7,
        config={
            "pairs_per_chunk": 3,
            "difficulty_distribution": {"easy": 0.3, "medium": 0.5, "hard": 0.2},
        },
    )
    gen_id = generation.id
    print(f"Generation ID: {gen_id}")
    # Poll until the job reaches a terminal state.
    while True:
        gen = client.data_forge.retrieve_generation(gen_id)
        print(f" Status: {gen.status}, Progress: {gen.progress}%")
        if gen.status in ("completed", "failed", "cancelled"):
            break
        time.sleep(15)
    if gen.status == "completed":
        print(f"\nGeneration complete!")
        print(f" Pairs generated: {gen.results.get('pairs_generated', 0)}")
        print(f" Avg quality: {gen.results.get('avg_quality_score', 0):.0%}")
    else:
        # Failed or cancelled: report and stop — nothing to review/export.
        print(f"Generation {gen.status}: {gen.error}")
        return
    # --- Review pairs ---
    print("\n=== Reviewing Pairs ===")
    pairs = client.data_forge.list_pairs(project_id, page_size=10)
    print(f"Total pairs: {pairs.get('total', 0)}")
    # Auto-accept pairs whose quality score is >= 0.9 via a server-side filter.
    client.data_forge.bulk_action_pairs(
        project_id,
        action="accept",
        filters={"quality_score_gte": 0.9},
    )
    print("Accepted all pairs with quality >= 90%")
    # --- Export dataset ---
    print("\n=== Exporting Dataset ===")
    export = client.data_forge.export_dataset(
        project_id,
        format="chatml",
        include_system_prompt=True,
        min_quality_score=0.7,
    )
    print(f"Export v{export.version}: {export.pair_count} pairs")
    print(f"Download: {export.download_url}")
    # --- Analytics ---
    print("\n=== Project Analytics ===")
    analytics = client.data_forge.analytics(project_id)
    print(f"Documents: {analytics.documents}")
    print(f"Pairs: {analytics.pairs}")
    print(f"Quality distribution: {analytics.quality_distribution}")
if __name__ == "__main__":
main()