Training Data API
Training sample collection and export classes.
TrainingSample
from src.training_data import TrainingSample
Class Definition
@dataclass
class TrainingSample:
"""
Training data sample.
Attributes:
sample_id: Unique identifier
persona_id: Source persona
persona_features: Persona feature dictionary
original_query: Query before style transfer
initial_query: Query after style transfer
prompt_trajectory: List of user prompts in order
full_conversation: Complete conversation messages
num_turns: Number of conversation turns (one turn = one user prompt plus the assistant reply)
noisy_initial_queries: Noisy versions of initial_query (if the distractor is enabled)
metadata: Additional metadata
"""
Methods
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for JSON export."""
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'TrainingSample':
"""Create from dictionary."""
TrainingDataCollector
from src.training_data import TrainingDataCollector
Class Definition
class TrainingDataCollector:
"""
Collects and transforms interactions to training samples.
Handles:
- Conversion from Interaction to TrainingSample
- Extraction of prompt trajectories
- Inclusion of noisy versions of the initial query
"""
Constructor
def __init__(self, config: Dict[str, Any]):
"""
Initialize collector.
Args:
config: Configuration with training_data.format settings
"""
Methods
def interaction_to_training_sample(
self,
interaction: Dict[str, Any]
) -> TrainingSample:
"""
Convert single interaction to training sample.
Args:
interaction: Interaction dictionary
Returns:
TrainingSample object
"""
def collect_from_interactions(
self,
interactions: List[Dict[str, Any]]
) -> List[TrainingSample]:
"""
Convert multiple interactions to training samples.
Args:
interactions: List of interaction dictionaries
Returns:
List of TrainingSample objects
"""
TrainingDataExporter
from src.training_data import TrainingDataExporter
Class Definition
class TrainingDataExporter:
"""
Exports training samples to files.
Features:
- JSON export
- Optional train/val/test split
- Statistics generation
- Timestamped filenames
"""
Constructor
def __init__(self, config: Dict[str, Any]):
"""
Initialize exporter.
Args:
config: Configuration with paths and split settings
"""
Methods
def export_samples(
self,
samples: List[TrainingSample],
split: bool = False,
use_timestamp: bool = True
) -> Dict[str, str]:
"""
Export samples to JSON files.
Args:
samples: List of training samples
split: Whether to split into train/validation/test sets
use_timestamp: Add timestamp to filenames
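Returns:
Dictionary mapping split name to file path, e.g.
{'train': 'path/train_samples.json', ...}
With split=True the keys are 'train', 'validation' and 'test';
with split=False the exported file is listed under 'train'.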
"""
def export_statistics(
self,
samples: List[TrainingSample],
use_timestamp: bool = True
) -> str:
"""
Export dataset statistics.
Args:
samples: List of training samples
use_timestamp: Add timestamp to filename
Returns:
Path to statistics JSON file
"""
def compute_statistics(
self,
samples: List[TrainingSample]
) -> Dict[str, Any]:
"""
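Compute dataset statistics.
Args:
samples: List of training samples
Returns:
Statistics dictionary with counts, averages, distributions
(e.g. total_samples, avg_turns, distractor_stats, persona_distribution)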
"""
Usage Examples
Collect from Interactions
from src.training_data import TrainingDataCollector, TrainingDataExporter
from src.interaction_generator import InteractionStorage
# Load interactions
storage = InteractionStorage("output/interactions")
interactions = storage.load_all()
# Initialize collector (config is the configuration dictionary, loaded beforehand)
collector = TrainingDataCollector(config)
# Convert to training samples
samples = collector.collect_from_interactions(interactions)
print(f"Collected {len(samples)} samples")
# Access sample data
for sample in samples[:3]:
print(f"ID: {sample.sample_id}")
print(f"Trajectory: {sample.prompt_trajectory}")
print(f"Turns: {sample.num_turns}")
print()
Export to Files
exporter = TrainingDataExporter(config)
# Export all (no split)
files = exporter.export_samples(
samples,
split=False,
use_timestamp=True
)
print(f"Exported to: {files['train']}")
# Export with split
files = exporter.export_samples(
samples,
split=True,
use_timestamp=False
)
print(f"Train: {files['train']}")
print(f"Val: {files['validation']}")
print(f"Test: {files['test']}")
# Export statistics
stats_file = exporter.export_statistics(samples)
print(f"Statistics: {stats_file}")
Compute Statistics
stats = exporter.compute_statistics(samples)
print(f"Total samples: {stats['total_samples']}")
print(f"Average turns: {stats['avg_turns']:.1f}")
print(f"Noise rate: {stats['distractor_stats']['noise_rate']:.1%}")
Complete Pipeline Integration
from src.enhanced_pipeline import EnhancedPersonaGenerationPipeline
pipeline = EnhancedPersonaGenerationPipeline("config.yaml")
result = pipeline.run(num_personas=100)
# Training data is automatically exported
print(f"Samples: {result['training_data']['total_samples']}")
print(f"Files: {result['training_data']['sample_files']}")
Output Format
Sample JSON
{
"sample_id": "sample_20260206_001",
"persona_id": "persona_001",
"persona_features": {
"role": "engineer",
"communication_style": "casual"
},
"original_query": "Help me write Python code",
"initial_query": "hey can u help me with python",
"prompt_trajectory": [
"hey can u help me with python",
"make it handle errors",
"looks good thanks"
],
"full_conversation": [
{"role": "user", "content": "hey can u help me with python"},
{"role": "assistant", "content": "Sure! What would you like?"},
{"role": "user", "content": "make it handle errors"},
{"role": "assistant", "content": "Here's the updated code..."},
{"role": "user", "content": "looks good thanks"},
{"role": "assistant", "content": "You're welcome!"}
],
"num_turns": 3,
"noisy_initial_queries": [
{
"noisy_text": "hey help me writ python",
"noise_type": "surface_noise",
"applied_strategies": ["typo_misspelling"]
}
],
"metadata": {
"interaction_id": "interaction_...",
"distractor_applied": true
}
}
Statistics JSON
{
"total_samples": 500,
"avg_turns": 3.2,
"avg_trajectory_length": 3.2,
"distractor_stats": {
"samples_with_noise": 125,
"noise_rate": 0.25
},
"persona_distribution": {
"engineer": 120,
"student": 95
},
"created_at": "2026-02-06T12:00:00"
}
See Also
Training Data - User guide
Interaction API - Source of training data