The AI Agent Content Pipeline: From Raw Footage to Posted — Fully Automated
The full content workflow — film, ingest, transcribe, extract clips, caption, thumbnail, and post — can run automatically with AI agents. Here's the complete pipeline architecture and code.
The bottleneck for most content creators isn't filming — it's everything that comes after. Editing, captioning, thumbnail creation, writing captions for each platform, posting to 5 accounts. It can take 3-4 hours of post-production per video.
An AI agent pipeline collapses this to about 20 minutes of active work: transfer the footage, kick off the pipeline, review the output, approve and post.
Here's the complete architecture.
Pipeline Overview
1. Ingest Raw footage → working directory, metadata extracted
2. Transcribe Whisper → transcript + timestamps + keywords
3. Scene Detect Shot detection → segment boundaries
4. Clip Extract Viral moment detection → top clips ranked by score
5. Audio Mix Denoise + normalize + background music if needed
6. Caption Burn SRT → styled captions burned into video
7. Thumbnail Gen Keyframe selection + AI text overlay → thumbnail.jpg
8. Metadata Gen GPT → platform-specific title, caption, hashtags
9. Export Platform-optimized encodes (9:16, 16:9, 1:1)
10. Queue Post Schedule or immediately post to all accounts
Each stage is a discrete function that reads from and writes to a shared job directory. Stages can run in parallel where dependencies allow.
Job Directory Structure
./jobs/job_20260322_143052/
├── input/
│ └── raw_footage.mp4
├── meta.json ← job config + status tracking
├── transcripts/
│ ├── transcript.json ← full Whisper output with timestamps
│ └── transcript.srt
├── clips/
│ ├── clip_001.mp4 ← extracted highlight clips
│ ├── clip_002.mp4
│ └── scores.json ← virality scores for each clip
├── audio/
│ └── clean_audio.wav
├── captions/
│ └── styled.srt
├── thumbnails/
│ └── thumbnail_001.jpg
├── export/
│ ├── reel_9x16.mp4 ← final exports
│ ├── youtube_16x9.mp4
│ └── square_1x1.mp4
└── post_metadata.json ← captions, titles, hashtags per platform
Stage 1: Ingest and Metadata Extraction
import subprocess
import json
import shutil
from pathlib import Path
from datetime import datetime
def ingest_footage(source_path: str, jobs_dir: str = "./jobs") -> dict:
    """Copy footage into a fresh job directory and extract video metadata.

    Creates the full job subdirectory layout, copies the source file into
    ``input/``, probes it with ffprobe, and persists the resulting job
    config to ``<job_dir>/meta.json``.

    Args:
        source_path: Path to the raw footage file.
        jobs_dir: Root directory under which job folders are created.

    Returns:
        Job config dict (also written to ``meta.json``).

    Raises:
        RuntimeError: If ffprobe fails or produces unparseable output.
    """
    # Create job directory; second-resolution timestamp doubles as the job id.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    job_id = f"job_{timestamp}"
    job_dir = Path(jobs_dir) / job_id
    for subdir in ("input", "transcripts", "clips", "audio", "captions",
                   "thumbnails", "export"):
        (job_dir / subdir).mkdir(parents=True)
    # Copy input file into the job so later stages never touch the original.
    input_path = job_dir / "input" / Path(source_path).name
    shutil.copy2(source_path, input_path)
    # Extract metadata with ffprobe.
    probe = subprocess.run([
        "ffprobe", "-v", "quiet", "-print_format", "json",
        "-show_streams", "-show_format", str(input_path)
    ], capture_output=True, text=True)
    # Previously an ffprobe failure surfaced as a cryptic JSONDecodeError on
    # empty stdout; fail loudly with the real cause instead.
    if probe.returncode != 0:
        raise RuntimeError(
            f"ffprobe failed for {input_path}: {probe.stderr.strip() or 'unknown error'}"
        )
    try:
        probe_data = json.loads(probe.stdout)
    except json.JSONDecodeError as err:
        raise RuntimeError(f"ffprobe returned unparseable output for {input_path}") from err
    fmt = probe_data.get("format", {})
    job = {
        "job_id": job_id,
        "job_dir": str(job_dir),
        "input_path": str(input_path),
        "duration": float(fmt.get("duration", 0)),
        "file_size_mb": int(fmt.get("size", 0)) / 1_000_000,
        "created_at": datetime.now().isoformat(),
        "status": "ingested",
        "stages_complete": [],
    }
    (job_dir / "meta.json").write_text(json.dumps(job, indent=2))
    print(f"✓ Ingested: {job_id} ({job['duration']:.0f}s, {job['file_size_mb']:.1f}MB)")
    return job
Stage 2: Transcription
from faster_whisper import WhisperModel
_whisper_model = None  # process-wide cache so the model is loaded only once


def get_whisper():
    """Return the shared WhisperModel instance, loading it lazily on first use."""
    global _whisper_model
    if _whisper_model is not None:
        return _whisper_model
    _whisper_model = WhisperModel("medium", device="cuda", compute_type="float16")
    return _whisper_model
def transcribe_footage(job: dict, language: str = "en") -> dict:
    """Transcribe the job's input footage with faster-whisper.

    Writes ``transcript.json`` (full text, segments, word timestamps) and
    ``transcript.srt`` into the job's ``transcripts/`` directory, then
    records the stage as complete on the job.

    Args:
        job: Job dict from ``ingest_footage`` (needs ``input_path`` and
            ``job_dir``).
        language: ISO language code passed to Whisper. Previously hard-coded
            to ``"en"``; now a backward-compatible parameter.

    Returns:
        The same job dict, updated with ``transcript_path``.
    """
    model = get_whisper()
    input_path = job["input_path"]
    transcript_dir = Path(job["job_dir"]) / "transcripts"
    segments, info = model.transcribe(
        input_path,
        language=language,
        word_timestamps=True,
        beam_size=5,
    )
    # Materialize once: the segments generator can only be consumed a single time.
    all_segments = list(segments)
    full_text = " ".join(s.text for s in all_segments).strip()
    # Build structured transcript.
    transcript_data = {
        "language": info.language,
        "duration": info.duration,
        "text": full_text,
        "segments": [
            {
                "start": s.start,
                "end": s.end,
                "text": s.text.strip(),
                "words": [
                    {"word": w.word, "start": w.start, "end": w.end}
                    for w in (s.words or [])
                ],
            }
            for s in all_segments
        ],
    }
    (transcript_dir / "transcript.json").write_text(json.dumps(transcript_data, indent=2))
    # SRT format: sequence number, "start --> end" time range, caption text.
    srt_lines = [
        f"{i}\n{_fmt_ts(seg.start)} --> {_fmt_ts(seg.end)}\n{seg.text.strip()}\n"
        for i, seg in enumerate(all_segments, 1)
    ]
    (transcript_dir / "transcript.srt").write_text("\n".join(srt_lines))
    job["transcript_path"] = str(transcript_dir / "transcript.json")
    job["stages_complete"].append("transcribe")
    return job
def _fmt_ts(seconds: float) -> str:
from datetime import timedelta
td = timedelta(seconds=seconds)
h, rem = divmod(int(td.total_seconds()), 3600)
m, s = divmod(rem, 60)
ms = int((td.total_seconds() % 1) * 1000)
return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
Stage 3: Viral Clip Extraction
import openai
import re  # NOTE(review): `re` is unused in the visible code — confirm before removing
client = openai.OpenAI()  # module-level client; reads OPENAI_API_KEY from the environment at import time
def extract_viral_clips(job: dict, max_clips: int = 5) -> dict:
    """Use GPT to identify the best clips from the transcript, then cut them.

    Asks the model for timestamp ranges + virality scores, validates the
    response, and extracts each valid clip with ffmpeg into ``clips/``.

    Args:
        job: Job dict with ``transcript_path``, ``job_dir``, ``input_path``.
        max_clips: Maximum number of clips to request from the model.

    Returns:
        The same job dict, updated with ``clips`` (list of clip dicts, each
        gaining a ``file_path`` key).
    """
    with open(job["transcript_path"]) as f:
        transcript = json.load(f)
    # Build transcript with timestamps for GPT.
    timestamped = "\n".join(
        f"[{seg['start']:.1f}s-{seg['end']:.1f}s]: {seg['text']}"
        for seg in transcript["segments"]
    )
    prompt = f"""
This is a video transcript with timestamps. Identify the {max_clips} best clips for social media.
Transcript:
{timestamped[:4000]}
For each clip, provide:
- start_time: float (seconds)
- end_time: float (seconds, clip should be 15-60 seconds max)
- hook: the opening line that makes people stop scrolling
- virality_score: 1-10 (10 = extremely shareable)
- reason: why this clip works
Prioritize: strong hooks, emotional moments, surprising facts, how-to segments.
Return as JSON array under key "clips".
"""
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0.3,
    )
    clips_data = json.loads(resp.choices[0].message.content).get("clips", [])
    clips_dir = Path(job["job_dir"]) / "clips"
    # Model output is untrusted JSON: previously a missing or non-numeric
    # start_time/end_time crashed the stage with a KeyError/TypeError.
    # Validate each entry and skip malformed ones instead.
    valid_clips = []
    for clip in clips_data:
        try:
            start = float(clip["start_time"])
            end = float(clip["end_time"])
        except (KeyError, TypeError, ValueError):
            continue
        if end <= start:
            continue
        output_path = clips_dir / f"clip_{len(valid_clips) + 1:03d}.mp4"
        # -ss before -i seeks on the input for speed; re-encode for clean cuts.
        subprocess.run([
            "ffmpeg", "-y",
            "-ss", str(start),
            "-i", job["input_path"],
            "-t", str(end - start),
            "-c:v", "libx264", "-crf", "18",
            "-c:a", "aac", "-b:a", "128k",
            str(output_path)
        ], capture_output=True)
        clip["file_path"] = str(output_path)
        valid_clips.append(clip)
    (clips_dir / "scores.json").write_text(json.dumps(valid_clips, indent=2))
    job["clips"] = valid_clips
    job["stages_complete"].append("clip_extract")
    return job
Stage 4: Platform Metadata Generation
def generate_post_metadata(job: dict, brand: str = "nepa_ai") -> dict:
    """Write platform-specific titles, captions, and hashtags for the job.

    Summarizes the transcript, asks GPT for per-platform metadata, and
    saves the result to ``post_metadata.json`` in the job directory.
    """
    transcript_doc = json.loads(Path(job["transcript_path"]).read_text())
    full_text = transcript_doc["text"][:1500]
    prompt = f"""
Brand: {brand}
Video transcript summary: {full_text}
Generate post metadata for each platform. Return JSON with keys:
- youtube: {{title, description (500 words), tags: list}}
- instagram: {{caption (2200 chars max), hashtags: list of 30}}
- tiktok: {{caption (150 chars max), hashtags: list of 10}}
- twitter: {{tweet (280 chars), hashtags: list of 5}}
Tone: professional but conversational. Include a CTA on each platform.
For YouTube, include timestamps based on the transcript.
"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0.6,
    )
    parsed = json.loads(response.choices[0].message.content)
    out_path = Path(job["job_dir"]) / "post_metadata.json"
    out_path.write_text(json.dumps(parsed, indent=2))
    job["post_metadata_path"] = str(out_path)
    job["stages_complete"].append("metadata")
    return job
Stage 5: Export and Queue
def export_for_platforms(job: dict, clip_index: int = 0) -> dict:
    """Export one extracted clip in platform-optimized formats.

    Produces 9:16 (reels), 16:9 (YouTube), and 1:1 (square) encodes in
    the job's ``export/`` directory.

    Args:
        job: Job dict with ``job_dir`` (clips must already be extracted).
        clip_index: Index into the sorted clip files to export.

    Returns:
        The same job dict, updated with ``exports`` (filename -> path).

    Raises:
        ValueError: If no clips exist yet.
        IndexError: If ``clip_index`` is out of range.
        RuntimeError: If an ffmpeg encode fails.
    """
    clip_files = sorted(Path(job["job_dir"]).glob("clips/clip_*.mp4"))
    if not clip_files:
        raise ValueError("No clips found — run extract_viral_clips first")
    # Explicit bounds check: a bare list index would raise an uninformative
    # IndexError (and a negative index would silently wrap around).
    if not 0 <= clip_index < len(clip_files):
        raise IndexError(
            f"clip_index {clip_index} out of range (have {len(clip_files)} clips)"
        )
    source = str(clip_files[clip_index])
    export_dir = Path(job["job_dir"]) / "export"
    # Scale to fit, then pad to the exact target frame (letterbox/pillarbox).
    exports = {
        "reel_9x16.mp4": ["-vf", "scale=1080:1920:force_original_aspect_ratio=decrease,pad=1080:1920:(ow-iw)/2:(oh-ih)/2"],
        "youtube_16x9.mp4": ["-vf", "scale=1920:1080:force_original_aspect_ratio=decrease,pad=1920:1080:(ow-iw)/2:(oh-ih)/2"],
        "square_1x1.mp4": ["-vf", "scale=1080:1080:force_original_aspect_ratio=decrease,pad=1080:1080:(ow-iw)/2:(oh-ih)/2"],
    }
    for filename, vf_args in exports.items():
        output = str(export_dir / filename)
        result = subprocess.run([
            "ffmpeg", "-y", "-i", source,
            *vf_args,
            "-c:v", "libx264", "-crf", "20", "-preset", "fast",
            "-c:a", "aac", "-b:a", "128k",
            output
        ], capture_output=True, text=True)
        # Previously failures were swallowed and the log line printed a
        # literal "(unknown)" placeholder instead of the filename.
        if result.returncode != 0:
            raise RuntimeError(f"ffmpeg export failed for {filename}: {result.stderr[-500:]}")
        print(f"  ✓ Exported: {filename}")
    job["exports"] = {k: str(export_dir / k) for k in exports}
    job["stages_complete"].append("export")
    return job
## Full Pipeline Runner
def run_full_pipeline(footage_path: str, brand: str = "nepa_ai") -> dict:
    """Execute the complete film-to-post pipeline.

    Runs ingest → transcribe → clip extraction → metadata → export in
    sequence, then persists the final job state to ``meta.json``.

    Args:
        footage_path: Path to the raw footage file.
        brand: Brand identifier passed to metadata generation.

    Returns:
        The final job dict with ``status`` set to ``"ready_to_post"``.
    """
    print(f"\n🎬 Starting content pipeline for: {footage_path}")
    job = ingest_footage(footage_path)
    job = transcribe_footage(job)
    job = extract_viral_clips(job, max_clips=3)
    job = generate_post_metadata(job, brand=brand)
    job = export_for_platforms(job, clip_index=0)
    job["status"] = "ready_to_post"
    # pathlib join instead of string concatenation for the meta.json path.
    (Path(job["job_dir"]) / "meta.json").write_text(json.dumps(job, indent=2))
    print(f"\n✅ Pipeline complete: {job['job_id']}")
    print(f"   Clips extracted: {len(job['clips'])}")
    print(f"   Exports: {list(job['exports'].keys())}")
    print(f"   Review at: {job['job_dir']}")
    return job
# Kick it off
# NOTE(review): this call runs at import time — wrap in
# `if __name__ == "__main__":` if this file is ever imported as a module.
job = run_full_pipeline("./footage/todays_recording.mp4", brand="nepa_ai")
The full pipeline runs in about 8-12 minutes for a 30-minute recording on modern hardware. You review the outputs, make any edits, and approve for posting.
The NEPA AI Content Creator Stack packages the complete pipeline — ingest, transcribe, clip extraction, captioning, thumbnail generation, metadata writing, and multi-platform posting — into a single agent-driven workflow.
→ Get the Content Creator AI Stack at /shop/content-creator-ai-stack
Film it. The agent handles the rest.