The bottleneck for most content creators is post-production: editing, captioning, making thumbnails, and posting to multiple accounts, which together take 3-4 hours per video. An AI pipeline cuts this down to about 20 minutes of active work.
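Here’s the full setup. The code below covers the ingest, transcription, clip-extraction, metadata-generation and export stages: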
Pipeline Overview
- Ingest: Transfer footage + extract metadata
- Transcribe: Use Whisper for transcripts & timestamps
- Scene Detect: Shot detection to segment videos
- Clip Extract: Auto-find viral moments
- Audio Mix: Denoise, normalize audio, add background music if needed
- Caption Burn: Add styled captions to video
- Thumbnail Gen: Generate keyframes & AI text overlays for thumbnails
- Metadata Gen: Write platform-specific titles & hashtags with GPT
- Export: Optimize videos for 9:16, 16:9, and 1:1 formats
- Queue Post: Schedule or post to all accounts
Ingest & Metadata Extraction
import json
import shutil
import subprocess
from datetime import datetime
from pathlib import Path
def ingest_footage(source_path):
    """Copy a recording into a fresh job directory and record its metadata.

    Creates ``./jobs/job_<YYYYmmdd_HHMMSS>/`` with one sub-directory per
    pipeline stage, copies ``source_path`` into ``input/``, probes the copy
    with ``ffprobe`` and writes the job record to ``meta.json``.

    Args:
        source_path: Path to the source video file.

    Returns:
        The job record as a dict with ``job_id``, ``input_path``, ``status``
        and ``media_info`` (the parsed ffprobe JSON, or ``None`` when ffprobe
        is not installed or could not read the file).

    Raises:
        FileNotFoundError: If ``source_path`` is not an existing file.
    """
    source = Path(source_path)
    # Fail before creating anything so a typo does not leave an empty job dir.
    if not source.is_file():
        raise FileNotFoundError(f"Source footage not found: {source}")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    job_id = f"job_{timestamp}"
    job_dir = Path(f"./jobs/{job_id}/")
    for subdir in ["input", "transcripts", "clips", "audio", "captions",
                   "thumbnails", "export"]:
        (job_dir / subdir).mkdir(parents=True, exist_ok=True)

    input_path = job_dir / "input" / source.name
    shutil.copy2(source, input_path)

    # Probing is best-effort: the stream/format details are informational,
    # so a missing ffprobe binary or an unreadable file must not abort ingest.
    media_info = None
    try:
        probe = subprocess.run(
            ["ffprobe", "-v", "quiet", "-print_format", "json",
             "-show_streams", "-show_format", str(input_path)],
            capture_output=True, text=True
        )
    except FileNotFoundError:
        probe = None
    if probe is not None and probe.returncode == 0:
        try:
            media_info = json.loads(probe.stdout)
        except json.JSONDecodeError:
            media_info = None

    job_data = {
        "job_id": job_id,
        "input_path": str(input_path),
        "status": "ingested",
        "media_info": media_info,
    }
    with open(job_dir / "meta.json", 'w', encoding="utf-8") as f:
        json.dump(job_data, f)

    print(f"✓ Ingested: {job_id} ({job_data['input_path']})")
    # Later stages read job_id / input_path from this record.
    return job_data
Transcription
from faster_whisper import WhisperModel
def transcribe_footage(input_path, model=None, output_path=None):
    """Transcribe a video with Whisper and save the transcript as JSON.

    Args:
        input_path: Path to the media file. When ``output_path`` is not
            given, the file is expected to live in ``<job_dir>/input/`` and
            the transcript is written to
            ``<job_dir>/transcripts/transcript.json``.
        model: Optional pre-loaded model exposing ``transcribe()``. Pass one
            to reuse it across jobs or to run on different hardware. When
            ``None``, a ``medium`` model is loaded on CUDA in float16.
        output_path: Optional explicit destination for the transcript JSON.

    Returns:
        The transcript dict: ``language``, ``duration``, ``text`` and
        ``segments`` (each with ``start``, ``end``, ``text`` and ``words``).
    """
    input_path = Path(input_path)
    if model is None:
        model = WhisperModel("medium", device="cuda", compute_type="float16")

    # Word-level timings are only populated when explicitly requested.
    segments, info = model.transcribe(str(input_path), word_timestamps=True)

    # ``segments`` is a one-shot generator: walk it exactly once and build
    # everything else from the materialized records. Iterating it a second
    # time would yield nothing.
    segment_records = [
        {"start": s.start, "end": s.end,
         "text": s.text.strip(),
         "words": [{"word": w.word, "start": w.start, "end": w.end}
                   for w in (s.words or [])]}
        for s in segments
    ]
    transcript_data = {
        "language": info.language,
        "duration": info.duration,
        "text": " ".join(record["text"] for record in segment_records),
        "segments": segment_records,
    }

    if output_path is None:
        # <job_dir>/input/<file>  ->  <job_dir>/transcripts/transcript.json
        output_path = input_path.parent.parent / "transcripts" / "transcript.json"
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(transcript_data, f)

    return transcript_data
Clip Extraction
import openai
from datetime import timedelta
# Module-level OpenAI client shared by the GPT-backed stages below.
# NOTE(review): presumably picks up credentials from the environment
# (e.g. OPENAI_API_KEY) since none are passed here — confirm.
client = openai.OpenAI()
def extract_viral_clips(job_id, max_clips=None):
    """Ask GPT for the best moments of a transcript and cut them with ffmpeg.

    Reads ``transcripts/transcript.json`` and ``meta.json`` from the job
    directory, sends the timestamped transcript to the model, cuts each
    suggested clip out of the ingested video into ``clips/clip_NNN.mp4`` and
    writes the clip records to ``clips/scores.json``.

    Args:
        job_id: Identifier of a job under ``./jobs/``.
        max_clips: Optional upper bound on the number of clips to cut.
            ``None`` keeps every valid clip the model suggests.

    Returns:
        A dict with ``job_id`` and ``clips``. Each clip is the model's record
        plus a ``file`` key holding the path of the cut clip.

    Raises:
        ValueError: If ``max_clips`` is negative.
        FileNotFoundError: If the transcript or ``meta.json`` is missing.
        subprocess.CalledProcessError: If ffmpeg fails to cut a clip.
    """
    if max_clips is not None and max_clips < 0:
        raise ValueError(f"max_clips must be >= 0, got {max_clips}")

    job_dir = Path(f"./jobs/{job_id}")
    transcript_path = job_dir / "transcripts" / "transcript.json"
    with open(transcript_path, encoding="utf-8") as f:
        transcript = json.load(f)
    # The ingested video path is recorded in the job's meta.json.
    with open(job_dir / "meta.json", encoding="utf-8") as f:
        input_path = json.load(f)["input_path"]

    # Give the model timestamps so it can answer with times in seconds.
    timed_lines = "\n".join(
        f"[{seg['start']:.2f}-{seg['end']:.2f}] {seg['text']}"
        for seg in transcript.get("segments", [])
    )
    limit_note = (
        f"Return at most {max_clips} clips." if max_clips is not None else ""
    )
    prompt = f"""
Identify the best clips from this transcript for social media.
Return a JSON object with a "clips" array. Each clip must have numeric
"start_time" and "end_time" fields, in seconds from the start of the video.
{limit_note}

Transcript (each line is "[start-end] text", times in seconds):
{timed_lines}
"""
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0.3
    )
    suggested = json.loads(resp.choices[0].message.content).get("clips", [])

    # Model output is untrusted: keep only clips with a usable time range.
    valid_clips = []
    for clip in suggested:
        try:
            start = float(clip["start_time"])
            end = float(clip["end_time"])
        except (KeyError, TypeError, ValueError):
            continue
        if start < 0 or end <= start:
            continue
        valid_clips.append((clip, start, end))
    if max_clips is not None:
        valid_clips = valid_clips[:max_clips]

    clips_dir = job_dir / "clips"
    clips_dir.mkdir(parents=True, exist_ok=True)
    clips_data = []
    for i, (clip, start, end) in enumerate(valid_clips):
        output_path = f"./jobs/{job_id}/clips/clip_{i+1:03d}.mp4"
        subprocess.run([
            "ffmpeg", "-y",
            "-ss", str(start),
            "-i", str(input_path),
            "-t", str(end - start),
            "-c:v", "libx264", "-crf", "18",
            "-c:a", "aac", "-b:a", "128k",
            output_path
        ], check=True)
        clips_data.append({**clip, "file": output_path})

    with open(clips_dir / "scores.json", 'w', encoding="utf-8") as f:
        json.dump(clips_data, f)

    return {"job_id": job_id, "clips": clips_data}
Metadata Generation
def generate_post_metadata(job_id, brand=None):
    """Generate per-platform post metadata from a job's transcript.

    Reads ``transcripts/transcript.json`` from the job directory, asks the
    model for platform-specific metadata based on the transcript text and
    writes the result to ``post_metadata.json``.

    Args:
        job_id: Identifier of a job under ``./jobs/``.
        brand: Optional brand name mentioned to the model so it can match the
            brand's voice. Omitted from the prompt when ``None``.

    Returns:
        A dict with ``job_id`` and ``metadata`` (the parsed model response).

    Raises:
        FileNotFoundError: If the transcript is missing.
    """
    transcript_path = Path(f"./jobs/{job_id}/transcripts/transcript.json")
    with open(transcript_path, encoding="utf-8") as f:
        transcript = json.load(f)

    brand_note = f"The content is published by the brand \"{brand}\"." if brand else ""
    prompt = f"""
Generate metadata for YouTube, Instagram, TikTok, and Twitter.
Return JSON with platform-specific data.
{brand_note}

Transcript:
{transcript.get("text", "")}
"""
    resp = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0.6
    )
    metadata = json.loads(resp.choices[0].message.content)

    with open(f"./jobs/{job_id}/post_metadata.json", 'w', encoding="utf-8") as f:
        json.dump(metadata, f)

    return {"job_id": job_id, "metadata": metadata}
Export & Queue
def export_for_platforms(job_id, clip_index=0, exports=None):
    """Re-encode one extracted clip into each platform aspect ratio.

    Args:
        job_id: Identifier of a job under ``./jobs/``.
        clip_index: Index into the sorted ``clips/clip_*.mp4`` files of the
            clip to export. Defaults to the first clip.
        exports: Optional mapping of output filename to a list of extra
            ffmpeg arguments (typically ``["-vf", "<filter>"]``). When
            ``None``, 9:16, 16:9 and 1:1 presets are used; each scales the
            clip to cover the target frame and crops the overflow.

    Returns:
        A dict with ``job_id`` and ``exports`` (paths of the written files).

    Raises:
        FileNotFoundError: If the job has no extracted clips.
        IndexError: If ``clip_index`` is out of range.
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    if exports is None:
        exports = {
            "vertical_9x16.mp4": [
                "-vf",
                "scale=1080:1920:force_original_aspect_ratio=increase,"
                "crop=1080:1920",
            ],
            "horizontal_16x9.mp4": [
                "-vf",
                "scale=1920:1080:force_original_aspect_ratio=increase,"
                "crop=1920:1080",
            ],
            "square_1x1.mp4": [
                "-vf",
                "scale=1080:1080:force_original_aspect_ratio=increase,"
                "crop=1080:1080",
            ],
        }

    job_dir = Path(f"./jobs/{job_id}")
    clip_files = sorted(job_dir.glob("clips/clip_*.mp4"))
    if not clip_files:
        raise FileNotFoundError(f"No clips found for job {job_id}")
    try:
        clip_file = clip_files[clip_index]
    except IndexError:
        raise IndexError(
            f"clip_index {clip_index} out of range for {len(clip_files)} clip(s)"
        ) from None

    export_dir = job_dir / "export"
    export_dir.mkdir(parents=True, exist_ok=True)

    written = []
    for filename, vf_args in exports.items():
        output_path = export_dir / filename
        result = subprocess.run([
            "ffmpeg", "-y",
            "-i", str(clip_file),
            *vf_args,
            "-c:v", "libx264", "-crf", "20", "-preset", "fast",
            "-c:a", "aac", "-b:a", "128k",
            str(output_path)
        ], capture_output=True, text=True)
        # Output is captured, so surface ffmpeg's stderr instead of failing
        # silently and leaving a missing or truncated export behind.
        if result.returncode != 0:
            raise RuntimeError(
                f"ffmpeg failed for {filename}: {result.stderr.strip()}"
            )
        written.append(str(output_path))

    return {"job_id": job_id, "exports": written}
Full Pipeline
def run_full_pipeline(source_path, brand="nepa_ai"):
    """Run every pipeline stage on one recording.

    Ingests the footage, transcribes it, extracts up to three clips,
    generates post metadata for ``brand`` and exports the first clip.

    Args:
        source_path: Path to the recording to process.
        brand: Brand name forwarded to metadata generation.

    Returns:
        The job record from ingest, extended with whatever the later stages
        returned under ``clips``, ``post_metadata`` and ``exports``.
    """
    job = ingest_footage(source_path)
    # Keep the job id from ingest: the later stages are keyed by it, and
    # their return values must not replace the job record itself.
    job_id = job["job_id"]

    transcribe_footage(job["input_path"])
    job["clips"] = extract_viral_clips(job_id, max_clips=3)
    job["post_metadata"] = generate_post_metadata(job_id, brand=brand)
    job["exports"] = export_for_platforms(job_id, clip_index=0)

    print(f"\n✅ Pipeline complete: {job['job_id']}")
    return job
# Run it: process one recording through every stage, using the "nepa_ai"
# brand for the generated post metadata.
job = run_full_pipeline("./footage/todays_recording.mp4", brand="nepa_ai")
This pipeline runs in 8-12 minutes for a 30-minute recording on modern hardware with a CUDA-capable GPU (the transcription step loads Whisper on `cuda`). Always review and approve the outputs before posting.
The NEPA AI Content Creator Stack packages everything — ingest, transcribe, clip extraction, captioning, thumbnail generation, metadata writing, and multi-platform posting — into one agent-driven workflow.
→ Get the Content Creator AI Stack at axon.nepa-ai.com



