aboutsummaryrefslogtreecommitdiff
path: root/src/subtitle_extraction/whisper.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/subtitle_extraction/whisper.rs')
-rw-r--r--src/subtitle_extraction/whisper.rs143
1 files changed, 0 insertions, 143 deletions
diff --git a/src/subtitle_extraction/whisper.rs b/src/subtitle_extraction/whisper.rs
deleted file mode 100644
index ffa2e47..0000000
--- a/src/subtitle_extraction/whisper.rs
+++ /dev/null
@@ -1,143 +0,0 @@
-use std::{
- io::{self, BufRead, BufReader},
- net::{TcpListener, TcpStream},
- sync::mpsc,
-};
-
-use anyhow::Context;
-use ffmpeg::{filter, frame};
-use serde::Deserialize;
-
-use crate::{subtitle_extraction::*, tracks::StreamIndex};
-
-/// A single transcript entry, deserialized from one JSON object received
-/// from FFmpeg's whisper filter.
-#[derive(Deserialize, Debug)]
-struct WhisperCue {
-    /// Start of the cue; converted with `ClockTime::from_mseconds`, i.e.
-    /// treated as milliseconds.
-    start: u64,
-    /// End of the cue, treated as milliseconds like `start`.
-    end: u64,
-    /// Transcribed text of the cue.
-    text: String,
-}
-
-/// Runs the audio packets received on `packet_rx` through an FFmpeg filter
-/// graph (`abuffer` -> `whisper` -> `abuffersink`) and lets `handle_packet`
-/// publish the cues the whisper filter reports back over a local TCP socket.
-///
-/// The function blocks until the sending side of `packet_rx` is dropped and
-/// then returns `Ok(())`. Failures while handling an individual packet are
-/// only logged; failures while setting up the socket, the decoder or the
-/// filter graph are returned to the caller.
-pub fn generate_whisper_subtitles(
-    // stream index to use when storing generated subtitles, this index
-    // already has to be in TRACKS when this function is called!
-    stream_ix: StreamIndex,
-    context: ffmpeg::codec::Context,
-    time_base: ffmpeg::Rational,
-    packet_rx: mpsc::Receiver<ffmpeg::Packet>,
-    sender: ComponentSender<SubtitleExtractor>,
-) -> anyhow::Result<()> {
-    // NOTE(review): absolute path on a single developer machine; this
-    // presumably has to become configurable before it works anywhere else.
-    const MODEL_PATH: &str = "/Users/malte/repos/lleap/whisper-models/ggml-large-v3.bin";
-    // Value passed to the whisper filter's `queue` option.
-    const QUEUE: i32 = 30;
-
-    // The whisper filter delivers its output as JSON objects over a TCP
-    // socket. The destination has to be one of the protocols listed in
-    // https://ffmpeg.org/ffmpeg-protocols.html, and TCP is the only one of
-    // those which is portable and supports non-blocking IO in Rust.
-    // Port 0 lets the OS choose a free port on the loopback interface.
-    let listener = TcpListener::bind("127.0.0.1:0")?;
-
-    let mut audio_decoder = context
-        .decoder()
-        .audio()
-        .with_context(|| format!("error creating subtitle decoder for stream {}", stream_ix))?;
-
-    let mut graph = filter::Graph::new();
-
-    // Describe the decoded audio to the graph's input buffer.
-    let src_args = format!(
-        "time_base={}:sample_rate={}:sample_fmt={}:channel_layout=0x{:x}",
-        time_base,
-        audio_decoder.rate(),
-        audio_decoder.format().name(),
-        audio_decoder.channel_layout().bits()
-    );
-
-    // The colons inside the destination URL are backslash-escaped because
-    // `:` also separates the filter's options.
-    let listen_port = listener.local_addr()?.port();
-    let whisper_options = format!(
-        "model={}:queue={}:destination=tcp\\\\://127.0.0.1\\\\:{}:format=json",
-        MODEL_PATH, QUEUE, listen_port
-    );
-    let graph_spec = format!("[src] whisper={} [sink]", whisper_options);
-
-    let abuffer = filter::find("abuffer").unwrap();
-    graph.add(&abuffer, "src", &src_args)?;
-    let abuffersink = filter::find("abuffersink").unwrap();
-    graph.add(&abuffersink, "sink", "")?;
-
-    let parser = graph.output("src", 0)?;
-    let parser = parser.input("sink", 0)?;
-    parser.parse(&graph_spec)?;
-    graph.validate()?;
-
-    let mut src_ctx = graph.get("src").unwrap();
-    let mut sink_ctx = graph.get("sink").unwrap();
-
-    // Blocks until a peer (presumably the whisper filter) connects. The
-    // socket is then made non-blocking so that polling it for transcript
-    // lines never stalls packet processing.
-    let (stream, _peer_addr) = listener.accept()?;
-    stream.set_nonblocking(true)?;
-
-    let mut transcript_reader = BufReader::new(stream);
-    let mut line_buf = String::new();
-
-    // NOTE(review): once the channel closes, neither the decoder nor the
-    // graph is flushed and the socket is not read again, so cues for the
-    // last queued audio are presumably lost - confirm.
-    for packet in packet_rx.iter() {
-        let outcome = handle_packet(
-            stream_ix,
-            &sender,
-            &mut audio_decoder,
-            src_ctx.source(),
-            sink_ctx.sink(),
-            &mut transcript_reader,
-            &mut line_buf,
-            packet,
-        );
-
-        if let Err(e) = outcome {
-            log::error!("error handling audio packet: {}", e);
-        }
-    }
-
-    Ok(())
-}
-
-// TODO: can we do this without passing all the arguments? this is kinda ugly
-/// Decodes one audio packet, pushes the resulting frames through the whisper
-/// filter graph and publishes every transcript line that is currently
-/// available on `transcript_reader`.
-///
-/// `transcript_reader` must wrap a non-blocking socket: running out of data
-/// (`WouldBlock`) is the normal way for this function to finish. `line_buf`
-/// carries a partially received line over to the next call, so the caller
-/// has to pass the same buffer every time and must not clear it.
-///
-/// # Errors
-///
-/// Fails if decoding or filtering fails, if the socket reports an error other
-/// than `WouldBlock`, if a transcript line cannot be parsed as a
-/// `WhisperCue`, if `stream_ix` has no entry in `SUBTITLE_TRACKS`, or if the
-/// cue cannot be delivered through `sender`.
-fn handle_packet(
-    stream_ix: StreamIndex,
-    sender: &ComponentSender<SubtitleExtractor>,
-    decoder: &mut ffmpeg::decoder::Audio,
-    mut source: filter::Source,
-    mut sink: filter::Sink,
-    transcript_reader: &mut BufReader<TcpStream>,
-    line_buf: &mut String,
-    packet: ffmpeg::Packet,
-) -> anyhow::Result<()> {
-    decoder.send_packet(&packet)?;
-
-    let mut decoded = frame::Audio::empty();
-    while decoder.receive_frame(&mut decoded).is_ok() {
-        source.add(&decoded)?;
-    }
-
-    // The filtered audio itself is not used; the sink is drained, presumably
-    // so that frames do not accumulate inside the graph.
-    let mut out_frame = frame::Audio::empty();
-    while sink.frame(&mut out_frame).is_ok() {}
-
-    // Read every line that is ready instead of a single one per packet, so
-    // cues do not lag behind when the filter emits several at once.
-    loop {
-        match transcript_reader.read_line(line_buf) {
-            Ok(_) => {}
-            // No complete line yet. `read_line` leaves the bytes it already
-            // received in `line_buf`; they are kept so that the next call can
-            // complete the line instead of parsing a truncated remainder.
-            // NOTE(review): if a read stops in the middle of a multi-byte
-            // character, `read_line` appears to drop the bytes of that call;
-            // fixing that would need a byte buffer and `read_until`.
-            Err(e) if e.kind() == io::ErrorKind::WouldBlock => return Ok(()),
-            Err(e) => {
-                line_buf.clear();
-                return Err(e.into());
-            }
-        }
-
-        // A result without a trailing newline means the peer closed the
-        // connection: parse what is left (if anything) and stop reading.
-        let reached_eof = !line_buf.ends_with('\n');
-        let parsed = match line_buf.trim() {
-            "" => None,
-            line => Some(serde_json::from_str::<WhisperCue>(line)),
-        };
-        // Clear before propagating any parse error so that a malformed line
-        // cannot corrupt the lines that follow it.
-        line_buf.clear();
-
-        if let Some(parsed) = parsed {
-            let whisper_cue = parsed?;
-
-            let cue = SubtitleCue {
-                start: gst::ClockTime::from_mseconds(whisper_cue.start),
-                end: gst::ClockTime::from_mseconds(whisper_cue.end),
-                text: whisper_cue.text,
-            };
-
-            // TODO deduplicate this vs. the code in embedded.rs
-            SUBTITLE_TRACKS
-                .write()
-                .get_mut(&stream_ix)
-                .with_context(|| format!("no subtitle track registered for stream {}", stream_ix))?
-                .cues
-                .push(cue.clone());
-            sender
-                .output(SubtitleExtractorOutput::NewCue(stream_ix, cue))
-                .map_err(|_| anyhow::anyhow!("error sending cue for stream {}", stream_ix))?;
-        }
-
-        if reached_eof {
-            return Ok(());
-        }
-    }
-}