aboutsummaryrefslogtreecommitdiff
path: root/src/subtitles/extraction/whisper.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/subtitles/extraction/whisper.rs')
-rw-r--r--src/subtitles/extraction/whisper.rs139
1 files changed, 139 insertions, 0 deletions
diff --git a/src/subtitles/extraction/whisper.rs b/src/subtitles/extraction/whisper.rs
new file mode 100644
index 0000000..bd6fba7
--- /dev/null
+++ b/src/subtitles/extraction/whisper.rs
@@ -0,0 +1,139 @@
+use std::{
+ io::{self, BufRead, BufReader},
+ net::{TcpListener, TcpStream},
+ sync::mpsc,
+};
+
+use anyhow::Context;
+use ffmpeg::{filter, frame};
+use serde::Deserialize;
+
+use crate::{
+ subtitles::extraction::*,
+ subtitles::{StreamIndex, SubtitleCue},
+};
+
+#[derive(Debug, Deserialize)]
+struct WhisperCue { // one transcript cue, deserialized from a JSON line read off the whisper TCP socket
+ start: u64, // cue start; passed to `gst::ClockTime::from_mseconds`, i.e. treated as milliseconds
+ end: u64, // cue end; passed to `gst::ClockTime::from_mseconds`, i.e. treated as milliseconds
+ text: String, // cue text; moved into `SubtitleCue::text` unchanged
+}
+
+/// Transcribes an audio stream with FFmpeg's `whisper` filter and reports
+/// every generated cue as `SubtitleExtractorOutput::NewCue` for `stream_ix`.
+///
+/// Audio packets are taken from `packet_rx`, decoded with a decoder created
+/// from `context` and pushed through an `abuffer -> whisper -> abuffersink`
+/// filter graph. The function blocks until `packet_rx` is disconnected.
+///
+/// # Errors
+///
+/// Returns an error if the TCP listener cannot be set up, the audio decoder
+/// cannot be created, a required FFmpeg filter is missing, or the filter
+/// graph cannot be built. Errors while handling an individual packet are
+/// logged and do not abort the loop.
+pub fn generate_whisper_subtitles(
+    // stream index to use when storing generated subtitles, this index
+    // already has to be in TRACKS when this function is called!
+    stream_ix: StreamIndex,
+    context: ffmpeg::codec::Context,
+    time_base: ffmpeg::Rational,
+    packet_rx: mpsc::Receiver<ffmpeg::Packet>,
+    sender: ComponentSender<SubtitleExtractor>,
+) -> anyhow::Result<()> {
+    // FIXME: developer-local absolute path; this has to become configurable
+    // before the feature can work on any other machine.
+    const WHISPER_MODEL_PATH: &str = "/Users/malte/repos/lleap/whisper-models/ggml-large-v3.bin";
+    // Value of the whisper filter's `queue` option.
+    // NOTE(review): presumably the amount of audio (in seconds) buffered
+    // before transcription runs - confirm against the FFmpeg filter docs.
+    const WHISPER_QUEUE: u32 = 30;
+
+    // FFmpeg's whisper filter will send the generated subtitles to us as JSON
+    // objects over a TCP socket. This is the best solution I could find
+    // because we need to use one of the protocols in
+    // https://ffmpeg.org/ffmpeg-protocols.html, and TCP is the only one on the
+    // list which is portable and supports non-blocking IO in Rust.
+    // Port 0 lets the OS pick a free port; it is read back below.
+    let tcp_listener = TcpListener::bind("127.0.0.1:0")?;
+
+    let mut decoder = context
+        .decoder()
+        .audio()
+        .with_context(|| format!("error creating audio decoder for stream {}", stream_ix))?;
+
+    let mut filter = filter::Graph::new();
+
+    let abuffer_args = format!(
+        "time_base={}:sample_rate={}:sample_fmt={}:channel_layout=0x{:x}",
+        time_base,
+        decoder.rate(),
+        decoder.format().name(),
+        decoder.channel_layout().bits()
+    );
+
+    let whisper_args = format!(
+        "model={}:queue={}:destination=tcp\\\\://127.0.0.1\\\\:{}:format=json",
+        WHISPER_MODEL_PATH,
+        WHISPER_QUEUE,
+        tcp_listener.local_addr()?.port()
+    );
+    let filter_spec = format!("[src] whisper={} [sink]", whisper_args);
+
+    // A missing filter depends on how FFmpeg was built, so report it as an
+    // error instead of panicking.
+    let abuffer = filter::find("abuffer").context("FFmpeg filter `abuffer` not found")?;
+    let abuffersink =
+        filter::find("abuffersink").context("FFmpeg filter `abuffersink` not found")?;
+
+    filter.add(&abuffer, "src", &abuffer_args)?;
+    filter.add(&abuffersink, "sink", "")?;
+    filter
+        .output("src", 0)?
+        .input("sink", 0)?
+        .parse(&filter_spec)?;
+    filter.validate()?;
+
+    let mut source_ctx = filter
+        .get("src")
+        .context("filter graph has no `src` node")?;
+    let mut sink_ctx = filter
+        .get("sink")
+        .context("filter graph has no `sink` node")?;
+
+    // NOTE(review): this blocks until the whisper filter has connected to the
+    // listener; presumably that happens while the graph is set up above.
+    let (tcp_stream, _) = tcp_listener.accept()?;
+    // Non-blocking so that polling for transcript lines never stalls decoding.
+    tcp_stream.set_nonblocking(true)?;
+
+    let mut transcript_reader = BufReader::new(tcp_stream);
+    // Shared across all `handle_packet` calls.
+    let mut line_buf = String::new();
+
+    // Runs until every sender of the packet channel has been dropped.
+    for packet in packet_rx.iter() {
+        if let Err(e) = handle_packet(
+            stream_ix,
+            &sender,
+            &mut decoder,
+            source_ctx.source(),
+            sink_ctx.sink(),
+            &mut transcript_reader,
+            &mut line_buf,
+            packet,
+        ) {
+            log::error!("error handling audio packet: {}", e);
+        }
+    }
+
+    // NOTE(review): neither the decoder nor the filter graph is flushed here,
+    // so audio still queued inside the whisper filter when the channel closes
+    // is probably never transcribed - confirm and flush if needed.
+    Ok(())
+}
+
+// TODO: can we do this without passing all the arguments? this is kinda ugly
+/// Decodes one audio packet, feeds the decoded frames into the filter graph
+/// and forwards every transcript cue that is currently available on the
+/// (non-blocking) transcript socket.
+///
+/// `line_buf` must be the same buffer on every call: a line that has only
+/// partially arrived stays in it until the rest shows up on a later call.
+///
+/// # Errors
+///
+/// Returns an error if decoding or feeding the filter fails, if reading from
+/// the socket fails with anything other than `WouldBlock`, or if a received
+/// line is not a valid `WhisperCue` JSON object.
+///
+/// # Panics
+///
+/// Panics if the cue cannot be delivered through `sender`.
+fn handle_packet(
+    stream_ix: StreamIndex,
+    sender: &ComponentSender<SubtitleExtractor>,
+    decoder: &mut ffmpeg::decoder::Audio,
+    mut source: filter::Source,
+    mut sink: filter::Sink,
+    transcript_reader: &mut BufReader<TcpStream>,
+    line_buf: &mut String,
+    packet: ffmpeg::Packet,
+) -> anyhow::Result<()> {
+    decoder.send_packet(&packet)?;
+
+    let mut decoded = frame::Audio::empty();
+    while decoder.receive_frame(&mut decoded).is_ok() {
+        source.add(&decoded)?;
+    }
+
+    // The filtered audio itself is not needed; the sink is only drained so
+    // that frames do not pile up in the graph.
+    let mut out_frame = frame::Audio::empty();
+    while sink.frame(&mut out_frame).is_ok() {}
+
+    // Drain every line that is currently available instead of at most one
+    // line per packet, so cues do not lag behind the audio.
+    loop {
+        match transcript_reader.read_line(line_buf) {
+            Ok(_) => {}
+            // No complete line yet. `read_line` may already have appended the
+            // beginning of a line to `line_buf`; keep it for the next call.
+            Err(e) if e.kind() == io::ErrorKind::WouldBlock => return Ok(()),
+            Err(e) => {
+                // Whatever is buffered can no longer be completed reliably.
+                line_buf.clear();
+                return Err(e.into());
+            }
+        }
+
+        // `read_line` only returns `Ok` without a trailing newline once the
+        // peer has closed the connection.
+        let reached_eof = !line_buf.ends_with('\n');
+
+        let parsed = {
+            let line = line_buf.trim();
+            if line.is_empty() {
+                None
+            } else {
+                Some(serde_json::from_str::<WhisperCue>(line))
+            }
+        };
+        // The line has been consumed either way; never let it leak into the
+        // next one.
+        line_buf.clear();
+
+        if let Some(parsed) = parsed {
+            let whisper_cue = parsed?;
+
+            let cue = SubtitleCue {
+                text: whisper_cue.text,
+                start_time: gst::ClockTime::from_mseconds(whisper_cue.start),
+                end_time: gst::ClockTime::from_mseconds(whisper_cue.end),
+            };
+
+            sender
+                .output(SubtitleExtractorOutput::NewCue(stream_ix, cue))
+                .unwrap();
+        }
+
+        if reached_eof {
+            return Ok(());
+        }
+    }
+}