aboutsummaryrefslogtreecommitdiff
path: root/src/subtitles/extraction/embedded.rs
diff options
context:
space:
mode:
authorMalte Voos <git@mal.tc>2025-12-05 15:35:38 +0100
committerMalte Voos <git@mal.tc>2025-12-05 15:43:58 +0100
commitc347b6133365dcf1b7da4e77890b20d04d6cfba4 (patch)
treec83aac6f7d1e6edc57e607f01e5d3eeee8da4a0e /src/subtitles/extraction/embedded.rs
parent652b1c2a0ce7db4885ebc51f7f09133a43401442 (diff)
downloadlleap-c347b6133365dcf1b7da4e77890b20d04d6cfba4.tar.gz
lleap-c347b6133365dcf1b7da4e77890b20d04d6cfba4.zip
implement machine translation; various fixes and refactorings
Diffstat (limited to 'src/subtitles/extraction/embedded.rs')
-rw-r--r--src/subtitles/extraction/embedded.rs116
1 files changed, 116 insertions, 0 deletions
diff --git a/src/subtitles/extraction/embedded.rs b/src/subtitles/extraction/embedded.rs
new file mode 100644
index 0000000..920f52b
--- /dev/null
+++ b/src/subtitles/extraction/embedded.rs
@@ -0,0 +1,116 @@
+use std::sync::mpsc;
+
+use anyhow::Context;
+
+use crate::{subtitles::SubtitleCue, subtitles::extraction::*};
+
+pub fn extract_embedded_subtitles(
+ // stream index to use when storing extracted subtitles, this index already
+ // has to be in TRACKS when this function is called!
+ stream_ix: StreamIndex,
+ context: ffmpeg::codec::Context,
+ time_base: ffmpeg::Rational,
+ packet_rx: mpsc::Receiver<ffmpeg::Packet>,
+ sender: ComponentSender<SubtitleExtractor>,
+) -> anyhow::Result<()> {
+ let mut decoder = context
+ .decoder()
+ .subtitle()
+ .with_context(|| format!("error creating subtitle decoder for stream {}", stream_ix))?;
+
+ while let Ok(packet) = packet_rx.recv() {
+ let mut subtitle = ffmpeg::Subtitle::new();
+ match decoder.decode(&packet, &mut subtitle) {
+ Ok(true) => {
+ if let Some(cue) = parse_subtitle(&subtitle, &packet, time_base) {
+ sender
+ .output(SubtitleExtractorOutput::NewCue(stream_ix, cue))
+ .unwrap();
+ } else {
+ log::error!("error parsing subtitle at pts {:?}", packet.pts())
+ }
+ }
+ Ok(false) => {
+ log::debug!("got empty (?) subtitle, not sure if this should ever happen");
+ }
+ Err(e) => {
+ log::error!("error decoding subtitle: {:?}", e)
+ }
+ }
+ }
+
+ Ok(())
+}
+
+fn parse_subtitle(
+ subtitle: &ffmpeg::Subtitle,
+ packet: &ffmpeg::Packet,
+ time_base: Rational,
+) -> Option<SubtitleCue> {
+ let pts_to_clock_time = |pts: i64| {
+ let nseconds: i64 =
+ (pts * time_base.numerator() as i64 * 1_000_000_000) / time_base.denominator() as i64;
+ gst::ClockTime::from_nseconds(nseconds as u64)
+ };
+
+ let text = subtitle
+ .rects()
+ .into_iter()
+ .map(|rect| match rect {
+ ffmpeg::subtitle::Rect::Text(text) => text.get().to_string(),
+ ffmpeg::subtitle::Rect::Ass(ass) => {
+ extract_dialogue_text(ass.get()).unwrap_or(String::new())
+ }
+ _ => String::new(),
+ })
+ .collect::<Vec<String>>()
+ .join("\n— ");
+
+ let start_time = pts_to_clock_time(packet.pts()?);
+ let end_time = pts_to_clock_time(packet.pts()? + packet.duration());
+
+ Some(SubtitleCue {
+ text,
+ start_time,
+ end_time,
+ })
+}
+
/// Extracts the displayable text from an ASS dialogue line.
///
/// `dialogue_line` is expected in the form
/// `ReadOrder,Layer,Style,Name,MarginL,MarginR,MarginV,Effect,Text`.
/// Only the `Text` field is used; it may itself contain commas.
///
/// Processing applied to the text:
/// - override blocks such as `{\b1}`, `{\i1}` or `{\c&Hffffff&}` (a `{`
///   immediately followed by `\`, up to the next `}`) are removed;
/// - `\N` becomes a newline, `\n` and `\h` become a space;
/// - every other character, including braces that are not part of an
///   override block, is kept as-is.
///
/// Returns `None` if the line has fewer than nine comma-separated fields.
fn extract_dialogue_text(dialogue_line: &str) -> Option<String> {
    // ASS dialogue format: ReadOrder,Layer,Style,Name,MarginL,MarginR,MarginV,Effect,Text
    // we need the 9th field (Text), so split on comma but only take first 9 splits
    // see also https://github.com/FFmpeg/FFmpeg/blob/a700f0f72d1f073e5adcfbb16f4633850b0ef51c/libavcodec/ass_split.c#L433
    //
    // `nth(8)` rather than `last()`: on a truncated line `last()` would hand
    // back some earlier field (e.g. the style name) as if it were the text.
    let text = dialogue_line.splitn(9, ',').nth(8)?;

    let mut result = String::with_capacity(text.len());
    let mut in_tag = false;
    let mut char_iter = text.chars().peekable();

    while let Some(c) = char_iter.next() {
        if in_tag {
            // Everything inside an override block is dropped, up to and
            // including the closing brace.
            if c == '}' {
                in_tag = false;
            }
            continue;
        }

        match c {
            // Only `{\` opens an override block; a lone `{` is literal text.
            '{' if char_iter.peek() == Some(&'\\') => in_tag = true,
            // process line breaks and hard spaces
            '\\' => match char_iter.peek() {
                Some(&'N') => {
                    char_iter.next();
                    result.push('\n');
                }
                Some(&'n' | &'h') => {
                    char_iter.next();
                    result.push(' ');
                }
                _ => result.push(c),
            },
            // Includes `}` outside of a block, which must not be swallowed.
            _ => result.push(c),
        }
    }

    Some(result)
}