diff --git a/.gitignore b/.gitignore
index af194c4079eeee88273b96ec68b25aa2acbf0e7d..8b600467339944c16699646e590bbfb733354d92 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,19 @@
+# local envs
 timerenv/
+timerenv2/
+srts/
+
+# build folders
 lyrictimer/__pycache__/
-*.srt
 *.egg-info
-srts/
 dist/
-build/
\ No newline at end of file
+build/
+
+# lyric formats
+*.srt
+*.lrc
+*.vtt
+*.ltr
+
+# spleeter temp
+pretrained_models/
\ No newline at end of file
diff --git a/README.md b/README.md
index 02e1f7ce4300adc306c9d1e593749ba6e0e6f601..89f8435ab1da49a75fe395d10f00cb45c627b492 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ After that, call the program, providing the music and lyrics file.
 lyrictimer path/to/song.mp3 path/to/lyrics.txt
 ```
 
-By default, the lyrics with timecodes will be saved as a SubRip (srt) file with the same name as the lyrics file in the current folder.
+By default, the lyrics with timecodes will be saved as a SubRip (srt) file with the same name as the lyrics file in the current folder. The program also supports WebVTT (vtt, useful for YouTube subtitles) and LRC files; the output format can be chosen with `-f`/`--format`.
 
 ### Examples
 - Transcribe `another_tomorrow.mp3` with the lyrics being in `lyrics/another_tomorrow.txt`:
@@ -46,18 +46,20 @@ Lyric Timer can also be configured using arguments. The entire list can be seen
 lyrictimer --model large-v2 -o Music/lyrics/saint.srt Music/Favorites/Saint.mp3 Music/lyrics/saint.txt
 ```
 
-- Transcribe `Music/Favorites/IThreeU/I3U.flac` with the lyrics being in `Music/lyrics/i_three_u.txt`, highlighting the lyrics:
+- Transcribe `Music/Favorites/Lovebirds/I3U.flac` with the lyrics being in `Music/lyrics/i_three_u.txt`, highlighting the lyrics:
 ```
-lyrictimer -w Music/Favorites/IThreeU/I3U.flac Music/lyrics/i_three_u.txt
+lyrictimer -w Music/Favorites/Lovebirds/I3U.flac Music/lyrics/i_three_u.txt
 ```
 
 The lyrics will be italic and underlined:
 
 ![Screenshot of a music video with the words "Here come old flat top, he come grooving up slowly". The word "flat" is highlighted.](img/highlight_mode.png)
 
-- Transcribe `Music/Favorites/Ami/Mon_Ami.wav` with the lyrics being in `Music/lyrics/mon_ami.txt`, specifying the language as French and using the `base` model:
+> If you're exporting in the LRC format with word highlighting (`-w`), the output file will be in [enhanced/A2 LRC format](https://en.wikipedia.org/wiki/LRC_(file_format)#A2_extension:_word_time_tag) with word-level timestamps. Keep in mind that not all media players support this format.
+
+- Transcribe `Music/Favorites/Groupe_Musique/Mon_Ami.wav` with the lyrics being in `Music/lyrics/mon_ami.txt`, specifying the language as French and using the `base` model:
 ```
-lyrictimer -l fr --model base Music/Favorites/Ami/Mon_Ami.wav Music/lyrics/mon_ami.txt
+lyrictimer -l fr --model base Music/Favorites/Groupe_Musique/Mon_Ami.wav Music/lyrics/mon_ami.txt
 ```
 
 ## F. A. Q.
@@ -66,6 +68,8 @@ lyrictimer -l fr --model base Music/Favorites/Ami/Mon_Ami.wav Music/lyrics/mon_a
 The program is just using Whisper to transcribe the song, giving the lyrics as a prompt (so the AI makes less transcription errors).
 
 ### The transcribed lyrics are wrong/nonsense
+Of course, no voice-to-text model is 100% accurate, but here are some tips to help improve the accuracy:
+
 - Whisper's [transcription quality](https://github.com/openai/whisper#available-models-and-languages) naturally plays a huge role in the quality of the lyrics.
 - The transcription quality of songs with verses in multiple languages (e.g. the chorus or the last verse being in a different language to the rest of the song) is impacted by Whisper's accuracy in _both_ languages, regardless of the language of an individual verse.
   - For example, if a mostly English song has a verse in Russian, the quality of the timed lyrics in both the Russian verse and other parts is impacted by Whisper's accuracy in both English and Russian.
@@ -75,7 +79,8 @@ The program is just using Whisper to transcribe the song, giving the lyrics as a
 - If the language of the song is known (and especially if the transcribed lyrics contain other alphabets), try specifying the language: `lyrictimer -l ...`.
   - Additionally, if the song is in English, try an English only model (see the tip above).
 - The model works worse with heavily processed voices.
-- Try phoneme-based correction.
+- Try phoneme-based correction. It uses the transcribed text as a guide to pick the matching lines from the real lyrics, instead of using the transcribed text directly.
+- Try the "split vocals" feature. It sometimes reduces accuracy, but can help in other cases.
 - Sometimes, using another source for the lyrics helps.
 - Lyric files that have punctuation and don't shorten the text (e.g. using "(x2)" to indicate that the verse is repeated twice or using "(chorus)" after writing the chorus lyrics once instead of repeating the chorus every time) usually result in transcriptions that are more accurate to the source.
 
@@ -89,7 +94,7 @@ set "PHONEMIZER_ESPEAK_PATH=C:\Program Files\eSpeak NG"
 ```
 Assuming eSpeak is installed in the default path, `C:\Program Files\eSpeak NG`. If it's not installed there change the path.
 
-## Usage from Python
+## Usage from code
 Lyric Timer can also be used as a library. Here's an example:
 ```python
 from lyrictimer import transcribe
@@ -124,7 +129,7 @@ This will return a similar output (i.e. you will still get the sentences), but y
 
 The words' timestamps are relative to the entire song, not the sentence they're in.
 
-## Phoneme-based correction
+### Phoneme-based correction
 This is a feature that looks at the phonemes in sentences to try and correct sentences that might be incorrectly transcribed, as well as tries to fix cases where several lyrics are transcribed as one.
 
 It currently doesn't support word highlighting.
@@ -148,3 +153,25 @@ with open(lyrics_path, "r") as f:
 timed_lyrics = transcribe(lyrics, song_path, split_words=True)
 fixed_lyrics = ph_correct(timed_lyrics, lyrics)
 ```
+
+### Raw exports
+
+This is an export format that might help developers or advanced users, and is especially useful if the program fails to recognize something. It uses a special `.ltr` (**L**yric **T**imer **r**aw) extension, and can be selected like any other format (`lyrictimer -f ltr`).
+
+Raw files are JSON files with the following structure:
+
+* `ver` - the version of Lyric Timer that created the file.
+* `params` - the configuration of the program:
+  * `pbc` - is phoneme-based correction enabled?
+  * `highlight` - is word highlighting enabled?
+  * `demix` - is vocal splitting enabled?
+  * `adv` - advanced parameters (`slashes` and `condition`).
+  * `model` - the model used to transcribe.
+  * `lang` - if specified, the language of the song.
+* `lyrics` - the contents of the provided lyrics file.
+* `result` - the transcription output.
+  * If phoneme-based correction is on, it's an object with the following keys (the values are `result` objects):
+    * `raw` - the raw result as returned by the neural network.
+    * `phc` - the corrected result.
+  * If it's off, it's a serialized `result` object.
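+
+For example, a raw file saved with word highlighting on and phoneme-based correction off could look like the following. This is a hypothetical illustration: the timings and lyrics are placeholders, and the segment fields shown here follow the library's simplified transcription output.
+
+```json
+{
+  "ver": "0.3",
+  "params": {
+    "pbc": false,
+    "highlight": true,
+    "demix": false,
+    "adv": {"slashes": true, "condition": true},
+    "model": "small",
+    "lang": null
+  },
+  "result": [
+    {
+      "text": "Here come old flat top",
+      "from": 12.3,
+      "to": 15.8,
+      "words": [
+        {"word": "Here", "from": 12.3, "to": 12.6},
+        {"word": "come", "from": 12.6, "to": 12.9}
+      ]
+    }
+  ],
+  "lyrics": "Here come old flat top\nHe come grooving up slowly"
+}
+```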
+
+Eventually it will be possible to use these raw files to process lyrics again and export them to a different format without having to re-run speech recognition.
diff --git a/lyrictimer/__init__.py b/lyrictimer/__init__.py
index c6f7707d7040aecd743f43e704729411dc11ebf3..40070c4bedaaac288d82f0da050d6069d60957c6 100644
--- a/lyrictimer/__init__.py
+++ b/lyrictimer/__init__.py
@@ -1,5 +1,7 @@
 import whisper
 
+VERSION = "0.3"
+
 def transcribe(
     lyrics, songfile, modelname = "small", verbose = False,
     slashes = True, language = None, split_words = False, condition_on_previous_text=True
@@ -17,12 +19,20 @@ def transcribe(
     if verbose:
         print("raw result:")
         print(result)
-    return simplify(result, split_words)
+    return simplify(result, split_words, verbose)
 
-def simplify(result, words):
+def simplify(result, words, verbose = False):
     segments = []
     # for each original segment:
     for w_segment in result["segments"]:
+        # skip empty lyrics
+        if w_segment['text'] == "":
+            if w_segment['no_speech_prob'] < 0.8:
+                print(f"WARNING: returned empty text at {w_segment['start']}s-{w_segment['end']}s, but silence probability is low - this indicates that an "+
+                      "error has occurred during transcription. Skipping.")
+            if verbose:
+                print(f"skipping empty lyric at {w_segment['start']}s-{w_segment['end']}s")
+            continue
         # make a simplified one
         l_segment = {
             "text": w_segment['text'].strip(),
diff --git a/lyrictimer/__main__.py b/lyrictimer/__main__.py
index f7561da0e5f7642d339154ae2990cd4325a60f3b..2dc9177226e8a146ce4a97a934f99cdee1f017bc 100644
--- a/lyrictimer/__main__.py
+++ b/lyrictimer/__main__.py
@@ -2,56 +2,101 @@
 # (c) 2023 narektor
 
 import argparse
+import pathlib
+import json
+import os
+
 from . import transcribe
 from .srt import to_srt, to_word_srt
 from .vtt import to_vtt, to_word_vtt
+from .lrc import to_lrc, to_word_lrc
 from .phoneme_correction import correct_result as ph_correct
-import pathlib
+from .__init__ import VERSION
 
-def cli():
-    # set up parser
+def setupParser():
     parser = argparse.ArgumentParser(
         prog='lyrictimer',
         description='Adds timing to song lyrics when given the song and the lyrics.')
     parser.add_argument('music_file', help='path to the song') # positional argument
     parser.add_argument('lyric_file', help='path to the lyrics') # positional argument
     parser.add_argument('-o', '--output', dest='output', action='store', default=None,
-                        help="path to output")
+                        help="path to save the subtitles in")
     parser.add_argument('--model', dest='model', action='store', default="small",
                         help='Whisper model (default: small)')
-    parser.add_argument('-v','--verbose', dest='verbose', action='store_const',
-                        const=True, default=False,
-                        help='show verbose output')
     parser.add_argument('-l','--language', dest='language', action='store', default=None,
                         help='language of the song (default: auto detect)')
-    parser.add_argument('-w','--highlight-words', dest='highlight', action='store_const',
-                        const=True, default=False,
-                        help='highlight individual words in the subtitles (experimental)')
-    parser.add_argument('--vtt', dest='vtt', action='store_const',
-                        const=True, default=False,
-                        help='export WebVTT (.vtt) subtitles instead of SubRip (.srt), useful for YouTube')
-    parser.add_argument('--correct', dest='phoneme_corrector', action='store_const',
+    parser.add_argument('-f','--format', dest='sub_format', choices=["srt","vtt","lrc","ltr"], default="srt",
+                        help='the output lyrics format')
+
+    ad_params = parser.add_argument_group('advanced parameters')
+    ad_params.add_argument('-v','--verbose', dest='verbose', action='store_const',
                         const=True, default=False,
-                        help="correct transcribed lyrics based on phonemes (alpha, incompatible with -w)")
-    parser.add_argument('--no-slashes', dest='adv_slashes', action='store_const',
+                        help='show verbose output')
+    ad_params.add_argument('--no-slashes', dest='adv_slashes', action='store_const',
                         const=False, default=True,
                         help="don't replace new lines with slashes (line 1 / line 2) in the lyrics. WARNING: can dramatically reduce quality if set")
-    parser.add_argument('--no-condition', dest='adv_condition', action='store_const',
+    ad_params.add_argument('--no-condition', dest='adv_condition', action='store_const',
                         const=False, default=True,
                         help="don't condition the model on previous text. WARNING: can dramatically reduce quality if set")
-    args = parser.parse_args()
+    ex_params = parser.add_argument_group('experimental', "These features are in development and might be unstable.")
+    ex_params.add_argument('-w','--highlight-words', dest='highlight', action='store_const',
+                        const=True, default=False,
+                        help='highlight individual words in the subtitles')
+    ex_params.add_argument('--correct', dest='phoneme_corrector', action='store_const',
+                        const=True, default=False,
+                        help="correct transcribed lyrics based on phonemes")
+    ex_params.add_argument('-s','--split-vocals', dest='clean_vocals', action='store_const',
+                        const=True, default=False,
+                        help="extract the vocals and use only them for speech recognition (requires spleeter)")
 
-    def subtitle_file(result):
-        if args.vtt:
-            if args.highlight:
-                return to_word_vtt(result)
-            else:
-                return to_vtt(result)
+    dp_params = parser.add_argument_group('deprecated')
+    dp_params.add_argument('--vtt', dest='vtt', action='store_const',
+                        const=True, default=False,
+                        help='export WebVTT (.vtt) subtitles instead of SubRip (.srt), useful for YouTube')
+    return parser
+
+def subtitle_file(result, args):
+    if args.sub_format == "vtt" or args.vtt:
+        if args.highlight:
+            return to_word_vtt(result)
+        else:
+            return to_vtt(result)
+    elif args.sub_format == "lrc":
+        if args.highlight:
+            return to_word_lrc(result)
+        else:
+            return to_lrc(result)
+    elif args.sub_format == "ltr":
+        prompt = ""
+        with open(args.lyric_file, "r", encoding="utf-8") as file:
+            prompt = file.read()
+        dump = {
+            "ver":VERSION,
+            "params": {
+                "pbc": args.phoneme_corrector,
+                "highlight": args.highlight,
+                "demix": args.clean_vocals,
+                "adv":{"slashes":args.adv_slashes,"condition":args.adv_condition},
+                "model":args.model,
+                "lang":args.language
+            },
+            "result":result,
+            "lyrics":prompt
+        }
+        return json.dumps(dump)
+    else:
+        if args.highlight:
+            return to_word_srt(result)
         else:
-            if args.highlight:
-                return to_word_srt(result)
-            else:
-                return to_srt(result)
+            return to_srt(result)
+
+def cli():
+    # set up parser
+    parser = setupParser()
+    args = parser.parse_args()
+
+    if args.vtt:
+        print("\nWARNING: --vtt is deprecated and will be removed in the next version, please use --format vtt / -f vtt\n")
 
     if args.highlight:
         print("\nWARNING: highlighting words is an experimental feature\n")
@@ -62,18 +107,34 @@ def cli():
             exit()
         print("\nWARNING: phoneme-based correction is an experimental feature\n")
-    print(f"Transcribing {args.music_file}...")
+    song = args.music_file
+    voc_out_dir = None
+    if args.clean_vocals:
+        print("Loading spleeter...")
+        from .demix import split_vocals
+        print("Separating vocals...")
+        voc_out_dir = split_vocals(song)
+        if args.verbose:
+            print(f"output - {voc_out_dir.name}")
+        song = os.path.join(voc_out_dir.name,pathlib.Path(song).stem,"vocals.wav")
+
+    print(f"Transcribing {song}...")
     # read lyrics
     prompt = ""
     with open(args.lyric_file, "r", encoding="utf-8") as file:
         prompt = file.read()
     # transcribe
-    result = transcribe(prompt, args.music_file,
+    result = transcribe(prompt, song,
                         args.model, args.verbose,
                         split_words=args.highlight or args.phoneme_corrector,
                         slashes=args.adv_slashes,
                         condition_on_previous_text=args.adv_condition,
                         language=args.language)
+    raw_result = result
    print("Transcribed.")
 
+    # delete split vocal folder
+    if args.clean_vocals:
+        voc_out_dir.cleanup()
+
     if args.verbose:
         print(result)
@@ -85,15 +146,21 @@ def cli():
     sub_path = None
     if args.output != None:
         sub_path = args.output
-    elif args.vtt:
+    elif args.sub_format == "vtt" or args.vtt:
         sub_path = pathlib.Path(args.lyric_file).stem + ".vtt"
+    elif args.sub_format == "lrc":
+        sub_path = pathlib.Path(args.lyric_file).stem + ".lrc"
+    elif args.sub_format == "ltr":
+        sub_path = pathlib.Path(args.lyric_file).stem + ".ltr"
+        if args.phoneme_corrector:
+            result = {"raw":raw_result,"phc":result}
     else:
         sub_path = pathlib.Path(args.lyric_file).stem + ".srt"
 
     # write subtitles
     print(f"Writing {sub_path}...")
     with open(sub_path, "w", encoding="utf-8") as file:
-        file.write(subtitle_file(result))
+        file.write(subtitle_file(result, args))
 
     print(f"Done! File saved to {sub_path}")
 
diff --git a/lyrictimer/demix.py b/lyrictimer/demix.py
new file mode 100644
index 0000000000000000000000000000000000000000..8036659da694037a1c765a90fd3b7ed1831e9e43
--- /dev/null
+++ b/lyrictimer/demix.py
@@ -0,0 +1,9 @@
+from spleeter.separator import Separator
+import tempfile
+import pathlib
+
+def split_vocals(in_file, model = "spleeter:2stems"):
+    separator = Separator(model)
+    out_dir = tempfile.TemporaryDirectory(prefix="LRT")
+    separator.separate_to_file(in_file, out_dir.name)
+    return out_dir
diff --git a/lyrictimer/lrc.py b/lyrictimer/lrc.py
new file mode 100644
index 0000000000000000000000000000000000000000..52efff74c648e07941d52220f12dd34423cabd51
--- /dev/null
+++ b/lyrictimer/lrc.py
@@ -0,0 +1,30 @@
+import math
+from .srt import format_timestamp as format_srt_timestamp
+from .__init__ import VERSION
+
+def to_word_lrc(lt):
+    lrc = "[re:Lyric Timer - gitlab.com/narektor/lyric-timer]\n"
+    lrc += f"[ve:{VERSION}]\n\n"
+    # for each segment:
+    for segment in lt:
+        # add the lyric time
+        lrc += f"[{format_timestamp(segment['from'])}]"
+        # add the words
+        for word in segment["words"]:
+            lrc += f" <{format_timestamp(word['from'])}> {word['word'].strip()}"
+        lrc += "\n"
+    # return the data stripped
+    return lrc.rstrip()
+
+def to_lrc(lt):
+    lrc = "[re:Lyric Timer - gitlab.com/narektor/lyric-timer]\n"
+    lrc += f"[ve:{VERSION}]\n\n"
+    # for each segment:
+    for segment in lt:
+        # add the lyric
+        lrc += f"[{format_timestamp(segment['from'])}] {segment['text']}\n"
+    # return the data stripped
+    return lrc.rstrip()
+
+def format_timestamp(seconds: float):
+    return format_srt_timestamp(seconds, False).replace(",", ".")
diff --git a/lyrictimer/srt.py b/lyrictimer/srt.py
index d909f5505a4a2984a589ad0bc17c646dcb12f80d..995c721f0e7ce25ab67ba7478d92d1eac760d69c 100644
--- a/lyrictimer/srt.py
+++ b/lyrictimer/srt.py
@@ -19,7 +19,7 @@ def to_srt(lt):
     # return the data stripped
     return srt.rstrip()
 
-def format_timestamp(seconds: float):
+def format_timestamp(seconds: float, hours_required = True):
     assert seconds >= 0, "non-negative timestamp expected"
     milliseconds = round(seconds * 1000.0)
     hours = milliseconds // 3_600_000
@@ -29,6 +29,8 @@ def format_timestamp(seconds: float):
     seconds = milliseconds // 1_000
     milliseconds -= seconds * 1_000
     hours_marker = f"{hours:02d}:"
+    if not hours_required and hours_marker == "00:":
+        hours_marker = ""
     return (
         f"{hours_marker}{minutes:02d}:{seconds:02d},{milliseconds:03d}"
     )
diff --git a/setup.py b/setup.py
index c5b4992f5720e700fff2a62f5b00527fd3888120..7b5a01c9ebc6258d319ac9c467c1c42d1b5a42fe 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@ import os
 
 setup(
     name='lyrictimer',
-    version='0.2',
+    version='0.3',
     description='Adds timing to song lyrics when given the song and the lyrics using OpenAI Whisper.',
     long_description=open("README.md", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
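
A minimal sketch of the round trip that raw exports are meant to enable, using the package's own SRT serializer (illustrative only - the file names are hypothetical, and it assumes the `.ltr` layout documented in the README changes above):

```python
# Convert a raw .ltr export back to SubRip without re-running speech recognition.
import json

from lyrictimer.srt import to_srt

with open("another_tomorrow.ltr", "r", encoding="utf-8") as f:
    dump = json.load(f)

result = dump["result"]
# With phoneme-based correction on, "result" holds both the raw and the
# corrected transcriptions; prefer the corrected one.
if dump["params"]["pbc"]:
    result = result["phc"]

with open("another_tomorrow.srt", "w", encoding="utf-8") as f:
    f.write(to_srt(result))
```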