From 35bf3cfe8216db8af8fb51b5c0f21c5d10ec0bad Mon Sep 17 00:00:00 2001 From: Narek Date: Wed, 10 May 2023 14:34:05 +0400 Subject: [PATCH 1/9] Raise version to 0.3 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c5b4992..7b5a01c 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ import os setup( name='lyrictimer', - version='0.2', + version='0.3', description='Adds timing to song lyrics when given the song and the lyrics using OpenAI Whisper.', long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", -- GitLab From 74af366a13a6edcf365adfabc225d282893c1bc1 Mon Sep 17 00:00:00 2001 From: Narek Date: Wed, 10 May 2023 15:57:16 +0400 Subject: [PATCH 2/9] CLI: split into functions --- lyrictimer/__main__.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/lyrictimer/__main__.py b/lyrictimer/__main__.py index f7561da..6bc51de 100644 --- a/lyrictimer/__main__.py +++ b/lyrictimer/__main__.py @@ -8,8 +8,7 @@ from .vtt import to_vtt, to_word_vtt from .phoneme_correction import correct_result as ph_correct import pathlib -def cli(): - # set up parser +def setupParser(): parser = argparse.ArgumentParser( prog='lyrictimer', description='Adds timing to song lyrics when given the song and the lyrics.') @@ -39,19 +38,24 @@ def cli(): parser.add_argument('--no-condition', dest='adv_condition', action='store_const', const=False, default=True, help="don't condition the model on previous text. WARNING: can dramatically reduce quality if set") - args = parser.parse_args() + return parser - def subtitle_file(result): - if args.vtt: - if args.highlight: - return to_word_vtt(result) - else: - return to_vtt(result) +def subtitle_file(result, args): + if args.vtt: + if args.highlight: + return to_word_vtt(result) else: - if args.highlight: - return to_word_srt(result) - else: - return to_srt(result) + return to_vtt(result) + else: + if args.highlight: + return to_word_srt(result) + else: + return to_srt(result) + +def cli(): + # set up parser + parser = setupParser() + args = parser.parse_args() if args.highlight: print("\nWARNING: highlighting words is an experimental feature\n") @@ -93,7 +97,7 @@ def cli(): # write subtitles print(f"Writing {sub_path}...") with open(sub_path, "w", encoding="utf-8") as file: - file.write(subtitle_file(result)) + file.write(subtitle_file(result, args)) print(f"Done! File saved to {sub_path}") -- GitLab From 76b41773db07439bc75790d712ebfd1adce357c6 Mon Sep 17 00:00:00 2001 From: Narek Date: Wed, 10 May 2023 16:10:22 +0400 Subject: [PATCH 3/9] CLI: split args into groups --- lyrictimer/__main__.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/lyrictimer/__main__.py b/lyrictimer/__main__.py index 6bc51de..d0d798c 100644 --- a/lyrictimer/__main__.py +++ b/lyrictimer/__main__.py @@ -5,7 +5,7 @@ import argparse from . 
import transcribe from .srt import to_srt, to_word_srt from .vtt import to_vtt, to_word_vtt -from .phoneme_correction import correct_result as ph_correct import pathlib def setupParser(): @@ -15,29 +15,31 @@ def setupParser(): parser.add_argument('music_file', help='path to the song') # positional argument parser.add_argument('lyric_file', help='path to the lyrics') # positional argument parser.add_argument('-o', '--output', dest='output', action='store', default=None, - help="path to output") + help="path to save the subtitles in") parser.add_argument('--model', dest='model', action='store', default="small", help='Whisper model (default: small)') - parser.add_argument('-v','--verbose', dest='verbose', action='store_const', - const=True, default=False, - help='show verbose output') parser.add_argument('-l','--language', dest='language', action='store', default=None, help='language of the song (default: auto detect)') - parser.add_argument('-w','--highlight-words', dest='highlight', action='store_const', - const=True, default=False, - help='highlight individual words in the subtitles (experimental)') parser.add_argument('--vtt', dest='vtt', action='store_const', const=True, default=False, help='export WebVTT (.vtt) subtitles instead of SubRip (.srt), useful for YouTube') - parser.add_argument('--correct', dest='phoneme_corrector', action='store_const', + ad_params = parser.add_argument_group('advanced parameters') + ad_params.add_argument('-v','--verbose', dest='verbose', action='store_const', const=True, default=False, - help="correct transcribed lyrics based on phonemes (alpha, incompatible with -w)") - parser.add_argument('--no-slashes', dest='adv_slashes', action='store_const', + help='show verbose output') + ad_params.add_argument('--no-slashes', dest='adv_slashes', action='store_const', const=False, default=True, help="don't replace new lines with slashes (line 1 / line 2) in the lyrics. WARNING: can dramatically reduce quality if set") - parser.add_argument('--no-condition', dest='adv_condition', action='store_const', + ad_params.add_argument('--no-condition', dest='adv_condition', action='store_const', const=False, default=True, help="don't condition the model on previous text. WARNING: can dramatically reduce quality if set") + ex_params = parser.add_argument_group('experimental', "These features are in development and might be unstable.") + ex_params.add_argument('-w','--highlight-words', dest='highlight', action='store_const', + const=True, default=False, + help='highlight individual words in the subtitles') + ex_params.add_argument('--correct', dest='phoneme_corrector', action='store_const', + const=True, default=False, + help="correct transcribed lyrics based on phonemes") return parser def subtitle_file(result, args): -- GitLab From 4d0cf23f5ee420ec7543705254748da87798da9c Mon Sep 17 00:00:00 2001 From: Narek Date: Wed, 17 May 2023 00:23:20 +0400 Subject: [PATCH 4/9] CLI: add lrc file option This also replaces --vtt with a more general --format argument. lrc exports support per-word timestamps natively instead of using HTML: generated lrc files use the simple format if word highlighting isn't enabled, and the advanced (A2) format if it is.
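As a rough sketch of the difference between the two styles (the segment values below are made up; `to_lrc`, `to_word_lrc` and the simplified segment keys come from the new lrc module and `transcribe`):

```python
# Sketch: feeding one hand-made segment (in the simplified format returned by
# lyrictimer.transcribe) through the new lrc helpers. The sample timings are
# invented; the [re:]/[ve:] header tags are also emitted before these lines.
from lyrictimer.lrc import to_lrc, to_word_lrc

segments = [{
    "text": "Here come old flat top",
    "from": 12.34, "to": 15.0,
    "words": [
        {"word": "Here", "from": 12.34, "to": 12.6},
        {"word": "come", "from": 12.6, "to": 12.9},
        {"word": "old", "from": 12.9, "to": 13.2},
        {"word": "flat", "from": 13.2, "to": 13.6},
        {"word": "top", "from": 13.6, "to": 15.0},
    ],
}]

# Simple format, one [mm:ss.xxx] tag per lyric:
#   [00:12.340] Here come old flat top
print(to_lrc(segments))

# Advanced (A2) format, an extra <mm:ss.xxx> tag before every word:
#   [00:12.340] <00:12.340> Here <00:12.600> come <00:12.900> old ...
print(to_word_lrc(segments))
```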
--- .gitignore | 5 ++++- lyrictimer/__init__.py | 2 ++ lyrictimer/__main__.py | 28 ++++++++++++++++++++++------ lyrictimer/lrc.py | 30 ++++++++++++++++++++++++++++++ lyrictimer/srt.py | 4 +++- 5 files changed, 61 insertions(+), 8 deletions(-) create mode 100644 lyrictimer/lrc.py diff --git a/.gitignore b/.gitignore index af194c4..fe2edc0 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,7 @@ lyrictimer/__pycache__/ *.egg-info srts/ dist/ -build/ \ No newline at end of file +build/ +*.lrc +*.vtt +*.ltr diff --git a/lyrictimer/__init__.py b/lyrictimer/__init__.py index c6f7707..08e6e6d 100644 --- a/lyrictimer/__init__.py +++ b/lyrictimer/__init__.py @@ -1,5 +1,7 @@ import whisper +VERSION = "0.3" + def transcribe( lyrics, songfile, modelname = "small", verbose = False, slashes = True, language = None, split_words = False, condition_on_previous_text=True diff --git a/lyrictimer/__main__.py b/lyrictimer/__main__.py index d0d798c..b015b8e 100644 --- a/lyrictimer/__main__.py +++ b/lyrictimer/__main__.py @@ -5,7 +5,8 @@ import argparse from . import transcribe from .srt import to_srt, to_word_srt from .vtt import to_vtt, to_word_vtt -from .phoneme_correction import correct_result as ph_correct +from .lrc import to_lrc, to_word_lrc +from .phoneme_correction import correct_result as ph_correct import pathlib def setupParser(): @@ -20,9 +21,9 @@ def setupParser(): help='Whisper model (default: small)') parser.add_argument('-l','--language', dest='language', action='store', default=None, help='language of the song (default: auto detect)') - parser.add_argument('--vtt', dest='vtt', action='store_const', - const=True, default=False, - help='export WebVTT (.vtt) subtitles instead of SubRip (.srt), useful for YouTube') + parser.add_argument('-f','--format', dest='sub_format', choices=["srt","vtt","lrc"], default="srt", + help='the output lyrics format') + ad_params = parser.add_argument_group('advanced parameters') ad_params.add_argument('-v','--verbose', dest='verbose', action='store_const', const=True, default=False, @@ -40,14 +41,24 @@ def setupParser(): ex_params.add_argument('--correct', dest='phoneme_corrector', action='store_const', const=True, default=False, help="correct transcribed lyrics based on phonemes") + + dp_params = parser.add_argument_group('deprecated') + dp_params.add_argument('--vtt', dest='vtt', action='store_const', + const=True, default=False, + help='export WebVTT (.vtt) subtitles instead of SubRip (.srt), useful for YouTube') return parser def subtitle_file(result, args): - if args.vtt: + if args.sub_format == "vtt" or args.vtt: if args.highlight: return to_word_vtt(result) else: return to_vtt(result) + elif args.sub_format == "lrc": + if args.highlight: + return to_word_lrc(result) + else: + return to_lrc(result) else: if args.highlight: return to_word_srt(result) @@ -59,6 +70,9 @@ def cli(): parser = setupParser() args = parser.parse_args() + if args.vtt: + print("\nWARNING: --vtt is deprecated and will be removed in the next version, please use --format vtt / -f vtt\n") + if args.highlight: print("\nWARNING: highlighting words is an experimental feature\n") @@ -91,8 +105,10 @@ def cli(): sub_path = None if args.output != None: sub_path = args.output - elif args.vtt: + elif args.sub_format == "vtt" or args.vtt: sub_path = pathlib.Path(args.lyric_file).stem + ".vtt" + elif args.sub_format == "lrc": + sub_path = pathlib.Path(args.lyric_file).stem + ".lrc" else: sub_path = pathlib.Path(args.lyric_file).stem + ".srt" diff --git a/lyrictimer/lrc.py b/lyrictimer/lrc.py new file 
mode 100644 index 0000000..52efff7 --- /dev/null +++ b/lyrictimer/lrc.py @@ -0,0 +1,30 @@ +import math +from .srt import format_timestamp as format_srt_timestamp +from .__init__ import VERSION + +def to_word_lrc(lt): + lrc = "[re:Lyric Timer - gitlab.com/narektor/lyric-timer]\n" + lrc += f"[ve:{VERSION}]\n\n" + # for each segment: + for segment in lt: + # add the lyric time + lrc += f"[{format_timestamp(segment['from'])}]" + # add the words + for word in segment["words"]: + lrc += f" <{format_timestamp(word['from'])}> {word['word'].strip()}" + lrc += "\n" + # return the data stripped + return lrc.rstrip() + +def to_lrc(lt): + lrc = "[re:Lyric Timer - gitlab.com/narektor/lyric-timer]\n" + lrc += f"[ve:{VERSION}]\n\n" + # for each segment: + for segment in lt: + # add the lyric + lrc += f"[{format_timestamp(segment['from'])}] {segment['text']}\n" + # return the data stripped + return lrc.rstrip() + +def format_timestamp(seconds: float): + return format_srt_timestamp(seconds, False).replace(",", ".") diff --git a/lyrictimer/srt.py b/lyrictimer/srt.py index d909f55..995c721 100644 --- a/lyrictimer/srt.py +++ b/lyrictimer/srt.py @@ -19,7 +19,7 @@ def to_srt(lt): # return the data stripped return srt.rstrip() -def format_timestamp(seconds: float): +def format_timestamp(seconds: float, hours_required = True): assert seconds >= 0, "non-negative timestamp expected" milliseconds = round(seconds * 1000.0) hours = milliseconds // 3_600_000 @@ -29,6 +29,8 @@ def format_timestamp(seconds: float): seconds = milliseconds // 1_000 milliseconds -= seconds * 1_000 hours_marker = f"{hours:02d}:" + if not hours_required and hours_marker == "00:": + hours_marker = "" return ( f"{hours_marker}{minutes:02d}:{seconds:02d},{milliseconds:03d}" ) -- GitLab From d9b0045e8e307c9c060e82d8183fcb677207bc78 Mon Sep 17 00:00:00 2001 From: Narek Date: Wed, 17 May 2023 16:01:29 +0400 Subject: [PATCH 5/9] Skip empty lyrics Empty lyrics mess with phoneme-based correction and are unnecessary. --- lyrictimer/__init__.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/lyrictimer/__init__.py b/lyrictimer/__init__.py index 08e6e6d..40070c4 100644 --- a/lyrictimer/__init__.py +++ b/lyrictimer/__init__.py @@ -19,12 +19,20 @@ def transcribe( if verbose: print("raw result:") print(result) - return simplify(result, split_words) + return simplify(result, split_words, verbose) -def simplify(result, words): +def simplify(result, words, verbose = False): segments = [] # for each original segment: for w_segment in result["segments"]: + # skip empty lyrics + if w_segment['text'] == "": + if w_segment['no_speech_prob'] < 0.8: + print(f"WARNING: returned empty text at {w_segment['start']}s-{w_segment['end']}s, but silence probability is low - this indicates that an "+ + "error has occurred during transcription. Skipping.") + if verbose: + print(f"skipping empty lyric at {w_segment['start']}s-{w_segment['end']}s") + continue # make a simplified one l_segment = { "text": w_segment['text'].strip(), -- GitLab From d4ffaa41a75043d2e9d0db9424f05a4989a9cf27 Mon Sep 17 00:00:00 2001 From: Narek Date: Wed, 17 May 2023 16:02:23 +0400 Subject: [PATCH 6/9] CLI: add raw export option This adds a .ltr format that contains raw data from the program and AI model. These files can be used to process the output (for example, apply phoneme-based correction) and export it to various formats without having to transcribe everything again.
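For instance, a downstream script could reload a raw dump roughly like this (a sketch; the file name is just a placeholder, and the keys mirror the `dump` dictionary built in `subtitle_file`):

```python
# Sketch: reading a .ltr raw export back in. The top-level keys ("ver",
# "params", "result", "lyrics") follow the JSON dump written by subtitle_file;
# "song.ltr" is an example path.
import json

with open("song.ltr", "r", encoding="utf-8") as f:
    dump = json.load(f)

print(dump["ver"])              # lyrictimer version that wrote the file
print(dump["params"]["model"])  # Whisper model used, e.g. "small"
lyrics = dump["lyrics"]         # contents of the original lyric file
result = dump["result"]         # transcription output; a {"raw": ..., "phc": ...}
                                # pair when phoneme-based correction was enabled
```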
--- lyrictimer/__main__.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/lyrictimer/__main__.py b/lyrictimer/__main__.py index b015b8e..a1959a9 100644 --- a/lyrictimer/__main__.py +++ b/lyrictimer/__main__.py @@ -2,12 +2,15 @@ # (c) 2023 narektor import argparse +import pathlib +import json + from . import transcribe from .srt import to_srt, to_word_srt from .vtt import to_vtt, to_word_vtt from .lrc import to_lrc, to_word_lrc from .phoneme_correction import correct_result as ph_correct -import pathlib +from .__init__ import VERSION def setupParser(): parser = argparse.ArgumentParser( @@ -21,7 +24,7 @@ def setupParser(): help='Whisper model (default: small)') parser.add_argument('-l','--language', dest='language', action='store', default=None, help='language of the song (default: auto detect)') - parser.add_argument('-f','--format', dest='sub_format', choices=["srt","vtt","lrc"], default="srt", + parser.add_argument('-f','--format', dest='sub_format', choices=["srt","vtt","lrc","ltr"], default="srt", help='the output lyrics format') ad_params = parser.add_argument_group('advanced parameters') @@ -59,6 +62,23 @@ def subtitle_file(result, args): return to_word_lrc(result) else: return to_lrc(result) + elif args.sub_format == "ltr": + prompt = "" + with open(args.lyric_file, "r", encoding="utf-8") as file: + prompt = file.read() + dump = { + "ver":VERSION, + "params": { + "pbc": args.phoneme_corrector, + "highlight": args.highlight, + "adv":{"slashes":args.adv_slashes,"condition":args.adv_condition}, + "model":args.model, + "lang":args.language + }, + "result":result, + "lyrics":prompt + } + return json.dumps(dump) else: if args.highlight: return to_word_srt(result) @@ -92,6 +112,7 @@ def cli(): result = transcribe(prompt, args.music_file, args.model, args.verbose, split_words=args.highlight or args.phoneme_corrector, slashes=args.adv_slashes, condition_on_previous_text=args.adv_condition, language=args.language) + raw_result = result print("Transcribed.") if args.verbose: @@ -109,6 +130,10 @@ def cli(): sub_path = pathlib.Path(args.lyric_file).stem + ".vtt" elif args.sub_format == "lrc": sub_path = pathlib.Path(args.lyric_file).stem + ".lrc" + elif args.sub_format == "ltr": + sub_path = pathlib.Path(args.lyric_file).stem + ".ltr" + if args.phoneme_corrector: + result = {"raw":raw_result,"phc":result} else: sub_path = pathlib.Path(args.lyric_file).stem + ".srt" -- GitLab From b1adf14f0f41f21f9924690aab1f14723633994b Mon Sep 17 00:00:00 2001 From: Narek Date: Sun, 25 Jun 2023 02:21:16 +0400 Subject: [PATCH 7/9] Add support for demixing audio with Spleeter This commit adds a new -s/--split-vocals argument. When it's used, the program uses Spleeter to separate the vocals and melody of a song, and runs transcription on the clean vocals. This sometimes improves accuracy. 
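In code, the flow behind the new flag looks roughly like this (a condensed sketch of what `cli()` does with the `demix` helper; the paths are illustrative and Spleeter must be installed):

```python
# Sketch of the -s / --split-vocals flow: demix with Spleeter, transcribe the
# isolated vocal stem, then clean up. split_vocals returns a TemporaryDirectory
# into which the 2stems model writes <song stem>/vocals.wav and accompaniment.wav.
import os
import pathlib

from lyrictimer import transcribe
from lyrictimer.demix import split_vocals

song = "path/to/song.mp3"      # example path
out_dir = split_vocals(song)   # runs the spleeter:2stems model

vocals = os.path.join(out_dir.name, pathlib.Path(song).stem, "vocals.wav")

with open("path/to/lyrics.txt", "r", encoding="utf-8") as f:
    lyrics = f.read()

result = transcribe(lyrics, vocals)
out_dir.cleanup()              # remove the temporary stems
```

From the command line the same thing is just `lyrictimer -s path/to/song.mp3 path/to/lyrics.txt`.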
--- .gitignore | 13 +++++++++++-- lyrictimer/__main__.py | 24 ++++++++++++++++++++++-- lyrictimer/demix.py | 9 +++++++++ 3 files changed, 42 insertions(+), 4 deletions(-) create mode 100644 lyrictimer/demix.py diff --git a/.gitignore b/.gitignore index fe2edc0..8b60046 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,19 @@ +# local envs timerenv/ +timerenv2/ +srts/ + +# build folders lyrictimer/__pycache__/ -*.srt *.egg-info -srts/ dist/ build/ + +# lyric formats +*.srt *.lrc *.vtt *.ltr + +# spleeter temp +pretrained_models/ \ No newline at end of file diff --git a/lyrictimer/__main__.py b/lyrictimer/__main__.py index a1959a9..2dc9177 100644 --- a/lyrictimer/__main__.py +++ b/lyrictimer/__main__.py @@ -4,6 +4,7 @@ import argparse import pathlib import json +import os from . import transcribe from .srt import to_srt, to_word_srt @@ -44,6 +45,9 @@ def setupParser(): ex_params.add_argument('--correct', dest='phoneme_corrector', action='store_const', const=True, default=False, help="correct transcribed lyrics based on phonemes") + ex_params.add_argument('-s','--split-vocals', dest='clean_vocals', action='store_const', + const=True, default=False, + help="extract the vocals and use only them for text recognition (requires spleeter)") dp_params = parser.add_argument_group('deprecated') dp_params.add_argument('--vtt', dest='vtt', action='store_const', @@ -71,6 +75,7 @@ def subtitle_file(result, args): "params": { "pbc": args.phoneme_corrector, "highlight": args.highlight, + "demix": args.clean_vocals, "adv":{"slashes":args.adv_slashes,"condition":args.adv_condition}, "model":args.model, "lang":args.language @@ -102,19 +107,34 @@ def cli(): exit() print("\nWARNING: phoneme-based correction is an experimental feature\n") - print(f"Transcribing {args.music_file}...") + song = args.music_file + voc_out_dir = None + if args.clean_vocals: + print("Loading spleeter...") + from .demix import split_vocals + print("Separating vocals...") + voc_out_dir = split_vocals(song) + if args.verbose: + print(f"output - {voc_out_dir.name}") + song = os.path.join(voc_out_dir.name,pathlib.Path(song).stem,"vocals.wav") + + print(f"Transcribing {song}...") # read lyrics prompt = "" with open(args.lyric_file, "r", encoding="utf-8") as file: prompt = file.read() # transcribe - result = transcribe(prompt, args.music_file, + result = transcribe(prompt, song, args.model, args.verbose, split_words=args.highlight or args.phoneme_corrector, slashes=args.adv_slashes, condition_on_previous_text=args.adv_condition, language=args.language) raw_result = result print("Transcribed.") + # delete split vocal folder + if args.clean_vocals: + voc_out_dir.cleanup() + if args.verbose: print(result) diff --git a/lyrictimer/demix.py b/lyrictimer/demix.py new file mode 100644 index 0000000..8036659 --- /dev/null +++ b/lyrictimer/demix.py @@ -0,0 +1,9 @@ +from spleeter.separator import Separator +import tempfile +import pathlib + +def split_vocals(in_file, model = "spleeter:2stems"): + separator = Separator(model) + out_dir = tempfile.TemporaryDirectory(prefix="LRT") + separator.separate_to_file(in_file, out_dir.name) + return out_dir -- GitLab From 780972ce5d7c01cf965c08a366577fc234d6ac5f Mon Sep 17 00:00:00 2001 From: Narek <6881875-narektor@users.noreply.gitlab.com> Date: Thu, 13 Jul 2023 21:42:59 +0000 Subject: [PATCH 8/9] Update file README.md --- README.md | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 02e1f7c..ee97218 100644 --- a/README.md +++ b/README.md @@ -30,7 
+30,7 @@ After that, call the program, providing the music and lyrics file. lyrictimer path/to/song.mp3 path/to/lyrics.txt ``` -By default, the lyrics with timecodes will be saved as a SubRip (srt) file with the same name as the lyrics file in the current folder. +By default, the lyrics with timecodes will be saved as a SubRip (srt) file with the same name as the lyrics file in the current folder. However, the program supports WebVTT (vtt, useful for YouTube subtitles) and LRC files. ### Examples - Transcribe `another_tomorrow.mp3` with the lyrics being in `lyrics/another_tomorrow.txt`: @@ -46,18 +46,20 @@ Lyric Timer can also be configured using arguments. The entire list can be seen lyrictimer --model large-v2 -o Music/lyrics/saint.srt Music/Favorites/Saint.mp3 Music/lyrics/saint.txt ``` -- Transcribe `Music/Favorites/IThreeU/I3U.flac` with the lyrics being in `Music/lyrics/i_three_u.txt`, highlighting the lyrics: +- Transcribe `Music/Favorites/Lovebirds/I3U.flac` with the lyrics being in `Music/lyrics/i_three_u.txt`, highlighting the lyrics: ``` -lyrictimer -w Music/Favorites/IThreeU/I3U.flac Music/lyrics/i_three_u.txt +lyrictimer -w Music/Favorites/Lovebirds/I3U.flac Music/lyrics/i_three_u.txt ``` The lyrics will be italic and underlined: ![Screenshot of a music video with the words "Here come old flat top, he come grooving up slowly". The word "flat" is highlighted.](img/highlight_mode.png) -- Transcribe `Music/Favorites/Ami/Mon_Ami.wav` with the lyrics being in `Music/lyrics/mon_ami.txt`, specifying the language as French and using the `base` model: +> If you're exporting in the LRC format, the output file will be in [enhanced/A2 LRC format](https://en.wikipedia.org/wiki/LRC_(file_format)#A2_extension:_word_time_tag) with word-level timestamps. Keep in mind that not all media players support this format. + +- Transcribe `Music/Favorites/Groupe_Musique/Mon_Ami.wav` with the lyrics being in `Music/lyrics/mon_ami.txt`, specifying the language as French and using the `base` model: ``` -lyrictimer -l fr --model base Music/Favorites/Ami/Mon_Ami.wav Music/lyrics/mon_ami.txt +lyrictimer -l fr --model base Music/Favorites/Groupe_Musique/Mon_Ami.wav Music/lyrics/mon_ami.txt ``` ## F. A. Q. @@ -66,6 +68,8 @@ lyrictimer -l fr --model base Music/Favorites/Ami/Mon_Ami.wav Music/lyrics/mon_a The program is just using Whisper to transcribe the song, giving the lyrics as a prompt (so the AI makes less transcription errors). ### The transcribed lyrics are wrong/nonsense +Of course, no voice to text model is 100% accurate, but here are some tips to help improve the accuracy: + - Whisper's [transcription quality](https://github.com/openai/whisper#available-models-and-languages) naturally plays a huge role in the quality of the lyrics. - The transcription quality of songs with verses in multiple languages (e.g. the chorus or the last verse being in a different language to the rest of the song) is impacted by Whisper's accuracy in _both_ languages, regardless of the language of an individual verse. - For example, if a mostly English song has a verse in Russian, the quality of the timed lyrics in both the Russian verse and other parts is impacted by Whisper's accuracy in both English and Russian. @@ -75,7 +79,8 @@ The program is just using Whisper to transcribe the song, giving the lyrics as a - If the language of the song is known (and especially if the transcribed lyrics contain other alphabets), try specifying the language: `lyrictimer -l ...`. 
- Additionally, if the song is in English, try an English only model (see the tip above). - The model works worse with heavily processed voices. -- Try phoneme-based correction. +- Try phoneme-based correction. It essentially uses the transcribed text as a guide to pick lines from the real lyrics, instead of outputting the transcribed text directly. +- Try the "split vocals" feature. This sometimes reduces accuracy, but can help in other cases. - Sometimes, using another source for the lyrics helps. - Lyric files that have punctuation and don't shorten the text (e.g. using "(x2)" to indicate that the verse is repeated twice or using "(chorus)" after writing the chorus lyrics once instead of repeating the chorus every time) usually result in transcriptions that are more accurate to the source. -- GitLab From 62b3bd1e813dfbe894050351eaac559dabca6e68 Mon Sep 17 00:00:00 2001 From: Narek <6881875-narektor@users.noreply.gitlab.com> Date: Sat, 15 Jul 2023 18:35:25 +0000 Subject: [PATCH 9/9] Document raw files --- README.md | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ee97218..89f8435 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,7 @@ set "PHONEMIZER_ESPEAK_PATH=C:\Program Files\eSpeak NG" ``` Assuming eSpeak is installed in the default path, `C:\Program Files\eSpeak NG`. If it's not installed there change the path. -## Usage from Python +## Usage from code Lyric Timer can also be used as a library. Here's an example: ```python from lyrictimer import transcribe @@ -129,7 +129,7 @@ This will return a similar output (i.e. you will still get the sentences), but y The words' timestamps are relative to the entire song, not the sentence they're in. -## Phoneme-based correction +### Phoneme-based correction This is a feature that looks at the phonemes in sentences to try and correct sentences that might be incorrectly transcribed, as well as tries to fix cases where several lyrics are transcribed as one. It currently doesn't support word highlighting. @@ -153,3 +153,25 @@ with open(lyrics_path, "r") as f: timed_lyrics = transcribe(lyrics, song_path, split_words=True) fixed_lyrics = ph_correct(timed_lyrics, lyrics) ``` + +### Raw exports + +This is an export format that might help developers or advanced users, and is especially useful if the program fails to recognize something. It uses a special `.ltr` (**L**yric **T**imer **r**aw) extension, and can be selected like any other format (`lyrictimer -f ltr`). + +Raw files are JSON files with the following structure: + +* `ver` - the version. +* `params` - the configuration of the program: + * `pbc` - is phoneme-based correction enabled? + * `highlight` - is word highlighting enabled? + * `adv` - advanced parameters. + * `model` - the model used to transcribe. + * `lang` - if specified, the language of the song. +* `lyrics` - the real lyrics of the song, as read from the lyrics file. +* `result` - the transcription output. + * If phoneme-based correction is on, it's an object with the following keys (the values are `result` objects): + * `raw` - the raw result as returned by the neural network. + * `phc` - the corrected result. + * If it's off, it's a serialized `result` object. + +Eventually it will be possible to use these raw files to process lyrics again and export them to a different format without having to re-run speech recognition. -- GitLab