summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Solomon Peachy <pizza@shaftnet.org> 2024-04-19 21:53:43 -0400
committer: Solomon Peachy <pizza@shaftnet.org> 2024-04-21 18:08:47 -0400
commit: e8a51569ada3bfd85fc0c93911bd5061ce3b6017 (patch)
tree: 1d248f88fce315240f7df2e415b001b785cd3b6b
parent: 418a5acea08876cc02ccfec9ba51e2fede4e6eb6 (diff)
download: rockbox-e8a51569ad.tar.gz
rockbox-e8a51569ad.zip
voice: Add support for the Piper TTS engine
https://github.com/rhasspy/piper — High quality, offline, neural-network-based, with good language coverage. Note that you have to manually download the piper voice models, and set PIPER_MODEL_DIR appropriately. The configure script will let you choose from the available models and remember your choices. Change-Id: I8eba9fcf78b51b01b89491539aac3e423cc42f16
-rwxr-xr-x tools/configure 52
-rwxr-xr-x tools/voice.pl 146
2 files changed, 147 insertions, 51 deletions
diff --git a/tools/configure b/tools/configure
index adccc3e5e4..51a47971fd 100755
--- a/tools/configure
+++ b/tools/configure
@@ -1159,6 +1159,13 @@ voiceconfig () {
DEFAULT_TTS_OPTS=$GTTS_OPTS
DEFAULT_CHOICE="g"
fi
+ if [ -n "`findtool piper`" ]; then
+ PIPER="(p)iper "
+ PIPER_OPTS=""
+ DEFAULT_TTS="piper"
+ DEFAULT_TTS_OPTS=$PIPER_OPTS
+ DEFAULT_CHOICE="p"
+ fi
if [ -n "`findtool rbspeak`" ]; then
RBSPEAK="(O)ther "
RBSPEAK_OPTS=""
@@ -1167,15 +1174,15 @@ voiceconfig () {
DEFAULT_CHOICE="O"
fi
- if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC" ] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$GTTS" ] && [ "$GTTS" = "$RBSPEAK" ] ; then
- echo "You need Festival, eSpeak, Mimic, Flite, gtts, or rbspeak in your path, or SAPI available to build voice files"
+ if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC" ] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$GTTS" ] && [ "$GTTS" = "$PIPER" ] && [ "$PIPER" = "$RBSPEAK" ] ; then
+ echo "You need Festival, eSpeak, Mimic, Flite, piper, gtts, or rbspeak in your path, or SAPI available to build voice files"
exit 3
fi
if [ "$ARG_TTS" ]; then
option=$ARG_TTS
else
- echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${GTTS}${RBSPEAK}(${DEFAULT_CHOICE})?"
+ echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${GTTS}${RBSPEAK}${PIPER}(${DEFAULT_CHOICE})?"
option=`input`
if [ -z "$option" ]; then option=${DEFAULT_CHOICE}; fi
advopts="$advopts --tts=$option"
@@ -1209,6 +1216,10 @@ voiceconfig () {
TTS_ENGINE="gtts"
TTS_OPTS=$GTTS_OPTS
;;
+ [Pp]|piper)
+ TTS_ENGINE="piper"
+ TTS_OPTS=$PIPER_OPTS
+ ;;
[Oo]|rbspeak)
TTS_ENGINE="rbspeak"
TTS_OPTS=$RBSPEAK_OPTS
@@ -1247,6 +1258,39 @@ voiceconfig () {
advopts="$advopts --voice=$CHOICE"
echo "Festival voice set to $TTS_FESTIVAL_VOICE"
echo "(voice_$TTS_FESTIVAL_VOICE)" > festival-prolog.scm
+ elif [ "$TTS_ENGINE" = "piper" ]; then
+ if [ -z "$PIPER_MODEL_DIR" ]; then
+ echo "Please set PIPER_MODEL_DIR!";
+ exit 1
+ fi
+ models=`(cd $PIPER_MODEL_DIR ; ls -1 *onnx)`
+ for model in $models; do
+ PIPER_MODEL="$model" # Default
+ break;
+ done
+ if [ "$ARG_VOICE" ]; then
+ CHOICE=$ARG_VOICE
+ else
+ i=1
+ for model in $models; do
+ printf "%3d. %s\n" "$i" "$model"
+ i=`expr $i + 1`
+ done
+ printf "Please select which piper model to use (default is $PIPER_MODEL): "
+ CHOICE=`input`
+ fi
+ i=1
+ for model in $models; do
+ if [ "$i" = "$CHOICE" -o "$model" = "$CHOICE" ]; then
+ PIPER_MODEL="$model"
+ break;
+ fi
+ i=`expr $i + 1`
+ done
+
+ TTS_OPTS="$TTS_OPTS --model $PIPER_MODEL_DIR/$PIPER_MODEL"
+ advopts="$advopts --voice=$PIPER_MODEL"
+ echo "Piper model set to $PIPER_MODEL"
elif [ "$TTS_ENGINE" = "mimic" ]; then
voicelist=`mimic -lv | cut -d':' -f2`
for voice in $voicelist; do
@@ -1268,6 +1312,7 @@ voiceconfig () {
for voice in $voicelist; do
if [ "$i" = "$CHOICE" -o "$voice" = "$CHOICE" ]; then
TTS_MIMIC_VOICE="$voice"
+ break
fi
i=`expr $i + 1`
done
@@ -4756,6 +4801,7 @@ export ANDROID_NDK_PATH=${ANDROID_NDK_PATH}
export ANDROID_SDK_PATH=${ANDROID_SDK_PATH}
export ANDROID_PLATFORM_VERSION=${ANDROID_PLATFORM_VERSION}
export TOOLSET=${toolset}
+export PIPER_MODEL_DIR=${PIPER_MODEL_DIR}
$CCACHE_ARG
CONFIGURE_OPTIONS=${cmdline}
diff --git a/tools/voice.pl b/tools/voice.pl
index 6b3f807854..337407e187 100755
--- a/tools/voice.pl
+++ b/tools/voice.pl
@@ -42,7 +42,8 @@ Usage: voice.pl [options] [path to dir]
Specify which target you want to build voicefile for. Must include
any features that target supports.
- -f=<file> Use existing voiceids file
+ -f=<file>
+ Use existing voiceids file
-i=<target_id>
Numeric target id. Needed for voice building.
@@ -64,7 +65,8 @@ Usage: voice.pl [options] [path to dir]
Options to pass to the TTS engine. Enclose in double quotes if the
options include spaces.
- -F Force the file to be regenerated even if present
+ -F
+ Force the file to be regenerated even if present
-v
Be verbose
@@ -73,57 +75,78 @@ USAGE
}
my %festival_lang_map = (
- 'english' => 'english',
- 'english-us' => 'english',
- 'espanol' => 'spanish',
- #'finnish' => 'finnish'
- #'italiano' => 'italian',
- #'czech' => 'czech',
- #'welsh' => 'welsh'
+ 'english' => 'english',
+ 'english-us' => 'english',
+ 'espanol' => 'spanish',
+ #'finnish' => 'finnish'
+ #'italiano' => 'italian',
+ #'czech' => 'czech',
+ #'welsh' => 'welsh'
);
my %gtts_lang_map = (
'english' => '-l en -t co.uk', # Always first, it's the golden master
- 'czech' => '-l cs', # not supported
- 'dansk' => '-l da',
- 'deutsch' => '-l de',
- 'english-us' => '-l en -t us',
- 'espanol' => '-l es',
- 'francais' => '-l fr',
- 'greek' => '-l el',
- 'magyar' => '-l hu',
- 'italiano' => '-l it',
- 'nederlands' => '-l nl',
- 'norsk' => '-l no',
- 'polski' => '-l pl',
- 'russian' => '-l ru',
- 'slovak' => '-l sk',
- 'srpski' => '-l sr',
- 'svenska' => '-l sv',
- 'turkce' => '-l tr',
+ 'czech' => '-l cs',
+ 'dansk' => '-l da',
+ 'deutsch' => '-l de',
+ 'english-us' => '-l en -t us',
+ 'espanol' => '-l es',
+ 'francais' => '-l fr',
+ 'greek' => '-l el',
+ 'magyar' => '-l hu',
+ 'italiano' => '-l it',
+ 'nederlands' => '-l nl',
+ 'norsk' => '-l no',
+ 'polski' => '-l pl',
+ 'russian' => '-l ru',
+ 'slovak' => '-l sk',
+ 'srpski' => '-l sr',
+ 'svenska' => '-l sv',
+ 'turkce' => '-l tr',
);
my %espeak_lang_map = (
- 'english' => 'en-gb', # Always first, it's the golden master
- 'czech' => 'cs',
- 'dansk' => 'da',
- 'deutsch' => 'de',
- 'english-us' => 'en-us',
- 'espanol' => 'es',
- 'francais' => 'fr-fr',
- 'greek' => 'el',
- 'nederlands' => 'nl',
- 'magyar' => 'hu',
- 'italiano' => 'it',
- 'japanese' => 'ja',
- 'nederlands' => 'nl',
- 'norsk' => 'no',
- 'polski' => 'pl',
- 'russian' => 'ru',
- 'slovak' => 'sk',
- 'srpski' => 'sr',
- 'svenska' => 'sv',
- 'turkce' => 'tr',
+ 'english' => '-ven-gb -k 5', # Always first, it's the golden master
+ 'czech' => '-vcs',
+ 'dansk' => '-vda',
+ 'deutsch' => '-vde',
+ 'english-us' => '-ven-us -k 5',
+ 'espanol' => '-ves',
+ 'francais' => '-vfr-fr',
+ 'greek' => '-vel',
+ 'magyar' => '-vhu',
+ 'italiano' => '-vit',
+ 'japanese' => '-vja',
+ 'nederlands' => '-vnl',
+ 'norsk' => '-vno',
+ 'polski' => '-vpl',
+ 'russian' => '-vru',
+ 'slovak' => '-vsk',
+ 'srpski' => '-vsr',
+ 'svenska' => '-vsv',
+ 'turkce' => '-vtr',
+ );
+
+my %piper_lang_map = (
+ 'english' => 'en_GB-cori-high.onnx', # Always first, it's the golden master
+ 'czech' => 'cs_CZ-jirka-medium.onnx',
+ 'dansk' => 'da_DK-talesyntese-medium.onnx',
+ 'deutsch' => 'de_DE-thorsten-high.onnx',
+ 'english-us' => 'en_US-libritts-high.onnx',
+ 'espanol' => 'es_ES-sharvard-medium.onnx',
+ 'francais' => 'fr_FR-siwis-medium.onnx',
+ 'greek' => 'el_GR-rapunzelina-low.onnx',
+# 'magyar' => '-vhu',
+ 'italiano' => 'it_IT-riccardo-x_low.onnx',
+# 'japanese' => '-vja',
+ 'nederlands' => 'nl_NL-mls-medium.onnx',
+ 'norsk' => 'no_NO-talesyntese-medium.onnx',
+ 'polski' => 'pl_PL-gosia-medium.onnx',
+ 'russian' => 'ru_RU-irina-medium.onnx',
+ 'slovak' => 'sk_SK-lili-medium.onnx',
+ 'srpski' => 'sr_RS-serbski_institut-medium.onnx',
+ 'svenska' => 'sv_SE-nst-medium.onnx',
+ 'turkce' => 'tr_TR-fettah-medium.onnx',
);
my $trim_thresh = 500; # Trim silence if over this, in ms
@@ -141,6 +164,7 @@ sub init_tts {
# Don't use given/when here - it's not compatible with old perl versions
if ($tts_engine eq 'festival') {
print("> festival $tts_engine_opts --server\n") if $verbose;
+ # Open command, and filehandles for STDIN, STDOUT, STDERR
my $pid = open(FESTIVAL_SERVER, "| festival $tts_engine_opts --server > /dev/null 2>&1");
my $dummy = *FESTIVAL_SERVER; #suppress warning
$SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); };
@@ -149,6 +173,21 @@ sub init_tts {
if (defined($festival_lang_map{$language}) && $tts_engine_opts !~ /--language/) {
$ret{"ttsoptions"} = "--language $festival_lang_map{$language} ";
}
+ } elsif ($tts_engine eq 'piper') {
+ my $cmd = "piper $tts_engine_opts --json-input";
+ print("> $cmd\n") if $verbose;
+
+ my $pid = open3(*CMD_IN, *CMD_OUT, *CMD_ERR, $cmd);
+ $SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); };
+ $SIG{KILL} = sub { kill TERM => $pid; print("boo"); panic_cleanup(); };
+ $ret{"pid"} = $pid;
+ binmode(*CMD_IN, ':encoding(utf8)');
+ binmode(*CMD_OUT, ':encoding(utf8)');
+ binmode(*CMD_ERR, ':encoding(utf8)');
+ if (defined($piper_lang_map{$language}) && $tts_engine_opts !~ /--model/) {
+ die("Need PIPER_MODEL_DIR\n") if (!defined($ENV{'PIPER_MODEL_DIR'}));
+ $ret{"ttsoptions"} = "--model $ENV{PIPER_MODEL_DIR}/$piper_lang_map{$language} ";
+ }
} elsif ($tts_engine eq 'sapi') {
my $toolsdir = dirname($0);
my $path = `cygpath $toolsdir -a -w`;
@@ -176,7 +215,7 @@ sub init_tts {
}
} elsif ($tts_engine eq 'espeak' || $tts_engine eq 'espeak-ng') {
if (defined($espeak_lang_map{$language}) && $tts_engine_opts !~ /-v/) {
- $ret{"ttsoptions"} = "-v$espeak_lang_map{$language} ";
+ $ret{"ttsoptions"} = " $espeak_lang_map{$language} ";
}
}
@@ -190,6 +229,10 @@ sub shutdown_tts {
# Send SIGTERM to festival server
kill TERM => $$tts_object{"pid"};
}
+ elsif ($$tts_object{'name'} eq 'piper') {
+ # Send SIGTERM to piper
+ kill TERM => $$tts_object{"pid"};
+ }
elsif ($$tts_object{'name'} eq 'sapi') {
print({$$tts_object{"stdin"}} "QUIT\r\n");
close($$tts_object{"stdin"});
@@ -244,6 +287,13 @@ sub voicestring {
close(CMD_OUT);
close(CMD_ERR);
}
+ elsif ($name eq 'piper') {
+ $cmd = "{ \"text\": \"$string\", \"output_file\": \"$output\" }";
+ print(">> $cmd\n") if $verbose;
+ print(CMD_IN "$cmd\n");
+ my $res = <CMD_OUT>;
+ $res = <CMD_ERR>;
+ }
elsif ($name eq 'flite') {
$cmd = "flite $tts_engine_opts -t \"$string\" \"$output\"";
print("> $cmd\n") if $verbose;
@@ -469,7 +519,6 @@ sub generateclips {
print("\n");
unlink($updfile) if (-f $updfile);
- shutdown_tts($tts_object);
}
# Assemble the voicefile
@@ -608,6 +657,7 @@ if ($V == 1) {
defined($t) ? $t : "unknown",
$l, $e, $E, $s, $S);
generateclips($l, $t, $e, $E, $tts_object, $S, $f);
+ shutdown_tts($tts_object);
createvoice($l, $i, $f);
deleteencs();
} elsif ($C) {