speech.espeak.espeak source code

1 /***************************************************************************
2  *   Copyright (C) 2005 to 2012 by Jonathan Duddington                     *
3  *   email: jonsd@users.sourceforge.net                                    *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 3 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, see:                                 *
17  *               <http://www.gnu.org/licenses/>.                           *
18  ***************************************************************************/
19 
20 
21 /*************************************************************/
22 /* This is the header file for the library version of espeak */
23 /*                                                           */
24 /*************************************************************/
25 module speech.espeak.espeak;
26 
27 import core.stdc.stdio;
28 import core.stdc.stddef;
29 
30 extern(C):
31 
32 enum ESPEAK_API_REVISION = 9; // current API revision; the changes made in each revision are listed below
33 
34 /*
35 Revision 2
36    Added parameter "options" to eSpeakInitialize()
37 
38 Revision 3
39    Added espeakWORDGAP to  espeak_PARAMETER
40 
41 Revision 4
42    Added flags parameter to espeak_CompileDictionary()
43 
44 Revision 5
45    Added espeakCHARS_16BIT
46 
47 Revision 6
48   Added macros: espeakRATE_MINIMUM, espeakRATE_MAXIMUM, espeakRATE_NORMAL
49 
50 Revision 7  24.Dec.2011
51   Changed espeak_EVENT structure to add id.string[] for phoneme mnemonics.
52   Added espeakINITIALIZE_PHONEME_IPA option for espeak_Initialize() to report phonemes as IPA names.
53 
54 Revision 8  26.Apr.2013
55   Added function espeak_TextToPhonemes().
56 
57 Revision 9  30.May.2013
58   Changed function espeak_TextToPhonemes().
59 
60 */
61          /********************/
62          /*  Initialization  */
63          /********************/
64 
65 // values for 'value' in espeak_SetParameter(espeakRATE, value, 0), nominally in words-per-minute
66 enum espeakRATE_MINIMUM  = 80;  // lower limit of the rate range
67 enum espeakRATE_MAXIMUM = 450;  // upper limit of the rate range
68 enum espeakRATE_NORMAL = 175;   // normal speaking rate
69 
70 enum { // kinds of event reported through espeak_EVENT.type
71     espeakEVENT_LIST_TERMINATED = 0, // Retrieval mode: terminates the event list.
72     espeakEVENT_WORD = 1,            // Start of word
73     espeakEVENT_SENTENCE = 2,        // Start of sentence
74     espeakEVENT_MARK = 3,            // Mark
75     espeakEVENT_PLAY = 4,            // Audio element
76     espeakEVENT_END = 5,             // End of sentence or clause
77     espeakEVENT_MSG_TERMINATED = 6,  // End of message
78     espeakEVENT_PHONEME = 7,         // Phoneme, if enabled in espeak_Initialize()
79     espeakEVENT_SAMPLERATE = 8       // internal use, set sample rate
80 }
81 
82 alias espeak_EVENT_TYPE = int; // holds one of the espeakEVENT_* constants above
83 
84 struct espeak_EVENT { // one entry of the event list passed to the synth callback
85 	espeak_EVENT_TYPE type; // one of the espeakEVENT_* constants; determines which union member below is meaningful
86 	uint unique_identifier; // message identifier (or 0 for key or character)
87 	int text_position;    // the number of characters from the start of the text
88 	int length;           // word length, in characters (for espeakEVENT_WORD)
89 	int audio_position;   // the time in mS within the generated speech output data
90 	int sample;           // sample id (internal use)
91 	void* user_data;      // pointer supplied by the calling program
92 	union { // anonymous union: its members are accessed directly as fields of espeak_EVENT
93 		int number;        // used for WORD and SENTENCE events.
94 		const(char)* name;  // used for MARK and PLAY events.  UTF8 string
95 		char[8] string;    // used for phoneme names (UTF8). Terminated by a zero byte unless the name needs the full 8 bytes.
96 	}
97 }
98 /*
99    When a message is supplied to espeak_synth, the request is buffered and espeak_synth returns. When the message is really processed, the callback function will be repeatedly called.
100 
101 
102    In RETRIEVAL mode, the callback function supplies to the calling program the audio data and an event list terminated by 0 (LIST_TERMINATED).
103 
104    In PLAYBACK mode, the callback function is called as soon as an event happens.
105 
106    For example suppose that the following message is supplied to espeak_Synth:
107    "hello, hello."
108 
109 
110    * Once processed in RETRIEVAL mode, it could lead to 3 calls of the callback function :
111 
112    ** Block 1:
113    <audio data> +
114    List of events: SENTENCE + WORD + LIST_TERMINATED
115 
116    ** Block 2:
117    <audio data> +
118    List of events: WORD + END + LIST_TERMINATED
119 
120    ** Block 3:
121    no audio data
122    List of events: MSG_TERMINATED + LIST_TERMINATED
123 
124 
125    * Once processed in PLAYBACK mode, it could lead to 5 calls of the callback function:
126 
127    ** SENTENCE
128    ** WORD (call when the sounds are actually played)
129    ** WORD
130    ** END (call when the end of sentence is actually played.)
131    ** MSG_TERMINATED
132 
133 
134    The MSG_TERMINATED event is the last event. It can inform the calling program to clear the user data related to the message.
135    So if the synthesis must be stopped, the callback function is called for each pending message with the MSG_TERMINATED event.
136 
137    A MARK event indicates a <mark> element in the text.
138    A PLAY event indicates an <audio> element in the text, for which the calling program should play the named sound file.
139 */
140 enum { // units in which the "position" argument of espeak_Synth() is counted
141 	POS_CHARACTER = 1, // position is a number of characters
142 	POS_WORD = 2,      // position is a number of words
143 	POS_SENTENCE = 3   // position is a number of sentences
144 }
145 
146 alias espeak_POSITION_TYPE = int; // holds one of the POS_* constants above
147 
148 enum {
149 	/* PLAYBACK mode: eSpeak plays the audio data and supplies events to the calling program */
150 	AUDIO_OUTPUT_PLAYBACK = 0,
151 
152 	/* RETRIEVAL mode: both audio data and events are supplied to the calling program */
153 	AUDIO_OUTPUT_RETRIEVAL = 1,
154 
155 	/* SYNCHRONOUS mode: like RETRIEVAL, but does not return until synthesis is completed */
156 	AUDIO_OUTPUT_SYNCHRONOUS = 2,
157 
158 	/* Synchronous playback */
159 	AUDIO_OUTPUT_SYNCH_PLAYBACK = 3
160 }
161 
162 alias espeak_AUDIO_OUTPUT = int; // mode selector passed as "output" to espeak_Initialize()
163 
164 enum { // status codes returned by the espeak_* functions
165 	EE_OK=0,              // operation achieved
166 	EE_INTERNAL_ERROR=-1,
167 	EE_BUFFER_FULL=1,     // the command can not be buffered; the call may be retried after a while
168 	EE_NOT_FOUND=2
169 }
170 
171 alias espeak_ERROR = int; // holds one of the EE_* constants above
172 
173 enum espeakINITIALIZE_PHONEME_EVENTS = 0x0001; // options bit 0: allow espeakEVENT_PHONEME events
174 enum espeakINITIALIZE_PHONEME_IPA = 0x0002;    // options bit 1: phoneme events give IPA names, not eSpeak names
175 enum espeakINITIALIZE_DONT_EXIT = 0x8000;      // options bit 15: don't exit if espeak_data is not found
176 
177 int espeak_Initialize(espeak_AUDIO_OUTPUT output, int buflength, const(char)* path, int options); // returns the sample rate in Hz, or -1 (EE_INTERNAL_ERROR)
178 /* Must be called before any synthesis functions are called.
179    output: the audio data can either be played by eSpeak or passed back by the SynthCallback function.
180 
181    buflength:  The length in mS of sound buffers passed to the SynthCallback function.
182             Value=0 gives a default of 200mS.
183             This parameter is only used for AUDIO_OUTPUT_RETRIEVAL and AUDIO_OUTPUT_SYNCHRONOUS modes.
184 
185    path: The directory which contains the espeak-data directory, or NULL for the default location.
186 
187    options: bit 0:  1=allow espeakEVENT_PHONEME events.
188             bit 1:  1= espeakEVENT_PHONEME events give IPA phoneme names, not eSpeak phoneme names
189             bit 15: 1=don't exit if espeak_data is not found (used for --help)
190 
191    Returns: sample rate in Hz, or -1 (EE_INTERNAL_ERROR).
192 */
193 alias t_espeak_callback = int function(short*, int, espeak_EVENT*); // (wav, numsamples, events); returns 0=continue synthesis, 1=abort synthesis
194 
195 void espeak_SetSynthCallback(t_espeak_callback SynthCallback); // sets the function called when a buffer of speech sound data has been produced
196 /* Must be called before any synthesis functions are called.
197    This specifies a function in the calling program which is called when a buffer of
198    speech sound data has been produced.
199 
200 
201    The callback function is of the form:
202 
203 int SynthCallback(short *wav, int numsamples, espeak_EVENT *events);
204 
205    wav:  is the speech sound data which has been produced.
206       NULL indicates that the synthesis has been completed.
207 
208    numsamples: is the number of entries in wav.  This number may vary, may be less than
209       the value implied by the buflength parameter given in espeak_Initialize, and may
210       sometimes be zero (which does NOT indicate end of synthesis).
211 
212    events: an array of espeak_EVENT items which indicate word and sentence events, and
213       also the occurrence of <mark> and <audio> elements within the text.  The list of
214       events is terminated by an event of type = 0.
215 
216 
217    Callback returns: 0=continue synthesis,  1=abort synthesis.
218 */
219 
220 alias t_espeak_uri_callback = int function(int, const(char)*, const(char)*); // (type, uri, base); returns 1=speak the text alternative, 0=place a PLAY event
221 
222 void espeak_SetUriCallback(t_espeak_uri_callback UriCallback); // sets the function called when an <audio> element is encountered
223 /* This function may be called before synthesis functions are used, in order to deal with
224    <audio> tags.  It specifies a callback function which is called when an <audio> element is
225    encountered and allows the calling program to indicate whether the sound file which
226    is specified in the <audio> element is available and is to be played.
227 
228    The callback function is of the form:
229 
230 int UriCallback(int type, const char *uri, const char *base);
231 
232    type:  type of callback event.  Currently only 1= <audio> element
233 
234    uri:   the "src" attribute from the <audio> element
235 
236    base:  the "xml:base" attribute (if any) from the <speak> element
237 
238    Return: 1=don't play the sound, but speak the text alternative.
239            0=place a PLAY event in the event list at the point where the <audio> element
240              occurs.  The calling program can then play the sound at that point.
241 */
242 
243 
244          /********************/
245          /*    Synthesis     */
246          /********************/
247 
248 
249 enum espeakCHARS_AUTO = 0;  // 8 bit or UTF8 (this is the default)
250 enum espeakCHARS_UTF8 = 1;  // UTF8 encoding
251 enum espeakCHARS_8BIT = 2;  // the 8 bit ISO-8859 character set for the particular language
252 enum espeakCHARS_WCHAR = 3; // wide characters (wchar_t)
253 enum espeakCHARS_16BIT = 4; // 16 bit characters
254 
255 enum espeakSSML = 0x10;            // elements within < > are treated as SSML elements
256 enum espeakPHONEMES = 0x100;       // text within [[ ]] is treated as phoneme codes
257 enum espeakENDPAUSE = 0x1000;      // a sentence pause is added at the end of the text
258 enum espeakKEEP_NAMEDATA = 0x2000; // NOTE(review): not described in the espeak_Synth() flag list in this file - confirm meaning
259 
260 espeak_ERROR espeak_Synth(const(void)* text, // text to speak, zero-terminated; encoding is selected by "flags"
261 	size_t size,                         // size of the text data in bytes (or greater)
262 	uint position,                       // where speaking starts; 0 = start of the text
263 	espeak_POSITION_TYPE position_type,  // whether "position" counts characters, words or sentences
264 	uint end_position,                   // character position at which speaking stops; 0 = no end position
265 	uint flags,                          // character-code type (espeakCHARS_*) OR'd with option flags such as espeakSSML
266 	uint* unique_identifier,             // NULL, or points to a variable that receives the message identifier
267 	void* user_data);                    // pointer (or NULL) passed to the callback in espeak_EVENT messages
268 /* Synthesize speech for the specified text.  The speech sound data is passed to the calling
269    program in buffers by means of the callback function specified by espeak_SetSynthCallback(). The command is asynchronous: it is internally buffered and returns as soon as possible. If espeak_Initialize was previously called with AUDIO_OUTPUT_PLAYBACK as argument, the sound data are played by eSpeak.
270 
271    text: The text to be spoken, terminated by a zero character. It may be either 8-bit characters,
272       wide characters (wchar_t), or UTF8 encoding.  Which of these is determined by the "flags"
273       parameter.
274 
275    size: Equal to (or greater than) the size of the text data, in bytes.  This is used in order
276       to allocate internal storage space for the text.  This value is not used for
277       AUDIO_OUTPUT_SYNCHRONOUS mode.
278 
279    position:  The position in the text where speaking starts. Zero indicates speak from the
280       start of the text.
281 
282    position_type:  Determines whether "position" is a number of characters, words, or sentences.
283       Values: POS_CHARACTER, POS_WORD or POS_SENTENCE.
284 
285    end_position:  If set, this gives a character position at which speaking will stop.  A value
286       of zero indicates no end position.
287 
288    flags:  These may be OR'd together:
289       Type of character codes, one of:
290          espeakCHARS_UTF8     UTF8 encoding
291          espeakCHARS_8BIT     The 8 bit ISO-8859 character set for the particular language.
292          espeakCHARS_AUTO     8 bit or UTF8  (this is the default)
293          espeakCHARS_WCHAR    Wide characters (wchar_t)
294          espeakCHARS_16BIT    16 bit characters.
295 
296       espeakSSML   Elements within < > are treated as SSML elements, or if not recognised are ignored.
297 
298       espeakPHONEMES  Text within [[ ]] is treated as phoneme codes (in espeak's Kirshenbaum encoding).
299 
300       espeakENDPAUSE  If set then a sentence pause is added at the end of the text.  If not set then
301          this pause is suppressed.
302 
303    unique_identifier: This must be either NULL, or point to an integer variable to
304        which eSpeak writes a message identifier number.
305        eSpeak includes this number in espeak_EVENT messages which are the result of
306        this call of espeak_Synth().
307 
308    user_data: a pointer (or NULL) which will be passed to the callback function in
309        espeak_EVENT messages.
310 
311    Return: EE_OK: operation achieved
312            EE_BUFFER_FULL: the command can not be buffered;
313              you may try after a while to call the function again.
314 	   EE_INTERNAL_ERROR.
315 */
316 
317 espeak_ERROR espeak_Synth_Mark(const(void)* text, // as for espeak_Synth()
318 	size_t size,                 // as for espeak_Synth()
319 	const(char)* index_mark,     // "name" attribute of the <mark> element at which synthesis starts; UTF8 string
320 	uint end_position,           // as for espeak_Synth()
321 	uint flags,                  // as for espeak_Synth()
322 	uint* unique_identifier,     // as for espeak_Synth()
323 	void* user_data);            // as for espeak_Synth()
324 /* Synthesize speech for the specified text.  Similar to espeak_Synth() but the start position is
325    specified by the name of a <mark> element in the text.
326 
327    index_mark:  The "name" attribute of a <mark> element within the text which specified the
328       point at which synthesis starts.  UTF8 string.
329 
330    For the other parameters, see espeak_Synth()
331 
332    Return: EE_OK: operation achieved
333            EE_BUFFER_FULL: the command can not be buffered;
334              you may try after a while to call the function again.
335 	   EE_INTERNAL_ERROR.
336 */
337 
338 espeak_ERROR espeak_Key(const(char)* key_name); // speaks the name of a keyboard key; returns an EE_* status
339 /* Speak the name of a keyboard key.
340    If key_name is a single character, it speaks the name of the character.
341    Otherwise, it speaks key_name as a text string.
342 
343    Return: EE_OK: operation achieved
344            EE_BUFFER_FULL: the command can not be buffered;
345              you may try after a while to call the function again.
346 	   EE_INTERNAL_ERROR.
347 */
348 
349 espeak_ERROR espeak_Char(wchar_t character); // speaks the name of the given character; returns an EE_* status
350 /* Speak the name of the given character
351 
352    Return: EE_OK: operation achieved
353            EE_BUFFER_FULL: the command can not be buffered;
354              you may try after a while to call the function again.
355 	   EE_INTERNAL_ERROR.
356 */
357 
358 
359 
360 
361          /***********************/
362          /*  Speech Parameters  */
363          /***********************/
364 
365 enum { // parameter selectors for espeak_SetParameter() / espeak_GetParameter()
366   espeakSILENCE=0, /* internal use */
367   espeakRATE=1,        // speaking speed
368   espeakVOLUME=2,      // volume
369   espeakPITCH=3,       // base pitch
370   espeakRANGE=4,       // pitch range
371   espeakPUNCTUATION=5, // which punctuation characters to announce
372   espeakCAPITALS=6,    // how capital letters are announced
373   espeakWORDGAP=7,     // pause between words
374   espeakOPTIONS=8,   // reserved for misc. options.  not yet used
375   espeakINTONATION=9,
376 
377   espeakRESERVED1=10,
378   espeakRESERVED2=11,
379   espeakEMPHASIS=12,   /* internal use */
380   espeakLINELENGTH=13, /* internal use */
381   espeakVOICETYPE=14,  // internal, 1=mbrola
382   N_SPEECH_PARAM=15    /* last enum */
383 }
384 
385 alias espeak_PARAMETER = int; // holds one of the parameter constants above
386 
387 enum { // values for the espeakPUNCTUATION parameter
388   espeakPUNCT_NONE=0, // announce no punctuation
389   espeakPUNCT_ALL=1,  // announce all punctuation
390   espeakPUNCT_SOME=2  // announce only the characters given to espeak_SetPunctuationList()
391 }
392 
393 alias  espeak_PUNCT_TYPE = int; // holds one of the espeakPUNCT_* constants above
394 
395 espeak_ERROR espeak_SetParameter(espeak_PARAMETER parameter, int value, int relative); // relative=0 sets an absolute value, relative=1 a relative value
396 /* Sets the value of the specified parameter.
397    relative=0   Sets the absolute value of the parameter.
398    relative=1   Sets a relative value of the parameter.
399 
400    parameter:
401       espeakRATE:    speaking speed in words per minute.  Values 80 to 450.
402 
403       espeakVOLUME:  volume in range 0-200 or more.
404                      0=silence, 100=normal full volume, greater values may produce amplitude compression or distortion
405 
406       espeakPITCH:   base pitch, range 0-100.  50=normal
407 
408       espeakRANGE:   pitch range, range 0-100. 0-monotone, 50=normal
409 
410       espeakPUNCTUATION:  which punctuation characters to announce:
411          value in espeak_PUNCT_TYPE (none, all, some),
412          see espeak_SetPunctuationList() to specify which characters are announced.
413 
414       espeakCAPITALS: announce capital letters by:
415          0=none,
416          1=sound icon,
417          2=spelling,
418          3 or higher, by raising pitch.  This value gives the amount in Hz by which the pitch
419             of a word is raised to indicate it has a capital letter.
420 
421       espeakWORDGAP:  pause between words, units of 10mS (at the default speed)
422 
423    Return: EE_OK: operation achieved
424            EE_BUFFER_FULL: the command can not be buffered;
425              you may try after a while to call the function again.
426 	   EE_INTERNAL_ERROR.
427 */
428 
429 int espeak_GetParameter(espeak_PARAMETER parameter, int current); // current=0 gives the default value, current=1 the current value
430 /* current=0  Returns the default value of the specified parameter.
431    current=1  Returns the current value of the specified parameter, as set by SetParameter()
432 */
433 
434 espeak_ERROR espeak_SetPunctuationList(const(wchar_t)* punctlist); // punctlist: list of character codes, terminated by a zero character
435 /* Specifies a list of punctuation characters whose names are to be spoken when the
436    value of the Punctuation parameter is set to "some".
437 
438    punctlist:  A list of character codes, terminated by a zero character.
439 
440    Return: EE_OK: operation achieved
441            EE_BUFFER_FULL: the command can not be buffered;
442              you may try after a while to call the function again.
443 	   EE_INTERNAL_ERROR.
444 */
445 
446 void espeak_SetPhonemeTrace(int value, FILE *stream); // value selects the phoneme output mode; stream=NULL uses stdout
447 /* Controls the output of phoneme symbols for the text
448    value=0  No phoneme output (default)
449    value=1  Output the translated phoneme symbols for the text
450    value=2  as (1), but also output a trace of how the translation was done (matching rules and list entries)
451    value=3  as (1), but produces IPA rather than ascii phoneme names
452 
453    stream   output stream for the phoneme symbols (and trace).  If stream=NULL then it uses stdout.
454 */
455 
456 const(char)* espeak_TextToPhonemes(const(void)** textptr, int textmode, int phonememode); // returns the phoneme string; advances *textptr, or sets it to NULL at the end of the text
457 /* Translates text into phonemes.  Call espeak_SetVoiceByName() first, to select a language.
458 
459    It returns a pointer to a character string which contains the phonemes for the text up to
460    end of a sentence, or comma, semicolon, colon, or similar punctuation.
461 
462    textptr: The address of a pointer to the input text which is terminated by a zero character.
463       On return, the pointer has been advanced past the text which has been translated, or else set
464       to NULL to indicate that the end of the text has been reached.
465 
466    textmode: Type of character codes, one of:
467          espeakCHARS_UTF8     UTF8 encoding
468          espeakCHARS_8BIT     The 8 bit ISO-8859 character set for the particular language.
469          espeakCHARS_AUTO     8 bit or UTF8  (this is the default)
470          espeakCHARS_WCHAR    Wide characters (wchar_t)
471          espeakCHARS_16BIT    16 bit characters.
472 
473    phonememode: bits0-3:
474       0= just phonemes.
475       1= include ties (U+361) for phoneme names of more than one letter.
476       2= include zero-width-joiner for phoneme names of more than one letter.
477       3= separate phonemes with underscore characters.
478 
479 	 bits 4-7:
480       0= eSpeak's ascii phoneme names.
481       1= International Phonetic Alphabet (as UTF-8 characters).
482 */
483 
484 void espeak_CompileDictionary(const(char)* path, FILE *log, int flags); // compiles the dictionary for the currently selected voice; log=NULL uses stderr
485 /* Compile pronunciation dictionary for a language which corresponds to the currently
486    selected voice.  The required voice should be selected before calling this function.
487 
488    path:  The directory which contains the language's '_rules' and '_list' files.
489           'path' should end with a path separator character ('/').
490    log:   Stream for error reports and statistics information. If log=NULL then stderr will be used.
491 
492    flags:  Bit 0: include source line information for debug purposes (This is displayed with the
493           -X command line option).
494 */
495          /***********************/
496          /*   Voice Selection   */
497          /***********************/
498 
499 
500 // voice table: describes an available voice, or gives selection criteria to espeak_SetVoiceByProperties()
501 struct espeak_VOICE {
502 	immutable(char)* name;      // a given name for this voice. UTF8 string.
503 	immutable(char)* languages;       // list of pairs of (byte) priority + (string) language (and dialect qualifier)
504 	const(char)* identifier;      // the filename for this voice within espeak-data/voices
505 	char gender = 0;  // 0=none 1=male, 2=female. Explicit "= 0": D's char.init is 0xFF, which would not mean "none"
506 	char age = 0;     // 0=not specified, or age in years. Explicit "= 0" for the same reason
507 	char variant = 0; // only used when passed as a parameter to espeak_SetVoiceByProperties; 0 takes the best match
508 	char xx1 = 0;     // for internal use
509 	int score;       // for internal use
510 	void *spare;     // for internal use
511 }
512 
513 /* Note: The espeak_VOICE structure is used for two purposes:
514   1.  To return the details of the available voices.
515   2.  As a parameter to  espeak_SetVoiceByProperties() in order to specify selection criteria.
516 
517    In (1), the "languages" field consists of a list of (UTF8) language names for which this voice
518    may be used, each language name in the list is terminated by a zero byte and is also preceded by
519    a single byte which gives a "priority" number.  The list of languages is terminated by an
520    additional zero byte.
521 
522    A language name consists of a language code, optionally followed by one or more qualifier (dialect)
523    names separated by hyphens (eg. "en-uk").  A voice might, for example, have languages "en-uk" and
524    "en".  Even without "en" listed, voice would still be selected for the "en" language (because
525    "en-uk" is related) but at a lower priority.
526 
527    The priority byte indicates how the voice is preferred for the language. A low number indicates a
528    more preferred voice, a higher number indicates a less preferred voice.
529 
530    In (2), the "languages" field consists simply of a single (UTF8) language name, with no preceding
531    priority byte.
532 */
533 
534 const(espeak_VOICE)** espeak_ListVoices(espeak_VOICE *voice_spec); // returns an array of voice pointers terminated by NULL; voice_spec=NULL lists all voices
535 /* Reads the voice files from espeak-data/voices and creates an array of espeak_VOICE pointers.
536    The list is terminated by a NULL pointer
537 
538    If voice_spec is NULL then all voices are listed.
539    If voice spec is given, then only the voices which are compatible with the voice_spec
540    are listed, and they are listed in preference order.
541 */
542 
543 espeak_ERROR espeak_SetVoiceByName(const(char)* name); // searches for a voice whose "name" field matches (UTF8); language is not considered
544 /* Searches for a voice with a matching "name" field.  Language is not considered.
545    "name" is a UTF8 string.
546 
547    Return: EE_OK: operation achieved
548            EE_BUFFER_FULL: the command can not be buffered;
549              you may try after a while to call the function again.
550 	   EE_INTERNAL_ERROR.
551 */
552 
553 espeak_ERROR espeak_SetVoiceByProperties(espeak_VOICE *voice_spec); // selects a voice using the criteria set in voice_spec
554 /* An espeak_VOICE structure is used to pass criteria to select a voice.  Any of the following
555    fields may be set:
556 
557    name     NULL, or a voice name
558 
559    languages  NULL, or a single language string (with optional dialect), eg. "en-uk", or "en"
560 
561    gender   0=not specified, 1=male, 2=female
562 
563    age      0=not specified, or an age in years
564 
565    variant  After a list of candidates is produced, scored and sorted, "variant" is used to index
566             that list and choose a voice.
567             variant=0 takes the top voice (i.e. best match). variant=1 takes the next voice, etc
568 */
569 
570 espeak_VOICE *espeak_GetCurrentVoice(); // returns the espeak_VOICE data for the currently selected voice
571 /* Returns the espeak_VOICE data for the currently selected voice.
572    This is not affected by temporary voice changes caused by SSML elements such as <voice> and <s>
573 */
574 
575 espeak_ERROR espeak_Cancel(); // stops synthesis and audio output of the current text; returns EE_OK or EE_INTERNAL_ERROR
576 /* Immediately stop synthesis and audio output of the current text. When this
577    function returns, the audio output is fully stopped and the synthesizer is ready to
578    synthesize a new message.
579 
580    Return: EE_OK: operation achieved
581 	   EE_INTERNAL_ERROR.
582 */
583 
584 int espeak_IsPlaying(); // 1 if audio is played, 0 otherwise
585 /* Returns 1 if audio is played, 0 otherwise.
586 */
587 
588 espeak_ERROR espeak_Synchronize(); // returns when all data have been spoken; EE_OK or EE_INTERNAL_ERROR
589 /* This function returns when all data have been spoken.
590    Return: EE_OK: operation achieved
591 	   EE_INTERNAL_ERROR.
592 */
593 
594 espeak_ERROR espeak_Terminate(); // last function to be called; returns EE_OK or EE_INTERNAL_ERROR
595 /* last function to be called.
596    Return: EE_OK: operation achieved
597 	   EE_INTERNAL_ERROR.
598 */
599 
600 const(char)* espeak_Info(const(char)** path_data); // returns the version number string; path_data returns the path to espeak_data
601 /* Returns the version number string.
602    path_data  returns the path to espeak_data
603 */
604