PocketSphinx 5prealpha
ngram_search.h
Go to the documentation of this file.
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 2008 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37
42#ifndef __NGRAM_SEARCH_H__
43#define __NGRAM_SEARCH_H__
44
45/* SphinxBase headers. */
46#include <sphinxbase/cmd_ln.h>
47#include <sphinxbase/logmath.h>
48#include <sphinxbase/ngram_model.h>
49#include <sphinxbase/listelem_alloc.h>
50#include <sphinxbase/err.h>
51
52/* Local headers. */
54#include "hmm.h"
55
64typedef struct chan_s {
68 struct chan_s *next;
71 struct chan_s *alt;
73 int32 ciphone;
74 union {
79 int32 rc_id;
80 } info;
82
90typedef struct root_chan_s {
96 int32 penult_phn_wid;
100 int16 ciphone;
102 int16 ci2phone;
105
109typedef struct bptbl_s {
111 uint8 valid;
112 uint8 refcnt;
113 int32 wid;
114 int32 bp;
115 int32 score;
116 int32 s_idx;
117 int32 real_wid;
122
126typedef struct bptbl_seg_s {
128 int32 *bpidx;
129 int16 n_bpidx;
130 int16 cur;
132
133/*
134 * Candidates words for entering their last phones. Cleared and rebuilt in each
135 * frame.
136 * NOTE: candidates can only be multi-phone, real dictionary words.
137 */
138typedef struct lastphn_cand_s {
139 int32 wid;
140 int32 score;
141 int32 bp;
142 int32 next; /* next candidate starting at the same frame */
144
145/*
146 * Since the same instance of a word (i.e., <word,start-frame>) reaches its last
147 * phone several times, we can compute its best BP and LM transition score info
148 * just the first time and cache it for future occurrences. Structure for such
149 * a cache.
150 */
151typedef struct {
152 int32 sf; /* Start frame */
153 int32 dscr; /* Delta-score upon entering last phone */
154 int32 bp; /* Best BP */
156
157#define CAND_SF_ALLOCSIZE 32
158typedef struct {
159 int32 bp_ef;
160 int32 cand;
161} cand_sf_t;
162
163/*
164 * Structure for reorganizing the BP table entries in the current frame according
165 * to distinct right context ci-phones. Each entry contains the best BP entry for
166 * a given right context. Each successor word will pick up the correct entry based
167 * on its first ci-phone.
168 */
169typedef struct bestbp_rc_s {
170 int32 score;
171 int32 path; /* BP table index corresponding to this entry */
172 int32 lc; /* right most ci-phone of above BP entry word */
174
175#define NO_BP -1
176
180typedef struct ngram_search_stats_s {
181 int32 n_phone_eval;
182 int32 n_root_chan_eval;
183 int32 n_nonroot_chan_eval;
184 int32 n_last_chan_eval;
185 int32 n_word_lastchan_eval;
186 int32 n_lastphn_cand_utt;
187 int32 n_fwdflat_chan;
188 int32 n_fwdflat_words;
189 int32 n_fwdflat_word_transition;
190 int32 n_senone_active_utt;
192
193
198 ps_search_t base;
199 ngram_model_t *lmset;
202 /* Flags to quickly indicate which passes are enabled. */
203 uint8 fwdtree;
204 uint8 fwdflat;
205 uint8 bestpath;
206
207 /* State of procesing. */
208 uint8 done;
209
210 /* Allocators */
211 listelem_alloc_t *chan_alloc;
212 listelem_alloc_t *root_chan_alloc;
213 listelem_alloc_t *latnode_alloc;
247 bitvec_t *word_active;
276 int32 n_active_chan[2];
288 int32 n_active_word[2];
290 /*
291 * FIXME: Document all of these bits.
292 */
293 lastphn_cand_t *lastphn_cand;
294 int32 n_lastphn_cand;
295 last_ltrans_t *last_ltrans; /* one per word */
296 int32 cand_sf_alloc;
297 cand_sf_t *cand_sf;
298 bestbp_rc_t *bestbp_rc;
299
300 bptbl_t *bp_table; /* Forward pass lattice */
301 int32 bpidx; /* First free BPTable entry */
302 int32 bp_table_size;
303 int32 *bscore_stack; /* Score stack for all possible right contexts */
304 int32 bss_head; /* First free BScoreStack entry */
305 int32 bscore_stack_size;
306
308 int32 n_frame;
309 int32 *bp_table_idx; /* First BPTable entry for each frame */
310 int32 *word_lat_idx; /* BPTable index for any word in current frame;
311 cleared before each frame */
312
313 /*
314 * Flat lexicon (2nd pass) search stuff.
315 */
318 bitvec_t *expand_word_flag;
319 int32 *expand_word_list;
320 int32 n_expand_words;
321 int32 min_ef_width;
322 int32 max_sf_win;
323 float32 fwdflat_fwdtree_lw_ratio;
324
327 int32 renormalized;
328
329 /*
330 * DAG (3rd pass) search stuff.
331 */
332 float32 bestpath_fwdtree_lw_ratio;
333 float32 ascale;
336 ptmr_t fwdtree_perf;
337 ptmr_t fwdflat_perf;
338 ptmr_t bestpath_perf;
339 int32 n_tot_frame;
340
341 /* A collection of beam widths. */
342 int32 beam;
343 int32 dynamic_beam;
344 int32 pbeam;
345 int32 wbeam;
346 int32 lpbeam;
347 int32 lponlybeam;
348 int32 fwdflatbeam;
349 int32 fwdflatwbeam;
350 int32 fillpen;
351 int32 silpen;
352 int32 wip;
353 int32 nwpen;
354 int32 pip;
355 int32 maxwpf;
356 int32 maxhmmpf;
357};
358typedef struct ngram_search_s ngram_search_t;
359
363ps_search_t *ngram_search_init(const char *name,
364 ngram_model_t *lm,
365 cmd_ln_t *config,
366 acmod_t *acmod,
367 dict_t *dict,
368 dict2pid_t *d2p);
369
374
380int ngram_search_mark_bptable(ngram_search_t *ngs, int frame_idx);
381
385void ngram_search_save_bp(ngram_search_t *ngs, int frame_idx, int32 w,
386 int32 score, int32 path, int32 rc);
387
391void ngram_search_alloc_all_rc(ngram_search_t *ngs, int32 w);
392
396void ngram_search_free_all_rc(ngram_search_t *ngs, int32 w);
397
403int ngram_search_find_exit(ngram_search_t *ngs, int frame_idx, int32 *out_best_score);
404
410char const *ngram_search_bp_hyp(ngram_search_t *ngs, int bpidx);
411
416
421
425int32 ngram_search_exit_score(ngram_search_t *ngs, bptbl_t *pbe, int rcphone);
426
432void ngram_search_set_lm(ngram_model_t *lm);
433
434#endif /* __NGRAM_SEARCH_H__ */
Implementation of HMM base structure.
int32 frame_idx_t
Type for frame index values.
Definition: hmm.h:64
struct ngram_search_stats_s ngram_search_stats_t
Various statistics for profiling.
void ngram_search_set_lm(ngram_model_t *lm)
Sets the global language model.
struct bptbl_seg_s bptbl_seg_t
Segmentation "iterator" for backpointer table results.
void ngram_search_free_all_rc(ngram_search_t *ngs, int32 w)
Allocate last phone channels for all possible right contexts for word w.
Definition: ngram_search.c:647
void ngram_search_alloc_all_rc(ngram_search_t *ngs, int32 w)
Allocate last phone channels for all possible right contexts for word w.
Definition: ngram_search.c:598
int32 ngram_search_exit_score(ngram_search_t *ngs, bptbl_t *pbe, int rcphone)
Get the exit score for a backpointer entry with a given right context.
Definition: ngram_search.c:660
void ngram_compute_seg_scores(ngram_search_t *ngs, float32 lwf)
Compute language and acoustic scores for backpointer table entries.
ps_search_t * ngram_search_init(const char *name, ngram_model_t *lm, cmd_ln_t *config, acmod_t *acmod, dict_t *dict, dict2pid_t *d2p)
Initialize the N-Gram search module.
Definition: ngram_search.c:140
int ngram_search_mark_bptable(ngram_search_t *ngs, int frame_idx)
Record the current frame's index in the backpointer table.
Definition: ngram_search.c:329
ps_lattice_t * ngram_search_lattice(ps_search_t *search)
Construct a word lattice from the current hypothesis.
struct root_chan_s root_chan_t
Lexical tree node data type for the first phone (root) of each dynamic HMM tree structure.
void ngram_search_free(ps_search_t *ngs)
Finalize the N-Gram search module.
Definition: ngram_search.c:289
int ngram_search_find_exit(ngram_search_t *ngs, int frame_idx, int32 *out_best_score)
Find the best word exit for the current frame in the backpointer table.
Definition: ngram_search.c:506
struct bptbl_s bptbl_t
Back pointer table (forward pass lattice; actually a tree)
char const * ngram_search_bp_hyp(ngram_search_t *ngs, int bpidx)
Backtrace from a given backpointer index to obtain a word hypothesis.
Definition: ngram_search.c:550
void ngram_search_save_bp(ngram_search_t *ngs, int frame_idx, int32 w, int32 score, int32 path, int32 rc)
Enter a word in the backpointer table.
Definition: ngram_search.c:383
struct chan_s chan_t
Lexical tree node data type.
Internal implementation of PocketSphinx decoder.
Acoustic model structure.
Definition: acmod.h:148
Back pointer table (forward pass lattice; actually a tree)
Definition: ngram_search.h:109
int32 wid
Word index.
Definition: ngram_search.h:113
int16 last2_phone
next-to-last phone of this word
Definition: ngram_search.h:120
uint8 valid
For absolute pruning.
Definition: ngram_search.h:111
int32 bp
Back Pointer.
Definition: ngram_search.h:114
int32 prev_real_wid
wid of second-last real word
Definition: ngram_search.h:118
int32 real_wid
wid of this or latest predecessor real word
Definition: ngram_search.h:117
int32 score
Score (best among all right contexts)
Definition: ngram_search.h:115
int16 last_phone
last phone of this word
Definition: ngram_search.h:119
int32 s_idx
Start of BScoreStack for various right contexts.
Definition: ngram_search.h:116
uint8 refcnt
Reference count (number of successors)
Definition: ngram_search.h:112
frame_idx_t frame
start or end frame
Definition: ngram_search.h:110
Segmentation "iterator" for backpointer table results.
Definition: ngram_search.h:126
int16 cur
Current position in bpidx.
Definition: ngram_search.h:130
int32 * bpidx
Sequence of backpointer IDs.
Definition: ngram_search.h:128
int16 n_bpidx
Number of backpointer IDs.
Definition: ngram_search.h:129
ps_seg_t base
Base structure.
Definition: ngram_search.h:127
Lexical tree node data type.
Definition: ngram_search.h:64
int32 penult_phn_wid
list of words whose last phone follows this one; this field indicates the first of the list; the rest...
Definition: ngram_search.h:75
struct chan_s * next
first descendant of this channel; or, in the case of the last phone of a word, the next alternative r...
Definition: ngram_search.h:68
int32 ciphone
ciphone for this node
Definition: ngram_search.h:73
struct chan_s * alt
sibling; i.e., next descendant of parent HMM
Definition: ngram_search.h:71
hmm_t hmm
Basic HMM structure.
Definition: ngram_search.h:65
int32 rc_id
right-context id for last phone of words
Definition: ngram_search.h:79
Building composite triphone (as well as word internal triphones) with the dictionary.
Definition: dict2pid.h:84
a structure for a dictionary.
Definition: dict.h:76
Shared information between a set of HMMs.
An individual HMM among the HMM search space.
N-Gram search module structure.
Definition: ngram_search.h:197
int32 n_nonroot_chan
Number of valid non-root channels.
Definition: ngram_search.h:234
int32 * single_phone_wid
list of single-phone word ids
Definition: ngram_search.h:264
int32 best_score
Best Viterbi path score.
Definition: ngram_search.h:325
float32 ascale
Acoustic score scale for posterior probabilities.
Definition: ngram_search.h:333
root_chan_t * rhmm_1ph
Root HMMs for single-phone words.
Definition: ngram_search.h:236
listelem_alloc_t * latnode_alloc
For latnode_t.
Definition: ngram_search.h:213
int32 n_root_chan
Number of valid root_chan.
Definition: ngram_search.h:233
int32 n_frame_alloc
Number of frames allocated in bp_table_idx and friends.
Definition: ngram_search.h:307
int32 max_nonroot_chan
Maximum possible number of non-root channels.
Definition: ngram_search.h:235
int32 ** active_word_list
Array of active multi-phone words for current and next frame.
Definition: ngram_search.h:287
int32 n_frame
Number of frames actually present.
Definition: ngram_search.h:308
ngram_search_stats_t st
Various statistics for profiling.
Definition: ngram_search.h:335
listelem_alloc_t * root_chan_alloc
For root_chan_t.
Definition: ngram_search.h:212
int32 n_active_word[2]
Number entries in active_word_list.
Definition: ngram_search.h:288
ngram_model_t * lmset
Set of language models.
Definition: ngram_search.h:199
int32 * fwdflat_wordlist
List of active word IDs for utterance.
Definition: ngram_search.h:317
chan_t ** word_chan
Channels associated with a given word (only used for right contexts, single-phone words in fwdtree se...
Definition: ngram_search.h:246
int32 last_phone_best_score
Best Viterbi path score for last phone.
Definition: ngram_search.h:326
chan_t *** active_chan_list
Array of active channels for current and next frame.
Definition: ngram_search.h:275
int32 n_1ph_words
Number single phone words in dict (total)
Definition: ngram_search.h:265
int32 n_1ph_LMwords
Number single phone dict words also in LM; these come first in single_phone_wid.
Definition: ngram_search.h:266
ps_latnode_t ** frm_wordlist
List of active words in each frame.
Definition: ngram_search.h:316
int32 * homophone_set
Each node in the HMM tree structure may point to a set of words whose last phone would follow that no...
Definition: ngram_search.h:263
int32 n_root_chan_alloc
Number of root_chan allocated.
Definition: ngram_search.h:232
listelem_alloc_t * chan_alloc
For chan_t.
Definition: ngram_search.h:211
int32 n_active_chan[2]
Number entries in active_chan_list.
Definition: ngram_search.h:276
hmm_context_t * hmmctx
HMM context.
Definition: ngram_search.h:200
root_chan_t * root_chan
Search structure of HMM instances.
Definition: ngram_search.h:231
bitvec_t * word_active
array of active flags for all words.
Definition: ngram_search.h:247
Various statistics for profiling.
Definition: ngram_search.h:180
Word graph structure used in bestpath/nbest search.
Base structure for search module.
Base structure for hypothesis segmentation iterator.
Lexical tree node data type for the first phone (root) of each dynamic HMM tree structure.
Definition: ngram_search.h:90
int16 ci2phone
second ciphone of this node; one root HMM for each unique right context
Definition: ngram_search.h:102
hmm_t hmm
Basic HMM structure.
Definition: ngram_search.h:91
int16 ciphone
first ciphone of this node; all words rooted at this node begin with this ciphone
Definition: ngram_search.h:100
chan_t * next
first descendant of this channel
Definition: ngram_search.h:94
int32 this_phn_wid
list of words consisting of this single phone; actually the first of the list, like penult_phn_wid; -...
Definition: ngram_search.h:97