aboutsummaryrefslogtreecommitdiff
path: root/prez/prez.tex
blob: 3befdd0dad464e85efc97cfb6e3c2f316946e35c (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Jacobs Portrait Poster
% LaTeX Template
% Version 1.0 (31/08/2015)
% (Based on Version 1.0 (29/03/13) of the landscape template)
%
% Created by:
% Computational Physics and Biophysics Group, Jacobs University
% https://teamwork.jacobs-university.de:8443/confluence/display/CoPandBiG/LaTeX+Poster
% 
% Further modified by:
% Nathaniel Johnston (nathaniel@njohnston.ca)
%
% Portrait version by:
% John Hammersley
%
% The landscape version of this template was downloaded from:
% http://www.LaTeXTemplates.com
%
% License:
% CC BY-NC-SA 3.0 (http://creativecommons.org/licenses/by-nc-sa/3.0/)
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%----------------------------------------------------------------------------------------
%	PACKAGES AND OTHER DOCUMENT CONFIGURATIONS
%----------------------------------------------------------------------------------------

\documentclass[final]{beamer}

\usepackage[scale=1.24]{beamerposter} % Use the beamerposter package for laying out the poster
\usepackage[utf8]{inputenc}

\usepackage{relsize}

\usetheme{confposter} % Use the confposter theme supplied with this template

\setbeamercolor{block title}{fg=ngreen,bg=white} % Colors of the block titles
\setbeamercolor{block body}{fg=black,bg=white} % Colors of the body of blocks
\setbeamercolor{block alerted title}{fg=white,bg=dblue!70} % Colors of the highlighted block titles
\setbeamercolor{block alerted body}{fg=black,bg=dblue!10} % Colors of the body of highlighted blocks
% Many more colors are available for use in beamerthemeconfposter.sty

%-----------------------------------------------------------
% Define the column widths and overall poster size
% To set effective sepwid, onecolwid and twocolwid values, first choose how many columns you want and how much separation you want between columns
% In this template, the separation width chosen is 0.024 of the paper width and a 4-column layout
% onecolwid should therefore be (1-(# of columns+1)*sepwid)/# of columns e.g. (1-(4+1)*0.024)/4 = 0.22
% Set twocolwid to be (2*onecolwid)+sepwid = 0.464
% Set threecolwid to be (3*onecolwid)+2*sepwid = 0.708

% Length registers for the column layout. All widths below are expressed as
% fractions of \paperwidth, following the formulas in the comment block above.
\newlength{\sepwid}
\newlength{\onecolwid}
\newlength{\twocolwid}
\newlength{\threecolwid}
% Portrait poster of 36in x 48in. For reference, A0 in portrait orientation is
% 33.1in wide and 46.8in high.
\setlength{\paperwidth}{36in} % A0 portrait width: 33.1in
\setlength{\paperheight}{48in} % A0 portrait height: 46.8in
\setlength{\sepwid}{0.024\paperwidth} % Separation width (white space) between columns
\setlength{\onecolwid}{0.22\paperwidth} % Width of one column
\setlength{\twocolwid}{0.464\paperwidth} % Width of two columns: 2*0.22 + 0.024
\setlength{\threecolwid}{0.708\paperwidth} % Width of three columns: 3*0.22 + 2*0.024 (not referenced elsewhere in this file)
\setlength{\topmargin}{-0.5in} % Reduce the top margin size
%-----------------------------------------------------------

\usepackage{graphicx}  % Required for including images

\usepackage{booktabs} % Top and bottom rules for tables


\usepackage{lmodern} 
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{graphicx}

\usepackage[T1]{fontenc}

% Redeclare the OMX/cmex (math extension) font shape with explicit size ranges:
% requests below 7.5pt use cmex7, 7.5-8.5pt use cmex8, 8.5-9.5pt use cmex9, and
% everything from 9.5pt upwards loads cmex10 scaled to the requested size.
% NOTE(review): presumably this is here so that large operators and delimiters
% scale at poster font sizes instead of being substituted - confirm.
\DeclareFontShape{OMX}{cmex}{m}{n}{
  <-7.5> cmex7
  <7.5-8.5> cmex8
  <8.5-9.5> cmex9
  <9.5-> cmex10
}{}

% Use the shape declared above for the "largesymbols" symbol font in both the
% normal and the bold math versions.
\SetSymbolFont{largesymbols}{normal}{OMX}{cmex}{m}{n}
\SetSymbolFont{largesymbols}{bold}  {OMX}{cmex}{m}{n}

%----------------------------------------------------------------------------------------
%	TITLE SECTION 
%----------------------------------------------------------------------------------------

\title{Connectionist Temporal Classification: \\Labelling Unsegmented Sequences with \\Recurrent Neural Networks} % Poster title

\author{Thomas Mesnard, Alex Auvolat} % Author(s)

\institute{Probabilistic Graphical Models Project, MVA Master} % Institution(s)

%----------------------------------------------------------------------------------------

\begin{document}

\addtobeamertemplate{block end}{}{\vspace*{2ex}} % White space under blocks
\addtobeamertemplate{block alerted end}{}{\vspace*{2ex}} % White space under highlighted (alert) blocks

\setlength{\belowcaptionskip}{2ex} % White space under figures
\setlength\belowdisplayshortskip{2ex} % White space under equations

\begin{frame}[t] % The whole poster is enclosed in one beamer frame

\begin{columns}[t] % The poster consists of two major double-width columns, each preceded by a spacer column and each split into two sub-columns twice - the [t] option aligns each column's content to the top

\begin{column}{\sepwid}\end{column} % Empty spacer column

\begin{column}{\twocolwid} % Begin the first major column, which is two columns wide

\begin{columns}[t,totalwidth=\twocolwid] % Split up the two columns wide column

\begin{column}{\onecolwid}\vspace{-.6in} % The first column within column 2 (column 2.1)

%----------------------------------------------------------------------------------------
%	MATERIALS
%----------------------------------------------------------------------------------------


\begin{block}{Abstract}

Many real-world sequence learning tasks require the prediction of sequences of
labels from noisy, unsegmented input data. Recurrent
neural networks (RNNs) are powerful sequence learners that would seem well
suited to such tasks. However, because they require pre-segmented training
data, and post-processing to transform their outputs into label sequences,
they cannot be applied directly. CTC is a method
for training RNNs to label unsegmented sequences directly, thereby solving both
problems.

\end{block}


%----------------------------------------------------------------------------------------

\end{column} % End of column 2.1

\begin{column}{\onecolwid}\vspace{-.6in} % The second column within column 2 (column 2.2)

%----------------------------------------------------------------------------------------
%	METHODS
%----------------------------------------------------------------------------------------

\begin{block}{Main Idea}

RNNs are powerful learners for sequences, but:

\begin{itemize}
\item Standard methods need pre-segmented training data
\item Need for complex post-processing
\end{itemize}

CTC solves this problem:

\begin{itemize}
\item Able to train RNNs using unsegmented training data
\item Learns the segmentation automatically
\item Provides directly usable output
\end{itemize}

This method is now widely used, even by Google!

\end{block}


%----------------------------------------------------------------------------------------

\end{column} % End of column 2.2

\end{columns} % End of the split of column 2 - any content after this will now take up 2 columns width

%----------------------------------------------------------------------------------------
%	IMPORTANT RESULT
%----------------------------------------------------------------------------------------

\begin{alertblock}{The problem and how CTC solves it}

\begin{figure}
\includegraphics[width=0.9\linewidth]{azerty3.png}
\caption{\small Output of classic framewise phoneme classification and RNN trained with CTC}
\end{figure}

\end{alertblock} 

%----------------------------------------------------------------------------------------


\begin{block}{Model}
\begin{columns}[t,totalwidth=\twocolwid] % Split up the two columns wide column again


\begin{column}{\onecolwid} % The first column within column 2 (column 2.1)

\begin{itemize}
\item Cost function for RNNs
\item RNN outputs probabilities for the different symbols, plus blank symbol
\item Many possible alignments for the correct label (shorter than input)
\item Dynamic programming: sums all the possible alignments
\item Provides gradients for the RNN to learn a good alignment
\end{itemize}

\vspace{1em}
\begin{figure}
\includegraphics[width=0.8\linewidth]{azerty4.png}
\caption{Simple bidirectional RNN model with CTC cost layer}
\end{figure}

%----------------------------------------------------------------------------------------

\end{column} % End of column 2.1

\begin{column}{\onecolwid} % The second column within column 2 (column 2.2)


CTC is a dynamic programming algorithm that calculates the following sum:
\[
\alpha_t(s) = \sum_{\substack{\pi \in N^T :\\\mathcal{B}(\pi_{1:t}) = l_{1:s}}} 
	\prod_{t'=1}^t y_{\pi_{t'}}^{t'}
\]

Here, $\mathcal{B}$ is the transform that merges repeated symbols and then removes blanks.

\begin{figure}
\includegraphics[width=\linewidth]{azerty1.png}
\caption{Computation graph for $\alpha_t(s)$ (corresponds to an unrolled automaton)}
\end{figure}

Tools used for our implementation:
\begin{itemize}
\item Theano (GPU computation library)
\item Blocks (deep learning framework)
\end{itemize}

%----------------------------------------------------------------------------------------

\end{column} % End of column 2.2

\end{columns} % End of the split of column 2

\end{block}

\end{column} % End of the second column



\begin{column}{\sepwid}\end{column} % Empty spacer column



%=================================================================

\begin{column}{\twocolwid} % Begin a column which is two columns wide (column 2)

\begin{block}{Recurrence equations}

\begin{columns}[t,totalwidth=\twocolwid] % Split up the two columns wide column
\begin{column}{\onecolwid} % The first column

We define the following notation:

$y_k^t$: output at time $t$ for symbol $k$

$l$: label, $l'$: label with blanks

Initialization:
% Base case of the forward variables at t = 1, typeset as an aligned equation
% system (amsmath) so the relations get math spacing instead of table padding.
\[
\begin{aligned}
\alpha_1(1) &= y_b^1\\
\alpha_1(2) &= y_{l_1}^1\\
\alpha_1(s) &= 0, \forall s > 2
\end{aligned}
\]

Recurrence relation:
% Forward recursion. The first case (no contribution from s-2) applies when
% l'_s is a blank or repeats l'_{s-2}; otherwise the skip transition from s-2
% is added. The second case is broken over two rows to fit the column width.
\[
\alpha_t(s) =
\begin{cases}
	\bar{\alpha}_t(s) y_{l'_s}^t \text{ \; if } l'_s = b \text{ or } l'_{s-2}=l'_s \\
	(\bar{\alpha}_t(s)+\alpha_{t-1}(s-2)) y_{l'_s}^t \\
		\hspace{3em} \text{ otherwise}
\end{cases}
\]
\[
\bar{\alpha}_t(s) = \alpha_{t-1}(s) + \alpha_{t-1}(s-1)
\]

Finally, we have:
\[
p(l|x) = \alpha_T(|l'|) + \alpha_T(|l'|-1)
\]

\end{column}
\begin{column}{\sepwid}\end{column} % Empty spacer column
\begin{column}{\onecolwid} % The third column

\begin{figure}
\includegraphics[width=\linewidth]{azerty2.png}
\caption{Evolution of the CTC error signal}
\end{figure}

To avoid numerical underflow, at each step $t$:
\[
C_t = \sum_s \alpha_t(s)
\hspace{1em}
\hat{\alpha}_t(s) = \frac{\alpha_t(s)}{C_t}
\]

Other solution: do calculations in the logarithmic domain. 


\end{column} % End of column 2.2

\end{columns} % End of the split of column 2


\end{block}

\begin{columns}[t,totalwidth=\twocolwid] % Split up the two columns wide column
\begin{column}{\onecolwid} % The first column

\begin{block}{Toy dataset}

We first tried our implementation on a simple task:

{\centering
$1^*2^*3^*4^*5^* \to 1$  \\
$1^*2^*3^*2^*1^* \to 2$  \\
$5^*4^*3^*2^*1^* \to 3$  \\
$5^*4^*3^*4^*5^* \to 4$  \\
}

\begin{itemize}
\item An RNN can easily solve this
\item It needs to read the full sequence before predicting a label
\item CTC provides satisfactory results
\end{itemize}

\begin{table}
\vspace{2ex}
\begin{tabular}{l l l}
\toprule
\textbf{Results} & \textbf{train} & \textbf{valid}\\ 
\midrule
Sequence length & 5 -- 20 & 5 -- 20 \\
Error rate & 0.62 & 0.63 \\
Mean edit distance & 1.0 & 1.1 \\ 
Errors per character & 0.08 & 0.09 \\
\bottomrule
\end{tabular}
\caption{Performance of CTC on our toy dataset}
\end{table}

\begin{figure}
\includegraphics[width=\linewidth]{ctc_cost_best.png}
\caption{Training and validation cost of the CTC model (negative log likelihood)}
\end{figure}
\end{block}

\begin{block}{Conclusion}
CTC is a very powerful model with a nice mathematical formulation. It is also widely used in practice (most successful applications: speech recognition, handwriting recognition).
\end{block}

%------------------------------------------------


%----------------------------------------------------------------------------------------

\end{column} % End of the first column

\begin{column}{\sepwid}\end{column} % Empty spacer column


\begin{column}{\onecolwid} % The third column

\begin{block}{TIMIT}

We then tried our model on the classic TIMIT dataset:

\begin{itemize}
\item Raw speech signal dataset
\item Labelled by phonemes or by words
\item 4120 sentences
\item Average audio length: 50000 samples
\item Avg. sentence length: 38 phonemes
\end{itemize}

Model:

\begin{itemize}
\item Convolution layers on raw signal
\item Bidirectional LSTM layers
\item Dropout and noise for regularization
\item CTC cost function
\end{itemize}

This model avoids hand-crafted feature extraction on the speech signal. However, such models are extremely difficult to train, and our model has not converged yet.

\end{block}



%\setbeamercolor{block alerted title}{fg=black,bg=norange} % Change the alert block title colors
%\setbeamercolor{block alerted body}{fg=black,bg=white} % Change the alert block body colors

\setbeamercolor{block title}{fg=red,bg=white} % Change the block title color
\begin{block}{Contact Information}

\begin{itemize}
\item Web: \url{http://github.com/thomasmesnard/CTC-LSTM}
\item Email: \url{thomas.mesnard@ens.fr}
			 \url{alex.auvolat@ens.fr}
\end{itemize}

\end{block}

\setbeamercolor{block title}{fg=red,bg=white} % Change the block title color
\begin{block}{References}

\nocite{*} % Insert publications even if they are not cited in the poster
% \small is a font-size declaration and takes no argument, so it is scoped
% with an enclosing group rather than written as \small{...}.
{\small
\bibliographystyle{unsrt}
\bibliography{sample}\vspace{0.75in}}


\end{block}

% LOGOS
%\begin{center}
%\begin{tabular}{ccc}
%\includegraphics[width=0.4\linewidth]{logo.png} & \hfill & \includegraphics[width=0.4\linewidth]{logo.png}
%\end{tabular}
%\end{center}

%----------------------------------------------------------------------------------------

\end{column} % End of the inner right-hand column (TIMIT, contact information, references)

\end{columns} % End of the inner split of the second double-width column

\end{column} % End of the second double-width column

\end{columns} % End of all the columns in the poster

\end{frame} % End of the enclosing frame

\end{document}