#!/usr/bin/env python
from theano import tensor
from blocks.bricks import Linear, Tanh, Rectifier
from blocks.bricks.conv import Convolutional, MaxPooling
from blocks.bricks.recurrent import SimpleRecurrent, LSTM
from blocks.initialization import IsotropicGaussian, Constant
from blocks.algorithms import (GradientDescent, Scale, AdaDelta, RemoveNotFinite,
                               RMSProp, BasicMomentum, CompositeRule, Momentum)
from blocks.graph import ComputationGraph
from blocks.model import Model
from blocks.main_loop import MainLoop
from blocks.filter import VariableFilter
from blocks.roles import WEIGHT, BIAS
from blocks.graph import apply_dropout, apply_noise
from blocks.extensions import FinishAfter, Printing, ProgressBar
from blocks.extensions.monitoring import TrainingDataMonitoring, DataStreamMonitoring
from blocks.extras.extensions.plot import Plot
from ctc import CTC
from timit import setup_datastream
from edit_distance import batch_edit_distance
from ext_param_info import ParamInfo
# ==========================================================================================
# THE HYPERPARAMETERS
# ==========================================================================================
# Stop after this many epochs
n_epochs = 10000
# How often (number of batches) to print / plot
monitor_freq = 50
sort_batch_count = 50  # number of batches read together and sorted by length (see timit.setup_datastream)
batch_size = 100
# The convolutional layers. Parameters:
#   nfilter      the number of filters
#   filter_size  the size of the filters (number of timesteps)
#   stride       the stride at which to apply the filter (non-1 strides are not
#                optimized and run very slowly with current Theano)
#   pool_stride  the block size for max pooling
#   normalize    do we normalize the values before applying the activation function?
#   activation   a brick for the activation function
#   dropout      dropout applied after the activation function
#   skip         skip connections to add from the layer's input to its output
#                ('max', 'min', 'subsample'); see the subsampling sketch after the list
convs = [
{'nfilter': 20,
'filter_size': 200,
'stride': 1,
'pool_stride': 10,
'normalize': True,
'activation': Rectifier(name='a0'),
'dropout': 0.0,
'skip': ['min', 'max', 'subsample']},
{'nfilter': 20,
'filter_size': 200,
'stride': 1,
'pool_stride': 10,
'normalize': True,
'activation': Rectifier(name='a1'),
'dropout': 0.0,
'skip': ['max']},
{'nfilter': 20,
'filter_size': 30,
'stride': 1,
'pool_stride': 2,
'normalize': True,
'activation': Rectifier(name='a2'),
'dropout': 0.0,
'skip': ['max']},
{'nfilter': 100,
'filter_size': 20,
'stride': 1,
'pool_stride': 2,
'normalize': True,
'activation': Rectifier(name='a3'),
'dropout': 0.0,
'skip': []},
]
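# A quick sanity check of the temporal subsampling implied by the settings
# above (a sketch, not used by the model; the 16 kHz rate is an assumption
# based on TIMIT and on the raw-waveform filter sizes used here):
def total_subsampling(conv_params):
    """Product of stride * pool_stride over all convolutional layers."""
    factor = 1
    for p in conv_params:
        factor *= p['stride'] * p['pool_stride']
    return factor
# Here total_subsampling(convs) == 10 * 10 * 2 * 2 == 400, i.e. one output
# frame per 400 input samples, or 25 ms of audio at 16 kHz.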
# The recurrent layers. Parameters:
#   type       type of the layer (simple, lstm, blstm)
#   dim        size of the state
#   normalize  do we normalize the values after the RNN?
#   dropout    dropout after the RNN
#   skip       do we concatenate the layer's input to its output (skip connection)?
#              (see the width sketch after the list)
recs = [
{'type': 'blstm',
'dim': 50,
'normalize': False,
'dropout': 0.0,
'skip': True},
{'type': 'blstm',
'dim': 50,
'normalize': False,
'dropout': 0.0,
'skip': True},
]
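# Feature widths through the recurrent stack (a sketch mirroring the loop in
# THE MODEL section below): a blstm layer outputs 2*dim features (forward and
# backward states concatenated), and 'skip': True concatenates the layer's
# input as well. With d convolutional channels feeding two blstm layers of
# dim 50, the widths grow as d -> d + 100 -> d + 200.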
# do we normalize the activations just before the softmax layer?
normalize_out = True
# regularization: noise on the weights
weight_noise = 0.01
# regularization: L2 penalties
l2_output_bias = 0.0
l2_output_weight = 0.0
l2_all_bias = 0.0
l2_all_weight = 0.0
# number of phonemes in TIMIT, a constant
num_output_classes = 61
# the step rule (uncomment your favorite choice)
step_rule = CompositeRule([AdaDelta(), RemoveNotFinite()])
#step_rule = CompositeRule([Momentum(learning_rate=0.00001, momentum=0.99), RemoveNotFinite()])
#step_rule = CompositeRule([Momentum(learning_rate=0.001, momentum=0.9), RemoveNotFinite()])
#step_rule = CompositeRule([AdaDelta(), Scale(0.01), RemoveNotFinite()])
#step_rule = CompositeRule([RMSProp(learning_rate=0.1, decay_rate=0.95),
# RemoveNotFinite()])
#step_rule = CompositeRule([RMSProp(learning_rate=0.0001, decay_rate=0.95),
# BasicMomentum(momentum=0.9),
# RemoveNotFinite()])
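# CompositeRule chains the listed step rules; RemoveNotFinite is kept last in
# every variant so that a batch producing NaN or inf gradients cannot wreck
# the parameters.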
# How the weights are initialized
weights_init = IsotropicGaussian(0.01)
biases_init = Constant(0.001)
# ==========================================================================================
# THE MODEL
# ==========================================================================================
print('Building model ...')
# THEANO INPUT VARIABLES
inputt = tensor.matrix('input')  # 'inputt' avoids shadowing the builtin input()
input_mask = tensor.matrix('input_mask')
y = tensor.lmatrix('output').T
y_mask = tensor.matrix('output_mask').T
y_len = y_mask.sum(axis=0)
L = y.shape[0]
B = y.shape[1]
# inputt : B x T
# input_mask : B x T
# y : L x B
# y_mask : L x B
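# For instance (a hypothetical batch, assuming raw 16 kHz waveform input as
# the 200-sample filters suggest): batch_size = 100 utterances of up to 3 s
# give inputt of shape 100 x 48000, with y holding the padded phoneme labels.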
# NORMALIZE THE INPUTS
# global scaling by the mean squared amplitude of the whole batch (note this
# divides by the mean square, not its square root, so it rescales the inputs
# rather than giving them unit variance)
inputt = inputt / (inputt**2).mean()
dropout_locs = []
# CONVOLUTION LAYERS
conv_in = inputt[:, None, :, None]
conv_in_channels = 1
conv_in_mask = input_mask
cb = []
for i, p in enumerate(convs):
# Convolution bricks
conv = Convolutional(filter_size=(p['filter_size'],1),
                         # step=(p['stride'],1),
                         # (non-1 strides run very slowly in current Theano,
                         # so the stride is applied by slicing conv_out below)
num_filters=p['nfilter'],
num_channels=conv_in_channels,
batch_size=batch_size,
border_mode='valid',
tied_biases=True,
name='conv%d'%i)
cb.append(conv)
maxpool = MaxPooling(pooling_size=(p['pool_stride'], 1), name='mp%d'%i)
conv_out = conv.apply(conv_in)[:, :, ::p['stride'], :]
conv_out = maxpool.apply(conv_out)
    if p['normalize']:
        conv_out_mean = conv_out.mean(axis=2).mean(axis=0)
        # tensor.sqrt: TensorVariable has no .sqrt() method
        conv_out_std = tensor.sqrt(((conv_out - conv_out_mean[None, :, None, :])**2).mean(axis=2).mean(axis=0))
        conv_out = (conv_out - conv_out_mean[None, :, None, :]) / conv_out_std[None, :, None, :]
if p['activation'] is not None:
conv_out = p['activation'].apply(conv_out)
if p['dropout'] > 0:
b = [p['activation'] if p['activation'] is not None else conv]
dropout_locs.append((VariableFilter(bricks=b, name='output'), p['dropout']))
if p['skip'] is not None and len(p['skip'])>0:
maxpooladd = MaxPooling(pooling_size=(p['stride']*p['pool_stride'], 1), name='Mp%d'%i)
skip = []
if 'max' in p['skip']:
skip.append(maxpooladd.apply(conv_in)[:, :, :conv_out.shape[2], :])
        if 'min' in p['skip']:
            # max-pooling -x yields -min(x); the flipped sign is harmless as
            # the next layer can learn to negate it
            skip.append(maxpooladd.apply(-conv_in)[:, :, :conv_out.shape[2], :])
if 'subsample' in p['skip']:
skip.append(conv_in[:, :, ::(p['stride']*p['pool_stride']), :][:, :, :conv_out.shape[2], :])
conv_out = tensor.concatenate([conv_out] + skip, axis=1)
conv_out_channels = p['nfilter'] + len(p['skip']) * conv_in_channels
else:
conv_out_channels = p['nfilter']
conv_out_mask = conv_in_mask[:, ::(p['stride']*p['pool_stride'])][:, :conv_out.shape[2]]
conv_in = conv_out
conv_in_channels = conv_out_channels
conv_in_mask = conv_out_mask
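# At this point conv_out is B x channels x T' x 1 (T' roughly T / 400 with
# the settings above) and conv_out_mask is B x T'; the dimshuffles below
# rearrange this into the time x batch x features layout that Blocks
# recurrent bricks expect.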
# RECURRENT LAYERS
rec_mask = conv_out_mask.dimshuffle(1, 0)
rec_in = conv_out[:, :, :, 0].dimshuffle(2, 0, 1)
rec_in_dim = conv_out_channels
rb = []
for i, p in enumerate(recs):
# RNN bricks
if p['type'] == 'lstm':
pre_rec = Linear(input_dim=rec_in_dim, output_dim=4*p['dim'], name='rnn_linear%d'%i)
rec = LSTM(activation=Tanh(), dim=p['dim'], name="rnn%d"%i)
rb = rb + [pre_rec, rec]
rnn_in = pre_rec.apply(rec_in)
rec_out, _ = rec.apply(inputs=rnn_in, mask=rec_mask)
dropout_b = [rec]
rec_out_dim = p['dim']
elif p['type'] == 'simple':
pre_rec = Linear(input_dim=rec_in_dim, output_dim=p['dim'], name='rnn_linear%d'%i)
rec = SimpleRecurrent(activation=Tanh(), dim=p['dim'], name="rnn%d"%i)
rb = rb + [pre_rec, rec]
rnn_in = pre_rec.apply(rec_in)
rec_out = rec.apply(inputs=rnn_in, mask=rec_mask)
dropout_b = [rec]
rec_out_dim = p['dim']
elif p['type'] == 'blstm':
pre_frec = Linear(input_dim=rec_in_dim, output_dim=4*p['dim'], name='frnn_linear%d'%i)
pre_brec = Linear(input_dim=rec_in_dim, output_dim=4*p['dim'], name='brnn_linear%d'%i)
frec = LSTM(activation=Tanh(), dim=p['dim'], name="frnn%d"%i)
brec = LSTM(activation=Tanh(), dim=p['dim'], name="brnn%d"%i)
rb = rb + [pre_frec, pre_brec, frec, brec]
frnn_in = pre_frec.apply(rec_in)
frnn_out, _ = frec.apply(inputs=frnn_in, mask=rec_mask)
        brnn_in = pre_brec.apply(rec_in)
        # the backward net must scan the sequence in reverse (and have its
        # output flipped back) to actually see future context; this mirrors
        # what blocks.bricks.recurrent.Bidirectional does
        brnn_out, _ = brec.apply(inputs=brnn_in, mask=rec_mask, reverse=True)
        brnn_out = brnn_out[::-1]
rec_out = tensor.concatenate([frnn_out, brnn_out], axis=2)
dropout_b = [frec, brec]
rec_out_dim = 2*p['dim']
    else:
        raise ValueError("unknown recurrent layer type '%s'" % p['type'])
    if p['normalize']:
        rec_out_mean = rec_out.mean(axis=1).mean(axis=0)
        # tensor.sqrt: TensorVariable has no .sqrt() method
        rec_out_std = tensor.sqrt(((rec_out - rec_out_mean[None, None, :])**2).mean(axis=1).mean(axis=0))
        rec_out = (rec_out - rec_out_mean[None, None, :]) / rec_out_std[None, None, :]
if p['dropout'] > 0:
dropout_locs.append((VariableFilter(bricks=dropout_b, name='output'), p['dropout']))
if p['skip']:
rec_out = tensor.concatenate([rec_in, rec_out], axis=2)
rec_out_dim = rec_in_dim + rec_out_dim
rec_in = rec_out
rec_in_dim = rec_out_dim
# LINEAR FOR THE OUTPUT
rec_to_o = Linear(name='rec_to_o',
input_dim=rec_out_dim,
output_dim=num_output_classes + 1)
y_hat_pre = rec_to_o.apply(rec_out)
# y_hat_pre : T x B x C+1
if normalize_out:
    y_hat_pre_mean = y_hat_pre.mean(axis=1).mean(axis=0)
    y_hat_pre_std = tensor.sqrt(((y_hat_pre - y_hat_pre_mean[None, None, :])**2).mean(axis=1).mean(axis=0))
    y_hat_pre = (y_hat_pre - y_hat_pre_mean[None, None, :]) / y_hat_pre_std[None, None, :]
# y_hat : T x B x C+1
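# tensor.nnet.softmax only normalizes the rows of a 2-D matrix, hence the
# flatten to (T*B) x (C+1), row-wise softmax, and reshape back.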
y_hat = tensor.nnet.softmax(
y_hat_pre.reshape((-1, num_output_classes + 1))
).reshape((y_hat_pre.shape[0], y_hat_pre.shape[1], -1))
y_hat.name = 'y_hat'
y_hat_mask = rec_mask
# CTC COST AND ERROR MEASURE
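# CTC sums over all frame-wise paths that collapse to the target (repeated
# labels merged, then blanks removed): e.g. a best path 'a a - a b b' decodes
# to 'a a b'. The extra (C+1)-th output class is presumably the blank; the
# exact convention is defined in the imported ctc module.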
cost = CTC().apply_log_domain(y, y_hat, y_len, y_hat_mask).mean()
cost.name = 'CTC'
dl, dl_length = CTC().best_path_decoding(y_hat, y_hat_mask)
dl = dl[:L, :]
dl_length = tensor.minimum(dl_length, L)
edit_distances = batch_edit_distance(dl.T.astype('int32'), dl_length.astype('int32'),
y.T.astype('int32'), y_len.astype('int32'))
edit_distance = edit_distances.mean()
edit_distance.name = 'edit_distance'
errors_per_char = (edit_distances / y_len).mean()
errors_per_char.name = 'errors_per_char'
is_error = tensor.neq(dl, y) * tensor.lt(tensor.arange(L)[:,None], y_len[None,:])
is_error = tensor.switch(is_error.sum(axis=0), tensor.ones((B,)), tensor.neq(y_len, dl_length))
error_rate = is_error.mean()
error_rate.name = 'error_rate'
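# Three complementary measures: edit_distance is the mean (Levenshtein) edit
# distance per utterance, errors_per_char divides it by the reference length,
# and error_rate is the fraction of utterances not decoded exactly.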
# REGULARIZATION
cg = ComputationGraph([cost, error_rate])
if weight_noise > 0:
noise_vars = VariableFilter(roles=[WEIGHT])(cg)
cg = apply_noise(cg, noise_vars, weight_noise)
for vfilter, p in dropout_locs:
cg = apply_dropout(cg, vfilter(cg), p)
[cost_reg, error_rate_reg] = cg.outputs
# adding a negligible constant creates a distinct variable that can carry its
# own name for monitoring, without changing the value of the cost
ctc_reg = cost_reg + 1e-24
ctc_reg.name = 'CTC'
if l2_output_bias > 0:
cost_reg += l2_output_bias * sum(x.norm(2) for x in VariableFilter(roles=[BIAS], bricks=[rec_to_o])(cg))
if l2_output_weight > 0:
cost_reg += l2_output_weight * sum(x.norm(2) for x in VariableFilter(roles=[WEIGHT], bricks=[rec_to_o])(cg))
if l2_all_bias > 0:
cost_reg += l2_all_bias * sum(x.norm(2) for x in VariableFilter(roles=[BIAS])(cg))
if l2_all_weight > 0:
cost_reg += l2_all_weight * sum(x.norm(2) for x in VariableFilter(roles=[WEIGHT])(cg))
cost_reg.name = 'cost'
# INITIALIZATION
for brick in [rec_to_o] + cb + rb:
brick.weights_init = weights_init
brick.biases_init = biases_init
brick.initialize()
# ==========================================================================================
# THE INFRASTRUCTURE
# ==========================================================================================
# SET UP THE DATASTREAM
print('Building DataStream ...')
ds, stream = setup_datastream('/home/lx.nobackup/datasets/timit/readable',
batch_size=batch_size,
sort_batch_count=sort_batch_count)
valid_ds, valid_stream = setup_datastream('/home/lx.nobackup/datasets/timit/readable',
batch_size=batch_size,
sort_batch_count=sort_batch_count,
valid=True)
# SET UP THE BLOCKS ALGORITHM WITH EXTENSIONS
print('Building training process...')
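# The gradients are those of cost_reg (the graph with noise and dropout
# applied), but the parameter list may be taken from the clean cost graph:
# both graphs use the same Theano shared variables.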
algorithm = GradientDescent(cost=cost_reg,
parameters=ComputationGraph(cost).parameters,
step_rule=step_rule)
monitor_cost = TrainingDataMonitoring([ctc_reg, cost_reg, error_rate_reg],
prefix="train",
every_n_batches=monitor_freq,
after_epoch=False)
monitor_valid = DataStreamMonitoring([cost, error_rate, edit_distance, errors_per_char],
data_stream=valid_stream,
prefix="valid",
after_epoch=True)
plot = Plot(document='CTC_timit_%s%s%s%s_%s'%
(repr([p['nfilter'] for p in convs]),
repr([p['filter_size'] for p in convs]),
repr([p['stride'] for p in convs]),
repr([p['pool_stride'] for p in convs]),
repr([p['dim'] for p in recs])),
channels=[['train_cost', 'train_CTC', 'valid_CTC'],
['train_error_rate', 'valid_error_rate'],
['valid_edit_distance'],
['valid_errors_per_char']],
every_n_batches=monitor_freq,
after_epoch=True)
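# Note: Plot comes from blocks-extras and pushes these channels to a Bokeh
# plotting server; it expects one to be running (and may fail at startup
# otherwise).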
model = Model(cost)
main_loop = MainLoop(data_stream=stream, algorithm=algorithm,
extensions=[
ProgressBar(),
monitor_cost, monitor_valid,
plot,
Printing(every_n_batches=monitor_freq, after_epoch=True),
        ParamInfo(model, every_n_batches=monitor_freq),
FinishAfter(after_n_epochs=n_epochs),
],
model=model)
# NOW WE FINALLY CAN TRAIN OUR MODEL
print('Starting training ...')
main_loop.run()
# vim: set sts=4 ts=4 sw=4 tw=0 et: