dlbook_cn.bib

@misc{Abadi-et-al-2015,
	author={Abadi, M. and Agarwal, A. and Barham, P. and Brevdo, E. and Chen, Z. and Citro, C. and Corrado, G. S. and Davis, A. and Dean, J. and Devin, M. and Ghemawat, S. and Goodfellow, I. and Harp, A. and Irving, G. and Isard, M. and Jia, Y. and Jozefowicz, R. and Kaiser, L. and Kudlur, M. and Levenberg, J. and Mané, D. and Monga, R. and Moore, S. and Murray, D. and Olah, C. and Schuster, M. and Shlens, J. and Steiner, B. and Sutskever, I. and Talwar, K. and Tucker, P. and Vanhoucke, V. and Vasudevan, V. and Viégas, F. and Vinyals, O. and Warden, P. and Wattenberg, M. and Wicke, M. and Yu, Y. and Zheng, X.},
	title={{TensorFlow}: Large-scale machine learning on heterogeneous systems},
	note={Software available from tensorflow.org},
	url={https://www.tensorflow.org/},
	year={2015}
}
@article{Ackley-et-al-1985,
	author={Ackley, D. H. and Hinton, G. E. and Sejnowski, T. J.},
	title={A learning algorithm for {Boltzmann} machines},
	journal={Cognitive Science},
	volume={9},
	pages={147--169},
	year={1985}
}
@inproceedings{Alain-Bengio-2013,
	author={Alain, G. and Bengio, Y.},
	title={What regularized auto-encoders learn from the data generating distribution},
	booktitle={ICLR'2013},
	eprint={1211.4246},
	archiveprefix={arXiv},
	year={2013}
}
@article{Alain-et-al-2015,
	author={Alain, G. and Bengio, Y. and Yao, L. and Thibodeau-Laufer, É. and Yosinski, J. and Vincent, P.},
	title={{GSNs}: Generative stochastic networks},
	journal={arXiv:1503.05571},
	eprint={1503.05571},
	archiveprefix={arXiv},
	year={2015}
}
@article{Anderson-1935,
	author={Anderson, E.},
	title={The Irises of the {Gaspé Peninsula}},
	journal={Bulletin of the American Iris Society},
	volume={59},
	pages={2--5},
	year={1935}
}
@article{Ba-et-al-2014,
	author={Ba, J. and Mnih, V. and Kavukcuoglu, K.},
	title={Multiple object recognition with visual attention},
	journal={arXiv:1412.7755},
	eprint={1412.7755},
	archiveprefix={arXiv},
	year={2014}
}
@inproceedings{Bachman-Precup-2015,
	author={Bachman, P. and Precup, D.},
	title={Variational generative stochastic networks with collaborative shaping},
	booktitle={Proceedings of the 32nd International Conference on Machine Learning, ICML 2015, Lille, France, 6-11 July 2015},
	pages={1964--1972},
	year={2015}
}
@inproceedings{Bacon-et-al-2015,
	author={Bacon, P.-L. and Bengio, E. and Pineau, J. and Precup, D.},
	title={Conditional computation in neural networks using a decision-theoretic approach},
	booktitle={2nd Multidisciplinary Conference on Reinforcement Learning and Decision Making (RLDM 2015)},
	year={2015}
}
@inproceedings{Bagnell-Bradley-2009,
	author={Bagnell, J. A. and Bradley, D. M.},
	title={Differentiable sparse coding},
	editor={Koller, D. and Schuurmans, D. and Bengio, Y. and Bottou, L.},
	booktitle={Advances in Neural Information Processing Systems 21 (NIPS'08)},
	pages={113--120},
	year={2009}
}
@inproceedings{Bahdanau-et-al-2015,
	author={Bahdanau, D. and Cho, K. and Bengio, Y.},
	title={Neural machine translation by jointly learning to align and translate},
	booktitle={ICLR'2015},
	eprint={1409.0473},
	archiveprefix={arXiv},
	year={2015}
}
@article{Bahl-et-al-1987,
	author={Bahl, L. R. and Brown, P. and de Souza, P. V. and Mercer, R. L.},
	title={Speech recognition with continuous-parameter hidden {Markov} models},
	journal={Computer Speech and Language},
	volume={2},
	pages={219--234},
	year={1987}
}
@article{Baldi-Hornik-1989,
	author={Baldi, P. and Hornik, K.},
	title={Neural networks and principal component analysis: Learning from examples without local minima},
	journal={Neural Networks},
	volume={2},
	pages={53--58},
	year={1989}
}
@article{Baldi-et-al-1999,
	author={Baldi, P. and Brunak, S. and Frasconi, P. and Soda, G. and Pollastri, G.},
	title={Exploiting the past and the future in protein secondary structure prediction},
	journal={Bioinformatics},
	volume={15},
	number={11},
	pages={937--946},
	year={1999}
}
@article{Baldi-et-al-2014,
	author={Baldi, P. and Sadowski, P. and Whiteson, D.},
	title={Searching for exotic particles in high-energy physics with deep learning},
	journal={Nature Communications},
	volume={5},
	year={2014}
}
@article{Ballard-et-al-1983,
	author={Ballard, D. H. and Hinton, G. E. and Sejnowski, T. J.},
	title={Parallel vision computation},
	journal={Nature},
	year={1983}
}
@article{Barlow-1989,
	author={Barlow, H. B.},
	title={Unsupervised learning},
	journal={Neural Computation},
	volume={1},
	pages={295--311},
	year={1989}
}
@article{Barron-1993,
	author={Barron, A. R.},
	title={Universal approximation bounds for superpositions of a sigmoidal function},
	journal={IEEE Transactions on Information Theory},
	volume={39},
	pages={930--945},
	year={1993}
}
@book{Bartholomew-1987,
	author={Bartholomew, D. J.},
	title={Latent variable models and factor analysis},
	publisher={Oxford University Press},
	year={1987}
}
@book{Basilevsky-1994,
	author={Basilevsky, A.},
	title={Statistical Factor Analysis and Related Methods: Theory and Applications},
	publisher={Wiley},
	year={1994}
}
@inproceedings{Bastien-et-al-2012,
	author={Bastien, F. and Lamblin, P. and Pascanu, R. and Bergstra, J. and Goodfellow, I. J. and Bergeron, A. and Bouchard, N. and Bengio, Y.},
	title={{Theano}: new features and speed improvements},
	booktitle={Deep Learning and Unsupervised Feature Learning NIPS 2012 Workshop},
	year={2012}
}
@inproceedings{Basu-Christensen-2013,
	author={Basu, S. and Christensen, J.},
	title={Teaching classification boundaries to humans},
	booktitle={AAAI'2013},
	year={2013}
}
@inproceedings{Baxter-1995,
	author={Baxter, J.},
	title={Learning internal representations},
	booktitle={Proceedings of the 8th International Conference on Computational Learning Theory (COLT'95), Santa Cruz, California},
	pages={311--320},
	publisher={ACM Press},
	year={1995}
}
@article{Bayer-Osendorfer-2014,
	author={Bayer, J. and Osendorfer, C.},
	title={Learning stochastic recurrent networks},
	journal={arXiv e-prints},
	year={2014}
}
@article{Becker-Hinton-1992,
	author={Becker, S. and Hinton, G.},
	title={A self-organizing neural network that discovers surfaces in random-dot stereograms},
	journal={Nature},
	volume={355},
	pages={161--163},
	year={1992}
}
@article{Behnke-2001,
	author={Behnke, S.},
	title={Learning iterative image reconstruction in the neural abstraction pyramid},
	journal={Int. J. Computational Intelligence and Applications},
	volume={1},
	number={4},
	pages={427--438},
	year={2001}
}
@article{Beiu-et-al-2003,
	author={Beiu, V. and Quintana, J. M. and Avedillo, M. J.},
	title={{VLSI} implementations of threshold logic---a comprehensive survey},
	journal={IEEE Transactions on Neural Networks},
	volume={14},
	number={5},
	pages={1217--1243},
	year={2003}
}
@inproceedings{Belkin-Niyogi-2002,
	author={Belkin, M. and Niyogi, P.},
	title={{Laplacian} eigenmaps and spectral techniques for embedding and clustering},
	editor={Dietterich, T. and Becker, S. and Ghahramani, Z.},
	booktitle={Advances in Neural Information Processing Systems 14 (NIPS'01)},
	publisher={MIT Press},
	address={Cambridge, MA},
	year={2002}
}
@article{Belkin-Niyogi-2003,
	author={Belkin, M. and Niyogi, P.},
	title={{Laplacian} eigenmaps for dimensionality reduction and data representation},
	journal={Neural Computation},
	volume={15},
	number={6},
	pages={1373--1396},
	year={2003}
}
@article{Bengio-et-al-2015a,
	author={Bengio, E. and Bacon, P.-L. and Pineau, J. and Precup, D.},
	title={Conditional computation in neural networks for faster models},
	journal={arXiv:1511.06297},
	eprint={1511.06297},
	archiveprefix={arXiv},
	year={2015}
}
@article{Bengio-Bengio-2000a,
	author={Bengio, S. and Bengio, Y.},
	title={Taking on the curse of dimensionality in joint distributions using neural networks},
	journal={IEEE Transactions on Neural Networks},
	volume={11},
	number={3},
	pages={550--557},
	note={Special issue on Data Mining and Knowledge Discovery},
	year={2000}
}
@article{Bengio-et-al-2015b,
	author={Bengio, S. and Vinyals, O. and Jaitly, N. and Shazeer, N.},
	title={Scheduled sampling for sequence prediction with recurrent neural networks},
	journal={arXiv:1506.03099},
	eprint={1506.03099},
	archiveprefix={arXiv},
	note={Technical report},
	year={2015}
}
@phdthesis{Bengio-1991,
	author={Bengio, Y.},
	title={Artificial Neural Networks and their Application to Sequence Recognition},
	school={McGill University (Computer Science)},
	address={Montreal, Canada},
	year={1991}
}
@article{Bengio-2000,
	author={Bengio, Y.},
	title={Gradient-based optimization of hyperparameters},
	journal={Neural Computation},
	volume={12},
	number={8},
	pages={1889--1900},
	year={2000}
}
@techreport{Bengio-2002,
	author={Bengio, Y.},
	title={New distributed probabilistic language models},
	number={1215},
	institution={Dept. IRO, Université de Montréal},
	year={2002}
}
@book{Bengio-2009,
	author={Bengio, Y.},
	title={Learning deep architectures for {AI}},
	publisher={Now Publishers},
	year={2009}
}
@inproceedings{Bengio-2013,
	author={Bengio, Y.},
	title={Deep learning of representations: looking forward},
	booktitle={Statistical Language and Speech Processing},
	series={Lecture Notes in Computer Science},
	volume={7978},
	pages={1--37},
	publisher={Springer},
	eprint={1305.0445},
	archiveprefix={arXiv},
	year={2013}
}
@techreport{Bengio-2015,
	author={Bengio, Y.},
	title={Early inference in energy-based models approximates back-propagation},
	number={arXiv:1510.02777},
	institution={Université de Montréal},
	eprint={1510.02777},
	archiveprefix={arXiv},
	year={2015}
}
@inproceedings{Bengio-Bengio-2000b,
	author={Bengio, Y. and Bengio, S.},
	title={Modeling high-dimensional discrete data with multi-layer neural networks},
	booktitle={NIPS 12},
	pages={400--406},
	publisher={MIT Press},
	year={2000}
}
@article{Bengio-Delalleau-2009,
	author={Bengio, Y. and Delalleau, O.},
	title={Justifying and generalizing contrastive divergence},
	journal={Neural Computation},
	volume={21},
	number={6},
	pages={1601--1621},
	year={2009}
}
@inproceedings{Bengio-Grandvalet-2004,
	author={Bengio, Y. and Grandvalet, Y.},
	title={No unbiased estimator of the variance of k-fold cross-validation},
	editor={Thrun, S. and Saul, L. and Schölkopf, B.},
	booktitle={Advances in Neural Information Processing Systems 16 (NIPS'03)},
	publisher={MIT Press},
	address={Cambridge, MA},
	year={2004}
}
@inproceedings{Bengio-LeCun-2007,
	author={Bengio, Y. and LeCun, Y.},
	title={Scaling learning algorithms towards {AI}},
	booktitle={Large Scale Kernel Machines},
	year={2007}
}
@inproceedings{Bengio-Monperrus-2005,
	author={Bengio, Y. and Monperrus, M.},
	title={Non-local manifold tangent learning},
	editor={Saul, L. and Weiss, Y. and Bottou, L.},
	booktitle={Advances in Neural Information Processing Systems 17 (NIPS'04)},
	pages={129--136},
	publisher={MIT Press},
	year={2005}
}
@inproceedings{Bengio-Senecal-2003,
	author={Bengio, Y. and Sénécal, J.-S.},
	title={Quick training of probabilistic neural nets by importance sampling},
	booktitle={Proceedings of AISTATS 2003},
	year={2003}
}
@article{Bengio-Senecal-2008,
	author={Bengio, Y. and Sénécal, J.-S.},
	title={Adaptive importance sampling to accelerate training of a neural probabilistic language model},
	journal={IEEE Transactions on Neural Networks},
	volume={19},
	number={4},
	pages={713--722},
	year={2008}
}
@inproceedings{Bengio-et-al-1991,
	author={Bengio, Y. and De Mori, R. and Flammia, G. and Kompe, R.},
	title={Phonetically motivated acoustic parameters for continuous speech recognition using artificial neural networks},
	booktitle={Proceedings of EuroSpeech'91},
	year={1991}
}
@inproceedings{Bengio-et-al-1992,
	author={Bengio, Y. and De Mori, R. and Flammia, G. and Kompe, R.},
	title={Neural network-{Gaussian} mixture hybrid for speech recognition or density estimation},
	booktitle={NIPS 4},
	pages={175--182},
	publisher={Morgan Kaufmann},
	year={1992}
}
@inproceedings{Bengio-et-al-1993,
	author={Bengio, Y. and Frasconi, P. and Simard, P.},
	title={The problem of learning long-term dependencies in recurrent networks},
	booktitle={IEEE International Conference on Neural Networks, San Francisco},
	pages={1183--1195},
	publisher={IEEE Press},
	note={Invited paper},
	year={1993}
}
@article{Bengio-et-al-1994,
	author={Bengio, Y. and Simard, P. and Frasconi, P.},
	title={Learning long-term dependencies with gradient descent is difficult},
	journal={IEEE Transactions on Neural Networks},
	volume={5},
	number={2},
	pages={157--166},
	year={1994}
}
@inproceedings{Bengio-et-al-1999,
	author={Bengio, Y. and Latendresse, S. and Dugas, C.},
	title={Gradient-based learning of hyper-parameters},
	booktitle={Learning Conference, Snowbird},
	year={1999}
}
@inproceedings{Bengio-et-al-2001,
	author={Bengio, Y. and Ducharme, R. and Vincent, P.},
	title={A neural probabilistic language model},
	editor={Leen, T. K. and Dietterich, T. G. and Tresp, V.},
	booktitle={NIPS'2000},
	pages={932--938},
	publisher={MIT Press},
	year={2001}
}
@article{Bengio-et-al-2003,
	author={Bengio, Y. and Ducharme, R. and Vincent, P. and Jauvin, C.},
	title={A neural probabilistic language model},
	journal={Journal of Machine Learning Research},
	volume={3},
	pages={1137--1155},
	year={2003}
}
@inproceedings{Bengio-et-al-2006a,
	author={Bengio, Y. and Le Roux, N. and Vincent, P. and Delalleau, O. and Marcotte, P.},
	title={Convex neural networks},
	booktitle={NIPS'2005},
	pages={123--130},
	year={2006}
}
@inproceedings{Bengio-et-al-2006b,
	author={Bengio, Y. and Delalleau, O. and Le Roux, N.},
	title={The curse of highly variable functions for local kernel machines},
	booktitle={NIPS'2005},
	year={2006}
}
@inproceedings{Bengio-et-al-2006c,
	author={Bengio, Y. and Larochelle, H. and Vincent, P.},
	title={Non-local manifold {Parzen} windows},
	booktitle={NIPS'2005},
	publisher={MIT Press},
	year={2006}
}
@inproceedings{Bengio-et-al-2007,
	author={Bengio, Y. and Lamblin, P. and Popovici, D. and Larochelle, H.},
	title={Greedy layer-wise training of deep networks},
	booktitle={NIPS'2006},
	year={2007}
}
@inproceedings{Bengio-et-al-2009,
	author={Bengio, Y. and Louradour, J. and Collobert, R. and Weston, J.},
	title={Curriculum learning},
	booktitle={ICML'09},
	year={2009}
}
@inproceedings{Bengio-et-al-2013a,
	author={Bengio, Y. and Mesnil, G. and Dauphin, Y. and Rifai, S.},
	title={Better mixing via deep representations},
	booktitle={ICML'2013},
	year={2013}
}
@article{Bengio-et-al-2013b,
	author={Bengio, Y. and Léonard, N. and Courville, A.},
	title={Estimating or propagating gradients through stochastic neurons for conditional computation},
	journal={arXiv:1308.3432},
	eprint={1308.3432},
	archiveprefix={arXiv},
	year={2013}
}
@inproceedings{Bengio-et-al-2013c,
	author={Bengio, Y. and Yao, L. and Alain, G. and Vincent, P.},
	title={Generalized denoising auto-encoders as generative models},
	booktitle={NIPS'2013},
	year={2013}
}
@article{Bengio-et-al-2013d,
	author={Bengio, Y. and Courville, A. and Vincent, P.},
	title={Representation learning: A review and new perspectives},
	journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
	volume={35},
	number={8},
	pages={1798--1828},
	year={2013}
}
@inproceedings{Bengio-et-al-2014,
	author={Bengio, Y. and Thibodeau-Laufer, E. and Alain, G. and Yosinski, J.},
	title={Deep generative stochastic networks trainable by backprop},
	booktitle={ICML'2014},
	year={2014}
}
@article{Bennett-1976,
	author={Bennett, C.},
	title={Efficient estimation of free energy differences from {Monte Carlo} data},
	journal={Journal of Computational Physics},
	volume={22},
	number={2},
	pages={245--268},
	year={1976}
}
@misc{Bennett-Lanning-2007,
	author={Bennett, J. and Lanning, S.},
	title={The {Netflix} prize},
	year={2007}
}
@article{Berger-et-al-1996,
	author={Berger, A. L. and Della Pietra, V. J. and Della Pietra, S. A.},
	title={A maximum entropy approach to natural language processing},
	journal={Computational Linguistics},
	volume={22},
	pages={39--71},
	year={1996}
}
@article{Berglund-Raiko-2013,
	author={Berglund, M. and Raiko, T.},
	title={Stochastic gradient estimate variance in contrastive divergence and persistent contrastive divergence},
	journal={CoRR},
	volume={abs/1312.6002},
	eprint={1312.6002},
	archiveprefix={arXiv},
	year={2013}
}
@phdthesis{Bergstra-2011,
	author={Bergstra, J.},
	title={Incorporating Complex Cells into Neural Networks for Pattern Classification},
	school={Université de Montréal},
	year={2011}
}
@inproceedings{Bergstra-Bengio-2009,
	author={Bergstra, J. and Bengio, Y.},
	title={Slow, decorrelated features for pretraining complex cell-like networks},
	booktitle={NIPS'2009},
	year={2009}
}
@article{Bergstra-Bengio-2012,
	author={Bergstra, J. and Bengio, Y.},
	title={Random search for hyper-parameter optimization},
	journal={Journal of Machine Learning Research},
	volume={13},
	pages={281--305},
	year={2012}
}
@inproceedings{Bergstra-et-al-2010,
	author={Bergstra, J. and Breuleux, O. and Bastien, F. and Lamblin, P. and Pascanu, R. and Desjardins, G. and Turian, J. and Warde-Farley, D. and Bengio, Y.},
	title={{Theano}: a {CPU} and {GPU} math expression compiler},
	booktitle={Proc. SciPy},
	year={2010}
}
@inproceedings{Bergstra-et-al-2011,
	author={Bergstra, J. and Bardenet, R. and Bengio, Y. and Kégl, B.},
	title={Algorithms for hyper-parameter optimization},
	booktitle={NIPS'2011},
	year={2011}
}
@article{Berkes-Wiskott-2005,
	author={Berkes, P. and Wiskott, L.},
	title={Slow feature analysis yields a rich repertoire of complex cell properties},
	journal={Journal of Vision},
	volume={5},
	number={6},
	pages={579--602},
	year={2005}
}
@book{Bertsekas-Tsitsiklis-1996,
	author={Bertsekas, D. P. and Tsitsiklis, J.},
	title={Neuro-Dynamic Programming},
	publisher={Athena Scientific},
	year={1996}
}
@article{Besag-1975,
	author={Besag, J.},
	title={Statistical analysis of non-lattice data},
	journal={The Statistician},
	volume={24},
	number={3},
	pages={179--195},
	year={1975}
}
@misc{Bishop-1994,
	title  = "Mixture density networks",
	author = "Bishop, C. M.",
	year   = 1994
}
@inproceedings{Bishop-1995a,
	author={Bishop, C. M.},
	title={Regularization and complexity control in feed-forward networks},
	booktitle={Proceedings International Conference on Artificial Neural Networks ICANN'95},
	volume={1},
	pages={141--148},
	year={1995}
}
@article{Bishop-1995b,
	author={Bishop, C. M.},
	title={Training with noise is equivalent to {Tikhonov} regularization},
	journal={Neural Computation},
	volume={7},
	number={1},
	pages={108--116},
	year={1995}
}
@book{Bishop-2006,
	author={Bishop, C. M.},
	title={Pattern Recognition and Machine Learning},
	publisher={Springer},
	year={2006}
}
@inproceedings{Blum-Rivest-1992,
	title={Training a 3-node neural network is {NP-complete}},
	booktitle = {Proceedings of the 1st {International} {Conference} on {Neural} {Information} {Processing} {Systems}},
	publisher = {MIT Press},
	author = {Blum, Avrim and Rivest, Ronald L.},
	year = {1988},
	pages = {494--501}
}
@article{Blumer-et-al-1989,
	author={Blumer, A. and Ehrenfeucht, A. and Haussler, D. and Warmuth, M. K.},
	title={Learnability and the {Vapnik-Chervonenkis} dimension},
	journal={Journal of the ACM},
	volume={36},
	number={4},
	pages={929--965},
	year={1989}
}
@article{Bonnet-1964,
	author={Bonnet, G.},
	title={Transformations des signaux aléatoires à travers les systèmes non linéaires sans mémoire},
	journal={Annales des Télécommunications},
	volume={19},
	number={9--10},
	pages={203--220},
	year={1964}
}
@inproceedings{Bordes-et-al-2011,
	author={Bordes, A. and Weston, J. and Collobert, R. and Bengio, Y.},
	title={Learning structured embeddings of knowledge bases},
	booktitle={AAAI 2011},
	year={2011}
}
@inproceedings{Bordes-et-al-2012,
	author={Bordes, A. and Glorot, X. and Weston, J. and Bengio, Y.},
	title={Joint learning of words and meaning representations for open-text semantic parsing},
	booktitle={AISTATS'2012},
	year={2012}
}
@article{Bordes-et-al-2013a,
	author={Bordes, A. and Glorot, X. and Weston, J. and Bengio, Y.},
	title={A semantic matching energy function for learning with multi-relational data},
	journal={Machine Learning},
	note={Special Issue on Learning Semantics},
	year={2013}
}
@inproceedings{Bordes-et-al-2013b,
	author={Bordes, A. and Usunier, N. and Garcia-Duran, A. and Weston, J. and Yakhnenko, O.},
	title={Translating embeddings for modeling multi-relational data},
	editor={Burges, C. and Bottou, L. and Welling, M. and Ghahramani, Z. and Weinberger, K.},
	booktitle={Advances in Neural Information Processing Systems 26},
	pages={2787--2795},
	publisher={Curran Associates, Inc.},
	year={2013}
}
@inproceedings{Bornschein-Bengio-2015,
	author={Bornschein, J. and Bengio, Y.},
	title={Reweighted wake-sleep},
	booktitle={ICLR'2015},
	eprint={1406.2751},
	archiveprefix={arXiv},
	year={2015}
}
@article{Bornschein-et-al-2015,
	author={Bornschein, J. and Shabanian, S. and Fischer, A. and Bengio, Y.},
	title={Training bidirectional {Helmholtz} machines},
	journal={arXiv:1506.03877},
	eprint={1506.03877},
	archiveprefix={arXiv},
	note={Technical report},
	year={2015}
}
@inproceedings{Boser-et-al-1992,
	author={Boser, B. E. and Guyon, I. M. and Vapnik, V. N.},
	title={A training algorithm for optimal margin classifiers},
	booktitle={COLT '92: Proceedings of the fifth annual workshop on Computational learning theory},
	pages={144--152},
	publisher={ACM},
	address={New York, NY, USA},
	year={1992}
}
@incollection{Bottou-1998,
	author={Bottou, L.},
	title={Online algorithms and stochastic approximations},
	editor={Saad, D.},
	booktitle={Online Learning in Neural Networks},
	publisher={Cambridge University Press},
	address={Cambridge, UK},
	year={1998}
}
@article{Bottou-2011,
	author={Bottou, L.},
	title={From machine learning to machine reasoning},
	journal={arXiv:1102.1808},
	eprint={1102.1808},
	archiveprefix={arXiv},
	note={Technical report},
	year={2011}
}
@misc{Bottou-2015,
	author={Bottou, L.},
	title={Multilayer neural networks},
	howpublished={Deep Learning Summer School},
	year={2015}
}
@inproceedings{Bottou-Bousquet-2008,
	author={Bottou, L. and Bousquet, O.},
	title={The tradeoffs of large scale learning},
	booktitle={NIPS'2008},
	year={2008}
}
@inproceedings{Boulanger-Lewandowski-et-al-2012,
	author={Boulanger-Lewandowski, N. and Bengio, Y. and Vincent, P.},
	title={Modeling temporal dependencies in high-dimensional sequences: Application to polyphonic music generation and transcription},
	booktitle={ICML'12},
	year={2012}
}
@inproceedings{Boureau-et-al-2010,
	author={Boureau, Y. and Ponce, J. and LeCun, Y.},
	title={A theoretical analysis of feature pooling in vision algorithms},
	booktitle={Proc. International Conference on Machine Learning (ICML'10)},
	year={2010}
}
@inproceedings{Boureau-et-al-2011,
	author={Boureau, Y. and Le Roux, N. and Bach, F. and Ponce, J. and LeCun, Y.},
	title={Ask the locals: multi-way local pooling for image recognition},
	booktitle={Proc. International Conference on Computer Vision (ICCV'11)},
	publisher={IEEE},
	year={2011}
}
@article{Bourlard-Kamp-1988,
	author={Bourlard, H. and Kamp, Y.},
	title={Auto-association by multilayer perceptrons and singular value decomposition},
	journal={Biological Cybernetics},
	volume={59},
	pages={291--294},
	year={1988}
}
@article{Bourlard-Wellekens-1989,
	author={Bourlard, H. and Wellekens, C.},
	title={Speech pattern discrimination and multi-layered perceptrons},
	journal={Computer Speech and Language},
	volume={3},
	pages={1--19},
	year={1989}
}
@book{Boyd-Vandenberghe-2004,
	author={Boyd, S. and Vandenberghe, L.},
	title={Convex Optimization},
	publisher={Cambridge University Press},
	address={New York, NY, USA},
	year={2004}
}
@article{Brady-et-al-1989,
	author={Brady, M. L. and Raghavan, R. and Slawny, J.},
	title={Back-propagation fails to separate where perceptrons succeed},
	journal={IEEE Transactions on Circuits and Systems},
	volume={36},
	pages={665--674},
	year={1989}
}
@article{Brakel-et-al-2013,
	author={Brakel, P. and Stroobandt, D. and Schrauwen, B.},
	title={Training energy-based models for time-series imputation},
	journal={Journal of Machine Learning Research},
	volume={14},
	pages={2771--2797},
	year={2013}
}
@inproceedings{Brand-2003,
	author={Brand, M.},
	title={Charting a manifold},
	booktitle={NIPS'2002},
	pages={961--968},
	publisher={MIT Press},
	year={2003}
}
@article{Breiman-1994,
	author={Breiman, L.},
	title={Bagging predictors},
	journal={Machine Learning},
	volume={24},
	number={2},
	pages={123--140},
	year={1994}
}
@book{Breiman-et-al-1984,
	author={Breiman, L. and Friedman, J. H. and Olshen, R. A. and Stone, C. J.},
	title={Classification and Regression Trees},
	publisher={Wadsworth International Group},
	address={Belmont, CA},
	year={1984}
}
@article{Bridle-1990,
	author={Bridle, J. S.},
	title={Alphanets: a recurrent `neural' network architecture with a hidden {Markov} model interpretation},
	journal={Speech Communication},
	volume={9},
	number={1},
	pages={83--92},
	year={1990}
}
@inproceedings{Briggman-et-al-2009,
	author={Briggman, K. and Denk, W. and Seung, S. and Helmstaedter, M. N. and Turaga, S. C.},
	title={Maximin affinity learning of image segmentation},
	booktitle={NIPS'2009},
	pages={1865--1873},
	year={2009}
}
@article{Brown-et-al-1990,
	author={Brown, P. F. and Cocke, J. and Pietra, S. A. D. and Pietra, V. J. D. and Jelinek, F. and Lafferty, J. D. and Mercer, R. L. and Roossin, P. S.},
	title={A statistical approach to machine translation},
	journal={Computational Linguistics},
	volume={16},
	number={2},
	pages={79--85},
	year={1990}
}
@article{Brown-et-al-1992,
	author={Brown, P. F. and Pietra, V. J. D. and DeSouza, P. V. and Lai, J. C. and Mercer, R. L.},
	title={Class-based n-gram models of natural language},
	journal={Computational Linguistics},
	volume={18},
	pages={467--479},
	year={1992}
}
@book{Bryson-Ho-1969,
	author={Bryson, A. and Ho, Y.},
	title={Applied optimal control: optimization, estimation, and control},
	publisher={Blaisdell Pub. Co.},
	year={1969}
}
@techreport{Bryson-Denham-1961,
	author={Bryson, A. E. and Denham, W. F.},
	title={A steepest-ascent method for solving optimum programming problems},
	number={BR-1303},
	institution={Raytheon Company, Missile and Space Division},
	year={1961}
}
@inproceedings{Bucilua-et-al-2006,
	author={Bucilu{\v{a}}, C. and Caruana, R. and Niculescu-Mizil, A.},
	title={Model compression},
	booktitle={Proceedings of the 12th ACM SIGKDD international conference on Knowledge discovery and data mining},
	pages={535--541},
	publisher={ACM},
	year={2006}
}
@article{Burda-et-al-2015,
	author={Burda, Y. and Grosse, R. and Salakhutdinov, R.},
	title={Importance weighted autoencoders},
	journal={arXiv:1509.00519},
	eprint={1509.00519},
	archiveprefix={arXiv},
	year={2015}
}
@inproceedings{Cai-et-al-2013,
	author={Cai, M. and Shi, Y. and Liu, J.},
	title={Deep maxout neural networks for speech recognition},
	booktitle={2013 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
	pages={291--296},
	publisher={IEEE},
	year={2013}
}
@inproceedings{Carreira-Perpinan-Hinton-2005,
	author={Carreira-Perpiñan, M. A. and Hinton, G. E.},
	title={On contrastive divergence learning},
	editor={Cowell, R. G. and Ghahramani, Z.},
	booktitle={Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics (AISTATS'05)},
	pages={33--40},
	publisher={Society for Artificial Intelligence and Statistics},
	year={2005}
}
@inproceedings{Caruana-1993,
	author={Caruana, R.},
	title={Multitask connectionist learning},
	booktitle={Proc. 1993 Connectionist Models Summer School},
	pages={372--379},
	year={1993}
}
@inproceedings{Cauchy-1847,
	author={Cauchy, A.},
	title={Méthode générale pour la résolution de systèmes d'équations simultanées},
	booktitle={Compte rendu des séances de l'académie des sciences},
	pages={536--538},
	year={1847}
}
@techreport{Cayton-2005,
	author={Cayton, L.},
	title={Algorithms for manifold learning},
	number={CS2008-0923},
	institution={UCSD},
	year={2005}
}
@article{Chandola-et-al-2009,
	author={Chandola, V. and Banerjee, A. and Kumar, V.},
	title={Anomaly detection: A survey},
	journal={ACM Computing Surveys},
	volume={41},
	number={3},
	pages={15},
	year={2009}
}
@inproceedings{Chapelle-et-al-2003,
	author={Chapelle, O. and Weston, J. and Schölkopf, B.},
	title={Cluster kernels for semi-supervised learning},
	editor={Becker, S. and Thrun, S. and Obermayer, K.},
	booktitle={Advances in Neural Information Processing Systems 15 (NIPS'02)},
	pages={585--592},
	publisher={MIT Press},
	address={Cambridge, MA},
	year={2003}
}
@book{Chapelle-et-al-2006,
	editor={Chapelle, O. and Schölkopf, B. and Zien, A.},
	title={Semi-Supervised Learning},
	publisher={MIT Press},
	address={Cambridge, MA},
	year={2006}
}
@inproceedings{Chellapilla-et-al-2006,
	author={Chellapilla, K. and Puri, S. and Simard, P.},
	title={High Performance Convolutional Neural Networks for Document Processing},
	editor={Lorette, Guy},
	booktitle={Tenth International Workshop on Frontiers in Handwriting Recognition, La Baule (France)},
	organization={Université de Rennes 1},
	publisher={Suvisoft},
	url={http://www.suvisoft.com},
	year={2006}
}
@inproceedings{Chen-et-al-2010,
	author={Chen, B. and Ting, J.-A. and Marlin, B. M. and de Freitas, N.},
	title={Deep learning of invariant spatio-temporal features from video},
	booktitle={NIPS*2010 Deep Learning and Unsupervised Feature Learning Workshop},
	year={2010}
}
@article{Chen-Goodman-1999,
	author={Chen, S. F. and Goodman, J. T.},
	title={An empirical study of smoothing techniques for language modeling},
	journal={Computer Speech and Language},
	volume={13},
	number={4},
	pages={359--393},
	year={1999}
}
@inproceedings{Chen-et-al-2014a,
	author={Chen, T. and Du, Z. and Sun, N. and Wang, J. and Wu, C. and Chen, Y. and Temam, O.},
	title={{DianNao}: A small-footprint high-throughput accelerator for ubiquitous machine-learning},
	booktitle={Proceedings of the 19th international conference on Architectural support for programming languages and operating systems},
	pages={269--284},
	publisher={ACM},
	year={2014}
}
@article{Chen-et-al-2015,
	author={Chen, T. and Li, M. and Li, Y. and Lin, M. and Wang, N. and Wang, M. and Xiao, T. and Xu, B. and Zhang, C. and Zhang, Z.},
	title={{MXNet}: A flexible and efficient machine learning library for heterogeneous distributed systems},
	journal={arXiv:1512.01274},
	eprint={1512.01274},
	archiveprefix={arXiv},
	year={2015}
}
@inproceedings{Chen-et-al-2014b,
	author={Chen, Y. and Luo, T. and Liu, S. and Zhang, S. and He, L. and Wang, J. and Li, L. and Chen, T. and Xu, Z. and Sun, N. and others},
	title={{DaDianNao}: A machine-learning supercomputer},
	booktitle={2014 47th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
	pages={609--622},
	publisher={IEEE},
	year={2014}
}
@inproceedings{Chilimbi-et-al-2014,
	author={Chilimbi, T. and Suzue, Y. and Apacible, J. and Kalyanaraman, K.},
	title={Project {Adam}: Building an efficient and scalable deep learning training system},
	booktitle={11th USENIX Symposium on Operating Systems Design and Implementation (OSDI'14)},
	year={2014}
}
@inproceedings{Cho-et-al-2010,
	author={Cho, K. and Raiko, T. and Ilin, A.},
	title={Parallel tempering is efficient for learning restricted {Boltzmann} machines},
	booktitle={IJCNN'2010},
	year={2010}
}
@inproceedings{Cho-et-al-2011,
	author={Cho, K. and Raiko, T. and Ilin, A.},
	title={Enhanced gradient and adaptive learning rate for training restricted {Boltzmann} machines},
	booktitle={ICML'2011},
	pages={105--112},
	year={2011}
}
@inproceedings{Cho-et-al-2014a,
	author={Cho, K. and van Merriënboer, B. and Gulcehre, C. and Bougares, F. and Schwenk, H. and Bengio, Y.},
	title={Learning phrase representations using {RNN} encoder-decoder for statistical machine translation},
	booktitle={Proceedings of the Empirical Methods in Natural Language Processing (EMNLP 2014)},
	year={2014}
}
@article{Cho-et-al-2014b,
	author={Cho, K. and van Merriënboer, B. and Bahdanau, D. and Bengio, Y.},
	title={On the properties of neural machine translation: Encoder-decoder approaches},
	journal={arXiv e-prints},
	volume={abs/1409.1259},
	eprint={1409.1259},
	archiveprefix={arXiv},
	year={2014}
}
@article{Choromanska-et-al-2014,
	author={Choromanska, A. and Henaff, M. and Mathieu, M. and Ben Arous, G. and LeCun, Y.},
	title={The loss surface of multilayer networks},
	journal={arXiv:1412.0233},
	eprint={1412.0233},
	archiveprefix={arXiv},
	year={2014}
}
@article{Chorowski-et-al-2014,
	author={Chorowski, J. and Bahdanau, D. and Cho, K. and Bengio, Y.},
	title={End-to-end continuous speech recognition using attention-based recurrent {NN}: First results},
	journal={arXiv:1412.1602},
	eprint={1412.1602},
	archiveprefix={arXiv},
	year={2014}
}
@article{Christianson-1992,
	author={Christianson, B.},
	title={Automatic {Hessians} by reverse accumulation},
	journal={IMA Journal of Numerical Analysis},
	volume={12},
	number={2},
	pages={135--150},
	year={1992}
}
@article{Chrupala-et-al-2015,
	author={Chrupała, G. and Kádár, Á. and Alishahi, A.},
	title={Learning language through pictures},
	journal={arXiv:1506.03694.},
	year={2015}
}
@article{Chung-et-al-2014,
	author={Chung, J. and Gulcehre, C. and Cho, K. and Bengio, Y.},
	title={Empirical evaluation of gated recurrent neural networks on sequence modeling},
	journal={NIPS'2014 Deep Learning workshop, arXiv:1412.3555.},
	year={2014}
}
@inproceedings{Chung-et-al-2015a,
	author={Chung, J. and Gülçehre, Ç. and Cho, K. and Bengio, Y.},
	title={Gated feedback recurrent neural networks},
	booktitle={In ICML'15.},
	year={2015}
}
@inproceedings{Chung-et-al-2015b,
	author={Chung, J. and Kastner, K. and Dinh, L. and Goel, K. and Courville, A. and Bengio, Y.},
	title={A recurrent latent variable model for sequential data},
	booktitle={In NIPS'2015.},
	year={2015}
}
@article{Ciresan-et-al-2012,
	author={Ciresan, D. and Meier, U. and Masci, J. and Schmidhuber, J.},
	title={Multi-column deep neural network for traffic sign classification},
	journal={Neural Networks},
	volume={32},
	pages={333-338},
	year={2012}
}
@article{Ciresan-et-al-2010,
	author={Ciresan, D. C. and Meier, U. and Gambardella, L. M. and Schmidhuber, J.},
	title={Deep big simple neural nets for handwritten digit recognition},
	journal={Neural Computation},
	volume={22},
	pages={1-14},
	year={2010}
}
@inproceedings{Coates-Ng-2011,
	author={Coates, A. and Ng, A. Y.},
	title={The importance of encoding versus training with sparse coding and vector quantization},
	booktitle={In ICML'2011.},
	year={2011}
}
@inproceedings{Coates-et-al-2011,
	author={Coates, A. and Lee, H. and Ng, A. Y.},
	title={An analysis of single-layer networks in unsupervised feature learning},
	booktitle={In Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics (AISTATS 2011).},
	year={2011}
}
@inproceedings{Coates-et-al-2013,
	author={Coates, A. and Huval, B. and Wang, T. and Wu, D. and Catanzaro, B. and Ng, A. Y.},
	title={Deep learning with COTS HPC systems},
	booktitle={In S. Dasgupta and D. McAllester, editors, Proceedings of the 30th International Conference on Machine Learning (ICML-13), JMLR Workshop and Conference Proceedings},
	volume={28},
	pages={1337-1345},
	year={2013}
}
@article{Cohen-et-al-2015,
	author={Cohen, N. and Sharir, O. and Shashua, A.},
	title={On the expressive power of deep learning: A tensor analysis},
	journal={arXiv:1509.05009.},
	year={2015}
}
@phdthesis{Collobert-2004,
	author={Collobert, R.},
	title={Large Scale Machine Learning},
	school={Université de Paris VI, LIP6.},
	year={2004}
}
@inproceedings{Collobert-2011,
	author={Collobert, R.},
	title={Deep learning for efficient discriminative parsing},
	booktitle={In AISTATS'2011},
	year={2011}
}
@inproceedings{Collobert-Weston-2008a,
	author={Collobert, R. and Weston, J.},
	title={A unified architecture for natural language processing: Deep neural networks with multitask learning},
	booktitle={In ICML'2008.},
	year={2008}
}
@inproceedings{Collobert-Weston-2008b,
	author={Collobert, R. and Weston, J.},
	title={A unified architecture for natural language processing: Deep neural networks with multitask learning},
	booktitle={In ICML'2008.},
	year={2008}
}
@techreport{Collobert-et-al-2001,
	author={Collobert, R. and Bengio, S. and Bengio, Y.},
	title={A parallel mixture of SVMs for very large scale problems},
	number={IDIAP-RR-01-12},
	institution={IDIAP},
	year={2001}
}
@article{Collobert-et-al-2002,
	author={Collobert, R. and Bengio, S. and Bengio, Y.},
	title={A parallel mixture of SVMs for very large scale problems},
	journal={Neural Computation},
	volume={14},
	number={5},
	pages={1105-1114},
	year={2002}
}
@article{Collobert-et-al-2011a,
	author={Collobert, R. and Weston, J. and Bottou, L. and Karlen, M. and Kavukcuoglu, K. and Kuksa, P.},
	title={Natural language processing (almost) from scratch},
	journal={Journal of Machine Learning Research},
	volume={12},
	pages={2493-2537},
	year={2011}
}
@inproceedings{Collobert-et-al-2011b,
	author={Collobert, R. and Kavukcuoglu, K. and Farabet, C.},
	title={Torch7: A Matlab-like environment for machine learning},
	booktitle={In BigLearn, NIPS Workshop.},
	year={2011}
}
@article{Comon-1994,
	author={Comon, P.},
	title={Independent component analysis - a new concept?},
	journal={Signal Processing},
	volume={36},
	pages={287-314},
	year={1994}
}
@article{Cortes-Vapnik-1995,
	author={Cortes, C. and Vapnik, V.},
	title={Support vector networks},
	journal={Machine Learning},
	volume={20},
	pages={273-297},
	year={1995}
}
@inproceedings{Couprie-et-al-2013,
	author={Couprie, C. and Farabet, C. and Najman, L. and LeCun, Y.},
	title={Indoor semantic segmentation using depth information},
	booktitle={In International Conference on Learning Representations (ICLR2013).},
	year={2013}
}
@inproceedings{Courbariaux-et-al-2015,
	author={Courbariaux, M. and Bengio, Y. and David, J.-P.},
	title={Low precision arithmetic for deep learning},
	booktitle={In Arxiv:1412.7024, ICLR'2015 Workshop.},
	year={2015}
}
@inproceedings{Courville-et-al-2011,
	author={Courville, A. and Bergstra, J. and Bengio, Y.},
	title={Unsupervised models of images by spike-and-slab RBMs},
	booktitle={In ICML'11},
	year={2011}
}
@article{Courville-et-al-2014,
	author={Courville, A. and Desjardins, G. and Bergstra, J. and Bengio, Y.},
	title={The spike-and-slab RBM and extensions to discrete and sparse data distributions},
	journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
	volume={36},
	number={9},
	pages={1874-1887},
	year={2014}
}
@book{Cover-Thomas-2006,
	author={Cover, T. M. and Thomas, J. A.},
	title={Elements of Information Theory},
	edition={Second},
	publisher={Wiley-Interscience},
	year={2006}
}
@inproceedings{Cox-Pinto-2011,
	author={Cox, D. and Pinto, N.},
	title={Beyond simple features: A large-scale feature search approach to unconstrained face recognition},
	booktitle={In 2011 IEEE International Conference on Automatic Face \& Gesture Recognition and Workshops (FG 2011)},
	pages={8-15},
	publisher={IEEE},
	year={2011}
}
@book{Cramer-1946,
	author={Cramér, H.},
	title={Mathematical methods of statistics},
	publisher={Princeton University Press},
	year={1946}
}
@article{Crick-Mitchison-1983,
	author={Crick, F. H. C. and Mitchison, G.},
	title={The function of dream sleep},
	journal={Nature},
	volume={304},
	pages={111-114},
	year={1983}
}
@article{Cybenko-1989,
	author={Cybenko, G.},
	title={Approximation by superpositions of a sigmoidal function},
	journal={Mathematics of Control, Signals, and Systems},
	volume={2},
	pages={303-314},
	year={1989}
}
@inproceedings{Dahl-et-al-2010,
	author={Dahl, G. E. and Ranzato, M. and Mohamed, A. and Hinton, G. E.},
	title={Phone recognition with the mean-covariance restricted Boltzmann machine},
	booktitle={In NIPS'2010},
	year={2010}
}
@article{Dahl-et-al-2012,
	author={Dahl, G. E. and Yu, D. and Deng, L. and Acero, A.},
	title={Context-dependent pre-trained deep neural networks for large vocabulary speech recognition},
	journal={IEEE Transactions on Audio, Speech, and Language Processing},
	volume={20},
	number={1},
	pages={33-42},
	year={2012}
}
@inproceedings{Dahl-et-al-2013,
	author={Dahl, G. E. and Sainath, T. N. and Hinton, G. E.},
	title={Improving deep neural networks for LVCSR using rectified linear units and dropout},
	booktitle={In ICASSP'2013},
	year={2013}
}
@article{Dahl-et-al-2014,
	author={Dahl, G. E. and Jaitly, N. and Salakhutdinov, R.},
	title={Multi-task neural networks for QSAR predictions},
	journal={arXiv:1406.1231.},
	year={2014}
}
@inproceedings{Dauphin-Bengio-2013,
	author={Dauphin, Y. and Bengio, Y.},
	title={Stochastic ratio matching of RBMs for sparse high-dimensional inputs},
	booktitle={In NIPS26},
	publisher={NIPS Foundation},
	year={2013}
}
@inproceedings{Dauphin-et-al-2011,
	author={Dauphin, Y. and Glorot, X. and Bengio, Y.},
	title={Large-scale learning of embeddings with reconstruction sampling},
	booktitle={In ICML'2011.},
	year={2011}
}
@inproceedings{Dauphin-et-al-2014,
	author={Dauphin, Y. and Pascanu, R. and Gulcehre, C. and Cho, K. and Ganguli, S. and Bengio, Y.},
	title={Identifying and attacking the saddle point problem in high-dimensional non-convex optimization},
	booktitle={In NIPS'2014},
	year={2014}
}
@article{Davis-et-al-2014,
	author={Davis, A. and Rubinstein, M. and Wadhwa, N. and Mysore, G. and Durand, F. and Freeman, W. T.},
	title={The visual microphone: Passive recovery of sound from video},
	journal={ACM Transactions on Graphics (Proc. SIGGRAPH)},
	volume = {33},
	number = {4},
	pages = {79:1-79:10},
	year={2014}
}
@inproceedings{Dayan-1990,
	author={Dayan, P.},
	title={Reinforcement comparison},
	booktitle={In Connectionist Models: Proceedings of the 1990 Connectionist Summer School, San Mateo, CA.},
	year={1990}
}
@article{Dayan-Hinton-1996,
	author={Dayan, P. and Hinton, G. E.},
	title={Varieties of Helmholtz machine},
	journal={Neural Networks},
	volume={9},
	number={8},
	pages={1385-1403},
	year={1996}
}
@article{Dayan-et-al-1995,
	author={Dayan, P. and Hinton, G. E. and Neal, R. M. and Zemel, R. S.},
	title={The Helmholtz machine},
	journal={Neural Computation},
	volume={7},
	number={5},
	pages={889-904},
	year={1995}
}
@inproceedings{Dean-et-al-2012,
	author={Dean, J. and Corrado, G. and Monga, R. and Chen, K. and Devin, M. and Le, Q. and Mao, M. and Ranzato, M. and Senior, A. and Tucker, P. and Yang, K. and Ng, A. Y.},
	title={Large scale distributed deep networks},
	booktitle={In NIPS'2012},
	year={2012}
}
@article{Dean-Kanazawa-1989,
	author={Dean, T. and Kanazawa, K.},
	title={A model for reasoning about persistence and causation},
	journal={Computational Intelligence},
	volume={5},
	number={3},
	pages={142-150},
	year={1989}
}
@article{Deerwester-et-al-1990,
	author={Deerwester, S. and Dumais, S. T. and Furnas, G. W. and Landauer, T. K. and Harshman, R.},
	title={Indexing by latent semantic analysis},
	journal={Journal of the American Society for Information Science},
	volume={41},
	number={6},
	pages={391-407},
	year={1990}
}
@inproceedings{Delalleau-Bengio-2011,
	author={Delalleau, O. and Bengio, Y.},
	title={Shallow vs. deep sum-product networks},
	booktitle={In NIPS.},
	year={2011}
}
@inproceedings{Deng-et-al-2009,
	author={Deng, J. and Dong, W. and Socher, R. and Li, L.-J. and Li, K. and Fei-Fei, L.},
	title={ImageNet: A Large-Scale Hierarchical Image Database},
	booktitle={In CVPR09},
	year={2009}
}
@inproceedings{Deng-et-al-2010a,
	author={Deng, J. and Berg, A. C. and Li, K. and Fei-Fei, L.},
	title={What does classifying more than 10,000 image categories tell us?},
	booktitle={In Proceedings of the 11th European Conference on Computer Vision: Part V, ECCV'10},
	pages={71-84},
	publisher={Springer-Verlag},
	address={Berlin, Heidelberg},
	year={2010}
}
@article{Deng-Yu-2014,
	author={Deng, L. and Yu, D.},
	title={Deep learning - methods and applications},
	journal={Foundations and Trends in Signal Processing.},
	year={2014}
}
@inproceedings{Deng-et-al-2010b,
	author={Deng, L. and Seltzer, M. and Yu, D. and Acero, A. and Mohamed, A. and Hinton, G.},
	title={Binary coding of speech spectrograms using a deep auto-encoder},
	booktitle={In Interspeech 2010, Makuhari, Chiba, Japan.},
	year={2010}
}
@article{Denil-et-al-2012,
	author={Denil, M. and Bazzani, L. and Larochelle, H. and de Freitas, N.},
	title={Learning where to attend with deep architectures for image tracking},
	journal={Neural Computation},
	volume={24},
	number={8},
	pages={2151-2184},
	year={2012}
}
@inproceedings{Denton-et-al-2015,
	author={Denton, E. and Chintala, S. and Szlam, A. and Fergus, R.},
	title={Deep generative image models using a Laplacian pyramid of adversarial networks},
	booktitle={In NIPS'2015.},
	year={2015}
}
@techreport{Desjardins-Bengio-2008,
	author={Desjardins, G. and Bengio, Y.},
	title={Empirical evaluation of convolutional RBMs for vision},
	number={1327},
	institution={Département d'Informatique et de Recherche Opérationnelle, Université de Montréal},
	year={2008}
}
@inproceedings{Desjardins-et-al-2010,
	author={Desjardins, G. and Courville, A. C. and Bengio, Y. and Vincent, P. and Delalleau, O.},
	title={Tempered Markov chain Monte Carlo for training of restricted Boltzmann machines},
	booktitle={In International Conference on Artificial Intelligence and Statistics},
	pages={145-152},
	year={2010}
}
@inproceedings{Desjardins-et-al-2011,
	author={Desjardins, G. and Courville, A. and Bengio, Y.},
	title={On tracking the partition function},
	booktitle={In NIPS'2011.},
	year={2011}
}
@inproceedings{Desjardins-et-al-2015,
	author={Desjardins, G. and Simonyan, K. and Pascanu, R. and Kavukcuoglu, K.},
	title={Natural neural networks},
	booktitle={In Advances in Neural Information Processing Systems},
	pages={2062-2070},
	year={2015}
}
@inproceedings{Devlin-et-al-2014,
	author={Devlin, J. and Zbib, R. and Huang, Z. and Lamar, T. and Schwartz, R. and Makhoul, J.},
	title={Fast and robust neural network joint models for statistical machine translation},
	booktitle={In Proc. ACL'2014.},
	year={2014}
}
@book{Devroye-2013,
	author={Devroye, L.},
	title={Non-Uniform Random Variate Generation},
	series={SpringerLink: Bücher},
	publisher={Springer New York},
	year={2013}
}
@misc{DiCarlo-2013,
	author={DiCarlo, J. J.},
	title={Mechanisms underlying visual object recognition: Humans vs. neurons vs. machines},
	howpublished={NIPS Tutorial},
	year={2013}
}
@article{Dinh-et-al-2014,
	author={Dinh, L. and Krueger, D. and Bengio, Y.},
	title={NICE: Non-linear independent components estimation},
	journal={arXiv:1410.8516.},
	year={2014}
}
@article{Donahue-et-al-2014,
	author={Donahue, J. and Hendricks, L. A. and Guadarrama, S. and Rohrbach, M. and Venugopalan, S. and Saenko, K. and Darrell, T.},
	title={Long-term recurrent convolutional networks for visual recognition and description},
	journal={arXiv:1411.4389.},
	year={2014}
}
@techreport{Donoho-Grimes-2003,
	author={Donoho, D. L. and Grimes, C.},
	title={Hessian eigenmaps: new locally linear embedding techniques for high-dimensional data},
	number={2003-08},
	institution={Dept. Statistics, Stanford University},
	year={2003}
}
@inproceedings{Dosovitskiy-et-al-2015,
	author={Dosovitskiy, A. and Springenberg, J. T. and Brox, T.},
	title={Learning to generate chairs with convolutional neural networks},
	booktitle={In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
	pages={1538-1546},
	year={2015}
}
@article{Doya-1993,
	author={Doya, K.},
	title={Bifurcations of recurrent neural networks in gradient descent learning},
	journal={IEEE Transactions on Neural Networks},
	volume={1},
	pages={75-80},
	year={1993}
}
@article{Dreyfus-1962,
	author={Dreyfus, S. E.},
	title={The numerical solution of variational problems},
	journal={Journal of Mathematical Analysis and Applications},
	volume={5},
	number={1},
	pages={30-45},
	year={1962}
}
@article{Dreyfus-1973,
	author={Dreyfus, S. E.},
	title={The computational solution of optimal control problems with time lag},
	journal={IEEE Transactions on Automatic Control},
	volume={18},
	number={4},
	pages={383-385},
	year={1973}
}
@article{Drucker-LeCun-1992,
	author={Drucker, H. and LeCun, Y.},
	title={Improving generalisation performance using double back-propagation},
	journal={IEEE Transactions on Neural Networks},
	volume={3},
	number={6},
	pages={991-997},
	year={1992}
}
@article{Duchi-et-al-2011,
	author={Duchi, J. and Hazan, E. and Singer, Y.},
	title={Adaptive subgradient methods for online learning and stochastic optimization},
	journal={Journal of Machine Learning Research},
	volume={12},
	pages={2121-2159},
	year={2011}
}
@inproceedings{Dudik-et-al-2011,
	author={Dudik, M. and Langford, J. and Li, L.},
	title={Doubly robust policy evaluation and learning},
	booktitle={In Proceedings of the 28th International Conference on Machine learning, ICML '11.},
	year={2011}
}
@inproceedings{Dugas-et-al-2001,
	author={Dugas, C. and Bengio, Y. and Bélisle, F. and Nadeau, C.},
	title={Incorporating second-order functional knowledge for better option pricing},
	booktitle={In T. Leen, T. Dietterich, and V. Tresp, editors, Advances in Neural Information Processing Systems 13 (NIPS'00)},
	pages={472-478},
	publisher={MIT Press},
	year={2001}
}
@article{Dziugaite-et-al-2015,
	author={Dziugaite, G. K. and Roy, D. M. and Ghahramani, Z.},
	title={Training generative neural networks via maximum mean discrepancy optimization},
	journal={arXiv preprint arXiv:1505.03906},
	year={2015}
}
@inproceedings{Hihi-Bengio-1996,
	author={El Hihi, S. and Bengio, Y.},
	title={Hierarchical recurrent neural networks for long-term dependencies},
	booktitle={In NIPS'1995},
	year={1996}
}
@inproceedings{Elkahky-et-al-2015,
	author={Elkahky, A. M. and Song, Y. and He, X.},
	title={A multi-view deep learning approach for cross domain user modeling in recommendation systems},
	booktitle={In Proceedings of the 24th International Conference on World Wide Web},
	pages={278-288},
	year={2015}
}
@article{Elman-1993,
	author={Elman, J. L.},
	title={Learning and development in neural networks: The importance of starting small},
	journal={Cognition},
	volume={48},
	pages={71-99},
	year={1993}
}
@inproceedings{Erhan-et-al-2009,
	author={Erhan, D. and Manzagol, P.-A. and Bengio, Y. and Bengio, S. and Vincent, P.},
	title={The difficulty of training deep architectures and the effect of unsupervised pre-training},
	booktitle={In Proceedings of AISTATS'2009.},
	year={2009}
}
@article{Erhan-et-al-2010,
	author={Erhan, D. and Bengio, Y. and Courville, A. and Manzagol, P. and Vincent, P. and Bengio, S.},
	title={Why does unsupervised pre-training help deep learning?},
	journal={Journal of Machine Learning Research},
	volume={11},
	pages={625-660},
	year={2010}
}
@inproceedings{Fahlman-et-al-1983,
	author={Fahlman, S. E. and Hinton, G. E. and Sejnowski, T. J.},
	title={Massively parallel architectures for AI: NETL, thistle, and Boltzmann machines},
	booktitle={In Proceedings of the National Conference on Artificial Intelligence AAAI-83.},
	year={1983}
}
@article{Fang-et-al-2015,
	author={Fang, H. and Gupta, S. and Iandola, F. and Srivastava, R. and Deng, L. and Dollár, P. and Gao, J. and He, X. and Mitchell, M. and Platt, J. C. and Zitnick, C. L. and Zweig, G.},
	title={From captions to visual concepts and back},
	journal={arXiv:1411.4952.},
	year={2015}
}
@inproceedings{Farabet-et-al-2011,
	author={Farabet, C. and LeCun, Y. and Kavukcuoglu, K. and Culurciello, E. and Martini, B. and Akselrod, P. and Talay, S.},
	title={Large-scale FPGA-based convolutional networks},
	booktitle={In R. Bekkerman, M. Bilenko, and J. Langford, editors, Scaling up Machine Learning: Parallel and Distributed Approaches. Cambridge University Press.},
	year={2011}
}
@article{Farabet-et-al-2013,
	author={Farabet, C. and Couprie, C. and Najman, L. and LeCun, Y.},
	title={Learning hierarchical features for scene labeling},
	journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
	volume={35},
	number={8},
	pages={1915-1929},
	year={2013}
}
@article{Fei-Fei-et-al-2006,
	author={Fei-Fei, L. and Fergus, R. and Perona, P.},
	title={One-shot learning of object categories},
	journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
	volume={28},
	number={4},
	pages={594-611},
	year={2006}
}
@article{Finn-et-al-2015,
	author={Finn, C. and Tan, X. Y. and Duan, Y. and Darrell, T. and Levine, S. and Abbeel, P.},
	title={Learning visual feature spaces for robotic manipulation with deep spatial autoencoders},
	journal={arXiv preprint arXiv:1509.06113.},
	year={2015}
}
@article{Fisher-1936,
	author={Fisher, R. A.},
	title={The use of multiple measurements in taxonomic problems},
	journal={Annals of Eugenics},
	volume={7},
	pages={179-188},
	year={1936}
}
@inproceedings{Foldiak-1989,
	author={Földiák, P.},
	title={Adaptive network for optimal linear feature extraction},
	booktitle={In International Joint Conference on Neural Networks (IJCNN), Washington 1989},
	volume={1},
	pages={401-405},
	publisher={IEEE},
	address={New York},
	year={1989}
}
@article{Franzius-et-al-2007,
	title={Slowness and sparseness lead to place, head-direction, and spatial-view cells},
	volume = {3},
	number = {8},
	journal = {PLoS Comput Biol},
	author = {Franzius, Mathias and Sprekeler, Henning and Wiskott, Laurenz},
	year = {2007},
	pages = {e166}
}
@inproceedings{Franzius-et-al-2008,
	author={Franzius, M. and Wilbert, N. and Wiskott, L.},
	title={Invariant object recognition with slow feature analysis},
	booktitle={In Artificial Neural Networks-ICANN 2008},
	pages={961-970},
	publisher={Springer},
	year={2008}
}
@inproceedings{Frasconi-et-al-1997,
	author={Frasconi, P. and Gori, M. and Sperduti, A.},
	title={On the efficient classification of data structures by neural networks},
	booktitle={In Proc. Int. Joint Conf. on Artificial Intelligence.},
	year={1997}
}
@article{Frasconi-et-al-1998,
	author={Frasconi, P. and Gori, M. and Sperduti, A.},
	title={A general framework for adaptive processing of data structures},
	journal={IEEE Transactions on Neural Networks},
	volume={9},
	number={5},
	pages={768-786},
	year={1998}
}
@inproceedings{Freund-Schapire-1996a,
	author={Freund, Y. and Schapire, R. E.},
	title={Experiments with a new boosting algorithm},
	booktitle={In Machine Learning: Proceedings of Thirteenth International Conference, USA.},
	pages = {148-156},
	publisher = {ACM},
	year={1996}
}
@inproceedings{Freund-Schapire-1996b,
	author={Freund, Y. and Schapire, R. E.},
	title={Game theory, on-line prediction and boosting},
	booktitle={In Proceedings of the Ninth Annual Conference on Computational Learning Theory},
	pages={325-332},
	year={1996}
}
@book{Frey-1998,
	author={Frey, B. J.},
	title={Graphical models for machine learning and digital communication},
	publisher={MIT Press},
	year={1998}
}
@inproceedings{Frey-et-al-1996,
	author={Frey, B. J. and Hinton, G. E. and Dayan, P.},
	title={Does the wake-sleep algorithm learn good density estimators?},
	booktitle={In D. Touretzky, M. Mozer, and M. Hasselmo, editors, Advances in Neural Information Processing Systems 8 (NIPS'95)},
	pages={661-670},
	publisher={MIT Press, Cambridge, MA},
	year={1996}
}
@article{Frobenius-1908,
	author={Frobenius, G.},
	title={Über {Matrizen} aus positiven {Elementen}},
	journal={S.-B. Preuss. Akad. Wiss. Berlin, Germany.},
	year={1908}
}
@article{Fukushima-1975,
	author={Fukushima, K.},
	title={Cognitron: A self-organizing multilayered neural network},
	journal={Biological Cybernetics},
	volume={20},
	pages={121-136},
	year={1975}
}
@article{Fukushima-1980,
	author={Fukushima, K.},
	title={Neocognitron: A self-organizing neural network model for a mechanism of pattern recognition unaffected by shift in position},
	journal={Biological Cybernetics},
	volume={36},
	pages={193-202},
	year={1980}
}
@article{Gal-Ghahramani-2015,
	author={Gal, Y. and Ghahramani, Z.},
	title={Bayesian convolutional neural networks with Bernoulli approximate variational inference},
	journal={arXiv preprint arXiv:1506.02158},
	year={2015}
}
@inproceedings{Gallinari-et-al-1987,
	author={Gallinari, P. and LeCun, Y. and Thiria, S. and Fogelman-Soulié, F.},
	title={Mémoires associatives distribuées},
	booktitle={In Proceedings of COGNITIVA 87, Paris, La Villette.},
	year={1987}
}
@article{Garcia-Duran-et-al-2015,
	author={Garcia-Duran, A. and Bordes, A. and Usunier, N. and Grandvalet, Y.},
	title={Combining two and three-way embeddings models for link prediction in knowledge bases},
	journal={arXiv preprint arXiv:1506.00999.},
	year={2015}
}
@article{Garofolo-et-al-1993,
	author={Garofolo, J. S. and Lamel, L. F. and Fisher, W. M. and Fiscus, J. G. and Pallett, D. S.},
	title={{DARPA} {TIMIT} acoustic-phonetic continuous speech corpus {CD-ROM}. {NIST} speech disc 1-1.1},
	journal={NASA STI/Recon Technical Report N},
	volume={93},
	pages={27403},
	year={1993}
}
@article{Garson-1900,
	author={Garson, J.},
	title={The metric system of identification of criminals, as used in Great Britain and Ireland},
	journal={The Journal of the Anthropological Institute of Great Britain and Ireland},
	number={2},
	pages={177-227},
	year={1900}
}
@article{Gers-et-al-2000,
	author={Gers, F. A. and Schmidhuber, J. and Cummins, F.},
	title={Learning to forget: Continual prediction with LSTM},
	journal={Neural Computation},
	volume={12},
	number={10},
	pages={2451-2471},
	year={2000}
}
@techreport{Ghahramani-Hinton-1996,
	author={Ghahramani, Z. and Hinton, G. E.},
	title={The EM algorithm for mixtures of factor analyzers},
	number={CRG-TR-96-1},
	institution={Dpt. of Comp. Sci., Univ. of Toronto},
	year={1996}
}
@article{Gillick-et-al-2015,
	author={Gillick, D. and Brunk, C. and Vinyals, O. and Subramanya, A.},
	title={Multilingual language processing from bytes},
	journal={arXiv preprint arXiv:1512.00103.},
	year={2015}
}
@article{Girshick-et-al-2015,
	author={Girshick, Ross and Donahue, Jeff and Darrell, Trevor and Malik, Jitendra},
	title={Region-based convolutional networks for accurate object detection and segmentation},
	journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
	volume={38},
	number={1},
	pages={142-158},
	year={2016},
	url={http://ieeexplore.ieee.org/abstract/document/7112511/}
}
@article{Giudice-et-al-2009,
	author={Del Giudice, M. and Manera, V. and Keysers, C.},
	title={Programmed to learn? {The} ontogeny of mirror neurons},
	journal={Developmental Science},
	volume={12},
	number={2},
	pages={350-363},
	year={2009}
}
@inproceedings{Glorot-Bengio-2010,
	author={Glorot, X. and Bengio, Y.},
	title={Understanding the difficulty of training deep feedforward neural networks},
	booktitle={In AISTATS'2010.},
	year={2010}
}
@inproceedings{Glorot-et-al-2011a,
	author={Glorot, X. and Bordes, A. and Bengio, Y.},
	title={Deep sparse rectifier neural networks},
	booktitle={In AISTATS'2011},
	year={2011}
}
@inproceedings{Glorot-et-al-2011b,
	author={Glorot, X. and Bordes, A. and Bengio, Y.},
	title={Domain adaptation for large-scale sentiment classification: A deep learning approach},
	booktitle={In ICML'2011},
	year={2011}
}
@inproceedings{Goldberger-et-al-2005,
	author={Goldberger, J. and Roweis, S. and Hinton, G. E. and Salakhutdinov, R.},
	title={Neighbourhood components analysis},
	booktitle={In L. Saul, Y. Weiss, and L. Bottou, editors, Advances in Neural Information Processing Systems 17 (NIPS'04). MIT Press.},
	year={2005}
}
@book{Gong-et-al-2000,
	author={Gong, S. and McKenna, S. and Psarrou, A.},
	title={Dynamic Vision: From Images to Face Recognition},
	publisher={Imperial College Press},
	year={2000}
}
@inproceedings{Goodfellow-et-al-2009,
	author={Goodfellow, I. and Le, Q. and Saxe, A. and Ng, A.},
	title={Measuring invariances in deep networks},
	booktitle={In NIPS'2009},
	pages={646-654},
	year={2009}
}
@inproceedings{Goodfellow-et-al-2010,
	author={Goodfellow, I. and Koenig, N. and Muja, M. and Pantofaru, C. and Sorokin, A. and Takayama, L.},
	title={Help me help you: Interfaces for personal robots},
	booktitle={In Proc. of Human Robot Interaction (HRI), Osaka, Japan},
	publisher={ACM Press},
	year={2010}
}
@techreport{Goodfellow-2010,
	author={Goodfellow, I. J.},
	title={Technical report: Multidimensional, downsampled convolution for autoencoders},
	institution={Université de Montréal},
	year={2010}
}
@inproceedings{Goodfellow-2014,
	author={Goodfellow, I. J.},
	title={On distinguishability criteria for estimating generative models},
	booktitle={In International Conference on Learning Representations, Workshops Track.},
	year={2014}
}
@inproceedings{Goodfellow-et-al-2011,
	author={Goodfellow, I. J. and Courville, A. and Bengio, Y.},
	title={Spike-and-slab sparse coding for unsupervised feature discovery},
	booktitle={In NIPS Workshop on Challenges in Learning Hierarchical Models.},
	year={2011}
}
@inproceedings{Goodfellow-et-al-2013a,
	author={Goodfellow, I. J. and Warde-Farley, D. and Mirza, M. and Courville, A. and Bengio, Y.},
	title={Maxout networks},
	booktitle={In S. Dasgupta and D. McAllester, editors, ICML'13},
	pages={1319-1327},
	year={2013}
}
@inproceedings{Goodfellow-et-al-2013b,
	author={Goodfellow, I. J. and Mirza, M. and Courville, A. and Bengio, Y.},
	title={Multi-prediction deep Boltzmann machines},
	booktitle={In NIPS26},
	publisher={NIPS Foundation},
	year={2013}
}
@article{Goodfellow-et-al-2013c,
	author={Goodfellow, I. J. and Warde-Farley, D. and Lamblin, P. and Dumoulin, V. and Mirza, M. and Pascanu, R. and Bergstra, J. and Bastien, F. and Bengio, Y.},
	title={Pylearn2: a machine learning research library},
	journal={arXiv preprint arXiv:1308.4214.},
	year={2013}
}
@article{Goodfellow-et-al-2013d,
	author={Goodfellow, I. J. and Courville, A. and Bengio, Y.},
	title={Scaling up spike-and-slab models for unsupervised feature learning},
	journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
	volume={35},
	number={8},
	pages={1902-1914},
	year={2013}
}
@inproceedings{Goodfellow-et-al-2014a,
	author={Goodfellow, I. J. and Mirza, M. and Xiao, D. and Courville, A. and Bengio, Y.},
	title={An empirical investigation of catastrophic forgetting in gradient-based neural networks},
	booktitle={In ICLR'2014},
	year={2014}
}
@article{Goodfellow-et-al-2014b,
	author={Goodfellow, I. J. and Shlens, J. and Szegedy, C.},
	title={Explaining and harnessing adversarial examples},
	journal={CoRR, abs/1412.6572.},
	year={2014}
}
@inproceedings{Goodfellow-et-al-2014c,
	author={Goodfellow, I. J. and Pouget-Abadie, J. and Mirza, M. and Xu, B. and Warde-Farley, D. and Ozair, S. and Courville, A. and Bengio, Y.},
	title={Generative adversarial networks},
	booktitle={In NIPS'2014.},
	year={2014}
}
@inproceedings{Goodfellow-et-al-2014d,
	author={Goodfellow, I. J. and Bulatov, Y. and Ibarz, J. and Arnoud, S. and Shet, V.},
	title={Multi-digit number recognition from Street View imagery using deep convolutional neural networks},
	booktitle={In International Conference on Learning Representations.},
	year={2014}
}
@inproceedings{Goodfellow-et-al-2015,
	author={Goodfellow, I. J. and Vinyals, O. and Saxe, A. M.},
	title={Qualitatively characterizing neural network optimization problems},
	booktitle={In International Conference on Learning Representations.},
	year={2015}
}
@inproceedings{Goodman-2001,
	author={Goodman, J.},
	title={Classes for fast maximum entropy training},
	booktitle={In International Conference on Acoustics, Speech and Signal Processing (ICASSP), Utah.},
	year={2001}
}
@article{Gori-Tesi-1992,
	author={Gori, M. and Tesi, A.},
	title={On the problem of local minima in backpropagation},
	journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
	volume = {14},
	number = {1},
	pages={76-86},
	year={1992}
}
@article{Gosset-1908,
	author={Gosset, W. S.},
	title={The probable error of a mean},
	journal={Biometrika},
	volume={6},
	number={1},
	pages={1-25},
	year={1908},
	note={Originally published under the pseudonym ``Student''}
}
@article{Gouws-et-al-2014,
	author={Gouws, S. and Bengio, Y. and Corrado, G.},
	title={BilBOWA: Fast bilingual distributed representations without word alignments},
	journal={Technical report, arXiv:1410.2455.},
	year={2014}
}
@article{Graf-Jackel-1989,
	author={Graf, H. P. and Jackel, L. D.},
	title={Analog electronic neural network circuits},
	journal={IEEE Circuits and Devices Magazine},
	volume={5},
	number={4},
	pages={44-49},
	year={1989}
}
@inproceedings{Graves-2011,
	author={Graves, A.},
	title={Practical variational inference for neural networks},
	booktitle={In NIPS'2011},
	year={2011}
}
@book{Graves-2012,
	author={Graves, A.},
	title={Supervised Sequence Labelling with Recurrent Neural Networks},
	series={Studies in Computational Intelligence},
	publisher={Springer},
	year={2012}
}
@article{Graves-2013,
	author={Graves, A.},
	title={Generating sequences with recurrent neural networks},
	journal={Technical report, arXiv:1308.0850.},
	year={2013}
}
@inproceedings{Graves-Jaitly-2014,
	author={Graves, A. and Jaitly, N.},
	title={Towards end-to-end speech recognition with recurrent neural networks},
	booktitle={In ICML'2014.},
	year={2014}
}
@article{Graves-Schmidhuber-2005,
	author = {Graves, A. and Schmidhuber, J.},
	title = {Framewise phoneme classification with bidirectional LSTM and other neural network architectures},
	journal = {Neural Networks},
	volume = {18},
	number = {5},
	pages = {602--610},
	year = {2005},
}
@inproceedings{Graves-Schmidhuber-2009,
	author = {Graves, A. and Schmidhuber, J.},
	title = {Offline handwriting recognition with multidimensional recurrent neural networks},
	booktitle = {In D. Koller, D. Schuurmans, Y. Bengio, and L. Bottou, editors, NIPS'2008},
	pages = {545--552},
	year = {2009},
}
@inproceedings{Graves-et-al-2006,
	author = {Graves, A. and Fernández, S. and Gomez, F. and Schmidhuber, J.},
	title = {Connectionist temporal classification: Labelling unsegmented sequence data with recurrent neural networks},
	booktitle = {In ICML'2006, Pittsburgh, USA.},
	pages = {369--376},
	year = {2006},
}
@inproceedings{Graves-et-al-2008,
	author = {Graves, A. and Liwicki, M. and Bunke, H. and Schmidhuber, J. and Fernández, S.},
	title = {Unconstrained on-line handwriting recognition with recurrent neural networks},
	booktitle = {In J. Platt, D. Koller, Y. Singer, and S. Roweis, editors, NIPS'2007},
	pages = {577--584},
	year = {2008},
}
@article{Graves-et-al-2009,
	author = {Graves, A. and Liwicki, M. and Fernández, S. and Bertolami, R. and Bunke, H. and Schmidhuber, J.},
	title = {A novel connectionist system for unconstrained handwriting recognition},
	journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
	volume = {31},
	number = {5},
	pages = {855--868},
	year = {2009},
}
@inproceedings{Graves-et-al-2013,
	author = {Graves, A. and Mohamed, A. and Hinton, G.},
	title = {Speech recognition with deep recurrent neural networks},
	booktitle = {In ICASSP'2013},
	pages = {6645--6649},
	year = {2013},
}
% NOTE: same work as Graves-et-al-2014b (same authors, title and arXiv id);
% both keys are kept because either one may be cited.
@article{Graves-et-al-2014a,
	author = {Graves, A. and Wayne, G. and Danihelka, I.},
	title = {Neural Turing machines},
	journal = {arXiv:1410.5401.},
	year = {2014},
}
% NOTE: same work as Graves-et-al-2014a (same authors, title and arXiv id);
% both keys are kept because either one may be cited.
@article{Graves-et-al-2014b,
	author = {Graves, A. and Wayne, G. and Danihelka, I.},
	title = {Neural Turing machines},
	journal = {arXiv preprint arXiv:1410.5401.},
	year = {2014},
}
@inproceedings{Grefenstette-et-al-2015,
	author = {Grefenstette, E. and Hermann, K. M. and Suleyman, M. and Blunsom, P.},
	title = {Learning to transduce with unbounded memory},
	booktitle = {In NIPS'2015},
	year = {2015},
}
@article{Greff-et-al-2015,
	author = {Greff, K. and Srivastava, R. K. and Koutník, J. and Steunebrink, B. R. and Schmidhuber, J.},
	title = {LSTM: a search space odyssey},
	journal = {arXiv preprint arXiv:1503.04069},
	year = {2015},
}
@article{Gregor-LeCun-2010a,
	author = {Gregor, K. and LeCun, Y.},
	title = {Emergence of complex-like cells in a temporal product network with local receptive fields},
	journal = {Technical report, arXiv:1006.0448.},
	year = {2010},
}
@inproceedings{Gregor-LeCun-2010b,
	author = {Gregor, K. and LeCun, Y.},
	title = {Learning fast approximations of sparse coding},
	booktitle = {In L. Bottou and M. Littman, editors, Proceedings of the Twenty-seventh International Conference on Machine Learning (ICML-10). ACM.},
	year = {2010},
}
@inproceedings{Gregor-et-al-2014,
	author = {Gregor, K. and Danihelka, I. and Mnih, A. and Blundell, C. and Wierstra, D.},
	title = {Deep autoregressive networks},
	booktitle = {In International Conference on Machine Learning (ICML'2014).},
	year = {2014},
}
@article{Gregor-et-al-2015,
	author = {Gregor, K. and Danihelka, I. and Graves, A. and Wierstra, D.},
	title = {DRAW: A recurrent neural network for image generation},
	journal = {arXiv preprint arXiv:1502.04623},
	year = {2015},
}
@article{Gretton-et-al-2012,
	author = {Gretton, A. and Borgwardt, K. M. and Rasch, M. J. and Schölkopf, B. and Smola, A.},
	title = {A kernel two-sample test},
	journal = {The Journal of Machine Learning Research},
	volume = {13},
	number = {1},
	pages = {723--773},
	year = {2012},
}
@inproceedings{Gulcehre-Bengio-2013,
	author = {Gülçehre, Ç. and Bengio, Y.},
	title = {Knowledge matters: Importance of prior information for optimization},
	booktitle = {In International Conference on Learning Representations (ICLR'2013).},
	year = {2013},
}
@article{Guo-Gelfand-1992,
	author = {Guo, H. and Gelfand, S. B.},
	title = {Classification trees with neural network feature extraction},
	journal = {IEEE Transactions on Neural Networks},
	volume = {3},
	number = {6},
	pages = {923--933},
	year = {1992},
}
@article{Gupta-et-al-2015,
	author = {Gupta, S. and Agrawal, A. and Gopalakrishnan, K. and Narayanan, P.},
	title = {Deep learning with limited numerical precision},
	journal = {CoRR, abs/1502.02551.},
	year = {2015},
}
@inproceedings{Gutmann-Hyvarinen-2010,
	author = {Gutmann, M. and Hyvärinen, A.},
	title = {Noise-contrastive estimation: A new estimation principle for unnormalized statistical models},
	booktitle = {In Proceedings of The Thirteenth International Conference on Artificial Intelligence and Statistics (AISTATS'10).},
	year = {2010},
}
@inproceedings{Hadsell-et-al-2007,
	author = {Hadsell, R. and Sermanet, P. and Ben, J. and Erkan, A. and Han, J. and Muller, U. and LeCun, Y.},
	title = {Online learning for offroad robots: Spatial label propagation to learn long-range traversability},
	booktitle = {In Proceedings of Robotics: Science and Systems, Atlanta, GA, USA.},
	year = {2007},
}
@article{Hajnal-et-al-1993,
	author = {Hajnal, A. and Maass, W. and Pudlak, P. and Szegedy, M. and Turan, G.},
	title = {Threshold circuits of bounded depth},
	journal = {J. Comput. System. Sci.},
	volume = {46},
	pages = {129--154},
	year = {1993},
}
@inproceedings{Hastad-1986,
	author = {Håstad, J.},
	title = {Almost optimal lower bounds for small depth circuits},
	booktitle = {In Proceedings of the 18th annual ACM Symposium on Theory of Computing, Berkeley, California. ACM Press.},
	pages = {6--20},
	year = {1986},
}
@article{Hastad-Goldmann-1991,
	author = {Håstad, J. and Goldmann, M.},
	title = {On the power of small-depth threshold circuits},
	journal = {Computational Complexity},
	volume = {1},
	pages = {113--129},
	year = {1991},
}
@article{Hastie-et-al-2001,
	author = {Hastie, T. and Tibshirani, R. and Friedman, J.},
	title = {The elements of statistical learning: data mining, inference and prediction},
	journal = {Springer Series in Statistics. Springer Verlag.},
	year = {2001},
}
@article{He-et-al-2015,
	author = {He, K. and Zhang, X. and Ren, S. and Sun, J.},
	title = {Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification},
	journal = {arXiv preprint arXiv:1502.01852.},
	year = {2015},
}
@article{Hebb-1949,
	author = {Hebb, D. O.},
	title = {The Organization of Behavior},
	journal = {Wiley, New York.},
	year = {1949},
}
@inproceedings{Henaff-et-al-2011,
	author = {Henaff, M. and Jarrett, K. and Kavukcuoglu, K. and LeCun, Y.},
	title = {Unsupervised learning of sparse features for scalable audio classification},
	booktitle = {In ISMIR'11.},
	year = {2011},
}
@inproceedings{Henderson-2003,
	author = {Henderson, J.},
	title = {Inducing history representations for broad coverage statistical parsing},
	booktitle = {In HLT-NAACL},
	pages = {103--110},
	year = {2003},
}
@inproceedings{Henderson-2004,
	author = {Henderson, J.},
	title = {Discriminative training of a neural network statistical parser},
	booktitle = {In Proceedings of the 42nd Annual Meeting on Association for Computational Linguistics},
	pages = {95},
	year = {2004},
}
@inproceedings{Henniges-et-al-2010,
	author = {Henniges, M. and Puertas, G. and Bornschein, J. and Eggert, J. and Lücke, J.},
	title = {Binary sparse coding},
	booktitle = {In Latent Variable Analysis and Signal Separation},
	pages = {450--457},
	publisher = {Springer},
	year = {2010},
}
@article{Herault-Ans-1984,
	author = {Herault, J. and Ans, B.},
	title = {Circuits neuronaux à synapses modifiables: Décodage de messages composites par apprentissage non supervisé},
	journal = {Comptes Rendus de l'Académie des Sciences},
	volume = {299},
	number = {III-13},
	pages = {525--528},
	year = {1984},
}
@article{Hinton-2012,
	author = {Hinton, G.},
	title = {Neural networks for machine learning},
	journal = {Coursera, video lectures.},
	year = {2012},
}
% NOTE: same article as Hinton-et-al-2012b (identical journal, volume, number
% and pages); both keys are kept because either one may be cited.
@article{Hinton-et-al-2012a,
	author = {Hinton, G. and Deng, L. and Yu, D. and Dahl, G. E. and Mohamed, A. and Jaitly, N. and Senior, A. and Vanhoucke, V. and Nguyen, P. and Sainath, T. and Kingsbury, B.},
	title = {Deep neural networks for acoustic modeling in speech recognition},
	journal = {IEEE Signal Processing Magazine},
	volume = {29},
	number = {6},
	pages = {82--97},
	year = {2012},
}
@article{Hinton-et-al-2015,
	author = {Hinton, G. and Vinyals, O. and Dean, J.},
	title = {Distilling the knowledge in a neural network},
	journal = {arXiv preprint arXiv:1503.02531.},
	year = {2015},
}
@article{Hinton-1989,
	author = {Hinton, G. E.},
	title = {Connectionist learning procedures},
	journal = {Artificial Intelligence},
	volume = {40},
	pages = {185--234},
	year = {1989},
}
@article{Hinton-1990,
	author = {Hinton, G. E.},
	title = {Mapping part-whole hierarchies into connectionist networks},
	journal = {Artificial Intelligence},
	volume = {46},
	number = {1},
	pages = {47--75},
	year = {1990},
}
@inproceedings{Hinton-1999,
	author = {Hinton, G. E.},
	title = {Products of experts},
	booktitle = {In ICANN'1999.},
	year = {1999},
}
@article{Hinton-2000,
	author = {Hinton, G. E.},
	title = {Training products of experts by minimizing contrastive divergence},
	journal = {Technical Report GCNU TR 2000-004, Gatsby Unit, University College London.},
	year = {2000},
}
@article{Hinton-2006,
	author = {Hinton, G. E.},
	title = {To recognize shapes, first learn to generate images},
	journal = {Technical Report UTML TR 2006-003, University of Toronto.},
	year = {2006},
}
@article{Hinton-2007a,
	author = {Hinton, G. E.},
	title = {How to do backpropagation in a brain},
	journal = {Invited talk at the NIPS'2007 Deep Learning Workshop.},
	year = {2007},
}
@article{Hinton-2007b,
	author = {Hinton, G. E.},
	title = {Learning multiple layers of representation},
	journal = {Trends in Cognitive Sciences},
	volume = {11},
	number = {10},
	pages = {428--434},
	year = {2007},
}
@article{Hinton-2010,
	author = {Hinton, G. E.},
	title = {A practical guide to training restricted Boltzmann machines},
	journal = {Technical Report UTML TR 2010-003, Department of Computer Science, University of Toronto.},
	year = {2010},
}
@article{Hinton-Ghahramani-1997,
	author = {Hinton, G. E. and Ghahramani, Z.},
	title = {Generative models for discovering sparse distributed representations},
	journal = {Philosophical Transactions of the Royal Society of London.},
	year = {1997},
}
@inproceedings{Hinton-McClelland-1988,
	author = {Hinton, G. E. and McClelland, J. L.},
	title = {Learning representations by recirculation},
	booktitle = {In NIPS'1987},
	pages = {358--366},
	year = {1988},
}
@inproceedings{Hinton-Roweis-2003,
	author = {Hinton, G. E. and Roweis, S.},
	title = {Stochastic neighbor embedding},
	booktitle = {In NIPS'2002},
	year = {2003},
}
@article{Hinton-Salakhutdinov-2006,
	author = {Hinton, G. E. and Salakhutdinov, R.},
	title = {Reducing the dimensionality of data with neural networks},
	journal = {Science},
	volume = {313},
	number = {5786},
	pages = {504--507},
	year = {2006},
}
@inproceedings{Hinton-Sejnowski-1986,
	author = {Hinton, G. E. and Sejnowski, T. J.},
	title = {Learning and relearning in Boltzmann machines},
	booktitle = {In D. E. Rumelhart and J. L. McClelland, editors, Parallel Distributed Processing, volume 1, chapter 7},
	pages = {282--317},
	publisher = {MIT Press, Cambridge},
	year = {1986},
}
@article{Hinton-Sejnowski-1999,
	author = {Hinton, G. E. and Sejnowski, T. J.},
	title = {Unsupervised learning: foundations of neural computation},
	journal = {MIT Press.},
	year = {1999},
}
@article{Hinton-Shallice-1991,
	author = {Hinton, G. E. and Shallice, T.},
	title = {Lesioning an attractor network: investigations of acquired dyslexia},
	journal = {Psychological Review},
	volume = {98},
	number = {1},
	pages = {74--95},
	year = {1991},
}
@inproceedings{Hinton-Zemel-1994,
	author = {Hinton, G. E. and Zemel, R. S.},
	title = {Autoencoders, minimum description length, and Helmholtz free energy},
	booktitle = {In NIPS'1993},
	year = {1994},
}
@article{Hinton-et-al-1984,
	author = {Hinton, G. E. and Sejnowski, T. J. and Ackley, D. H.},
	title = {Boltzmann machines: Constraint satisfaction networks that learn},
	journal = {Technical Report TR-CMU-CS-84-119, Carnegie-Mellon University, Dept. of Computer Science.},
	year = {1984},
}
@inproceedings{Hinton-et-al-1986,
	author = {Hinton, G. E. and McClelland, J. and Rumelhart, D.},
	title = {Distributed representations},
	booktitle = {In D. E. Rumelhart and J. L. McClelland, editors, Parallel Distributed Processing: Explorations in the Microstructure of Cognition},
	volume = {1},
	pages = {77--109},
	publisher = {MIT Press, Cambridge},
	year = {1986},
}
@inproceedings{Hinton-et-al-1995a,
	author = {Hinton, G. E. and Revow, M. and Dayan, P.},
	title = {Recognizing handwritten digits using mixtures of linear models},
	booktitle = {In G. Tesauro, D. Touretzky, and T. Leen, editors, Advances in Neural Information Processing Systems 7 (NIPS'94)},
	pages = {1015--1022},
	publisher = {MIT Press, Cambridge, MA},
	year = {1995},
}
@article{Hinton-et-al-1995b,
	author = {Hinton, G. E. and Dayan, P. and Frey, B. J. and Neal, R. M.},
	title = {The wake-sleep algorithm for unsupervised neural networks},
	journal = {Science},
	volume = {268},
	pages = {1158--1161},
	year = {1995},
}
@article{Hinton-et-al-1997,
	author = {Hinton, G. E. and Dayan, P. and Revow, M.},
	title = {Modelling the manifolds of images of handwritten digits},
	journal = {IEEE Transactions on Neural Networks},
	volume = {8},
	pages = {65--74},
	year = {1997},
}
@inproceedings{Hinton-et-al-2001,
	author = {Hinton, G. E. and Welling, M. and Teh, Y. W. and Osindero, S.},
	title = {A new view of ICA},
	booktitle = {In Proceedings of 3rd International Conference on Independent Component Analysis and Blind Signal Separation (ICA'01), San Diego, CA.},
	pages = {746--751},
	year = {2001},
}
@article{Hinton-et-al-2006,
	author = {Hinton, G. E. and Osindero, S. and Teh, Y.},
	title = {A fast learning algorithm for deep belief nets},
	journal = {Neural Computation},
	volume = {18},
	pages = {1527--1554},
	year = {2006},
}
% NOTE: same article as Hinton-et-al-2012a (identical journal, volume, number
% and pages); both keys are kept because either one may be cited.
@article{Hinton-et-al-2012b,
	author = {Hinton, G. E. and Deng, L. and Yu, D. and Dahl, G. E. and Mohamed, A. and Jaitly, N. and Senior, A. and Vanhoucke, V. and Nguyen, P. and Sainath, T. N. and Kingsbury, B.},
	title = {Deep neural networks for acoustic modeling in speech recognition: The shared views of four research groups},
	journal = {IEEE Signal Processing Magazine},
	volume = {29},
	number = {6},
	pages = {82--97},
	year = {2012},
}
@article{Hinton-et-al-2012c,
	author = {Hinton, G. E. and Srivastava, N. and Krizhevsky, A. and Sutskever, I. and Salakhutdinov, R.},
	title = {Improving neural networks by preventing co-adaptation of feature detectors},
	journal = {Technical report, arXiv:1207.0580.},
	year = {2012},
}
@article{Hinton-et-al-2014,
	author = {Hinton, G. E. and Vinyals, O. and Dean, J.},
	title = {Dark knowledge},
	journal = {Invited talk at the BayLearn Bay Area Machine Learning Symposium.},
	year = {2014},
}
@article{Hochreiter-1991,
	author = {Hochreiter, S.},
	title = {Untersuchungen zu dynamischen neuronalen Netzen},
	journal = {Diploma thesis, T.U. München.},
	year = {1991},
}
@inproceedings{Hochreiter-Schmidhuber-1995,
	author = {Hochreiter, S. and Schmidhuber, J.},
	title = {Simplifying neural nets by discovering flat minima},
	booktitle = {In Advances in Neural Information Processing Systems 7},
	pages = {529--536},
	publisher = {MIT Press},
	year = {1995},
}
@article{Hochreiter-Schmidhuber-1997,
	author = {Hochreiter, S. and Schmidhuber, J.},
	title = {Long short-term memory},
	journal = {Neural Computation},
	volume = {9},
	number = {8},
	pages = {1735--1780},
	year = {1997},
}
@inproceedings{Hochreiter-et-al-2001,
	author = {Hochreiter, S. and Bengio, Y. and Frasconi, P.},
	title = {Gradient flow in recurrent nets: the difficulty of learning long-term dependencies},
	booktitle = {In J. Kolen and S. Kremer, editors, Field Guide to Dynamical Recurrent Networks. IEEE Press.},
	year = {2001},
}
@article{Holi-Hwang-1993,
	author = {Holi, J. L. and Hwang, J.-N.},
	title = {Finite precision error analysis of neural network hardware implementations},
	journal = {IEEE Transactions on Computers},
	volume = {42},
	number = {3},
	pages = {281--290},
	year = {1993},
}
@inproceedings{Holt-Baker-1991,
	author = {Holt, J. L. and Baker, T. E.},
	title = {Back propagation simulations using limited precision calculations},
	booktitle = {In Neural Networks, 1991., IJCNN-91-Seattle International Joint Conference on},
	volume = {2},
	pages = {121--126},
	publisher = {IEEE},
	year = {1991},
}
@article{Hornik-et-al-1989,
	author = {Hornik, K. and Stinchcombe, M. and White, H.},
	title = {Multilayer feedforward networks are universal approximators},
	journal = {Neural Networks},
	volume = {2},
	pages = {359--366},
	year = {1989},
}
@article{Hornik-et-al-1990,
	author = {Hornik, K. and Stinchcombe, M. and White, H.},
	title = {Universal approximation of an unknown mapping and its derivatives using multilayer feedforward networks},
	journal = {Neural Networks},
	volume = {3},
	number = {5},
	pages = {551--560},
	year = {1990},
}
@article{Hsu-2002,
	author = {Hsu, F.-H.},
	title = {Behind Deep Blue: Building the Computer That Defeated the World Chess Champion},
	journal = {Princeton University Press, Princeton, NJ, USA.},
	year = {2002},
}
@article{Huang-Ogata-2002,
	author = {Huang, F. and Ogata, Y.},
	title = {Generalized pseudo-likelihood estimates for Markov random fields on lattice},
	journal = {Annals of the Institute of Statistical Mathematics},
	volume = {54},
	number = {1},
	pages = {1--18},
	year = {2002},
}
@inproceedings{Huang-et-al-2013,
	author = {Huang, P.-S. and He, X. and Gao, J. and Deng, L. and Acero, A. and Heck, L.},
	title = {Learning deep structured semantic models for web search using clickthrough data},
	booktitle = {In Proceedings of the 22nd ACM International Conference on Information \& Knowledge Management},
	pages = {2333--2338},
	publisher = {ACM},
	year = {2013},
}
@article{Hubel-Wiesel-1968,
	author = {Hubel, D. and Wiesel, T.},
	title = {Receptive fields and functional architecture of monkey striate cortex},
	journal = {Journal of Physiology (London)},
	volume = {195},
	pages = {215--243},
	year = {1968},
}
@article{Hubel-Wiesel-1959,
	author = {Hubel, D. H. and Wiesel, T. N.},
	title = {Receptive fields of single neurons in the cat's striate cortex},
	journal = {Journal of Physiology},
	volume = {148},
	pages = {574--591},
	year = {1959},
}
@article{Hubel-Wiesel-1962,
	author = {Hubel, D. H. and Wiesel, T. N.},
	title = {Receptive fields, binocular interaction, and functional architecture in the cat's visual cortex},
	journal = {Journal of Physiology (London)},
	volume = {160},
	pages = {106--154},
	year = {1962},
}
@article{Huszar-2015,
	author = {Huszar, F.},
	title = {How (not) to train your generative model: schedule sampling, likelihood, adversary?},
	journal = {arXiv:1511.05101.},
	year = {2015},
}
@inproceedings{Hutter-et-al-2011,
	author = {Hutter, F. and Hoos, H. and Leyton-Brown, K.},
	title = {Sequential model-based optimization for general algorithm configuration},
	booktitle = {In LION-5. Extended version as UBC Tech report TR-2010-10.},
	year = {2011},
}
@inproceedings{Hyotyniemi-1996,
	author = {Hyotyniemi, H.},
	title = {Turing machines are recurrent neural networks},
	booktitle = {In STeP'96},
	pages = {13--24},
	year = {1996},
}
@article{Hyvarinen-1999,
	author = {Hyvärinen, A.},
	title = {Survey on independent component analysis},
	journal = {Neural Computing Surveys},
	volume = {2},
	pages = {94--128},
	year = {1999},
}
@article{Hyvarinen-2005,
	author = {Hyvärinen, A.},
	title = {Estimation of non-normalized statistical models using score matching},
	journal = {Journal of Machine Learning Research},
	volume = {6},
	pages = {695--709},
	year = {2005},
}
@article{Hyvarinen-2007a,
	author = {Hyvärinen, A.},
	title = {Connections between score matching, contrastive divergence, and pseudolikelihood for continuous-valued variables},
	journal = {IEEE Transactions on Neural Networks},
	volume = {18},
	pages = {1529--1531},
	year = {2007},
}
@article{Hyvarinen-2007b,
	author = {Hyvärinen, A.},
	title = {Some extensions of score matching},
	journal = {Computational Statistics and Data Analysis},
	volume = {51},
	pages = {2499--2512},
	year = {2007},
}
@inproceedings{Hyvarinen-Hoyer-1999,
	author = {Hyvärinen, A. and Hoyer, P. O.},
	title = {Emergence of topography and complex cell properties from natural images using extensions of ica},
	booktitle = {In NIPS},
	pages = {827--833},
	year = {1999},
}
@article{Hyvarinen-Pajunen-1999,
	author = {Hyvärinen, A. and Pajunen, P.},
	title = {Nonlinear independent component analysis: Existence and uniqueness results},
	journal = {Neural Networks},
	volume = {12},
	number = {3},
	pages = {429--439},
	year = {1999},
}
@article{Hyvarinen-et-al-2001a,
	author = {Hyvärinen, A. and Karhunen, J. and Oja, E.},
	title = {Independent Component Analysis},
	journal = {Wiley-Interscience.},
	year = {2001},
}
@article{Hyvarinen-et-al-2001b,
	author = {Hyvärinen, A. and Hoyer, P. O. and Inki, M. O.},
	title = {Topographic independent component analysis},
	journal = {Neural Computation},
	volume = {13},
	number = {7},
	pages = {1527--1558},
	year = {2001},
}
@article{Hyvarinen-et-al-2009,
	author = {Hyvärinen, A. and Hurri, J. and Hoyer, P. O.},
	title = {Natural Image Statistics: A probabilistic approach to early computational vision},
	journal = {Springer-Verlag.},
	year = {2009},
}
@article{Iba-2001,
	author = {Iba, Y.},
	title = {Extended ensemble Monte Carlo},
	journal = {International Journal of Modern Physics C},
	volume = {12},
	pages = {623--656},
	year = {2001},
}
@inproceedings{Inayoshi-Kurita-2005,
	author = {Inayoshi, H. and Kurita, T.},
	title = {Improved generalization by adding both auto-association and hidden-layer noise to neural-network-based-classifiers},
	booktitle = {IEEE Workshop on Machine Learning for Signal Processing},
	pages = {141--146},
	year = {2005},
}
@article{Ioffe-Szegedy-2015,
	author = {Ioffe, S. and Szegedy, C.},
	title = {Batch normalization: Accelerating deep network training by reducing internal covariate shift},
	journal = {arXiv preprint arXiv:1502.03167},
	year = {2015},
}
@article{Jacobs-1988,
	author = {Jacobs, R. A.},
	title = {Increased rates of convergence through learning rate adaptation},
	journal = {Neural Networks},
	volume = {1},
	number = {4},
	pages = {295--307},
	year = {1988},
}
@article{Jacobs-et-al-1991,
	author = {Jacobs, R. A. and Jordan, M. I. and Nowlan, S. J. and Hinton, G. E.},
	title = {Adaptive mixtures of local experts},
	journal = {Neural Computation},
	volume = {3},
	pages = {79--87},
	year = {1991},
}
@inproceedings{Jaeger-2003,
	author = {Jaeger, H.},
	title = {Adaptive nonlinear system identification with echo state networks},
	booktitle = {In Advances in Neural Information Processing Systems 15},
	year = {2003},
}
@article{Jaeger-2007a,
	author = {Jaeger, H.},
	title = {Discovering multiscale dynamical features with hierarchical echo state networks},
	journal = {Technical report, Jacobs University.},
	year = {2007},
}
@article{Jaeger-2007b,
	author = {Jaeger, H.},
	title = {Echo state network},
	url = {http://www.scholarpedia.org/article/Echo_state_network},
	journal = {Scholarpedia},
	volume = {2},
	number = {9},
	pages = {2330},
	year = {2007},
}
@article{Jaeger-2012,
	author = {Jaeger, H.},
	title = {Long short-term memory in echo state networks: Details of a simulation study},
	journal = {Technical report, Jacobs University Bremen.},
	year = {2012},
}
@article{Jaeger-Haas-2004,
	author = {Jaeger, H. and Haas, H.},
	title = {Harnessing nonlinearity: Predicting chaotic systems and saving energy in wireless communication},
	journal = {Science},
	volume = {304},
	number = {5667},
	pages = {78--80},
	year = {2004},
}
@article{Jaeger-et-al-2007,
	author = {Jaeger, H. and Lukosevicius, M. and Popovici, D. and Siewert, U.},
	title = {Optimization and applications of echo state networks with leaky-integrator neurons},
	journal = {Neural Networks},
	volume = {20},
	number = {3},
	pages = {335--352},
	year = {2007},
}
@inproceedings{Jain-et-al-2007,
	author = {Jain, V. and Murray, J. F. and Roth, F. and Turaga, S. and Zhigulin, V. and Briggman, K. L. and Helmstaedter, M. N. and Denk, W. and Seung, H. S.},
	title = {Supervised learning of image restoration with convolutional networks},
	booktitle = {In Computer Vision, 2007. ICCV 2007. IEEE 11th International Conference on},
	pages = {1--8},
	publisher = {IEEE},
	year = {2007},
}
@inproceedings{Jaitly-Hinton-2011,
	author = {Jaitly, N. and Hinton, G.},
	title = {Learning a better representation of speech soundwaves using restricted Boltzmann machines},
	booktitle = {In Acoustics, Speech and Signal Processing (ICASSP), 2011 IEEE International Conference on},
	pages = {5884--5887},
	publisher = {IEEE},
	year = {2011},
}
@inproceedings{Jaitly-Hinton-2013,
	author = {Jaitly, N. and Hinton, G. E.},
	title = {Vocal tract length perturbation (VTLP) improves speech recognition},
	booktitle = {In ICML'2013.},
	year = {2013},
}
@inproceedings{Jarrett-et-al-2009,
	author = {Jarrett, K. and Kavukcuoglu, K. and Ranzato, M. and LeCun, Y.},
	title = {What is the best multi-stage architecture for object recognition?},
	booktitle = {In ICCV'09.},
	year = {2009},
}
@article{Jarzynski-1997,
	author = {Jarzynski, C.},
	title = {Nonequilibrium equality for free energy differences},
	journal = {Phys. Rev. Lett.},
	volume = {78},
	pages = {2690--2693},
	year = {1997},
}
@article{Jaynes-2003,
	author = {Jaynes, E. T.},
	title = {Probability Theory: The Logic of Science},
	journal = {Cambridge University Press.},
	year = {2003},
}
@article{Jean-et-al-2014,
	author = {Jean, S. and Cho, K. and Memisevic, R. and Bengio, Y.},
	title = {On using very large target vocabulary for neural machine translation},
	journal = {arXiv:1412.2007.},
	year = {2014},
}
@inproceedings{Jelinek-Mercer-1980,
	author = {Jelinek, F. and Mercer, R. L.},
	title = {Interpolated estimation of Markov source parameters from sparse data},
	booktitle = {In E. S. Gelsema and L. N. Kanal, editors, Pattern Recognition in Practice. North-Holland, Amsterdam.},
	year = {1980},
}
@article{Jia-2013,
	author = {Jia, Y.},
	title = {Caffe: An open source convolutional architecture for fast feature embedding},
	journal = {http://caffe.berkeleyvision.org/.},
	year = {2013},
}
@inproceedings{Jia-et-al-2012,
	author = {Jia, Y. and Huang, C. and Darrell, T.},
	title = {Beyond spatial pyramids: Receptive field learning for pooled image features},
	booktitle = {In Computer Vision and Pattern Recognition (CVPR), 2012 IEEE Conference on},
	pages = {3370--3377},
	publisher = {IEEE},
	year = {2012},
}
@article{Jim-et-al-1996,
	author = {Jim, K.-C. and Giles, C. L. and Horne, B. G.},
	title = {An analysis of noise in recurrent neural networks: convergence and generalization},
	journal = {IEEE Transactions on Neural Networks},
	volume = {7},
	number = {6},
	pages = {1424--1438},
	year = {1996},
}
@article{Jordan-1998,
	author = {Jordan, M. I.},
	title = {Learning in Graphical Models},
	journal = {Kluwer, Dordrecht, Netherlands.},
	year = {1998},
}
@article{Joulin-Mikolov-2015,
	author = {Joulin, A. and Mikolov, T.},
	title = {Inferring algorithmic patterns with stack-augmented recurrent nets},
	journal = {arXiv preprint arXiv:1503.01007.},
	year = {2015},
}
@inproceedings{Jozefowicz-et-al-2015,
	author = {Jozefowicz, R. and Zaremba, W. and Sutskever, I.},
	title = {An empirical evaluation of recurrent network architectures},
	booktitle = {In ICML'2015},
	year = {2015},
}
@article{Judd-1989,
	author = {Judd, J. S.},
	title = {Neural Network Design and the Complexity of Learning},
	journal = {MIT Press.},
	year = {1989},
}
@article{Jutten-Herault-1991,
	author = {Jutten, C. and Herault, J.},
	title = {Blind separation of sources, part I: an adaptive algorithm based on neuromimetic architecture},
	journal = {Signal Processing},
	volume = {24},
	pages = {1--10},
	year = {1991},
}
@inproceedings{Kahou-et-al-2013,
	author = {Kahou, S. E. and Pal, C. and Bouthillier, X. and Froumenty, P. and Gülçehre, Ç. and Memisevic, R. and Vincent, P. and Courville, A. and Bengio, Y. and Ferrari, R. C. and Mirza, M. and Jean, S. and Carrier, P. L. and Dauphin, Y. and Boulanger-Lewandowski, N. and Aggarwal, A. and Zumer, J. and Lamblin, P. and Raymond, J.-P. and Desjardins, G. and Pascanu, R. and Warde-Farley, D. and Torabi, A. and Sharma, A. and Bengio, E. and Côté, M. and Konda, K. R. and Wu, Z.},
	title = {Combining modality specific deep neural networks for emotion recognition in video},
	booktitle = {In Proceedings of the 15th ACM on International Conference on Multimodal Interaction.},
	year = {2013},
}
@inproceedings{Kalchbrenner-Blunsom-2013,
	author = {Kalchbrenner, N. and Blunsom, P.},
	title = {Recurrent continuous translation models},
	booktitle = {In EMNLP'2013.},
	year = {2013},
}
@article{Kalchbrenner-et-al-2015,
	author = {Kalchbrenner, N. and Danihelka, I. and Graves, A.},
	title = {Grid long short-term memory},
	journal = {arXiv preprint arXiv:1507.01526.},
	year = {2015},
}
@article{Kamyshanska-Memisevic-2015,
	author = {Kamyshanska, H. and Memisevic, R.},
	title = {The potential energy of an autoencoder},
	journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence.},
	year = {2015},
}
@inproceedings{Karpathy-Li-2015,
	author = {Karpathy, A. and Li, F.-F.},
	title = {Deep visual-semantic alignments for generating image descriptions},
	booktitle = {In CVPR'2015. arXiv:1412.2306.},
	year = {2015},
}
@inproceedings{Karpathy-et-al-2014,
	author = {Karpathy, A. and Toderici, G. and Shetty, S. and Leung, T. and Sukthankar, R. and Fei-Fei, L.},
	title = {Large-scale video classification with convolutional neural networks},
	booktitle = {In CVPR.},
	year = {2014},
}
@article{Karush-1939,
	author = {Karush, W.},
	title = {Minima of Functions of Several Variables with Inequalities as Side Constraints},
	journal = {Master's thesis, Dept. of Mathematics, Univ. of Chicago.},
	year = {1939},
}
@article{Katz-1987,
	author = {Katz, S. M.},
	title = {Estimation of probabilities from sparse data for the language model component of a speech recognizer},
	journal = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
	volume = {35},
	number = {3},
	pages = {400--401},
	year = {1987},
}
@article{Kavukcuoglu-et-al-2008,
	author = {Kavukcuoglu, K. and Ranzato, M. and LeCun, Y.},
	title = {Fast inference in sparse coding algorithms with applications to object recognition},
	journal = {Technical report, Computational and Biological Learning Lab, Courant Institute, NYU. Tech Report CBLL-TR-2008-12-01.},
	year = {2008},
}
@inproceedings{Kavukcuoglu-et-al-2009,
	author = {Kavukcuoglu, K. and Ranzato, M.-A. and Fergus, R. and LeCun, Y.},
	title = {Learning invariant features through topographic filter maps},
	booktitle = {In CVPR'2009},
	year = {2009},
}
@inproceedings{Kavukcuoglu-et-al-2010,
	author = {Kavukcuoglu, K. and Sermanet, P. and Boureau, Y.-L. and Gregor, K. and Mathieu, M. and LeCun, Y.},
	title = {Learning convolutional feature hierarchies for visual recognition},
	booktitle = {In NIPS'2010},
	year = {2010},
}
@article{Kelley-1960,
	author = {Kelley, H. J.},
	title = {Gradient theory of optimal flight paths},
	journal = {ARS Journal},
	volume = {30},
	number = {10},
	pages = {947--954},
	year = {1960},
}
@inproceedings{Khan-et-al-2011,
	author = {Khan, F. and Zhu, X. and Mutlu, B.},
	title = {How do humans teach: On curriculum learning and teaching dimension},
	booktitle = {In Advances in Neural Information Processing Systems 24 (NIPS'11)},
	pages = {1449--1457},
	year = {2011},
}
@inproceedings{Kim-et-al-2009,
	author = {Kim, S. K. and McAfee, L. C. and McMahon, P. L. and Olukotun, K.},
	title = {A highly scalable restricted Boltzmann machine FPGA implementation},
	booktitle = {In Field Programmable Logic and Applications, 2009. FPL 2009. International Conference on},
	pages = {367--372},
	publisher = {IEEE},
	year = {2009},
}
@article{Kindermann-1980,
	author = {Kindermann, R.},
	title = {Markov Random Fields and Their Applications (Contemporary Mathematics ; V. 1)},
	journal = {American Mathematical Society.},
	year = {1980},
}
@article{Kingma-Ba-2014,
	author = {Kingma, D. and Ba, J.},
	title = {Adam: A method for stochastic optimization},
	journal = {arXiv preprint arXiv:1412.6980.},
	year = {2014},
}
@inproceedings{Kingma-LeCun-2010,
	author = {Kingma, D. and LeCun, Y.},
	title = {Regularized estimation of image statistics by score matching},
	booktitle = {In NIPS'2010},
	year = {2010},
}
@inproceedings{Kingma-et-al-2014,
	author = {Kingma, D. and Rezende, D. and Mohamed, S. and Welling, M.},
	title = {Semi-supervised learning with deep generative models},
	booktitle = {In NIPS'2014.},
	year = {2014},
}
@article{Kingma-2013,
	author={Kingma, D. P.},
	title={Fast gradient-based inference with continuous latent variable models in auxiliary form},
	journal={Technical report, arXiv:1306.0733},
	year={2013}
}
@inproceedings{Kingma-Welling-2014a,
	author={Kingma, D. P. and Welling, M.},
	title={Auto-encoding variational {Bayes}},
	booktitle={Proceedings of the International Conference on Learning Representations (ICLR)},
	year={2014}
}
@article{Kingma-Welling-2014b,
	author={Kingma, D. P. and Welling, M.},
	title={Efficient gradient-based inference through transformations between {Bayes} nets and neural nets},
	journal={Technical report, arXiv:1402.0480},
	year={2014}
}
@article{Kirkpatrick-et-al-1983,
	author={Kirkpatrick, S. and Gelatt, C. D. and Vecchi, M. P.},
	title={Optimization by simulated annealing},
	journal={Science},
	volume={220},
	pages={671--680},
	year={1983}
}
@inproceedings{Kiros-et-al-2014a,
	author={Kiros, R. and Salakhutdinov, R. and Zemel, R.},
	title={Multimodal neural language models},
	booktitle={ICML'2014},
	year={2014}
}
@article{Kiros-et-al-2014b,
	author={Kiros, R. and Salakhutdinov, R. and Zemel, R.},
	title={Unifying visual-semantic embeddings with multimodal neural language models},
	journal={arXiv:1411.2539 [cs.LG]},
	year={2014}
}
@inproceedings{Klementiev-et-al-2012,
	author={Klementiev, A. and Titov, I. and Bhattarai, B.},
	title={Inducing crosslingual distributed representations of words},
	booktitle={Proceedings of COLING 2012},
	year={2012}
}
@inproceedings{KnowlesBarley-et-al-2014,
	author={Knowles-Barley, S. and Jones, T. R. and Morgan, J. and Lee, D. and Kasthuri, N. and Lichtman, J. W. and Pfister, H.},
	title={Deep learning for the connectome},
	booktitle={GPU Technology Conference},
	year={2014}
}
@book{Koller-Friedman-2009,
	author={Koller, D. and Friedman, N.},
	title={Probabilistic Graphical Models: Principles and Techniques},
	publisher={MIT Press},
	year={2009}
}
@inproceedings{Konig-et-al-1996,
	author={Konig, Y. and Bourlard, H. and Morgan, N.},
	title={{REMAP}: Recursive estimation and maximization of a posteriori probabilities - application to transition-based connectionist speech recognition},
	editor={Touretzky, D. and Mozer, M. and Hasselmo, M.},
	booktitle={Advances in Neural Information Processing Systems 8 (NIPS'95)},
	publisher={MIT Press},
	address={Cambridge, MA},
	year={1996}
}
@misc{Koren-2009,
	author={Koren, Y.},
	title={The {BellKor} solution to the {Netflix} grand prize},
	year={2009}
}
@inproceedings{Kotzias-et-al-2015,
	author={Kotzias, D. and Denil, M. and de Freitas, N. and Smyth, P.},
	title={From group to individual labels using deep features},
	booktitle={ACM SIGKDD},
	year={2015}
}
@inproceedings{Koutnik-et-al-2014,
	author={Koutnik, J. and Greff, K. and Gomez, F. and Schmidhuber, J.},
	title={A clockwork {RNN}},
	booktitle={ICML'2014},
	year={2014}
}
@inproceedings{Kocisky-et-al-2014,
	author={Kočiský, T. and Hermann, K. M. and Blunsom, P.},
	title={Learning Bilingual Word Representations by Marginalizing Alignments},
	booktitle={Proceedings of ACL},
	year={2014}
}
@inproceedings{Krause-et-al-2013,
	author={Krause, O. and Fischer, A. and Glasmachers, T. and Igel, C.},
	title={Approximation properties of {DBNs} with binary hidden units and real-valued visible units},
	booktitle={ICML'2013},
	year={2013}
}
@techreport{Krizhevsky-2010,
	author={Krizhevsky, A.},
	title={Convolutional deep belief networks on {CIFAR-10}},
	institution={University of Toronto},
	note={Unpublished manuscript},
	url={http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf},
	year={2010}
}
@techreport{Krizhevsky-Hinton-2009,
	author={Krizhevsky, A. and Hinton, G.},
	title={Learning multiple layers of features from tiny images},
	institution={University of Toronto},
	year={2009}
}
@inproceedings{Krizhevsky-Hinton-2011,
	author={Krizhevsky, A. and Hinton, G. E.},
	title={Using very deep autoencoders for content-based image retrieval},
	booktitle={ESANN},
	year={2011}
}
@inproceedings{Krizhevsky-et-al-2012,
	author={Krizhevsky, A. and Sutskever, I. and Hinton, G.},
	title={{ImageNet} classification with deep convolutional neural networks},
	booktitle={NIPS'2012},
	year={2012}
}
@article{Krueger-Dayan-2009,
	author={Krueger, K. A. and Dayan, P.},
	title={Flexible shaping: how learning in small steps helps},
	journal={Cognition},
	volume={110},
	pages={380--394},
	year={2009}
}
@inproceedings{Kuhn-Tucker-1951,
	author={Kuhn, H. W. and Tucker, A. W.},
	title={Nonlinear programming},
	booktitle={Proceedings of the Second Berkeley Symposium on Mathematical Statistics and Probability},
	pages={481--492},
	publisher={University of California Press},
	address={Berkeley, Calif.},
	year={1951}
}
@article{Kumar-et-al-2015,
	author={Kumar, A. and Irsoy, O. and Su, J. and Bradbury, J. and English, R. and Pierce, B. and Ondruska, P. and Iyyer, M. and Gulrajani, I. and Socher, R.},
	title={Ask me anything: Dynamic memory networks for natural language processing},
	journal={arXiv:1506.07285},
	year={2015}
}
@inproceedings{Kumar-et-al-2010,
	author={Kumar, M. P. and Packer, B. and Koller, D.},
	title={Self-paced learning for latent variable models},
	booktitle={NIPS'2010},
	year={2010}
}
@techreport{Lang-Hinton-1988,
	author={Lang, K. J. and Hinton, G. E.},
	title={The development of the time-delay neural network architecture for speech recognition},
	number={CMU-CS-88-152},
	institution={Carnegie-Mellon University},
	year={1988}
}
@article{Lang-et-al-1990,
	author={Lang, K. J. and Waibel, A. H. and Hinton, G. E.},
	title={A time-delay neural network architecture for isolated word recognition},
	journal={Neural Networks},
	volume={3},
	number={1},
	pages={23--43},
	year={1990}
}
@inproceedings{Langford-Zhang-2008,
	author={Langford, J. and Zhang, T.},
	title={The epoch-greedy algorithm for contextual multi-armed bandits},
	booktitle={NIPS'2008},
	pages={1096--1103},
	year={2008}
}
@inproceedings{Lappalainen-et-al-2000,
	author={Lappalainen, H. and Giannakopoulos, X. and Honkela, A. and Karhunen, J.},
	title={Nonlinear independent component analysis using ensemble learning: Experiments and discussion},
	booktitle={Proc. ICA},
	publisher={Citeseer},
	year={2000}
}
@inproceedings{Larochelle-Bengio-2008,
	author={Larochelle, H. and Bengio, Y.},
	title={Classification using discriminative restricted {Boltzmann} machines},
	booktitle={ICML'2008},
	year={2008}
}
@inproceedings{Larochelle-Hinton-2010,
	author={Larochelle, H. and Hinton, G. E.},
	title={Learning to combine foveal glimpses with a third-order {Boltzmann} machine},
	booktitle={Advances in Neural Information Processing Systems 23},
	pages={1243--1251},
	year={2010}
}
@inproceedings{Larochelle-Murray-2011,
	author={Larochelle, H. and Murray, I.},
	title={The Neural Autoregressive Distribution Estimator},
	booktitle={AISTATS'2011},
	year={2011}
}
@inproceedings{Larochelle-et-al-2008,
	author={Larochelle, H. and Erhan, D. and Bengio, Y.},
	title={Zero-data learning of new tasks},
	booktitle={AAAI Conference on Artificial Intelligence},
	year={2008}
}
@article{Larochelle-et-al-2009,
	author={Larochelle, H. and Bengio, Y. and Louradour, J. and Lamblin, P.},
	title={Exploring strategies for training deep neural networks},
	journal={Journal of Machine Learning Research},
	volume={10},
	pages={1--40},
	year={2009}
}
@inproceedings{Lasserre-et-al-2006,
	author={Lasserre, J. A. and Bishop, C. M. and Minka, T. P.},
	title={Principled hybrids of generative and discriminative models},
	booktitle={Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR'06)},
	pages={87--94},
	publisher={IEEE Computer Society},
	address={Washington, DC, USA},
	year={2006}
}
@inproceedings{Le-et-al-2010,
	author={Le, Q. and Ngiam, J. and Chen, Z. and Chia, D. J. H. and Koh, P. W. and Ng, A.},
	title={Tiled convolutional neural networks},
	editor={Lafferty, J. and Williams, C. K. I. and Shawe-Taylor, J. and Zemel, R. and Culotta, A.},
	booktitle={Advances in Neural Information Processing Systems 23 (NIPS'10)},
	pages={1279--1287},
	year={2010}
}
@inproceedings{Le-et-al-2011,
	author={Le, Q. and Ngiam, J. and Coates, A. and Lahiri, A. and Prochnow, B. and Ng, A.},
	title={On optimization methods for deep learning},
	booktitle={Proc. ICML'2011},
	publisher={ACM},
	year={2011}
}
@inproceedings{Le-et-al-2012,
	author={Le, Q. and Ranzato, M. and Monga, R. and Devin, M. and Corrado, G. and Chen, K. and Dean, J. and Ng, A.},
	title={Building high-level features using large scale unsupervised learning},
	booktitle={ICML'2012},
	year={2012}
}
@article{Le-Roux-Bengio-2008,
	author={Le Roux, N. and Bengio, Y.},
	title={Representational power of restricted {Boltzmann} machines and deep belief networks},
	journal={Neural Computation},
	volume={20},
	number={6},
	pages={1631--1649},
	year={2008}
}
@article{Le-Roux-Bengio-2010,
	author={Le Roux, N. and Bengio, Y.},
	title={Deep belief networks are compact universal approximators},
	journal={Neural Computation},
	volume={22},
	number={8},
	pages={2192--2207},
	year={2010}
}
@inproceedings{LeCun-1985,
	author={LeCun, Y.},
	title={Une procédure d'apprentissage pour Réseau à seuil assymétrique},
	booktitle={Cognitiva 85: A la Frontière de l'Intelligence Artificielle, des Sciences de la Connaissance et des Neurosciences, Paris 1985},
	pages={599--604},
	publisher={CESTA},
	address={Paris},
	year={1985}
}
@inproceedings{LeCun-1986,
	author={LeCun, Y.},
	title={Learning processes in an asymmetric threshold network},
	editor={Fogelman-Soulié, F. and Bienenstock, E. and Weisbuch, G.},
	booktitle={Disordered Systems and Biological Organization},
	pages={233--240},
	publisher={Springer-Verlag},
	address={Les Houches, France},
	year={1986}
}
@phdthesis{LeCun-1987,
	author={LeCun, Y.},
	title={Modèles connexionistes de l'apprentissage},
	school={Université de Paris VI},
	year={1987}
}
@techreport{LeCun-1989,
	author={LeCun, Y.},
	title={Generalization and network design strategies},
	number={CRG-TR-89-4},
	institution={University of Toronto},
	year={1989}
}
@article{LeCun-et-al-1989,
	author={LeCun, Y. and Jackel, L. D. and Boser, B. and Denker, J. S. and Graf, H. P. and Guyon, I. and Henderson, D. and Howard, R. E. and Hubbard, W.},
	title={Handwritten digit recognition: Applications of neural network chips and automatic learning},
	journal={IEEE Communications Magazine},
	volume={27},
	number={11},
	pages={41--46},
	year={1989}
}
@inproceedings{LeCun-et-al-1998a,
	author={LeCun, Y. and Bottou, L. and Orr, G. B. and Müller, K.-R.},
	title={Efficient backprop},
	booktitle={Neural Networks, Tricks of the Trade},
	series={Lecture Notes in Computer Science},
	volume={1524},
	publisher={Springer Verlag},
	year={1998}
}
@article{LeCun-et-al-1998b,
	author={LeCun, Y. and Bottou, L. and Bengio, Y. and Haffner, P.},
	title={Gradient-based learning applied to document recognition},
	journal={Proceedings of the IEEE},
	volume={86},
	number={11},
	pages={2278--2324},
	year={1998}
}
@inproceedings{LeCun-et-al-2010,
	author={LeCun, Y. and Kavukcuoglu, K. and Farabet, C.},
	title={Convolutional networks and applications in vision},
	booktitle={Circuits and Systems (ISCAS), Proceedings of 2010 IEEE International Symposium on},
	pages={253--256},
	publisher={IEEE},
	year={2010}
}
@inproceedings{L'Ecuyer-1994,
	author={L'Ecuyer, P.},
	title={Efficiency improvement and variance reduction},
	booktitle={Proceedings of the 1994 Winter Simulation Conference},
	pages={122--132},
	year={1994}
}
@article{Lee-et-al-2014,
	author={Lee, C.-Y. and Xie, S. and Gallagher, P. and Zhang, Z. and Tu, Z.},
	title={Deeply-supervised nets},
	journal={arXiv preprint arXiv:1409.5185},
	year={2014}
}
@inproceedings{Lee-et-al-2007,
	author={Lee, H. and Battle, A. and Raina, R. and Ng, A.},
	title={Efficient sparse coding algorithms},
	editor={Schölkopf, B. and Platt, J. and Hoffman, T.},
	booktitle={Advances in Neural Information Processing Systems 19 (NIPS'06)},
	pages={801--808},
	publisher={MIT Press},
	year={2007}
}
@inproceedings{Lee-et-al-2008,
	author={Lee, H. and Ekanadham, C. and Ng, A.},
	title={Sparse deep belief net model for visual area {V2}},
	booktitle={NIPS'07},
	year={2008}
}
@inproceedings{Lee-et-al-2009,
	author={Lee, H. and Grosse, R. and Ranganath, R. and Ng, A. Y.},
	title={Convolutional deep belief networks for scalable unsupervised learning of hierarchical representations},
	editor={Bottou, L. and Littman, M.},
	booktitle={Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)},
	publisher={ACM},
	address={Montreal, Canada},
	year={2009}
}
@inproceedings{Lee-Grauman-2011,
	author={Lee, Y. J. and Grauman, K.},
	title={Learning the easy things first: self-paced visual category discovery},
	booktitle={CVPR'2011},
	year={2011}
}
@misc{Leibniz-1676,
	author={Leibniz, G. W.},
	title={Memoir using the chain rule},
	note={Cited in TMME 7:2\&3 p 321--332, 2010},
	year={1676}
}
@book{Lenat-Guha-1989,
	author={Lenat, D. B. and Guha, R. V.},
	title={Building large knowledge-based systems; representation and inference in the {Cyc} project},
	publisher={Addison-Wesley Longman Publishing Co., Inc.},
	year={1989}
}
@article{Leshno-et-al-1993,
	author={Leshno, M. and Lin, V. Y. and Pinkus, A. and Schocken, S.},
	title={Multilayer feedforward networks with a nonpolynomial activation function can approximate any function},
	journal={Neural Networks},
	volume={6},
	pages={861--867},
	year={1993}
}
@article{Levenberg-1944,
	author={Levenberg, K.},
	title={A method for the solution of certain non-linear problems in least squares},
	journal={Quarterly Journal of Applied Mathematics},
	volume={2},
	number={2},
	pages={164--168},
	year={1944}
}
@book{L'Hopital-1696,
	author={L'Hôpital, G. F. A.},
	title={Analyse des infiniment petits, pour l'intelligence des lignes courbes},
	publisher={L'Imprimerie Royale},
	address={Paris},
	year={1696}
}
@article{Li-et-al-2015,
	author={Li, Y. and Swersky, K. and Zemel, R. S.},
	title={Generative moment matching networks},
	journal={CoRR},
	volume={abs/1502.02761},
	year={2015}
}
@article{Lin-et-al-1996,
	author={Lin, T. and Horne, B. G. and Tino, P. and Giles, C. L.},
	title={Learning long-term dependencies is not as difficult with {NARX} recurrent neural networks},
	journal={IEEE Transactions on Neural Networks},
	volume={7},
	number={6},
	pages={1329--1338},
	year={1996}
}
@inproceedings{Lin-et-al-2015,
	author={Lin, Y. and Liu, Z. and Sun, M. and Liu, Y. and Zhu, X.},
	title={Learning entity and relation embeddings for knowledge graph completion},
	booktitle={Proc. AAAI'15},
	year={2015}
}
@misc{Linde-1992,
	author={Linde, N.},
	title={The machine that changed the world, episode 3},
	howpublished={Documentary miniseries},
	year={1992}
}
@inproceedings{Lindsey-Lindblad-1994,
	author={Lindsey, C. and Lindblad, T.},
	title={Review of hardware neural networks: a user's perspective},
	booktitle={Proc. Third Workshop on Neural Networks: From Biology to High Energy Physics, Isola d'Elba, Italy},
	pages={195--202},
	year={1994}
}
@article{Linnainmaa-1976,
	author={Linnainmaa, S.},
	title={{Taylor} expansion of the accumulated rounding error},
	journal={BIT Numerical Mathematics},
	volume={16},
	number={2},
	pages={146--160},
	year={1976}
}
@techreport{LISA-2008,
	author={{LISA}},
	title={Deep learning tutorials: Restricted {Boltzmann} machines},
	institution={LISA Lab, Université de Montréal},
	year={2008}
}
@inproceedings{Long-Servedio-2010,
	author={Long, P. M. and Servedio, R. A.},
	title={Restricted {Boltzmann} machines are hard to approximately evaluate or simulate},
	booktitle={Proceedings of the 27th International Conference on Machine Learning (ICML'10)},
	year={2010}
}
@article{Lotter-et-al-2015,
	author  = {Lotter, W. and Kreiman, G. and Cox, D.},
	title   = {Unsupervised learning of visual structure using predictive generative networks},
	journal = {arXiv preprint arXiv:1511.06380},
	year    = 2015
}
@misc{Lovelace-1842,
	author={Lovelace, A.},
	title={Notes upon {L. F. Menabrea's} ``{Sketch of the Analytical Engine invented by Charles Babbage}''},
	year={1842}
}
@inproceedings{Lu-et-al-2015,
	author={Lu, L. and Zhang, X. and Cho, K. and Renals, S.},
	title={A study of the recurrent neural network encoder-decoder for large vocabulary speech recognition},
	booktitle={Proc. Interspeech},
	year={2015}
}
@inproceedings{Lu-et-al-2010,
	author={Lu, T. and Pál, D. and Pál, M.},
	title={Contextual multi-armed bandits},
	booktitle={International Conference on Artificial Intelligence and Statistics},
	pages={485--492},
	year={2010}
}
@book{Luenberger-1984,
	author={Luenberger, D. G.},
	title={Linear and Nonlinear Programming},
	publisher={Addison Wesley},
	year={1984}
}
@article{Lukosevicius-Jaeger-2009,
	author={Lukoševičius, M. and Jaeger, H.},
	title={Reservoir computing approaches to recurrent neural network training},
	journal={Computer Science Review},
	volume={3},
	number={3},
	pages={127--149},
	year={2009}
}
@inproceedings{Luo-et-al-2011,
	author={Luo, H. and Shen, R. and Niu, C. and Ullrich, C.},
	title={Learning class-relevant features and class-irrelevant features via a hybrid third-order {RBM}},
	booktitle={International Conference on Artificial Intelligence and Statistics},
	pages={470--478},
	year={2011}
}
@inproceedings{Luo-et-al-2013,
	author={Luo, H. and Carrier, P. L. and Courville, A. and Bengio, Y.},
	title={Texture modeling with convolutional spike-and-slab {RBMs} and deep extensions},
	booktitle={AISTATS'2013},
	year={2013}
}
@inproceedings{Lyu-2009,
	author={Lyu, S.},
	title={Interpretation and generalization of score matching},
	booktitle={Proceedings of the Twenty-fifth Conference in Uncertainty in Artificial Intelligence (UAI'09)},
	year={2009}
}
@article{Ma-et-al-2015,
	author={Ma, J. and Sheridan, R. P. and Liaw, A. and Dahl, G. E. and Svetnik, V.},
	title={Deep neural nets as a method for quantitative structure--activity relationships},
	journal={Journal of Chemical Information and Modeling},
	year={2015}
}
@inproceedings{Maas-et-al-2013,
	author={Maas, A. L. and Hannun, A. Y. and Ng, A. Y.},
	title={Rectifier nonlinearities improve neural network acoustic models},
	booktitle={ICML Workshop on Deep Learning for Audio, Speech, and Language Processing},
	year={2013}
}
@inproceedings{Maass-1992,
	author={Maass, W.},
	title={Bounds for the computational power and learning complexity of analog neural nets (extended abstract)},
	booktitle={Proc. of the 25th ACM Symp. Theory of Computing},
	pages={335--344},
	year={1992}
}
@article{Maass-et-al-1994,
	author={Maass, W. and Schnitger, G. and Sontag, E. D.},
	title={A comparison of the computational power of sigmoid and {Boolean} threshold circuits},
	journal={Theoretical Advances in Neural Computation and Learning},
	pages={127--151},
	year={1994}
}
@article{Maass-et-al-2002,
	author={Maass, W. and Natschlaeger, T. and Markram, H.},
	title={Real-time computing without stable states: A new framework for neural computation based on perturbations},
	journal={Neural Computation},
	volume={14},
	number={11},
	pages={2531--2560},
	year={2002}
}
@book{MacKay-2003,
	author={MacKay, D.},
	title={Information Theory, Inference and Learning Algorithms},
	publisher={Cambridge University Press},
	year={2003}
}
@article{Maclaurin-et-al-2015,
	author  = {Maclaurin, D. and Duvenaud, D. and Adams, R. P.},
	title   = {Gradient-based hyperparameter optimization through reversible learning},
	journal = {arXiv preprint arXiv:1502.03492},
	year    = 2015
}
@inproceedings{Mao-et-al-2015,
	author={Mao, J. and Xu, W. and Yang, Y. and Wang, J. and Huang, Z. and Yuille, A. L.},
	title={Deep captioning with multimodal recurrent neural networks},
	booktitle={ICLR'2015},
	note={arXiv:1410.1090},
	year={2015}
}
@article{Marcotte-Savard-1992,
	author={Marcotte, P. and Savard, G.},
	title={Novel approaches to the discrimination problem},
	journal={Zeitschrift für Operations Research (Theory)},
	volume={36},
	pages={517--545},
	year={1992}
}
@inproceedings{Marlin-de-Freitas-2011,
	author={Marlin, B. and de Freitas, N.},
	title={Asymptotic efficiency of deterministic estimators for discrete energy-based models: Ratio matching and pseudolikelihood},
	booktitle={UAI'2011},
	year={2011}
}
@inproceedings{Marlin-et-al-2010,
	author={Marlin, B. and Swersky, K. and Chen, B. and de Freitas, N.},
	title={Inductive principles for restricted {Boltzmann} machine learning},
	booktitle={Proceedings of The Thirteenth International Conference on Artificial Intelligence and Statistics (AISTATS'10)},
	volume={9},
	pages={509--516},
	year={2010}
}
@article{Marquardt-1963,
	author={Marquardt, D. W.},
	title={An algorithm for least-squares estimation of non-linear parameters},
	journal={Journal of the Society of Industrial and Applied Mathematics},
	volume={11},
	number={2},
	pages={431--441},
	year={1963}
}
@article{Marr-Poggio-1976,
	author={Marr, D. and Poggio, T.},
	title={Cooperative computation of stereo disparity},
	journal={Science},
	volume={194},
	year={1976}
}
@inproceedings{Martens-2010,
	author={Martens, J.},
	title={Deep learning via {Hessian}-free optimization},
	editor={Bottou, L. and Littman, M.},
	booktitle={Proceedings of the Twenty-seventh International Conference on Machine Learning (ICML-10)},
	pages={735--742},
	publisher={ACM},
	year={2010}
}
@article{Martens-Medabalimi-2014,
	author={Martens, J. and Medabalimi, V.},
	title={On the expressive efficiency of sum product networks},
	journal={arXiv:1411.7717},
	year={2014}
}
@inproceedings{Martens-Sutskever-2011,
	author={Martens, J. and Sutskever, I.},
	title={Learning recurrent neural networks with {Hessian}-free optimization},
	booktitle={Proc. ICML'2011},
	publisher={ACM},
	year={2011}
}
@article{Mase-1995,
	author={Mase, S.},
	title={Consistency of the maximum pseudo-likelihood estimator of continuous state space {Gibbsian} processes},
	journal={The Annals of Applied Probability},
	volume={5},
	number={3},
	pages={603--612},
	year={1995}
}
@inproceedings{McClelland-et-al-1995,
	author={McClelland, J. and Rumelhart, D. and Hinton, G.},
	title={The appeal of parallel distributed processing},
	booktitle={Computation \& intelligence},
	pages={305--341},
	publisher={American Association for Artificial Intelligence},
	year={1995}
}
@article{McCulloch-Pitts-1943,
	author={McCulloch, W. S. and Pitts, W.},
	title={A logical calculus of ideas immanent in nervous activity},
	journal={Bulletin of Mathematical Biophysics},
	volume={5},
	pages={115--133},
	year={1943}
}
@book{Mead-Ismail-2012,
	author={Mead, C. and Ismail, M.},
	title={Analog {VLSI} implementation of neural systems},
	volume={80},
	publisher={Springer Science \& Business Media},
	year={2012}
}
@article{Melchior-et-al-2013,
	author={Melchior, J. and Fischer, A. and Wiskott, L.},
	title={How to center binary deep {Boltzmann} machines},
	journal={arXiv preprint arXiv:1311.1354},
	year={2013}
}
@inproceedings{Memisevic-Hinton-2007,
	author={Memisevic, R. and Hinton, G. E.},
	title={Unsupervised learning of image transformations},
	booktitle={Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR'07)},
	year={2007}
}
@article{Memisevic-Hinton-2010,
	author={Memisevic, R. and Hinton, G. E.},
	title={Learning to represent spatial transformations with factored higher-order {Boltzmann} machines},
	journal={Neural Computation},
	volume={22},
	number={6},
	pages={1473--1492},
	year={2010}
}
@inproceedings{Mesnil-et-al-2011,
	author={Mesnil, G. and Dauphin, Y. and Glorot, X. and Rifai, S. and Bengio, Y. and Goodfellow, I. and Lavoie, E. and Muller, X. and Desjardins, G. and Warde-Farley, D. and Vincent, P. and Courville, A. and Bergstra, J.},
	title={Unsupervised and transfer learning challenge: a deep learning approach},
	booktitle={JMLR W\&CP: Proc. Unsupervised and Transfer Learning},
	volume={7},
	year={2011}
}
@inproceedings{Mesnil-et-al-2012,
	author={Mesnil, G. and Rifai, S. and Dauphin, Y. and Bengio, Y. and Vincent, P.},
	title={Surfing on the manifold},
	booktitle={Learning Workshop, Snowbird},
	year={2012}
}
@article{Miikkulainen-Dyer-1991,
	author={Miikkulainen, R. and Dyer, M. G.},
	title={Natural language processing with modular {PDP} networks and distributed lexicon},
	journal={Cognitive Science},
	volume={15},
	pages={343--399},
	year={1991}
}
@phdthesis{Mikolov-2012,
	author={Mikolov, T.},
	title={Statistical Language Models based on Neural Networks},
	school={Brno University of Technology},
	year={2012}
}
@inproceedings{Mikolov-et-al-2011a,
	author={Mikolov, T. and Deoras, A. and Kombrink, S. and Burget, L. and Cernocky, J.},
	title={Empirical evaluation and combination of advanced language modeling techniques},
	booktitle={Proc. 12th annual conference of the international speech communication association (INTERSPEECH 2011)},
	year={2011}
}
@inproceedings{Mikolov-et-al-2011b,
	author={Mikolov, T. and Deoras, A. and Povey, D. and Burget, L. and Cernocky, J.},
	title={Strategies for training large scale neural network language models},
	booktitle={Proc. ASRU'2011},
	year={2011}
}
@inproceedings{Mikolov-et-al-2013a,
	author={Mikolov, T. and Chen, K. and Corrado, G. and Dean, J.},
	title={Efficient estimation of word representations in vector space},
	booktitle={International Conference on Learning Representations: Workshops Track},
	year={2013}
}
@article{Mikolov-et-al-2013b,
	author={Mikolov, T. and Le, Q. V. and Sutskever, I.},
	title={Exploiting similarities among languages for machine translation},
	journal={Technical report, arXiv:1309.4168},
	year={2013}
}
@techreport{Minka-2005,
	author={Minka, T.},
	title={Divergence measures and message passing},
	institution={Microsoft Research Cambridge UK},
	volume={72},
	number={MSR-TR-2005-173},
	year={2005}
}
@book{Minsky-Papert-1969,
	author={Minsky, M. L. and Papert, S. A.},
	title={Perceptrons},
	publisher={MIT Press},
	address={Cambridge},
	year={1969}
}
@article{Mirza-Osindero-2014,
	author={Mirza, M. and Osindero, S.},
	title={Conditional generative adversarial nets},
	journal={arXiv preprint arXiv:1411.1784},
	year={2014}
}
@article{Mishkin-Matas-2015,
	author={Mishkin, D. and Matas, J.},
	title={All you need is a good init},
	journal={arXiv preprint arXiv:1511.06422},
	year={2015}
}
@article{Misra-Saha-2010,
	author={Misra, J. and Saha, I.},
	title={Artificial neural networks in hardware: A survey of two decades of progress},
	journal={Neurocomputing},
	volume={74},
	number={1},
	pages={239--255},
	year={2010}
}
@book{Mitchell-1997,
	author={Mitchell, T. M.},
	title={Machine Learning},
	publisher={McGraw-Hill},
	address={New York},
	year={1997}
}
@inproceedings{Miyato-et-al-2015,
	author={Miyato, T. and Maeda, S. and Koyama, M. and Nakae, K. and Ishii, S.},
	title={Distributional smoothing with virtual adversarial training},
	booktitle={ICLR},
	note={Preprint: arXiv:1507.00677},
	year={2015}
}
@inproceedings{Mnih-Gregor-2014,
	author={Mnih, A. and Gregor, K.},
	title={Neural variational inference and learning in belief networks},
	booktitle={ICML'2014},
	year={2014}
}
@inproceedings{Mnih-Hinton-2007,
	author={Mnih, A. and Hinton, G. E.},
	title={Three new graphical models for statistical language modelling},
	editor={Ghahramani, Z.},
	booktitle={Proceedings of the Twenty-fourth International Conference on Machine Learning (ICML'07)},
	pages={641--648},
	publisher={ACM},
	year={2007}
}
@inproceedings{Mnih-Hinton-2009,
	author={Mnih, A. and Hinton, G. E.},
	title={A scalable hierarchical distributed language model},
	editor={Koller, D. and Schuurmans, D. and Bengio, Y. and Bottou, L.},
	booktitle={Advances in Neural Information Processing Systems 21 (NIPS'08)},
	pages={1081--1088},
	year={2009}
}
@inproceedings{Mnih-Kavukcuoglu-2013,
	author={Mnih, A. and Kavukcuoglu, K.},
	title={Learning word embeddings efficiently with noise-contrastive estimation},
	editor={Burges, C. and Bottou, L. and Welling, M. and Ghahramani, Z. and Weinberger, K.},
	booktitle={Advances in Neural Information Processing Systems 26},
	pages={2265--2273},
	publisher={Curran Associates, Inc.},
	year={2013}
}
@inproceedings{Mnih-Teh-2012,
	author={Mnih, A. and Teh, Y. W.},
	title={A fast and simple algorithm for training neural probabilistic language models},
	booktitle={ICML'2012},
	pages={1751--1758},
	year={2012}
}
@inproceedings{Mnih-Hinton-2010,
	author={Mnih, V. and Hinton, G.},
	title={Learning to detect roads in high-resolution aerial images},
	booktitle={Proceedings of the 11th European Conference on Computer Vision (ECCV)},
	year={2010}
}
@inproceedings{Mnih-et-al-2011,
	author={Mnih, V. and Larochelle, H. and Hinton, G.},
	title={Conditional restricted {Boltzmann} machines for structured output prediction},
	booktitle={Proc. Conf. on Uncertainty in Artificial Intelligence (UAI)},
	year={2011}
}
@article{Mnih-et-al-2013,
	author={Mnih, V. and Kavukcuoglu, K. and Silver, D. and Graves, A. and Antonoglou, I. and Wierstra, D.},
	title={Playing {Atari} with deep reinforcement learning},
	journal={Technical report, arXiv:1312.5602},
	year={2013}
}
@inproceedings{Mnih-et-al-2014,
	author={Mnih, V. and Heess, N. and Graves, A. and Kavukcuoglu, K.},
	title={Recurrent models of visual attention},
	editor={Ghahramani, Z. and Welling, M. and Cortes, C. and Lawrence, N. and Weinberger, K.},
	booktitle={NIPS'2014},
	pages={2204--2212},
	year={2014}
}
@article{Mnih-et-al-2015,
	author={Mnih, V. and Kavukcuoglu, K. and Silver, D. and Rusu, A. A. and Veness, J. and Bellemare, M. G. and Graves, A. and Riedmiller, M. and Fidjeland, A. K. and Ostrovski, G. and Petersen, S. and Beattie, C. and Sadik, A. and Antonoglou, I. and King, H. and Kumaran, D. and Wierstra, D. and Legg, S. and Hassabis, D.},
	title={Human-level control through deep reinforcement learning},
	journal={Nature},
	volume={518},
	pages={529--533},
	year={2015}
}
@inproceedings{Mobahi-Fisher-2015,
	author={Mobahi, H. and Fisher, III, J. W.},
	title={A theoretical analysis of optimization by {Gaussian} continuation},
	booktitle={AAAI'2015},
	year={2015}
}
@inproceedings{Mobahi-et-al-2009,
	author={Mobahi, H. and Collobert, R. and Weston, J.},
	title={Deep learning from temporal coherence in video},
	editor={Bottou, L. and Littman, M.},
	booktitle={Proceedings of the 26th International Conference on Machine Learning, Montreal},
	pages={737--744},
	publisher={Omnipress},
	year={2009}
}
@inproceedings{Mohamed-et-al-2009,
	author={Mohamed, Abdel-rahman and Dahl, George and Hinton, Geoffrey},
	title={Deep belief networks for phone recognition},
	booktitle={NIPS Workshop on Deep Learning for Speech Recognition and Related Applications},
	volume={1},
	pages={39},
	url={http://www.cs.utoronto.ca/~gdahl/papers/dbnPhoneRec.pdf},
	year={2009}
}
@inproceedings{Mohamed-et-al-2011,
	author={Mohamed, A. and Sainath, T. N. and Dahl, G. and Ramabhadran, B. and Hinton, G. E. and Picheny, M. A.},
	title={Deep belief networks using discriminative features for phone recognition},
	booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2011 IEEE International Conference on},
	pages={5060--5063},
	publisher={IEEE},
	year={2011}
}
@article{Mohamed-et-al-2012a,
	author={Mohamed, A. and Dahl, G. and Hinton, G.},
	title={Acoustic modeling using deep belief networks},
	journal={IEEE Trans. on Audio, Speech and Language Processing},
	volume={20},
	number={1},
	pages={14--22},
	year={2012}
}
@inproceedings{Mohamed-et-al-2012b,
	author={Mohamed, A. and Hinton, G. and Penn, G.},
	title={Understanding how deep belief networks perform acoustic modelling},
	booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2012 IEEE International Conference on},
	pages={4273--4276},
	publisher={IEEE},
	year={2012}
}
@article{Moller-1993,
	author={Moller, M. F.},
	title={A scaled conjugate gradient algorithm for fast supervised learning},
	journal={Neural Networks},
	volume={6},
	pages={525--533},
	year={1993}
}
@inproceedings{Montavon-Muller-2012,
	author={Montavon, G. and Müller, K.-R.},
	title={Deep {Boltzmann} machines and the centering trick},
	editor={Montavon, G. and Orr, G. and Müller, K.-R.},
	booktitle={Neural Networks: Tricks of the Trade},
	series={Lecture Notes in Computer Science},
	volume={7700},
	pages={621--637},
	note={Preprint: http://arxiv.org/abs/1203.3783},
	year={2012}
}
@article{Montufar-2014,
	author={Montúfar, G.},
	title={Universal approximation depth and errors of narrow belief networks with discrete units},
	journal={Neural Computation},
	volume={26},
	year={2014}
}
@article{Montufar-Ay-2011,
	author={Montúfar, G. and Ay, N.},
	title={Refinements of universal approximation results for deep belief networks and restricted {Boltzmann} machines},
	journal={Neural Computation},
	volume={23},
	number={5},
	pages={1306--1319},
	year={2011}
}
@inproceedings{Montufar-et-al-2014,
	author={Montúfar, G. F. and Pascanu, R. and Cho, K. and Bengio, Y.},
	title={On the number of linear regions of deep neural networks},
	booktitle={In NIPS'2014},
	year={2014}
}
@article{Mor-Yosef-et-al-1990,
	author={Mor-Yosef, S. and Samueloff, A. and Modan, B. and Navot, D. and Schenker, J. G.},
	title={Ranking the risk factors for cesarean: logistic regression analysis of a nationwide study},
	journal={Obstet Gynecol},
	volume={75},
	number={6},
	pages={944--947},
	year={1990}
}
@inproceedings{Morin-Bengio-2005,
	author={Morin, F. and Bengio, Y.},
	title={Hierarchical probabilistic neural network language model},
	booktitle={In AISTATS'2005},
	year={2005}
}
@inproceedings{Mozer-1992,
	author={Mozer, M. C.},
	title={The induction of multiscale temporal structure},
	booktitle={In J. Moody, S. Hanson, and R. Lippmann, editors, Advances in Neural Information Processing Systems 4 (NIPS'91)},
	pages={275--282},
	publisher={Morgan Kaufmann},
	address={San Mateo, CA},
	year={1992}
}
@book{Murphy-2012,
	author={Murphy, K. P.},
	title={Machine Learning: a Probabilistic Perspective},
	publisher={MIT Press},
	address={Cambridge, MA, USA},
	year={2012}
}
@inproceedings{Murray-Larochelle-2014,
	author={Uria, B. and Murray, I. and Larochelle, H.},
	title={A deep and tractable density estimator},
	booktitle={In ICML'2014},
	year={2014}
}
@inproceedings{Nair-Hinton-2010,
	author={Nair, V. and Hinton, G.},
	title={Rectified linear units improve restricted {Boltzmann} machines},
	booktitle={In ICML'2010},
	year={2010}
}
@inproceedings{Nair-Hinton-2009,
	author={Nair, V. and Hinton, G. E.},
	title={{3D} object recognition with deep belief nets},
	booktitle={In Y. Bengio, D. Schuurmans, J. D. Lafferty, C. K. I. Williams, and A. Culotta, editors, Advances in Neural Information Processing Systems 22},
	pages={1339--1347},
	publisher={Curran Associates, Inc},
	year={2009}
}
@inproceedings{Narayanan-Mitter-2010,
	author={Narayanan, H. and Mitter, S.},
	title={Sample complexity of testing the manifold hypothesis},
	booktitle={In NIPS'2010},
	year={2010}
}
@article{Naumann-2008,
	author={Naumann, U.},
	title={Optimal {Jacobian} accumulation is {NP}-complete},
	journal={Mathematical Programming},
	volume={112},
	number={2},
	pages={427--441},
	year={2008}
}
@article{Navigli-Velardi-2005,
	author={Navigli, R. and Velardi, P.},
	title={Structural semantic interconnections: a knowledge-based approach to word sense disambiguation},
	journal={IEEE Trans. Pattern Analysis and Machine Intelligence},
	volume={27},
	number={7},
	pages={1075--1086},
	year={2005}
}
@inproceedings{Neal-Hinton-1999,
	author={Neal, R. and Hinton, G.},
	title={A view of the {EM} algorithm that justifies incremental, sparse, and other variants},
	booktitle={In M. I. Jordan, editor, Learning in Graphical Models},
	publisher={MIT Press},
	address={Cambridge, MA},
	year={1999}
}
@misc{Neal-1990,
	author={Neal, R. M.},
	title={Learning stochastic feedforward networks},
	howpublished={Technical report},
	year={1990}
}
@techreport{Neal-1993,
	author={Neal, R. M.},
	title={Probabilistic inference using {Markov} chain {Monte-Carlo} methods},
	number={CRG-TR-93-1},
	institution={Dept. of Computer Science, University of Toronto},
	year={1993}
}
@techreport{Neal-1994,
	author={Neal, R. M.},
	title={Sampling from multimodal distributions using tempered transitions},
	number={9421},
	institution={Dept. of Statistics, University of Toronto},
	year={1994}
}
@book{Neal-1996,
	author={Neal, R. M.},
	title={Bayesian Learning for Neural Networks},
	series={Lecture Notes in Statistics},
	publisher={Springer},
	year={1996}
}
@article{Neal-2001,
	author={Neal, R. M.},
	title={Annealed importance sampling},
	journal={Statistics and Computing},
	volume={11},
	number={2},
	pages={125--139},
	year={2001}
}
@article{Neal-2005,
	title   = {Estimating ratios of normalizing constants using linked importance sampling},
	author  = {Neal, Radford M.},
	year    = {2005},
	journal = {arXiv preprint math/0511216},
}
@article{Nesterov-1983,
	author={Nesterov, Y.},
	title={A method of solving a convex programming problem with convergence rate {$O(1/k^2)$}},
	journal={Soviet Mathematics Doklady},
	volume={27},
	pages={372--376},
	year={1983}
}
@book{Nesterov-2004,
	author={Nesterov, Y.},
	title={Introductory lectures on convex optimization: a basic course},
	series={Applied optimization},
	publisher={Kluwer Academic Publ.},
	address={Boston, Dordrecht, London},
	year={2004}
}
@inproceedings{Netzer-et-al-2011,
	author={Netzer, Y. and Wang, T. and Coates, A. and Bissacco, A. and Wu, B. and Ng, A. Y.},
	title={Reading digits in natural images with unsupervised feature learning},
	booktitle={In Deep Learning and Unsupervised Feature Learning Workshop, NIPS},
	year={2011}
}
@inproceedings{Ney-Kneser-1993,
	author={Ney, H. and Kneser, R.},
	title={Improved clustering techniques for class-based statistical language modelling},
	booktitle={In European Conference on Speech Communication and Technology (Eurospeech), Berlin},
	pages={973--976},
	year={1993}
}
@misc{Ng-2015,
	author={Ng, A.},
	title={Advice for applying machine learning},
	howpublished={https://see.stanford.edu/materials/aimlcs229/ML-advice.pdf},
	year={2015}
}
@inproceedings{Niesler-et-al-1998,
	author={Niesler, T. R. and Whittaker, E. W. D. and Woodland, P. C.},
	title={Comparison of part-of-speech and automatically derived category-based language models for speech recognition},
	booktitle={In International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
	pages={177--180},
	year={1998}
}
@article{Ning-et-al-2005,
	author={Ning, F. and Delhomme, D. and LeCun, Y. and Piano, F. and Bottou, L. and Barbano, P. E.},
	title={Toward automatic phenotyping of developing embryos from videos},
	journal={IEEE Transactions on Image Processing},
	volume={14},
	number={9},
	pages={1360--1371},
	year={2005}
}
@book{Nocedal-Wright-2006,
	author={Nocedal, J. and Wright, S.},
	title={Numerical Optimization},
	publisher={Springer},
	year={2006}
}
@inproceedings{Norouzi-Fleet-2011,
	author={Norouzi, M. and Fleet, D. J.},
	title={Minimal loss hashing for compact binary codes},
	booktitle={In ICML'2011},
	year={2011}
}
@techreport{Nowlan-1990,
	author={Nowlan, S. J.},
	title={Competing experts: An experimental investigation of associative mixture models},
	number={CRG-TR-90-5},
	institution={University of Toronto},
	year={1990}
}
@article{Nowlan-Hinton-1992,
	author={Nowlan, S. J. and Hinton, G. E.},
	title={Simplifying neural networks by soft weight-sharing},
	journal={Neural Computation},
	volume={4},
	number={4},
	pages={473--493},
	year={1992}
}
@article{Olshausen-Field-2005,
	author={Olshausen, B. and Field, D. J.},
	title={How close are we to understanding {V1}?},
	journal={Neural Computation},
	volume={17},
	pages={1665--1699},
	year={2005}
}
@article{Olshausen-Field-1996,
	author={Olshausen, B. A. and Field, D. J.},
	title={Emergence of simple-cell receptive field properties by learning a sparse code for natural images},
	journal={Nature},
	volume={381},
	pages={607--609},
	year={1996}
}
@article{Olshausen-et-al-1993,
	author={Olshausen, B. A. and Anderson, C. H. and Van Essen, D. C.},
	title={A neurobiological model of visual attention and invariant pattern recognition based on dynamic routing of information},
	journal={J. Neurosci.},
	volume={13},
	number={11},
	pages={4700--4719},
	year={1993}
}
@article{Opper-Archambeau-2009,
	author={Opper, M. and Archambeau, C.},
	title={The variational {Gaussian} approximation revisited},
	journal={Neural Computation},
	volume={21},
	number={3},
	pages={786--792},
	year={2009}
}
@inproceedings{Oquab-et-al-2014,
	author={Oquab, M. and Bottou, L. and Laptev, I. and Sivic, J.},
	title={Learning and transferring mid-level image representations using convolutional neural networks},
	booktitle={In Computer Vision and Pattern Recognition (CVPR), 2014 IEEE Conference on},
	pages={1717--1724},
	publisher={IEEE},
	year={2014}
}
@inproceedings{Osindero-Hinton-2008,
	author={Osindero, S. and Hinton, G. E.},
	title={Modeling image patches with a directed hierarchy of {Markov} random fields},
	booktitle={In J. Platt, D. Koller, Y. Singer, and S. Roweis, editors, Advances in Neural Information Processing Systems 20 (NIPS'07)},
	pages={1121--1128},
	publisher={MIT Press},
	address={Cambridge, MA},
	year={2008}
}
@book{Ovid-Martin-2004,
	author={Ovid and Martin, C.},
	title={Metamorphoses},
	publisher={W.W. Norton},
	year={2004}
}
@inproceedings{Paccanaro-Hinton-2000,
	author={Paccanaro, A. and Hinton, G. E.},
	title={Extracting distributed representations of concepts and relations from positive and negative propositions},
	booktitle={In International Joint Conference on Neural Networks (IJCNN), Como, Italy},
	publisher={IEEE},
	address={New York},
	year={2000}
}
@article{Paine-et-al-2014,
	title   = {An analysis of unsupervised pre-training in light of recent advances},
	author  = {Paine, T. L. and Khorrami, P. and Han, W. and Huang, T. S.},
	year    = {2014},
	journal = {arXiv preprint arXiv:1412.6597},
}
@inproceedings{Palatucci-et-al-2009,
	author={Palatucci, M. and Pomerleau, D. and Hinton, G. E. and Mitchell, T. M.},
	title={Zero-shot learning with semantic output codes},
	booktitle={In Y. Bengio, D. Schuurmans, J. D. Lafferty, C. K. I. Williams, and A. Culotta, editors, Advances in Neural Information Processing Systems 22},
	pages={1410--1418},
	publisher={Curran Associates, Inc},
	year={2009}
}
@techreport{Parker-1985,
	author={Parker, D. B.},
	title={Learning-logic},
	number={TR-47},
	institution={Center for Comp. Research in Economics and Management Sci., MIT},
	year={1985}
}
@inproceedings{Pascanu-et-al-2013,
	title     = {On the difficulty of training recurrent neural networks},
	author    = {Pascanu, R. and Mikolov, T. and Bengio, Y.},
	year      = {2013},
	booktitle = {In ICML'2013},
}
@inproceedings{Pascanu-et-al-2014a,
	title     = {How to construct deep recurrent neural networks},
	author    = {Pascanu, R. and Gülçehre, Ç. and Cho, K. and Bengio, Y.},
	year      = {2014},
	booktitle = {In ICLR'2014},
}
@inproceedings{Pascanu-et-al-2014b,
	author={Pascanu, R. and Montúfar, G. and Bengio, Y.},
	title={On the number of inference regions of deep feed forward networks with piece-wise linear activations},
	booktitle={In ICLR'2014},
	year={2014}
}
@inproceedings{Pati-et-al-1993,
	author={Pati, Y. and Rezaiifar, R. and Krishnaprasad, P.},
	title={Orthogonal matching pursuit: Recursive function approximation with applications to wavelet decomposition},
	booktitle={In Proceedings of the 27th Annual Asilomar Conference on Signals, Systems, and Computers},
	pages={40--44},
	year={1993}
}
@inproceedings{Pearl-1985,
	author={Pearl, J.},
	title={{Bayesian} networks: A model of self-activated memory for evidential reasoning},
	booktitle={In Proceedings of the 7th Conference of the Cognitive Science Society, University of California, Irvine},
	pages={329--334},
	year={1985}
}
@book{Pearl-1988,
	author={Pearl, J.},
	title={Probabilistic Reasoning in Intelligent Systems: Networks of Plausible Inference},
	publisher={Morgan Kaufmann},
	year={1988}
}
@article{Perron-1907,
	author={Perron, O.},
	title={Zur {Theorie} der {Matrices}},
	journal={Mathematische Annalen},
	volume={64},
	number={2},
	pages={248--263},
	year={1907}
}
@misc{Petersen-Pedersen-2006,
	author={Petersen, K. B. and Pedersen, M. S.},
	title={The matrix cookbook},
	note={Version 20051003},
	year={2006}
}
@article{Peterson-2004,
	author={Peterson, G. B.},
	title={A day of great illumination: {B. F. Skinner's} discovery of shaping},
	journal={Journal of the Experimental Analysis of Behavior},
	volume={82},
	number={3},
	pages={317--328},
	year={2004}
}
@inproceedings{Pham-et-al-1992,
	author={Pham, D.-T. and Garat, P. and Jutten, C.},
	title={Separation of a mixture of independent sources through a maximum likelihood approach},
	booktitle={In EUSIPCO},
	pages={771--774},
	year={1992}
}
@inproceedings{Pham-et-al-2012,
	author={Pham, P.-H. and Jelaca, D. and Farabet, C. and Martini, B. and LeCun, Y. and Culurciello, E.},
	title={{NeuFlow}: dataflow vision processing system-on-a-chip},
	booktitle={In Circuits and Systems (MWS-CAS), 2012 IEEE 55th International Midwest Symposium on},
	pages={1044--1047},
	publisher={IEEE},
	year={2012}
}
@inproceedings{Pinheiro-Collobert-2014,
	author={Pinheiro, P. H. O. and Collobert, R.},
	title={Recurrent convolutional neural networks for scene labeling},
	booktitle={In ICML'2014},
	year={2014}
}
@inproceedings{Pinheiro-Collobert-2015,
	author={Pinheiro, P. H. O. and Collobert, R.},
	title={From image-level to pixel-level labeling with convolutional networks},
	booktitle={In Conference on Computer Vision and Pattern Recognition (CVPR)},
	year={2015}
}
@article{Pinto-et-al-2008,
	author={Pinto, N. and Cox, D. D. and DiCarlo, J. J.},
	title={Why is real-world visual object recognition hard?},
	journal={PLoS Comput Biol},
	volume={4},
	year={2008}
}
@inproceedings{Pinto-et-al-2011,
	author={Pinto, N. and Stone, Z. and Zickler, T. and Cox, D.},
	title={Scaling up biologically-inspired computer vision: A case study in unconstrained face recognition on {Facebook}},
	booktitle={In Computer Vision and Pattern Recognition Workshops (CVPRW), 2011 IEEE Computer Society Conference on},
	pages={35--42},
	publisher={IEEE},
	year={2011}
}
@article{Pollack-1990,
	author={Pollack, J. B.},
	title={Recursive distributed representations},
	journal={Artificial Intelligence},
	volume={46},
	number={1},
	pages={77--105},
	year={1990}
}
@article{Polyak-Juditsky-1992,
	author={Polyak, B. and Juditsky, A.},
	title={Acceleration of stochastic approximation by averaging},
	journal={SIAM J. Control and Optimization},
	volume={30},
	number={4},
	pages={838--855},
	year={1992}
}
@article{Polyak-1964,
	author={Polyak, B. T.},
	title={Some methods of speeding up the convergence of iteration methods},
	journal={USSR Computational Mathematics and Mathematical Physics},
	volume={4},
	number={5},
	pages={1--17},
	year={1964}
}
@article{Poole-et-al-2014,
	author={Poole, B. and Sohl-Dickstein, J. and Ganguli, S.},
	title={Analyzing noise in autoencoders and deep networks},
	journal={CoRR},
	volume={abs/1406.1831},
	year={2014}
}
@inproceedings{Poon-Domingos-2011,
	author={Poon, H. and Domingos, P.},
	title={Sum-product networks: A new deep architecture},
	booktitle={In Proceedings of the Twenty-seventh Conference in Uncertainty in Artificial Intelligence (UAI), Barcelona, Spain},
	year={2011}
}
@inproceedings{Presley-Haggard-1994,
	author={Presley, R. K. and Haggard, R. L.},
	title={A fixed point implementation of the backpropagation learning algorithm},
	booktitle={In Southeastcon'94. Creative Technology Transfer-A Global Affair., Proceedings of the 1994 IEEE},
	pages={136--138},
	publisher={IEEE},
	year={1994}
}
@article{Price-1958,
	author={Price, R.},
	title={A useful theorem for nonlinear devices having {Gaussian} inputs},
	journal={IEEE Transactions on Information Theory},
	volume={4},
	number={2},
	pages={69--72},
	year={1958}
}
@article{Quiroga-et-al-2005,
	author={Quiroga, R. Q. and Reddy, L. and Kreiman, G. and Koch, C. and Fried, I.},
	title={Invariant visual representation by single neurons in the human brain},
	journal={Nature},
	volume={435},
	number={7045},
	pages={1102--1107},
	year={2005}
}
@article{Radford-et-al-2015,
	title   = {Unsupervised representation learning with deep convolutional generative adversarial networks},
	author  = {Radford, A. and Metz, L. and Chintala, S.},
	year    = {2015},
	journal = {arXiv preprint arXiv:1511.06434},
}
@article{Raiko-et-al-2014,
	author={Raiko, T. and Yao, L. and Cho, K. and Bengio, Y.},
	title={Iterative neural autoregressive distribution estimator {(NADE-k)}},
	journal={Technical report, arXiv:1406.1485},
	year={2014}
}
@inproceedings{Raina-et-al-2009,
	author={Raina, R. and Madhavan, A. and Ng, A. Y.},
	title={Large-scale deep unsupervised learning using graphics processors},
	booktitle={In L. Bottou and M. Littman, editors, Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)},
	pages={873--880},
	publisher={ACM},
	address={New York, NY, USA},
	year={2009}
}
@inproceedings{Ramsey-1926,
	author={Ramsey, F. P.},
	title={Truth and probability},
	booktitle={In R. B. Braithwaite, editor, The Foundations of Mathematics and other Logical Essays, chapter 7},
	pages={156--198},
	publisher={McMaster University Archive for the History of Economic Thought},
	year={1926}
}
@inproceedings{Ranzato-Hinton-2010,
	author={Ranzato, M. and Hinton, G. E.},
	title={Modeling pixel means and covariances using factorized third-order {Boltzmann} machines},
	booktitle={In CVPR'2010},
	pages={2551--2558},
	year={2010}
}
@inproceedings{Ranzato-et-al-2007a,
	title     = {Efficient learning of sparse representations with an energy-based model},
	author    = {Ranzato, M. and Poultney, C. and Chopra, S. and LeCun, Y.},
	year      = {2007},
	booktitle = {In NIPS'2006},
}
@inproceedings{Ranzato-et-al-2007b,
	author={Ranzato, M. and Huang, F. and Boureau, Y. and LeCun, Y.},
	title={Unsupervised learning of invariant feature hierarchies with applications to object recognition},
	booktitle={In Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR'07)},
	publisher={IEEE Press},
	year={2007}
}
@inproceedings{Ranzato-et-al-2008,
	author={Ranzato, M. and Boureau, Y. and LeCun, Y.},
	title={Sparse feature learning for deep belief networks},
	booktitle={In NIPS'2007},
	year={2008}
}
@inproceedings{Ranzato-et-al-2010a,
	author={Ranzato, M. and Krizhevsky, A. and Hinton, G. E.},
	title={Factored 3-way restricted {Boltzmann} machines for modeling natural images},
	booktitle={In Proceedings of AISTATS 2010},
	year={2010}
}
@inproceedings{Ranzato-et-al-2010b,
	author={Ranzato, M. and Mnih, V. and Hinton, G.},
	title={Generating more realistic images using gated {MRFs}},
	booktitle={In NIPS'2010},
	year={2010}
}
@article{Rao-1945,
	author={Rao, C.},
	title={Information and the accuracy attainable in the estimation of statistical parameters},
	journal={Bulletin of the Calcutta Mathematical Society},
	volume={37},
	pages={81--89},
	year={1945}
}
@article{Rasmus-et-al-2015,
	author={Rasmus, A. and Valpola, H. and Honkala, M. and Berglund, M. and Raiko, T.},
	title={Semi-supervised learning with ladder networks},
	journal={arXiv preprint arXiv:1507.02672},
	year={2015}
}
@inproceedings{Recht-et-al-2011,
	author={Recht, B. and Ré, C. and Wright, S. and Niu, F.},
	title={{Hogwild}: A lock-free approach to parallelizing stochastic gradient descent},
	booktitle={In NIPS'2011},
	year={2011}
}
@inproceedings{Reichert-et-al-2011,
	author={Reichert, D. P. and Seriès, P. and Storkey, A. J.},
	title={Neuronal adaptation for sampling-based probabilistic inference in perceptual bistability},
	booktitle={In Advances in Neural Information Processing Systems},
	pages={2357--2365},
	year={2011}
}
@inproceedings{Rezende-et-al-2014,
	author={Rezende, D. J. and Mohamed, S. and Wierstra, D.},
	title={Stochastic backpropagation and approximate inference in deep generative models},
	booktitle={In ICML'2014},
	note={Preprint: arXiv:1401.4082},
	year={2014}
}
@inproceedings{Rifai-et-al-2011a,
	title     = {Contractive auto-encoders: Explicit invariance during feature extraction},
	author    = {Rifai, S. and Vincent, P. and Muller, X. and Glorot, X. and Bengio, Y.},
	year      = {2011},
	booktitle = {In ICML'2011},
}
@inproceedings{Rifai-et-al-2011b,
	author={Rifai, S. and Mesnil, G. and Vincent, P. and Muller, X. and Bengio, Y. and Dauphin, Y. and Glorot, X.},
	title={Higher order contractive auto-encoder},
	booktitle={In ECML PKDD},
	year={2011}
}
@inproceedings{Rifai-et-al-2011c,
	title     = {The manifold tangent classifier},
	author    = {Rifai, S. and Dauphin, Y. and Vincent, P. and Bengio, Y. and Muller, X.},
	year      = {2011},
	booktitle = {In NIPS'2011},
}
@inproceedings{Rifai-et-al-2012,
	author={Rifai, S. and Bengio, Y. and Dauphin, Y. and Vincent, P.},
	title={A generative process for sampling contractive auto-encoders},
	booktitle={In ICML'2012},
	year={2012}
}
@article{Ringach-Shapley-2004,
	author={Ringach, D. and Shapley, R.},
	title={Reverse correlation in neurophysiology},
	journal={Cognitive Science},
	volume={28},
	number={2},
	pages={147--166},
	year={2004}
}
@book{Roberts-Everson-2001,
	author={Roberts, S. and Everson, R.},
	title={Independent component analysis: principles and practice},
	publisher={Cambridge University Press},
	year={2001}
}
@article{Robinson-Fallside-1991,
	author={Robinson, A. J. and Fallside, F.},
	title={A recurrent error propagation network speech recognition system},
	journal={Computer Speech and Language},
	volume={5},
	number={3},
	pages={259--274},
	year={1991}
}
@book{Rockafellar-1997,
	author={Rockafellar, R. T.},
	title={Convex analysis},
	series={Princeton Landmarks in Mathematics},
	publisher={Princeton University Press},
	year={1997}
}
@inproceedings{Romero-et-al-2015,
	author={Romero, A. and Ballas, N. and Ebrahimi Kahou, S. and Chassang, A. and Gatta, C. and Bengio, Y.},
	title={{FitNets}: Hints for thin deep nets},
	booktitle={In ICLR'2015, arXiv:1412.6550},
	year={2015}
}
@article{Rosen-1960,
	author={Rosen, J. B.},
	title={The gradient projection method for nonlinear programming. {Part I}. Linear constraints},
	journal={Journal of the Society for Industrial and Applied Mathematics},
	volume={8},
	number={1},
	pages={181--217},
	year={1960}
}
@article{Rosenblatt-1958,
	author={Rosenblatt, F.},
	title={The perceptron: A probabilistic model for information storage and organization in the brain},
	journal={Psychological Review},
	volume={65},
	pages={386--408},
	year={1958}
}
@book{Rosenblatt-1962,
	author={Rosenblatt, F.},
	title={Principles of Neurodynamics},
	publisher={Spartan},
	address={New York},
	year={1962}
}
@article{Roweis-Saul-2000,
	author={Roweis, S. and Saul, L. K.},
	title={Nonlinear dimensionality reduction by locally linear embedding},
	journal={Science},
	volume={290},
	number={5500},
	pages={2323--2326},
	year={2000}
}
@inproceedings{Roweis-et-al-2002,
	author={Roweis, S. and Saul, L. and Hinton, G.},
	title={Global coordination of local linear models},
	booktitle={In T. Dietterich, S. Becker, and Z. Ghahramani, editors, Advances in Neural Information Processing Systems 14 (NIPS'01)},
	publisher={MIT Press},
	address={Cambridge, MA},
	year={2002}
}
@article{Rubin-1984,
	author={Rubin, D. B.},
	title={{Bayesianly} justifiable and relevant frequency calculations for the applied statistician},
	journal={The Annals of Statistics},
	volume={12},
	number={4},
	pages={1151--1172},
	year={1984}
}
@article{Rumelhart-et-al-1986a,
	author={Rumelhart, D. and Hinton, G. and Williams, R.},
	title={Learning representations by back-propagating errors},
	journal={Nature},
	volume={323},
	pages={533--536},
	year={1986}
}
@inproceedings{Rumelhart-et-al-1986b,
	author={Rumelhart, D. E. and Hinton, G. E. and Williams, R. J.},
	title={Learning internal representations by error propagation},
	booktitle={In D. E. Rumelhart and J. L. McClelland, editors, Parallel Distributed Processing, volume 1, chapter 8},
	pages={318--362},
	publisher={MIT Press},
	address={Cambridge},
	year={1986}
}
@book{Rumelhart-et-al-1986c,
	author={Rumelhart, D. E. and McClelland, J. L. and {the PDP Research Group}},
	title={Parallel Distributed Processing: Explorations in the Microstructure of Cognition},
	publisher={MIT Press},
	address={Cambridge},
	year={1986}
}
@misc{Russakovsky-et-al-2014a,
	author={Russakovsky, O. and Deng, J. and Su, H. and Krause, J. and Satheesh, S. and Ma, S. and Huang, Z. and Karpathy, A. and Khosla, A. and Bernstein, M. and Berg, A. C. and Fei-Fei, L.},
	title={{ImageNet} Large Scale Visual Recognition Challenge},
	year={2014}
}
@article{Russakovsky-et-al-2014b,
	author={Russakovsky, O. and Deng, J. and Su, H. and Krause, J. and Satheesh, S. and Ma, S. and Huang, Z. and Karpathy, A. and Khosla, A. and Bernstein, M. and others},
	title={{ImageNet} large scale visual recognition challenge},
	journal={arXiv preprint arXiv:1409.0575},
	year={2014}
}
@book{Russel-Norvig-2003,
	author={Russell, S. J. and Norvig, P.},
	title={Artificial Intelligence: a Modern Approach},
	publisher={Prentice Hall},
	year={2003}
}
@article{Rust-et-al-2005,
	author={Rust, N. and Schwartz, O. and Movshon, J. A. and Simoncelli, E.},
	title={Spatiotemporal elements of macaque {V1} receptive fields},
	journal={Neuron},
	volume={46},
	number={6},
	pages={945--956},
	year={2005}
}
@inproceedings{Sainath-et-al-2013,
	author={Sainath, T. and Mohamed, A. and Kingsbury, B. and Ramabhadran, B.},
	title={Deep convolutional neural networks for {LVCSR}},
	booktitle={In ICASSP 2013},
	year={2013}
}
@inproceedings{Salakhutdinov-2010,
	author={Salakhutdinov, R.},
	title={Learning in {Markov} random fields using tempered transitions},
	booktitle={In Y. Bengio, D. Schuurmans, C. Williams, J. Lafferty, and A. Culotta, editors, Advances in Neural Information Processing Systems 22 (NIPS'09)},
	year={2010}
}
@inproceedings{Salakhutdinov-Hinton-2009a,
	author={Salakhutdinov, R. and Hinton, G.},
	title={Deep {Boltzmann} machines},
	booktitle={In Proceedings of the International Conference on Artificial Intelligence and Statistics},
	volume={5},
	pages={448--455},
	year={2009}
}
@article{Salakhutdinov-Hinton-2009b,
	author={Salakhutdinov, R. and Hinton, G.},
	title={Semantic hashing},
	journal={International Journal of Approximate Reasoning},
	year={2009}
}
@inproceedings{Salakhutdinov-Hinton-2007a,
	author={Salakhutdinov, R. and Hinton, G. E.},
	title={Learning a nonlinear embedding by preserving class neighbourhood structure},
	booktitle={In Proceedings of the Eleventh International Conference on Artificial Intelligence and Statistics (AISTATS'07), San Juan, Puerto Rico},
	publisher={Omnipress},
	year={2007}
}
@inproceedings{Salakhutdinov-Hinton-2007b,
	title     = {Semantic hashing},
	author    = {Salakhutdinov, R. and Hinton, G. E.},
	year      = {2007},
	booktitle = {In SIGIR'2007},
}
@inproceedings{Salakhutdinov-Hinton-2008,
	author={Salakhutdinov, R. and Hinton, G. E.},
	title={Using deep belief nets to learn covariance kernels for {Gaussian} processes},
	booktitle={In J. Platt, D. Koller, Y. Singer, and S. Roweis, editors, Advances in Neural Information Processing Systems 20 (NIPS'07)},
	pages={1249--1256},
	publisher={MIT Press},
	address={Cambridge, MA},
	year={2008}
}
@inproceedings{Salakhutdinov-Larochelle-2010,
	author={Salakhutdinov, R. and Larochelle, H.},
	title={Efficient learning of deep {Boltzmann} machines},
	booktitle={In Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics (AISTATS 2010), JMLR W\&CP},
	volume={9},
	pages={693--700},
	year={2010}
}
@inproceedings{Salakhutdinov-Mnih-2008,
	author={Salakhutdinov, R. and Mnih, A.},
	title={Probabilistic matrix factorization},
	booktitle={In NIPS'2008},
	year={2008}
}
@inproceedings{Salakhutdinov-Murray-2008,
	author={Salakhutdinov, R. and Murray, I.},
	title={On the quantitative analysis of deep belief networks},
	booktitle={In W. W. Cohen, A. McCallum, and S. T. Roweis, editors, Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
	volume={25},
	pages={872--879},
	publisher={ACM},
	year={2008}
}
@inproceedings{Salakhutdinov-et-al-2007,
	author={Salakhutdinov, R. and Mnih, A. and Hinton, G.},
	title={Restricted {Boltzmann} machines for collaborative filtering},
	booktitle={In ICML},
	year={2007}
}
@article{Sanger-1994,
	title   = {Neural network learning control of robot manipulators using gradually increasing task difficulty},
	author  = {Sanger, T. D.},
	year    = {1994},
	journal = {IEEE Transactions on Robotics and Automation},
	volume  = {10},
	number  = {3},
}
@inproceedings{Saul-Jordan-1996,
	author={Saul, L. K. and Jordan, M. I.},
	title={Exploiting tractable substructures in intractable networks},
	booktitle={In D. Touretzky, M. Mozer, and M. Hasselmo, editors, Advances in Neural Information Processing Systems 8 (NIPS'95)},
	publisher={MIT Press},
	address={Cambridge, MA},
	year={1996}
}
@article{Saul-et-al-1996,
	author={Saul, L. K. and Jaakkola, T. and Jordan, M. I.},
	title={Mean field theory for sigmoid belief networks},
	journal={Journal of Artificial Intelligence Research},
	volume={4},
	pages={61--76},
	year={1996}
}
@article{Savich-et-al-2007,
	author={Savich, A. W. and Moussa, M. and Areibi, S.},
	title={The impact of arithmetic representation on implementing {MLP-BP} on {FPGAs}: A study},
	journal={IEEE Transactions on Neural Networks},
	volume={18},
	number={1},
	pages={240--252},
	year={2007}
}
@inproceedings{Saxe-et-al-2011,
	author={Saxe, A. M. and Koh, P. W. and Chen, Z. and Bhand, M. and Suresh, B. and Ng, A.},
	title={On random weights and unsupervised feature learning},
	booktitle={In Proc. ICML'2011},
	publisher={ACM},
	year={2011}
}
@inproceedings{Saxe-et-al-2013,
	author={Saxe, A. M. and McClelland, J. L. and Ganguli, S.},
	title={Exact solutions to the nonlinear dynamics of learning in deep linear neural networks},
	booktitle={In ICLR},
	year={2013}
}
@inproceedings{Schaul-et-al-2014,
	author={Schaul, T. and Antonoglou, I. and Silver, D.},
	title={Unit tests for stochastic optimization},
	booktitle={In International Conference on Learning Representations},
	year={2014}
}
@article{Schmidhuber-1992,
	author={Schmidhuber, J.},
	title={Learning complex, extended sequences using the principle of history compression},
	journal={Neural Computation},
	volume={4},
	number={2},
	pages={234--242},
	year={1992}
}
@article{Schmidhuber-1996,
	author={Schmidhuber, J.},
	title={Sequential neural text compression},
	journal={IEEE Transactions on Neural Networks},
	volume={7},
	number={1},
	pages={142--146},
	year={1996}
}
@article{Schmidhuber-2012,
	author={Schmidhuber, J.},
	title={Self-delimiting neural networks},
	journal={arXiv preprint arXiv:1210.0118},
	year={2012}
}
@book{Scholkopf-Smola-2002,
	author={Schölkopf, B. and Smola, A. J.},
	title={Learning with kernels: Support vector machines, regularization, optimization, and beyond},
	publisher={MIT Press},
	year={2002}
}
@article{Scholkopf-et-al-1998,
	author={Schölkopf, B. and Smola, A. and Müller, K.-R.},
	title={Nonlinear component analysis as a kernel eigenvalue problem},
	journal={Neural Computation},
	volume={10},
	pages={1299--1319},
	year={1998}
}
@book{Scholkopf-et-al-1999,
	author={Schölkopf, B. and Burges, C. J. C. and Smola, A. J.},
	title={Advances in Kernel Methods — Support Vector Learning},
	publisher={MIT Press},
	address={Cambridge, MA},
	year={1999}
}
@inproceedings{Scholkopf-et-al-2012,
	author={Schölkopf, B. and Janzing, D. and Peters, J. and Sgouritsa, E. and Zhang, K. and Mooij, J.},
	title={On causal and anticausal learning},
	booktitle={In ICML'2012},
	pages={1255--1262},
	year={2012}
}
@phdthesis{Schuster-1999,
	author={Schuster, Michael},
	title={On supervised learning from sequential data with applications for speech recognition},
	school={Nara Institute of Science and Technology},
	year={1999}
}
@article{Schuster-Paliwal-1997,
	author={Schuster, M. and Paliwal, K.},
	title={Bidirectional recurrent neural networks},
	journal={IEEE Transactions on Signal Processing},
	volume={45},
	number={11},
	pages={2673--2681},
	year={1997}
}
@article{Schwenk-2007,
	author={Schwenk, H.},
	title={Continuous space language models},
	journal={Computer Speech and Language},
	volume={21},
	pages={492--518},
	year={2007}
}
@article{Schwenk-2010,
	author={Schwenk, H.},
	title={Continuous space language models for statistical machine translation},
	journal={The Prague Bulletin of Mathematical Linguistics},
	volume={93},
	pages={137--146},
	year={2010}
}
@misc{Schwenk-2014,
	author={Schwenk, H.},
	title={Cleaned subset of {WMT} '14 dataset},
	year={2014}
}
@inproceedings{Schwenk-Bengio-1998,
	author={Schwenk, H. and Bengio, Y.},
	title={Training methods for adaptive boosting of neural networks},
	editor={Jordan, M. and Kearns, M. and Solla, S.},
	booktitle={Advances in Neural Information Processing Systems 10 (NIPS'97)},
	pages={647--653},
	publisher={MIT Press},
	year={1998}
}
@inproceedings{Schwenk-Gauvain-2002,
	author={Schwenk, H. and Gauvain, J.-L.},
	title={Connectionist language modeling for large vocabulary continuous speech recognition},
	booktitle={International Conference on Acoustics, Speech and Signal Processing (ICASSP), Orlando, Florida},
	pages={765--768},
	year={2002}
}
@inproceedings{Schwenk-et-al-2006,
	author={Schwenk, H. and Costa-jussà, M. R. and Fonollosa, J. A. R.},
	title={Continuous space language models for the {IWSLT} 2006 task},
	booktitle={International Workshop on Spoken Language Translation},
	pages={166--173},
	year={2006}
}
@inproceedings{Seide-et-al-2011,
	author={Seide, F. and Li, G. and Yu, D.},
	title={Conversational speech transcription using context-dependent deep neural networks},
	booktitle={Interspeech 2011},
	pages={437--440},
	year={2011}
}
@inproceedings{Sejnowski-1987,
	author={Sejnowski, T.},
	title={Higher-order {Boltzmann} machines},
	booktitle={AIP Conference Proceedings 151 on Neural Networks for Computing},
	pages={398--403},
	publisher={American Institute of Physics Inc},
	year={1987}
}
@inproceedings{Series-et-al-2010,
	author={Series, P. and Reichert, D. P. and Storkey, A. J.},
	title={Hallucinations in {Charles Bonnet} syndrome induced by homeostasis: a deep {Boltzmann} machine model},
	booktitle={Advances in Neural Information Processing Systems},
	pages={2020--2028},
	year={2010}
}
@article{Sermanet-et-al-2012,
	author={Sermanet, P. and Chintala, S. and LeCun, Y.},
	title={Convolutional neural networks applied to house numbers digit classification},
	journal={CoRR, abs/1204.3968},
	year={2012}
}
@inproceedings{Sermanet-et-al-2013,
	author={Sermanet, P. and Kavukcuoglu, K. and Chintala, S. and LeCun, Y.},
	title={Pedestrian detection with unsupervised multi-stage feature learning},
	booktitle={Proc. International Conference on Computer Vision and Pattern Recognition (CVPR'13)},
	publisher={IEEE},
	year={2013}
}
@book{Shilov-1977,
	author={Shilov, G.},
	title={Linear Algebra},
	series={Dover Books on Mathematics Series},
	publisher={Dover Publications},
	year={1977}
}
@article{Siegelmann-1995,
	author={Siegelmann, H.},
	title={Computation beyond the {Turing} limit},
	journal={Science},
	volume={268},
	number={5210},
	pages={545--548},
	year={1995}
}
@article{Siegelmann-Sontag-1991,
	author={Siegelmann, H. and Sontag, E.},
	title={{Turing} computability with neural nets},
	journal={Applied Mathematics Letters},
	volume={4},
	number={6},
	pages={77--80},
	year={1991}
}
@article{Siegelmann-Sontag-1995,
	author={Siegelmann, H. T. and Sontag, E. D.},
	title={On the computational power of neural nets},
	journal={Journal of Computer and System Sciences},
	volume={50},
	number={1},
	pages={132--150},
	year={1995}
}
@article{Sietsma-Dow-1991,
	author={Sietsma, J. and Dow, R.},
	title={Creating artificial neural networks that generalize},
	journal={Neural Networks},
	volume={4},
	number={1},
	pages={67--79},
	year={1991}
}
@inproceedings{Simard-et-al-2003,
	author={Simard, P. Y. and Steinkraus, D. and Platt, J. C.},
	title={Best practices for convolutional neural networks applied to visual document analysis},
	booktitle={ICDAR'2003},
	year={2003}
}
@inproceedings{Simard-Graf-1994,
	author={Simard, P. and Graf, H. P.},
	title={Backpropagation without multiplication},
	booktitle={Advances in Neural Information Processing Systems},
	pages={232--239},
	year={1994}
}
@inproceedings{Simard-et-al-1992,
	author={Simard, P. and Victorri, B. and LeCun, Y. and Denker, J.},
	title={Tangent prop - A formalism for specifying selected invariances in an adaptive network},
	booktitle={NIPS'1991},
	year={1992}
}
@inproceedings{Simard-et-al-1993,
	author={Simard, P. Y. and LeCun, Y. and Denker, J.},
	title={Efficient pattern recognition using a new transformation distance},
	booktitle={NIPS'92},
	year={1993}
}
@article{Simard-et-al-1998,
	author={Simard, P. Y. and LeCun, Y. A. and Denker, J. S. and Victorri, B.},
	title={Transformation invariance in pattern recognition — tangent distance and tangent propagation},
	journal={Lecture Notes in Computer Science},
	volume={1524},
	year={1998}
}
@article{Simons-Levin-1998,
	author={Simons, D. J. and Levin, D. T.},
	title={Failure to detect changes to people during a real-world interaction},
	journal={Psychonomic Bulletin \& Review},
	volume={5},
	number={4},
	pages={644--649},
	year={1998}
}
@inproceedings{Simonyan-Zisserman-2015,
	author={Simonyan, K. and Zisserman, A.},
	title={Very deep convolutional networks for large-scale image recognition},
	booktitle={ICLR},
	year={2015}
}
@article{Sjoberg-Ljung-1995,
	author={Sjöberg, J. and Ljung, L.},
	title={Overtraining, regularization and searching for a minimum, with application to neural networks},
	journal={International Journal of Control},
	volume={62},
	number={6},
	pages={1391--1407},
	year={1995}
}
@article{Skinner-1958,
	author={Skinner, B. F.},
	title={Reinforcement today},
	journal={American Psychologist},
	volume={13},
	pages={94--99},
	year={1958}
}
@incollection{Smolensky-1986,
	author={Smolensky, P.},
	title={Information processing in dynamical systems: Foundations of harmony theory},
	editor={Rumelhart, D. E. and McClelland, J. L.},
	booktitle={Parallel Distributed Processing},
	volume={1},
	chapter={6},
	pages={194--281},
	publisher={MIT Press},
	address={Cambridge},
	year={1986}
}
@inproceedings{Snoek-et-al-2012,
	author={Snoek, J. and Larochelle, H. and Adams, R. P.},
	title={Practical {Bayesian} optimization of machine learning algorithms},
	booktitle={NIPS'2012},
	year={2012}
}
@inproceedings{Socher-et-al-2011a,
	author={Socher, R. and Huang, E. H. and Pennington, J. and Ng, A. Y. and Manning, C. D.},
	title={Dynamic pooling and unfolding recursive autoencoders for paraphrase detection},
	booktitle={NIPS'2011},
	year={2011}
}
@inproceedings{Socher-et-al-2011b,
	author={Socher, R. and Manning, C. and Ng, A. Y.},
	title={Parsing natural scenes and natural language with recursive neural networks},
	booktitle={Proceedings of the Twenty-Eighth International Conference on Machine Learning (ICML'2011)},
	year={2011}
}
@inproceedings{Socher-et-al-2011c,
	author={Socher, R. and Pennington, J. and Huang, E. H. and Ng, A. Y. and Manning, C. D.},
	title={Semi-supervised recursive autoencoders for predicting sentiment distributions},
	booktitle={EMNLP'2011},
	year={2011}
}
@inproceedings{Socher-et-al-2013a,
	author={Socher, R. and Perelygin, A. and Wu, J. Y. and Chuang, J. and Manning, C. D. and Ng, A. Y. and Potts, C.},
	title={Recursive deep models for semantic compositionality over a sentiment treebank},
	booktitle={EMNLP'2013},
	year={2013}
}
@inproceedings{Socher-et-al-2013b,
	author={Socher, R. and Ganjoo, M. and Manning, C. D. and Ng, A. Y.},
	title={Zero-shot learning through cross-modal transfer},
	booktitle={27th Annual Conference on Neural Information Processing Systems (NIPS 2013)},
	year={2013}
}
@article{Sohl-Dickstein-et-al-2015,
	author={Sohl-Dickstein, Jascha and Weiss, Eric A. and Maheswaranathan, Niru and Ganguli, Surya},
	title={Deep unsupervised learning using nonequilibrium thermodynamics},
	journal={arXiv preprint arXiv:1503.03585},
	year={2015}
}
@inproceedings{Sohn-et-al-2013,
	author={Sohn, K. and Zhou, G. and Lee, H.},
	title={Learning and selecting features jointly with point-wise gated {Boltzmann} machines},
	booktitle={ICML'2013},
	year={2013}
}
@inproceedings{Solomonoff-1989,
	author={Solomonoff, Ray J.},
	title={A system for incremental learning based on algorithmic probability},
	booktitle={Proceedings of the {Sixth} {Israeli} {Conference} on {Artificial} {Intelligence}, {Computer} {Vision} and {Pattern} {Recognition}},
	pages={515--527},
	year={1989}
}
@article{Sontag-1998,
	author={Sontag, E. D.},
	title={{VC} dimension of neural networks},
	journal={NATO ASI Series F Computer and Systems Sciences},
	volume={168},
	pages={69--96},
	year={1998}
}
@article{Sontag-Sussman-1989,
	author={Sontag, E. D. and Sussman, H. J.},
	title={Backpropagation can give rise to spurious local minima even for networks without hidden layers},
	journal={Complex Systems},
	volume={3},
	pages={91--106},
	year={1989}
}
@book{Sparkes-1996,
	author={Sparkes, B.},
	title={The Red and the Black: Studies in Greek Pottery},
	publisher={Routledge},
	year={1996}
}
@inproceedings{Spitkovsky-et-al-2010,
	author={Spitkovsky, V. I. and Alshawi, H. and Jurafsky, D.},
	title={From baby steps to leapfrog: how “less is more” in unsupervised dependency parsing},
	booktitle={HLT'10},
	year={2010}
}
@article{Squire-Trapp-1998,
	author={Squire, W. and Trapp, G.},
	title={Using complex variables to estimate derivatives of real functions},
	journal={SIAM Review},
	volume={40},
	number={1},
	pages={110--112},
	year={1998}
}
@inproceedings{Srebro-Shraibman-2005,
	author={Srebro, N. and Shraibman, A.},
	title={Rank, trace-norm and max-norm},
	booktitle={Proceedings of the 18th Annual Conference on Learning Theory},
	pages={545--560},
	publisher={Springer-Verlag},
	year={2005}
}
@mastersthesis{Srivastava-2013,
	author={Srivastava, N.},
	title={Improving Neural Networks With Dropout},
	school={University of Toronto},
	year={2013}
}
@inproceedings{Srivastava-Salakhutdinov-2012,
	author={Srivastava, N. and Salakhutdinov, R.},
	title={Multimodal learning with deep {Boltzmann} machines},
	booktitle={NIPS'2012},
	year={2012}
}
@article{Srivastava-et-al-2013,
	author={Srivastava, N. and Salakhutdinov, R. R. and Hinton, G. E.},
	title={Modeling documents with deep {Boltzmann} machines},
	journal={arXiv preprint arXiv:1309.6865},
	year={2013}
}
@article{Srivastava-et-al-2014,
	author={Srivastava, N. and Hinton, G. and Krizhevsky, A. and Sutskever, I. and Salakhutdinov, R.},
	title={Dropout: A simple way to prevent neural networks from overfitting},
	journal={Journal of Machine Learning Research},
	volume={15},
	pages={1929--1958},
	year={2014}
}
@article{Srivastava-et-al-2015,
	author={Srivastava, R. K. and Greff, K. and Schmidhuber, J.},
	title={Highway networks},
	journal={arXiv:1505.00387},
	year={2015}
}
@inproceedings{Steinkrau-et-al-2005,
	author={Steinkraus, D. and Simard, P. Y. and Buck, I.},
	title={Using {GPUs} for machine learning algorithms},
	booktitle={8th International Conference on Document Analysis and Recognition},
	pages={1115--1119},
	year={2005}
}
@inproceedings{Stoyanov-et-al-2011,
	author={Stoyanov, V. and Ropson, A. and Eisner, J.},
	title={Empirical risk minimization of graphical model parameters given approximate inference, decoding, and model structure},
	booktitle={Proceedings of the 14th International Conference on Artificial Intelligence and Statistics (AISTATS)},
	series={JMLR Workshop and Conference Proceedings},
	volume={15},
	pages={725--733},
	address={Fort Lauderdale},
	note={Supplementary material (4 pages) also available},
	year={2011}
}
@article{Sukhbaatar-et-al-2015,
	author={Sukhbaatar, S. and Szlam, A. and Weston, J. and Fergus, R.},
	title={Weakly supervised memory networks},
	journal={arXiv preprint arXiv:1503.08895},
	year={2015}
}
@inproceedings{Supancic-Ramanan-2013,
	author={Supancic, J. and Ramanan, D.},
	title={Self-paced learning for long-term tracking},
	booktitle={CVPR'2013},
	year={2013}
}
@article{Sussillo-2014,
	author={Sussillo, D.},
	title={Random walks: Training very deep nonlinear feed-forward networks with smart initialization},
	journal={CoRR, abs/1412.6558},
	year={2014}
}
@phdthesis{Sutskever-2012,
	author={Sutskever, I.},
	title={Training Recurrent Neural Networks},
	school={Department of Computer Science, University of Toronto},
	year={2012}
}
@article{Sutskever-Hinton-2008,
	author={Sutskever, I. and Hinton, G. E.},
	title={Deep narrow sigmoid belief networks are universal approximators},
	journal={Neural Computation},
	volume={20},
	number={11},
	pages={2629--2636},
	year={2008}
}
@inproceedings{Sutskever-Tieleman-2010,
	author={Sutskever, I. and Tieleman, T.},
	title={On the Convergence Properties of Contrastive Divergence},
	editor={Teh, Y. W. and Titterington, M.},
	booktitle={International Conference on Artificial Intelligence and Statistics (AISTATS)},
	volume={9},
	pages={789--795},
	year={2010}
}
@inproceedings{Sutskever-et-al-2009,
	author={Sutskever, I. and Hinton, G. and Taylor, G.},
	title={The recurrent temporal restricted {Boltzmann} machine},
	booktitle={NIPS'2008},
	year={2009}
}
@inproceedings{Sutskever-et-al-2011,
	author={Sutskever, I. and Martens, J. and Hinton, G. E.},
	title={Generating text with recurrent neural networks},
	booktitle={ICML'2011},
	pages={1017--1024},
	year={2011}
}
@inproceedings{Sutskever-et-al-2013,
	author={Sutskever, I. and Martens, J. and Dahl, G. and Hinton, G.},
	title={On the importance of initialization and momentum in deep learning},
	booktitle={ICML},
	year={2013}
}
@inproceedings{Sutskever-et-al-2014,
	author={Sutskever, I. and Vinyals, O. and Le, Q. V.},
	title={Sequence to sequence learning with neural networks},
	booktitle={NIPS'2014, arXiv:1409.3215},
	year={2014}
}
@book{Sutton-Barto-1998,
	author={Sutton, R. and Barto, A.},
	title={Reinforcement Learning: An Introduction},
	publisher={MIT Press},
	year={1998}
}
@inproceedings{Sutton-et-al-2000,
	author={Sutton, R. S. and McAllester, D. and Singh, S. and Mansour, Y.},
	title={Policy gradient methods for reinforcement learning with function approximation},
	booktitle={NIPS'1999},
	pages={1057--1063},
	publisher={MIT Press},
	year={2000}
}
@inproceedings{Swersky-et-al-2011,
	author={Swersky, K. and Ranzato, M. and Buchman, D. and Marlin, B. and de Freitas, N.},
	title={On autoencoders and score matching for energy based models},
	booktitle={ICML'2011},
	publisher={ACM},
	year={2011}
}
@article{Swersky-et-al-2014,
	author={Swersky, K. and Snoek, J. and Adams, R. P.},
	title={Freeze-thaw {Bayesian} optimization},
	journal={arXiv preprint arXiv:1406.3896},
	year={2014}
}
@article{Szegedy-et-al-2014a,
	author={Szegedy, C. and Liu, W. and Jia, Y. and Sermanet, P. and Reed, S. and Anguelov, D. and Erhan, D. and Vanhoucke, V. and Rabinovich, A.},
	title={Going deeper with convolutions},
	journal={Technical report, arXiv:1409.4842},
	year={2014}
}
@article{Szegedy-et-al-2014b,
	author={Szegedy, C. and Zaremba, W. and Sutskever, I. and Bruna, J. and Erhan, D. and Goodfellow, I. J. and Fergus, R.},
	title={Intriguing properties of neural networks},
	journal={ICLR, abs/1312.6199},
	year={2014}
}
@article{Szegedy-et-al-2015,
	author={Szegedy, C. and Vanhoucke, V. and Ioffe, S. and Shlens, J. and Wojna, Z.},
	title={Rethinking the {Inception} Architecture for Computer Vision},
	journal={arXiv preprint arXiv:1512.00567},
	year={2015}
}
@inproceedings{Taigman-et-al-2014,
	author={Taigman, Y. and Yang, M. and Ranzato, M. and Wolf, L.},
	title={{DeepFace}: Closing the gap to human-level performance in face verification},
	booktitle={CVPR'2014},
	year={2014}
}
@book{Tandy-1997,
	author={Tandy, D. W.},
	title={Works and Days: A Translation and Commentary for the Social Sciences},
	publisher={University of California Press},
	year={1997}
}
@inproceedings{Tang-Eliasmith-2010,
	author={Tang, Y. and Eliasmith, C.},
	title={Deep networks for robust visual recognition},
	booktitle={Proceedings of the 27th International Conference on Machine Learning, June 21-24, 2010, Haifa, Israel},
	year={2010}
}
@article{Tang-et-al-2012,
	author={Tang, Y. and Salakhutdinov, R. and Hinton, G.},
	title={Deep mixtures of factor analysers},
	journal={arXiv preprint arXiv:1206.4635},
	year={2012}
}
@inproceedings{Taylor-Hinton-2009,
	author={Taylor, G. and Hinton, G.},
	title={Factored conditional restricted {Boltzmann} machines for modeling motion style},
	editor={Bottou, L. and Littman, M.},
	booktitle={Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)},
	pages={1025--1032},
	publisher={ACM},
	address={Montreal, Quebec, Canada},
	year={2009}
}
@inproceedings{Taylor-et-al-2007,
	author={Taylor, G. and Hinton, G. E. and Roweis, S.},
	title={Modeling human motion using binary latent variables},
	editor={Schölkopf, B. and Platt, J. and Hoffman, T.},
	booktitle={Advances in Neural Information Processing Systems 19 (NIPS'06)},
	pages={1345--1352},
	publisher={MIT Press},
	address={Cambridge, MA},
	year={2007}
}
@article{Teh-et-al-2003,
	author={Teh, Y. and Welling, M. and Osindero, S. and Hinton, G. E.},
	title={Energy-based models for sparse overcomplete representations},
	journal={Journal of Machine Learning Research},
	volume={4},
	pages={1235--1260},
	year={2003}
}
@article{Tenenbaum-et-al-2000,
	author={Tenenbaum, J. and de Silva, V. and Langford, J. C.},
	title={A global geometric framework for nonlinear dimensionality reduction},
	journal={Science},
	volume={290},
	number={5500},
	pages={2319--2323},
	year={2000}
}
@article{Theis-et-al-2015,
	author={Theis, L. and van den Oord, A. and Bethge, M.},
	title={A note on the evaluation of generative models},
	journal={arXiv:1511.01844},
	year={2015}
}
@inproceedings{Thompson-et-al-2014,
	author={Tompson, J. and Jain, A. and LeCun, Y. and Bregler, C.},
	title={Joint training of a convolutional network and a graphical model for human pose estimation},
	booktitle={NIPS'2014},
	year={2014}
}
@inproceedings{Thrun-1995,
	author={Thrun, S.},
	title={Learning to play the game of chess},
	booktitle={NIPS'1994},
	year={1995}
}
@article{Tibshirani-1995,
	author={Tibshirani, R. J.},
	title={Regression shrinkage and selection via the lasso},
	journal={Journal of the Royal Statistical Society B},
	volume={58},
	pages={267--288},
	year={1995}
}
@inproceedings{Tieleman-2008,
	author={Tieleman, T.},
	title={Training restricted {Boltzmann} machines using approximations to the likelihood gradient},
	editor={Cohen, W. W. and McCallum, A. and Roweis, S. T.},
	booktitle={Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
	pages={1064--1071},
	publisher={ACM},
	year={2008}
}
@inproceedings{Tieleman-Hinton-2009,
	author={Tieleman, T. and Hinton, G.},
	title={Using fast weights to improve persistent contrastive divergence},
	editor={Bottou, L. and Littman, M.},
	booktitle={Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)},
	pages={1033--1040},
	publisher={ACM},
	year={2009}
}
@article{Tipping-Bishop-1999,
	author={Tipping, M. E. and Bishop, C. M.},
	title={Probabilistic principal components analysis},
	journal={Journal of the Royal Statistical Society B},
	volume={61},
	number={3},
	pages={611--622},
	year={1999}
}
@inproceedings{Torralba-et-al-2008,
	author={Torralba, A. and Fergus, R. and Weiss, Y.},
	title={Small codes and large databases for recognition},
	booktitle={Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR'08)},
	pages={1--8},
	year={2008}
}
@inproceedings{Touretzky-Minton-1985,
	author={Touretzky, D. S. and Hinton, G. E.},
	title={Symbols among the neurons: Details of a connectionist inference architecture},
	booktitle={Proceedings of the 9th International Joint Conference on Artificial Intelligence, IJCAI'85},
	volume={1},
	pages={238--243},
	publisher={Morgan Kaufmann Publishers Inc.},
	address={San Francisco, CA, USA},
	year={1985}
}
@inproceedings{Tu-Honavar-2011,
	author={Tu, K. and Honavar, V.},
	title={On the utility of curricula in unsupervised learning of probabilistic grammars},
	booktitle={IJCAI'2011},
	year={2011}
}
@article{Turaga-et-al-2010,
	author={Turaga, S. C. and Murray, J. F. and Jain, V. and Roth, F. and Helmstaedter, M. and Briggman, K. and Denk, W. and Seung, H. S.},
	title={Convolutional networks can learn to generate affinity graphs for image segmentation},
	journal={Neural Computation},
	volume={22},
	number={2},
	pages={511--538},
	year={2010}
}
@inproceedings{Turian-et-al-2010,
	author={Turian, J. and Ratinov, L. and Bengio, Y.},
	title={Word representations: A simple and general method for semi-supervised learning},
	booktitle={Proc. ACL'2010},
	pages={384--394},
	year={2010}
}
@article{Toscher-et-al-2009,
	author={Töscher, Andreas and Jahrer, Michael and Bell, Robert M.},
	title={The {BigChaos} solution to the {Netflix} grand prize},
	journal={Netflix prize documentation},
	pages={1--52},
	year={2009},
	url={http://www.stat.osu.edu/~dmsl/GrandPrize2009_BPC_BigChaos.pdf}
}
@inproceedings{Uria-et-al-2013,
	author={Uria, B. and Murray, I. and Larochelle, H.},
	title={{RNADE}: The real-valued neural autoregressive density-estimator},
	booktitle={NIPS'2013},
	year={2013}
}
@inproceedings{van-den-Oord-et-al-2013,
	author={van den Oord, A. and Dieleman, S. and Schrauwen, B.},
	title={Deep content-based music recommendation},
	booktitle={NIPS'2013},
	year={2013}
}
@article{van-der-Maaten-Hinton-2008,
	author={van der Maaten, L. and Hinton, G. E.},
	title={Visualizing data using {t-SNE}},
	journal={Journal of Machine Learning Research},
	volume={9},
	pages={2579--2605},
	year={2008}
}
@inproceedings{Vanhoucke-et-al-2011,
	author={Vanhoucke, V. and Senior, A. and Mao, M. Z.},
	title={Improving the speed of neural networks on {CPUs}},
	booktitle={Proc. Deep Learning and Unsupervised Feature Learning NIPS Workshop},
	year={2011}
}
@book{Vapnik-1982,
	author={Vapnik, V. N.},
	title={Estimation of Dependences Based on Empirical Data},
	publisher={Springer-Verlag},
	address={Berlin},
	year={1982}
}
@book{Vapnik-1995,
	author={Vapnik, V. N.},
	title={The Nature of Statistical Learning Theory},
	publisher={Springer},
	address={New York},
	year={1995}
}
@article{Vapnik-Chervonenkis-1971,
	author={Vapnik, V. N. and Chervonenkis, A. Y.},
	title={On the uniform convergence of relative frequencies of events to their probabilities},
	journal={Theory of Probability and Its Applications},
	volume={16},
	pages={264--280},
	year={1971}
}
@article{Vincent-2011,
	author={Vincent, P.},
	title={A connection between score matching and denoising autoencoders},
	journal={Neural Computation},
	volume={23},
	number={7},
	pages={1661--1674},
	year={2011}
}
@inproceedings{Vincent-Bengio-2003,
	author={Vincent, P. and Bengio, Y.},
	title={Manifold {Parzen} windows},
	booktitle={NIPS'2002},
	publisher={MIT Press},
	year={2003}
}
@inproceedings{Vincent-et-al-2008,
	author={Vincent, P. and Larochelle, H. and Bengio, Y. and Manzagol, P.-A.},
	title={Extracting and composing robust features with denoising autoencoders},
	booktitle={ICML 2008},
	year={2008}
}
@article{Vincent-et-al-2010,
	author={Vincent, P. and Larochelle, H. and Lajoie, I. and Bengio, Y. and Manzagol, P.-A.},
	title={Stacked denoising autoencoders: Learning useful representations in a deep network with a local denoising criterion},
	journal={Journal of Machine Learning Research},
	volume={11},
	pages={3371--3408},
	year={2010}
}
@inproceedings{Vincent-et-al-2015,
	author={Vincent, P. and de Brébisson, A. and Bouthillier, X.},
	title={Efficient exact gradient update for training deep networks with very large sparse targets},
	editor={Cortes, C. and Lawrence, N. D. and Lee, D. D. and Sugiyama, M. and Garnett, R.},
	booktitle={Advances in Neural Information Processing Systems 28},
	pages={1108--1116},
	publisher={Curran Associates, Inc},
	year={2015}
}
@article{Vinyals-et-al-2014a,
	author={Vinyals, O. and Kaiser, L. and Koo, T. and Petrov, S. and Sutskever, I. and Hinton, G.},
	title={Grammar as a foreign language},
	journal={Technical report, arXiv:1412.7449},
	year={2014}
}
@article{Vinyals-et-al-2014b,
	author={Vinyals, O. and Toshev, A. and Bengio, S. and Erhan, D.},
	title={Show and tell: a neural image caption generator},
	journal={arXiv:1411.4555},
	year={2014}
}
@article{Vinyals-et-al-2015a,
	author={Vinyals, O. and Fortunato, M. and Jaitly, N.},
	title={Pointer networks},
	journal={arXiv preprint arXiv:1506.03134},
	year={2015}
}
@inproceedings{Vinyals-et-al-2015b,
	author={Vinyals, O. and Toshev, A. and Bengio, S. and Erhan, D.},
	title={Show and tell: a neural image caption generator},
	booktitle={CVPR'2015},
	note={arXiv:1411.4555},
	year={2015}
}
@inproceedings{Viola-Jones-2001,
	author={Viola, P. and Jones, M.},
	title={Robust real-time object detection},
	booktitle={International Journal of Computer Vision},
	year={2001}
}
@article{Visin-et-al-2015,
	author={Visin, F. and Kastner, K. and Cho, K. and Matteucci, M. and Courville, A. and Bengio, Y.},
	title={{ReNet}: A recurrent neural network based alternative to convolutional networks},
	journal={arXiv preprint arXiv:1505.00393},
	year={2015}
}
@article{Von-Melchner-et-al-2000,
	author={Von Melchner, L. and Pallas, S. L. and Sur, M.},
	title={Visual behaviour mediated by retinal projections directed to the auditory pathway},
	journal={Nature},
	volume={404},
	number={6780},
	pages={871--876},
	year={2000}
}
@inproceedings{Wager-et-al-2013,
	author={Wager, S. and Wang, S. and Liang, P.},
	title={Dropout training as adaptive regularization},
	booktitle={Advances in Neural Information Processing Systems 26},
	pages={351--359},
	year={2013}
}
@article{Waibel-et-al-1989,
	author={Waibel, A. and Hanazawa, T. and Hinton, G. E. and Shikano, K. and Lang, K.},
	title={Phoneme recognition using time-delay neural networks},
	journal={IEEE Transactions on Acoustics, Speech, and Signal Processing},
	volume={37},
	number={3},
	pages={328--339},
	year={1989}
}
@inproceedings{Wan-et-al-2013,
	author={Wan, L. and Zeiler, M. and Zhang, S. and LeCun, Y. and Fergus, R.},
	title={Regularization of neural networks using dropconnect},
	booktitle={ICML'2013},
	year={2013}
}
@inproceedings{Wang-Manning-2013,
	author={Wang, S. and Manning, C.},
	title={Fast dropout training},
	booktitle={ICML'2013},
	year={2013}
}
@inproceedings{Wang-et-al-2014a,
	author={Wang, Z. and Zhang, J. and Feng, J. and Chen, Z.},
	title={Knowledge graph and text jointly embedding},
	booktitle={Proc. EMNLP'2014},
	year={2014}
}
@inproceedings{Wang-et-al-2014b,
	author={Wang, Z. and Zhang, J. and Feng, J. and Chen, Z.},
	title={Knowledge graph embedding by translating on hyperplanes},
	booktitle={Proc. AAAI'2014},
	year={2014}
}
@inproceedings{WardeFarley-et-al-2014,
	author={Warde-Farley, D. and Goodfellow, I. J. and Courville, A. and Bengio, Y.},
	title={An empirical analysis of dropout in piecewise linear networks},
	booktitle={ICLR'2014},
	year={2014}
}
@article{Wawrzynek-et-al-1996,
	author={Wawrzynek, J. and Asanovic, K. and Kingsbury, B. and Johnson, D. and Beck, J. and Morgan, N.},
	title={{Spert-II}: A vector microprocessor system},
	journal={Computer},
	volume={29},
	number={3},
	pages={79--86},
	year={1996}
}
@inproceedings{Weaver-Tao-2001,
	author={Weaver, L. and Tao, N.},
	title={The optimal reward baseline for gradient-based reinforcement learning},
	booktitle={Proc. UAI'2001},
	pages={538--545},
	year={2001}
}
@inproceedings{Weinberger-Saul-2004,
	author={Weinberger, K. Q. and Saul, L. K.},
	title={Unsupervised learning of image manifolds by semidefinite programming},
	booktitle={CVPR'2004},
	pages={988--995},
	year={2004}
}
@inproceedings{Weiss-et-al-2008,
	author={Weiss, Y. and Torralba, A. and Fergus, R.},
	title={Spectral hashing},
	booktitle={NIPS},
	pages={1753--1760},
	year={2008}
}
@inproceedings{Welling-et-al-2002,
	author={Welling, M. and Zemel, R. S. and Hinton, G. E.},
	title={Self supervised boosting},
	booktitle={Advances in Neural Information Processing Systems},
	pages={665--672},
	year={2002}
}
@inproceedings{Welling-et-al-2003a,
	author={Welling, M. and Hinton, G. E. and Osindero, S.},
	title={Learning sparse topographic representations with products of {Student-t} distributions},
	booktitle={NIPS'2002},
	year={2003}
}
@inproceedings{Welling-et-al-2003b,
	author={Welling, M. and Zemel, R. and Hinton, G. E.},
	title={Self-supervised boosting},
	editor={Becker, S. and Thrun, S. and Obermayer, K.},
	booktitle={Advances in Neural Information Processing Systems 15 (NIPS'02)},
	pages={665--672},
	publisher={MIT Press},
	year={2003}
}
@inproceedings{Welling-et-al-2005,
	author={Welling, M. and Rosen-Zvi, M. and Hinton, G. E.},
	title={Exponential family harmoniums with an application to information retrieval},
	editor={Saul, L. and Weiss, Y. and Bottou, L.},
	booktitle={Advances in Neural Information Processing Systems 17 (NIPS'04)},
	volume={17},
	publisher={MIT Press},
	address={Cambridge, MA},
	year={2005}
}
@inproceedings{Werbos-1981,
	author={Werbos, P. J.},
	title={Applications of advances in nonlinear sensitivity analysis},
	booktitle={Proceedings of the 10th IFIP Conference, 31.8 - 4.9, NYC},
	pages={762--770},
	year={1981}
}
@article{Weston-et-al-2010,
	author={Weston, J. and Bengio, S. and Usunier, N.},
	title={Large scale image annotation: learning to rank with joint word-image embeddings},
	journal={Machine Learning},
	volume={81},
	number={1},
	pages={21--35},
	year={2010}
}
@article{Weston-et-al-2014,
	author={Weston, J. and Chopra, S. and Bordes, A.},
	title={Memory networks},
	journal={arXiv preprint arXiv:1410.3916},
	year={2014}
}
@inproceedings{Widrow-Hoff-1960,
	author={Widrow, B. and Hoff, M. E.},
	title={Adaptive switching circuits},
	booktitle={1960 IRE WESCON Convention Record},
	volume={4},
	pages={96--104},
	publisher={IRE},
	address={New York},
	year={1960}
}
@misc{Wikipedia-2015,
	author={{Wikipedia}},
	title={List of animals by number of neurons — {Wikipedia}, the free encyclopedia},
	note={[Online; accessed 4-March-2015]},
	year={2015}
}
@article{Williams-Agakov-2002,
	author={Williams, C. K. I. and Agakov, F. V.},
	title={Products of {Gaussians} and Probabilistic Minor Component Analysis},
	journal={Neural Computation},
	volume={14},
	number={5},
	pages={1169--1182},
	year={2002}
}
@inproceedings{Williams-Rasmussen-1996,
	author={Williams, C. K. I. and Rasmussen, C. E.},
	title={{Gaussian} processes for regression},
	editor={Touretzky, D. and Mozer, M. and Hasselmo, M.},
	booktitle={Advances in Neural Information Processing Systems 8 (NIPS'95)},
	pages={514--520},
	publisher={MIT Press},
	address={Cambridge, MA},
	year={1996}
}
@article{Williams-1992,
	author={Williams, R. J.},
	title={Simple statistical gradient-following algorithms for connectionist reinforcement learning},
	journal={Machine Learning},
	volume={8},
	pages={229--256},
	year={1992}
}
@article{Williams-Zipser-1989,
	author={Williams, R. J. and Zipser, D.},
	title={A learning algorithm for continually running fully recurrent neural networks},
	journal={Neural Computation},
	volume={1},
	pages={270--280},
	year={1989}
}
@article{Wilson-Martinez-2003,
	author={Wilson, D. R. and Martinez, T. R.},
	title={The general inefficiency of batch training for gradient descent learning},
	journal={Neural Networks},
	volume={16},
	number={10},
	pages={1429--1451},
	year={2003}
}
@article{Wilson-1984,
	author={Wilson, J. R.},
	title={Variance reduction techniques for digital simulation},
	journal={American Journal of Mathematical and Management Sciences},
	volume={4},
	number={3},
	pages={277--312},
	year={1984}
}
@article{Wiskott-Sejnowski-2002,
	author={Wiskott, L. and Sejnowski, T. J.},
	title={Slow feature analysis: Unsupervised learning of invariances},
	journal={Neural Computation},
	volume={14},
	number={4},
	pages={715--770},
	year={2002}
}
@article{Wolpert-MacReady-1997,
	author={Wolpert, D. and MacReady, W.},
	title={No free lunch theorems for optimization},
	journal={IEEE Transactions on Evolutionary Computation},
	volume={1},
	pages={67--82},
	year={1997}
}
@article{Wolpert-1996,
	author={Wolpert, D. H.},
	title={The lack of a priori distinction between learning algorithms},
	journal={Neural Computation},
	volume={8},
	number={7},
	pages={1341--1390},
	year={1996}
}
@article{Wu-et-al-2015,
	author={Wu, R. and Yan, S. and Shan, Y. and Dang, Q. and Sun, G.},
	title={Deep image: Scaling up image recognition},
	journal={arXiv:1501.02876},
	year={2015}
}
@article{Wu-1997,
	author={Wu, Z.},
	title={Global continuation for distance geometry problems},
	journal={SIAM Journal on Optimization},
	volume={7},
	pages={814--836},
	year={1997}
}
@article{Xiong-et-al-2011,
	author={Xiong, H. Y. and Barash, Y. and Frey, B. J.},
	title={{Bayesian} prediction of tissue-regulated splicing using {RNA} sequence and cellular context},
	journal={Bioinformatics},
	volume={27},
	number={18},
	pages={2554--2562},
	year={2011}
}
@inproceedings{Xu-et-al-2015,
	author={Xu, K. and Ba, J. L. and Kiros, R. and Cho, K. and Courville, A. and Salakhutdinov, R. and Zemel, R. S. and Bengio, Y.},
	title={Show, attend and tell: Neural image caption generation with visual attention},
	booktitle={ICML'2015, arXiv:1502.03044},
	year={2015}
}
@article{Yildiz-et-al-2012,
	author={Yildiz, I. B. and Jaeger, H. and Kiebel, S. J.},
	title={Re-visiting the echo state property},
	journal={Neural Networks},
	volume={35},
	pages={1--9},
	year={2012}
}
@inproceedings{Yosinski-et-al-2014,
	author={Yosinski, J. and Clune, J. and Bengio, Y. and Lipson, H.},
	title={How transferable are features in deep neural networks?},
	booktitle={NIPS'2014},
	year={2014}
}
@inproceedings{Younes-1998,
	author={Younes, L.},
	title={On the convergence of {Markovian} stochastic algorithms with rapidly decreasing ergodicity rates},
	booktitle={Stochastics and Stochastics Models},
	pages={177--228},
	year={1998}
}
@article{Yu-et-al-2010,
	author={Yu, D. and Wang, S. and Deng, L.},
	title={Sequential labeling using deep-structured conditional random fields},
	journal={IEEE Journal of Selected Topics in Signal Processing},
	year={2010}
}
@article{Zaremba-Sutskever-2014,
	author={Zaremba, W. and Sutskever, I.},
	title={Learning to execute},
	journal={arXiv:1410.4615},
	year={2014}
}
@article{Zaremba-Sutskever-2015,
	author={Zaremba, W. and Sutskever, I.},
	title={Reinforcement learning neural {Turing} machines},
	journal={arXiv:1505.00521},
	year={2015}
}
@book{Zaslavsky-1975,
	author={Zaslavsky, T.},
	title={Facing Up to Arrangements: Face-Count Formulas for Partitions of Space by Hyperplanes},
	series={Memoirs of the American Mathematical Society},
	number={154},
	publisher={American Mathematical Society},
	year={1975}
}
@inproceedings{Zeiler-Fergus-2014,
	author={Zeiler, M. D. and Fergus, R.},
	title={Visualizing and understanding convolutional networks},
	booktitle={ECCV'14},
	year={2014}
}
@inproceedings{Zeiler-et-al-2013,
	author={Zeiler, M. D. and Ranzato, M. and Monga, R. and Mao, M. and Yang, K. and Le, Q. and Nguyen, P. and Senior, A. and Vanhoucke, V. and Dean, J. and Hinton, G. E.},
	title={On rectified linear units for speech processing},
	booktitle={ICASSP 2013},
	year={2013}
}
@inproceedings{Zhou-et-al-2015,
	author={Zhou, B. and Khosla, A. and Lapedriza, A. and Oliva, A. and Torralba, A.},
	title={Object detectors emerge in deep scene {CNNs}},
	booktitle={ICLR'2015, arXiv:1412.6856},
	year={2015}
}
@inproceedings{Zhou-Troyanskaya-2014,
	author={Zhou, J. and Troyanskaya, O. G.},
	title={Deep supervised and convolutional generative stochastic network for protein secondary structure prediction},
	booktitle={ICML'2014},
	year={2014}
}
@inproceedings{Zhou-Chellappa-1988,
	author={Zhou, Y. and Chellappa, R.},
	title={Computation of optical flow using a neural network},
	booktitle={IEEE International Conference on Neural Networks},
	pages={71--78},
	publisher={IEEE},
	year={1988}
}
@inproceedings{Zohrer-Pernkopf-2014,
	author={Zöhrer, M. and Pernkopf, F.},
	title={General stochastic networks for classification},
	booktitle={NIPS'2014},
	year={2014}
}