author = {Marcin Junczys-Dowmunt and Tomasz Dwojak and Hieu Hoang},
  title = {Is Neural Machine Translation Ready for Deployment? A Case Study on 30 Translation Directions},
  booktitle = {arXiv},
  month = {October},
  year = {2016},
  url = {}
  author = {Junczys-Dowmunt, Marcin and Grundkiewicz, Roman},
  title = {Phrase-based Machine Translation is State-of-the-Art for Automatic Grammatical Error Correction},
  booktitle = {Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing},
  month = {November},
  year = {2016},
  address = {Austin, USA},
  publisher = {Association for Computational Linguistics},
  pages = {1546--1556},
  url = {}
  author = {Hieu Hoang and Nikolay Bogoychev and Lane Schwartz and Marcin Junczys-Dowmunt},
  title = {Fast, Scalable Phrase-Based SMT Decoding},
  booktitle = {Proceedings of the Association for Machine Translation in the Americas 2016},
  month = {October},
  year = {2016},
  address = {Austin, USA},
  publisher = {AMTA}
  author = {Junczys-Dowmunt, Marcin  and  Grundkiewicz, Roman},
  title = {Log-linear Combinations of Monolingual and Bilingual Neural Machine Translation Models for Automatic Post-Editing},
  booktitle = {Proceedings of the First Conference on Machine Translation},
  month = {August},
  year = {2016},
  address = {Berlin, Germany},
  publisher = {Association for Computational Linguistics},
  pages = {751--758},
  url = {}
  author = {Marcin Junczys{-}Dowmunt and
               Tomasz Dwojak and
               Rico Sennrich},
  title = {The {AMU-UEDIN} Submission to the {WMT16} News Translation Task: Attention-based
               {NMT} Models as Feature Functions in Phrase-based {SMT}},
  booktitle = {Proceedings of the First Conference on Machine Translation, {WMT}
               2016, colocated with {ACL} 2016, August 11-12, Berlin, Germany},
  pages = {319--325},
  year = {2016},
  url = {}
  author = {Ales Tamchyna and
               Alexander M. Fraser and
               Ondrej Bojar and
               Marcin Junczys{-}Dowmunt},
  title = {Target-Side Context for Discriminative Models in Statistical Machine Translation},
  booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational
               Linguistics, {ACL} 2016, August 7-12, 2016, Berlin, Germany, Volume
               1: Long Papers},
  year = {2016},
  url = {},
  abstract = {Discriminative translation models utilizing source context have been shown to help statistical machine translation performance. We propose a novel extension of
this work using target context information. Surprisingly, we show that this model can be efficiently integrated directly in the decoding process. Our approach scales to
large training data sizes and results in consistent improvements in translation quality on four language pairs. We also provide an analysis comparing the strengths
of the baseline source-context model with our extended source-context and target-context model and we show that our extension allows us to better capture morphological coherence. Our work is freely
available as part of Moses.}
  author = {Marcin Junczys-Dowmunt and Bruno Pouliquen and Christophe Mazenc},
  title = {COPPA V2.0: Corpus of Parallel Patent Applications. Building Large Parallel Corpora with GNU Make},
  booktitle = {Proceedings of the 4th Workshop on Challenges in the Management of Large Corpora, Portoro{\v{z}}, Slovenia, May 23-28, 2016},
  year = {2016},
  url = {}
  author = {Michal Ziemski and
               Marcin Junczys{-}Dowmunt and
               Bruno Pouliquen},
  title = {The United Nations Parallel Corpus v1.0},
  booktitle = {Proceedings of the Tenth International Conference on Language Resources
               and Evaluation {LREC} 2016, Portoro{\v{z}}, Slovenia, May 23-28, 2016},
  year = {2016},
  url = {}
  author = {Grundkiewicz, Roman  and  Junczys-Dowmunt, Marcin  and  Gillian, Edward},
  title = {Human Evaluation of Grammatical Error Correction Systems},
  booktitle = {Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing},
  month = {September},
  year = {2015},
  address = {Lisbon, Portugal},
  publisher = {Association for Computational Linguistics},
  pages = {461--470},
  url = {},
  abstract = {The paper presents the results of the first large-scale human evaluation of automatic grammatical error correction (GEC) systems. Twelve participating systems and the unchanged input of the CoNLL-2014 shared task have been reassessed in a WMT-inspired human evaluation procedure. Methods introduced for the Workshop of Machine Translation evaluation campaigns have been adapted to GEC and extended where necessary. 
The produced rankings are used to evaluate standard metrics for grammatical error correction in terms of correlation with human judgment.}
  title = {Large Scale Speech-to-Text Translation with Out-of-Domain Corpora Using Better Context-Based Models and Domain Adaptation},
  author = {Junczys-Dowmunt, Marcin and Przybysz, Pawe{\l} and Staszuk, Arleta and Kim, Eun-Kyoung and Lee, Jaewon},
  booktitle = {Sixteenth Annual Conference of the International Speech Communication Association},
  url = {},
  abstract = {In this paper, we describe the process of building a large-scale speech-to-text pipeline. Two target domains,
  daily conversations and travel-related conversations between two agents, for the English-German language pair (both directions) are examined.
  The SMT component is built from out-of-domain but freely-available bilingual and monolingual data. We make use of most of the known
  available resources to examine the effects of unrestricted data and large scale models. A naive baseline delivers solid results in terms
  of MT-quality. Extending the baseline with context-based translation model features like operations sequence models, higher-order class-based
  language models, and additional web-scale word-based language models leads to a system that significantly outperforms the baseline.
  Domain adaptation is performed by separately weighting the influence of the out-of-domain subcorpora. This is explored for
  translation models and language models yielding significant improvements in both cases. Automatic and manual evaluation results are
  provided for raw MT-quality and ASR+MT-quality.},
  year = {2015}
  title = {SMT at the International Maritime Organization: Experiences with Combining In-house Corpora with Out-of-domain Corpora},
  author = {Bruno Pouliquen and Marcin Junczys-Dowmunt and Blanca Pinero and Micha{\l} Ziemski},
  booktitle = {European Association for Machine Translation 2015},
  url = {},
  abstract = {This paper presents a machine translation tool -- based on Moses -- developed for the International Maritime Organization (IMO) for the automatic translation 
  of documents from Spanish, French, Russian and Arabic to/from English. The main challenge lies in the insufficient size of in-house corpora (especially for Russian and Arabic). 
  The United Nations (UN) granted IMO the right to use UN resources and we describe experiments and results we obtained with different translation model combination techniques.
  While BLEU results remain inconclusive for combinations, we also analyze user preferences for certain models (when choosing between IMO only or combined with UN). 
  The combined models are perceived by translators as being much better for general texts while IMO only models seem better for technical texts.},
  year = {2015}
  author = {Roman Grundkiewicz and Marcin Junczys-Dowmunt},
  title = {The WikEd Error Corpus: A Corpus of Corrective Wikipedia Edits
and its Application to Grammatical Error Correction},
  booktitle = {Advances in Natural Language Processing -- Lecture Notes in Computer Science},
  editor = {Adam Przepiórkowski and Maciej Ogrodniczuk},
  publisher = {Springer},
  year = {2014},
  volume = {8686},
  pages = {478--490},
  url = {},
  abstract = {This paper introduces the freely available WikEd Error Corpus. We describe the data mining 
process from Wikipedia revision histories, corpus content and format. The 
corpus consists of more than 12 million sentences with a total of 14 million edits of 
various types. 

As one possible application, we show that WikEd can be successfully 
adapted to improve a strong baseline in an ESL grammatical error correction task 
by 2.63\%. Used together with an ESL error corpus, a composed system gains 
 1.64\% when compared to the ESL-trained system.}
  publisher = {Association for Computational Linguistics},
  author = {Marcin Junczys-Dowmunt and Roman Grundkiewicz},
  title = {The AMU System in the CoNLL-2014 Shared Task: Grammatical Error 
Correction by Data-Intensive and Feature-Rich Statistical Machine Translation},
  booktitle = {Proceedings of the Eighteenth Conference on Computational Natural Language Learning: Shared Task (CoNLL-2014 Shared Task)},
  address = {Baltimore, USA},
  year = {2014},
  url = {},
  pages = {25--33},
  abstract = {
  Statistical machine translation toolkits like Moses have not been 
designed with grammatical error correction in mind. In order to achieve 
competitive results in this area, it is not enough to simply add more data. 
Optimization procedures need to be customized, task-specific features should be 
introduced. Only then can the decoder take advantage of relevant data.

We demonstrate the validity of the above claims by combining web-scale language 
models and large-scale error-corrected texts with parameter tuning according to 
the task metric and correction-specific features. Our system achieves a result 
of 35.0\% F0.5 on the blind CoNLL-2014 test set, ranking on third place. A 
similar system, equipped with 
identical models but without tuned parameters and specialized features, 
stagnates at 25.4\%. 
  author = {Marcin Junczys-Dowmunt and Bruno Pouliquen},
  title = {SMT of German Patents at WIPO: Decompounding and Verb Structure Pre-reordering},
  booktitle = {17th Annual Conference of the European Association for Machine Translation (EAMT)},
  address = {Dubrovnik, Croatia},
  year = {2014},
  pages = {217--220},
  url = {},
  abstract = {We describe fragments of the SMT pipeline at WIPO for German as a
  source language. Two subsystems are discussed in detail: word decompounding
  and verb structure pre-reordering. Apart from automatic evaluation results
  for both subsystems, for the pre-reordering mechanism manual evaluation
  results are reported. }
  author = {Bruno Pouliquen and Cecilia Elizalde and Marcin Junczys-Dowmunt and Christophe Mazenc and José García-Verdugo},
  title = {Large-scale multiple language translation accelerator at the United Nations},
  booktitle = {14th Machine Translation Summit},
  address = {Nice, France},
  year = {2013},
  editor = {K. Sima’an and M. L. Forcada and D. Grasmick and H. Depraetere and A. Way},
  pages = {345--352},
  url = {},
  abstract = {Described is a large-scale implementation of a Moses-based machine
  translation system in the United Nations aiming at accelerating the work of
  translators. The system (called TAPTA4UN) has been trained on extensive
  parallel corpora in 6 languages. Both automatic and human evaluations are
  provided. The system is now used in production by professional translators.
  The technical challenges of scalability and the final evaluation by users are
  also described.}
  author = {Marcin Junczys-Dowmunt},
  title = {Phrasal Rank-Encoding: Exploiting Phrase Redundancy and Translational Relations for Phrase Table Compression},
  journal = {The Prague Bulletin of Mathematical Linguistics},
  volume = {98},
  year = {2012},
  pages = {63--74},
  url = {},
  abstract = {We describe Phrasal Rank-Encoding (PR-Enc), a novel method for the
  compression of word-aligned target language data in phrase tables as used in
  phrase-based SMT. This method reduces the redundancy in phrase tables which is
  a direct effect of the phrase-based approach. A combination of PR-Enc with
  Huffman coding allows to reduce the size of an aggressively compressed phrase
  table by another 39 percent. Using this and other methods for space reduction
  in a new binary phrase table implementation, a size reduction by an order of
  magnitude is achieved when comparing to the Moses on-disk phrase table
  implementation. Concerning decoding speed, all variants of the new phrase
  table are faster than the Moses binary phrase table implementation while the
  PR-Enc encoded variant outperforms all other methods.}
  author = {Marcin Junczys-Dowmunt},
  title = {A Phrase Table without Phrases: Rank Encoding for Better Phrase Table Compression},
  booktitle = {16th Annual Conference of the European Association for Machine Translation (EAMT)},
  year = {2012},
  address = {Trento, Italy},
  url = {},
  pages = {245--252},
  abstract = {This paper describes the first steps towards a minimum-size phrase
  table implementation to be used for phrase-based statistical machine
  translation. The focus lies on the size reduction of target language data in a
  phrase table. Rank Encoding (R-Enc), a novel method for the compression of
  word-aligned target language in phrase tables is presented. Combined with
  Huffman coding a relative size reduction of 56 percent for target phrase words
  and alignment data is achieved when compared to bare Huffman coding without
  R-Enc. In the context of the complete phrase table the size reduction is 22
  author = {Marcin Junczys-Dowmunt},
  title = {A Space-Efficient Phrase Table Implementation Using Minimal Perfect Hash Functions},
  booktitle = {15th International Conference on Text, Speech and Dialogue (TSD)},
  year = {2012},
  series = {Lecture Notes in Computer Science},
  editor = {Sojka, Petr and Horák, Ales and Kopecek, Ivan and Pala, Karel},
  volume = {7499},
  publisher = {Springer},
  url = {},
  pages = {320--327},
  abstract = {We describe the structure of a space-efficient phrase table for
  phrase-based statistical machine translation with the Moses decoder. The new
  phrase table can be used in-memory or be partially mapped on-disk. Compared to
  the standard Moses on-disk phrase table implementation a size reduction by a
  factor of 6 is achieved. The focus of this work lies on the source phrase
  index which is implemented using minimal perfect hash functions. Two methods
  are discussed that reduce the memory consumption of a baseline
  author = {Marcin Junczys-Dowmunt},
  title = {A Genetic Programming Experiment in Natural Language Grammar Engineering},
  booktitle = {15th International Conference on Text, Speech and Dialogue (TSD)},
  editor = {Sojka, Petr and Horák, Ales and Kopecek, Ivan and Pala, Karel},
  year = {2012},
  series = {Lecture Notes in Computer Science},
  volume = {7499},
  address = {Brno, Czech Republic},
  publisher = {Springer},
  url = {},
  pages = {336--344},
  abstract = {This paper describes an experiment in grammar engineering for a
  shallow syntactic parser using Genetic Programming and a treebank. The goal of
  the experiment is to improve the Parseval score of a previously manually
  created seed grammar. We illustrate the adaptation of the Genetic Programming
  paradigm to the problem of grammar engineering. The used genetic operators are
  described. The performance of the evolved grammar after 1,000 generations on
  an unseen test set is improved by 2.7 points F-score (3.7 points on the
  training set). Despite the large number of generations no overfitting effect
  is observed.}
  author = {Graliński, Filip and Jassem, Krzysztof and Junczys-Dowmunt, Marcin},
  journal = {Computational Linguistics - Applications},
  pages = {27--39},
  series = {Studies in Computational Intelligence},
  publisher = {Springer},
  title = {{PSI-Toolkit}: Natural Language Processing Pipeline},
  volume = 458,
  year = 2012,
  url = {},
  abstract = {The paper presents the main ideas and the architecture of the open
  source PSI-Toolkit, a set of linguistic tools being developed within a project
  financed by the Polish Ministry of Science and Higher Education. The toolkit
  is intended for experienced language engineers as well as casual users not
  having any technological background. The former group of users is delivered a
  set of libraries that may be included in their Perl, Python or Java
  applications. The needs of the latter group should be satisfied by a user
  friendly web interface. The main feature of the toolkit is its data structure,
  the so-called PSI-lattice that assembles annotations delivered by all PSI
  tools. This cohesive architecture allows the user to invoke a series of
  processes with one command. The command has the form of a pipeline of
  instructions resembling shell command pipelines known from Linux-based
  author = {Marcin Junczys-Dowmunt and Arkadiusz Szał},
  title = {SyMGiza++: Symmetrized Word Alignment Models for Machine Translation},
  booktitle = {Security and Intelligent Information Systems (SIIS)},
  year = {2012},
  editor = {Bouvry, Pascal and Klopotek, Mieczyslaw A. and Leprévost, Franck and Marciniak, Malgorzata and Mykowiecka, Agnieszka and Rybinski, Henryk},
  series = {Lecture Notes in Computer Science},
  volume = {7053},
  pages = {379--390},
  address = {Warsaw, Poland},
  publisher = {Springer},
  url = {},
  abstract = {SyMGiza++ -- a tool that computes symmetric word alignment models
  with the capability to take advantage of multi-processor systems -- is
  presented. A series of fairly simple modifications to the original IBM/Giza++
  word alignment models allows to update the symmetrized models between chosen
  iterations of the original training algorithms. We achieve a relative
  alignment quality improvement of more than 17\% compared to Giza++ and MGiza++
  on the standard Canadian Hansards task, while maintaining the speed
  improvements provided by the capability of parallel computations of MGiza++.
  Furthermore, the alignment models are evaluated in the context of phrase-based
  statistical machine translation, where a consistent improvement measured
  in BLEU scores can be observed when SyMGiza++ is used instead of Giza++ or
  title = {A Comparison of Search Algorithms for Syntax-based SMT},
  author = {Marcin Junczys-Dowmunt},
  journal = {Speech, Language and Technology},
  volume = {11},
  year = {2011},
  address = {Poznań, Poland}
  author = {Marcin Junczys-Dowmunt and Arkadiusz Szał},
  title = {SyMGiza++: A Tool for Parallel Computation of Symmetrized Word Alignment Models},
  booktitle = {5th International Multiconference on Computer Science and Information Technology},
  year = {2010},
  address = {Wisła, Poland},
  pages = {397--401},
  url = {},
  abstract = {SyMGiza++ -- a tool that computes symmetric word alignment models
  with the capability to take advantage of multi-processor systems -- is
  presented. A series of fairly simple modifications to the original IBM/Giza++
  word alignment models allows to update the symmetrized models between each
  iteration of the original training algorithms. We achieve a relative alignment
  quality improvement of more than 17\% compared to Giza++ and MGiza++ on the
  standard Canadian Hansards task, while maintaining the speed improvements
  provided by MGiza++’s capability of parallel computations.}
  author = {Marcin Junczys-Dowmunt},
  title = {A Maximum Entropy Approach to Translation Rule Filtering},
  booktitle = {11th International Conference on Intelligent Text Processing and Computational Linguistics (CICLing)},
  year = {2010},
  series = {Lecture Notes in Computer Science},
  editor = {Gelbukh, Alexander F.},
  volume = {6008},
  address = {Iasi, Romania},
  publisher = {Springer},
  pages = {451--463},
  url = {},
  abstract = {In this paper we will present a maximum entropy filter for the
  translation rules of a statistical machine translation system based on tree
  transducers. This filter can be successfully used to reduce the number of
  translation rules by more than 70\% without negatively affecting translation
  quality as measured by BLEU. For some filter configurations, translation
  quality is even improved. Our investigations include a discussion of the
  relationship of Alignment Error Rate and Consistent Translation Rule Score
  with translation quality in the context of Syntactic Statistical Machine
  title = {It's all about the Trees - Towards a Hybrid Syntax-Based MT System},
  booktitle = {4th International Multiconference on Computer Science and Information Technology},
  address = {Mrągowo, Poland},
  author = {Marcin Junczys-Dowmunt},
  year = {2009},
  pages = {219--226},
  url = {},
  abstract = {The aim of this paper is to describe the first steps of research
  towards a hybrid MT system that combines the strengths of rule-based
  syntactic transfer with recently developed syntax-based statistical
  translation methods within a unified framework. The similarities of both
  paradigms concerning the processing of syntactically parsed input trees serve
  as a basis for this research. We focus on the statistical part of the future
  system and present a syntax-based statistical machine translation system --
  BONSAI -- for Polish-to-French translation. Although BONSAI is still under
  development, it reaches a translation quality on par with that of a modern
  phrase-based system. We provide the theoretical background as well as some
  implementation details and preliminary evaluation results for BONSAI. At
  the end of this paper we shortly discuss the benefits of a combined approach.}
  title = {Niemieckie rzeczowniki złożone i ich polskie odpowiedniki - Automatyczna ekstrakcja, analiza i weryfikacja na podstawie korpusów równoległych},
  author = {Marcin Junczys-Dowmunt},
  school = {Adam Mickiewicz University, Poznań, Poland},
  year = {2009},
  url = {}
  author = {Marcin Junczys-Dowmunt},
  title = {Wprowadzenie do metod statystycznych w tłumaczeniu automatycznym},
  journal = {Investigationes Linguisticae},
  volume = {16},
  url = {},
  address = {Poznań, Poland},
  year = {2008},
  pages = {44--66}
  author = {Marcin Junczys-Dowmunt},
  title = {Influence of accurate compound noun splitting on bilingual vocabulary extraction},
  series = {Text Resources and Lexical Knowledge. Text, Translation, Computational Processing (TTCP)},
  volume = {8},
  address = {Berlin, Germany},
  booktitle = {9. Konferenz zur Verarbeitung natürlicher Sprache (Konvens)},
  publisher = {Mouton de Gruyter},
  year = {2008},
  pages = {91--105},
  url = {},
  abstract = {The influence of compound noun splitting on a German-Polish
  bilingual vocabulary extraction task is investigated. To accomplish this,
  several unsupervised methods for increasingly accurate compound noun splitting
  are introduced. Bilingual evidence from a parallel German-Polish corpus and
  co-occurrence counts from the web are used to disambiguate compound noun
  analyses directly. These collected splits serve as training data for a
  probabilistic model that abstracts away from the errors made by the direct
  methods and reaches an f-measure of 95.10\%. Furthermore, these methods are
  evaluated in terms of word alignment quality and extraction accuracy where
  linguistically accurate methods are found to outperform the corpus-based
  methods proposed in the literature. A comparison of alignment quality achieved
  with the best splitting method and the baseline implies that the effort to
  build supervised splitting methods might result in minimal or no performance
  author = {Marcin Junczys-Dowmunt and Filip Graliński},
  title = {Using a Treebank Grammar for the Syntactical Annotation of German Lexical Phrases},
  booktitle = {3rd Language and Technology Conference (LTC)},
  address = {Poznań},
  year = {2007},
  url = {},
  abstract = {The aim of this paper is to investigate whether a treebank grammar
  can be used to automatically classify and annotate German phrases
  contained in a MT lexicon. Phrases from the lexicon appear in their citation
  form and may differ structurally from the phrase tokens found in the corpus.
  We describe the grammar extraction process for a formalism called
  Tree-Generating Binary Grammar and evaluate the performance of subsets of the
  obtained grammar on a set of four types of lexical phrases.}
  author = {Marcin Junczys-Dowmunt},
  title = {Model skończenie stanowy niemieckich wyrazów pojedynczo i wielokrotnie złożonych},
  journal = {Investigationes Linguisticae},
  volume = {14},
  pages = {50--67},
  url = {},
  address = {Poznań, Poland},
  year = {2006}