#!/usr/bin/python
"""Minimal example of training a Brill (transformation-based) tagger.

A baseline tagger and a list of rule templates are handed to a trainer,
which is trained on a tiny mock corpus; the rules it learns are then read
back from the resulting tagger.  Sample output is reproduced as comments at
the end of the file.
"""

from nltk.tag import brill

# Training data would normally be input by a per-corpus reader class
# (nltk.corpus.TaggedCorpusReader, nltk.corpus.EuroparlCorpusReader, etc.).
# Here it is a single hand-tagged sentence: a list of sentences, each of
# which is a list of (word, tag) pairs.
mock_training_data = [[
    ('Should', 'md'),
    ('I', 'pn'),
    ('apologize', 'vb'),
    ('for', 'in'),
    ('the', 'dt'),
    ('wait', 'nn'),
    ('?', '.'),
]]

### TEMPLATES
# NOTE(review): the template/rule/trainer classes used below may not exist
# under these names in every NLTK release -- confirm against the installed
# version before running.
templates = [
    # Rules conditioned on the tag of the words at offsets i-2...i-1.
    brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-2, -1)),
    # Rules conditioned on the text of the preceding word (offset i-1).
    brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1)),
]

# Placeholder: replace the Ellipsis with some object implementing a basic
# 'tag' method before running this script.
baselineTagger = ...

trainer = brill.FastBrillTaggerTrainer(baselineTagger, templates)
brill_tagger = trainer.train(mock_training_data)
rules = brill_tagger.rules()

### LEARNED RULES
# Sample output.  The numbers are the score and the number of
# (good, bad, neutral) applications of each rule.
#
#   1  1  0  0 | vb -> nn if the text of the preceding word is 'the'
#   1  1  0  0 | vbp -> vb if the tag of words i-2...i-1 is 'md'