from nltk.corpus import brown
from nltk.corpus import treebank as penn
from nltk.corpus import multext_east as mte
from nltk.corpus import alpino

import utils


### CORPORA ###

# Maps a language name to its MULTEXT-East corpus file.  The capitalised,
# misspelt "Farci" key is preserved exactly for backward compatibility
# with existing callers.
_MTE_FILES = {
    "bulgarian": "oana-bg.xml",   # DOESN'T WORK !!!
    "czech": "oana-cs.xml",
    "english": "oana-en.xml",
    "estonian": "oana-et.xml",
    "Farci": "oana-fa.xml",
    "hungarian": "oana-hu.xml",
    "macedonian": "oana-mk.xml",
    "polish": "oana-pl.xml",
    "romanian": "oana-ro.xml",
    "slovak": "oana-sk.xml",
    "slovene": "oana-sl.xml",
    "serbian": "oana-sr.xml",
}


def get_brown_corp():
    """Return the tagged sentences of the Brown corpus."""
    return brown.tagged_sents()


def get_penn_corp(wsj=None):
    """Return tagged sentences from the Penn Treebank sample.

    If *wsj* is an int in the open interval (0, 200), only the single
    file ``wsj_NNNN.mrg`` is used; otherwise the whole sample is used.
    Tokens tagged ``-NONE-`` (empty elements) are filtered out of every
    sentence.
    """
    sents = penn.tagged_sents()
    if wsj and 0 < wsj < 200:
        filename = 'wsj_' + str(wsj).zfill(4) + '.mrg'
        sents = penn.tagged_sents(filename)
    return [[tup for tup in s if tup[1] != '-NONE-'] for s in sents]


def get_mte_corp(lang):
    """Return tagged sentences for *lang* from the MULTEXT-East corpus.

    Unknown language names fall back to the English corpus, matching
    the original behavior.  NOTE(review): the original compared strings
    with ``is``, which only worked by accident of CPython small-string
    interning; a dict lookup (value equality) is the correct form.
    """
    return mte.tagged_sents(_MTE_FILES.get(lang, "oana-en.xml"))


def get_alpino_corp():
    """Return the Alpino (Dutch) corpus with the universal tagset."""
    return alpino.tagged_sents(tagset='universal')


class CorpusManager():
    """Registry of tagged corpora, each split into *k* partitions.

    Partition 0 of each corpus is the held-out test set; partitions
    1..k-1 are used for (k-1)-fold cross-validation.
    """

    def __init__(self, k=10):
        self._nextid = 0        # last corpus id handed out
        self.k = k              # number of partitions per corpus
        self.corpids = set()    # all registered corpus ids
        self.corpnams = {}      # id -> corpus name
        self.corplangs = {}     # id -> language
        self.corpparts = {}     # id -> list of k partitions

    def addCorpus(self, corpname, lang, taggedsents):
        """Register a corpus and return its newly assigned id."""
        self._nextid += 1
        cid = self._nextid
        self.corpids.add(cid)
        self.corpnams[cid] = corpname
        self.corplangs[cid] = lang
        self.corpparts[cid] = self._mk_partitions(taggedsents)
        return cid

    def _mk_partitions(self, taggedsents):
        """Deal sentences round-robin into k equally sized partitions.

        Any trailing remainder of fewer than k sentences is discarded,
        so every partition ends up with the same number of sentences.
        """
        parts = [[] for _ in range(self.k)]
        remaining = list(taggedsents)
        while len(remaining) >= self.k:
            for i in range(self.k):
                nextsent = utils.popFront(remaining)
                utils.pushBack(parts[i], nextsent)
        return parts

    def allCorpora(self):
        """Yield ``(cid, name, language)`` for every registered corpus."""
        for cid in self.corpids:
            yield (cid, self.corpnams[cid], self.corplangs[cid])

    def getTest(self, cid):
        """Returns the test partition for the corpus."""
        return self.corpparts[cid][0]

    def _get_tvparts(self, cid):
        # Everything except the test partition (index 0).
        return self.corpparts[cid][1:]

    def getTrainVal(self, cid):
        """Returns the training set used for evaluation, which
        includes the validation partition.
        """
        return utils.flatten(self._get_tvparts(cid))

    def allTrainValidParts(self, cid):
        """Yields ``(train, validation)`` k-fold cross-validation
        partitions over the non-test partitions.
        """
        parts = self._get_tvparts(cid)
        for p in range(self.k - 1):
            pval = parts[p]
            ptrain = utils.flatten(parts[:p] + parts[p + 1:])
            yield (ptrain, pval)


def createCM():
    """Build the default :class:`CorpusManager` with all project corpora."""
    cm = CorpusManager()
    cm.addCorpus("Penn", "English", get_penn_corp())
    cm.addCorpus("Alpino", "Dutch", get_alpino_corp())
    cm.addCorpus("MTE", "Estonian", get_mte_corp('estonian'))
    cm.addCorpus("MTE", "Romanian", get_mte_corp('romanian'))
    cm.addCorpus("MTE", "Serbian", get_mte_corp('serbian'))
    cm.addCorpus("MTE", "Slovene", get_mte_corp('slovene'))
    cm.addCorpus("MTE", "English", get_mte_corp('english'))
    cm.addCorpus("MTE", "Czech", get_mte_corp('czech'))
    cm.addCorpus("Brown", "English", get_brown_corp())
    return cm


# Module-level side effect preserved from the original: builds the
# manager (and therefore loads every corpus) at import time.
cm = createCM()