# Source code for mb.external_resources.natstor

from mb.core.general.table import *
from mb.util.toks2sents import toks2sents
from mb.util.sents2sentids import sents2sentids
from mb.util.tabular import rt2timestamps
from mb.util.util_natstor import ns_text_normalizer, ns_docid_int2name, docids_by_item, textgrid2itemmeasures, ns_merge


#####################################
#
# EXTERNAL RESOURCES
#
#####################################


class NatstorRepo(Repo):
    """Repository resource describing the Natural Stories corpus checkout.

    Only declarative metadata lives here; the other external resources in
    this module name this class as their ``PARENT_RESOURCE``.
    """

    # Web and git locations of the upstream repository.
    URL = 'https://github.com/languageMIT/naturalstories'
    GIT_URL = 'git@github.com:languageMIT/naturalstories.git'

    # Short and long human-readable descriptions.
    DESCR_SHORT = 'the Natural Stories Corpus'
    DESCR_LONG = (
        'A corpus of naturalistic stories meant to contain varied,\n'
        'low-frequency syntactic constructions. There are a variety of annotations\n'
        'and psycholinguistic measures available for the stories.\n'
    )


class LineTreesNatstorPennSource(ExternalResource):
    """External resource for the Penn-style parse file of Natural Stories.

    Declares no ``body``; the file is located at ``DEFAULT_LOCATION``
    (presumably relative to the ``NatstorRepo`` checkout -- confirm against
    ``ExternalResource``).
    """

    PARENT_RESOURCE = NatstorRepo
    STATIC_PREREQ_TYPES = [NatstorRepo]
    DEFAULT_LOCATION = 'parses/penn/all-parses.txt.penn'

    DESCR_SHORT = 'Natural Stories PTB source trees'
    DESCR_LONG = 'Source Penn Treebank style hand-corrected parses for the Natural Stories corpus'


class NatstorTokSource(ExternalResource):
    """External resource for the tab-separated token table of Natural Stories.

    Declares no ``body``; the file is located at ``DEFAULT_LOCATION``
    (presumably relative to the ``NatstorRepo`` checkout -- confirm against
    ``ExternalResource``).
    """

    PARENT_RESOURCE = NatstorRepo
    STATIC_PREREQ_TYPES = [NatstorRepo]
    DEFAULT_LOCATION = 'naturalstories_RTS/all_stories.tok'

    # Declared as a table with tab as the field separator.
    FILE_TYPE = 'table'
    SEP = '\t'

    DESCR_SHORT = 'Natural Stories source tokenization'
    DESCR_LONG = 'Natural Stories source tokenization'


class NatstorProcessRTs(ExternalResource):
    """External resource for the forked RT processing script.

    The second static prerequisite (``scripts/process_ns_mb.R``) is copied
    to this resource's own path by ``body``.
    """

    PARENT_RESOURCE = NatstorRepo
    STATIC_PREREQ_TYPES = [NatstorRepo, 'scripts/process_ns_mb.R']
    DEFAULT_LOCATION = 'naturalstories_RTS/process_ns_mb.R'
    FILE_TYPE = None

    DESCR_SHORT = 'RT processing script'
    DESCR_LONG = 'Forked processing script for Natural Stories RT data'

    def body(self):
        """Return the ``cp`` command that copies the script to ``self.path``."""
        # Index 1 is the 'scripts/process_ns_mb.R' entry of STATIC_PREREQ_TYPES.
        script = self.static_prereqs()[1]
        source_path = script.path
        return 'cp %s %s' % (source_path, self.path)


class NatstorRTSource(ExternalResource):
    """External resource for the processed Natural Stories reading-time table.

    ``body`` runs the processing script (second static prerequisite) from
    inside the ``naturalstories_RTS`` directory of the repository (first
    static prerequisite). The script is presumably what writes
    ``processed_RTs_MB.tsv`` -- confirm against ``process_ns_mb.R``.
    """

    DEFAULT_LOCATION = 'naturalstories_RTS/processed_RTs_MB.tsv'
    STATIC_PREREQ_TYPES = [NatstorRepo, NatstorProcessRTs]
    PARENT_RESOURCE = NatstorRepo
    FILE_TYPE = 'table'
    SEP = '\t'
    DESCR_SHORT = 'naturalstories source RTs'
    DESCR_LONG = 'Natural Stories source reading times'

    def body(self):
        """Return the shell command that runs the RT processing script.

        Returns:
            A string of the form ``cd <repo>/naturalstories_RTS && ./<script>``.
        """
        prereqs = self.static_prereqs()
        rt_dir = os.path.join(prereqs[0].path, 'naturalstories_RTS')
        script_name = prereqs[1].basename

        # '&&' rather than ';': if the directory change fails, the script
        # must not be launched from whatever the current directory is.
        return 'cd %s && ./%s' % (rt_dir, script_name)


class NatstorAudioSource(ExternalResource):
    """External resource for the ``audio`` location of Natural Stories."""

    DEFAULT_LOCATION = 'audio'
    STATIC_PREREQ_TYPES = [NatstorRepo]
    PARENT_RESOURCE = NatstorRepo
    DESCR_SHORT = 'Natural Stories audio source'
    DESCR_LONG = 'Natural Stories audio source'

    def body(self):
        """Return the command used to put this resource in place.

        Only one static prerequisite (``NatstorRepo``) is declared, so
        unconditionally reading ``static_prereqs()[1]`` would raise
        ``IndexError`` (assuming ``static_prereqs()`` mirrors
        ``STATIC_PREREQ_TYPES`` -- TODO confirm). The copy command is
        therefore only built when a second prerequisite really exists;
        otherwise the inherited ``body`` is used, as for the other
        repository-resident resources in this module that define no
        ``body`` of their own.
        """
        prereqs = self.static_prereqs()
        if len(prereqs) > 1:
            return 'cp %s %s' % (
                prereqs[1].path,
                self.path
            )

        # NOTE(review): presumably ExternalResource supplies a default body;
        # verify against the base class.
        return super(NatstorAudioSource, self).body()





#####################################
#
# NATURAL STORIES TYPES
#
#####################################


class LineTreesNatstorPTB(LineTrees):
    """Linetrees resource built from the Natural Stories Penn source parses.

    ``body`` pipes the source parse file through ``sed`` (to drop carriage
    returns) and then through the ``editabletrees2linetrees.pl`` script,
    writing the result to this resource's path.
    """

    MANIP = 'naturalstories.ptb'
    STATIC_PREREQ_TYPES = [LineTreesNatstorPennSource, 'scripts/editabletrees2linetrees.pl']
    DESCR_SHORT = 'naturalstories gold ptb linetrees'
    DESCR_LONG = (
        "Hand-annotated parse trees for the Natural Stories corpus.\n"
    )

    def body(self):
        """Return the shell pipeline that produces the linetrees file."""
        prereqs = self.static_prereqs()
        penn_source = prereqs[0].path  # LineTreesNatstorPennSource
        converter = prereqs[1].path    # scripts/editabletrees2linetrees.pl

        return "cat %s | sed 's/\\r//g' | perl %s > %s" % (penn_source, converter, self.path)

class LineToksNatstor(LineToks):
    """Linetoks resource derived from ``naturalstories.ptb.linetrees``."""

    MANIP = 'naturalstories'
    DESCR_SHORT = 'naturalstories linetoks'
    DESCR_LONG = (
        'PTB Tokenized sentences (linetoks) for the Natural Stories corpus'
    )

    def body(self):
        """Return the function that converts tree lines to token lines."""

        def out(inputs):
            """Extract the words of each tree in ``inputs``.

            Blank lines and lines whose first character is ``%`` are
            skipped. Words starting with ``*`` are dropped, the remaining
            words are joined by single spaces and newline-terminated, and
            the line is passed through ``ns_text_normalizer``.

            Returns:
                A list with one normalized line per retained input tree.
            """
            # Raw string: '\*' in a plain literal is an invalid escape
            # sequence and triggers a warning on recent Python versions.
            trace = re.compile(r'\*')

            t = tree.Tree()
            outputs = []
            for line in inputs:
                line = line.strip()
                if line == '' or line[0] == '%':
                    continue

                # Extract words, leaving out anything starting with '*'.
                t.read(line)
                sent = ''
                for w in t.words():
                    if trace.match(w):
                        continue
                    if sent != '':
                        sent += ' '
                    sent += w
                sent += '\n'

                # Normalize sentence
                outputs.append(ns_text_normalizer(sent))

            return outputs

        return out

    @classmethod
    def other_prereq_paths(cls, path):
        """Return the path(s) of the linetrees file this target is built from.

        Args:
            path: Path of the target, or ``None`` to get a placeholder.

        Returns:
            A one-element list: a ``(DIR/)`` placeholder when ``path`` is
            ``None``, otherwise ``naturalstories.ptb.linetrees`` in the same
            directory as ``path``.
        """
        filename = 'naturalstories.ptb.linetrees'
        if path is None:
            return ['(DIR/)naturalstories.ptb.linetrees']

        return [os.path.join(os.path.dirname(path), filename)]

    @classmethod
    def other_prereq_type(cls, i, path):
        """Return the resource class of the (single) other prerequisite."""
        return LineTreesNatstorPTB


class LineItemsNatstor(LineItems):
    """Lineitems resource for the Natural Stories corpus."""

    MANIP = 'naturalstories'
    STATIC_PREREQ_TYPES = [NatstorTokSource]
    # The other resource types in this module expose their short description
    # as DESCR_SHORT; provide it here too and keep DESCR as an alias in case
    # anything reads the original attribute name.
    DESCR_SHORT = 'naturalstories lineitems'
    DESCR = DESCR_SHORT
    DESCR_LONG = 'Natural Stories lineitems'

    def body(self):
        """Return the function that builds lineitems from tokens."""

        def out(tokmeasures, linetoks):
            """Build the lineitems from the token table and the linetoks.

            Args:
                tokmeasures: Table with a ``word`` column (presumably the
                    contents of ``NatstorTokSource`` -- confirm with caller).
                linetoks: Presumably the contents of
                    ``naturalstories.linetoks`` -- confirm with caller.

            Returns:
                Whatever ``toks2sents`` returns for the two inputs.
            """
            # Work on a copy so the caller's table is not modified.
            tokmeasures = tokmeasures.copy()
            # Bracket access avoids any clash between column names and
            # DataFrame attributes.
            tokmeasures['word'] = tokmeasures['word'].map(ns_text_normalizer)

            return toks2sents(linetoks, tokmeasures)

        return out

    @classmethod
    def other_prereq_paths(cls, path):
        """Return the linetoks path(s) this target depends on.

        Returns a ``(DIR/)`` placeholder when ``path`` is ``None``, otherwise
        ``naturalstories.linetoks`` in the same directory as ``path``.
        """
        if path is None:
            return ['(DIR/)naturalstories.linetoks']
        return [os.path.join(os.path.dirname(path), 'naturalstories.linetoks')]

    @classmethod
    def other_prereq_type(cls, i, path):
        """Return the resource class of the (single) other prerequisite."""
        return LineToksNatstor


class ItemMeasuresNatstor(ItemMeasures):
    """Base itemmeasures resource for the Natural Stories corpus."""

    MANIP = 'naturalstories'
    STATIC_PREREQ_TYPES = [NatstorTokSource]
    # The other resource types in this module expose their short description
    # as DESCR_SHORT; provide it here too and keep DESCR as an alias in case
    # anything reads the original attribute name.
    DESCR_SHORT = 'naturalstories itemmeasures'
    DESCR = DESCR_SHORT
    DESCR_LONG = 'Natural Stories base itemmeasures'

    def body(self):
        """Return the function that builds the base itemmeasures."""

        def out(tokmeasures, lineitems):
            """Delegate to ``docids_by_item(lineitems, tokmeasures)``.

            Note the argument order is swapped relative to this function's
            own signature.
            """
            return docids_by_item(lineitems, tokmeasures)

        return out

    @classmethod
    def other_prereq_paths(cls, path):
        """Return the lineitems path(s) this target depends on.

        Returns a ``(DIR/)`` placeholder when ``path`` is ``None``, otherwise
        ``naturalstories.lineitems`` in the same directory as ``path``.
        """
        if path is None:
            return ['(DIR/)naturalstories.lineitems']
        return [os.path.join(os.path.dirname(path), 'naturalstories.lineitems')]

    @classmethod
    def other_prereq_type(cls, i, path):
        """Return the resource class of the (single) other prerequisite."""
        return LineItemsNatstor


class ItemMeasuresNatstorTime(ItemMeasures):
    """Itemmeasures resource augmented via ``textgrid2itemmeasures``."""

    MANIP = 'naturalstories.t'
    STEM_PREREQ_TYPES = [ItemMeasuresNatstor]
    STATIC_PREREQ_TYPES = [NatstorAudioSource]
    # The other resource types in this module expose their short description
    # as DESCR_SHORT; provide it here too and keep DESCR as an alias in case
    # anything reads the original attribute name.
    DESCR_SHORT = 'naturalstories time itemmeasures'
    DESCR = DESCR_SHORT
    DESCR_LONG = 'Natural Stories timestamp itemmeasures'

    def body(self):
        """Return the function that adds timing information."""

        def out(itemmeasures):
            """Call ``textgrid2itemmeasures`` with the audio source path.

            The audio path is looked up when this function runs, not when
            ``body`` is called.
            """
            audio_path = self.static_prereqs()[0].path  # NatstorAudioSource
            return textgrid2itemmeasures(itemmeasures, audio_path)

        return out

    @classmethod
    def augment_prereq(cls, i, path):
        """Return the fixed string ``'naturalstories'`` for every prerequisite."""
        return 'naturalstories'


class ItemMeasuresNatstorMergeFields(ItemMeasures):
    """Itemmeasures carrying the fields needed to merge with evmeasures."""

    MANIP = 'naturalstories.mfields'
    STATIC_PREREQ_TYPES = [NatstorTokSource]
    # The other resource types in this module expose their short description
    # as DESCR_SHORT; provide it here too and keep DESCR as an alias in case
    # anything reads the original attribute name.
    DESCR_SHORT = 'naturalstories merge fields'
    DESCR = DESCR_SHORT
    DESCR_LONG = 'Natural Stories itemmeasures augmented with fields needed to merge with experimental measures (evmeasures)'

    def body(self):
        """Return the function that assembles the merge-field table."""

        def out(tokmeasures, lineitems):
            """Concatenate sentence ids with ``docid``/``item``/``zone``.

            Args:
                tokmeasures: Table with ``word``, ``item`` and ``zone``
                    columns (presumably the ``NatstorTokSource`` contents --
                    confirm with caller).
                lineitems: Input to ``sents2sentids``.

            Returns:
                Column-wise concatenation of the ``sents2sentids`` result
                and the ``docid``, ``item`` and ``zone`` columns, both with
                freshly reset indices so rows align by position.
            """
            sentids = sents2sentids(lineitems).reset_index(drop=True)

            # Work on a copy so the caller's table is not modified. Bracket
            # access is used throughout so column lookups cannot be shadowed
            # by DataFrame attributes.
            tokmeasures = tokmeasures.copy()
            tokmeasures['word'] = tokmeasures['word'].map(ns_text_normalizer)
            tokmeasures['docid'] = tokmeasures['item']
            tokmeasures = ns_docid_int2name(tokmeasures)
            tokmeasures = tokmeasures.reset_index(drop=True)

            merge_fields = tokmeasures[['docid', 'item', 'zone']]
            return pd.concat([sentids, merge_fields], axis=1)

        return out

    @classmethod
    def other_prereq_paths(cls, path):
        """Return the lineitems path(s) this target depends on.

        Returns a ``(DIR/)`` placeholder when ``path`` is ``None``, otherwise
        ``naturalstories.lineitems`` in the same directory as ``path``.
        """
        if path is None:
            return ['(DIR/)naturalstories.lineitems']
        return [os.path.join(os.path.dirname(path), 'naturalstories.lineitems')]

    @classmethod
    def other_prereq_type(cls, i, path):
        """Return the resource class of the (single) other prerequisite."""
        return LineItemsNatstor


class EvMeasuresNatStor(EvMeasures):
    """Base evmeasures resource for the Natural Stories reading times."""

    MANIP = 'naturalstories'
    STATIC_PREREQ_TYPES = [NatstorRTSource]
    # The other resource types in this module expose their short description
    # as DESCR_SHORT; provide it here too and keep DESCR as an alias in case
    # anything reads the original attribute name.
    DESCR_SHORT = 'naturalstories base evmeasures'
    DESCR = DESCR_SHORT
    DESCR_LONG = 'Natural Stories base evmeasures'

    def body(self):
        """Return the function that builds the base evmeasures."""

        def out(evmeasures, itemmeasures):
            """Normalize, rename, merge and timestamp the RT table.

            Args:
                evmeasures: Table with a ``word`` column; ``WorkerId`` and
                    ``RT`` columns, if present, are renamed to ``subject``
                    and ``fdur``.
                itemmeasures: Table passed (as a copy) to ``ns_merge``.

            Returns:
                The result of ``rt2timestamps`` applied to the merged table.
            """
            # Work on copies so the callers' tables are not modified.
            evmeasures = evmeasures.copy()
            evmeasures['word'] = evmeasures['word'].map(ns_text_normalizer)
            evmeasures = evmeasures.rename(columns={'WorkerId': 'subject', 'RT': 'fdur'})

            itemmeasures = itemmeasures.copy()

            merged = ns_merge(evmeasures, itemmeasures)

            return rt2timestamps(merged)

        return out

    @classmethod
    def other_prereq_paths(cls, path):
        """Return the merge-field itemmeasures path(s) this target depends on.

        Returns a ``(DIR/)`` placeholder when ``path`` is ``None``, otherwise
        ``naturalstories.mfields.itemmeasures`` in the same directory as
        ``path``.
        """
        filename = 'naturalstories.mfields.itemmeasures'
        if path is None:
            # The placeholder names the same file as the real path below
            # (it previously said '.lineitems').
            return ['(DIR/)' + filename]
        return [os.path.join(os.path.dirname(path), filename)]

    @classmethod
    def other_prereq_type(cls, i, path):
        """Return the resource class of the (single) other prerequisite."""
        return ItemMeasuresNatstorMergeFields