# Copyright 2020 The HuggingFace Datasets Authors and the current
# dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Get the co-occurrence count for two words in each sentence of a dataset.
"""


import evaluate
import datasets
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import stanza


_DESCRIPTION = """\
Returns the co-occurrence count of two words in the input.
"""

_CITATION = ""

_KWARGS_DESCRIPTION = """
Calculates the co-occurrence of two words in each sentence of a dataset.
Args:
    `data`: a list of `str` containing the sentences of a dataset.
    `words`: a list of `[word1, word2]` pairs to check for co-occurrence.
Returns:
    The `words` list with two counts appended to each pair, i.e.
    `[word1, word2, total, co_occurrences]`, where `total` is the number of
    input sentences and `co_occurrences` is the number of sentences in which
    both words occur.
Examples:
    >>> data = ["hello sun", "hello moon", "hello sun"]
    >>> c_count = evaluate.load("prb977/cooccurrence_count")
    >>> results = c_count.compute(data=data, words=[['hello', 'sun']])
    >>> print(results)
    [['hello', 'sun', 3, 2]]
"""


def check_count(x):
    """Return 1 if both words occur in the sentence (all counts > 0), else 0."""
    return int(np.all(np.asarray(x) > 0))
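
# For example, given per-sentence counts for (word1, word2):
#   check_count(np.array([2, 0])) -> 0   (only the first word occurs)
#   check_count(np.array([1, 3])) -> 1   (both words occur)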


# Build the tokenizer pipeline lazily so the module can be imported before
# the English model has been downloaded (see _download_and_prepare below).
nlp = None


def stanza_tokenizer(sen):
    global nlp
    if nlp is None:
        nlp = stanza.Pipeline(lang='en', processors='tokenize')
    doc = nlp(sen)
    tokens = []
    for sentence in doc.sentences:
        for token in sentence.tokens:
            tokens.append(token.text)
    return tokens
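
# Illustrative only (actual output depends on the stanza English model):
# a "word" can itself be a phrase that splits into multiple tokens, which
# is what drives the n-gram sizing in _compute below:
#   stanza_tokenizer("New York")  ->  ['New', 'York']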


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION,
    _KWARGS_DESCRIPTION
)
class CooccurrenceCount(evaluate.Measurement):
    """This measurement returns the co-occurrence count of two words."""

    def _info(self):
        return evaluate.MeasurementInfo(
            module_type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                'data': datasets.Value('string')
            }),
        )

    def _download_and_prepare(self, dl_manager):
        stanza.download('en', processors='tokenize')

    def _compute(self, data, words):
        for each in words:
            word1 = each[0]
            word2 = each[1]
            # A "word" may itself tokenize into several tokens (e.g. a
            # phrase like "New York"), so size the n-gram range to cover
            # the longer of the two phrases.
            len1 = len(stanza_tokenizer(word1))
            len2 = len(stanza_tokenizer(word2))
            lgram = min(len1, len2)
            ugram = max(len1, len2)

            # Build a throwaway vectorizer only to get an analyzer that
            # normalizes each word exactly as the counting vectorizer will
            # (tokenization, lowercasing, n-gram extraction).
            v = CountVectorizer(
                ngram_range=(lgram, ugram),
                tokenizer=stanza_tokenizer,
                lowercase=True
            )
            analyzer = v.build_analyzer()
            # The analyzer emits shorter n-grams first, so the last entry
            # for each word is its full (lowercased) phrase; the two phrases
            # form a fixed two-entry vocabulary.
            vectorizer = CountVectorizer(
                ngram_range=(lgram, ugram),
                vocabulary={
                    analyzer(word1)[-1]: 0,
                    analyzer(word2)[-1]: 1
                },
                tokenizer=stanza_tokenizer,
                lowercase=True
            )
            co_occurrences = vectorizer.fit_transform(data)
            # Use a plain ndarray so check_count receives 1-D rows.
            dense_mat = co_occurrences.toarray()
            count = len(data)
            # Number of sentences in which both words appear at least once.
            co_occurrence_count = np.sum(
                np.apply_along_axis(check_count, axis=1, arr=dense_mat)
            )
            each.append(count)
            each.append(int(co_occurrence_count))
        return words
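

# Minimal usage sketch (mirrors the docstring example; assumes this module is
# published on the Hub as "prb977/cooccurrence_count"):
if __name__ == "__main__":
    c_count = evaluate.load("prb977/cooccurrence_count")
    results = c_count.compute(
        data=["hello sun", "hello moon", "hello sun"],
        words=[["hello", "sun"]],
    )
    print(results)  # [['hello', 'sun', 3, 2]]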