Prabin Bhandari committed on
Commit 1ea30cb
Parent(s): 2c03ef4

Some changes

Files changed (1):
cooccurrence_count.py (+26 -4)
cooccurrence_count.py CHANGED
@@ -21,6 +21,7 @@ import evaluate
 import datasets
 from sklearn.feature_extraction.text import CountVectorizer
 import numpy as np
+import stanza
 
 
 _DESCRIPTION = """\
@@ -53,6 +54,18 @@ def check_count(x):
     return 1
 
 
+nlp = stanza.Pipeline(lang='en', processors='tokenize')
+
+
+def stanza_tokenizer(sen):
+    doc = nlp(sen)
+    tokens = []
+    for sen in doc.sentences:
+        for token in sen.tokens:
+            tokens.append(token.text)
+    return tokens
+
+
 @evaluate.utils.file_utils.add_start_docstrings(
     _DESCRIPTION,
     _KWARGS_DESCRIPTION
@@ -71,9 +84,12 @@ class CooccurrenceCount(evaluate.Measurement):
         }),
     )
 
+    def _download_and_prepare(self, dl_manager):
+        stanza.download('en', processors='tokenize')
+
     def _compute(self, data, word1, word2):
-        len1 = len(word1.split(' '))
-        len2 = len(word2.split(' '))
+        len1 = len(stanza_tokenizer(word1))
+        len2 = len(stanza_tokenizer(word2))
         if len1 > len2:
             ugram = len1
             lgram = len2
@@ -84,14 +100,20 @@ class CooccurrenceCount(evaluate.Measurement):
             ugram = len1
             lgram = len1
 
-        v = CountVectorizer(ngram_range=(lgram, ugram))
+        v = CountVectorizer(
+            ngram_range=(lgram, ugram),
+            tokenizer=stanza_tokenizer,
+            lowercase=True
+        )
         analyzer = v.build_analyzer()
         vectorizer = CountVectorizer(
             ngram_range=(lgram, ugram),
             vocabulary={
                 analyzer(word1)[-1]: 0,
                 analyzer(word2)[-1]: 1
-            }
+            },
+            tokenizer=stanza_tokenizer,
+            lowercase=True
         )
        co_occurrences = vectorizer.fit_transform(data)
        dense_mat = co_occurrences.todense()
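
For context, a minimal usage sketch of the measurement after this change. The repository id and the example inputs below are assumptions for illustration, not taken from the commit; only the `compute(data=..., word1=..., word2=...)` signature comes from `_compute` in the diff.

import evaluate

# Hypothetical Hub repo id; substitute the actual id of this measurement module.
cooccurrence = evaluate.load(
    "prabin/cooccurrence_count", module_type="measurement"
)

data = [
    "the quick brown fox jumps over the lazy dog",
    "a quick brown dog sleeps",
]

# word1/word2 are now tokenized with stanza rather than str.split(' '),
# so a multi-word phrase such as "brown fox" is handled as one n-gram.
result = cooccurrence.compute(data=data, word1="quick", word2="brown fox")
print(result)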