dadashzadeh committed on
Commit
9bd81ec
·
verified ·
1 Parent(s): ed74736

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +29 -18
README.md CHANGED
@@ -41,7 +41,7 @@ import bm25s
41
  from bm25s.hf import BM25HF
42
 
43
  # Load the index
44
- retriever = BM25HF.load_from_hub("{username}/{repo_name}}")
45
 
46
  # You can retrieve now
47
  query = "a cat is a feline"
@@ -57,17 +57,28 @@ import bm25s
57
  from bm25s.hf import BM25HF
58
 
59
  corpus = [
60
- "a cat is a feline and likes to purr",
61
- "a dog is the human's best friend and loves to play",
62
- "a bird is a beautiful animal that can fly",
63
- "a fish is a creature that lives in water and swims",
 
 
 
 
 
 
 
 
 
 
 
64
  ]
65
 
66
  retriever = BM25HF(corpus=corpus)
67
  retriever.index(bm25s.tokenize(corpus))
68
 
69
  token = None # You can get a token from the Hugging Face website
70
- retriever.save_to_hub("{username}/{repo_name}", token=token)
71
  ```
72
 
73
  ## Advanced usage
@@ -76,16 +87,16 @@ You can leverage more advanced features of the BM25S library during `load_from_h
76
 
77
  ```python
78
  # Load corpus and index in memory-map (mmap=True) to reduce memory
79
- retriever = BM25HF.load_from_hub("{username}/{repo_name}", load_corpus=True, mmap=True)
80
 
81
  # Load a different branch/revision
82
- retriever = BM25HF.load_from_hub("{username}/{repo_name}", revision="main")
83
 
84
  # Change directory where the local files should be downloaded
85
- retriever = BM25HF.load_from_hub("{username}/{repo_name}", local_dir="/path/to/dir")
86
 
87
  # Load private repositories with a token:
88
- retriever = BM25HF.load_from_hub("{username}/{repo_name}", token=token)
89
  ```
90
 
91
  ## Stats
@@ -94,9 +105,9 @@ This dataset was created using the following data:
94
 
95
  | Statistic | Value |
96
  | --- | --- |
97
- | Number of documents | {num_docs} |
98
- | Number of tokens | {num_tokens} |
99
- | Average tokens per document | {avg_tokens_per_doc} |
100
 
101
  ## Parameters
102
 
@@ -104,11 +115,11 @@ The index was created with the following parameters:
104
 
105
  | Parameter | Value |
106
  | --- | --- |
107
- | k1 | `{k1}` |
108
- | b | `{b}` |
109
- | delta | `{delta}` |
110
- | method | `{method}` |
111
- | idf method | `{idf_method}` |
112
 
113
  ## Citation
114
 
 
41
  from bm25s.hf import BM25HF
42
 
43
  # Load the index
44
+ retriever = BM25HF.load_from_hub("dadashzadeh/2023_10_en_keywords_Cryptocurrency")
45
 
46
  # You can retrieve now
47
  query = "a cat is a feline"
 
57
  from bm25s.hf import BM25HF
58
 
59
  corpus = [
60
+ "northwest bank",
61
+ "misfits market",
62
+ "merrick bank login",
63
+ "marketing",
64
+ "market place",
65
+ "jetblue customer service",
66
+ "internal revenue service",
67
+ "how to make money online",
68
+ "gordon food service",
69
+ "futures market",
70
+ "frontier airlines customer service",
71
+ "food banks near me",
72
+ "first convenience bank",
73
+ "eastern bank",
74
+ "dollar bank",
75
  ]
76
 
77
  retriever = BM25HF(corpus=corpus)
78
  retriever.index(bm25s.tokenize(corpus))
79
 
80
  token = None # You can get a token from the Hugging Face website
81
+ retriever.save_to_hub("dadashzadeh/2023_10_en_keywords_Cryptocurrency", token=token)
82
  ```
83
 
84
  ## Advanced usage
 
87
 
88
  ```python
89
  # Load corpus and index in memory-map (mmap=True) to reduce memory
90
+ retriever = BM25HF.load_from_hub("dadashzadeh/2023_10_en_keywords_Cryptocurrency", load_corpus=True, mmap=True)
91
 
92
  # Load a different branch/revision
93
+ retriever = BM25HF.load_from_hub("dadashzadeh/2023_10_en_keywords_Cryptocurrency", revision="main")
94
 
95
  # Change directory where the local files should be downloaded
96
+ retriever = BM25HF.load_from_hub("dadashzadeh/2023_10_en_keywords_Cryptocurrency", local_dir="/path/to/dir")
97
 
98
  # Load private repositories with a token:
99
+ retriever = BM25HF.load_from_hub("dadashzadeh/2023_10_en_keywords_Cryptocurrency", token=token)
100
  ```
101
 
102
  ## Stats
 
105
 
106
  | Statistic | Value |
107
  | --- | --- |
108
+ | Number of documents | 602959 |
109
+ | Number of tokens | 2414020 |
110
+ | Average tokens per document | 4.0 |
111
 
112
  ## Parameters
113
 
 
115
 
116
  | Parameter | Value |
117
  | --- | --- |
118
+ | k1 | `1.5` |
119
+ | b | `0.75` |
120
+ | delta | `0.5` |
121
+ | method | `lucene` |
122
+ | idf method | `lucene` |
123
 
124
  ## Citation
125