fchoquette-ebay committed on
Commit
ec962e4
1 Parent(s): b244c44

feat: add auto tagging logic

Browse files
Files changed (3) hide show
  1. .gitignore +2 -0
  2. app.py +70 -0
  3. requirements.txt +87 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .venv
2
+ .idea
app.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from sentence_transformers import SentenceTransformer, util
3
+
4
# A phrase is accepted as an answer only when its cosine similarity to the
# question, rounded to two decimals, is strictly greater than this value.
threshold = 0.65

# Number of consecutive words in each sliding-window phrase that is built
# from the description before it is embedded.
sentence_length = 6

# Questions asked about every description. The order matters: answers are
# returned in this order, and one output text box is created per question.
questions = [
    "Is it new or used",
    "Are there any wear & tear",
    "Does it come with dust bag, receipt & original box",
    "Are there any scratches, marks",
    "Are there any fading, stains, discolorization",
    "Is this item customized, repainted or has hardware been replaced",
    "Is it special edition",
    "Is there any odour",
    "Are there multiple items or extra add-ons in this listing?",
    "Is there a date code or serial number present on the item?",
]
13
+
14
+ model = SentenceTransformer("all-MiniLM-L6-v2")  # created once at import time; extract() reuses this instance for every encode call
15
+
16
+
17
def generate_phrases(desc: str, length: int):
    """Split *desc* into overlapping phrases of *length* consecutive words.

    The description is split on whitespace and a window of ``length`` words
    is slid over it one word at a time, so consecutive phrases overlap by
    ``length - 1`` words. Words inside a phrase are joined by single spaces.

    Args:
        desc: Free-text description to break into phrases.
        length: Number of words per phrase; must be at least 1.

    Returns:
        A list of phrases. When the description has fewer than ``length``
        words the list holds exactly one phrase made of all the words
        (an empty string for an empty or whitespace-only description), so
        the result is never empty.

    Raises:
        ValueError: If ``length`` is smaller than 1. Such a window cannot
            hold any word and would only yield empty phrases.
    """
    if length < 1:
        raise ValueError(f"length must be a positive integer, got {length!r}")

    words = desc.split()

    if len(words) < length:
        # Too short for a full window: fall back to the whole text as the
        # single phrase so callers always have something to embed.
        return [' '.join(words)]

    window_count = len(words) - length + 1
    return [
        ' '.join(words[start:start + length])
        for start in range(window_count)
    ]
30
+
31
+
32
def extract(description: str):
    """Find, for every question, the phrase of *description* that best answers it.

    The description is cut into overlapping phrases of ``sentence_length``
    words, the phrases and the questions are embedded with ``model``, and
    each question is matched to the phrase with the highest cosine
    similarity.

    Args:
        description: Free-text item description to search for answers.

    Returns:
        A list with one string per entry of ``questions``, in the same
        order: the best-matching phrase, or ``'No answer'`` when no phrase
        has a similarity (rounded to two decimals) strictly greater than
        ``threshold``.
    """
    sentences = generate_phrases(description, sentence_length)
    sentences_embedding = model.encode(sentences)

    # Encode every question in one batch and compute all similarities with a
    # single call, instead of one encode + cos_sim round trip per question.
    questions_embedding = model.encode(questions)
    # Row i holds the similarity of questions[i] to each phrase.
    similarity_matrix = util.cos_sim(questions_embedding, sentences_embedding)

    answers = []

    for question_scores in similarity_matrix:
        best_phrase = None
        best_score = None

        for phrase, score in zip(sentences, question_scores.tolist()):
            # The threshold is applied to the score rounded to two decimals,
            # while candidates are ranked on the unrounded score.
            if round(score, 2) <= threshold:
                continue

            # ">=" lets a later phrase replace an earlier one on a tie.
            if best_score is None or score >= best_score:
                best_phrase = phrase
                best_score = score

        answers.append(best_phrase if best_phrase is not None else 'No answer')

    return answers
62
+
63
+
64
def map_question_to_text(question):
    """Build the Gradio text component that displays the answer to *question*.

    The question itself is used as the component's label.
    """
    answer_box = gr.Text(label=question)
    return answer_box
66
+
67
+
68
# One description text box in; one labelled text box out per question, in
# the same order as the answers returned by extract().
demo = gr.Interface(
    fn=extract,
    inputs=gr.Textbox(label="Description"),
    outputs=[map_question_to_text(question) for question in questions],
)
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ altair==5.3.0
3
+ annotated-types==0.7.0
4
+ anyio==4.4.0
5
+ attrs==23.2.0
6
+ certifi==2024.6.2
7
+ charset-normalizer==3.3.2
8
+ click==8.1.7
9
+ contourpy==1.2.1
10
+ cycler==0.12.1
11
+ dnspython==2.6.1
12
+ email_validator==2.1.1
13
+ exceptiongroup==1.2.1
14
+ fastapi==0.111.0
15
+ fastapi-cli==0.0.4
16
+ ffmpy==0.3.2
17
+ filelock==3.14.0
18
+ fonttools==4.53.0
19
+ fsspec==2024.6.0
20
+ gradio==4.36.0
21
+ gradio_client==1.0.1
22
+ h11==0.14.0
23
+ httpcore==1.0.5
24
+ httptools==0.6.1
25
+ httpx==0.27.0
26
+ huggingface-hub==0.23.3
27
+ idna==3.7
28
+ importlib_resources==6.4.0
29
+ Jinja2==3.1.4
30
+ joblib==1.4.2
31
+ jsonschema==4.22.0
32
+ jsonschema-specifications==2023.12.1
33
+ kiwisolver==1.4.5
34
+ markdown-it-py==3.0.0
35
+ MarkupSafe==2.1.5
36
+ matplotlib==3.9.0
37
+ mdurl==0.1.2
38
+ mpmath==1.3.0
39
+ networkx==3.2.1
40
+ numpy==1.26.4
41
+ orjson==3.10.3
42
+ packaging==24.0
43
+ pandas==2.2.2
44
+ pillow==10.3.0
45
+ pydantic==2.7.3
46
+ pydantic_core==2.18.4
47
+ pydub==0.25.1
48
+ Pygments==2.18.0
49
+ pyparsing==3.1.2
50
+ python-dateutil==2.9.0.post0
51
+ python-dotenv==1.0.1
52
+ python-multipart==0.0.9
53
+ pytz==2024.1
54
+ PyYAML==6.0.1
55
+ referencing==0.35.1
56
+ regex==2024.5.15
57
+ requests==2.32.3
58
+ rich==13.7.1
59
+ rpds-py==0.18.1
60
+ ruff==0.4.8
61
+ safetensors==0.4.3
62
+ scikit-learn==1.5.0
63
+ scipy==1.13.1
64
+ semantic-version==2.10.0
65
+ sentence-transformers==3.0.1
66
+ shellingham==1.5.4
67
+ six==1.16.0
68
+ sniffio==1.3.1
69
+ starlette==0.37.2
70
+ sympy==1.12.1
71
+ threadpoolctl==3.5.0
72
+ tokenizers==0.19.1
73
+ tomlkit==0.12.0
74
+ toolz==0.12.1
75
+ torch==2.3.1
76
+ tqdm==4.66.4
77
+ transformers==4.41.2
78
+ typer==0.12.3
79
+ typing_extensions==4.12.1
80
+ tzdata==2024.1
81
+ ujson==5.10.0
82
+ urllib3==2.2.1
83
+ uvicorn==0.30.1
84
+ uvloop==0.19.0
85
+ watchfiles==0.22.0
86
+ websockets==11.0.3
87
+ zipp==3.19.2