File size: 1,234 Bytes
5845aec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import json
import datasketch
from datasketch import MinHash

class SongGuesser:
    """Guess which catalogued song a text fragment most likely comes from."""

    @staticmethod
    def guess_song(query):
        """Return the name of the song whose lyrics best match ``query``.

        The songs are loaded from ``./swedish_christmas_songs.json`` (resolved
        against the current working directory). The loaded JSON is iterated
        as a collection of entries, each read through its ``'lyrics'`` and
        ``'name'`` keys. Both the query and every lyric text are lower-cased,
        split into 5-character shingles and turned into MinHash signatures;
        the song with the highest estimated Jaccard similarity wins.

        Args:
            query: The text fragment to look up.

        Returns:
            The ``'name'`` of the best-matching song, or an empty string when
            no song has an estimated similarity above zero. On ties the
            earliest song in the file is kept.
        """
        shingle_size = 5

        # Load the catalogue and release the file handle before the
        # comparison loop; nothing below needs the open file.
        with open("./swedish_christmas_songs.json", "r", encoding='utf-8') as f:
            songs = json.load(f)

        # Lyrics are lower-cased before shingling, so the query has to be
        # normalized the same way; otherwise any capital letter in the query
        # produces shingles that can never match a song.
        query_shingles = get_shingles(query.lower(), shingle_size)
        query_minhash = create_minhash(query_shingles)

        max_sim = 0
        max_name = ""

        for song in songs:
            song_lyrics = song['lyrics'].lower()
            song_shingles = get_shingles(song_lyrics, shingle_size)
            song_minhash = create_minhash(song_shingles)

            estimated_jaccard = query_minhash.jaccard(song_minhash)
            # Strict comparison: a zero-similarity song never replaces the
            # empty default, and the first of several equal scores is kept.
            if estimated_jaccard > max_sim:
                max_sim = estimated_jaccard
                max_name = song['name']

        return max_name

def get_shingles(text, shingle_size):
    """Return the set of distinct substrings of ``text`` of a given length.

    Every window of ``shingle_size`` consecutive characters is taken, moving
    one character at a time. When ``text`` is shorter than ``shingle_size``
    there is no full window and the result is an empty set.
    """
    window_count = len(text) - shingle_size + 1
    shingles = set()
    for start in range(window_count):
        shingles.add(text[start:start + shingle_size])
    return shingles

def create_minhash(shingles, num_perm=128):
    """Build a MinHash signature from an iterable of string shingles.

    Each shingle is UTF-8 encoded and fed into a fresh ``MinHash`` created
    with ``num_perm`` permutations; the populated object is returned.
    """
    signature = MinHash(num_perm=num_perm)
    encoded_shingles = (piece.encode('utf8') for piece in shingles)
    for payload in encoded_shingles:
        signature.update(payload)
    return signature