from typing import Any, Dict, Iterable, Optional

from sklearn.feature_extraction.text import TfidfVectorizer
import wordcloud
from pydantic import BaseModel, Field
import numpy as np
import PIL
import PIL.Image  # load the submodule explicitly; the return annotation below needs it


class WordCloudExtractor(BaseModel):
    """Build a word-cloud image from a corpus, weighting words by mean TF-IDF."""

    # Upper bound on the number of distinct words kept (passed to the
    # vectorizer as ``max_features``).
    max_words: int = 50
    # Extra keyword arguments forwarded verbatim to ``wordcloud.WordCloud``.
    wordcloud_params: Dict[str, Any] = Field(default_factory=dict)
    # Extra keyword arguments forwarded verbatim to ``TfidfVectorizer``.
    tfidf_params: Dict[str, Any] = Field(
        default_factory=lambda: {"stop_words": "english"}
    )

    def extract_wordcloud_image(self, texts: Iterable[str]) -> PIL.Image.Image:
        """Render a word cloud for ``texts``.

        Args:
            texts: Text documents making up the corpus.

        Returns:
            The image produced by ``WordCloud.to_image()`` from the
            TF-IDF-derived frequencies.
        """
        frequencies = self._extract_frequencies(
            texts,
            self.max_words,
            tfidf_params=self.tfidf_params,
        )
        cloud = wordcloud.WordCloud(**self.wordcloud_params)
        cloud = cloud.generate_from_frequencies(frequencies)
        return cloud.to_image()

    @classmethod
    def _extract_frequencies(
        cls,
        texts: Iterable[str],
        max_words: int = 100,
        tfidf_params: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, float]:
        """Compute per-word weights for a corpus using TF-IDF vectorization.

        Args:
            texts: Text documents making up the corpus.
            max_words: Maximum number of words to include; passed to the
                vectorizer as ``max_features``.
            tfidf_params: Additional keyword arguments for
                ``TfidfVectorizer``. ``None`` (the default) means no extra
                arguments. Must not contain ``max_features``, since that
                keyword is already supplied from ``max_words`` and passing
                it twice raises ``TypeError``.

        Returns:
            Mapping of each selected word to its mean TF-IDF score across
            all documents, suitable for
            ``WordCloud.generate_from_frequencies``.
        """
        # Use None as the default instead of a shared mutable ``{}``.
        extra_params = {} if tfidf_params is None else tfidf_params

        vectorizer = TfidfVectorizer(max_features=max_words, **extra_params)

        # Rows are documents, columns are vocabulary terms.
        tfidf_matrix = vectorizer.fit_transform(texts)
        feature_names = vectorizer.get_feature_names_out()

        # Averaging over axis 0 yields a 1 x n_terms result; flatten it so
        # it lines up with ``feature_names``.
        mean_tfidf = np.array(tfidf_matrix.mean(axis=0)).flatten()

        return dict(zip(feature_names, mean_tfidf))