From 98a1a99ce7a777285a46a161a892ac92e8fdee79 Mon Sep 17 00:00:00 2001
From: skangasl
Date: Sun, 11 Feb 2024 19:22:13 -0500
Subject: [PATCH] fixed examples

---
 doc/source/conf.py                            |  15 +-
 examples/README.rst                           |   8 +
 examples/demo_util.py                         |  29 --
 examples/large_scale_demo.py                  | 344 ------------------
 ...uci_topic_script.py => plot_uci_topics.py} |  41 ++-
 5 files changed, 50 insertions(+), 387 deletions(-)
 create mode 100644 examples/README.rst
 delete mode 100644 examples/demo_util.py
 delete mode 100644 examples/large_scale_demo.py
 rename examples/{uci_topic_script.py => plot_uci_topics.py} (60%)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index b8872a1..45d43cf 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -41,13 +41,13 @@
     'sphinx.ext.githubpages',
     'sphinx.ext.mathjax', #'sphinx.ext.imgmath',
     'numpydoc.numpydoc',
-    # 'sphinx_gallery.gen_gallery',
+    'sphinx_gallery.gen_gallery',
 ]
 
-# sphinx_gallery_conf = {
-#     'examples_dirs': '../../examples',  # path to your example scripts
-#     'gallery_dirs': 'auto_examples',  # path to where to save gallery generated output
-# }
+sphinx_gallery_conf = {
+     'examples_dirs': '../../examples',  # path to your example scripts
+     'gallery_dirs': 'auto_examples',  # path to where to save gallery generated output
+ }
 
 
 # Add any paths that contain templates here, relative to this directory.
@@ -58,7 +58,7 @@
 # This pattern also affects html_static_path and html_extra_path.
 exclude_patterns = []
 
-# NumPy 
+# NumPy
 numpydoc_class_members_toctree = False
 numpydoc_show_class_members = True
 numpydoc_show_inherited_class_members = False
@@ -96,7 +96,7 @@
     'nav_links' : [('Install', 'install'),
                    ('User Guide', 'user_guide/index'),
                    ('API', 'modules/api'),
-                   # ('Examples', 'auto_examples/index')
+                   ('Examples', 'auto_examples/index')
                    ],
     # 'external_nav_links' : [('TensorLy', 'http://tensorly.org/dev')]
 }
@@ -108,4 +108,3 @@
 
 # Remove the permalinks ("¶" symbols)
 html_permalinks_icon = ""
-
diff --git a/examples/README.rst b/examples/README.rst
new file mode 100644
index 0000000..7a04b9f
--- /dev/null
+++ b/examples/README.rst
@@ -0,0 +1,8 @@
+Gallery of examples
+===================
+.. contents:: Contents
+    :local:
+    :depth: 1
+General examples
+----------------
+Examples using TLDA.
diff --git a/examples/demo_util.py b/examples/demo_util.py
deleted file mode 100644
index e1a549a..0000000
--- a/examples/demo_util.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from wordcloud import WordCloud, STOPWORDS
-import matplotlib.pyplot as plt
-
-def generate_top_words(topic_word_dist, words, order, num_tops, top_n):
-    '''helper function for visualizing top words in a wordcloud'''
-    cloud = WordCloud(stopwords=STOPWORDS,
-                      background_color='white',
-                      width=2500,
-                      height=1800,
-                      max_words=top_n,
-                      colormap='tab10')
-
-    fig, axes = plt.subplots(1, 2, figsize=(7, 7),
-                             sharey=True)
-
-    for i, ax in enumerate(axes.flatten()):
-        fig.add_subplot(ax)
-        if i < num_tops:
-            cloud.generate_from_frequencies(dict(zip(words, topic_word_dist[order[i], :])))
-            plt.gca().imshow(cloud)
-            plt.gca().set_title('Topic ' + str(order[i]), fontdict=dict(size=16))
-            plt.gca().axis('off')
-
-    plt.subplots_adjust(wspace=0, hspace=0)
-    plt.axis('off')
-    plt.margins(x=0, y=0)
-    plt.tight_layout()
-    plt.show()
-    return
\ No newline at end of file
diff --git a/examples/large_scale_demo.py b/examples/large_scale_demo.py
deleted file mode 100644
index b7cfdcb..0000000
--- a/examples/large_scale_demo.py
+++ /dev/null
@@ -1,344 +0,0 @@
-# basic imports
-import numpy as np
-import os
-from pathlib import Path
-import gc
-import pandas as pd
-import time
-import pickle
-
-
-# Import stopwords
-import nltk
-nltk.download('stopwords')
-from nltk.corpus import stopwords
-
-# Cuda imports
-import cupy as cp
-import cudf
-from cudf import Series
-from cuml.feature_extraction.text import CountVectorizer
-from cuml.preprocessing.text.stem import PorterStemmer
-
-# Import TensorLy
-import tensorly as tl
-
-# Import utility functions from other files
-from tlda.tlda_wrapper import TLDA
-from tlda.file_operations import get_files_in_dir
-
-# Root Filepath -- can modify
-ROOT_DIR = "/tlda/data"
-
-# Data Relative Paths -- can modify
-INDIR = "MeTooMonthCleaned/"
-
-# Output Relative paths -- do not change
-X_MAT_FILEPATH_PREFIX = "x_mat/"
-X_FILEPATH = "X_full.obj"
-X_DF_FILEPATH = "X_df.obj"
-X_LST_FILEPATH = "X_lst.obj"
-CORPUS_FILEPATH_PREFIX = "corpus/"
-GENSIM_CORPUS_FILEPATH = "corpus.obj"
-COUNTVECTOR_FILEPATH = "countvec.obj"
-TLDA_FILEPATH = "tlda.obj"
-VOCAB_FILEPATH = "vocab.csv"
-EXISTING_VOCAB_FILEPATH = "vocab.obj"
-TOPIC_FILEPATH_PREFIX = 'predicted_topics/'
-DOCUMENT_TOPIC_FILEPATH = 'dtm.csv'
-COHERENCE_FILEPATH = 'coherence.obj'
-DOCUMENT_TOPIC_FILEPATH_TOT = 'dtm_df.csv'
-OUT_ID_DATA_PREFIX = 'ids/'
-TOP_WORDS_FILEPATH ='top_words.csv'
-
-# Device settings
-backend="cupy"
-tl.set_backend(backend)
-device = 'cuda'
-porter = PorterStemmer()
-
-
-def basic_clean(df):
-    df['tweets'] = df['tweets'].astype('str')
-    df = df.drop_duplicates(keep="first")
-    return df
-
-
-def partial_fit(self , data):
-    if(hasattr(self , 'vocabulary_')):
-        vocab = self.vocabulary_ # series
-    else:
-        vocab = Series()
-    self.fit(data)
-    vocab = vocab.append(self.vocabulary_)
-    self.vocabulary_ = vocab.unique()
-
-
-# declare the stop words
-# potentially add extra stop words depending on the application dataset
-stop_words = (stopwords.words('english'))
-added_words = []
-
-# set stop words and countvectorizer method
-stop_words= list(np.append(stop_words,added_words))
-CountVectorizer.partial_fit = partial_fit
-
-# define function with no preprocessing
-def custom_preprocessor(doc):
-    return doc
-
-
-def fit_topics(num_tops, curr_dir, alpha_0 = 0.01, learning_rate = 0.0004, theta_param = 5.005, ortho_loss_param = 1000, smoothing = 1e-5,
-               initialize_first_docs = False, n_eigenvec = None):
-
-    # make final directories for outputs
-    save_dir = os.path.join(ROOT_DIR, curr_dir)
-    if not os.path.exists(save_dir):
-        os.makedirs(save_dir)
-
-    # initialize RAPIDS CountVectorizer
-    countvec = CountVectorizer( stop_words = stop_words,
-                                lowercase = True,
-                                ngram_range = (1, 2),
-                                preprocessor = custom_preprocessor,
-                                max_df = 0.5,
-                                min_df = 0.00125)
-
-    # set directory for saving CountVectorizer and TLDA
-    eigenvec_str = "_n_eigenvec_" + (str(n_eigenvec) if n_eigenvec is not None else "None")
-    exp_save_dir = os.path.join(save_dir, "num_tops_" + str(num_tops) + "_alpha0_" + str(alpha_0) + "_learning_rate_" + str(learning_rate) + "_theta_" + str(theta_param) + "_orthogonality_" + str(ortho_loss_param) + "_initialize_first_docs_" + str(initialize_first_docs) + eigenvec_str + "/")
-    if not os.path.exists(exp_save_dir):
-        os.makedirs(exp_save_dir)
-
-    # DEFAULT PARAMS -- Grid search according to dataset
-    batch_size_pca = 100000
-    batch_size_grad = 80000
-    n_iter_train = 200
-    n_iter_test = 10
-
-    #SET SEED
-    seed = 57
-
-    # Program controls -- decide which portions to run
-    if os.path.exists(save_dir + "/" + COUNTVECTOR_FILEPATH):
-        first_run = 1
-    vocab_build = first_run
-    save_files = first_run
-    stgd = 1
-    recover_top_words = 1
-
-    # Start
-    print("\n\nSTART...")
-
-    # Set files to read
-    inDir = os.path.join(ROOT_DIR, INDIR)
-    dl = sorted(get_files_in_dir(inDir))
-
-    # Build the vocabulary
-    if vocab_build == 1:
-        if not os.path.exists(save_dir + "/" + EXISTING_VOCAB_FILEPATH):
-            for i, f in enumerate(dl):
-                print("Beginning vocabulary build: " + f)
-                path_in = os.path.join(inDir,f)
-
-                mempool = cp.get_default_memory_pool()
-                mempool.free_all_blocks()
-                pinned_mempool = cp.get_default_pinned_memory_pool()
-                pinned_mempool.free_all_blocks()
-
-                # read in dataframe
-                df = pd.read_csv(path_in, names = ['tweets'])
-
-                # basic preprocessing
-                mask = df['tweets'].str.len() > 10
-                df = df.loc[mask]
-                df = cudf.from_pandas(df)
-                df = basic_clean(df)
-
-                mempool = cp.get_default_memory_pool()
-                mempool.free_all_blocks()
-                pinned_mempool = cp.get_default_pinned_memory_pool()
-                pinned_mempool.free_all_blocks()
-                gc.collect()
-
-                # add vocabulary from current file to CountVectorizer vocabulary
-                countvec.partial_fit(df['tweets'])
-                print("End " + f)
-
-                # count rows of data
-                num_data_rows += len(df.index)
-                print(num_data_rows)
-                print(len(df.index))
-        else:
-            countvec.vocabulary_ = countvec.vocabulary
-            vocab = len(countvec.vocabulary_)
-
-        # Save fitted CountVectorizer and vocabulary
-        pickle.dump(countvec, open(os.path.join(save_dir, COUNTVECTOR_FILEPATH), 'wb'))
-        vocab = len(countvec.vocabulary_)
-        df_voc = cudf.DataFrame({'words':countvec.vocabulary_})
-        df_voc.to_csv(save_dir + "/" + VOCAB_FILEPATH)
-        print("right after countvec partial fit vocab\n\n\n: ", vocab)
-
-    # make directories to save:
-    #   - X matrices
-    #   - corpus (only needed if computing coherence)
-    x_mat_dir = os.path.join(save_dir, X_MAT_FILEPATH_PREFIX)
-    if not os.path.exists(x_mat_dir):
-        os.makedirs(x_mat_dir)
-    corpus_dir = os.path.join(save_dir, CORPUS_FILEPATH_PREFIX)
-    if not os.path.exists(corpus_dir):
-        os.makedirs(corpus_dir)
-
-
-    # transform X matrices with fitted CountVectorizer and save to disk
-    transform_time = 0.0
-    if save_files == 1:
-        for f in dl:
-            print("Beginning CountVectorizer transform: " + f)
-            path_in = os.path.join(inDir,f)
-
-            mempool = cp.get_default_memory_pool()
-            mempool.free_all_blocks()
-            pinned_mempool = cp.get_default_pinned_memory_pool()
-            pinned_mempool.free_all_blocks()
-
-            # read in dataframe
-            df = pd.read_csv(path_in, names = ['tweets'])
-
-            # basic preprocessing
-            mask = df['tweets'].str.len() > 10
-            df = df.loc[mask]
-            df = cudf.from_pandas(df)
-            df = basic_clean(df)
-
-            mempool = cp.get_default_memory_pool()
-            mempool.free_all_blocks()
-            gc.collect()
-
-            # transform data from current file
-            t1 = time.time()
-            corpus = countvec.transform(df['tweets'])
-            t2 = time.time()
-            transform_time += t2 - t1
-            X_batch = tl.tensor(corpus.toarray())
-
-            # save current X matrix and corpus to disk
-            pickle.dump(
-                (X_batch),
-                open(x_mat_dir + Path(f).stem + '.obj','wb')
-            )
-            pickle.dump(
-                (corpus),
-                open(corpus_dir + Path(f).stem + '.obj','wb')
-            )
-            del X_batch
-            del corpus
-            print("End " + f)
-            del df
-            del mask
-
-            gc.collect()
-
-        print("Transform Time:" + str(transform_time))
-
-
-    # initialize TLDA using parameters from above
-    tlda = TLDA(
-        num_tops, alpha_0, n_iter_train, n_iter_test,learning_rate,
-        pca_batch_size = batch_size_pca, third_order_cumulant_batch = batch_size_grad,
-        gamma_shape = 1.0, smoothing = smoothing, theta=theta_param, ortho_loss_criterion = ortho_loss_param, random_seed = seed,
-        n_eigenvec = n_eigenvec,
-    )
-
-    tot_tlda_time = 0.0
-    if stgd == 1:
-        # keep track of iterations
-        i = 0
-
-        t1 = time.time()
-        for f in dl:
-            mempool = cp.get_default_memory_pool()
-            mempool.free_all_blocks()
-            pinned_mempool = cp.get_default_pinned_memory_pool()
-            pinned_mempool.free_all_blocks()
-
-            print("Beginning TLDA: " + f)
-
-            # load saved X matrix batch from disk
-            X_batch = pickle.load(
-                open(save_dir + X_MAT_FILEPATH_PREFIX + Path(f).stem + '.obj','rb')
-            )
-
-            mempool = cp.get_default_memory_pool()
-            mempool.free_all_blocks()
-            pinned_mempool = cp.get_default_pinned_memory_pool()
-            pinned_mempool.free_all_blocks()
-            gc.collect()
-
-
-            t3 = time.time()
-            # fit tensor LDA fully online
-            if initialize_first_docs and i == 0:
-                # fully fit tensor LDA on first batch
-                tlda.fit(X_batch)
-            else:
-                # partial fit tensor LDA on remaining batches
-                tlda.partial_fit_online(X_batch)
-
-            t4 = time.time()
-            print("New fit time" + str(t4-t3))
-            tot_tlda_time += t4-t3
-
-            del X_batch
-            gc.collect()
-            mempool = cp.get_default_memory_pool()
-            mempool.free_all_blocks()
-            pinned_mempool = cp.get_default_pinned_memory_pool()
-            pinned_mempool.free_all_blocks()
-
-            i += 1
-    else:
-        tlda = pickle.load(open(exp_save_dir + TLDA_FILEPATH,'rb'))
-
-
-    # save top words in each topic
-    if recover_top_words == 1:
-        n_top_words = 100
-
-        top_words_df = cudf.DataFrame({})
-        for k in range(0,num_tops):
-            t_n_indices = tlda.unwhitened_factors_[:,k].argsort()[:-n_top_words - 1:-1]
-            top_words_LDA = countvec.vocabulary_[t_n_indices]
-            top_words_df['words_'+str(k)] = top_words_LDA.reset_index(drop=True)
-
-
-        top_words_df.to_csv(exp_save_dir + TOP_WORDS_FILEPATH)
-        del top_words_df
-
-        gc.collect()
-        mempool = cp.get_default_memory_pool()
-        mempool.free_all_blocks()
-        pinned_mempool = cp.get_default_pinned_memory_pool()
-        pinned_mempool.free_all_blocks()
-
-
-def main():
-    curr_dir = "metoo_evaluation_initialized_paper_exps/"
-
-    # set parameters
-    num_tops = 10
-    alpha_0 = 0.01
-    lr = 0.0001
-    pca_dim = 40
-
-    # run method to fit topics and save top words in each topic
-    fit_topics(
-        num_tops = num_tops,
-        curr_dir = curr_dir,
-        alpha_0 = alpha_0,
-        learning_rate = lr,
-        n_eigenvec = pca_dim
-    )
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/uci_topic_script.py b/examples/plot_uci_topics.py
similarity index 60%
rename from examples/uci_topic_script.py
rename to examples/plot_uci_topics.py
index c6618d8..410fff6 100644
--- a/examples/uci_topic_script.py
+++ b/examples/plot_uci_topics.py
@@ -1,3 +1,10 @@
+"""
+Fitting TLDA on UCI-Newsgroups
+==============================
+
+In this example, we show how to run TLDA on a subset of the UCI 20 Newsgroups dataset.
+"""
+
 import numpy as np
 import nltk
 nltk.download('stopwords')
@@ -6,7 +13,8 @@
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.datasets import fetch_20newsgroups
 
-from examples.demo_util import generate_top_words
+from wordcloud import WordCloud
+import matplotlib.pyplot as plt
 
 # Import TensorLy
 import tensorly as tl
@@ -48,8 +56,29 @@
 print("Creating image to display fitted topics")
 
 # Generate a wordcloud from the topics
-generate_top_words(
-    tlda.unwhitened_factors.T,
-    vocab,
-    np.argsort(tlda.weights_),
-    k, 25)
+
+topic_order = np.argsort(tlda.weights_)
+
+cloud = WordCloud(stopwords=stop_words,
+                  background_color='white',
+                  width=1000*k,
+                  height=1000,
+                  max_words=25,
+                  colormap='tab10')
+
+fig, axes = plt.subplots(1, 2, figsize=(7, 7),
+                         sharey=True)
+
+for i, ax in enumerate(axes.flatten()):
+    fig.add_subplot(ax)
+    if i < k:
+        cloud.generate_from_frequencies(dict(zip(vocab, tlda.unwhitened_factors.T[topic_order[i], :])))
+        plt.gca().imshow(cloud)
+        plt.gca().set_title('Topic ' + str(topic_order[i]), fontdict=dict(size=16))
+        plt.gca().axis('off')
+
+plt.subplots_adjust(wspace=0, hspace=0)
+plt.axis('off')
+plt.margins(x=0, y=0)
+plt.tight_layout()
+plt.show()