# wordCloudGenerator.py

import PyPDF2
import nltk
from wordcloud import WordCloud
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        One string holding the extracted text of all pages, in page order,
        joined without any separator.
    """
    # The extraction stays inside the ``with`` block so the file handle is
    # still open while the pages are being processed.
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Iterate the pages directly and join once instead of indexing by
        # page number and growing the string with repeated ``+=``.
        return ''.join(page.extract_text() for page in reader.pages)


import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

def create_word_cloud(text, *, count_threshold=5):
    """Print a frequency table of the most common meaningful words in *text*.

    Despite its name, this function does not render a word-cloud image: it
    tokenizes the text, discards noise tokens, counts what is left and prints
    the result as a pandas table sorted by descending count.

    A token is kept when its lowercase form is not in NLTK's English
    stop-word list, it is at least three characters long, and it is not
    purely numeric.

    Args:
        text: Raw text to analyse.
        count_threshold: Only words occurring strictly more than this many
            times are printed. Defaults to 5.

    Returns:
        None. The table is written to standard output.
    """
    stop_words = set(stopwords.words('english'))

    # Lowercase *before* the stop-word test: NLTK's English stop-word list is
    # lowercase, so testing the raw token would let capitalised stop words
    # such as "The" or "And" slip through and be counted.
    lowered_tokens = (token.lower() for token in word_tokenize(text))
    word_counts = Counter(
        word
        for word in lowered_tokens
        if word not in stop_words and len(word) >= 3 and not word.isnumeric()
    )

    # Build the frame from (word, count) pairs with explicit column names so
    # that an empty counter yields an empty table instead of an unpacking
    # error; the cast keeps 'Counts' numeric even when there are no rows.
    df = pd.DataFrame(list(word_counts.items()), columns=['Words', 'Counts'])
    df = df.astype({'Counts': 'int64'})

    # Most frequent words first, then keep only those above the threshold.
    df = df.sort_values(by='Counts', ascending=False)
    print(df.loc[df['Counts'] > count_threshold])


def main():
    """Run the word-frequency analysis on the hard-coded PDF file."""
    pdf_path = 'intro_to_crypto_merged_lectures.pdf'  # Path to your pdf file
    create_word_cloud(extract_text_from_pdf(pdf_path))

# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()