通过 Tkinter GUI 中的用户输入来确定在 pandas 数据表中显示多少个单词



到目前为止,我的代码能够统计PDF中所有非停用词和符号的出现频率,但我希望由用户来决定数据表中显示多少个单词,而不是全部显示。例如,如果PDF中有137个不重复的单词,而用户只想看到50个,那么代码只显示PDF中最常见的50个单词。

程序中的“运行”按钮用于在PDF加载后执行代码,因此我决定把用户输入的数字与运行程序的按钮关联起来。我也尝试过 get 操作,但同样没有成功。

import os
import PyPDF2
import pandas
import webbrowser
import tkinter as tk
from tkinter import ttk
from tkinter import filedialog
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
#----Functions----#
#Method that a PDF that is read into the program goes through to eliminate any unwanted words or symbols#
def preprocess(text):
    """Normalize raw text into a list of lowercase, lemmatized word tokens.

    Steps: strip punctuation by keeping only runs of word characters,
    lowercase everything, drop non-alphabetic tokens and English stop
    words, reduce each word to its verb lemma, then re-tokenize.

    Args:
        text: The raw text to clean (e.g. the text extracted from a PDF).

    Returns:
        A list of token strings ready for frequency counting.
    """
    # `\w+` keeps runs of word characters and drops punctuation. The pattern
    # previously read 'w+', which only matched runs of the letter "w".
    tokenizer = RegexpTokenizer(r'\w+')
    words = [token.lower() for token in tokenizer.tokenize(text)]
    # Build the stop-word set once so each membership test is O(1).
    stop_words = set(stopwords.words("english"))
    # Keep purely alphabetic tokens that are not stop words.
    filtered_sent = [w for w in words if w.isalpha() and w not in stop_words]
    # NOTE: a Porter-stemming pass used to run here, but its result was never
    # used, so it has been removed; the output is built from lemmas only.
    lem = WordNetLemmatizer()
    # The earlier `lemmatize(w, 'n') and lemmatize(w, 'v')` expression always
    # evaluated to the verb lemma (a non-empty string is truthy), so the verb
    # lemma is requested directly.
    lemmatized_words = ' '.join(lem.lemmatize(w, 'v') for w in filtered_sent)
    # Re-tokenize the lemmatized string into the final token list.
    return word_tokenize(lemmatized_words)
#Wraps two functions inside an object which allows both functions to use filename#
class PDFSelector:
    """Share the chosen PDF path between the "Select File" and "Run" callbacks."""

    def __init__(self):
        # Path of the PDF chosen by the user; empty until a file is selected.
        self.filename = ''

    def select_PDF(self):
        """Open a file dialog and remember the path the user picks."""
        # Shows folders plus PDF files by default; "all files" is also offered.
        self.filename = filedialog.askopenfilename(initialdir = "/", title = "Select file", filetypes = (("pdf files", "*.pdf"), ("all files", "*.*")))

    def run_program(self):
        """Count word frequencies in the selected PDF and show them in a browser.

        Extracts the text of every page, cleans it with ``preprocess``, keeps
        the N most common words (N is read from the module-level ``lbl2a``
        entry), writes them to ``word_frequency.html`` and opens that file.
        Does nothing when no file has been selected yet.
        """
        # Nothing to do if the dialog was cancelled or never opened.
        if not self.filename:
            return
        # The context manager guarantees the PDF handle is closed afterwards.
        with open(self.filename, 'rb') as PDF_file:
            read_pdf = PyPDF2.PdfFileReader(PDF_file)
            number_of_pages = read_pdf.getNumPages()
            # Read page i on each iteration (previously page 0 was re-read
            # for every page, so only the first page was ever counted).
            doc_content = "".join(
                read_pdf.getPage(i).extractText() for i in range(number_of_pages)
            )
        # Turn the raw text into cleaned, lemmatized tokens.
        tokenized_words = preprocess(doc_content)
        # Determine the frequency of each token.
        from nltk.probability import FreqDist
        fdist = FreqDist(tokenized_words)
        # Expects `lbl2a` to be the module-level Entry widget; its value is
        # read as a string, hence the int() conversion.
        final_list = fdist.most_common(int(lbl2a.get()))
        # Organize the data into two columns and export it to an HTML file
        # that is opened automatically.
        df = pandas.DataFrame(final_list, columns = ["Word", "Frequency"])
        df.to_html('word_frequency.html')
        webbrowser.open('file://' + os.path.realpath('word_frequency.html'))
#----Main----#
#Creates an instance of the wrapped functions to use the GUI#        
selector = PDFSelector()
# Create the GUI that will be used to select inputs.
window = tk.Tk()
window.geometry("375x130")
window.resizable(0, 0)
window.title("Word Frequency Program")
# Blank spacer labels, purely cosmetic: one across row 0 (columns 0-4) and
# one down column 0 (rows 1-4) to pad the layout.
for filler_column in range(5):
    tk.Label(window, text = "   ").grid(row = 0, column = filler_column)
for filler_row in range(1, 5):
    tk.Label(window, text = "   ").grid(row = filler_row, column = 0)
# Static label (the selected file name itself is not displayed yet).
tk.Label(window, text = "File Selected: ").grid(row = 1, column = 1)
# Label and entry asking how many words to show in the data table.
tk.Label(window, text = "Number of Words: ").grid(row = 2, column = 1)
# .grid() returns None, so the Entry must be created and placed in two
# separate statements; otherwise `lbl2a` would be None and `lbl2a.get()`
# in run_program would fail.
lbl2a = tk.Entry(window)
lbl2a.grid(row = 2, column = 2)
# Calls select_PDF to choose a PDF for the program to read.
ttk.Button(window, text = "Select File", command = selector.select_PDF).grid(row = 1, column = 4)
# Button that makes the program execute.
ttk.Button(window, text = "Run", command = selector.run_program).grid(row = 2, column = 4)
# Leaves the main loop when clicked.
ttk.Button(window, text = "Quit", command = window.quit).grid(row = 3, column = 2)
window.mainloop()
try:
    window.destroy()
except tk.TclError:
    # The window was already destroyed (e.g. closed via the title bar), so
    # there is nothing left to tear down.
    pass

数据表应按用户在GUI中输入的数量显示单词,以及这些单词在PDF中的出现频率。

我只需要对两处独立的代码做两个小修改。第一处:

# Before: chaining .grid() binds its return value (None) to lbl2a, not the Entry.
lbl2 = tk.Label(window, text = "Number of Words: ").grid(row = 2, column = 1)
lbl2a = tk.Entry(window).grid(row = 2, column = 2)

改成了下面这样。因为 grid() 返回 None,所以我必须把创建控件和调用 grid() 分成两步,这样 Entry 才不会被识别为 None:

lbl2 = tk.Label(window, text = "Number of Words: ").grid(row = 2, column = 1)
# Create the Entry first, then place it, so user_input refers to the widget itself.
user_input = tk.Entry(window)
user_input.grid(row = 2, column = 2)

还有这段代码:

# Before: the import sits inside run_program, and lbl2a is None, so .get() fails.
from nltk.probability import FreqDist
fdist = FreqDist(tokenized_words)
final_list = fdist.most_common(int(lbl2a.get()))

改成了下面这样,因为我把 import 移到了文件顶部,并把 lbl2a 重命名为 user_input。另外,程序把用户输入的值识别为字符串,因此我在 run_program 方法中将其转换为 int:

# The entry's value is read as a string, so it is converted to int before use.
fdist = FreqDist(tokenized_words)
final_list = fdist.most_common(int(user_input.get()))

这个答案来自 @furas,我只是把它贴出来,以防其他人遇到同样的问题。

最新更新