筛选数据框中的每一行并手动对其进行分类



有人能推荐一种有效的方法来筛选数据框中的每一行并手动分类吗?例如,我可能想要从电子邮件中分离垃圾邮件,或筛选招聘广告,求职者,或约会机构的资料(我知道Tinder通过让你向左或向右滑动来做到这一点)。

我的数据集足够小,可以手动分类。我想如果它更大,我可能只想手动分类其中的一部分,以便训练机器学习算法(如朴素贝叶斯)为我完成任务。

我将向您展示我目前的成果,但这并不是一个特别新颖的任务,所以一定有一种不那么粗糙的方法可以做到这一点,有人已经想到了!(作为一个新手,R的强大给我留下了深刻的印象,但当清理屏幕或捕捉击键这样的小任务变得非常重要时,我也感到困惑)

# Let us suppose I am using this built-in dataset to draw up a
# shortlist of where I might wish to go on holiday
df <- data.frame(state.x77);
# pp - define a task-specific pretty print function
pp <- function(row) {
    print(row); # Example dataset is simple enough to just print the entire row
}
# cls - clear the screen (this hack works on Windows but I've commented it for now)
cls <- function() {
    #system("powershell -ExecutionPolicy Bypass -command (New-Object -ComObject Wscript.Shell).SendKeys([string][char]12)");
}
# It would halve the number of keystrokes needed if I knew a way to read
# a single character
readcharacter <- readline;
sift <- function(df, pp)
{
    classification = rep('', nrow(df));
    for (nRow in 1:nrow(df))
    {
        cls();
        pp(df[nRow,]);
        cat("nEnter 'a' to discard, 'd' to keep, 'q' to quitn");
        char <- '';
        while (char != 'a' && char != 'd' && char != 'q') {
            char <- readcharacter();
        }
        if (char == 'q')
            break;
        classification[nRow] = char;
    }
    return(cbind(df,classification=classification));
}
result = sift(df, pp);
cls();
cat("Shortlist:n");
print(row.names(result[result$classification=='d',]));

那么StackOverflow社区对我使用这个闪亮的应用程序来解决我的问题有什么感觉呢?我不希望看到Shiny在数据分析的早期使用-通常只有当我们有一些我们想要探索或动态呈现的结果时,它才会发挥作用。

Learning Shiny很有趣也很有用,但我更希望能找到一个不那么复杂的答案。

library(shiny);
#
# shortlist - function that allows us to shortlist through the rows in a data frame efficiently
#
shortlist <- function(df, sTitle, sRowName) {
    createUI <- function() {
        listHeading <- list(
                    textOutput(outputId = "Progress"),
                    tags$br(),
                    fluidRow(
                        column(width=1, sRowName),
                        column(width=9, textOutput(outputId = "RowName"))));
        listFields <- lapply(names(df), function(sFieldname) {
            return(fluidRow(
                column(width=1, sFieldname),
                column(width=9, textOutput(outputId = sFieldname))));
        });
        listInputs <- list(
                    tags$br(),
                    tags$table(
                        tags$tr(
                            tags$td(" "),
                            tags$td(actionButton(inputId="Up", label="W", disabled=TRUE, width="100%"))),
                        tags$tr(
                            tags$td(width="100px", actionButton(inputId="Discard", label="Discard, A", width="100%")),
                            tags$td(width="100px", actionButton(inputId="Down", label="S", disabled=TRUE, width="100%")),
                            tags$td(width="100px", actionButton(inputId="Keep", label="Keep, D", width="100%")))),
                        tags$script("
                            // JavaScript implemented keyboard shortcuts, including lots of conditions to
                            // ensure we're finished processing one keystroke before we start the next.
                            var bReady = false;
                            $(document).on('shiny:recalculating', function(event) {
                                bReady = false;
                            });
                            $(document).on('shiny:recalculated', function(event) {
                                setTimeout(function() {bReady = true;}, 500);
                            });
                            $(document).on('keypress', function(event) {
                                if (bReady) {
                                    switch(event.key.toLowerCase()) {
                                    case 'a':
                                        document.getElementById('Discard').click();
                                        bReady = false;
                                        break;
                                    case 'd':
                                        document.getElementById('Keep').click();
                                        bReady = false;
                                        break;
                                    }
                                }
                            });
                            // End of JavaScript
                        "));
        listPanel <- list(
                    title = sTitle,
                    tags$br(),
                    conditionalPanel(
                        condition = paste("input.Keep + input.Discard <", nrow(df)),
                        append(append(listHeading, listFields), listInputs)));
        listShortlist <- list(
                    tags$hr(),
                    tags$h4("Shortlist:"),
                    dataTableOutput(outputId="Shortlist"));
        ui <- do.call(fluidPage, append(listPanel, listShortlist));
        return(ui);
    }
    app <- shinyApp(ui = createUI(), server = function(input, output) {
        classification <- rep('', nrow(df));
        getRow <- reactive({
            return (input$Keep + input$Discard + 1);
        });
        classifyRow <- function(nRow, char) {
            if (nRow <= nrow(df)) {
                classification[nRow] <<- char;
            }
            # In interactive mode, automatically stop the app when we're finished
            if ( interactive() && nRow >= nrow(df) ) {
                stopApp(classification);
            }
        }
        observeEvent(input$Discard, {classifyRow(getRow() - 1, 'a')});
        observeEvent(input$Keep,    {classifyRow(getRow() - 1, 'd')});
        output$Progress = renderText({paste("Showing record", getRow(), "of", nrow(df))});
        output$RowName  = renderText({row.names(df)[getRow()]});
        lapply(names(df), function(sFieldname) {
            output[[sFieldname]] <- renderText({df[getRow(), sFieldname]});
        });
        output$Shortlist <- renderDataTable(options = list(paging = FALSE, searching = FALSE), {
            # Mention the 'keep' input to ensure this code is called when the 'keep' button
            # is pressed.  That way the shortlist gets updated when an item to be added to it.
            dummy <- input$Keep;
            # Construct the shortlist
            shortlist <- data.frame(row.names(df[classification == 'd',]));
            colnames(shortlist) <- sRowName;
            return(shortlist);
        });
    });
    if (interactive()) {
        classification <- runApp(app);
        return(cbind(df, classification = classification));
    } else {
        return(app);
    }
}
#
# And now some example code.
# Shortlist the built in state.x77 data set (let us suppose I am drawing up
# a shortlist of where I might wish to go on holiday)
#
df <- data.frame(state.x77);
result <- shortlist(df = df, "Choose states", "State");
if (interactive()) {
    cat("Shortlist:n");
    print(row.names(result[result$classification == 'd',]));
} else {
    return (result);
}

最新更新