多标签文本分类。我有一个包含文本/标签的 CSV 文件:文本是纯文本,标签是字母数字编码。


    import numpy as np
    import pandas as pd

    import keras
    import keras.backend as K
    from keras.layers import (BatchNormalization, Conv1D, Dense, Dropout,
                              Embedding, Flatten, Input, MaxPooling1D)
    from keras.layers.core import Activation
    from keras.models import Sequential
    from keras.optimizers import Adam
    from keras.preprocessing.sequence import pad_sequences  # zero-pad sequences to equal length
    from keras.preprocessing.text import Tokenizer          # text tokenization
    from keras.utils import to_categorical                  # one-hot encoding of labels
    from sklearn.model_selection import train_test_split

    # --- Configuration ---
    MAX_SEQUENCE_LENGTH = 300   # pad/truncate every sequence to this length
    MAX_NB_WORDS = 20000        # vocabulary cap for the tokenizer

    # --- Load the data ---
    raw_data = pd.read_csv("/home/riaz.k/Desktop/TRAIN.csv")

    # Split into training (70%) and testing (30%) sets.
    train, test = train_test_split(raw_data, test_size=0.3)

    # Fit the tokenizer on the TRAINING texts only, then convert both
    # splits to integer sequences; fitting on train alone avoids leaking
    # test-set vocabulary statistics.
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(train.Procedure)
    train_sequences = tokenizer.texts_to_sequences(train.Procedure)
    test_sequences = tokenizer.texts_to_sequences(test.Procedure)

    # Dict mapping each word to its integer index.
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    # Zero-pad all sequences to a fixed length so they can be batched.
    train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print(train_data.shape)
    print(test_data.shape)
    # --- Encode the string class labels as integers ---
    train_labels = train['dxcode']
    test_labels = test['dxcode']

    from sklearn.preprocessing import LabelEncoder

    # BUG FIX: the original called le.fit(train_labels) and then
    # le.fit(test_labels); the second fit() FORGETS the first, so the
    # encoder only knew the test-set labels and transform(train_labels)
    # raised on unseen values.  Fit once on the union of both splits so
    # every label appearing anywhere gets an index.
    le = LabelEncoder()
    le.fit(train_labels.tolist() + test_labels.tolist())
    train_labels = le.transform(train_labels)
    test_labels = le.transform(test_labels)
    print(le.classes_)
    print(np.unique(train_labels, return_counts=True))
    print(np.unique(test_labels, return_counts=True))
    # inverse_transform expects an array-like, not a bare scalar.
    print(le.inverse_transform([1]))

    # One-hot encode the integer labels for categorical_crossentropy.
    labels_train = to_categorical(np.asarray(train_labels))
    labels_test = to_categorical(np.asarray(test_labels))
    print('Shape of data tensor:', train_data.shape)
    print('Shape of label tensor:', labels_train.shape)
    print('Shape of label tensor:', labels_test.shape)
    # --- Build and train a 1-D CNN text classifier ---
    EMBEDDING_DIM = 100
    # Derive the output width from the one-hot label tensor instead of
    # hard-coding 23: if the label set changes, Dense(23) would silently
    # mismatch the targets and training would fail.
    num_classes = labels_train.shape[1]
    print(MAX_SEQUENCE_LENGTH)
    print('Training model.')

    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS,
                        EMBEDDING_DIM,
                        input_length=MAX_SEQUENCE_LENGTH))
    model.add(Dropout(0.2))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    # Softmax head: one mutually-exclusive class per document.  For true
    # MULTI-label output (several labels per document) this would need
    # activation='sigmoid' with loss='binary_crossentropy' instead.
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])

    model.fit(train_data, labels_train,
              batch_size=32,
              epochs=10,
              validation_data=(test_data, labels_test))
    model.evaluate(test_data, labels_test)
    pred = model.predict(test_data)
    # --- Inspect layers and build the submissions file ---
    for layer in model.layers:
        print(layer)

    # Backend function exposing the embedding layer's output
    # (defined for inspection; not used below).
    emd = K.function(inputs=[model.layers[0].input],
                     outputs=[model.layers[0].output])

    # Stack train + test rows back together for a full-dataset prediction.
    rbind = np.concatenate((train_data, test_data), axis=0)
    print(rbind.shape)

    # predict_classes() was removed from modern Keras; argmax over the
    # softmax probabilities is the equivalent.
    pred_indices = np.argmax(model.predict(rbind), axis=1)
    predicted_labels = le.inverse_transform(pred_indices)

    # BUG FIX: rbind is ordered train-rows-then-test-rows — a shuffled
    # permutation of raw_data — so the ids must be taken in that SAME
    # order; using raw_data['Claimno'] directly would misalign every id
    # with its prediction.
    ids = pd.concat([train['Claimno'], test['Claimno']], ignore_index=True)
    submissions_CNN = pd.DataFrame({'id': ids, "label": predicted_labels})
    submissions_CNN.to_csv("/home/riaz.k/Desktop/submissions.csv", index=False)

文本文档可以用多个标签标记,那么我该如何在此数据集上进行多标签分类?我已经阅读了Sklearn中的很多文档,但是我似乎找不到正确的多标签分类方法。预先感谢您的任何帮助。

您是否在此行上获得错误:

train_labels = le.transform(train_labels)

如果是,那是因为在它上面的一行中,您执行了:

le.fit(test_labels)

这样做会使编码器忘记之前的数据(即上一行对 fit() 的调用所学到的标签),只记住 test_labels 中的数据。因此:

您需要将下面这两行:

le.fit(train_labels)
le.fit(test_labels)

替换为这一行:

# I am using .tolist() because I observe that your 
# train_labels, test_labels are pandas Series objects
le.fit(train_labels.tolist() + test_labels.tolist())

最新更新