import keras
import keras.backend as K
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense
from keras.layers.core import Activation
# for tokenizing text
from keras.preprocessing.text import Tokenizer
# for padding sentences with zeros, to make the sentence length the same
from keras.preprocessing.sequence import pad_sequences
# for one-hot encoding of the labels
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Sequence length used when padding, and the vocabulary-size cap handed to the
# tokenizer and the embedding layer.
MAX_SEQUENCE_LENGTH = 300
MAX_NB_WORDS = 20000

# Load the dataset from CSV.
raw_data = pd.read_csv("/home/riaz.k/Desktop/TRAIN.csv")
raw_data.head()

# Split the rows into a training part and a 30% held-out testing part.
split_frames = train_test_split(raw_data, test_size=0.3)
train, test = split_frames
train.head()
test.head()
# Build the vocabulary from the training texts only; num_words caps how many
# words texts_to_sequences will keep.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train.Procedure)

# Turn each text of the `Procedure` column into a list of word indices.
train_sequences = tokenizer.texts_to_sequences(train.Procedure)
test_sequences = tokenizer.texts_to_sequences(test.Procedure)

# Dictionary containing words and their index.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries so both splits
# can be fed to the network as rectangular arrays.
train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)  # train
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)  # test
print(train_data.shape)
print(test_data.shape)
print(word_index)
# Target column of each split.
train_labels = train['dxcode']
test_labels = test['dxcode']
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

# LabelEncoder converts the character array to a numeric array by assigning an
# integer level to each unique label.
le = LabelEncoder()
# Every call to fit() discards what a previous fit() learned, so fitting on the
# train labels and then on the test labels leaves the encoder knowing only the
# test labels. Fit once on the labels of both splits instead.
le.fit(pd.concat([train_labels, test_labels]))
train_labels = le.transform(train_labels)
test_labels = le.transform(test_labels)
print(le.classes_)
print(np.unique(train_labels, return_counts=True))
print(np.unique(test_labels, return_counts=True))
# inverse_transform expects an array-like of encoded labels, not a scalar.
le.inverse_transform([1])

# One-hot encode with an explicit class count so both matrices have the same
# number of columns even when a split is missing the highest-numbered class.
num_classes = len(le.classes_)
labels_train = to_categorical(np.asarray(train_labels), num_classes=num_classes)
labels_test = to_categorical(np.asarray(test_labels), num_classes=num_classes)
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', labels_train.shape)
print('Shape of label tensor:', labels_test.shape)
# Size of each learned word vector.
EMBEDDING_DIM = 100
print(MAX_SEQUENCE_LENGTH)
print('Training model.')

# Embedding -> two Conv1D/MaxPooling1D stages (each followed by dropout and
# batch normalisation) -> dense classifier head.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS,
                    EMBEDDING_DIM,
                    input_length=MAX_SEQUENCE_LENGTH))
model.add(Dropout(0.2))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(128, activation='relu'))
# The softmax layer needs one unit per column of the one-hot label matrix;
# reading the width from the labels avoids a hard-coded class count that breaks
# as soon as the dataset has a different number of classes.
model.add(Dense(labels_train.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

# The held-out split is used as validation data during training.
model.fit(train_data, labels_train,
          batch_size=32,
          epochs=10,
          validation_data=(test_data, labels_test))
# Loss and accuracy ('acc') of the trained model on the held-out split.
model.evaluate(test_data, labels_test)
# Softmax outputs for every held-out row.
pred = model.predict(test_data)

# List the layers that make up the model.
for layer in model.layers:
    print(layer)

import keras.backend as K
# Backend function mapping the input of the first layer (the Embedding layer)
# to that layer's output.
emd = K.function(inputs=[model.layers[0].input],
                 outputs=[model.layers[0].output])
# Stack the padded train rows on top of the padded test rows.
rbind = np.concatenate((train_data, test_data), axis=0)
print(rbind.shape)

### Submissions file
# Sequential.predict_classes is no longer available in current Keras; taking
# the argmax over the softmax outputs gives the same class indices.
test_results = np.argmax(model.predict(rbind), axis=-1)
test_labels = le.inverse_transform(test_results)

# The rows of `rbind` are in shuffled train-then-test order, not in the order of
# raw_data, so the ids must be gathered in that same order to stay paired with
# their predictions. Sorting by the original row index afterwards restores the
# row order of the input file.
claim_ids = pd.concat([train['Claimno'], test['Claimno']])
submissions_CNN = pd.DataFrame(
    {'id': claim_ids.values, "label": test_labels},
    index=claim_ids.index,
).sort_index()
submissions_CNN.to_csv("/home/riaz.k/Desktop/submissions.csv", index=False)
每个文本文档可以带有多个标签,那么我该如何在此数据集上进行多标签分类?我已经阅读了 scikit-learn 的很多文档,但似乎仍找不到进行多标签分类的正确方法。预先感谢您的任何帮助。
您是否在下面这一行遇到了错误:
train_labels = le.transform(train_labels)
如果是,那是因为在它的上一行,您执行了:
le.fit(test_labels)
这样做会丢弃先前拟合的数据(即再上一行对 fit() 的调用结果),而只记住 test_labels 中的数据。
因此,您需要替换下面这两行:
le.fit(train_labels)
le.fit(test_labels)
替换为如下代码:
# I am using .tolist() because I observe that your
# train_labels, test_labels are pandas Series objects
le.fit(train_labels.tolist() + test_labels.tolist())