Writing a multi-feature text classifier in sklearn



I am new to sklearn. I wrote a text classifier with the help of this link: http://nbviewer.ipython.org/gist/rjweiss/7158866

In that example there is only a single feature, and the following code works fine -

    import csv
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB

    add = open(file)
    add_data = []
    add_labels = []
    csv_reader = csv.reader(add)
    for line in csv_reader:
        add_labels.append(int(line[0]))
        add_data.append(line[1])
    add.close()
    # first 75% of the rows for training, the rest for testing
    trainset_size = int(round(len(add_data)*0.75))
    X_train = []
    y_train = []
    count = 0
    while count<trainset_size:
        X_train.append(add_data[count])
        y_train.append(add_labels[count])
        count = count + 1
    X_test = []
    y_test = []
    while count<len(add_data):
        X_test.append(add_data[count])
        y_test.append(add_labels[count])
        count = count + 1
    vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 1), stop_words='english', strip_accents='unicode', norm='l2')
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    nb_classifier = MultinomialNB().fit(X_train, y_train)
    #nb_classifier = LinearSVC().fit(X_train, y_train)
    #nb_classifier = LogisticRegression().fit(X_train, y_train)
    y_nb_predicted = nb_classifier.predict(X_test)

Now I want each training entry to have two features, so I wrote the following code -

    add = open(file)
    add_data = []
    add_labels = []
    add_data_2 = []
    csv_reader = csv.reader(add)
    for line in csv_reader:
        add_labels.append(int(line[0]))
        add_data_2.append(str(line[1]))
        add_data.append(str(line[2]))
    add.close()
    trainset_size = int(round(len(add_data)*0.75))
    X_XX_train = []
    XX_train = []
    X_train = []
    y_train = []
    count = 0
    while count<trainset_size:
        X_train.append(add_data[count])
        XX_train.append(add_data_2[count])
        X_XX_train.append([add_data[count], add_data_2[count]])
        y_train.append(add_labels[count])
        count = count + 1

    X_XX_test = []
    X_test = []
    XX_test = []
    y_test = []
    while count<len(add_data):
        X_test.append(add_data[count])
        XX_test.append(add_data_2[count])
        X_XX_test.append([add_data[count], add_data_2[count]])
        y_test.append(add_labels[count])
        count = count + 1
    vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 1), stop_words='english', strip_accents='unicode', norm='l2')
    X_train = vectorizer.fit_transform(X_XX_train)
    X_test = vectorizer.transform(X_XX_test)
    nb_classifier = MultinomialNB().fit(X_train, y_train)
    #nb_classifier = LinearSVC().fit(X_train, y_train)
    #nb_classifier = LogisticRegression().fit(X_train, y_train)
    y_nb_predicted = nb_classifier.predict(X_test)

It gives me the following error -

    Traceback (most recent call last):
      File "/home/XXX/PycharmProjects/add_classifier/src/my_classifier.py", line 162, in add_classifier_2
        X_train = vectorizer.fit_transform(X_XX_train)
      File "/usr/local/lib/python2.7/dist-packages/sklearn/feature_extraction/text.py", line 1282, in fit_transform
        X = super(TfidfVectorizer, self).fit_transform(raw_documents)
      File "/usr/local/lib/python2.7/dist-packages/sklearn/feature_extraction/text.py", line 817, in fit_transform
        self.fixed_vocabulary_)
      File "/usr/local/lib/python2.7/dist-packages/sklearn/feature_extraction/text.py", line 748, in _count_vocab
        for feature in analyze(doc):
      File "/usr/local/lib/python2.7/dist-packages/sklearn/feature_extraction/text.py", line 234, in <lambda>
        tokenize(preprocess(self.decode(doc))), stop_words)
      File "/usr/local/lib/python2.7/dist-packages/sklearn/feature_extraction/text.py", line 200, in <lambda>
        return lambda x: strip_accents(x.lower())
    AttributeError: 'list' object has no attribute 'lower'

Is my way of including two features (here add_data and add_data_2) wrong? I could not find a suitable example online.

TfidfVectorizer expects a list of strings as input, but X_XX_train is a list of lists of strings (one [text_1, text_2] pair per sample). Depending on whether the two parts of the text should share a vocabulary, you can either concatenate the two strings into a single document, or build a separate TfidfVectorizer for each part and concatenate (hstack) their outputs.
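
Here is a minimal sketch of both options, reusing the names from your question (X_XX_train / X_XX_test hold the [str, str] pairs and y_train the labels); helper names such as train_docs, vec_a and vec_b are only for illustration, and the vectorizer settings just mirror the ones you already use:

    from scipy.sparse import hstack
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB

    # Option 1: join the two fields into one string per sample, so a single
    # vectorizer (and a single shared vocabulary) covers both.
    train_docs = [a + " " + b for a, b in X_XX_train]
    test_docs = [a + " " + b for a, b in X_XX_test]
    vectorizer = TfidfVectorizer(min_df=2, stop_words='english',
                                 strip_accents='unicode', norm='l2')
    X_train = vectorizer.fit_transform(train_docs)
    X_test = vectorizer.transform(test_docs)

    # Option 2: give each field its own vectorizer (separate vocabularies)
    # and stack the two sparse matrices side by side.
    vec_a = TfidfVectorizer(min_df=2, stop_words='english',
                            strip_accents='unicode', norm='l2')
    vec_b = TfidfVectorizer(min_df=2, stop_words='english',
                            strip_accents='unicode', norm='l2')
    X_train = hstack([vec_a.fit_transform([a for a, b in X_XX_train]),
                      vec_b.fit_transform([b for a, b in X_XX_train])])
    X_test = hstack([vec_a.transform([a for a, b in X_XX_test]),
                     vec_b.transform([b for a, b in X_XX_test])])

    # Either way the classifier sees one sparse feature matrix,
    # so the rest of your code is unchanged.
    nb_classifier = MultinomialNB().fit(X_train, y_train)
    y_nb_predicted = nb_classifier.predict(X_test)

With option 2 the two fields keep independent vocabularies and tf-idf weights, which is usually what you want when the fields are different kinds of text.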
