from transformers import pipeline

# Summarization pipeline; the default checkpoint has a 1024-token input limit.
summarizer = pipeline('summarization')

summaries = []
if chunks:  # `chunks` is assumed to be a list of text strings defined earlier — TODO confirm
    for chunk in chunks:
        if chunk.strip():  # skip empty / whitespace-only chunks
            # truncation=True clips any input that exceeds the model's maximum
            # sequence length, preventing the "Token indices sequence length is
            # longer than the specified maximum ..." indexing error on long chunks.
            summary = summarizer(
                chunk,
                max_length=100,   # upper bound on the *generated summary* length (tokens)
                min_length=30,    # lower bound on the generated summary length (tokens)
                do_sample=False,  # deterministic decoding
                truncation=True,
            )[0]['summary_text']
            summaries.append(summary)
如果我运行这个代码,我得到这个错误:
Token indices sequence length is longer than the specified maximum sequence length for this model (10020 > 1024). Running this sequence through the model will result in indexing errors
我改变了max_length和min_length的值但是我得到了相同的错误
错误是因为您的块长度为 10020 个 token,远远超过了该模型 1024 的最大序列长度。
这里有两个建议:
- 将每段输入的长度限制在 512 个 token 以内,或该 Transformer 模型能够容纳的任何长度(您需要查阅 Hugging Face 文档)。
- 我在下面发布了一个代码片段,它将每个 chunk 拆分成若干个 chunk_parts,每个部分的最大长度为 512。
max_chunk_length = 512  # maximum size of each sub-chunk (characters, see note below)

if chunks:  # `chunks` is assumed to be a list of text strings defined earlier — TODO confirm
    for chunk in chunks:
        if chunk.strip():  # skip empty / whitespace-only chunks
            # Split the chunk into smaller pieces so each stays under the
            # model's input limit. NOTE(review): this slices by *characters*,
            # not tokens — 512 characters is a conservative proxy for the
            # token limit; for exact splits use the model's tokenizer.
            chunk_parts = [
                chunk[i:i + max_chunk_length]
                for i in range(0, len(chunk), max_chunk_length)
            ]
            # Summarize each piece separately and collect the results.
            for part in chunk_parts:
                # max_length bounds the *generated summary*, not the input.
                # Passing max_chunk_length (512) here asked for summaries
                # longer than most inputs; use a sensible summary cap instead.
                summary = summarizer(
                    part,
                    max_length=100,
                    min_length=30,
                    do_sample=False,
                )[0]['summary_text']
                summaries.append(summary)