我一直在修改下面这段逻辑:如果提取出的列值 l[5] 是一个数字形式的占位符(形如 _58982),就用 CSV 文件中下一列的值 l[6] 来替换它,这样最终输出应为:LEARNING_MAT。
下面是示例代码,但它没有按预期工作。
# Build one Sample per row of self.lines, skipping the first entry
# (presumably the CSV header -- confirm against the caller).
#
# Column 5 normally carries the intent name.  When it holds only a numeric
# placeholder such as "_58982", the name is taken from column 6 instead.
#
# The pattern uses \d (a digit); the previous "[d]" was a character class
# containing the literal letter "d", so the placeholder branch never fired.
# fullmatch is used so that real names that merely start with "_<digit>"
# are not mistaken for placeholders.
numeric_placeholder = re.compile(r'_\d+')

for line_no, row in enumerate(self.lines[1:], start=1):
    # line_no is the row's index within self.lines.
    try:
        if numeric_placeholder.fullmatch(row[5]):
            print("Processing Line No : {}".format(line_no))
            intent_source = row[6]
        else:
            intent_source = row[5]

        # ':' and '-' are normalised to '_' in the intent name.
        s = Sample(verified=True,
                   count=row[1],
                   intent=intent_source.replace(':', '_').replace('-', '_'),
                   token_tree=TokenTree(row[8]),
                   protected=self.protected)
    except Exception as ex:
        # Any failure while reading the row (short row, bad token tree, ...)
        # is reported as a malformed sample.  The row is wrapped in a
        # one-element tuple so that %-formatting also works when the row
        # itself is a tuple, and the original error is kept as the cause.
        raise ValueError("Bad sample definition: %s (check tabs)" % (row,)) from ex
    else:
        print("Processing 5 No : {}".format(line_no))
        ret.append(s)
可以使用下面的代码来实现这一点:
# Handles a row whose column 5 is only a numeric placeholder such as
# "_58982": the intent name is derived from column 6 after cleaning it up.
#
# The pattern uses \d (a digit); the previous "[d]" matched the literal
# letter "d", so this branch could never be taken.
if re.fullmatch(r'_\d+', l[5]):
    # unidecode is expected to transliterate accented text to plain ASCII
    # (third-party helper -- confirm it is imported by this module).
    unaccented_string = unidecode(l[6])

    # Drop every character whose category code starts with C, S or P
    # (control characters, symbols and punctuation, assuming `category`
    # is unicodedata.category).
    res = "".join(char for char in unaccented_string
                  if category(char)[0] not in ("C", "S", "P"))

    # Collapse runs of whitespace into single spaces and trim the ends.
    res = " ".join(res.split())

    # Prefix names that would otherwise start with a digit.
    append_str = "X_"
    if res[:1].isdigit():
        res = append_str + res

    s = Sample(verified=True,
               count=l[1],
               intent=res.replace(':', '_').replace('-', '_').upper().replace(" ", "_"),
               token_tree=TokenTree(l[8]),
               protected=self.protected)
    # Appended inside the branch: `s` only exists when the placeholder
    # matched.  NOTE(review): if this fragment is merged into a loop that
    # also builds `s` for non-matching rows, move the append after both
    # branches.
    ret.append(s)