我需要帮助。我必须编写一个分词(tokenizer)程序:加载一个文本文件并将其拆分为多个标记(token),同时还需要显示每个单词在文本文件中的起始位置、结束位置以及单词长度。非常感谢任何帮助。过去 3 天我一直在尝试实现这一点,但都没有成功,以下是我目前写的代码:
import java.util.StringTokenizer;
import java.io.*;
/**
 * Splits every line of a text file into space-separated tokens and reports, for each token,
 * its zero-based start index within its line and its length.
 *
 * <p>Each report line is printed to standard output and also written to the result file.
 */
public class Tokenizer1 {

    /**
     * Reads {@code C://text.txt} line by line and reports every token's position and length.
     *
     * @param args unused
     * @throws FileNotFoundException if the input file cannot be opened
     * @throws IOException if the input cannot be read, or the result file cannot be opened
     *     or written
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {
        // try-with-resources closes both files even when reading fails half-way.
        try (BufferedReader br = new BufferedReader(new FileReader("C://text.txt"));
             PrintWriter pw = new PrintWriter(new FileWriter("C://result.txt"))) {
            String line;
            // readLine() returns null at end of file, so an empty file simply produces no output.
            while ((line = br.readLine()) != null) {
                StringTokenizer st = new StringTokenizer(line, " ");
                int searchFrom = 0;
                while (st.hasMoreTokens()) {
                    String token = st.nextToken();
                    // Everything between searchFrom and the token is delimiters, so the first
                    // match at or after searchFrom is exactly where this token starts. Looking
                    // the position up (instead of adding lengths) stays correct when several
                    // delimiters appear in a row.
                    int tokenStartIndex = line.indexOf(token, searchFrom);
                    String report = "token: " + token
                            + ", tokenStartIndex: " + tokenStartIndex
                            + ", tokenLength: " + token.length();
                    System.out.println(report);
                    pw.println(report);
                    searchFrom = tokenStartIndex + token.length();
                }
            }
            // PrintWriter swallows write errors; surface them explicitly.
            if (pw.checkError()) {
                throw new IOException("Failed to write C://result.txt");
            }
        }
    }
}
如果您不需要逐行处理文件,可以试试这个:
/**
 * Reads the whole of {@code C:/text.txt} into memory, echoes it, and then lists every token
 * together with its zero-based start offset in the text and its length. The listing is
 * produced twice: once with {@link String#split(String)} and once with {@link StringTokenizer}.
 *
 * <p>The file is decoded with the platform default charset (what {@link FileReader} uses).
 *
 * @param args unused
 * @throws FileNotFoundException if the input file cannot be opened
 * @throws IOException if reading the input file fails
 */
public static void main(String[] args) throws FileNotFoundException, IOException {
    StringBuilder sb = new StringBuilder();
    // Read characters (not raw bytes) so multi-byte text is decoded properly, and let
    // try-with-resources close the file even if reading fails.
    try (Reader reader = new BufferedReader(new FileReader("C:/text.txt"))) {
        char[] buffer = new char[4096];
        int read;
        while ((read = reader.read(buffer)) != -1) {
            sb.append(buffer, 0, read);
        }
    }
    String text = sb.toString();
    System.out.println(text);
    System.out.println("---------------------");

    // OPTION 1: using String.split method
    int searchFrom = 0;
    for (String t : text.split("[\\s,]+")) {
        if (t.isEmpty()) {
            // split yields a leading empty string when the text starts with a delimiter.
            continue;
        }
        // Only delimiters lie between searchFrom and the token, so the first match is the
        // token's true offset; summing token lengths would ignore the delimiters.
        int start = text.indexOf(t, searchFrom);
        System.out.println("START: " + start + "\tLENGTH: " + t.length() + "\tWORD: " + t);
        searchFrom = start + t.length();
    }

    // OPTION 2: using StringTokenizer class (same delimiters as the regex above)
    searchFrom = 0;
    StringTokenizer st = new StringTokenizer(text, " ,\t\n\f\r");
    while (st.hasMoreTokens()) {
        String next = st.nextToken();
        int start = text.indexOf(next, searchFrom);
        System.out.println("START: " + start + "\tLENGTH: " + next.length() + "\tWORD: " + next);
        searchFrom = start + next.length();
    }
}
如果你需要逐行处理文件,可以试试这个:
/**
 * Reads {@code C:/text.txt} line by line and, for each line, lists every token together
 * with its zero-based start offset within that line and its length.
 *
 * <p>Line numbers are zero-based. Tokens are separated by spaces, commas, tabs, form feeds
 * and line terminators.
 *
 * @param args unused
 * @throws FileNotFoundException if the input file cannot be opened
 * @throws IOException if reading the input file fails
 */
public static void main(String[] args) throws FileNotFoundException, IOException {
    // try-with-resources closes the reader even when reading fails half-way.
    try (BufferedReader br = new BufferedReader(new FileReader("C:/text.txt"))) {
        String line;
        int lineNumber = -1;
        while ((line = br.readLine()) != null) {
            ++lineNumber;
            System.out.println("\nLINE: " + lineNumber);

            // Tokenize the current line only; tokenizing an accumulated buffer would repeat
            // every earlier line's tokens and glue adjacent lines together.
            StringTokenizer st = new StringTokenizer(line, " ,\t\n\f\r");
            int searchFrom = 0;
            while (st.hasMoreTokens()) {
                String next = st.nextToken();
                // Only delimiters lie between searchFrom and the token, so the first match
                // is the token's true offset; summing token lengths would skip delimiters.
                int elementPosition = line.indexOf(next, searchFrom);
                System.out.println("\tSTART: " + elementPosition
                        + "\tLENGTH: " + next.length() + "\tWORD: " + next);
                searchFrom = elementPosition + next.length();
            }
        }
    }
}