创建标记化程序



我需要帮助。我必须写一个标记化程序。我加载一个文本文件并将其拆分为多个标记,但我还需要显示每个单词(在文本文件中)的起始位置、结束位置以及单词长度。我将非常感谢你的任何帮助。在过去的 3 天里,我一直在尝试这样做,但没有成功,以下是我所做的:

import java.util.StringTokenizer;
import java.io.*;
/**
 * Splits a text file into space-separated tokens and reports, for every token,
 * where it starts in its line and how long it is.
 */
public class Tokenizer1 {

    /**
     * Reads {@code C://text.txt} line by line and reports each space-separated token
     * together with its zero-based start index within the line and its length.
     *
     * <p>Every report line is printed to standard output and also written to
     * {@code C://result.txt}.
     *
     * @param args command-line arguments (unused)
     * @throws FileNotFoundException if the input file cannot be opened
     * @throws IOException if reading the input or writing the result fails
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {
        // try-with-resources closes (and therefore flushes) both files, even on failure.
        try (BufferedReader br = new BufferedReader(new FileReader("C://text.txt"));
             PrintWriter pw = new PrintWriter(new FileWriter("C://result.txt"))) {
            String line;
            int lineNumber = 0;
            // readLine() returns null at end of file, so an empty file is simply skipped.
            while ((line = br.readLine()) != null) {
                lineNumber++;
                StringTokenizer st = new StringTokenizer(line, " ");
                int searchFrom = 0;
                while (st.hasMoreTokens()) {
                    String token = st.nextToken();
                    // Locate the token in the line instead of summing token lengths:
                    // a running sum drifts as soon as two delimiters are adjacent.
                    int tokenStartIndex = line.indexOf(token, searchFrom);
                    String report = "line: " + lineNumber
                            + ", token: " + token
                            + ", tokenStartIndex: " + tokenStartIndex
                            + ", tokenLength: " + token.length();
                    System.out.println(report);
                    pw.println(report);
                    // Continue after this token so repeated words get distinct positions.
                    searchFrom = tokenStartIndex + token.length();
                }
            }
        }
    }
}

如果您不需要逐行处理文件,可以试试这个:

/**
 * Reads the whole of {@code C:/text.txt} into memory, echoes it, and then prints the
 * start offset, length and text of every token twice: once using
 * {@link String#split(String)} and once using {@link StringTokenizer}.
 *
 * <p>Tokens are separated by runs of whitespace and/or commas. Offsets are zero-based
 * character positions within the text that was read.
 *
 * @param args command-line arguments (unused)
 * @throws FileNotFoundException if {@code C:/text.txt} cannot be opened
 * @throws IOException if reading the file fails
 */
public static void main(String[] args) throws FileNotFoundException, IOException {
    StringBuilder sb = new StringBuilder();
    // A Reader decodes bytes into characters; casting the raw bytes of an InputStream
    // to char garbles multi-byte encoded text. try-with-resources closes the file even
    // if reading throws.
    try (Reader in = new BufferedReader(new FileReader("C:/text.txt"))) {
        int c;
        while ((c = in.read()) != -1) {
            sb.append((char) c);
        }
    }
    String text = sb.toString();
    System.out.println(text);
    System.out.println("---------------------");

    // OPTION 1: using String.split method
    // The backslash must be doubled so that the regex engine receives \s (whitespace).
    int searchFrom = 0;
    for (String t : text.split("[\\s,]+")) {
        if (t.isEmpty()) {
            // split() yields a leading empty string when the text starts with a delimiter.
            continue;
        }
        // Look the token up in the text: summing token lengths ignores the delimiters
        // in between and reports wrong offsets for every token after the first.
        int start = text.indexOf(t, searchFrom);
        System.out.println("START: " + start + "\tLENGTH: " + t.length() + "\tWORD: " + t);
        searchFrom = start + t.length();
    }

    // OPTION 2: using StringTokenizer class
    // Delimiters: comma, space, tab, newline, carriage return, form feed. They must be
    // written as escapes; a literal ",tnfr" would split on the letters t, n, f and r.
    searchFrom = 0;
    StringTokenizer st = new StringTokenizer(text, ", \t\n\r\f");
    while (st.hasMoreTokens()) {
        String next = st.nextToken();
        int start = text.indexOf(next, searchFrom);
        System.out.println("START: " + start + "\tLENGTH: " + next.length() + "\tWORD: " + next);
        searchFrom = start + next.length();
    }
}

如果你需要逐行处理文件,你可能想试试这个:

/**
 * Reads {@code C:/text.txt} line by line and, for each line, prints a {@code LINE}
 * header followed by the start offset, length and text of every token on that line.
 *
 * <p>Line numbers are zero-based. Tokens are separated by commas and whitespace, and
 * offsets are zero-based character positions within the current line.
 *
 * @param args command-line arguments (unused)
 * @throws FileNotFoundException if {@code C:/text.txt} cannot be opened
 * @throws IOException if reading the file fails
 */
public static void main(String[] args) throws FileNotFoundException, IOException {
    // try-with-resources closes the reader even if readLine() throws.
    try (BufferedReader br = new BufferedReader(new FileReader("C:/text.txt"))) {
        String line;
        int lineNumber = -1;
        while ((line = br.readLine()) != null) {
            ++lineNumber;
            System.out.println("\nLINE: " + lineNumber);

            // Tokenize only the current line. Tokenizing an accumulated buffer of all
            // lines read so far would print the earlier lines again on every iteration.
            // Alternative: line.split("[\\s,]+") produces the same tokens, plus a leading
            // empty string when the line starts with a delimiter.
            StringTokenizer st = new StringTokenizer(line, ", \t\n\r\f");
            int searchFrom = 0;
            while (st.hasMoreTokens()) {
                String next = st.nextToken();
                // Find the real offset; summing token lengths would ignore delimiters.
                int elementPosition = line.indexOf(next, searchFrom);
                System.out.println("\tSTART: " + elementPosition
                        + "\tLENGTH: " + next.length()
                        + "\tWORD: " + next);
                searchFrom = elementPosition + next.length();
            }
        }
    }
}

相关内容

  • 没有找到相关文章

最新更新