使用 Java 查询大型 JSON 文件



我正在尝试使用java解析下面的JSON文件。 我需要能够

  • 按 ID 或名称或对象中的任何字段搜索文件。
  • 也在字段中搜索空值。

搜索应返回整个对象。 文件将很大,搜索仍然应该具有时间效率。


[
{
"id": 1,
"name": "Mark Robb",
"last_login": "2013-01-21T05:13:41 -11:30",
"email": "markrobb@gmail.com",
"phone": "12345",
"locations": [
"Germany",
"Austria"
]
},
{
"id": 2,
"name": "Matt Nish",
"last_login": "2014-02-21T07:10:41 -11:30",
"email": "mattnish@gmail.com",
"phone": "456123",
"locations": [
"France",
"Italy"
]
}
]

这是我迄今为止使用 Jackson 库所做的尝试。

/**
 * Prints the name of every customer whose id equals {@code id}.
 *
 * <p>The whole JSON array is deserialized into a list first and then scanned
 * from start to end, so each call re-reads the file.
 *
 * @param id the customer id to look for
 */
public void findById(int id) {
    final File source = new File("src/main/resources/customers.json");
    final TypeReference<List<Customer>> customerListType = new TypeReference<List<Customer>>() {};
    // Materialize every record in memory before searching.
    final List<Customer> customers = objectMapper.readValue(source, customerListType);
    for (final Customer customer : customers) {
        if (customer.getId() == id) {
            System.out.println(customer.getName());
        }
    }
}

我只是不认为这对于一个巨大的 JSON 文件(一个文件中大约有 20000 个客户)是一种高效的方法,而且可能有多个文件。搜索时间不应线性增加。我怎样才能提高时间效率?我应该使用其他库吗?

最高效(就 CPU 和内存而言)的解析方法是使用面向流的解析,而不是对象映射。这通常需要多写一些代码,但通常也很划算 :) Gson 和 Jackson 都支持这种轻量级技术。此外,应避免在主路径/热路径中分配内存,以防止 GC 暂停。为了说明这个想法,我使用了一个小型的无 GC 库 https://github.com/anatolygudkov/green-jelly:

import org.green.jelly.*;    
import java.io.CharArrayReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
public class SelectById {
public static class Customer {
private long id;
private String name;
private String email;
public void clear() {
id = 0;
name = null;
email = null;
}
public Customer makeCopy() {
Customer result = new Customer();
result.id = id;
result.name = name;
result.email = email;
return result;
}
@Override
public String toString() {
return "Customer{" +
"id=" + id +
", name='" + name + ''' +
", email='" + email + ''' +
'}';
}
}
public static void main(String[] args) throws Exception {
final String file = "n" +
"[n" +
"  {n" +
"    "id": 1,n" +
"    "name": "Mark Robb",n" +
"    "last_login": "2013-01-21T05:13:41 -11:30",n" +
"    "email": "markrobb@gmail.com",n" +
"    "phone": "12345",n" +
"    "locations": [n" +
"        "Germany",n" +
"        "Austria"n" +
"    ]n" +
"},n" +
"  {n" +
"    "id": 2,n" +
"    "name": "Matt Nish",n" +
"    "last_login": "2014-02-21T07:10:41 -11:30",n" +
"    "email": "mattnish@gmail.com",n" +
"    "phone": "456123",n" +
"    "locations": [n" +
"        "France",n" +
"        "Italy"n" +
"    ]n" +
" }n" +
"]n";
final List<Customer> selection = new ArrayList<>();
final long selectionId = 2;
final JsonParser parser = new JsonParser().setListener(
new JsonParserListenerAdaptor() {
private final Customer customer = new Customer();
private String currentField;
@Override
public boolean onObjectStarted() {
customer.clear();
return true;
}
@Override
public boolean onObjectMember(final CharSequence name) {
currentField = name.toString();
return true;
}
@Override
public boolean onStringValue(final CharSequence data) {
switch (currentField) {
case "name":
customer.name = data.toString();
break;
case "email":
customer.email = data.toString();
break;
}
return true;
}
@Override
public boolean onNumberValue(final JsonNumber number) {
if ("id".equals(currentField)) {
customer.id = number.mantissa();
}
return true;
}
@Override
public boolean onObjectEnded() {
if (customer.id == selectionId) {
selection.add(customer.makeCopy());
return false; // we don't need to continue
}
return true;
}
}
);
// now let's read and parse the data with a buffer
final CharArrayCharSequence buffer = new CharArrayCharSequence(1024);
try (final Reader reader = new CharArrayReader(file.toCharArray())) { // replace by FileReader, for example
int len;
while((len = reader.read(buffer.getChars())) != -1) {
buffer.setLength(len);
parser.parse(buffer);
}
}
parser.eoj();
System.out.println(selection);
}
}

在我们无法直接使用 SIMD 指令的前提下,它应该是 Java 中能做到的最快方式。要消除主路径中的内存分配(以及 GC 暂停),您必须将 ".toString()"(它会创建 String 的新实例)替换为可重用的对象,例如 StringBuilder。

可能影响整体性能的最后一件事是文件读取方法。RandomAccessFile是我们在Java中拥有的最佳选择之一。由于您的编码似乎是 ASCII,因此只需将字节转换为字符即可传递给 JsonParser。

Jackson 应该可以做到这一点。诀窍是使用 JsonParser 流式解析顶级数组,然后使用 ObjectMapper.readValue() 解析每条记录。

// Walk the top-level JSON array token by token and bind one record at a time,
// so only a single Customer is held in memory at any moment.
ObjectMapper objectMapper = new ObjectMapper();
File file = new File("customers.json");
try (JsonParser tokens = objectMapper.getFactory().createParser(file)) {
    // The document is expected to open with '['.
    JsonToken opening = tokens.nextToken();
    if (opening != JsonToken.START_ARRAY) {
        throw new RuntimeException("Expected top-level array in JSON.");
    }
    // Each iteration advances to the next element until the closing ']' is reached.
    for (JsonToken next = tokens.nextToken(); next != JsonToken.END_ARRAY; next = tokens.nextToken()) {
        Customer record = objectMapper.readValue(tokens, Customer.class);
        // Handle the freshly parsed record here.
        System.out.println(record.id + ": " + record.name);
    }
}
/**
 * Record type bound from each element of the JSON array.
 *
 * <p>Only the three fields below are declared; the annotation tells the mapper
 * to ignore any other properties present in the JSON.
 */
@JsonIgnoreProperties(ignoreUnknown = true)
public static class Customer
{
// NOTE(review): declared as String although the sample JSON carries a numeric id;
// presumably relies on the mapper coercing the number to text — confirm.
public String id;
public String name;
public String email;
}

就时间效率而言,它仍然需要扫描整个文件——如果没有索引,或者并行解析之类更高级的手段,你无能为力。但它比将整个 JSON 读入内存更节省内存——此代码一次只加载一个 Customer 对象。


另外:

if(customer.getId() == id) {

使用.equals()比较字符串,而不是==

if (customer.getId().equals(id)) {

你可以试试 Gson 库。此库实现了一个TypeAdapter类,该类通过流式序列化和反序列化将 Java 对象与 JSON 相互转换。

该 API 高效且灵活,尤其是对于大型文件。下面是一个示例:

/**
 * Loads the customer list from a JSON file with Gson and prints the names of
 * the customers whose id matches the requested one.
 *
 * <p>Note that {@code fromJson} into a {@code List} materializes every record
 * in memory before the filter runs.
 */
public class GsonStream {

    /**
     * @param args optional: {@code args[0]} is the customer id to search for
     *             (defaults to 1 when absent)
     */
    public static void main(String[] args) {
        // The original snippet referenced an undeclared 'id'; take it from the command line.
        final int id = args.length > 0 ? Integer.parseInt(args[0]) : 1;

        Gson gson = new Gson();
        try (Reader reader = new FileReader("src/main/resources/customers.json")) {
            Type listType = new TypeToken<List<Customer>>() {}.getType();

            // Convert JSON File to Java Object
            List<Customer> customers = gson.fromJson(reader, listType);
            if (customers == null) {
                // Gson yields null for an empty document; nothing to search.
                return;
            }

            // Mapping to getName() produces strings, so the result is a List<String>.
            List<String> names = customers
                    .stream()
                    .filter(c -> c.getId() == id)
                    .map(Customer::getName)
                    .collect(Collectors.toList());
            System.out.println(names);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

如果你想了解如何继承并重写 TypeAdapter 抽象类,这里有一个示例:

/**
 * Shows how to register a custom {@code TypeAdapter} for {@code Customer} and
 * use it to read a customer from a file and write it back out as JSON.
 */
public class GsonTypeAdapter {

    /**
     * Reads one customer through the registered adapter, prints it, then
     * prints its pretty-printed JSON form.
     *
     * @param args unused
     */
    public static void main(String args[]) {
        GsonBuilder builder = new GsonBuilder();
        builder.registerTypeAdapter(Customer.class, new customerAdapter());
        builder.setPrettyPrinting();
        Gson gson = builder.create();

        // try-with-resources closes the reader (and the underlying file) on every path.
        try (JsonReader reader = new JsonReader(new FileReader("src/main/resources/customers.json"))) {
            // NOTE(review): the adapter reads a single JSON object; if the file holds a
            // top-level array, deserialize Customer[].class instead — confirm file shape.
            Customer customer = gson.fromJson(reader, Customer.class);
            System.out.println(customer);

            String jsonString = gson.toJson(customer);
            System.out.println(jsonString);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Streaming Gson adapter that maps the {@code name} and {@code id} members of
 * a JSON object to a {@link Customer} and back.
 */
class customerAdapter extends TypeAdapter<Customer> {

    /**
     * Reads one customer from the stream.
     *
     * @param reader positioned at the start of a JSON object or a JSON null
     * @return the parsed customer, or {@code null} when the value is JSON null
     * @throws IOException if the underlying stream fails
     */
    @Override
    public Customer read(JsonReader reader) throws IOException {
        if (reader.peek() == JsonToken.NULL) {
            reader.nextNull();
            return null;
        }

        Customer customer = new Customer();
        reader.beginObject();
        while (reader.hasNext()) {
            String fieldName = reader.nextName();

            if (reader.peek() == JsonToken.NULL) {
                // A null member leaves the corresponding field at its default value.
                reader.nextNull();
                continue;
            }

            switch (fieldName) {
                case "name":
                    customer.setName(reader.nextString());
                    break;
                case "id":
                    customer.setId(reader.nextInt());
                    break;
                default:
                    // Every value must be consumed; otherwise hasNext() stays true
                    // and the loop never advances past an unknown member.
                    reader.skipValue();
                    break;
            }
        }
        reader.endObject();
        return customer;
    }

    /**
     * Writes the customer as a JSON object with {@code name} and {@code id} members.
     *
     * @param writer   destination stream
     * @param customer the customer to serialize; {@code null} is written as JSON null
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void write(JsonWriter writer, Customer customer) throws IOException {
        if (customer == null) {
            writer.nullValue();
            return;
        }
        writer.beginObject();
        writer.name("name");
        writer.value(customer.getName());
        writer.name("id");
        writer.value(customer.getId());
        writer.endObject();
    }
}
/**
 * Simple mutable bean holding a customer's numeric id and display name.
 */
class Customer {
    private int id;
    private String name;

    /** @return the customer id (0 until set) */
    public int getId() {
        return this.id;
    }

    /** @param id the customer id to store */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the customer name, or {@code null} if none was set */
    public String getName() {
        return this.name;
    }

    /** @param name the customer name to store */
    public void setName(String name) {
        this.name = name;
    }

    /** @return a bracketed summary listing the name followed by the id */
    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder("Customer[ name = ");
        text.append(this.name);
        text.append(", id: ");
        text.append(this.id);
        text.append("]");
        return text.toString();
    }
}

最新更新