JSOUP登录,然后在HTML字符串中解析和查找价格



我基本上已经把程序弄明白了,现在只需要从下面这段HTML字符串中提取价格:<li class=""><b class="">Your Price:</b> $23.51&nbsp; &nbsp; <b class="">You Save:</b> $11.48</li>。我只需要其中的"23.51",也就是"Your Price:"标签后面的那个价格。

我更新了代码以反映我目前正在使用的内容。我不确定下一步会是什么。

我不知道应该在哪里添加解析逻辑。我需要程序从电子表格中读取URL,然后把价格数据写入相应的列(PRICE_COLUMN)。

这是我的代码,感谢Krystian。

public class Scraper {

private static final int URL_COLUMN = 2; // Column C
private static final int SKU_COLUMN = 3; // Column D
private static final int SALE_COLUMN = 4;// Column E
private static final int PRICE_COLUMN = 5; //Column F
public static void main(final String[] args) throws Exception {
Workbook originalWorkbook = Workbook.getWorkbook(new File("C:/Users/MSI/Desktop/original.xls"));
WritableWorkbook workbook = Workbook.createWorkbook(new File("C:/Users/MSI/Desktop/updated.xls"), originalWorkbook);
originalWorkbook.close();
WritableSheet sheet = workbook.getSheet(0);
int currentRow = 1;
Cell cell;
while (!(cell = sheet.getCell(URL_COLUMN, currentRow)).getType().equals(CellType.EMPTY)) {
String url = cell.getContents();
System.out.println("Parsing URL: " + url);
String SKU = parseUrlWithJsoupAndGetProductSKU(url);
String price = parseUrlWithJsoupAndGetProductPrice(url);
String sale= parseUrlWithJsoupAndGetSale(url);
System.out.println("SKU: " + SKU);
System.out.println("Regular price: " + price);
System.out.println("Sale price: " + sale);
Label cellWithSKU = new Label(SKU_COLUMN, currentRow, SKU);
sheet.addCell(cellWithSKU);
Label cellWithSale = new Label(SALE_COLUMN, currentRow, sale);
sheet.addCell(cellWithSale);
Label cellWithPrice = new Label(PRICE_COLUMN, currentRow, price);
sheet.addCell(cellWithPrice);
currentRow++;
}
workbook.write();
workbook.close();
}
private static String parseUrlWithJsoupAndGetProductSKU(String url) throws IOException {
try {
Document doc = Jsoup.connect(url).userAgent("Mozilla").get();
return doc.select("#product_id_num").text();
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
private static String parseUrlWithJsoupAndGetSale(String url) throws IOException {
try {
Document doc = Jsoup.connect(url).userAgent("Mozilla").get();
return doc.select("#NA").text();
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
private static String parseUrlWithJsoupAndGetProductPrice(String url) throws IOException {
try {
Document doc = Jsoup.connect(url).userAgent("Mozilla").get();
System.out.println(getPrice(doc));
return doc.select("#price").text();
private static String price(Document doc) {
// select all <li> inside <ul class="small">
Elements liElements = doc.select("ul.small>li");
for (Element li : liElements) {
// find "List Price:"
if (li.text().contains("List Price:")) {
// remove <b> with contents
li.select("b").remove();
// there's only price left in <li>
return li.text();
}
}
return "not found";
}
} catch (IOException e) {
e.printStackTrace();
}
return null;
}

注释中的解释:

public static void main(final String[] args) throws IOException {
Document doc = Jsoup
.parse("<ul class="small">    <li class=""><b class="">Description:</b> . (Import)</li> <li class=""><b class="">List Price:</b> $123</li> <li class=""><b class="">Your Price:</b> $***&nbsp; &nbsp; <b class="">You Save:</b> $***</li> <li class="stockStatus"><b class="">Stock Status:</b> 1</li> <h3 class="additionalInfo"><span class="">Additional Information</span></h3> <li class="additional">Uses extension part# ***</li> <li class="additional">Replaces GM# ***</li> </ul> ");
System.out.println(getPrice(doc));
}
/**
 * Looks through the {@code <li>} children of {@code <ul class="small">} for the first one
 * whose text contains "List Price:" and returns what is left of its text once its
 * {@code <b>} elements have been removed from the document.
 *
 * @param doc parsed document to search; the matching item's {@code <b>} elements are removed from it
 * @return the remaining text of the matching item, or {@code "not found"} if none matches
 */
private static String getPrice(Document doc) {
    final Elements items = doc.select("ul.small>li");
    for (int index = 0; index < items.size(); index++) {
        final Element item = items.get(index);
        // Skip every entry that is not the list-price line.
        if (!item.text().contains("List Price:")) {
            continue;
        }
        // Drop the bold label so only the price text remains in the item.
        item.select("b").remove();
        return item.text();
    }
    return "not found";
}

最新更新