用 Java 读取网页源代码



我正在尝试读取一个网页的源代码。我的 Java 代码如下:

import java.net.*;
import java.io.*;
import java.util.*;
import javax.swing.JOptionPane;
/**
 * Minimal example that downloads a web page and echoes it to standard output.
 */
class Testing{

 /**
  * Opens http://excite.com/education with a browser-like User-Agent header and
  * prints the response body line by line, followed by "End of page.".
  *
  * The response stream is read exactly as received: no Content-Encoding
  * handling is performed here, so a compressed response is printed in its
  * compressed form.
  *
  * @throws Exception if the URL is malformed or the connection/read fails
  */
 public static void Connect() throws Exception{

  URL url = new URL("http://excite.com/education");
  URLConnection spoof = url.openConnection();

  spoof.setRequestProperty( "User-Agent", "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0; H010818)" );

  // try-with-resources closes the reader (and the underlying connection
  // stream) even when readLine() fails part-way through the page.
  try (BufferedReader in = new BufferedReader(new InputStreamReader(spoof.getInputStream()))){
   String strLine;
   while ((strLine = in.readLine()) != null){
    System.out.println(strLine);
   }
  }
  System.out.println("End of page.");
 }

 /**
  * Entry point: runs {@link #Connect()} and reports any failure on standard
  * error instead of discarding it.
  *
  * @param args unused
  */
 public static void main(String[] args){
  try{
   Connect();
  }catch(Exception e){
   // Previously swallowed; surface the failure so network errors are visible.
   e.printStackTrace();
  }
 }
}

当我编译并运行此代码时,它给出了以下输出:

I�%&/m�{J�J��t� $ؐ@ iG#( * eVe]f@ 흼 { { ; N' ?\fdl J ɞ! ?~|?"~ $}> 4 7N + UƳm N ?J tZfM G j R ! 9 ?>JgE Ge[ ⳏ W ? 8|8ho 0׳ |փ:-- | L Uο ׫ m zt n3 l\ w O^f G[ CG@yC}�p�y���lAH�ͯ��zF#�V�6_��}��)�v=J+�$��̤�G�Y�L�b���wS"�7�y^����Z�m���Y:ɛ���J<N_�Y=���U�f���,���y�Q2(J٩P!ͨ�i����1&F0&ૼn�?�x�T��h�Qzw�+����n�)�h��K��2����8g����⮥��A0 ���1I�%����Q�Z����{��������w���?x����N�?�<d�S��۫�%a|4�j��z���k�Bak��k-�c�z�g��z���l="> ֎s^, 5/B { ]] Ý ֳ y{ _l 8g k ʫ b "+| ( M ^[ J�P��_�..?������x�Z�$ E> 느 u E~ {媘 f e1۷ QZ, f e 3Jٻb ^ 4 ۴> y ; n[ �,t�?����~�n�S�u#SL��n�^��������EC��q�/�y���FE�tpm������e&��oB���z9eY��������P��IK?����̦����w�N��;�;J?����;�/��5���M���rZ��q��]��C�dᖣ��F�nd���}���A5���M�5�.�:��/�_D�?�3����'�c�Z7��}��(OI),ۏi����{�<�w�������DZ?e����'q���eY]=���kj���߬������qhrRn���l�o-��.���k��_���oD8��GA�P�r��|$��ȈPv~Y�:�[q?�sH�� <��C��ˬ�^N�[ v(��S��l�c�C����3���E5&5�VӪL�T��۔���oQrĈ��/���#[f�5�5"� [ t vm \ .0 nh aڌWYM^T |\, 퓜 L u B ̌ C r ' % { ( (; fV ] g,> C c2 p 4 }H P ( %j" } & : Oh\5I l 氪 {/] LB l 2 I" = Y |> ֏n } ~ [ ' O:/( Wz 3 lo .5 k & H[ji b WWy} 5 ֝Q |f ] KjH5 }yNm g ʷ ǣ> 'o 泏

任何人请帮忙。

提前谢谢。

下面的代码对我有效。

/**
 * Downloads the resource at the given URL and returns its body as a single string.
 *
 * If the response carries a {@code Content-Encoding: gzip} header (compared
 * case-insensitively), the body is decompressed with {@link GZIPInputStream}
 * before being decoded. Text is decoded with the platform default charset.
 * Lines are appended back to back, so line terminators are not preserved in
 * the returned string.
 *
 * @param sURL absolute URL of the page to fetch
 * @return the response body with all lines concatenated
 * @throws IOException if the URL is malformed, or connecting/reading fails
 */
private static String getWebPabeSource(String sURL) throws IOException {
        URL url = new URL(sURL);
        URLConnection urlCon = url.openConnection();

        // Look the header up once; a null header (absent) simply means "not gzipped".
        boolean gzipped = "gzip".equalsIgnoreCase(urlCon.getHeaderField("Content-Encoding"));

        StringBuilder sb = new StringBuilder();
        // try-with-resources guarantees the stream is closed even if readLine() throws.
        try (BufferedReader in = new BufferedReader(new InputStreamReader(
                gzipped ? new GZIPInputStream(urlCon.getInputStream())
                        : urlCon.getInputStream()))) {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                sb.append(inputLine);
            }
        }
        return sb.toString();
}

试着这样读:

/**
 * Downloads the resource at the given URL and returns its body as a single
 * string, decoding the bytes as UTF-8.
 *
 * Lines are appended back to back, so line terminators are not preserved in
 * the returned string. No Content-Encoding handling is performed.
 *
 * @param url absolute URL of the page to fetch
 * @return the response body with all lines concatenated
 * @throws IOException if the URL is malformed, or connecting/reading fails
 */
private static String getUrlSource(String url) throws IOException {
        // Distinct local name: redeclaring "url" here would clash with the parameter.
        URL pageUrl = new URL(url);
        URLConnection urlConn = pageUrl.openConnection();
        StringBuilder a = new StringBuilder();
        // try-with-resources guarantees the stream is closed even if readLine() throws.
        try (BufferedReader in = new BufferedReader(new InputStreamReader(
                urlConn.getInputStream(), "UTF-8"))) {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                a.append(inputLine);
            }
        }
        return a.toString();
}

并根据网页设置编码 - 请注意以下行:

BufferedReader in = new BufferedReader(new InputStreamReader(
                urlConn.getInputStream(), "UTF-8"));

首先,您必须使用 GZIPInputStream 解压缩内容,然后将解压后的流包装进 InputStreamReader,并使用 BufferedReader 读取它。

使用 Apache HTTP Client 4.1.1

Maven dependency

<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.1.1</version>
</dependency>   

用于解析 gzip 内容的示例代码。

package com.gzip.simple;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.zip.GZIPInputStream;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
/**
 * Example that fetches a page with Apache HttpClient and prints its body,
 * transparently decompressing it when the server answers with
 * {@code Content-Encoding: gzip}.
 */
public class GZIPFetcher {
    /**
     * Requests http://excite.com/education, fails with a RuntimeException on any
     * non-200 status, and otherwise prints the (decompressed) body line by line.
     * I/O and protocol errors are reported on standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        DefaultHttpClient httpClient = new DefaultHttpClient();
        try {
            HttpGet getRequest = new HttpGet("http://excite.com/education");
            // NOTE(review): the target looks like an HTML page; confirm that
            // "application/json" is really the intended Accept value.
            getRequest.addHeader("accept", "application/json");
            HttpResponse response = httpClient.execute(getRequest);
            if (response.getStatusLine().getStatusCode() != 200) {
                throw new RuntimeException("Failed : HTTP error code : "
                        + response.getStatusLine().getStatusCode());
            }
            InputStream instream = response.getEntity().getContent();
            // Check whether the content-encoding is gzip or not.
            Header contentEncoding = response
                    .getFirstHeader("Content-Encoding");
            if (contentEncoding != null
                    && contentEncoding.getValue().equalsIgnoreCase("gzip")) {
                instream = new GZIPInputStream(instream);
            }
            // Closing the reader closes the wrapped content stream as well.
            try (BufferedReader in = new BufferedReader(new InputStreamReader(
                    instream))) {
                String content;
                // The escape sequence had lost its backslash and printed a literal " n".
                System.out.println("Output from Server .... \n");
                while ((content = in.readLine()) != null) {
                    System.out.println(content);
                }
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always release the connection pool, including on the non-200
            // RuntimeException path and on I/O failures.
            httpClient.getConnectionManager().shutdown();
        }
    }
}

最新更新