杰瑞科技汇

Java如何获取URL的HTML内容?

Java 获取URL的HTML内容

在Java中,有几种方法可以获取指定URL的HTML内容,以下是几种常用的实现方式:

Java如何获取URL的HTML内容?-图1
(图片来源网络,侵删)

使用Java标准库(HttpURLConnection)

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
/**
 * Minimal example that downloads a page with {@link HttpURLConnection}.
 */
public class HtmlFetcher {
    /**
     * Fetches the body served at {@code urlString} with an HTTP GET request.
     *
     * <p>The body is returned exactly as received, including line breaks, and is
     * decoded as UTF-8 regardless of the platform default charset.
     *
     * @param urlString absolute http/https URL to fetch
     * @return the response body as text
     * @throws Exception if the URL is malformed, an I/O error occurs, or the
     *     server answers with a status other than 200
     */
    public static String getHtml(String urlString) throws Exception {
        URL url = new URL(urlString);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        try {
            // Request method
            connection.setRequestMethod("GET");
            // Request header (optional)
            connection.setRequestProperty("User-Agent", "Mozilla/5.0");
            // Only a 200 response is treated as success
            int responseCode = connection.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                throw new Exception("HTTP请求失败,响应码: " + responseCode);
            }
            StringBuilder html = new StringBuilder();
            // try-with-resources closes the stream even when reading fails midway
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                // Copy raw characters instead of using readLine(), which strips
                // the line terminators and would glue adjacent lines together.
                char[] buffer = new char[8192];
                int read;
                while ((read = reader.read(buffer)) != -1) {
                    html.append(buffer, 0, read);
                }
            }
            return html.toString();
        } finally {
            connection.disconnect();
        }
    }

    /** Demo entry point: prints the HTML of example.com or the failure stack trace. */
    public static void main(String[] args) {
        try {
            String html = getHtml("https://www.example.com");
            System.out.println(html);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

使用第三方库(Apache HttpClient)

如果你使用的是Maven项目,可以添加以下依赖:

<!-- Apache HttpClient 4.5.13: supplies the org.apache.http.* classes imported by the example that follows. -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.13</version>
</dependency>

然后使用以下代码:

import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
/**
 * Example that downloads a page with Apache HttpClient 4.x.
 */
public class HtmlFetcherWithHttpClient {
    /**
     * Fetches the body served at {@code urlString} with an HTTP GET request.
     *
     * @param urlString absolute http/https URL to fetch
     * @return the response body as text; decoded with the charset declared by the
     *     response, or as UTF-8 when the response declares none
     * @throws Exception if the request fails or the server answers with a status
     *     other than 200
     */
    public static String getHtml(String urlString) throws Exception {
        // try-with-resources closes the client when the method exits
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            HttpGet request = new HttpGet(urlString);
            // Request header (optional)
            request.setHeader("User-Agent", "Mozilla/5.0");
            HttpResponse response = httpClient.execute(request);
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != 200) {
                throw new Exception("HTTP请求失败,响应码: " + statusCode);
            }
            // Without an explicit default, EntityUtils decodes charset-less
            // responses as ISO-8859-1, which garbles UTF-8 pages.
            return EntityUtils.toString(response.getEntity(), "UTF-8");
        }
    }

    /** Demo entry point: prints the HTML of example.com or the failure stack trace. */
    public static void main(String[] args) {
        try {
            String html = getHtml("https://www.example.com");
            System.out.println(html);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

使用Java 11+内置的HttpClient

如果你使用的是Java 11或更高版本,可以使用内置的HttpClient(java.net.http包):

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
/**
 * Example that downloads a page with the JDK's {@code java.net.http} client.
 */
public class HtmlFetcherJava11 {
    /**
     * Sends a GET request to {@code urlString} and returns the response body.
     *
     * @param urlString absolute URL to fetch
     * @return the response body as text
     * @throws Exception if sending fails or the status code is not 200
     */
    public static String getHtml(String urlString) throws Exception {
        HttpClient httpClient = HttpClient.newHttpClient();

        HttpRequest.Builder requestBuilder = HttpRequest.newBuilder();
        requestBuilder.uri(URI.create(urlString));
        requestBuilder.header("User-Agent", "Mozilla/5.0");
        HttpRequest getRequest = requestBuilder.build();

        HttpResponse<String> reply =
                httpClient.send(getRequest, HttpResponse.BodyHandlers.ofString());

        int status = reply.statusCode();
        if (status == 200) {
            return reply.body();
        }
        throw new Exception("HTTP请求失败,响应码: " + status);
    }

    /** Demo entry point: prints the HTML of example.com or the failure stack trace. */
    public static void main(String[] args) {
        String page;
        try {
            page = getHtml("https://www.example.com");
        } catch (Exception failure) {
            failure.printStackTrace();
            return;
        }
        System.out.println(page);
    }
}

注意事项

  1. 异常处理:网络请求可能会抛出各种异常,需要妥善处理
  2. 编码问题:如果目标网页使用非UTF-8编码,可能需要指定正确的字符集
  3. 超时设置:长时间运行或网络不稳定时,应设置连接和读取超时
  4. User-Agent:某些网站会拒绝没有User-Agent头的请求
  5. HTTPS:对于HTTPS请求,可能需要处理SSL证书验证

更健壮的实现示例

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
/**
 * More defensive page download built on {@link HttpURLConnection}: sets browser-like
 * request headers and timeouts, honours the charset announced in the
 * {@code Content-Type} header, and always releases the connection.
 */
public class RobustHtmlFetcher {
    /** Prefix of the Content-Type parameter that names the body's character set. */
    private static final String CHARSET_PREFIX = "charset=";

    /**
     * Fetches the body served at {@code urlString} with an HTTP GET request.
     *
     * @param urlString absolute http/https URL to fetch
     * @return the response body as text, decoded with the charset from the
     *     {@code Content-Type} header, or UTF-8 when none is usable
     * @throws Exception if the URL is malformed, an I/O error or timeout occurs, or
     *     the server answers with a status other than 200
     */
    public static String getHtml(String urlString) throws Exception {
        URL url = new URL(urlString);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        try {
            // Request method
            connection.setRequestMethod("GET");
            // Request headers
            connection.setRequestProperty("User-Agent", "Mozilla/5.0");
            connection.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            connection.setRequestProperty("Accept-Language", "en-US,en;q=0.5");
            // Timeouts (milliseconds)
            connection.setConnectTimeout(5000);
            connection.setReadTimeout(5000);
            // Only a 200 response is treated as success
            int responseCode = connection.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                throw new Exception("HTTP请求失败,响应码: " + responseCode);
            }
            // Prefer the charset declared in Content-Type
            String charset = resolveCharset(connection.getContentType());
            StringBuilder html = new StringBuilder();
            // try-with-resources closes the stream even when reading fails midway
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), charset))) {
                char[] buffer = new char[1024];
                int read;
                while ((read = reader.read(buffer)) != -1) {
                    html.append(buffer, 0, read);
                }
            }
            return html.toString();
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Extracts a usable charset name from a Content-Type header value.
     *
     * <p>The parameter name is matched case-insensitively, surrounding double quotes
     * are removed, and UTF-8 is returned when the header is absent, has no charset
     * parameter, or names a charset this JVM does not support.
     */
    private static String resolveCharset(String contentType) {
        String fallback = StandardCharsets.UTF_8.name();
        if (contentType == null) {
            return fallback;
        }
        for (String part : contentType.split(";")) {
            String parameter = part.trim();
            if (!parameter.regionMatches(true, 0, CHARSET_PREFIX, 0, CHARSET_PREFIX.length())) {
                continue;
            }
            String name = parameter.substring(CHARSET_PREFIX.length()).trim();
            if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                name = name.substring(1, name.length() - 1).trim();
            }
            try {
                // Fully qualified so that this class's import list stays unchanged.
                if (java.nio.charset.Charset.isSupported(name)) {
                    return name;
                }
            } catch (IllegalArgumentException ignored) {
                // Syntactically illegal charset name (e.g. empty): use the fallback.
            }
            // Only the first charset parameter is considered.
            return fallback;
        }
        return fallback;
    }

    /** Demo entry point: prints the HTML of example.com or the failure stack trace. */
    public static void main(String[] args) {
        try {
            String html = getHtml("https://www.example.com");
            System.out.println(html);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

选择哪种方法取决于你的具体需求、Java版本和项目依赖。对于简单的需求,标准库的HttpURLConnection就足够了;对于更复杂的场景,Apache HttpClient或Java 11+的HttpClient可能是更好的选择。

Java如何获取URL的HTML内容?-图2
(图片来源网络,侵删)
Java如何获取URL的HTML内容?-图3
(图片来源网络,侵删)
分享:
扫描分享到社交APP
上一篇
下一篇