Java 获取URL的HTML内容
在Java中,有几种方法可以获取指定URL的HTML内容,以下是几种常用的实现方式:

使用Java标准库(HttpURLConnection)
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
/**
 * Minimal example that downloads the body of a URL with {@link HttpURLConnection}.
 */
public class HtmlFetcher {
    /**
     * Fetches the response body of {@code urlString} with an HTTP GET request.
     *
     * <p>The body is decoded as UTF-8 and returned exactly as received, line
     * breaks included.
     *
     * @param urlString absolute URL to request
     * @return the response body as text
     * @throws Exception if the URL is malformed, an I/O error occurs, or the
     *     response code is not 200
     */
    public static String getHtml(String urlString) throws Exception {
        URL url = new URL(urlString);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        try {
            connection.setRequestMethod("GET");
            // Optional request header.
            connection.setRequestProperty("User-Agent", "Mozilla/5.0");

            int responseCode = connection.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                throw new Exception("HTTP请求失败,响应码: " + responseCode);
            }

            // try-with-resources closes the stream even when reading fails; the
            // charset is explicit so the result does not depend on the platform default.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
                StringBuilder html = new StringBuilder();
                // Copy raw characters instead of readLine(), which strips line terminators.
                char[] buffer = new char[4096];
                int read;
                while ((read = reader.read(buffer)) != -1) {
                    html.append(buffer, 0, read);
                }
                return html.toString();
            }
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Demo entry point: prints the HTML of https://www.example.com, or the stack
     * trace if the fetch fails.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            String html = getHtml("https://www.example.com");
            System.out.println(html);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
使用第三方库(Apache HttpClient)
如果你使用的是Maven项目,可以添加以下依赖:
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
然后使用以下代码:
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
/**
 * Example that downloads the body of a URL with Apache HttpClient 4.x.
 */
public class HtmlFetcherWithHttpClient {
    /**
     * Fetches the response body of {@code urlString} with an HTTP GET request.
     *
     * <p>The body is decoded with the charset declared by the response; when the
     * response declares none, UTF-8 is used. A response without a body yields an
     * empty string.
     *
     * @param urlString absolute URL to request
     * @return the response body as text, or {@code ""} if the response has no entity
     * @throws Exception if the request fails or the status code is not 200
     */
    public static String getHtml(String urlString) throws Exception {
        // Closing the client also releases the connection held by the response.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            HttpGet request = new HttpGet(urlString);
            // Optional request header.
            request.setHeader("User-Agent", "Mozilla/5.0");

            HttpResponse response = httpClient.execute(request);
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != 200) {
                throw new Exception("HTTP请求失败,响应码: " + statusCode);
            }

            // EntityUtils rejects a null entity, so handle a body-less response here.
            if (response.getEntity() == null) {
                return "";
            }
            // The second argument is only a fallback for responses that declare no charset;
            // without it EntityUtils falls back to ISO-8859-1.
            return EntityUtils.toString(response.getEntity(), "UTF-8");
        }
    }

    /**
     * Demo entry point: prints the HTML of https://www.example.com, or the stack
     * trace if the fetch fails.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            String html = getHtml("https://www.example.com");
            System.out.println(html);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
使用Java 11+内置的HttpClient(java.net.http)
如果你使用的是Java 11或更高版本,可以使用内置的HttpClient(该API在Java 9/10中仅以孵化模块jdk.incubator.httpclient的形式提供,自Java 11起才成为标准API):
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
/**
 * Example that downloads the body of a URL with the JDK's {@code java.net.http} client.
 */
public class HtmlFetcherJava11 {
    /**
     * Sends a GET request to {@code urlString} and returns the response body as text.
     *
     * @param urlString absolute URL to request
     * @return the response body
     * @throws Exception if the request cannot be sent or the status code is not 200
     */
    public static String getHtml(String urlString) throws Exception {
        HttpClient httpClient = HttpClient.newHttpClient();

        HttpRequest getRequest = HttpRequest.newBuilder(URI.create(urlString))
                .header("User-Agent", "Mozilla/5.0")
                .build();

        HttpResponse<String> reply =
                httpClient.send(getRequest, HttpResponse.BodyHandlers.ofString());

        int status = reply.statusCode();
        if (status == 200) {
            return reply.body();
        }
        throw new Exception("HTTP请求失败,响应码: " + status);
    }

    /**
     * Demo entry point: prints the HTML of https://www.example.com, or the stack
     * trace if the fetch fails.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            System.out.println(getHtml("https://www.example.com"));
        } catch (Exception failure) {
            failure.printStackTrace();
        }
    }
}
注意事项
- 异常处理:网络请求可能会抛出各种异常,需要妥善处理
- 编码问题:如果目标网页使用非UTF-8编码,可能需要指定正确的字符集
- 超时设置:长时间运行或网络不稳定时,应设置连接和读取超时
- User-Agent:某些网站会拒绝没有User-Agent头或使用Java默认User-Agent(如"Java/版本号")的请求,因此示例中显式设置了该请求头
- HTTPS:JDK默认会根据信任库校验服务器证书;访问使用自签名或私有CA证书的站点时,应将相应证书导入信任库,而不是关闭证书校验
更健壮的实现示例
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
/**
 * Downloads the body of a URL with {@link HttpURLConnection}, using timeouts and
 * the charset announced in the response's Content-Type header.
 */
public class RobustHtmlFetcher {
    /** Connect and read timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;

    /**
     * Fetches the response body of {@code urlString} with an HTTP GET request.
     *
     * <p>The body is decoded with the charset named in the Content-Type header;
     * UTF-8 is used when the header is absent, names no charset, or names one
     * this JVM does not support.
     *
     * @param urlString absolute URL to request
     * @return the response body as text
     * @throws Exception if the URL is malformed, an I/O error or timeout occurs,
     *     or the response code is not 200
     */
    public static String getHtml(String urlString) throws Exception {
        URL url = new URL(urlString);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        try {
            connection.setRequestMethod("GET");

            // Request headers.
            connection.setRequestProperty("User-Agent", "Mozilla/5.0");
            connection.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            connection.setRequestProperty("Accept-Language", "en-US,en;q=0.5");

            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            int responseCode = connection.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                throw new Exception("HTTP请求失败,响应码: " + responseCode);
            }

            Charset charset = resolveCharset(connection.getContentType());

            // try-with-resources closes the stream even when reading fails.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), charset))) {
                StringBuilder html = new StringBuilder();
                char[] buffer = new char[1024];
                int read;
                while ((read = reader.read(buffer)) != -1) {
                    html.append(buffer, 0, read);
                }
                return html.toString();
            }
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Extracts the charset from a Content-Type header value, defaulting to UTF-8.
     *
     * @param contentType header value such as {@code text/html; charset="utf-8"}, or {@code null}
     * @return the first declared charset if it is supported, otherwise UTF-8
     */
    private static Charset resolveCharset(String contentType) {
        if (contentType == null) {
            return StandardCharsets.UTF_8;
        }
        final String key = "charset=";
        for (String part : contentType.split(";")) {
            String parameter = part.trim();
            // Parameter names are case-insensitive, so "Charset=" must match too.
            if (!parameter.regionMatches(true, 0, key, 0, key.length())) {
                continue;
            }
            String name = parameter.substring(key.length()).trim();
            // The value may be a quoted string: charset="utf-8".
            if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                name = name.substring(1, name.length() - 1);
            }
            try {
                return Charset.forName(name);
            } catch (IllegalArgumentException e) {
                // Malformed or unsupported charset name: decode as UTF-8 rather than fail.
                return StandardCharsets.UTF_8;
            }
        }
        return StandardCharsets.UTF_8;
    }

    /**
     * Demo entry point: prints the HTML of https://www.example.com, or the stack
     * trace if the fetch fails.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            String html = getHtml("https://www.example.com");
            System.out.println(html);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
选择哪种方法取决于你的具体需求、Java版本和项目依赖。对于简单的需求,标准库的HttpURLConnection就足够了;对于更复杂的场景,Apache HttpClient或Java 11+的HttpClient可能是更好的选择。


