// Uses htmlparser to analyse a website's photo album and downloads the pictures with multiple threads (2008-10-08 16:03)
package photoDownload;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
/**
 * Multi-threaded photo album download tool.
 *
 * <p>The index page of an album is fetched and parsed; the picture found in
 * its {@code <td id="zoom">} cell is downloaded, and every numbered pager
 * link ("[2]", "[3]", ...) inside that cell is fetched on its own thread to
 * download that page's picture as well.
 *
 * <p>Thread-safety: the progress counters are only touched while holding the
 * monitor of the enclosing instance (see {@link #registerTotal(int)} and
 * {@link #photoFinished(boolean)}).
 *
 * @author chenxiaowen
 */
public class FoodqsPhotoDownload {
    /**
     * URL of the album's index (first) page.
     */
    private final String indexUrl;
    /**
     * Local directory the pictures are saved into.
     */
    private final File localFild;
    /**
     * Character encoding of the HTML pages.
     */
    private final String enc;
    /**
     * Number of pictures expected to be downloaded. Guarded by {@code this}.
     */
    private int totalPics = 0;
    /**
     * Number of pictures downloaded successfully. Guarded by {@code this}.
     */
    private int downloadPics = 0;
    /**
     * Number of expected pictures that could not be downloaded (page failed,
     * no image on the page, or the image download failed). Guarded by
     * {@code this}.
     */
    private int failedPics = 0;
    /**
     * Set once the index page has been analysed and {@link #totalPics} is
     * known, i.e. completion may be evaluated. Guarded by {@code this}.
     */
    private boolean pageurlSetover = false;
    /**
     * Wall-clock time at which {@link #begin()} was called.
     */
    private long beginTimeMillis;

    /**
     * Demo entry point: downloads one hard-coded album.
     *
     * @param args ignored
     * @throws IOException declared for backward compatibility
     */
    public static void main(String[] args) throws IOException {
        FoodqsPhotoDownload test = new FoodqsPhotoDownload(
                "http://www.foodqs.com/news/jkys15/20085249110126.htm",
                "不可思议:米饭的最高境界", "c:/中国食品产业网_pic", "gb2312");
        test.begin();
    }

    /**
     * Starts the download. Returns immediately; the work is done on
     * background threads.
     */
    public void begin() {
        beginTimeMillis = System.currentTimeMillis();
        new DownLoadPageAndDownloadPhoto(this.indexUrl, true).start();
    }

    /**
     * Creates a downloader and makes sure the target directory exists.
     *
     * @param indexUrl  URL of the album's index page
     * @param title     album title, used as the name of the sub-directory
     * @param localFild parent directory for the album (a trailing slash or
     *                  backslash is tolerated)
     * @param enc       character encoding of the HTML pages
     */
    public FoodqsPhotoDownload(String indexUrl, String title,
            String localFild, String enc) {
        this.indexUrl = indexUrl;
        if (localFild.endsWith("/") || localFild.endsWith("\\")) {
            localFild = localFild.substring(0, localFild.length() - 1);
        }
        this.localFild = new File(localFild + File.separator + title);
        // mkdirs() returns false both on failure and when another process
        // created the directory first, hence the second isDirectory() check.
        if (!this.localFild.isDirectory() && !this.localFild.mkdirs()
                && !this.localFild.isDirectory()) {
            System.err.println("无法创建下载目录 " + this.localFild);
        }
        this.enc = enc;
    }

    /**
     * Reads the whole content behind a URL into a string. Added because
     * {@code FileUtils.readFileToString(FileUtils.toFile(new
     * URL("http://www.baidu.com")), "gb2312")} does not work for remote URLs.
     *
     * @param source   URL to read
     * @param encoding character encoding used to decode the bytes
     * @return the decoded content
     * @throws IOException if the URL cannot be opened or read
     */
    public static String readURLToString(URL source, String encoding)
            throws IOException {
        InputStream input = source.openStream();
        try {
            return IOUtils.toString(input, encoding);
        } finally {
            IOUtils.closeQuietly(input);
        }
    }

    /**
     * Finds the key table cell of a page, i.e. the first {@code <td>} whose
     * {@code id} attribute equals {@code "zoom"}.
     *
     * @param content HTML source of the page
     * @return the matching node, or {@code null} if there is none
     */
    public Node getKeyTD(String content) {
        if (content == null) {
            return null;
        }
        Parser parser = Parser.createParser(content, enc);
        HtmlPage page = new HtmlPage(parser);
        try {
            parser.visitAllNodesWith(page);
        } catch (ParserException e) {
            // Best effort: report the problem but still search whatever
            // part of the document was collected before the failure.
            e.printStackTrace();
        }
        NodeList body = page.getBody();
        if (body == null) {
            return null;
        }
        NodeFilter tdFilter = new NodeAttributeFilter("id", "zoom",
                TableColumn.class);
        NodeList matches = body.extractAllNodesThatMatch(tdFilter, true);
        if (matches != null && matches.size() > 0) {
            return matches.elementAt(0);
        }
        return null;
    }

    /**
     * Returns the URL of the first image found below the key table cell.
     *
     * @param keyTDNode node returned by {@link #getKeyTD(String)}; may be
     *                  {@code null}
     * @return the image URL as written in the page, or {@code null} if the
     *         node is {@code null} or contains no image
     */
    public String getImgUrl(Node keyTDNode) {
        if (keyTDNode == null) {
            return null;
        }
        NodeList children = keyTDNode.getChildren();
        if (children == null) {
            return null;
        }
        NodeList images = children.extractAllNodesThatMatch(
                new TagNameFilter("IMG"), true);
        if (images.size() > 0) {
            Node first = images.elementAt(0);
            if (first instanceof ImageTag) {
                return ((ImageTag) first).getImageURL();
            }
        }
        return null;
    }

    /**
     * Collects the links to all album pages other than the first one. A link
     * qualifies when its text, with the square brackets removed, is a
     * positive integer and the text is not {@code "[1]"}.
     *
     * @param keyTDNode node returned by {@link #getKeyTD(String)}; may be
     *                  {@code null}
     * @return the matching {@link LinkTag} nodes; empty (never {@code null})
     *         when the node is {@code null} or has no children
     */
    public NodeList getOtherPageUrl(Node keyTDNode) {
        NodeList children = keyTDNode == null ? null : keyTDNode.getChildren();
        if (children == null) {
            return new NodeList();
        }
        NodeFilter pagerFilter = new NodeFilter() {
            public boolean accept(Node node) {
                if (!(node instanceof LinkTag)) {
                    return false;
                }
                String txt = ((LinkTag) node).getStringText();
                if (txt == null || txt.length() < 3 || "[1]".equals(txt)) {
                    return false;
                }
                txt = txt.replace("[", "").replace("]", "");
                try {
                    return Integer.parseInt(txt) > 0;
                } catch (NumberFormatException ignored) {
                    // Not a pager link (e.g. ordinary text link): skip it.
                    return false;
                }
            }
        };
        return children.extractAllNodesThatMatch(pagerFilter, true);
    }

    /**
     * Resolves a link found in a page against that page's URL, so relative,
     * root-relative and absolute links are all handled.
     *
     * @return the absolute URL, or {@code null} if it cannot be built
     */
    private static String resolveUrl(URL base, String link) {
        if (base == null || link == null || link.trim().length() == 0) {
            return null;
        }
        try {
            return new URL(base, link.trim()).toString();
        } catch (MalformedURLException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Publishes the number of expected pictures. Must be called before any
     * worker that reports through {@link #photoFinished(boolean)} is started.
     */
    private synchronized void registerTotal(int total) {
        totalPics = total;
        pageurlSetover = true;
    }

    /**
     * Records the outcome of one expected picture and prints the summary
     * once every expected picture has been accounted for.
     *
     * @param success {@code true} if the picture was saved, {@code false} if
     *                it could not be downloaded
     */
    private synchronized void photoFinished(boolean success) {
        if (success) {
            downloadPics++;
        } else {
            failedPics++;
        }
        if (!pageurlSetover || downloadPics + failedPics != totalPics) {
            return;
        }
        long elapsedSeconds = (System.currentTimeMillis() - beginTimeMillis) / 1000;
        if (failedPics == 0) {
            System.out.println("【全部】图片下载完成!耗时" + elapsedSeconds + "s");
        } else {
            System.out.println("图片下载结束,成功" + downloadPics + "张,失败"
                    + failedPics + "张!耗时" + elapsedSeconds + "s");
        }
    }

    // Kept as a Thread subclass rather than "implements Runnable".
    // Original author's note: after switching to Runnable, sendMSsg.notify()
    // inside TestURLThread.run() no longer woke SendMsg's wait() in time and
    // the wait only ended through this.wait(30*1000). (Those classes are not
    // part of this file.)
    /**
     * Downloads a single picture into the album directory.
     */
    class PhotoDownload extends Thread {
        String source = null;

        /**
         * @param source absolute URL of the picture
         */
        public PhotoDownload(String source) {
            this.source = source;
        }

        public void run() {
            boolean success = false;
            try {
                URL url = new URL(source);
                String name = FilenameUtils.getName(source);
                File destination = new File(localFild, name);
                System.out.println("正在下载图片 " + name + "……");
                FileUtils.copyURLToFile(url, destination);
                System.out.println("图片下载完成 " + name + "【ok】");
                success = true;
            } catch (MalformedURLException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // Always account for this picture, otherwise the overall
                // completion message could never be printed.
                photoFinished(success);
            }
        }
    }

    /**
     * Node filter that accepts tags whose attribute has a given value,
     * optionally restricted to one exact tag class.
     */
    class NodeAttributeFilter implements NodeFilter {
        String attributeName = null;
        String value = null;
        Class<?> tagType = null;

        /**
         * @param attributeName name of the attribute to inspect
         * @param value         required attribute value
         */
        public NodeAttributeFilter(String attributeName, String value) {
            this.attributeName = attributeName;
            this.value = value;
        }

        /**
         * @param attributeName name of the attribute to inspect
         * @param value         required attribute value
         * @param tagType       exact node class to accept, or {@code null}
         *                      for any tag
         */
        public NodeAttributeFilter(String attributeName, String value,
                Class<?> tagType) {
            this.attributeName = attributeName;
            this.value = value;
            this.tagType = tagType;
        }

        public boolean accept(Node node) {
            if (tagType != null && node.getClass() != tagType) {
                return false;
            }
            // Only tags carry attributes; text and remark nodes never match.
            if (!(node instanceof Tag)) {
                return false;
            }
            String actual = ((Tag) node).getAttribute(attributeName);
            return value != null && value.equals(actual);
        }
    }

    /**
     * Fetches one album page, starts the download of its picture and, for
     * the index page, starts one such thread for every other album page.
     */
    class DownLoadPageAndDownloadPhoto extends Thread {
        String pageUrl = null;
        /** {@code true} if the pager links of this page must be followed. */
        boolean otherPage = false;

        /**
         * @param pageUrl URL of the page; pager links are not followed
         */
        public DownLoadPageAndDownloadPhoto(String pageUrl) {
            this.pageUrl = pageUrl;
        }

        /**
         * @param pageUrl   URL of the page
         * @param otherPage whether the links to the other album pages found
         *                  on this page should be followed
         */
        public DownLoadPageAndDownloadPhoto(String pageUrl, boolean otherPage) {
            this.pageUrl = pageUrl;
            this.otherPage = otherPage;
        }

        public void run() {
            URL base = null;
            Node keyTDNode = null;
            try {
                System.out.println("正在下载页面 " + pageUrl);
                URL page = new URL(pageUrl);
                String pageContent = readURLToString(page, enc);
                System.out.println("开始分析页面图片链接" + pageUrl);
                keyTDNode = getKeyTD(pageContent);
                base = page;
            } catch (MalformedURLException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }

            NodeList otherPageUrls = otherPage
                    ? getOtherPageUrl(keyTDNode) : new NodeList();
            if (otherPage) {
                // Publish the expected total BEFORE starting any worker so
                // that a fast worker cannot finish ahead of the bookkeeping.
                registerTotal(1 + otherPageUrls.size());
            }

            String img = resolveUrl(base, getImgUrl(keyTDNode));
            if (img != null) {
                new PhotoDownload(img).start();
            } else {
                System.err.println("未找到图片链接 " + pageUrl);
                photoFinished(false);
            }

            for (int i = 0; i < otherPageUrls.size(); i++) {
                LinkTag pageLink = (LinkTag) otherPageUrls.elementAt(i);
                String next = resolveUrl(base, pageLink.getLink());
                if (next != null) {
                    new DownLoadPageAndDownloadPhoto(next, false).start();
                } else {
                    photoFinished(false);
                }
            }
        }
    }
}