// Blog post: parsing a website's photo album with htmlparser and downloading it with multiple threads (posted 2008-10-08 16:03)


package photoDownload;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;

/**

* 多线程的图片专辑下载工具

* @author chenxiaowen

*

*/

public class FoodqsPhotoDownload {

/**

* 首页url

*/

private String indexUrl = null;

/**

* 本地下载到目录

*/

private File localFild = null;

/**

* html页面的编码

*/

private String enc = null;

/**

* url目录

*/

private String urlFold = null;

/**

* 要下载的图片数

*/

private int totalPics = 0;

/**

* 已下载的图片数

*/

private int downloadPics = 0;

/**

* 页面分析完毕,可以分析是否下载完毕了

*/

private boolean pageurlSetover = false;

private long beginTimeMillis;

/**

*

* @throws IOException

*/

public static void main(String[] args) throws IOException {

   FoodqsPhotoDownload test = new FoodqsPhotoDownload(

     "http://www.foodqs.com/news/jkys15/20085249110126.htm",

     "不可思议:米饭的最高境界", "c:/中国食品产业网_pic", "gb2312");

   test.begin();

}

/**

* 开始下载

*/

public void begin() {

   beginTimeMillis = System.currentTimeMillis();

   new DownLoadPageAndDownloadPhoto(this.indexUrl, true).start();

}

public FoodqsPhotoDownload(String indexUrl, String title,

    String localFild, String enc) {

   this.indexUrl = indexUrl;

   urlFold = FilenameUtils.getFullPath(indexUrl);

   if (localFild.endsWith("/") || localFild.endsWith("\\")) {

    localFild = localFild.substring(0, localFild.length() - 1);

   }

   this.localFild = new File(localFild + File.separator + title);

   {

    if (!this.localFild.exists())

     this.localFild.mkdirs();

   }

   this.enc = enc;

}

/**

* 因为FileUtils不支持“String content =

* FileUtils.readFileToString(FileUtils.toFile(new

* URL("http://www.baidu.com")), "gb2312");”所以添加个方法

*

* @param source

* @param encoding

* @return

* @throws IOException

*/

public static String readURLToString(URL source, String encoding)

    throws IOException {

   InputStream input = source.openStream();

   try {

    return IOUtils.toString(input, encoding);

   } finally {

    IOUtils.closeQuietly(input);

   }

}

/**

* 获得关键的td节点,id=zoom

* @param content

* @return

*/

public Node getKeyTD(String content) {

   Parser parser = Parser.createParser(content, enc);

   HtmlPage page = new HtmlPage(parser);

   try {

    parser.visitAllNodesWith(page);

   } catch (ParserException e1) {

   }

   NodeList tdNode = page.getBody();

   NodeFilter tdfilter = new NodeAttributeFilter("id", "zoom",

     TableColumn.class);

   tdNode = tdNode.extractAllNodesThatMatch(tdfilter, true);

   if (tdNode!= null && tdNode.size() > 0) {

    return tdNode.elementAt(0);

   }

   return null;

}

/**

* 获得某个图片的url

* @param content

* @param tdid

* @return

*/

public String getImgUrl(Node keyTDNode) {

   NodeList imgChildNodelist = keyTDNode.getChildren();

   NodeFilter imgChildFilter = new TagNameFilter("IMG");

   imgChildNodelist = imgChildNodelist.extractAllNodesThatMatch(

     imgChildFilter, true);

   if (imgChildNodelist.size() > 0) {

    ImageTag img = (ImageTag) imgChildNodelist.elementAt(0);

    return img.getImageURL();

   }

   return null;

}

/**

* 获得首页除外的其它页面的链接

* @param keyTDNode

* @return

*/

public NodeList getOtherPageUrl(Node keyTDNode) {

   NodeList childNodelist = keyTDNode.getChildren();

   NodeFilter childfilter = new NodeFilter() {

    public boolean accept(Node node) {

     if (node instanceof LinkTag) {

      String txt = ((LinkTag) node).getStringText();// id大小写无所谓

      if (txt.length() >= 3 && !"[1]".equals(txt)) {

       txt = txt.replace("[", "");

       txt = txt.replace("]", "");

       try {

        int i = Integer.parseInt(txt);

        if (i > 0) {

         return true;

        }

       } catch (NumberFormatException e) {

       }

      }

     }

     return false;

    }

   };

   return childNodelist.extractAllNodesThatMatch(childfilter, true);

}

// implements Runnable

//一旦替换成Runnable之后, 在TestURLThread的run里面, sendMSsg.notify()将不能及时唤醒SendMsg的wait(),只能靠 this.wait(30*1000);来退出等待。

class PhotoDownload extends Thread {

   String source = null;

   public PhotoDownload(String source) {

    this.source = source;

   }

   public void run() {

    try {

     File destination = null;

     String name = null;

     URL url = null;

     url = new URL(source);

     name = FilenameUtils.getName(source);

     destination = new File(localFild, name);

     System.out.println("正在下载图片 " + name + "……");

     FileUtils.copyURLToFile(url, destination);

     System.out.println("图片下载完成 " + name + "【ok】");

     downloadPics++;

     if(pageurlSetover && downloadPics==totalPics){

      beginTimeMillis = System.currentTimeMillis() - beginTimeMillis;

      System.out.println("【全部】图片下载完成!耗时"+(beginTimeMillis/1000)+"s");

     }

    } catch (MalformedURLException e) {

     e.printStackTrace();

    } catch (IOException e) {

     e.printStackTrace();

    }

   }

}

class NodeAttributeFilter implements NodeFilter {

   String attributeName = null;

   String value = null;

   Class tagType = null;

  

   public NodeAttributeFilter(String attributeName, String value) {

    this.attributeName = attributeName;

    this.value = value;

   }

  

   public NodeAttributeFilter(String attributeName, String value,

     Class tagType) {

    this.attributeName = attributeName;

    this.value = value;

    this.tagType = tagType;

   }

  

   public boolean accept(Node node) {

    if (tagType != null && node.getClass()!= tagType) {// TableColumn

     return false;

    }

   

    String id = ((TableColumn) node).getAttribute(attributeName);// id大小写无所谓

    if (value.equals(id)) {

     return true;

    }

   

    return false;

   }

}

class DownLoadPageAndDownloadPhoto extends Thread {

   String pageUrl = null;

   boolean otherPage = false;

   public DownLoadPageAndDownloadPhoto(String pageUrl) {

    this.pageUrl = pageUrl;

   }

   public DownLoadPageAndDownloadPhoto(String pageUrl, boolean otherPage) {

    this.pageUrl = pageUrl;

    this.otherPage = otherPage;

   }

   public void run() {

    try {

     System.out.println("正在下载页面 " + pageUrl);

     String pageContent = readURLToString(new URL(pageUrl), enc);

     System.out.println("开始分析页面图片链接" + pageUrl);

     Node keyTDNode = getKeyTD(pageContent);

     String img = getImgUrl(keyTDNode);

     if(img != null){

      new PhotoDownload(img).start();

     }

     if (otherPage) {

      NodeList otherPageUrls = getOtherPageUrl(keyTDNode);

      for (int i = 0; i < otherPageUrls.size(); i++) {

       LinkTag pageLink = (LinkTag) otherPageUrls.elementAt(i);

       new DownLoadPageAndDownloadPhoto(urlFold + pageLink.getLink(), false).start();

      }

      totalPics = 1 + otherPageUrls.size();

      pageurlSetover = true;

     }

    } catch (MalformedURLException e) {

     e.printStackTrace();

    } catch (IOException e) {

     e.printStackTrace();

    }

   }

}

}

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
// Copyright © 1997-2009 NetEase, Inc. All rights reserved.