使用 Java 将网页保存为 MHT 格式(转)

上一篇 / 下一篇  2014-05-07 09:55:47 / 个人分类:Java学习

原文地址: http://blog.csdn.net/dongle2001/article/details/2557434

    import java.io.BufferedInputStream;  
    import java.io.BufferedOutputStream;   
    import java.io.BufferedReader;  
    import java.io.ByteArrayInputStream;  
    import java.io.DataOutputStream;  
    import java.io.File;  
    import java.io.FileInputStream;  
    import java.io.FileOutputStream;  
    import java.io.FileWriter;  
    import java.io.IOException;  
    import java.io.InputStream;
    import java.io.InputStreamReader;  
    import java.io.OutputStream;  
    import java.io.Reader;  
    import java.net.MalformedURLException;  
    import java.net.URL;  
    import java.util.*;  

    import org.htmlparser.Parser;  
    import org.htmlparser.Tag;  
    import org.htmlparser.filters.TagNameFilter;  
    import org.htmlparser.lexer.Lexer;  
    import org.htmlparser.lexer.Page;
    import org.htmlparser.util.DefaultParserFeedback;
    import org.htmlparser.util.NodeList;
    import org.htmlparser.util.ParserException;
    
    import toptrack.tools.JQuery;
   
    import javax.activation.DataHandler;
    import javax.activation.DataSource;
    import javax.activation.MimetypesFileTypeMap;
    import javax.mail.Message;
    import javax.mail.MessagingException;
    import javax.mail.Multipart;
    import javax.mail.Session;
    import javax.mail.internet.InternetAddress;
    import javax.mail.internet.MimeBodyPart;
    import javax.mail.internet.MimeMessage;
    import javax.mail.internet.MimeMultipart;
    import javax.mail.internet.MimePartDataSource;
   
    /**
     * mht文件解析类
     * @author dl
     */
    public class Html2MHTCompiler {
        private URL strWeb = null; /** Page URL; replaced by the BASE href when the page declares one. */
        private String strText = null; /** HTML text of the page; compile() rewrites resource URLs in it. */
        private String strFileName = null; /** Path of the local MHT file to write. */
        private String strEncoding = null; /** Charset name of the page. */
        // Extra mail-style headers written into the MHT message.
        // NOTE(review): to, cc and bcc are not assigned anywhere in the visible part of
        // the file, so the corresponding headers are skipped unless set elsewhere.
        private String from = "dongle2001@126.com";
        private String to;
        private String subject = "mht compile";
        private String cc;
        private String bcc;
        private String smtp = "localhost";
       
        public static void main(String[] args) {
            String strUrl = "http://www.mtime.com/my/tropicofcancer/blog/843555/";
            String strEncoding = "utf-8";
            String strText = JQuery.getHtmlText(strUrl, strEncoding, null);
            if (strText == null)
                return;
            Html2MHTCompiler h2t = new Html2MHTCompiler(strText, strUrl, strEncoding, "test.mht");
            h2t.compile();
            //Html2MHTCompiler.mht2html("test.mht", "a.html");
        }
       
      /**
      *<br>方法说明:初始化
      *<br>输入参数:strText 网页文本内容; strUrl 网页地址; strEncoding 网页编码; strFileName 本地文件名
      *<br>返回类型:
      */
        public Html2MHTCompiler(String strText, String strUrl, String strEncoding, String strFileName) {
            // TODO Auto-generated constructor stub
            try {
                strWeb = new URL(strUrl);
            } catch (MalformedURLException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
                return;
            }
                this.strText = strText;
            this.strEncoding = strEncoding;
            this.strFileName = strFileName;
        }
       
    /**
      *<br>方法说明:执行下载操作
      *<br>输入参数:
      *<br>返回类型:
      */
        public boolean compile() {
            if (strWeb == null || strText == null || strFileName == null || strEncoding == null)
                return false;
            HashMap urlMap = new HashMap();
            NodeList nodes = new NodeList();
            try {
                Parser parser = createParser(strText);
                parser.setEncoding(strEncoding);
                nodes = parser.parse(null);
            } catch (ParserException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            extractAllScriptNodes(nodes);
            ArrayList urlScriptList = extractAllScriptNodes(nodes, urlMap);
            ArrayList urlImageList = extractAllImageNodes(nodes, urlMap);
            for (Iterator iter = urlMap.entrySet().iterator(); iter.hasNext();) {
                Map.Entry entry = (Map.Entry) iter.next();
                String key = (String)entry.getKey();
                String val = (String)entry.getValue();
                strText = JHtmlClear.replace(strText, val, key);
            }
            try {
                createMhtArchive(strText, urlScriptList, urlImageList);
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
                return false;
            }
            return true;
        }
       
      /**
      *<br>方法说明:建立HTML parser
      *<br>输入参数:inputHTML 网页文本内容
      *<br>返回类型:HTML parser
      */
        private Parser createParser(String inputHTML) {
            // TODO Auto-generated method stub
            Lexer mLexer = new Lexer(new Page(inputHTML));
            return new Parser(mLexer, new DefaultParserFeedback(DefaultParserFeedback.QUIET));
        }
   
      /**
      *<br>方法说明:抽取基础URL地址
      *<br>输入参数:nodes 网页标签集合
      *<br>返回类型:
      */
        private void extractAllScriptNodes(NodeList nodes) {
            NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter(
                    "BASE"), true);
            if (filtered != null && filtered.size() > 0) {
                Tag tag = (Tag) filtered.elementAt(0);
                String href = tag.getAttribute("href");
                if (href != null && href.length() > 0) {
                    try {
                        strWeb = new URL(href);
                    } catch (MalformedURLException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }
                }
            }
        }
   
      /**
      *<br>方法说明:抽取网页包含的css,js链接
      *<br>输入参数:nodes 网页标签集合; urlMap 已存在的url集合
      *<br>返回类型:css,js链接的集合
      */
        private ArrayList extractAllScriptNodes(NodeList nodes, HashMap urlMap) {
            ArrayList urlList = new ArrayList();
            NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("script"), true);
            for (int i = 0; i < filtered.size(); i++) {
                Tag tag = (Tag) filtered.elementAt(i);
                String src = tag.getAttribute("src");
                // Handle external css file's url
                if (src != null && src.length() > 0) {
                    String innerURL = src;
                    String absoluteURL = makeAbsoluteURL(strWeb, innerURL);
                    if (absoluteURL != null && !urlMap.containsKey(absoluteURL)) {
                        urlMap.put(absoluteURL, innerURL);
                        ArrayList urlInfo = new ArrayList();
                        urlInfo.add(innerURL);
                        urlInfo.add(absoluteURL);
                        urlList.add(urlInfo);
                    }
                    tag.setAttribute("src", absoluteURL);                  
                }
            }
           
            filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("link"), true);
            for (int i = 0; i < filtered.size(); i++) {
                Tag tag = (Tag) filtered.elementAt(i);
                String type = (tag.getAttribute("type"));
                String rel = (tag.getAttribute("rel"));
                String href = tag.getAttribute("href");
   
                boolean isCssFile = false;
                if (rel != null) {
                    isCssFile = rel.indexOf("stylesheet") != -1;
                } else if (type != null) {
                    isCssFile |= type.indexOf("text/css") != -1;
                }
                // Handle external css file's url
                if (isCssFile && href != null && href.length() > 0) {
                    String innerURL = href;
                    String absoluteURL = makeAbsoluteURL(strWeb, innerURL);
                    if (absoluteURL != null && !urlMap.containsKey(absoluteURL)) {
                        urlMap.put(absoluteURL, innerURL);
                        ArrayList urlInfo = new ArrayList();
                        urlInfo.add(innerURL);
                        urlInfo.add(absoluteURL);
                        urlList.add(urlInfo);
                    }
                    tag.setAttribute("href", absoluteURL);
                }
            }
            return urlList;
        }
       
      /**
      *<br>方法说明:抽取网页包含的图像链接
      *<br>输入参数:nodes 网页标签集合; urlMap 已存在的url集合
      *<br>返回类型:图像链接集合
      */
        private ArrayList extractAllImageNodes(NodeList nodes, HashMap urlMap) {
            ArrayList urlList = new ArrayList();
            NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("IMG"), true);
            for (int i = 0; i < filtered.size(); i++) {
                Tag tag = (Tag) filtered.elementAt(i);
                String src = tag.getAttribute("src");
                // Handle external css file's url
                if (src != null && src.length() > 0) {
                    String innerURL = src;
                    String absoluteURL = makeAbsoluteURL(strWeb, innerURL);
                    if (absoluteURL != null && !urlMap.containsKey(absoluteURL)) {
                        urlMap.put(absoluteURL, innerURL);
                        ArrayList urlInfo = new ArrayList();
                        urlInfo.add(innerURL);
                        urlInfo.add(absoluteURL);
                        urlList.add(urlInfo);
                    }
                    tag.setAttribute("src", absoluteURL);                  
                }
            }
            return urlList;
        }
   
      /**
      *<br>方法说明:相对路径转绝对路径
      *<br>输入参数:strWeb 网页地址; innerURL 相对路径链接
      *<br>返回类型:绝对路径链接
      */
        public static String makeAbsoluteURL(URL strWeb, String innerURL) {
            // TODO Auto-generated method stub
            //去除后缀
            int pos = innerURL.indexOf("?");
            if (pos != -1) {
                innerURL = innerURL.substring(0, pos);
            }
            if (innerURL != null
                    && innerURL.toLowerCase().indexOf("http") == 0) {
                System.out.println(innerURL);
                return innerURL;
            }
           
            URL linkUri = null;
            try {
                linkUri = new URL(strWeb, innerURL);
            } catch (MalformedURLException e) {
                //TODO Auto-generated catch block
                e.printStackTrace();
                return null;
            }
           
            String absURL = linkUri.toString();
            absURL = JHtmlClear.replace(absURL, "../", "");
            absURL = JHtmlClear.replace(absURL, "./", "");
            System.out.println(absURL);
            return absURL;
        }
   
        /**
      *<br>方法说明:创建mht文件
      *<br>输入参数:content 网页文本内容; urlScriptList 脚本链接集合; urlImageList 图片链接集合
      *<br>返回类型:
      */
        private void createMhtArchive(String content, ArrayList urlScriptList, ArrayList urlImageList) throws Exception {
            //Instantiate a Multipart object
            MimeMultipart mp = new MimeMultipart("related");
            Properties props = new Properties();
            props.put("mail.smtp.host", smtp);
            Session session = Session.getDefaultInstance(props, null);
            MimeMessage msg = new MimeMessage(session);
            // set mailer
            msg.setHeader("X-Mailer", "Code Manager .SWT");
   
            // set from
            if (from != null) {
                msg.setFrom(new InternetAddress(from));
            }
            // set subject
            if (subject != null) {
                msg.setSubject(subject);
            }
            // to
            if (to != null) {
                InternetAddress[] toAddresses = getInetAddresses(to);
                msg.setRecipients(Message.RecipientType.TO, toAddresses);
            }
            // cc
            if (cc != null) {
                InternetAddress[] ccAddresses = getInetAddresses(cc);
                msg.setRecipients(Message.RecipientType.CC, ccAddresses);
            }
            // bcc
            if (bcc != null) {
                InternetAddress[] bccAddresses = getInetAddresses(bcc);
                msg.setRecipients(Message.RecipientType.BCC, bccAddresses);
            }
           
            //设置网页正文
            MimeBodyPart bp = new MimeBodyPart();
            bp.setText(content, strEncoding);
            bp.addHeader("Content-Type", "text/html;charset=" + strEncoding);
            bp.addHeader("Content-Location", strWeb.toString());
            mp.addBodyPart(bp);
            int urlCount = urlScriptList.size();
            for (int i = 0; i < urlCount; i++) {
                bp = new MimeBodyPart();
                ArrayList urlInfo = (ArrayList) urlScriptList.get(i);
                // String url = urlInfo.get(0).toString();
                String absoluteURL = urlInfo.get(1).toString();
                bp
                .addHeader("Content-Location",
                        javax.mail.internet.MimeUtility
                                .encodeWord(java.net.URLDecoder
                                        .decode(absoluteURL, strEncoding)));
                DataSource source = new AttachmentDataSource(absoluteURL, "text");
                bp.setDataHandler(new DataHandler(source));
                mp.addBodyPart(bp);
            }
           
            urlCount = urlImageList.size();
            for (int i = 0; i < urlCount; i++) {
                bp = new MimeBodyPart();
                ArrayList urlInfo = (ArrayList) urlImageList.get(i);
                // String url = urlInfo.get(0).toString();
                String absoluteURL = urlInfo.get(1).toString();
                bp
                .addHeader("Content-Location",
                        javax.mail.internet.MimeUtility
                                .encodeWord(java.net.URLDecoder
                                        .decode(absoluteURL, strEncoding)));
                DataSource source = new AttachmentDataSource(absoluteURL, "image");
                bp.setDataHandler(new DataHandler(source));
                mp.addBodyPart(bp);
            }
            msg.setContent(mp);
            // write the mime multi part message to a file
            msg.writeTo(new FileOutputStream(strFileName));
        }
       
      /**
      *<br>方法说明:mht转html
      *<br>输入参数:strMht mht文件路径; strHtml html文件路径
      *<br>返回类型:
      */
        public static void mht2html(String strMht, String strHtml) {
            try {
                //TODO readEmlFile
                InputStream fis = new FileInputStream(strMht);
                Session mailSession = Session.getDefaultInstance(System.getProperties(), null);
                MimeMessage msg = new MimeMessage(mailSession, fis);
                Object content = msg.getContent();
                if (content instanceof Multipart) {
                    MimeMultipart mp = (MimeMultipart)content;
                    MimeBodyPart bp1 = (MimeBodyPart)mp.getBodyPart(0);
                    String strEncodng = getEncoding(bp1);
                    String strText = getHtmlText(bp1, strEncodng);
                    if (strText == null)
                        return;
                    File parent = null;
                    if (mp.getCount() > 1) {
                        parent = new File(new File(strHtml).getAbsolutePath() + ".files");
                        parent.mkdirs();
                        if (!parent.exists())
                            return;
                    }
                    for (int i = 1; i < mp.getCount(); ++i) {
                        MimeBodyPart bp = (MimeBodyPart)mp.getBodyPart(i);
                       
                        String strUrl = getResourcesUrl(bp);
                        if (strUrl == null)
                            continue;
                       
                        DataHandler dataHandler = bp.getDataHandler();
                        MimePartDataSource source = (MimePartDataSource)dataHandler.getDataSource();
                        File resources = new File(parent.getAbsolutePath() + File.separator + getName(strUrl, i));
                        if (saveResourcesFile(resources, bp.getInputStream()))
                            strText = JHtmlClear.replace(strText, strUrl, resources.getAbsolutePath());
                    }
                    saveHtml(strText, strHtml);
                }
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }
   
      /**
      *<br>方法说明:得到资源文件的name
      *<br>输入参数:strName 资源文件链接, ID 资源文件的序号
      *<br>返回类型:资源文件的本地临时文件名
      */
        public static String getName(String strName, int ID) {
            char separator = '/';
            System.out.println(strName);
            System.out.println(separator);
            if( strName.lastIndexOf(separator) >= 0)
                 return format(strName.substring(strName.lastIndexOf(separator) + 1));
             return "temp" + ID;
        }
       
      /**
      *<br>方法说明:得到网页编码
      *<br>输入参数:bp MimeBodyPart类型的网页内容
      *<br>返回类型:MimeBodyPart里的网页内容的编码
      */
        private static String getEncoding(MimeBodyPart bp) {  
            if (bp != null) {  
                try {
                    Enumeration list = bp.getAllHeaders();
                    while (list.hasMoreElements()) {
                        javax.mail.Header head = (javax.mail.Header)list.nextElement();
                        if (head.getName().compareTo("Content-Type") == 0) {
                            String strType = head.getValue();
                            int pos = strType.indexOf("charset=");
                            if (pos != -1) {
                                String strEncoding = strType.substring(pos + 8, strType.length());
                                if (strEncoding.toLowerCase().compareTo("gb2312") == 0) {
                                    strEncoding = "gbk";
                                }
                                return strEncoding;
                            }
                        }
                    }
                } catch (MessagingException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
   
            }  
            return null;  
        }
       
      /**
      *<br>方法说明:得到资源文件url
      *<br>输入参数:bp MimeBodyPart类型的网页内容
      *<br>返回类型:资源文件url
      */
        private static String getResourcesUrl(MimeBodyPart bp) {  
            if (bp != null) {  
                try {
                    Enumeration list = bp.getAllHeaders();
                    while (list.hasMoreElements()) {
                        javax.mail.Header head = (javax.mail.Header)list.nextElement();
                        if (head.getName().compareTo("Content-Location") == 0) {
                            return head.getValue();
                        }
                    }
                } catch (MessagingException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
   
            }  
            return null;  
        }  
   
      /**
      *<br>方法说明:格式化文件名
      *<br>输入参数:strName 文件名
      *<br>返回类型:经过处理的符合命名规则的文件名
      */
        private static String format(String strName) {
            if (strName == null)
                return null;
            strName = strName.replaceAll("     ", " ");
            String strText = "///:*?/"<>|^___FCKpd___0quot;;
            for (int i = 0; i < strName.length(); ++i) {
                String ch = String.valueOf(strName.charAt(i));
                if (strText.indexOf(ch) != -1) {
                    strName = strName.replace(strName.charAt(i), '-');
                }
            }
            return strName;
        }
       
      /**
      *<br>方法说明:保存资源文件
      *<br>输入参数:resources 要创建的资源文件; inputStream 要输入文件中的流
      *<br>返回类型:boolean
      */
        private static boolean saveResourcesFile(File resources, InputStream inputStream) {
            if (resources == null || inputStream == null) {
                return false;
            }
            BufferedInputStream in = null;
            FileOutputStream fio = null;
            BufferedOutputStream sw = null;
            try {
                in = new BufferedInputStream(inputStream);
                fio = new FileOutputStream(resources);
                sw = new BufferedOutputStream(new DataOutputStream(fio));
                int b;
                byte[] a = new byte[1024];
                boolean isEmpty = true;
                while ((b = in.read(a)) != -1) {
                    isEmpty = false;
                    osw.write(a, 0, b);
                    osw.flush();
                }
                osw.close();
                fio.close();
                in.close();
                inputStream.close();
                if (isEmpty)
                    resources.delete();
                return true;
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
                System.out.println("解析mht失败");
                return false;
            } finally{
                try {
                    if (osw != null)
                        osw.close();
                    if (fio != null)
                        fio.close();
                    if (in != null)
                        in.close();
                    if (inputStream != null)
                        inputStream.close();
                } catch (Exception e) {
                    e.printStackTrace();
                    System.out.println("解析mht失败");
                    return false;
                }  
            }
        }
       
      /**
      *<br>方法说明:得到mht文件的标题
      *<br>输入参数:mhtFilename mht文件名
      *<br>返回类型:mht文件的标题
      */
        public static String getTitle(String mhtFilename) {
            try {
                //TODO readEmlFile
                InputStream fis = new FileInputStream(mhtFilename);
                Session mailSession = Session.getDefaultInstance(System.getProperties(), null);
                MimeMessage msg = new MimeMessage(mailSession, fis);
                Object content = msg.getContent();
                if (content instanceof Multipart) {
                    MimeMultipart mp = (MimeMultipart)content;
                    MimeBodyPart bp1 = (MimeBodyPart)mp.getBodyPart(0);
                    String strEncodng = getEncoding(bp1);
                    String strText = getHtmlText(bp1, strEncodng);
                    if (strText == null)
                        return null;
                    strText = strText.toLowerCase();
                    int pos1 = strText.indexOf("<title>");
                    int pos2 = strText.indexOf("</title>");
                    if (pos1 != -1 && pos2!= -1 && pos2 > pos1) {
                        return strText.substring(pos1 + 7, pos2).trim();
                    }
                }
                return null;
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
                return null;
            }
        }
   
      /**
      *<br>方法说明:得到html文本
      *<br>输入参数:bp MimeBodyPart类型的网页内容; strEncoding 内容编码
      *<br>返回类型:html文本
      */
        private static String getHtmlText(MimeBodyPart bp, String strEncoding) {
            InputStream textStream = null;
            BufferedInputStream buff = null;
            BufferedReader br = null;
            Reader r = null;
            try {
                textStream = bp.getInputStream();
                buff = new BufferedInputStream(textStream);
                r = new InputStreamReader(buff, strEncoding);  
                br = new BufferedReader(r);
                StringBuffer strHtml = new StringBuffer("");
                String strLine = null;
                while ((strLine = br.readLine()) != null) {
                    strHtml.append(strLine + "/r/n");
                }
                br.close();
                r.close();
                textStream.close();
                return strHtml.toString();
            } catch (Exception e) {
                // TODO Auto-generated catch block
               

TAG:

 

评分:0

我来说两句

Open Toolbar