欢迎来到天天文库
浏览记录
ID:35478070
大小:60.47 KB
页数:8页
时间:2019-03-25
《java网页页面抓取标题和正文心得》由会员上传分享,免费在线阅读,更多相关内容在工程资料-天天文库。
1、java网页页面抓取标题和正文心得 import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.regex.Matcher; import java.util.regex
2、.Pattern; public class WebContent { /** * 读取一个网页全部内容 */ public String getOneHtml(final String htmlurl) throws IOException { URL url; String temp; final StringBuffer sb = new StringBuffer(); try { url = new URL(htmlurl); final BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream(), "utf-8")); // 读取网页
3、全部内容 while ((temp = in.readLine()) != null) { sb.append(temp); } in.close(); } catch (final MalformedURLException me) { System.out.println("你输入的URL格式有问题!请仔细输入"); me.getMessage(); throw me; } catch (final IOException e) { e.printStackTrace(); throw e; } return sb.toString(); } /** * * @param s * @return 获得网页标题 */ public Stri
4、nggetTitle(finalStrings){Stringregex;Stringtitle=finalList();regex=".*?";finalPatternpa=Pattern.compilejregex,Pattern.CANONEQ);finalMatcherma=pa.matcher(s);while(ma.find()){list.add(ma.group());}for(inti=0;i5、e+list.get(i);}returnoutTag(title);}/****@params*@return获得链接*/publicListgetLink(finalStrings){Stringregex;finalUstlist=newArrayList();regex=H]*href=("(rV]*)6、'([叫]*)'l(L\s>]*))F>]*>(.*?)v/a>“;finalPatternpa=Pattern.compilefregex,Pattern.DOIALL);f7、inalMatcherma=pa.matcher(s);while(ma.find())Iist.add(ma.group());}returnlist;}/****@params*@return获得脚本代码*/publicListgetScript(finalStrings){Stringregex;finalUstlist=newArrayList();regex=H";finalPatternpa=Pattern.compile(regex,Pattern.DCHA8、LL);finalMatcherma=pa.matcher(s);while(ma.find()){list.add(ma.group());}returnlist;}/****@params*@return获得CSS*/publicListgetCSS(finalStrings){Stringregex;finalListlist=newArrayList();regex=,,H;finalPatternpa=Pattern.compile(regex,Pattern.D9、CHALL);finalMatcherma=pa.matcher(s);whil
5、e+list.get(i);}returnoutTag(title);}/****@params*@return获得链接*/publicListgetLink(finalStrings){Stringregex;finalUstlist=newArrayList();regex=H]*href=("(rV]*)
6、'([叫]*)'l(L\s>]*))F>]*>(.*?)v/a>“;finalPatternpa=Pattern.compilefregex,Pattern.DOIALL);f
7、inalMatcherma=pa.matcher(s);while(ma.find())Iist.add(ma.group());}returnlist;}/****@params*@return获得脚本代码*/publicListgetScript(finalStrings){Stringregex;finalUstlist=newArrayList();regex=H";finalPatternpa=Pattern.compile(regex,Pattern.DCHA
8、LL);finalMatcherma=pa.matcher(s);while(ma.find()){list.add(ma.group());}returnlist;}/****@params*@return获得CSS*/publicListgetCSS(finalStrings){Stringregex;finalListlist=newArrayList();regex=,,H;finalPatternpa=Pattern.compile(regex,Pattern.D
9、CHALL);finalMatcherma=pa.matcher(s);whil
此文档下载收益归作者所有