开发者体验：Java抓取百度Top500歌曲及源码

本文的主要工作是用Java抓取百度Top500热门歌曲。每首歌曲需要抓取3个属性：歌名、歌曲在线播放地址和歌词内容（符合LRC歌词格式）；目前已完成歌名和歌曲地址的抓取，歌词抓取尚未实现。由于百度的歌曲地址很多是通过JS动态获取的，这里改用第三方音乐搜索按歌名查询歌曲地址会方便些（原文称使用搜狗音乐搜索，源码中实际请求的是 so.mp3.qihoo.com）。所有的源码如下：

创新互联专注于企业成都全网营销推广、网站重做改版、大连网站定制设计、自适应品牌网站建设、H5响应式网站、商城网站建设、集团公司官网建设、成都外贸网站建设公司、高端网站制作、响应式网页设计等建站业务，价格优惠性价比高，为大连等各大城市提供网站开发制作服务。

 
 
 
  
  
  /** *//**  
  
  
  　　http://www.bt285.cn http://www.5a520.cn  
  
  
  　　*/  
  
  
  　　package com.common.utils;  
  
  
    import java.io.BufferedReader;
    import java.io.ByteArrayOutputStream;
    import java.io.Closeable;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.OutputStreamWriter;
    import java.io.UnsupportedEncodingException;
    import java.net.HttpURLConnection;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.net.URLConnection;
    import java.net.URLDecoder;
    import java.net.URLEncoder;
    import java.util.ArrayList;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;
    import java.util.TreeSet;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    import org.htmlparser.Node;
    import org.htmlparser.NodeFilter;
    import org.htmlparser.Parser;
    import org.htmlparser.filters.NodeClassFilter;
    import org.htmlparser.filters.OrFilter;
    import org.htmlparser.nodes.TextNode;
    import org.htmlparser.tags.LinkTag;
    import org.htmlparser.util.NodeList;
    import org.htmlparser.util.ParserException;

    import com.common.doc.FileOperUtils;
  
  
  　　class Song{  
  
  
  　　private String name;  
  
  
  　　private String url;  
  
  
  　　private String lrc;  
  
  
  　　public Song(String name,String url){  
  
  
  　　this.name = name;  
  
  
  　　this.url = url;  
  
  
  　　this.lrc = "";  
  
  
  　　}  
  
  
  　　public String getName() {  
  
  
  　　return name;  
  
  
  　　}  
  
  
  　　public void setName(String name) {  
  
  
  　　this.name = name;  
  
  
  　　}  
  
  
  　　public String getUrl() {  
  
  
  　　return url;  
  
  
  　　}  
  
  
  　　public void setUrl(String url) {  
  
  
  　　this.url = url;  
  
  
  　　}  
  
  
  　　public String getLrc() {  
  
  
  　　return lrc;  
  
  
  　　}  
  
  
  　　public void setLrc(String lrc) {  
  
  
  　　this.lrc = lrc;  
  
  
  　　}  
  
  
  　　}  
  
  
  　　public class BaiduMP3 {  
  
  
  　　public static String visitURL(String strUrl) {  
  
  
  　　URL url = null;  
  
  
  　　try {  
  
  
  　　url = new URL(strUrl);  
  
  
  　　} catch (MalformedURLException e) {  
  
  
  　　e.printStackTrace();  
  
  
  　　}  
  
  
  　　URLConnection conn = null;  
  
  
  　　try {  
  
  
  　　conn = url.openConnection();  
  
  
  　　conn.setDoOutput(true);  
  
  
  　　} catch (IOException e) {  
  
  
  　　System.out.println("e:"+e.getMessage());  
  
  
  　　}  
  
  
  　　OutputStreamWriter out;  
  
  
  　　try {  
  
  
  　　out = new OutputStreamWriter(conn.getOutputStream(), "GBK");  
  
  
  　　out.flush();  
  
  
  　　out.close();  
  
  
  　　} catch (UnsupportedEncodingException e2) {  
  
  
  　　e2.printStackTrace();  
  
  
  　　} catch (IOException e2) {  
  
  
  　　e2.printStackTrace();  
  
  
  　　}  
  
  
  　　// 接收返回信息  
  
  
  　　BufferedReader rd = null;  
  
  
  　　try {  
  
  
  　　rd = new BufferedReader(  
  
  
  　　new InputStreamReader(conn.getInputStream()));  
  
  
  　　return rd.readLine();  
  
  
  　　} catch (IOException e1) {  
  
  
  　　e1.printStackTrace();  
  
  
  　　}  
  
  
  　　return "";  
  
  
  　　}  
  
  
  　　/** *//**  
  
  
  　　* 功能说明：访问指定的URL并检查返回结果。  
  
  
  　　* @param strUrl  
  
  
  　　* @param successFlag 请求成功的标识，比如包含“_SUCCESS”字。  
  
  
  　　* @return  
  
  
  　　*/  
  
  
  　　public static String visitURL(String strUrl, String successFlag) {  
  
  
  　　boolean rs = false;  
  
  
  　　HttpURLConnection jconn = null;  
  
  
  　　ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();  
  
  
  　　try {  
  
  
  　　URL url = new URL(strUrl);  
  
  
  　　jconn = (HttpURLConnection) url.openConnection();  
  
  
  　　jconn.setDoOutput(true);  
  
  
  　　jconn.setDoInput(true);  
  
  
  　　jconn.connect();  
  
  
  　　InputStream in = jconn.getInputStream();  
  
  
  　　byte[] buf = new byte[4096];  
  
  
  　　int bytesRead;  
  
  
  　　while ((bytesRead = in.read(buf)) != -1) {  
  
  
  　　byteArrayOutputStream.write(buf, 0, bytesRead);  
  
  
  　　}  
  
  
  　　String strRead = new String(byteArrayOutputStream.toByteArray(),"GBK");  
  
  
  　　return strRead;  
  
  
  　　} catch (MalformedURLException e) {  
  
  
  　　e.printStackTrace();  
  
  
  　　} catch (IOException e) {  
  
  
  　　e.printStackTrace();  
  
  
  　　} finally {  
  
  
  　　jconn.disconnect();  
  
  
  　　try {  
  
  
  　　byteArrayOutputStream.close();  
  
  
  　　} catch (IOException e) {  
  
  
  　　e.printStackTrace();  
  
  
  　　}  
  
  
  　　}  
  
  
  　　return "";  
  
  
  　　}  
  
  
  　　private static boolean isTrimEmptyOrBlank(String astr) {  
  
  
  　　if ((null == astr) || (astr.length() == 0) || " ".equals(astr)) {  
  
  
  　　return true;  
  
  
  　　}  
  
  
  　　astrastr = astr.trim();  
  
  
  　　if ((null == astr) || (astr.length() == 0)) {  
  
  
  　　return true;  
  
  
  　　}  
  
  
  　　return false;  
  
  
  　　}  
  
  
  　　private static String getFilteredContent(String htmlContent, String reg,int i) {  
  
  
  　　String content = "";  
  
  
  　　int k=1;  
  
  
  　　Pattern pp = Pattern.compile(reg, Pattern.DOTALL);  
  
  
  　　Matcher m = pp.matcher(htmlContent);  
  
  
  　　while (m.find()) {  
  
  
  　　content = m.group();  
  
  
  　　if(k++==i)  
  
  
  　　break;  
  
  
  　　}  
  
  
  　　return content;  
  
  
  　　}  
  
  
  　　public static List getBaiduSongs(){  
  
  
  　　List ss = new ArrayList();  
  
  
  　　String htmlContent = visitURL("http://list.mp3.baidu.com/topso/mp3topsong.html?id=1?top2","s");  
  
  
  　　String encode = "GBK";  
  
  
  　　//　　　　　 System.out.println("===========================================================================");  
  
  
  　　//　　　　　 System.out.println(htmlContent);  
  
  
  　　//　　　　　 System.out.println("===========================================================================");  
  
  
  　　String reg = "(.*?)";  
  
  
  　　htmlContent = getFilteredContent(htmlContent,reg,0);  
  
  
  　　//FileOperUtils.writeFile("c:\\1.html", htmlContent, false);  
  
  
  　　String line = "",lineurl="";  
  
  
  　　Node anode = null;  
  
  
  　　TextNode textnode = null;  
  
  
  　　try {  
  
  
  　　Parser parser = Parser.createParser(htmlContent, encode);  
  
  
  　　NodeClassFilter textFilter = new NodeClassFilter(LinkTag.class);  
  
  
  　　OrFilter lastFilter = new OrFilter();  
  
  
  　　lastFilter.setPredicates(new NodeFilter[] { textFilter });  
  
  
  　　NodeList nodeList = parser.parse(lastFilter);  
  
  
  　　Node[] nodes = nodeList.toNodeArray();  
  
  
  　　for (int i = 0; i < nodes.length; i++) {  
  
  
  　　anode = (Node) nodes[i];  
  
  
  　　if(anode instanceof LinkTag){  
  
  
  　　LinkTag txt = (LinkTag)anode;  
  
  
  　　line = txt.getLinkText();  
  
  
  　　if(txt.getPreviousSibling()!=null){  
  
  
  　　if(txt.getPreviousSibling().toString().indexOf("(")>=0)  
  
  
  　　continue;  
  
  
  　　}  
  
  
  　　line = txt.getLinkText();  
  
  
  　　lineurl = txt.getAttribute("href");  
  
  
  　　//System.out.println(txt.getLink());  
  
  
  　　}  
  
  
  　　if (isTrimEmptyOrBlank(line)||isTrimEmptyOrBlank(lineurl))  
  
  
  　　continue;  
  
  
  　　ss.add(new Song(line,getSongURL(line)));  
  
  
  　　}  
  
  
  　　} catch (ParserException pe) {  
  
  
  　　pe.printStackTrace();  
  
  
  　　}  
  
  
  　　return ss;  
  
  
  　　}  
  
  
  　　private static String getSongURL(String songname){  
  
  
  　　try {  
  
  
  　　String ss = URLEncoder.encode(songname,"GBK");  
  
  
  　　String htmlContent = visitURL("http://so.mp3.qihoo.com/?type=0&ssrc=s&kw="+ss,"s");  
  
  
  　　String encode = "GBK";  
  
  
  　　http://www.feng123.com  
  
  
  　　String reg = "(.*?)";　 http://www.5a520.cn  
  
  
  　　htmlContent = getFilteredContent(htmlContent,reg,1);  
  
  
  　　String line = "",lineurl="";  
  
  
  　　Node anode = null;  
  
  
  　　TextNode textnode = null;  
  
  
  　　Parser parser = Parser.createParser(htmlContent, encode);  
  
  
  　　NodeClassFilter textFilter = new NodeClassFilter(LinkTag.class);  
  
  
  　　OrFilter lastFilter = new OrFilter();  
  
  
  　　lastFilter.setPredicates(new NodeFilter[] { textFilter });  
  
  
  　　NodeList nodeList = parser.parse(lastFilter);  
  
  
  　　Node[] nodes = nodeList.toNodeArray();  
  
  
  　　for (int i = 0; i < nodes.length; i++) {  
  
  
  　　anode = (Node) nodes[i];  
  
  
  　　if(anode instanceof LinkTag){  
  
  
  　　LinkTag txt = (LinkTag)anode;  
  
  
  　　line = txt.getLinkText();  
  
  
  　　lineurl = txt.getAttribute("href");  
  
  
  　　if(!isTrimEmptyOrBlank(lineurl) && lineurl.startsWith("down.html")){  
  
  
  　　String s = getFilteredContent(lineurl,"u=(.*?)\\&",0);  
  
  
  　　if(!s.equals("")&&s.length()>5){  
  
  
  　　s = Utils.replace(s, "u=", "");  
  
  
  　　s = Utils.replace(s, "&", "");  
  
  
  　　s = URLDecoder.decode(s,"GBK");  
  
  
  　　return s;  
  
  
  　　}  
  
  
  　　}  
  
  
  　　}  
  
  
  　　}  
  
  
  　　} catch (Exception pe) {  
  
  
  　　pe.printStackTrace();  
  
  
  　　}  
  
  
  　　return "";  
  
  
  　　}  
  
  
  　　public static void main(String[] args) throws Exception{  
  
  
  　　List ss = getBaiduSongs();  
  
  
  　　int idx = 0;  
  
  
  　　for(Song s:ss){  
  
  
  　　System.out.println((++idx)+":"+s.getName()+"->"+s.getUrl());  
  
  
  　　}  
  
  
  　　//　　　　　 String ss = getSongURL("国家");  
  
  
  　　//　　　　　 System.out.println(ss);  
  
  
  　　//　　　　　 String s = URLDecoder.decode("http%3A%2F%2F http://www.5a520.cn %2F%B9%FA%BC%D2.mp3","GBK");  
  
  
  　　//　　　　　 System.out.println(s);  
  
  
  　　}  
  
  
  　　}

至此Java抓取百度Top500歌曲及源码的工作完成。

网页标题：开发者体验：Java抓取百度Top500歌曲及源码
转载来于：http://www.36103.cn/qtweb/news35/7585.html

网站建设、网络推广公司-创新互联，是专注品牌与效果的网站制作，网络营销seo公司；服务项目有等

声明：本网站发布的内容（图片、视频和文字）以用户投稿、用户转载内容为主，如果涉及侵权请尽快告知，我们将会在第一时间删除。文章观点不代表本网站立场，如需处理请联系客服。电话：028-86922220；邮箱：631063699@qq.com。内容未经允许不得转载，或转载时需注明来源：创新互联

猜你还喜欢下面的内容