用JAVA基本类库去解析HTML

这几天参加公司的定级考试，有个上机题是

访问 URL：http://www.weather.com.cn/weather/101010100.shtml 页面，提取出页面中的天气信息，然后把信息按照要求输出到控制台。开始想到的是先把html文件存到本地，然后再逐行用正则表达式去解析，后来想想这种方法太土，而且解析起来会很复杂，所以就想用SAX去解析，试了一下，程序执行起来太慢，半天出不了结果，而且网络中的html标签不规则，标签不一定都有结尾，所以解析时会报错。后来在网上搜了一下，原来javax.swing.text.html包中已经提供了解析html标签的类库，在网上参考了别人的博客，地址如下：

http://blog.csdn.net/thamsyangsw/article/details/4389900

程序如下：

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

package com.thunisoft.kms.java.lvl2.exam;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Downloads a weather forecast page, parses it with the HTML parser from
 * {@code javax.swing.text.html} and prints the extracted forecast text to the
 * console.
 *
 * <p>The class is itself the parser callback: while the parser walks the
 * document, the {@code handle*} methods track whether the current position is
 * inside the forecast container and collect the text of interest into
 * {@link #element}. Once parsing is finished the collected items are printed.
 *
 * <p>The collected items live in static fields, so the class is not safe for
 * concurrent use and repeated parses append to the same list.
 *
 * @author keep at it
 * @version 1.0
 * @date 2013-12-4
 */
public class GrapWeatherInfo extends ParserCallback
{
    /** True while inside a table whose class attribute is "yuBaoTable". */
    protected boolean isTable = false;
    /** True while inside a qualifying anchor element (see handleStartTag). */
    protected boolean isAlink = false;
    /** True while inside the div whose id attribute is "7d". */
    protected boolean isDiv = false;
    /** True while inside a td element that is nested in the "7d" div. */
    protected boolean isTd = false;
    /** Collected text items, plus a ";" marker added for every table end tag. */
    protected static Vector<String> element = new Vector<String>();
    /** Never reassigned in this class; only consulted as a null guard. */
    protected static String paragraphText = "";
    /** Default URL of the page to download. */
    private static final String FILE_URL =
        "http://www.weather.com.cn/weather/101010100.shtml";
    /** Default location of the downloaded copy on the local disk. */
    private static final String FILE_LOCATION = "E:/url.html";
    /** Matches any run of whitespace; compiled once instead of per callback. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
    /** Text printed after every item on a forecast line. */
    private static final String SEPARATOR = "   ";

    /** Creates a callback with all state flags cleared. */
    public GrapWeatherInfo()
    {
    }

    /**
     * Parses the given HTML stream with a fresh callback instance and prints
     * the collected forecast items. Failures are reported on standard error
     * and not propagated.
     *
     * @param r reader positioned at the start of the HTML document
     */
    private static void startParse(Reader r)
    {
        try
        {
            // ParserDelegator starts a new DocumentParser on every parse call.
            ParserDelegator ps = new ParserDelegator();
            // The parse results drive the callback methods of this class.
            HTMLEditorKit.ParserCallback parser = new GrapWeatherInfo();
            ps.parse(r, parser, true);
            printForecast(element);
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Prints the collected items: a ";" marker ends the current line, ">" is
     * skipped, and everything else is printed followed by the separator.
     *
     * @param items collected text items and ";" markers
     */
    private static void printForecast(Vector<String> items)
    {
        // Most recent item that contained a weekday name.
        String dayLabel = "";
        // Iteration starts at index 1, exactly as the original program did, so
        // the first collected item is never printed.
        // NOTE(review): presumably that item is a heading - confirm.
        for (int i = 1; i < items.size(); i++)
        {
            String item = items.get(i);
            if (item.contains("星期"))
            {
                dayLabel = item;
            }
            if (item.equals(";"))
            {
                System.out.println();
            }
            else if (!item.equals(">"))
            {
                // A night entry that does not directly follow a weekday item
                // starts a new line that repeats the remembered weekday.
                if (item.endsWith("夜间")
                    && !items.get(i - 1).contains("星期"))
                {
                    System.out.println();
                    System.out.print(dayLabel + SEPARATOR);
                }
                System.out.print(item + SEPARATOR);
            }
        }
    }

    /**
     * Collects text that appears inside a qualifying anchor or td element,
     * with all whitespace removed.
     *
     * @param data the text content reported by the parser
     * @param pos position in the document
     */
    @Override
    public void handleText(char[] data, int pos)
    {
        if (!isAlink && !isTd)
        {
            return;
        }
        if (paragraphText != null)
        {
            element.addElement(
                WHITESPACE.matcher(new String(data)).replaceAll(""));
        }
    }

    /**
     * Updates the state flags when a start tag is seen.
     *
     * @param t the tag that was opened
     * @param a the attributes of the tag
     * @param pos position in the document
     */
    @Override
    public void handleStartTag(HTML.Tag t,
                               MutableAttributeSet a, int pos)
    {
        // The div with id "7d" is the container to extract from.
        if (t == HTML.Tag.DIV
            && "7d".equals(a.getAttribute(HTML.Attribute.ID)))
        {
            isDiv = true;
        }
        // The class "yuBaoTable" distinguishes the wanted table from others.
        if (t == HTML.Tag.TABLE
            && "yuBaoTable".equals(a.getAttribute(HTML.Attribute.CLASS)))
        {
            isTable = true;
        }
        // Anchors count only inside the "7d" div, without an id attribute and
        // with an href that ends in ".php".
        if (t == HTML.Tag.A && isDiv
            && a.getAttribute(HTML.Attribute.ID) == null)
        {
            Object href = a.getAttribute(HTML.Attribute.HREF);
            if (href != null && href.toString().endsWith(".php"))
            {
                isAlink = true;
            }
        }
        if (t == HTML.Tag.TD && isDiv)
        {
            isTd = true;
        }
    }

    /**
     * Called when the parser reports a problem; errors are ignored.
     *
     * @param errorMsg description of the problem
     * @param pos position in the document
     */
    @Override
    public void handleError(String errorMsg, int pos)
    {
    }

    /**
     * Treats a simple tag exactly like a start tag.
     *
     * @param t the tag
     * @param a the attributes of the tag
     * @param pos position in the document
     */
    @Override
    public void handleSimpleTag(HTML.Tag t,
                                MutableAttributeSet a, int pos)
    {
        handleStartTag(t, a, pos);
    }

    /**
     * Returns the static paragraph text field.
     *
     * @return the current value of {@link #paragraphText}
     */
    public static String getParagraphText()
    {
        return paragraphText;
    }

    /**
     * Called for HTML comments; comments are ignored.
     *
     * @param data the comment text
     * @param pos position in the document
     */
    @Override
    public void handleComment(char[] data, int pos)
    {
    }

    /**
     * Updates the state flags when an end tag is seen. The branches are
     * mutually exclusive and checked in order: anchor, table, div, td.
     *
     * @param t the tag that was closed
     * @param pos position in the document
     */
    @Override
    public void handleEndTag(HTML.Tag t, int pos)
    {
        if (t == HTML.Tag.A)
        {
            isAlink = false;
        }
        else if (t == HTML.Tag.TABLE && !isAlink)
        {
            isTable = false;
            // A ";" marker separates the text of one table from the next so
            // the output can break lines there. It is added for every table
            // end tag, not only for the forecast table.
            element.addElement(";");
        }
        else if (t == HTML.Tag.DIV && !isTable)
        {
            isDiv = false;
        }
        else if (t == HTML.Tag.TD)
        {
            isTd = false;
        }
    }

    /**
     * Program entry point: downloads the page to a local file, then parses
     * that file as UTF-8 and prints the forecast.
     *
     * @param args optional overrides: {@code args[0]} is the page URL and
     *             {@code args[1]} the local file path; the built-in defaults
     *             are used for any argument that is absent
     */
    public static void main(String args[])
    {
        String pageUrl =
            (args != null && args.length > 0) ? args[0] : FILE_URL;
        String localFile =
            (args != null && args.length > 1) ? args[1] : FILE_LOCATION;
        try
        {
            download(pageUrl, localFile);
            parseFile(localFile);
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Copies the content behind the given URL into a local file.
     *
     * @param pageUrl URL to read from
     * @param localFile path of the file to write
     * @throws IOException if connecting, reading or writing fails
     */
    private static void download(String pageUrl, String localFile)
        throws IOException
    {
        InputStream input = null;
        FileOutputStream fos = null;
        try
        {
            URLConnection conn = new URL(pageUrl).openConnection();
            conn.connect();
            input = conn.getInputStream();
            fos = new FileOutputStream(localFile);
            byte[] buffer = new byte[1024];
            int read;
            while ((read = input.read(buffer)) != -1)
            {
                fos.write(buffer, 0, read);
            }
            // Close explicitly so a failure to finish the file is reported
            // before the file is read back.
            fos.close();
        }
        finally
        {
            closeQuietly(fos);
            closeQuietly(input);
        }
    }

    /**
     * Opens the local file as UTF-8 text and runs the HTML parse over it.
     *
     * @param localFile path of the HTML file to parse
     * @throws IOException if the file cannot be opened
     */
    private static void parseFile(String localFile) throws IOException
    {
        BufferedReader brd = null;
        try
        {
            brd = new BufferedReader(
                new InputStreamReader(
                    new FileInputStream(localFile), "UTF-8"));
            startParse(brd);
        }
        finally
        {
            closeQuietly(brd);
        }
    }

    /**
     * Closes a resource, ignoring {@code null} and any close failure.
     *
     * @param resource the resource to close, may be {@code null}
     */
    private static void closeQuietly(Closeable resource)
    {
        if (resource == null)
        {
            return;
        }
        try
        {
            resource.close();
        }
        catch (IOException ignored)
        {
            // Best-effort cleanup: nothing useful can be done at this point.
        }
    }
}

时不我待

The tenser the life is, the more vitality is shown.

用JAVA基本类库去解析HTML

发表评论取消回复

发表评论 取消回复

发表评论取消回复