1 # Copyright (c) 2001 Chris Withers
3 # This Software is released under the MIT License:
4 # http://www.opensource.org/licenses/mit-license.html
5 # See license.txt for more details.
7 # $Id: html2text.py,v 1.7 2002/12/17 16:56:17 fresh Exp $
10 from string import lower, replace, split, join
12 class HTML2Text(sgmllib.SGMLParser):
14 from htmlentitydefs import entitydefs # replace entitydefs from sgmllib
# Initialise the underlying SGMLParser and record the caller's options.
#   ignore_tags  -- kept as self.ignore_tags; the tag handlers only act on a
#                   tag when `tag not in self.ignore_tags`
#   indent_width -- kept as self.inde_width (sic); it is read back under that
#                   spelling when the collected lines are joined
#   page_width   -- kept as self.page_width; the line-joining code subtracts
#                   a value `i` from it to obtain the usable line width
# NOTE(review): other methods also use self.indent, self.line, self.lines and
# self.ol_number; their initialisation is not visible in this listing, so
# confirm they are set up here.
16 def __init__(self, ignore_tags=(), indent_width=4, page_width=80):
17 sgmllib.SGMLParser.__init__(self)
21 self.page_width=page_width
22 self.inde_width=indent_width
25 self.ignore_tags = ignore_tags
def add_text(self, text):
    """Append the whitespace-separated words of ``text`` to the current line.

    ``text`` is split on runs of whitespace, so newlines, tabs and repeated
    spaces all act as single word separators, and empty or whitespace-only
    input adds nothing.  The words are appended to ``self.line`` in order;
    nothing is returned.
    """
    # split() with no separator treats '\n' like any other whitespace, so no
    # separate newline-to-space pass is needed before splitting.
    self.line.extend(text.split())
33 self.lines.append((self.indent,self.line))
37 # join lines with indents
38 indent_width = self.inde_width
39 page_width = self.page_width
41 for indent,line in self.lines+[(self.indent,self.line)]:
45 line_width = page_width-i
52 if len_out_line+len_word<line_width:
54 len_out_line = len_out_line + len_word
56 out_para = out_para + indent_string + join(out_line, ' ') + '\n'
60 out_para = out_para + indent_string + join(out_line, ' ')
61 out_paras.append(out_para)
63 self.result = join(out_paras,'\n\n')
# Shift the current indent level by i; a negative i reduces it.
# self.indent is the level that gets stored next to each line when a line is
# appended to self.lines.
# NOTE(review): no lower bound on self.indent is applied in the lines shown
# here; this listing may omit further statements of the method, so confirm
# whether negative levels are prevented.
66 def mod_indent(self,i):
67 self.indent = self.indent + i
71 def handle_data(self, data):
# Start-tag hook of sgmllib.SGMLParser: receives the tag name and its
# attribute list and turns the tag into plain-text layout actions.
# NOTE(review): several branch bodies are not visible in this listing; the
# comments below describe only the statements that are shown.
75 def unknown_starttag(self, tag, attrs):
76 """ Convert HTML to something meaningful in plain text """
# tags listed in self.ignore_tags are not acted on
79 if tag not in self.ignore_tags:
# tag[0]=='h' is true for every tag name starting with 'h' (h1..h6, but also
# names such as 'hr', 'html' or 'head'), so the explicit 'hr' entry in the
# list is redundant
80 if tag[0]=='h' or tag in ['br','pre','p','hr']:
85 # newline, text, newline
# emit the value of `src` as text, prefixed with 'Image: '
# (where `src` is assigned is not shown here)
93 self.add_text('Image: ' + src)
# emit the current value of self.ol_number followed by ' - ', then advance
# the counter by one
99 self.add_text(str(self.ol_number) + ' - ')
100 self.ol_number = self.ol_number + 1
# 'dd' and 'dt' share one branch
105 elif tag in ['dd','dt']:
# the list container tags 'ul', 'dl' and 'ol' share one branch
110 elif tag in ['ul','dl','ol']:
117 def unknown_endtag(self, tag):
118 """ Convert HTML to something meaningful in plain text """
121 if tag not in self.ignore_tags:
122 if tag[0]=='h' or tag in ['pre']:
123 # newline, text, newline
129 elif tag in ['dd','dt']:
134 elif tag in ['ul','dl','ol']: