1 # Copyright (c) 2001 Chris Withers
\r
3 # This Software is released under the MIT License:
\r
4 # http://www.opensource.org/licenses/mit-license.html
\r
5 # See license.txt for more details.
\r
7 # $Id: html2text.py,v 1.7 2002/12/17 16:56:17 fresh Exp $
\r
10 from string import lower, replace, split, join
\r
12 class HTML2Text(sgmllib.SGMLParser):
\r
14 from htmlentitydefs import entitydefs # replace entitydefs from sgmllib
\r
16 def __init__(self, ignore_tags=(), indent_width=4, page_width=80):
\r
17 sgmllib.SGMLParser.__init__(self)
\r
21 self.page_width=page_width
\r
22 self.inde_width=indent_width
\r
25 self.ignore_tags = ignore_tags
\r
def add_text(self,text):
    """ Split text into words and append them to self.line.

    Any run of whitespace, newlines included, separates two words, so
    the whitespace layout of the incoming text is discarded.
    """
    # str.split() with no separator splits on every kind of whitespace,
    # so newlines need no separate replacement step.  Using the string
    # method also avoids the string-module functions split/replace,
    # which no longer exist in Python 3.
    self.line.extend(text.split())
\r
32 def add_break(self):
\r
33 self.lines.append((self.indent,self.line))
\r
37 # join lines with indents
\r
38 indent_width = self.inde_width
\r
39 page_width = self.page_width
\r
41 for indent,line in self.lines+[(self.indent,self.line)]:
\r
43 i=indent*indent_width
\r
44 indent_string = i*' '
\r
45 line_width = page_width-i
\r
51 len_word = len(word)
\r
52 if len_out_line+len_word<line_width:
\r
53 out_line.append(word)
\r
54 len_out_line = len_out_line + len_word
\r
56 out_para = out_para + indent_string + join(out_line, ' ') + '\n'
\r
58 len_out_line=len_word
\r
60 out_para = out_para + indent_string + join(out_line, ' ')
\r
61 out_paras.append(out_para)
\r
63 self.result = join(out_paras,'\n\n')
\r
66 def mod_indent(self,i):
\r
67 self.indent = self.indent + i
\r
71 def handle_data(self, data):
\r
75 def unknown_starttag(self, tag, attrs):
\r
76 """ Convert HTML to something meaningful in plain text """
\r
79 if tag not in self.ignore_tags:
\r
80 if tag[0]=='h' or tag in ['br','pre','p','hr']:
\r
81 # insert a blank line
\r
85 # newline, text, newline
\r
89 if lower(k) == 'src':
\r
93 self.add_text('Image: ' + src)
\r
99 self.add_text(str(self.ol_number) + ' - ')
\r
100 self.ol_number = self.ol_number + 1
\r
103 self.add_text('- ')
\r
105 elif tag in ['dd','dt']:
\r
108 self.mod_indent(+1)
\r
110 elif tag in ['ul','dl','ol']:
\r
113 self.mod_indent(+1)
\r
117 def unknown_endtag(self, tag):
\r
118 """ Convert HTML to something meaningful in plain text """
\r
121 if tag not in self.ignore_tags:
\r
122 if tag[0]=='h' or tag in ['pre']:
\r
123 # newline, text, newline
\r
129 elif tag in ['dd','dt']:
\r
132 self.mod_indent(-1)
\r
134 elif tag in ['ul','dl','ol']:
\r
138 self.mod_indent(-1)
\r