]> git.openstreetmap.org Git - osqa.git/blob - forum/utils/html2text.py
Merge pull request #5 from lra/french
[osqa.git] / forum / utils / html2text.py
1 # Copyright (c) 2001 Chris Withers
2 #
3 # This Software is released under the MIT License:
4 # http://www.opensource.org/licenses/mit-license.html
5 # See license.txt for more details.
6 #
7 # $Id: html2text.py,v 1.7 2002/12/17 16:56:17 fresh Exp $
8
9 import sgmllib
10 from string import lower, replace, split, join
11
12 class HTML2Text(sgmllib.SGMLParser):
13
14     from htmlentitydefs import entitydefs # replace entitydefs from sgmllib
15
16     def __init__(self, ignore_tags=(), indent_width=4, page_width=80):
17         sgmllib.SGMLParser.__init__(self)
18         self.result = ""
19         self.indent = 0
20         self.ol_number = 0
21         self.page_width=page_width
22         self.inde_width=indent_width
23         self.lines=[]
24         self.line=[]
25         self.ignore_tags = ignore_tags
26
27     def add_text(self,text):
28         # convert text into words
29         words = split(replace(text,'\n',' '))
30         self.line.extend(words)
31
32     def add_break(self):
33         self.lines.append((self.indent,self.line))
34         self.line=[]
35
36     def generate(self):
37         # join lines with indents
38         indent_width = self.inde_width
39         page_width = self.page_width
40         out_paras=[]
41         for indent,line in self.lines+[(self.indent,self.line)]:
42
43             i=indent*indent_width
44             indent_string = i*' '
45             line_width = page_width-i
46
47             out_para=''
48             out_line=[]
49             len_out_line=0
50             for word in line:
51                 len_word = len(word)
52                 if len_out_line+len_word<line_width:
53                     out_line.append(word)
54                     len_out_line = len_out_line + len_word
55                 else:
56                     out_para = out_para + indent_string + join(out_line, ' ') + '\n'
57                     out_line=[word]
58                     len_out_line=len_word
59
60             out_para = out_para + indent_string + join(out_line, ' ')
61             out_paras.append(out_para)
62
63         self.result = join(out_paras,'\n\n')
64
65
66     def mod_indent(self,i):
67         self.indent = self.indent + i
68         if self.indent < 0:
69             self.indent = 0
70
71     def handle_data(self, data):
72         if data:
73             self.add_text(data)
74
75     def unknown_starttag(self, tag, attrs):
76         """ Convert HTML to something meaningful in plain text """
77         tag = lower(tag)
78
79         if tag not in self.ignore_tags:
80             if tag[0]=='h' or tag in ['br','pre','p','hr']:
81                 # insert a blank line
82                 self.add_break()
83
84             elif tag =='img':
85                 # newline, text, newline
86                 src = ''
87
88                 for k, v in attrs:
89                     if lower(k) == 'src':
90                         src = v
91
92                 self.add_break()
93                 self.add_text('Image: ' + src)
94
95             elif tag =='li':
96                 self.add_break()
97                 if self.ol_number:
98                     # num - text
99                     self.add_text(str(self.ol_number) + ' - ')
100                     self.ol_number = self.ol_number + 1
101                 else:
102                     # - text
103                     self.add_text('- ')
104
105             elif tag in ['dd','dt']:
106                 self.add_break()
107                 # increase indent
108                 self.mod_indent(+1)
109
110             elif tag in ['ul','dl','ol']:
111                 # blank line
112                 # increase indent
113                 self.mod_indent(+1)
114                 if tag=='ol':
115                     self.ol_number = 1
116
117     def unknown_endtag(self, tag):
118         """ Convert HTML to something meaningful in plain text """
119         tag = lower(tag)
120
121         if tag not in self.ignore_tags:
122             if tag[0]=='h' or tag in ['pre']:
123                 # newline, text, newline
124                 self.add_break()
125
126             elif tag =='li':
127                 self.add_break()
128
129             elif tag in ['dd','dt']:
130                 self.add_break()
131                 # descrease indent
132                 self.mod_indent(-1)
133
134             elif tag in ['ul','dl','ol']:
135                 # blank line
136                 self.add_break()
137                 # decrease indent
138                 self.mod_indent(-1)
139                 self.ol_number = 0
140