-# Copyright (c) 2001 Chris Withers\r
-#\r
-# This Software is released under the MIT License:\r
-# http://www.opensource.org/licenses/mit-license.html\r
-# See license.txt for more details.\r
-#\r
-# $Id: html2text.py,v 1.7 2002/12/17 16:56:17 fresh Exp $\r
-\r
-import sgmllib\r
-from string import lower, replace, split, join\r
-\r
-class HTML2Text(sgmllib.SGMLParser):\r
-\r
- from htmlentitydefs import entitydefs # replace entitydefs from sgmllib\r
-\r
- def __init__(self, ignore_tags=(), indent_width=4, page_width=80):\r
- sgmllib.SGMLParser.__init__(self)\r
- self.result = ""\r
- self.indent = 0\r
- self.ol_number = 0\r
- self.page_width=page_width\r
- self.inde_width=indent_width\r
- self.lines=[]\r
- self.line=[]\r
- self.ignore_tags = ignore_tags\r
-\r
- def add_text(self,text):\r
- # convert text into words\r
- words = split(replace(text,'\n',' '))\r
- self.line.extend(words)\r
-\r
- def add_break(self):\r
- self.lines.append((self.indent,self.line))\r
- self.line=[]\r
-\r
- def generate(self):\r
- # join lines with indents\r
- indent_width = self.inde_width\r
- page_width = self.page_width\r
- out_paras=[]\r
- for indent,line in self.lines+[(self.indent,self.line)]:\r
-\r
- i=indent*indent_width\r
- indent_string = i*' '\r
- line_width = page_width-i\r
-\r
- out_para=''\r
- out_line=[]\r
- len_out_line=0\r
- for word in line:\r
- len_word = len(word)\r
- if len_out_line+len_word<line_width:\r
- out_line.append(word)\r
- len_out_line = len_out_line + len_word\r
- else:\r
- out_para = out_para + indent_string + join(out_line, ' ') + '\n'\r
- out_line=[word]\r
- len_out_line=len_word\r
-\r
- out_para = out_para + indent_string + join(out_line, ' ')\r
- out_paras.append(out_para)\r
-\r
- self.result = join(out_paras,'\n\n')\r
-\r
-\r
- def mod_indent(self,i):\r
- self.indent = self.indent + i\r
- if self.indent < 0:\r
- self.indent = 0\r
-\r
- def handle_data(self, data):\r
- if data:\r
- self.add_text(data)\r
-\r
- def unknown_starttag(self, tag, attrs):\r
- """ Convert HTML to something meaningful in plain text """\r
- tag = lower(tag)\r
-\r
- if tag not in self.ignore_tags:\r
- if tag[0]=='h' or tag in ['br','pre','p','hr']:\r
- # insert a blank line\r
- self.add_break()\r
-\r
- elif tag =='img':\r
- # newline, text, newline\r
- src = ''\r
-\r
- for k, v in attrs:\r
- if lower(k) == 'src':\r
- src = v\r
-\r
- self.add_break()\r
- self.add_text('Image: ' + src)\r
-\r
- elif tag =='li':\r
- self.add_break()\r
- if self.ol_number:\r
- # num - text\r
- self.add_text(str(self.ol_number) + ' - ')\r
- self.ol_number = self.ol_number + 1\r
- else:\r
- # - text\r
- self.add_text('- ')\r
-\r
- elif tag in ['dd','dt']:\r
- self.add_break()\r
- # increase indent\r
- self.mod_indent(+1)\r
-\r
- elif tag in ['ul','dl','ol']:\r
- # blank line\r
- # increase indent\r
- self.mod_indent(+1)\r
- if tag=='ol':\r
- self.ol_number = 1\r
-\r
- def unknown_endtag(self, tag):\r
- """ Convert HTML to something meaningful in plain text """\r
- tag = lower(tag)\r
-\r
- if tag not in self.ignore_tags:\r
- if tag[0]=='h' or tag in ['pre']:\r
- # newline, text, newline\r
- self.add_break()\r
-\r
- elif tag =='li':\r
- self.add_break()\r
-\r
- elif tag in ['dd','dt']:\r
- self.add_break()\r
- # descrease indent\r
- self.mod_indent(-1)\r
-\r
- elif tag in ['ul','dl','ol']:\r
- # blank line\r
- self.add_break()\r
- # decrease indent\r
- self.mod_indent(-1)\r
- self.ol_number = 0\r
-\r
+# Copyright (c) 2001 Chris Withers
+#
+# This Software is released under the MIT License:
+# http://www.opensource.org/licenses/mit-license.html
+# See license.txt for more details.
+#
+# $Id: html2text.py,v 1.7 2002/12/17 16:56:17 fresh Exp $
+
+import sgmllib
+from string import lower, replace, split, join
+
class HTML2Text(sgmllib.SGMLParser):
    """Turn parsed HTML into indented, word-wrapped plain text.

    The parser callbacks collect words into paragraphs; generate() then
    wraps every paragraph and stores the final text in ``self.result``.

    Attributes:
        result: text produced by the most recent generate() call.
        indent: current indentation level (never negative).
        ol_number: number for the next <li> of an ordered list, or 0 when
            list items should get a plain '- ' bullet.
        page_width: column limit used when wrapping.
        inde_width: spaces per indentation level (historic spelling kept so
            existing users of the attribute keep working).
        lines: finished paragraphs as (indent, list_of_words) tuples.
        line: words of the paragraph currently being collected.
        ignore_tags: tag names (lower case) that produce no formatting.
    """

    from htmlentitydefs import entitydefs  # replace entitydefs from sgmllib

    def __init__(self, ignore_tags=(), indent_width=4, page_width=80):
        """Set up an empty document.

        ignore_tags  -- container of lower-case tag names to skip
        indent_width -- number of spaces per indentation level
        page_width   -- column limit for wrapped output
        """
        sgmllib.SGMLParser.__init__(self)
        self.result = ""
        self.indent = 0
        self.ol_number = 0
        # ol_number values of the enclosing lists, so that closing a nested
        # list restores the numbering of the list around it.
        self._list_stack = []
        self.page_width = page_width
        self.inde_width = indent_width
        self.lines = []
        self.line = []
        self.ignore_tags = ignore_tags

    def add_text(self, text):
        """Split text on any whitespace and append the words to the current paragraph."""
        # str.split() without arguments already treats newlines as
        # separators, so no explicit newline replacement is needed.
        self.line.extend(text.split())

    def add_break(self):
        """Close the current paragraph at the current indent and start a new one."""
        self.lines.append((self.indent, self.line))
        self.line = []

    def _wrap(self, words, prefix, width):
        """Return words joined by single spaces and wrapped into rows.

        Every row starts with prefix.  A row is closed as soon as adding
        the next word (plus its joining space) would make the row's text
        reach width characters.  A word that cannot fit on its own is still
        placed alone on a row instead of being split.
        """
        rows = []
        current = []
        used = 0  # length of ' '.join(current)
        for word in words:
            needed = used + len(word)
            if current:
                # account for the space that will join this word to the row
                needed = needed + 1
            if current and needed >= width:
                rows.append(prefix + ' '.join(current))
                current = [word]
                used = len(word)
            else:
                # Either the word fits, or the row is empty and the word
                # has to go somewhere; never emit an empty row for it.
                current.append(word)
                used = needed
        rows.append(prefix + ' '.join(current))
        return '\n'.join(rows)

    def generate(self):
        """Wrap all collected paragraphs and store the text in self.result.

        Finished paragraphs and the paragraph still being collected are
        each indented by their recorded level and wrapped against
        page_width; paragraphs are separated by one blank line.  Empty
        paragraphs are kept, so consecutive breaks yield extra blank lines.
        """
        indent_width = self.inde_width
        page_width = self.page_width
        paragraphs = []
        for indent, words in self.lines + [(self.indent, self.line)]:
            margin = indent * indent_width
            paragraphs.append(
                self._wrap(words, margin * ' ', page_width - margin))
        self.result = '\n\n'.join(paragraphs)

    def mod_indent(self, i):
        """Change the indentation level by i, clamping it at zero."""
        self.indent = self.indent + i
        if self.indent < 0:
            self.indent = 0

    def handle_data(self, data):
        """Collect character data as words of the current paragraph."""
        if data:
            self.add_text(data)

    def unknown_starttag(self, tag, attrs):
        """Translate an opening tag into paragraph breaks, bullets or indentation."""
        tag = tag.lower()
        if tag in self.ignore_tags:
            return

        # Any tag whose name starts with 'h' counts here: h1-h6 and hr,
        # but also html and head.
        if tag.startswith('h') or tag in ('br', 'pre', 'p', 'hr'):
            self.add_break()

        elif tag == 'img':
            # New paragraph holding the image source; the last src
            # attribute wins if several are present.
            src = ''
            for key, value in attrs:
                if key.lower() == 'src':
                    src = value
            self.add_break()
            self.add_text('Image: ' + src)

        elif tag == 'li':
            self.add_break()
            if self.ol_number:
                # numbered item: "<n> - text"
                self.add_text(str(self.ol_number) + ' - ')
                self.ol_number = self.ol_number + 1
            else:
                # bulleted item: "- text"
                self.add_text('- ')

        elif tag in ('dd', 'dt'):
            self.add_break()
            self.mod_indent(+1)

        elif tag in ('ul', 'dl', 'ol'):
            # Remember the numbering of the enclosing list, then start
            # this list's own: counting from 1 for <ol>, bullets otherwise.
            self._list_stack.append(self.ol_number)
            self.mod_indent(+1)
            if tag == 'ol':
                self.ol_number = 1
            else:
                self.ol_number = 0

    def unknown_endtag(self, tag):
        """Translate a closing tag into paragraph breaks and indentation changes."""
        tag = tag.lower()
        if tag in self.ignore_tags:
            return

        if tag.startswith('h') or tag == 'pre':
            self.add_break()

        elif tag == 'li':
            self.add_break()

        elif tag in ('dd', 'dt'):
            self.add_break()
            # decrease indent
            self.mod_indent(-1)

        elif tag in ('ul', 'dl', 'ol'):
            self.add_break()
            # decrease indent
            self.mod_indent(-1)
            # Restore the enclosing list's numbering; an unmatched closing
            # tag simply switches numbering off.
            if self._list_stack:
                self.ol_number = self._list_stack.pop()
            else:
                self.ol_number = 0
+