git.openstreetmap.org Git - osqa.git/blobdiff - forum/utils/html2text.py
Merge pull request #46 from udacity/subfolder_fixes
[osqa.git] / forum / utils / html2text.py
index 3b517712e6191369d76ff9645996d75a554971e2..c666610e4063b74b1aade98ff9ec02546b96b2ee 100644 (file)
-# Copyright (c) 2001 Chris Withers\r
-#\r
-# This Software is released under the MIT License:\r
-# http://www.opensource.org/licenses/mit-license.html\r
-# See license.txt for more details.\r
-#\r
-# $Id: html2text.py,v 1.7 2002/12/17 16:56:17 fresh Exp $\r
-\r
-import sgmllib\r
-from string import lower, replace, split, join\r
-\r
-class HTML2Text(sgmllib.SGMLParser):\r
-\r
-    from htmlentitydefs import entitydefs # replace entitydefs from sgmllib\r
-\r
-    def __init__(self, ignore_tags=(), indent_width=4, page_width=80):\r
-        sgmllib.SGMLParser.__init__(self)\r
-        self.result = ""\r
-        self.indent = 0\r
-        self.ol_number = 0\r
-        self.page_width=page_width\r
-        self.inde_width=indent_width\r
-        self.lines=[]\r
-        self.line=[]\r
-        self.ignore_tags = ignore_tags\r
-\r
-    def add_text(self,text):\r
-        # convert text into words\r
-        words = split(replace(text,'\n',' '))\r
-        self.line.extend(words)\r
-\r
-    def add_break(self):\r
-        self.lines.append((self.indent,self.line))\r
-        self.line=[]\r
-\r
-    def generate(self):\r
-        # join lines with indents\r
-        indent_width = self.inde_width\r
-        page_width = self.page_width\r
-        out_paras=[]\r
-        for indent,line in self.lines+[(self.indent,self.line)]:\r
-\r
-            i=indent*indent_width\r
-            indent_string = i*' '\r
-            line_width = page_width-i\r
-\r
-            out_para=''\r
-            out_line=[]\r
-            len_out_line=0\r
-            for word in line:\r
-                len_word = len(word)\r
-                if len_out_line+len_word<line_width:\r
-                    out_line.append(word)\r
-                    len_out_line = len_out_line + len_word\r
-                else:\r
-                    out_para = out_para + indent_string + join(out_line, ' ') + '\n'\r
-                    out_line=[word]\r
-                    len_out_line=len_word\r
-\r
-            out_para = out_para + indent_string + join(out_line, ' ')\r
-            out_paras.append(out_para)\r
-\r
-        self.result = join(out_paras,'\n\n')\r
-\r
-\r
-    def mod_indent(self,i):\r
-        self.indent = self.indent + i\r
-        if self.indent < 0:\r
-            self.indent = 0\r
-\r
-    def handle_data(self, data):\r
-        if data:\r
-            self.add_text(data)\r
-\r
-    def unknown_starttag(self, tag, attrs):\r
-        """ Convert HTML to something meaningful in plain text """\r
-        tag = lower(tag)\r
-\r
-        if tag not in self.ignore_tags:\r
-            if tag[0]=='h' or tag in ['br','pre','p','hr']:\r
-                # insert a blank line\r
-                self.add_break()\r
-\r
-            elif tag =='img':\r
-                # newline, text, newline\r
-                src = ''\r
-\r
-                for k, v in attrs:\r
-                    if lower(k) == 'src':\r
-                        src = v\r
-\r
-                self.add_break()\r
-                self.add_text('Image: ' + src)\r
-\r
-            elif tag =='li':\r
-                self.add_break()\r
-                if self.ol_number:\r
-                    # num - text\r
-                    self.add_text(str(self.ol_number) + ' - ')\r
-                    self.ol_number = self.ol_number + 1\r
-                else:\r
-                    # - text\r
-                    self.add_text('- ')\r
-\r
-            elif tag in ['dd','dt']:\r
-                self.add_break()\r
-                # increase indent\r
-                self.mod_indent(+1)\r
-\r
-            elif tag in ['ul','dl','ol']:\r
-                # blank line\r
-                # increase indent\r
-                self.mod_indent(+1)\r
-                if tag=='ol':\r
-                    self.ol_number = 1\r
-\r
-    def unknown_endtag(self, tag):\r
-        """ Convert HTML to something meaningful in plain text """\r
-        tag = lower(tag)\r
-\r
-        if tag not in self.ignore_tags:\r
-            if tag[0]=='h' or tag in ['pre']:\r
-                # newline, text, newline\r
-                self.add_break()\r
-\r
-            elif tag =='li':\r
-                self.add_break()\r
-\r
-            elif tag in ['dd','dt']:\r
-                self.add_break()\r
-                # descrease indent\r
-                self.mod_indent(-1)\r
-\r
-            elif tag in ['ul','dl','ol']:\r
-                # blank line\r
-                self.add_break()\r
-                # decrease indent\r
-                self.mod_indent(-1)\r
-                self.ol_number = 0\r
-\r
+# Copyright (c) 2001 Chris Withers
+#
+# This Software is released under the MIT License:
+# http://www.opensource.org/licenses/mit-license.html
+# See license.txt for more details.
+#
+# $Id: html2text.py,v 1.7 2002/12/17 16:56:17 fresh Exp $
+
+import sgmllib
+from string import lower, replace, split, join
+
+class HTML2Text(sgmllib.SGMLParser):
+
+    from htmlentitydefs import entitydefs # replace entitydefs from sgmllib
+
+    def __init__(self, ignore_tags=(), indent_width=4, page_width=80):
+        sgmllib.SGMLParser.__init__(self)
+        self.result = ""
+        self.indent = 0
+        self.ol_number = 0
+        self.page_width=page_width
+        self.inde_width=indent_width
+        self.lines=[]
+        self.line=[]
+        self.ignore_tags = ignore_tags
+
+    def add_text(self,text):
+        # convert text into words
+        words = split(replace(text,'\n',' '))
+        self.line.extend(words)
+
+    def add_break(self):
+        self.lines.append((self.indent,self.line))
+        self.line=[]
+
+    def generate(self):
+        # join lines with indents
+        indent_width = self.inde_width
+        page_width = self.page_width
+        out_paras=[]
+        for indent,line in self.lines+[(self.indent,self.line)]:
+
+            i=indent*indent_width
+            indent_string = i*' '
+            line_width = page_width-i
+
+            out_para=''
+            out_line=[]
+            len_out_line=0
+            for word in line:
+                len_word = len(word)
+                if len_out_line+len_word<line_width:
+                    out_line.append(word)
+                    len_out_line = len_out_line + len_word
+                else:
+                    out_para = out_para + indent_string + join(out_line, ' ') + '\n'
+                    out_line=[word]
+                    len_out_line=len_word
+
+            out_para = out_para + indent_string + join(out_line, ' ')
+            out_paras.append(out_para)
+
+        self.result = join(out_paras,'\n\n')
+
+
+    def mod_indent(self,i):
+        self.indent = self.indent + i
+        if self.indent < 0:
+            self.indent = 0
+
+    def handle_data(self, data):
+        if data:
+            self.add_text(data)
+
+    def unknown_starttag(self, tag, attrs):
+        """ Convert HTML to something meaningful in plain text """
+        tag = lower(tag)
+
+        if tag not in self.ignore_tags:
+            if tag[0]=='h' or tag in ['br','pre','p','hr']:
+                # insert a blank line
+                self.add_break()
+
+            elif tag =='img':
+                # newline, text, newline
+                src = ''
+
+                for k, v in attrs:
+                    if lower(k) == 'src':
+                        src = v
+
+                self.add_break()
+                self.add_text('Image: ' + src)
+
+            elif tag =='li':
+                self.add_break()
+                if self.ol_number:
+                    # num - text
+                    self.add_text(str(self.ol_number) + ' - ')
+                    self.ol_number = self.ol_number + 1
+                else:
+                    # - text
+                    self.add_text('- ')
+
+            elif tag in ['dd','dt']:
+                self.add_break()
+                # increase indent
+                self.mod_indent(+1)
+
+            elif tag in ['ul','dl','ol']:
+                # blank line
+                # increase indent
+                self.mod_indent(+1)
+                if tag=='ol':
+                    self.ol_number = 1
+
+    def unknown_endtag(self, tag):
+        """ Convert HTML to something meaningful in plain text """
+        tag = lower(tag)
+
+        if tag not in self.ignore_tags:
+            if tag[0]=='h' or tag in ['pre']:
+                # newline, text, newline
+                self.add_break()
+
+            elif tag =='li':
+                self.add_break()
+
+            elif tag in ['dd','dt']:
+                self.add_break()
+                # descrease indent
+                self.mod_indent(-1)
+
+            elif tag in ['ul','dl','ol']:
+                # blank line
+                self.add_break()
+                # decrease indent
+                self.mod_indent(-1)
+                self.ol_number = 0
+