+ def test_invalid_utf8
+ # Invalid UTF-8 must either be rejected on save or be returned unmodified; see e.g. http://en.wikipedia.org/wiki/UTF-8 for byte sequences
+ # FIXME: Invalid Unicode characters can still be encoded into "valid" UTF-8 byte sequences - maybe check this too?
+ invalid_sequences = ["\xC0", # always invalid UTF-8
+ "\xC2\x4a", # 2-byte multibyte identifier, followed by plain ASCII
+ "\xC2\xC2", # 2-byte multibyte identifier, followed by another one
+ "\x4a\x82", # plain ASCII, followed by multibyte continuation
+ "\x82\x82", # multibyte continuations without multibyte identifier
+ "\xe1\x82\x4a", # three-byte identifier, continuation and (incorrectly) plain ASCII
+ ]
+ invalid_sequences.each do |char|
+ begin
+ # create a message and save it to the database (NOTE(review): make_message is defined elsewhere; presumably char ends up as the title - confirm)
+ msg = make_message(char, 1)
+ # if the save throws, that's fine and the test should pass, as we're
+ # only testing invalid sequences anyway.
+ msg.save!
+
+ # get the saved message back and check that it is identical - i.e.
+ # it's OK to accept invalid UTF-8 as long as we return it unmodified.
+ db_msg = msg.class.find(msg.id)
+ assert_equal char, db_msg.title, "Database silently truncated message title"
+
+ rescue ActiveRecord::RecordInvalid
+ # because we only test invalid sequences it is OK to reject them; any other exception is not rescued here
+ end
+ end
+ end
+