# -*- coding: UTF-8 -*-
def recode_utf8(data):
	"""
	Return the utf8-encoded byte string equivalent to `data`.

	`data` is a string which is either:
	 * unicode text
	 * well-encoded utf8
	 * well-encoded latin1
	 * poorly-encoded utf8+latin1, i.e. utf8 bytes that were wrongly decoded
	   as latin1 and then re-encoded as utf8, one or more times

	Unicode text is encoded as-is; only byte strings are checked for
	utf8+latin1 mangling.
	"""
	# type(u'') is `unicode` on Python 2 and `str` on Python 3, so the check
	# works on both without referencing a name that only exists on one.
	if isinstance(data, type(u'')):
		# The input is already decoded. Just return the utf8.
		return data.encode('UTF-8')

	try:
		decoded = data.decode('UTF-8')
	except UnicodeDecodeError:
		# Not valid utf8, so treat the bytes as latin1.
		decoded = data.decode('latin1')

	while True:
		# Try to undo one round of utf8+latin1 mangling.
		try:
			repaired = decoded.encode('latin1').decode('UTF-8')
		except (UnicodeEncodeError, UnicodeDecodeError):
			# Either the text has non-latin1 characters, or its latin1 bytes
			# are not valid utf8; in both cases it is not utf8+latin1.
			break

		if repaired == decoded:
			# Pure-ASCII (or empty) text round-trips unchanged; without this
			# check the loop would never terminate for such input.
			break

		decoded = repaired

	return decoded.encode('UTF-8')


import unittest as T
class TestRecodeUtf8(T.TestCase):
	"""Check recode_utf8 against unicode, utf8, latin1 and mangled input."""

	# The fixtures are spelled with escape sequences so their value does not
	# depend on the encoding this source file is saved or served in.
	# "Munchen" with a u-umlaut: encodable to latin1.
	latin1 = u'M\xfcnchen'
	# "Lodz" with L-stroke, o-acute and z-acute: not encodable to latin1.
	utf8 = u'\u0141\xf3d\u017a'

	def test_unicode(self):
		"""An un-encoded unicode string should just become utf8-encoded."""
		self.assertEqual(
				recode_utf8(self.utf8),
				self.utf8.encode('UTF-8'),
		)

	def test_utf8(self):
		"""A utf8-encoded string should be unchanged."""
		utf8 = self.utf8.encode('UTF-8')
		self.assertEqual(
				recode_utf8(utf8),
				utf8,
		)

	def test_latin1(self):
		"""A latin1-encoded string should become utf8-encoded."""
		self.assertEqual(
				recode_utf8(self.latin1.encode('latin1')),
				self.latin1.encode('UTF-8'),
		)

	def test_utf8_plus_latin1(self):
		"""A poorly-encoded utf8+latin1 string should become utf8-encoded."""
		utf8 = self.utf8.encode('UTF-8')
		# Mangle once: read the utf8 bytes as latin1, then re-encode as utf8.
		poorly_encoded = utf8.decode('latin1').encode('UTF-8')
		self.assertEqual(
				recode_utf8(poorly_encoded),
				utf8,
		)

	def test_utf8_plus_latin1_several_times(self):
		"""A string mangled by utf8+latin1 several times should become utf8-encoded."""
		utf8 = self.utf8.encode('UTF-8')
		poorly_encoded = utf8
		# Apply the same mangling step ten times in a row.
		for x in range(10):
			poorly_encoded = poorly_encoded.decode('latin1').encode('UTF-8')

		self.assertEqual(
				recode_utf8(poorly_encoded),
				utf8,
		)



if __name__ == '__main__':
	# Run the unittest test cases when this file is executed as a script.
	T.main()