Skip to content

Commit ea3f5f4

Browse files
committed
Added D32, a version of Base32 that maintains lexicographical ordering
1 parent 821de4f commit ea3f5f4

2 files changed

Lines changed: 147 additions & 0 deletions

File tree

src/bd2k/util/d32.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# Copyright (c) 2015 Hannes Schmidt
2+
#
3+
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software
4+
# and associated documentation files (the "Software"), to deal in the Software without
5+
# restriction, including without limitation the rights to use, copy, modify, merge, publish,
6+
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
7+
# Software is furnished to do so, subject to the following conditions:
8+
#
9+
# The above copyright notice and this permission notice shall be included in all copies or
10+
# substantial portions of the Software.
11+
#
12+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
13+
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
14+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
15+
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
17+
18+
# Inspired by Dominic Tarr's JavaScript at https://github.com/dominictarr/d64
19+
20+
class D32( object ):
    """
    Base32 encoding and decoding without padding, and using an arbitrary alphabet.

    Each group of five input bytes (40 bits) is represented by eight symbols carrying five bits
    each. A trailing partial group is zero-filled for the computation, but only the symbols that
    actually carry input bits are emitted, so no padding characters are ever produced.

    Runs on both Python 2 and Python 3. Encoded values are native str objects, decoded values are
    bytes objects (bytes is str on Python 2).
    """

    # unicode on Python 2, str on Python 3
    _text_type = type( u'' )

    def __init__( self, alphabet ):
        """
        :param alphabet: the 32 symbols that represent the 5-bit values 0 to 31, in that order,
               given as a byte string or as text made up of characters in the Latin-1 range.
               Symbols past the 32nd are ignored.

        :raise IndexError: if the alphabet has fewer than 32 symbols
        """
        super( D32, self ).__init__( )
        if isinstance( alphabet, self._text_type ):
            # One byte per character, so that text and byte alphabets behave the same
            alphabet = alphabet.encode( 'latin-1' )
        self.alphabet = bytearray( alphabet )
        # Maps a symbol's byte value to its 5-bit value. One slot for each of the 256 possible
        # byte values, so that any input byte can be looked up. Slots of bytes that are not in
        # the alphabet stay 0.
        self.lookup = bytearray( 256 )
        for i in range( 32 ):
            self.lookup[ self.alphabet[ i ] ] = i

    def encode( self, d ):
        """
        Encode a byte string using this instance's alphabet.

        :param d: the bytes (or bytearray) to encode

        :return: a native str holding (len( d ) * 8 + 4) // 5 symbols

        :raise TypeError: on Python 3, if d is text rather than bytes

        >>> encode = standard.encode
        >>> encode(b'')
        ''
        >>> encode(b'\\0')
        '22'
        >>> encode(b'\\xff')
        'zw'
        >>> encode(b'\\0\\1\\2\\3\\4')
        '222k62s6'
        >>> encode(b'\\0\\1\\2\\3\\4\\5')
        '222k62s62o'
        """
        data = bytearray( d )
        m = len( data )
        # Number of 5-bit symbols needed to hold m * 8 bits, rounded up
        n = (m * 8 + 4) // 5
        groups = (m + 4) // 5
        # Zero-fill the last group so that the loop below can always consume five bytes
        data.extend( bytearray( groups * 5 - m ) )
        e = bytearray( groups * 8 )
        a = self.alphabet
        i, j = 0, 0
        while i < m:
            b0, b1, b2, b3, b4 = data[ i:i + 5 ]
            # bit                   1          2          3
            # bit        01234567 89012345 67890123 45678901 23456789
            # byte       00000000 11111111 22222222 33333333 44444444
            # group      00000111 11222223 33334444 45555566 66677777
            e[ j + 0 ] = a[ b0 >> 3 ]
            e[ j + 1 ] = a[ (b0 << 2 & 31) | (b1 >> 6) ]
            e[ j + 2 ] = a[ b1 >> 1 & 31 ]
            e[ j + 3 ] = a[ (b1 << 4 & 31) | (b2 >> 4) ]
            e[ j + 4 ] = a[ (b2 << 1 & 31) | (b3 >> 7) ]
            e[ j + 5 ] = a[ b3 >> 2 & 31 ]
            e[ j + 6 ] = a[ (b3 << 3 & 31) | (b4 >> 5) ]
            e[ j + 7 ] = a[ b4 & 31 ]
            j += 8
            i += 5
        # Drop the symbols that were produced exclusively from the zero-fill
        encoded = bytes( e[ :n ] )
        # bytes is str on Python 2, where the symbols are returned as they are. On Python 3, each
        # symbol byte is mapped to the character with the same code point to get a native str.
        return encoded if str is bytes else encoded.decode( 'latin-1' )

    def decode( self, e ):
        """
        Decode a string produced by encode() of an instance with the same alphabet.

        Symbols that are not part of the alphabet are not rejected, they decode as if they were
        the alphabet's first symbol.

        :param e: the encoded symbols, as a native str, bytes or bytearray

        :return: a bytes object holding len( e ) * 5 // 8 bytes

        >>> decode = standard.decode
        >>> decode('222k62s62o') == b'\\x00\\x01\\x02\\x03\\x04\\x05'
        True
        >>> decode('222k62s6') == b'\\x00\\x01\\x02\\x03\\x04'
        True
        >>> decode('zw') == b'\\xff'
        True
        >>> decode('') == b''
        True
        """
        if isinstance( e, self._text_type ):
            # One byte per symbol, the inverse of what encode() does on Python 3
            e = e.encode( 'latin-1' )
        lookup = self.lookup
        # Translate every symbol to its 5-bit value up front
        g = bytearray( lookup[ x ] for x in bytearray( e ) )
        n = len( g )
        # Number of complete bytes contained in n * 5 bits
        m = n * 5 // 8
        groups = (n + 7) // 8
        # Zero-fill the last group so that the loop below can always consume eight symbols
        g.extend( bytearray( groups * 8 - n ) )
        d = bytearray( groups * 5 )
        i, j = 0, 0
        while j < n:
            c0, c1, c2, c3, c4, c5, c6, c7 = g[ j:j + 8 ]
            # bit                   1          2          3
            # bit        01234567 89012345 67890123 45678901 23456789
            # byte       00000000 11111111 22222222 33333333 44444444
            # group      00000111 11222223 33334444 45555566 66677777
            d[ i + 0 ] = (c0 << 3 & 255) | (c1 >> 2)
            d[ i + 1 ] = (c1 << 6 & 255) | (c2 << 1 & 255) | (c3 >> 4)
            d[ i + 2 ] = (c3 << 4 & 255) | (c4 >> 1)
            d[ i + 3 ] = (c4 << 7 & 255) | (c5 << 2 & 255) | (c6 >> 3)
            d[ i + 4 ] = (c6 << 5 & 255) | c7
            j += 8
            i += 5
        # Drop the bytes that were produced exclusively from the zero-fill
        return bytes( d[ :m ] )
108+
109+
110+
# A variant of Base32 whose alphabet is in ascending ASCII order. It maintains the lexicographical
# ordering such that for any given list l of strings,
# map( standard.decode, sorted( map( standard.encode, l ) ) ) == sorted( l )

standard = D32( alphabet='234567abcdefghijklmnopqrstuvwxyz' )

# Meant as a faster reimplementation of base64.b32encode and base64.b32decode that does not use
# padding. Note that the alphabet given here is lower case.

base32 = D32( alphabet='abcdefghijklmnopqrstuvwxyz234567' )

src/bd2k/util/test/test_d32.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Copyright (c) 2014 Dominic Tarr
2+
# Copyright (c) 2015 Hannes Schmidt
3+
#
4+
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software
5+
# and associated documentation files (the "Software"), to deal in the Software without
6+
# restriction, including without limitation the rights to use, copy, modify, merge, publish,
7+
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
8+
# Software is furnished to do so, subject to the following conditions:
9+
#
10+
# The above copyright notice and this permission notice shall be included in all copies or
11+
# substantial portions of the Software.
12+
#
13+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
14+
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
15+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
16+
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
17+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
18+
19+
# Inspired by JavaScript code found at https://github.com/dominictarr/d64
20+
21+
from __future__ import absolute_import
22+
from unittest import TestCase
23+
from bd2k.util.d32 import standard as d32
24+
import os
25+
26+
27+
class TestD32( TestCase ):
    """
    Checks that D32 round-trips arbitrary byte strings and that the encoding preserves their
    lexicographical ordering.
    """

    def test( self ):
        # One random byte string for each length from 0 to 999
        l = [ os.urandom( i ) for i in range( 1000 ) ]
        # Lists instead of map(): on Python 3 map() returns a lazy iterator, which never compares
        # equal to a list
        encoded = sorted( [ d32.encode( s ) for s in l ] )
        # Decoding the sorted encodings must yield the inputs in sorted order
        self.assertEqual( [ d32.decode( e ) for e in encoded ], sorted( l ) )

0 commit comments

Comments
 (0)