Skip to content

Commit a662f3c

Browse files
authored
Merge pull request #12 from BD2KGenomics/features/d32
Added D32, a version of Base32 that maintains lexicographical ordering
2 parents d409b22 + ea3f5f4 commit a662f3c

4 files changed

Lines changed: 166 additions & 23 deletions

File tree

src/bd2k/util/d32.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# Copyright (c) 2015 Hannes Schmidt
2+
#
3+
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software
4+
# and associated documentation files (the "Software"), to deal in the Software without
5+
# restriction, including without limitation the rights to use, copy, modify, merge, publish,
6+
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
7+
# Software is furnished to do so, subject to the following conditions:
8+
#
9+
# The above copyright notice and this permission notice shall be included in all copies or
10+
# substantial portions of the Software.
11+
#
12+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
13+
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
14+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
15+
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
17+
18+
# Inspired by Dominic Tarr's JavaScript at https://github.com/dominictarr/d64
19+
20+
class D32( object ):
    """
    Base32 encoding and decoding without padding, and using an arbitrary alphabet.

    Every group of five input bytes (40 bits) is split into eight 5-bit values, each of which is
    represented by the alphabet symbol at that index. A trailing partial group is zero-extended
    for the computation and only the symbols that carry input bits are emitted, so the output
    never contains padding characters.

    Works on Python 2 and Python 3. Results are byte strings, i.e. ``str`` on Python 2 and
    ``bytes`` on Python 3.
    """

    def __init__( self, alphabet ):
        """
        :param alphabet: the 32 symbols to use, either as a byte string or as text made up of
               characters in the Latin-1 range. The symbol at index i represents the 5-bit
               value i.
        """
        super( D32, self ).__init__( )
        self.alphabet = self._to_bytearray( alphabet )
        # One slot per possible byte value so that decode() can index the table with any input
        # byte, 0xff included. Bytes that are not in the alphabet keep the initial value 0.
        self.lookup = bytearray( 256 )
        for i in range( 32 ):
            self.lookup[ self.alphabet[ i ] ] = i

    @staticmethod
    def _to_bytearray( s ):
        """
        Return a new bytearray holding the bytes of s, encoding text as Latin-1 first.
        """
        if not isinstance( s, (bytes, bytearray) ):
            s = s.encode( 'latin-1' )
        return bytearray( s )

    def encode( self, d ):
        """
        Encode the given bytes using this instance's alphabet.

        :param d: the data to encode, a byte string (text is interpreted as Latin-1)

        :return: a byte string of (len(d) * 8 + 4) // 5 symbols

        >>> encode = standard.encode
        >>> encode(b'') == b''
        True
        >>> encode(b'\\x00') == b'22'
        True
        >>> encode(b'\\xff') == b'zw'
        True
        >>> encode(b'\\x00\\x01\\x02\\x03\\x04') == b'222k62s6'
        True
        >>> encode(b'\\x00\\x01\\x02\\x03\\x04\\x05') == b'222k62s62o'
        True
        """
        src = self._to_bytearray( d )
        m = len( src )
        # Number of 5-bit symbols needed to hold m * 8 bits, rounded up
        n = (m * 8 + 4) // 5
        # Zero-extend to a whole number of 5-byte groups once, instead of once per group. This
        # only touches the private copy made above, never the caller's object.
        src.extend( bytearray( -m % 5 ) )
        e = bytearray( len( src ) // 5 * 8 )
        a = self.alphabet
        j = 0
        for i in range( 0, len( src ), 5 ):
            g0, g1, g2, g3, g4 = src[ i:i + 5 ]
            #       bit            1          2          3
            #       bit 01234567 89012345 67890123 45678901 23456789
            #      byte 00000000 11111111 22222222 33333333 44444444
            #     group 00000111 11222223 33334444 45555566 66677777
            e[ j + 0 ] = a[ g0 >> 3 ]
            e[ j + 1 ] = a[ ((g0 << 2) & 31) | (g1 >> 6) ]
            e[ j + 2 ] = a[ (g1 >> 1) & 31 ]
            e[ j + 3 ] = a[ ((g1 << 4) & 31) | (g2 >> 4) ]
            e[ j + 4 ] = a[ ((g2 << 1) & 31) | (g3 >> 7) ]
            e[ j + 5 ] = a[ (g3 >> 2) & 31 ]
            e[ j + 6 ] = a[ ((g3 << 3) & 31) | (g4 >> 5) ]
            e[ j + 7 ] = a[ g4 & 31 ]
            j += 8
        # Drop the symbols that were produced purely from the zero extension
        return bytes( e[ :n ] )

    def decode( self, e ):
        """
        Decode a string produced by encode() on an instance with the same alphabet.

        Bytes that are not part of the alphabet are not rejected; they are treated like the
        first symbol of the alphabet, i.e. as the value 0.

        :param e: the encoded data, a byte string (text is interpreted as Latin-1)

        :return: a byte string of len(e) * 5 // 8 bytes

        >>> decode = standard.decode
        >>> decode(b'') == b''
        True
        >>> decode(b'222k62s62o') == b'\\x00\\x01\\x02\\x03\\x04\\x05'
        True
        >>> decode(b'222k62s6') == b'\\x00\\x01\\x02\\x03\\x04'
        True
        >>> decode(b'zw') == b'\\xff'
        True
        """
        src = self._to_bytearray( e )
        n = len( src )
        # Number of whole bytes contained in n * 5 bits
        m = n * 5 // 8
        l = self.lookup
        codes = [ l[ c ] for c in src ]
        # Zero-extend to a whole number of 8-symbol groups
        codes.extend( [ 0 ] * (-n % 8) )
        d = bytearray( len( codes ) // 8 * 5 )
        i = 0
        for j in range( 0, len( codes ), 8 ):
            g0, g1, g2, g3, g4, g5, g6, g7 = codes[ j:j + 8 ]
            #       bit            1          2          3
            #       bit 01234567 89012345 67890123 45678901 23456789
            #      byte 00000000 11111111 22222222 33333333 44444444
            #     group 00000111 11222223 33334444 45555566 66677777
            d[ i + 0 ] = ((g0 << 3) & 255) | (g1 >> 2)
            d[ i + 1 ] = ((g1 << 6) & 255) | ((g2 << 1) & 255) | (g3 >> 4)
            d[ i + 2 ] = ((g3 << 4) & 255) | (g4 >> 1)
            d[ i + 3 ] = ((g4 << 7) & 255) | ((g5 << 2) & 255) | (g6 >> 3)
            d[ i + 4 ] = ((g6 << 5) & 255) | g7
            i += 5
        # Drop the bytes that were produced purely from the zero extension
        return bytes( d[ :m ] )
108+
109+
110+
# A variant of Base32 that maintains the lexicographical ordering such that for any given list of
111+
# strings l, map( standard.decode, sorted( map( standard.encode, l ) ) ) == sorted( l )
112+
113+
# The symbols are listed in ascending ASCII order (digits before lowercase letters).
standard = D32( '234567abcdefghijklmnopqrstuvwxyz' )
114+
115+
# A reimplementation of base64.b32encode and base64.b32decode, but faster and without padding:
116+
117+
# RFC 4648 symbol order (letters first, then the digits 2-7), but in lowercase.
base32 = D32( 'abcdefghijklmnopqrstuvwxyz234567' )

src/bd2k/util/d64.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,12 @@
2323
class D64( object ):
2424
def __init__( self, special_chars ):
2525
super( D64, self ).__init__( )
26-
self.chars = bytearray( sorted(
27-
'PYFGCRLAOEUIDHTNSQJKXBMWVZpyfgcrlaoeuidhtnsqjkxbmwvz1234567890' + special_chars ) )
28-
self.codeToIndex = bytearray( 128 )
26+
alphabet = 'PYFGCRLAOEUIDHTNSQJKXBMWVZpyfgcrlaoeuidhtnsqjkxbmwvz1234567890'
27+
self.alphabet = bytearray( sorted( alphabet + special_chars ) )
28+
self.lookup = bytearray( 255 )
2929
for i in xrange( 64 ):
30-
code = self.chars[ i ]
31-
self.codeToIndex[ code ] = i
30+
code = self.alphabet[ i ]
31+
self.lookup[ code ] = i
3232

3333
def encode( self, data ):
3434
"""
@@ -48,32 +48,32 @@ def encode( self, data ):
4848
s = bytearray( (l * 4 + 2) / 3 )
4949
hang = 0
5050
j = 0
51-
chars = self.chars
51+
a = self.alphabet
5252
for i in xrange( l ):
5353
v = ord( data[ i ] )
5454
r = i % 3
5555
if r == 0:
56-
s[ j ] = chars[ v >> 2 ]
56+
s[ j ] = a[ v >> 2 ]
5757
j += 1
5858
hang = (v & 3) << 4
5959
elif r == 1:
60-
s[ j ] = chars[ hang | v >> 4 ]
60+
s[ j ] = a[ hang | v >> 4 ]
6161
j += 1
6262
hang = (v & 0xf) << 2
6363
elif r == 2:
64-
s[ j ] = chars[ hang | v >> 6 ]
64+
s[ j ] = a[ hang | v >> 6 ]
6565
j += 1
66-
s[ j ] = chars[ v & 0x3f ]
66+
s[ j ] = a[ v & 0x3f ]
6767
j += 1
6868
hang = 0
6969
else:
7070
assert False
7171
if l % 3:
72-
s[ j ] = chars[ hang ]
72+
s[ j ] = a[ hang ]
7373

7474
return str( s )
7575

76-
def decode( self, s ):
76+
def decode( self, e ):
7777
"""
7878
>>> decode = standard.decode
7979
>>> decode('')
@@ -87,14 +87,14 @@ def decode( self, s ):
8787
>>> decode('..31.kF40VR')
8888
'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07'
8989
"""
90-
l = len( s )
90+
n = len( e )
9191
j = 0
92-
b = bytearray( l * 3 / 4 )
92+
b = bytearray( n * 3 / 4 )
9393
hang = 0
94-
codeToIndex = self.codeToIndex
94+
l = self.lookup
9595

96-
for i in xrange( l ):
97-
v = codeToIndex[ ord( s[ i ] ) ]
96+
for i in xrange( n ):
97+
v = l[ ord( e[ i ] ) ]
9898
r = i % 4
9999
if r == 0:
100100
hang = v << 2

src/bd2k/util/test/test_d32.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Copyright (c) 2014 Dominic Tarr
2+
# Copyright (c) 2015 Hannes Schmidt
3+
#
4+
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software
5+
# and associated documentation files (the "Software"), to deal in the Software without
6+
# restriction, including without limitation the rights to use, copy, modify, merge, publish,
7+
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
8+
# Software is furnished to do so, subject to the following conditions:
9+
#
10+
# The above copyright notice and this permission notice shall be included in all copies or
11+
# substantial portions of the Software.
12+
#
13+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
14+
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
15+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
16+
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
17+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
18+
19+
# Inspired by JavaScript code found at https://github.com/dominictarr/d64
20+
21+
from __future__ import absolute_import
22+
from unittest import TestCase
23+
from bd2k.util.d32 import standard as d32
24+
import os
25+
26+
27+
class TestD32( TestCase ):
    """
    Checks that the standard D32 codec round-trips data and preserves lexicographical order.
    """

    def test( self ):
        # One random byte string for every length from 0 to 999
        samples = [ os.urandom( size ) for size in xrange( 1000 ) ]
        # Sort by encoded form, then decode; if the encoding is lossless and preserves
        # ordering, the result equals the originals sorted directly.
        encoded_in_order = sorted( [ d32.encode( sample ) for sample in samples ] )
        decoded = [ d32.decode( encoded ) for encoded in encoded_in_order ]
        self.assertEqual( decoded, sorted( samples ) )

src/bd2k/util/test/test_d64.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,5 @@
2626

2727
class TestD64( TestCase ):
2828
def test( self ):
29-
data = [ (os.urandom( i ), i) for i in xrange( 1000 ) ]
30-
encoded_data = [ (d64.encode( d ), i) for d, i in data ]
31-
decoded_data = [ (d64.decode( s ), i) for s, i in encoded_data ]
32-
self.assertEqual( data, decoded_data )
33-
# Ensure that lexicographical sort is consistent between data and encoded data
34-
self.assertEqual( zip( *sorted( data ) )[ 1 ], zip( *sorted( encoded_data ) )[ 1 ] )
29+
l = [ os.urandom( i ) for i in xrange( 1000 ) ]
30+
self.assertEqual( map( d64.decode, sorted( map( d64.encode, l ) ) ), sorted( l ) )

0 commit comments

Comments
 (0)