Browse Source

Merge pull request #2 from jamesob/doc-and-test

Add some documentation, tests
pull/3/head
Pieter Wuille 4 years ago
committed by GitHub
parent
commit
e01badc265
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
  1. 3
      Makefile
  2. 227
      buildmap.py
  3. BIN
      demo.map
  4. 100
      demo.random.dat
  5. 74
      testmap.py

3
Makefile

@ -9,3 +9,6 @@ demo.dat.xz: ipv4.dump ipv6.dump birdparse.py
demo.map: demo.dat.xz buildmap.py
xz -d <demo.dat.xz | python3 buildmap.py >demo.map
test: demo.map
python3 testmap.py

227
buildmap.py

@ -1,25 +1,94 @@
"""
Intake a map of IP prefixes -> AS numbers and output instructions that will
allow a decoder to match an IP address to an ASN by following a sequence
of instructions.
The instructions describe a prefix tree that can be navigated using the bits of
an IP address (i.e. 0 for left child, 1 for right child, leaf nodes
corresponding to a given ASN). The types of instructions are denoted by the
*Type() functions defined below. Once an IP address specifies a bit for which
there is no path in the tree (i.e. the part of its address more specific than
any known network prefix), the tree returns a "default" ASN value that has been
set based on the last valid location in the tree.
See `testmap.py:Interpret` for an illustration of how the decoding process
works.
Before the prefix tree is encoded into instructions using bits, it is compacted
(e.g. duplicate subtrees removed) and annotated with which default ASN values
should be set for particular regions of the tree.
"""
import ipaddress
import re
import sys
from collections import Counter, namedtuple
def Parse(entries: list):
    """
    Read prefix -> ASN mappings from stdin, one per line, in the format

        1.0.0.0/24 AS13335 # ipv4.dump:4856343
        1.0.4.0/22 AS56203 # ipv4.dump:2759291
        ...

    Anything following a '#' is ignored, and lines that are empty once the
    comment is removed are skipped. Fields may be separated by any run of
    whitespace. An Entry is created for each remaining line. IPv4 networks are
    mapped into IPv6 space by adding 96 to the prefix length and
    0xffff00000000 to the network address.

    Args:
        entries: modified in place with the new Entrys.

    Raises:
        ValueError: if a line is not of the form "<prefix> AS<number>", or if
            the prefix is not a valid network.
    """
    for lineno, raw_line in enumerate(sys.stdin, 1):
        fields = raw_line.split('#', 1)[0].split()
        if not fields:
            # Blank or comment-only line.
            continue
        if len(fields) != 2:
            raise ValueError(
                "line %i: expected '<prefix> AS<number>', got %r"
                % (lineno, raw_line.rstrip('\r\n')))
        prefix, asn = fields
        # Explicit check instead of `assert`, which is stripped under -O.
        if not (len(asn) > 2 and asn[:2] == "AS"):
            raise ValueError("line %i: malformed ASN %r" % (lineno, asn))

        network = ipaddress.ip_network(prefix)
        prefix_len = network.prefixlen
        net_addr = int.from_bytes(network.network_address.packed, 'big')

        # Map an IPv4 prefix into IPv6 space.
        if isinstance(network, ipaddress.IPv4Network):
            prefix_len += 96
            net_addr += 0xffff00000000

        entries.append(Entry(prefix_len, net_addr, int(asn[2:])))
# Add a list of (prefixlen, addrbits, asn) entries to a tree
def UpdateTree(gtree, addrlen, entries):
# A single "network prefix -> ASN" mapping.
Entry = namedtuple('Entry', [
    'prefix_len',  # Length of the network prefix in bits, e.g. 26 for 255.255.0.0/26.
    'net_addr',    # The bits of the network address, as an int.
    'asn',         # The autonomous system (AS) number, as an int.
])
def UpdateTree(gtree, addrlen: int, entries: [Entry]):
"""
Returns a prefix tree such that following a path down through the
tree based on the bits of a network prefix (in order of most significant
bit) leads to an ASN.
Args:
gtree: tree structure to encode the mappings into. Modified in-place.
addrlen: The maximum number of bits in a network address.
This is 128 for IPv6 (16 bytes).
entries: The network prefix -> ASN mappings to encode.
"""
for prefix, val, asn in sorted(entries):
tree = gtree
default = None
# Iterate through each bit in the network prefix, starting with the
# most significant bit.
for i in range(prefix):
bit = (val >> (addrlen - 1 - i)) & 1
# If we have passed the end of the network prefix, all entries
# under subsequent bits will be associated with the same ASN.
needs_inner = i < prefix - 1
if tree[bit] is None:
if needs_inner:
@ -44,9 +113,20 @@ def UpdateTree(gtree, addrlen, entries):
tree = tree[bit]
return gtree
# Remove redundancy from a tree.
# If approx is True, unassigned ranges may get reassigned to arbitrary ASNs.
def CompactTree(tree, approx=True):
def CompactTree(tree, approx=True) -> (list, set):
"""
Remove redundancy from a tree.
E.g. if all nodes in a subtree point to the same ASN, compact the subtree
into a single int.
Returns:
(the compacted tree, a set of all ASNs in the tree)
Args:
approx: if True, unassigned ranges may get reassigned to arbitrary ASNs.
"""
num = 0
if tree is None:
return (tree, set())
@ -63,60 +143,119 @@ def CompactTree(tree, approx=True):
return tree[0], set([tree[0]])
return (tree, allas)
def DictMax(d):
    """
    Return the (key, value) pair of `d` that has the largest value.

    When several values tie, the pair seen first during iteration wins. An
    empty dict yields (None, None).
    """
    best = (None, None)
    for item in d.items():
        if best[1] is None or item[1] > best[1]:
            best = item
    return best
# Annotate internal nodes in the tree with the most common leafs below it.
# The binary serialization later uses this.
def PropTree(tree, approx=True) -> (list, Counter, bool):
    """
    Annotate internal nodes in the tree with the most common leaf below them.
    The binary serialization later uses this.

    Where a node gets annotated, its shape changes from
    `[left_child, right_child]` to `[lc, rc, most_common_ASN]`. A node is only
    annotated when its most common ASN is counted at least twice below it and,
    unless `approx` is set, the subtree contains no unassigned (None) branch.
    Once a node is annotated, it is reported to its parent as a single
    occurrence of that ASN.

    Args:
        tree: None (unassigned), an int (leaf ASN), or a list node. List
            nodes have their children updated in place.
        approx: if True, nodes may be annotated even when part of the subtree
            is unassigned.

    Returns:
        (tree, Counter of ASNs reported for this subtree, whether the subtree
        contains any unassigned branch)
    """
    if tree is None:
        return (tree, Counter(), True)
    if isinstance(tree, int):
        return (tree, Counter({tree: 1}), False)

    tree[0], leftcnt, leftnone = PropTree(tree[0], approx)
    tree[1], rightcnt, rightnone = PropTree(tree[1], approx)
    allcnt = leftcnt + rightcnt
    allnone = leftnone | rightnone

    # A node with no ASNs below it (e.g. [None, None]) has nothing to
    # annotate; most_common(1) would be empty and indexing it would raise.
    if not allcnt:
        return (tree, allcnt, allnone)

    maxasn, maxcount = allcnt.most_common(1)[0]
    if maxcount >= 2 and (approx or not allnone):
        return ([tree[0], tree[1], maxasn], Counter({maxasn: 1}), allnone)
    return (tree, allcnt, allnone)
def EncodeBits(val, minval, bit_sizes) -> [int]:
    """
    Perform a variable-length encoding of a value to bits.

    `val` is first normalized by subtracting `minval`, which potentially
    saves bits. `bit_sizes` lists the payload widths of successive size
    classes: class k covers the next `2**bit_sizes[k]` values after those
    covered by the earlier classes. The output is a unary class selector (one
    1 bit per class skipped, then a terminating 0 bit unless the last class is
    used) followed by the offset within the class in `bit_sizes[k]` bits,
    most significant bit first.

    Returns:
        a list of bits (0/1 ints) representing the value to encode.

    Raises:
        AssertionError: if the value does not fit in any of the classes.
            Raised explicitly rather than via `assert` so the check is not
            stripped when running under `python -O`.
    """
    val -= minval
    ret = []
    last = len(bit_sizes) - 1
    for pos, bit_size in enumerate(bit_sizes):
        class_size = 1 << bit_size
        if val >= class_size:
            # The value does not fit in this class: absorb the class's range
            # and move on to the next (larger) one.
            val -= class_size
            ret.append(1)
            continue
        # The last class needs no terminator since there is nothing after it
        # to distinguish it from.
        if pos < last:
            ret.append(0)
        # Encode the rest of val in bit_size bits, most significant first.
        ret.extend((val >> shift) & 1 for shift in reversed(range(bit_size)))
        return ret
    # Couldn't fit val into any of the bit_sizes.
    raise AssertionError(
        "value %r (after subtracting minval) does not fit in bit sizes %r"
        % (val, bit_sizes))
def EncodeType(v):
def MatchType() -> [int]:
    """
    Return the encoded type marker for a match instruction (type 2).

    A match instruction descends into the tree along a bit path. If at any
    point the match fails to hit a valid path through the tree, it fails and
    the current default ASN (which changes as we move through the tree) is
    returned.
    """
    match_type = 2
    return EncodeType(match_type)
def JumpType() -> [int]:
    """
    Return the encoded type marker for a jump instruction (type 1).

    A jump instruction lets us quickly seek to one side of the tree or the
    other: the length of the left child is encoded, so it can be skipped over
    to reach the right child if need be.
    """
    jump_type = 1
    return EncodeType(jump_type)
def LeafType() -> [int]:
    """
    Return the encoded type marker for a leaf instruction (type 0), which
    encodes an ASN at the end of a bit path.
    """
    leaf_type = 0
    return EncodeType(leaf_type)
def SetNewDefaultType() -> [int]:
    """
    Return the encoded type marker for a set-new-default instruction (type 3).

    The instruction establishes a new default ASN to return should we fail
    while traversing this path.
    """
    set_default_type = 3
    return EncodeType(set_default_type)
def EncodeType(v) -> [int]:
    """Encode instruction type `v` with EncodeBits (minval 0, bit sizes 0, 0, 1)."""
    type_bit_sizes = [0, 0, 1]
    return EncodeBits(v, 0, type_bit_sizes)
def EncodeASN(v) -> [int]:
    """Encode ASN `v` with EncodeBits (minval 1, bit sizes 15 through 24)."""
    # Why is 15 the smallest size we try, given that many ASNs are far below
    # 2**16?
    #
    # We want the first bit size to cover ~50% of the values being encoded:
    # every additional bit size we allow adds a digit to the encoded values,
    # so we want to minimize the number of bit sizes while also minimizing
    # the bit length of the encoded data, which is a trade-off.
    asn_bit_sizes = list(range(15, 25))
    return EncodeBits(v, 1, asn_bit_sizes)
def EncodeMatch(v) -> [int]:
    """Encode match value `v` with EncodeBits (minval 2, bit sizes 1 through 8)."""
    match_bit_sizes = list(range(1, 9))
    return EncodeBits(v, 2, match_bit_sizes)
def EncodeJump(v) -> [int]:
    """Encode jump length `v` with EncodeBits (minval 17, bit sizes 5 through 30)."""
    jump_bit_sizes = list(range(5, 31))
    return EncodeBits(v, 17, jump_bit_sizes)
def EncodeBytes(bits):
def EncodeBytes(bits) -> [int]:
"""Encode a sequence of bits as a sequence of bytes."""
val = 0
nbits = 0
bytes = []
@ -135,6 +274,9 @@ def TreeSer(tree, default):
match = 1
assert(tree is not None)
assert(not (isinstance(tree, int) and tree == default))
# If one side of the tree is empty (i.e. represents a path without
# choices), encode a match instruction up to 8 bits.
while isinstance(tree, list) and match <= 0xFF:
if tree[0] is None or tree[0] == default:
match = (match << 1) + 1
@ -145,21 +287,30 @@ def TreeSer(tree, default):
else:
break
if match >= 2:
return EncodeType(2) + EncodeMatch(match) + TreeSer(tree, default)
return MatchType() + EncodeMatch(match) + TreeSer(tree, default)
# Leaf node: return the ASN.
if isinstance(tree, int):
return EncodeType(0) + EncodeASN(tree)
return LeafType() + EncodeASN(tree)
# Return the tree along with a new "default" ASN value should we fail to
# match while along this path.
if len(tree) > 2 and tree[2] != default:
return EncodeType(3) + EncodeASN(tree[2]) + TreeSer(tree, tree[2])
return SetNewDefaultType() + EncodeASN(tree[2]) + TreeSer(tree, tree[2])
left = TreeSer(tree[0], default)
right = TreeSer(tree[1], default)
return EncodeType(1) + EncodeJump(len(left)) + left + right
# Start the program by specifying a possible jump to either child of the
# first node.
return JumpType() + EncodeJump(len(left)) + left + right
def BuildTree(entries, approx=True):
    """
    Build a prefix tree from `entries` by passing a fresh `[None, None]` root
    to UpdateTree with a 128-bit address length.

    `approx` is accepted but not used here.
    """
    root = [None, None]
    return UpdateTree(root, 128, entries)
entries = []
entries: [Entry] = []
print("[INFO] Loading", file=sys.stderr)
Parse(entries)
print("[INFO] Read %i prefixes" % len(entries), file=sys.stderr)

BIN
demo.map

Binary file not shown.

100
demo.random.dat

@ -0,0 +1,100 @@
185.246.138.0/23 AS9009 # ipv4.dump:14068018
208.91.107.0/24 AS35913 # ipv4.dump:5300217
186.249.165.0/24 AS53037 # ipv4.dump:7983539
200.54.108.0/24 AS52310 # ipv4.dump:5501190
209.71.160.0/20 AS46339 # ipv4.dump:17825415
142.52.146.0/24 AS852 # ipv4.dump:488513
212.128.116.0/23 AS200521 # ipv4.dump:13356568
186.178.15.0/24 AS28006 # ipv4.dump:9201526
2a01:9700:13a7::/48 AS8376 # ipv6.dump:1037828
172.105.134.0/23 AS63949 # ipv4.dump:15412112
78.130.228.0/24 AS9070 # ipv4.dump:12119994
118.126.140.0/23 AS23724 # ipv4.dump:7123714
24.94.32.0/19 AS11351 # ipv4.dump:1434597
142.234.32.0/21 AS7979 # ipv4.dump:7022111
2405:4800:2140::/46 AS18403 # ipv6.dump:2326297
200.113.215.0/24 AS27653 # ipv4.dump:3279416
45.167.190.0/23 AS268046 # ipv4.dump:2584658
198.137.70.0/24 AS10264 # ipv4.dump:3336467
200.13.36.0/24 AS28400 # ipv4.dump:3882782
64.69.220.0/24 AS55002 # ipv4.dump:18960871
209.49.229.0/24 AS395626 # ipv4.dump:14398115
87.249.76.0/22 AS15641 # ipv4.dump:9319161
2001:67c:206c::/48 AS49788 # ipv6.dump:1800452
2604:2d80:8000::/48 AS30036 # ipv6.dump:1982081
177.53.12.0/24 AS52989 # ipv4.dump:2090089
85.192.36.0/22 AS12695 # ipv4.dump:17606664
212.126.108.0/24 AS39216 # ipv4.dump:18478672
66.13.132.0/24 AS5650 # ipv4.dump:19204708
2400:3800:6000::/37 AS9617 # ipv6.dump:2204768
5.123.160.0/20 AS44244 # ipv4.dump:16376955
2600:2100:1b::/48 AS54858 # ipv6.dump:1944131
150.242.174.0/24 AS132453 # ipv4.dump:9808415
62.221.134.0/23 AS13124 # ipv4.dump:3496079
213.153.170.0/23 AS34984 # ipv4.dump:15953301
197.149.72.0/24 AS35074 # ipv4.dump:18530955
132.97.0.0/16 AS306 # ipv4.dump:19630592
212.138.180.0/24 AS209464 # ipv4.dump:6079946
103.251.30.0/24 AS58984 # ipv4.dump:18032160
74.221.64.0/20 AS29979 # ipv4.dump:20136686
185.116.176.0/22 AS204033 # ipv4.dump:19959133
40.248.252.0/24 AS4249 # ipv4.dump:9393294
103.78.50.0/24 AS135655 # ipv4.dump:12363962
67.209.219.0/24 AS20356 # ipv4.dump:2756256
214.72.0.0/24 AS27064 # ipv4.dump:6228044
188.38.127.0/24 AS15897 # ipv4.dump:827001
2a03:f080:1000::/48 AS42685 # ipv6.dump:1301509
95.140.24.0/21 AS48739 # ipv4.dump:10034325
84.39.111.0/24 AS48200 # ipv4.dump:20443864
63.163.108.0/23 AS26724 # ipv4.dump:19309425
24.197.96.0/24 AS20115 # ipv4.dump:14008635
184.181.24.0/21 AS22773 # ipv4.dump:3568675
156.0.224.0/21 AS328220 # ipv4.dump:9584679
146.120.20.0/22 AS57901 # ipv4.dump:19852902
149.20.24.0/24 AS1280 # ipv4.dump:7015284
219.65.104.0/23 AS4755 # ipv4.dump:20183203
92.42.64.0/21 AS44764 # ipv4.dump:15718454
45.160.90.0/24 AS268414 # ipv4.dump:11064350
187.149.8.0/21 AS8151 # ipv4.dump:7827459
208.80.116.0/22 AS32354 # ipv4.dump:17877462
64.129.144.0/24 AS20251 # ipv4.dump:15149238
205.147.108.0/24 AS17439 # ipv4.dump:6170532
65.5.88.0/21 AS6389 # ipv4.dump:11859089
116.241.76.0/22 AS131596 # ipv4.dump:3725486
98.143.211.0/24 AS22639 # ipv4.dump:17293905
138.136.68.0/23 AS5972 # ipv4.dump:2055175
170.80.99.0/24 AS264829 # ipv4.dump:2425897
186.159.164.0/22 AS52228 # ipv4.dump:15901760
202.60.124.0/23 AS37970 # ipv4.dump:6011975
92.36.192.0/20 AS9146 # ipv4.dump:13920556
182.73.4.0/24 AS9498 # ipv4.dump:19297810
198.73.210.0/24 AS393304 # ipv4.dump:5060133
74.204.128.0/21 AS35986 # ipv4.dump:14710033
185.129.83.0/24 AS203616 # ipv4.dump:8719643
79.99.56.0/21 AS47212 # ipv4.dump:12465762
77.92.123.0/24 AS25145 # ipv4.dump:16610816
82.141.192.0/18 AS5466 # ipv4.dump:9919239
171.159.60.0/24 AS10794 # ipv4.dump:6186705
52.216.4.0/24 AS16509 # ipv4.dump:11150294
196.16.92.0/22 AS19969 # ipv4.dump:4553797
188.187.246.0/24 AS41668 # ipv4.dump:3501609
198.228.131.0/24 AS701 # ipv4.dump:10060201
168.77.75.0/24 AS3551 # ipv4.dump:6537912
201.197.64.0/23 AS11830 # ipv4.dump:12945306
105.235.103.0/24 AS36974 # ipv4.dump:20415098
217.72.60.0/23 AS45011 # ipv4.dump:7429652
185.118.72.0/22 AS47406 # ipv4.dump:11380081
109.70.187.0/24 AS44391 # ipv4.dump:6097074
116.50.78.0/23 AS38529 # ipv4.dump:15525205
177.152.66.0/24 AS262773 # ipv4.dump:9918217
116.93.48.0/23 AS23930 # ipv4.dump:15072801
172.110.58.0/24 AS396191 # ipv4.dump:13625296
182.50.255.0/24 AS45786 # ipv4.dump:20783094
199.48.177.0/24 AS20473 # ipv4.dump:9553840
85.91.120.0/23 AS28809 # ipv4.dump:12403000
213.172.0.0/19 AS20632 # ipv4.dump:4621070
150.196.85.0/24 AS747 # ipv4.dump:21064826
201.151.243.0/24 AS11172 # ipv4.dump:3121096
123.51.9.0/24 AS45510 # ipv4.dump:11847527
138.118.109.0/24 AS264257 # ipv4.dump:7184483
83.228.128.0/17 AS12350 # ipv4.dump:8421526

74
testmap.py

@ -1,4 +1,6 @@
#!/usr/bin/env python3
import sys
import random
import ipaddress
# Convert a byte array to a bit array
@ -66,14 +68,64 @@ def Interpret(asmap, num, bits):
else:
assert(False)
with open(sys.argv[1], "rb") as f:
asmap = DecodeBytes(f.read())
addr = ipaddress.ip_address(sys.argv[2])
if isinstance(addr, ipaddress.IPv4Address):
num = int.from_bytes(addr.packed, 'big') + 0xffff00000000
elif isinstance(addr, ipaddress.IPv6Address):
num = int.from_bytes(addr.packed, 'big')
ret = Interpret(asmap, num, 128)
if ret:
print("AS%i" % ret)
def decode_ip(ip: str) -> int:
    """
    Convert a textual IP address to its integer form.

    IPv6 addresses are returned as their big-endian integer value. IPv4
    addresses are offset by 0xffff00000000.
    """
    addr = ipaddress.ip_address(ip)
    as_int = int.from_bytes(addr.packed, 'big')
    if isinstance(addr, ipaddress.IPv4Address):
        return as_int + 0xffff00000000
    if isinstance(addr, ipaddress.IPv6Address):
        return as_int
# Usage:
#   testmap.py                       run the self-test against ./demo.map
#   testmap.py <asmap file> <ip>     look up a single address in the map
if __name__ == '__main__':
    no_args = len(sys.argv) == 1
    if len(sys.argv) == 2:
        # A map file without an address to look up: fail with a usage message
        # instead of an IndexError on sys.argv[2] further down.
        print("usage: {} [<asmap file> <ip address>]".format(sys.argv[0]),
              file=sys.stderr)
        sys.exit(2)

    filename = './demo.map' if no_args else sys.argv[1]
    with open(filename, "rb") as f:
        asmap = DecodeBytes(f.read())

    if no_args:
        # If no arguments are passed, run a test on the selection of prefixes
        # stored in demo.random.dat.
        expected = [
            ('8.8.8.8', 15169),
        ]
        failed = False

        with open('./demo.random.dat', 'r') as f:
            for line in f:
                # Drop any trailing comment; skip blank/comment-only lines.
                fields = line.split('#')[0].split()
                if not fields:
                    continue
                (ip, asn) = fields[:2]
                ip = ip.split('/')[0]
                assert(asn[:2] == 'AS')
                asn = int(asn[2:])

                # Make the IP concrete and randomize it somewhat within the
                # subnet (IPv4 only: the last octet is replaced).
                if ':' not in ip:
                    ip = '.'.join(ip.split('.')[:3]) + '.{}'.format(
                        random.randint(0, 16))

                expected.append((ip, asn))

        for ip, asn in expected:
            got = Interpret(asmap, decode_ip(ip), 128)
            if got != asn:
                failed = True
                print("{} failed! Got {}, expected {}".format(
                    ip, got, asn), file=sys.stderr)
            else:
                print("{} passed".format(ip))

        sys.exit(1 if failed else 0)
    else:
        ret = Interpret(asmap, decode_ip(sys.argv[2]), 128)
        if ret:
            print("AS%i" % ret)

Loading…
Cancel
Save