From e69d55f27c4d9fdc709ca342e93f85447d00fe6f Mon Sep 17 00:00:00 2001 From: James O'Beirne Date: Wed, 15 Jan 2020 16:04:16 -0500 Subject: [PATCH] documentation and refactoring in buildmap --- buildmap.py | 227 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 189 insertions(+), 38 deletions(-) diff --git a/buildmap.py b/buildmap.py index 1636ee6..3f80545 100644 --- a/buildmap.py +++ b/buildmap.py @@ -1,25 +1,94 @@ +""" +Intake a map of IP prefixes -> AS numbers and output instructions that will +allow a decoder to match an IP address to an ASN by following a sequence +of instructions. + +The instructions describe a prefix tree that can be navigated using the bits of +an IP address (i.e. 0 for left child, 1 for right child, leaf nodes +corresponding to a given ASN). The types of instructions are denoted by the +*Type() functions defined below. Once an IP address specifies a bit for which +there is no path in the tree (i.e. the part of its address more specific than +any known network prefix), the tree returns a "default" ASN value that has been +set based on the last valid location in the tree. + +See `testmap.py:Interpret` for an illustration of how the decoding process +works. + +Before the prefix tree is encoded into instructions using bits, it is compacted +(e.g. duplicate subtrees removed) and annotated with which default ASN values +should be set for particular regions of the tree. +""" import sys -import re import ipaddress +from collections import namedtuple + + +def Parse(entries: list): + """ + Read in a file of the format -def Parse(entries): + 1.0.0.0/24 AS13335 # ipv4.dump:4856343 + 1.0.4.0/22 AS56203 # ipv4.dump:2759291 + ... + + Ignoring comments following '#'. Creates an Entry object for each line. + Maps IPv4 networks into IPv6 space. + + Args: + entries: modified in place with the new Entrys. + """ for line in sys.stdin: line = line.split('#')[0].lstrip(' ').rstrip(' \r\n') prefix, asn = line.split(' ') assert(len(asn) > 2 and asn[:2] == "AS") network = ipaddress.ip_network(prefix) + + prefix_len = network.prefixlen + net_addr = int.from_bytes(network.network_address.packed, 'big') + + # Map an IPv4 prefix into IPv6 space. if isinstance(network, ipaddress.IPv4Network): - entries.append((network.prefixlen + 96, int.from_bytes(network.network_address.packed, 'big') + 0xffff00000000, int(asn[2:]))) - elif isinstance(network, ipaddress.IPv6Network): - entries.append((network.prefixlen, int.from_bytes(network.network_address.packed, 'big'), int(asn[2:]))) + prefix_len += 96 + net_addr += 0xffff00000000 + + entries.append(Entry(prefix_len, net_addr, int(asn[2:]))) -# Add a list of (prefixlen, addrbits, asn) entries to a tree -def UpdateTree(gtree, addrlen, entries): + +Entry = namedtuple('Entry', ( + # The length of the network prefix in bits. E.g. '26' for 255.255.0.0/26. + 'prefix_len', + + # An int containing the bits of the network address. + 'net_addr', + + # An int for the autonomous system (AS) number. + 'asn', +)) + + +def UpdateTree(gtree, addrlen: int, entries: [Entry]): + """ + Returns a prefix tree such that following a path down through the + tree based on the bits of a network prefix (in order of most significant + bit) leads to an ASN. + + Args: + gtree: tree structure to encode the mappings into. Modified in-place. + addrlen: The maximum number of bits in a network address. + This is 128 for IPv6 (16 bytes). + entries: The network prefix -> ASN mappings to encode. + """ for prefix, val, asn in sorted(entries): tree = gtree default = None + + # Iterate through each bit in the network prefix, starting with the + # most significant bit. for i in range(prefix): bit = (val >> (addrlen - 1 - i)) & 1 + + # If we have passed the end of the network prefix, all entries + # under subsequent bits will be associated with the same ASN. needs_inner = i < prefix - 1 if tree[bit] is None: if needs_inner: @@ -44,9 +113,20 @@ def UpdateTree(gtree, addrlen, entries): tree = tree[bit] return gtree -# Remove redundancy from a tree. -# If approx is True, unassigned ranges may get reassigned to arbitrary ASNs. -def CompactTree(tree, approx=True): + +def CompactTree(tree, approx=True) -> (list, set): + """ + Remove redundancy from a tree. + + E.g. if all nodes in a subtree point to the same ASN, compact the subtree + into a single int. + + Returns: + (the compacted tree, a set of all ASNs in the tree) + + Args: + approx: if True, unassigned ranges may get reassigned to arbitrary ASNs. + """ num = 0 if tree is None: return (tree, set()) @@ -63,60 +143,119 @@ def CompactTree(tree, approx=True): return tree[0], set([tree[0]]) return (tree, allas) -# Get the (key, value) with maximum value from a dict. -def DictMax(d): - mk = None - mv = None - for k, v in d.items(): - if mv is None or v > mv: - mk, mv = k, v - return mk, mv - -# Annotate internal nodes in the tree with the most common leafs below it. -# The binary serialization later uses this. -def PropTree(tree, approx=True): + +def PropTree(tree, approx=True) -> (list, Counter, bool): + """ + Annotate internal nodes in the tree with the most common leafs below it. + The binary serialization later uses this. + + This changes the shape of the `tree` datastructure from + `[left_child, right_child]` to `[lc, rc, max_ASN_in_tree]`. + + Returns: + (tree, Counter of ASNs in tree, whether or not tree is empty) + """ if tree is None: - return (tree, {}, True) + return (tree, Counter(), True) if isinstance(tree, int): - return (tree, {tree: 1}, False) + return (tree, Counter({tree: 1}), False) tree[0], leftcnt, leftnone = PropTree(tree[0], approx) tree[1], rightcnt, rightnone = PropTree(tree[1], approx) - allcnt = {k: leftcnt.get(k, 0) + rightcnt.get(k, 0) for k in set(leftcnt) | set(rightcnt)} + allcnt = leftcnt + rightcnt allnone = leftnone | rightnone - maxasn, maxcount = DictMax(allcnt) + maxasn, maxcount = allcnt.most_common(1)[0] if maxcount is not None and maxcount >= 2 and (approx or not allnone): - return ([tree[0], tree[1], maxasn], {maxasn: 1}, allnone) + return ([tree[0], tree[1], maxasn], Counter({maxasn: 1}), allnone) return (tree, allcnt, allnone) -def EncodeBits(val, minval, bit_sizes): + +def EncodeBits(val, minval, bit_sizes) -> [int]: + """ + Perform a variable-length encoding of a value to bits, least significant + bit first. + + For each `bit_sizes` passed, attempt to encode the value with that number + of bits + 1. Normalize the encoded value by `minval` to potentially save + bits - the value will be corrected during decoding. + + Returns: + a list of bits representing the value to encode. + """ val -= minval ret = [] for pos in range(len(bit_sizes)): bit_size = bit_sizes[pos] + + # If the value will not fit in `bit_size` bits, absorb the largest + # value for this bitsize and continue to the next smallest size. if val >= (1 << bit_size): val -= (1 << bit_size) ret += [1] else: + # If we aren't encoding the largest possible value per the largest + # bitsize... if (pos + 1 < len(bit_sizes)): ret += [0] + + # Use remaining bits to encode the rest of val. for b in range(bit_size): ret += [(val >> (bit_size - 1 - b)) & 1] return ret + + # Couldn't fit val into any of the bit_sizes assert(False) -def EncodeType(v): +def MatchType() -> [int]: + """ + The match instruction descends into the tree based on a bit path. If at any + point the match fails to hit a valid path through the tree, it will fail + and return the current default ASN (which changes as we move through the + tree). + """ + return EncodeType(2) + +def JumpType() -> [int]: + """ + The jump instruction allows us to quickly seek to one side of the tree + or the other. By encoding the length of the left child, we can skip over + it to the right child if need be. + """ + return EncodeType(1) + +def LeafType() -> [int]: + """The leaf instruction encodes an ASN at the end of a bit path.""" + return EncodeType(0) + +def SetNewDefaultType() -> [int]: + """ + This instruction establishes a new default ASN to return should we fail + while traversing this path. + """ + return EncodeType(3) + +def EncodeType(v) -> [int]: return EncodeBits(v, 0, [0, 0, 1]) -def EncodeASN(v): +def EncodeASN(v) -> [int]: + # It's reasonable to ask why "15" (indicating 16 bits) is the minimum size + # we might try to pack an ASN into, given there are many ASNs below 2**16. + # + # The reason that we start at 15 here is because we want the first bitsize + # we specify to contain ~50% of the values we are trying to encode - this + # is because each separate bitsize we try will add a digit to our encoded + # values, so we simultaneously want to minimize the number of bitsizes we + # allow while also minimizing the bit length of the encoded data, which + # is a trade-off. return EncodeBits(v, 1, [15, 16, 17, 18, 19, 20, 21, 22, 23, 24]) -def EncodeMatch(v): +def EncodeMatch(v) -> [int]: return EncodeBits(v, 2, [1, 2, 3, 4, 5, 6, 7, 8]) -def EncodeJump(v): +def EncodeJump(v) -> [int]: return EncodeBits(v, 17, [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]) -def EncodeBytes(bits): +def EncodeBytes(bits) -> [int]: + """Encode a sequence of bits as a sequence of bytes.""" val = 0 nbits = 0 bytes = [] @@ -135,6 +274,9 @@ def TreeSer(tree, default): match = 1 assert(tree is not None) assert(not (isinstance(tree, int) and tree == default)) + + # If one side of the tree is empty (i.e. represents a path without + # choices), encode a match instruction up to 8 bits. while isinstance(tree, list) and match <= 0xFF: if tree[0] is None or tree[0] == default: match = (match << 1) + 1 @@ -145,21 +287,30 @@ def TreeSer(tree, default): else: break if match >= 2: - return EncodeType(2) + EncodeMatch(match) + TreeSer(tree, default) + return MatchType() + EncodeMatch(match) + TreeSer(tree, default) + + # Leaf node: return the ASN. if isinstance(tree, int): - return EncodeType(0) + EncodeASN(tree) + return LeafType() + EncodeASN(tree) + + # Return the tree along with a new "default" ASN value should we fail to + # match while along this path. if len(tree) > 2 and tree[2] != default: - return EncodeType(3) + EncodeASN(tree[2]) + TreeSer(tree, tree[2]) + return SetNewDefaultType() + EncodeASN(tree[2]) + TreeSer(tree, tree[2]) + left = TreeSer(tree[0], default) right = TreeSer(tree[1], default) - return EncodeType(1) + EncodeJump(len(left)) + left + right + + # Start the program by specifying a possible jump to either child of the + # first node. + return JumpType() + EncodeJump(len(left)) + left + right def BuildTree(entries, approx=True): tree = [None, None] tree = UpdateTree(tree, 128, entries) return tree -entries = [] +entries: [Entry] = [] print("[INFO] Loading", file=sys.stderr) Parse(entries) print("[INFO] Read %i prefixes" % len(entries), file=sys.stderr)