IP-to-country lookup

A while ago my brother and I started playing with the IP to country code tables provided by ARIN and others.

The problem is this:

Given IP 123.4.5.6 and a list of ranges like this:

 0.0.0.0 to 4.255.255.255 us
 5.0.0.1 to 5.10.255.255 nl
 5.15.0.0 to 20.1.4.128 de
 ...
 255.0.0.0 to 255.255.255.255 xx

find the range, if any, that corresponds to the given IP.

If you sort the list, you can do a binary search and you can improve upon that by providing an index.

You can get a copy of the full list (compiled mid 2002) here [1] that the code below needs.

This code illustrates, I think it does anyway, that the way you do something in C is likely not the way to do it in Tcl. In Tcl, two separate lists that hold the IP ranges and corresponding country codes would likely be much, much faster. As it is, the Tcl (8.4.2 on linux) code is about 430 times slower than the C implementation.

16May2003 MS: Please see further down for a modified implementation that is about 25 times slower than C.

13May2003 PS.

This is the code in C:

    #include <stdio.h>
    #include <stdlib.h>

    #define REC_OFFSET 8
    #define REC_SIZE   18

    struct record_t {
        unsigned int low;
        unsigned int high;
        char country[3];
    };

    unsigned int IpToUInt(const char *ipStr)
        {
            unsigned int d1=0, d2=0, d3=0, d4=0;

            sscanf(ipStr, "%u.%u.%u.%u", &d1, &d2, &d3, &d4);

            return ( ((d1 & 0xFF) << 24) | \
                     ((d2 & 0xFF) << 16) | \
                     ((d3 & 0xFF) << 8)  | \
                     (d4 & 0xFF) );

        }

    unsigned int HexToUInt(const char *hexIp)
        {
            return (int)strtoul(hexIp, NULL, 16);
        }

    int GetRecord(struct record_t *r, int num, FILE *infile)
        {
            char data[9];
            data[8]=0; // terminate data.

            //seek to the selected record:
            if ( fseek(infile, REC_OFFSET + num * REC_SIZE, SEEK_SET)==-1) return 0;

            //read&parse low IP
            if( !fread(&data, 8, 1, infile) ) {
                return 0;
            }
            r->low = HexToUInt(data);

            //read&parse high IP
            if( !fread(&data, 8, 1, infile) ) {
                return 0;
            }
            r->high = HexToUInt(data);

            //read country
            if( !fread(r->country, 2, 1, infile) ) {
                return 0;
            }

            r->country[2]=0; //terminate string

            return 1;

        }

    const char *lookup(unsigned int ip, int records, struct record_t *r, int *index, int *hindex)
        {
            int steps=0;
            int found=0;

            unsigned int myMin=0, myMax=0, avg;

            myMin=index[ip >> 24];
            myMax=hindex[ip >> 24];
            while( myMax > myMin ) {
                steps++;

                avg = (myMax + myMin) /2;

                if( r[avg].low <= ip && r[avg].high >= ip ) {
                    found=1;
                    break;
                } else if (r[avg].low <= ip) {
                    myMin = avg;
                } else {
                    myMax = avg;
                }
                if (steps > 32) {
                    //Infiniteloop
                    break;
                }
            }
            if (found) {
                return r[avg].country;
            } else {

                //check one above:
                if ( avg < records )
                    if( r[avg+1].low <= ip && r[avg+1].high >= ip )
                        return r[avg+1].country;

                //check one below:
                if (avg > 0)
                    if( r[avg-1].low <= ip && r[avg-1].high >= ip )
                        return r[avg-1].country;

                return 0;
            }

        }

    int main (int argc, char *argv[])
        {
            unsigned int ip;
            char *country;
            FILE *infile=0;
            int i;

            char data[9];
            struct record_t *r;
            int index[256], hindex[256], indexptr;
            int records=0;

            if (argc < 2) {
                fprintf(stderr, "Usage: lookup <dotted-decimal-ip> <...>\n");
                return 2;
            }

            if( (infile=fopen("opentarget.dat", "r"))==NULL ) {
                fprintf(stderr, "Could not open database\n");
                return 2;
            }

            data[8]=0; // terminate data.

            if ( fseek(infile, 0, SEEK_SET)==-1) return 0;
            if( !fread(&data, 8, 1, infile) ) {
                fprintf(stderr, "File read error while initialising\n");
                return 2;
            }
            records = HexToUInt(data);

            if ( (r = malloc(sizeof(struct record_t)*records)) == NULL) {
                fprintf(stderr, "Insufficient memory\n");
                return 2;
            }

            for (i=0; i<256; i++) index[i]=hindex[i]=0;
            index[0]=0;
            indexptr=0;

            for (i=0; i < records; i++) {
                //read&parse low IP
                if( !fread(&data, 8, 1, infile) ) {
                    fprintf(stderr, "Error reading database record %d\n", i);
                    return 2;
                }
                r[i].low = HexToUInt(data);

                //read&parse high IP
                if( !fread(&data, 8, 1, infile) ) {
                    fprintf(stderr, "Error reading database record %d\n", i);
                    return 2;
                }
                r[i].high = HexToUInt(data);

                //read country
                if( !fread(r[i].country, 2, 1, infile) ) {
                    fprintf(stderr, "Error reading database record %d\n", i);
                    return 2;
                }

                r[i].country[2]=0; //terminate string

                //add to index?
                if (indexptr < (r[i].low >> 24)) {
                    indexptr=r[i].low >> 24;
                    index[indexptr]=i;
                }
            }
            fclose(infile);

            for (i=1; i<256; i++) {
                if(index[i]==0) index[i]=index[i-1];
            }
            hindex[255]=records;
            for (i=254; i>=0; i--) {
                if (index[i+1]>index[i]) {
                    hindex[i]=index[i+1];
                } else {
                    hindex[i]=hindex[i+1];
                }
            }

            //for (i=0; i<256; i++) printf("%04x", index[i]);
            //for (i=0; i<256; i++) printf("%04x", hindex[i]);
            //return 0;
            for (i=1; i<argc; i++) {
                // uncomment the next loop to do a speed test:
                //int j;
                //for (j=0; j<1000000; j++) {
                    ip=IpToUInt(argv[i]);
                    country=lookup(ip, records, r, index, hindex);
                //}
                if ( country!=NULL ) {
                    printf("%s %s\n", argv[i], country);
                } else {
                    printf("%s NOMATCH\n", argv[i]);
                }
            }
            return 0;

        }

The same code in Tcl looks like this:

 namespace eval ::opentarget {

    set max 0
    set theArray(0) foo
    set dbLoaded 0
    set theList [list]

    proc calcIp {ip} {
        set lIp [split $ip .]
        while {[llength $lIp] < 4} {
            lappend lIp 0
        }
        set res [expr ((((([lindex $lIp 0].0 * 256.0) + [lindex $lIp 1].0) * 256.0) + [lindex $lIp 2].0) * 256.0) + [lindex $lIp 3].0]
        return [lindex [split $res .] 0]
    }

    proc calcIpS {ip} {
        #calc IP, but drop the LSB
        set lIp [split $ip .]
        while {[llength $lIp] < 4} {
            lappend lIp 0
        }
        set res [expr (((( [lindex $lIp 0]) * 256) + [lindex $lIp 1]) * 256) + [lindex $lIp 2]]
        return $res
    }

    proc calcIpHex {hex} {
        return [format %u 0x$hex]
    }

    proc openFile {fileName} {
        regsub -all {\\} $fileName {/} fileName
        variable max
        set if [open $fileName r]
        set lenHex [read $if 8]
        set len [format %i 0x$lenHex]
        set max [expr $len - 1]
        return $if
    }

    proc readDatabase {fileName} {
        regsub -all {\\} $fileName {/} fileName
        set key -1
        variable theArray
        variable theList
        variable max
        variable dbLoaded
        set if [open $fileName r]
        set max [expr 1 * 0x[read $if 8]]
        while {![eof $if] && [string length [set entry [read $if 18]]] == 18} {
            lappend line [string range $entry 0 7]
            lappend line [string range $entry 8 15]
            lappend line [string range $entry 16 17]
            incr key
            set theArray($key) $line
            unset line
            if { ![string equal [string range $entry 6 7] "00"] } {
                #puts "$key starts at [string range $entry 6 7]"
            }
            if { ![string equal [string range $entry 14 15] "ff"] } {
                #puts "$key ends at [string range $entry 14 15]"
            }
            lappend line [calcIpHex [string range $entry 0 5]]
            lappend line [calcIpHex [string range $entry 8 13]]
            lappend line [string range $entry 16 17]
            lappend theList $line
            unset line

        }

        set max $key
        close $if
        set dbLoaded 1
    }

    proc lookup {args} {
        variable theArray
        variable dbLoaded
        if {[llength $args] == 1 && !$dbLoaded} {
            error "Database not loaded"
        } elseif {[llength $args] == 1 && $dbLoaded} {
            lookupArray [lindex $args 0]
        } else {
            lookupFile [lindex $args 0] [lindex $args 1]
        }
    }

    proc lookupArray {ip} {

        variable theArray
        variable max
        variable dbLoaded
        if {!$dbLoaded} {error "database not loaded"}

        set nip [calcIp $ip]
        set myMin 0
        set myMax $max
        set steps 0
        set avg 0
        while {[expr $myMax - $myMin] > 0 && $steps < 32} {
            incr steps
            if {$avg == 1} {
                set avg 0
            } else {
                set avg [expr $myMax - (($myMax - $myMin) /2)]
            }
            set current $theArray($avg)
            set low [calcIpHex [lindex $current 0]]
            set hi [calcIpHex  [lindex $current 1]]

            if {[expr "$low.0 <= $nip.0"] && [expr "$hi.0 >= $nip.0"]} {
                return [lindex $current 2]
            } elseif {[expr "$low.0 <= $nip.0"]} {
                set myMin $avg
            } else {
                set myMax $avg
            }
        }

        return ""
    }

    proc lookupArrayS {ip} {

        variable theList
        variable max
        variable dbLoaded
        if {!$dbLoaded} {error "database not loaded"}

        set nip [calcIpS $ip]
        set myMin 0
        set myMax $max
        set steps 0
        set avg 0
        while {[expr $myMax - $myMin] > 0 && $steps < 32} {
            incr steps
            if {$avg == 1} {
                set avg 0
            } else {
                set avg [expr $myMax - (($myMax - $myMin) /2)]
            }
            set current [lindex $theList $avg]
            set low [lindex $current 0]
            set hi [lindex $current 1]

            if {[expr $low <= $nip] && [expr $hi >= $nip]} {
                return [lindex $current 2]
            } elseif {[expr $low <= $nip]} {
                set myMin $avg
            } else {
                set myMax $avg
            }
        }

        return ""
    }

    proc lookupFile {fileName ip} {
        variable max
        set if [openFile $fileName]
        set found 0
        set nip [calcIp $ip]
        set myMin 0
        set myMax $max
        set steps 0
        while {[expr $myMax - $myMin] > 0 && $steps < 32} {
            incr steps
            if {$myMax == 1} {
                set avg 0
            } else {
                set avg [expr $myMax - (($myMax - $myMin) /2)]
            }
            seek $if [expr 8 + ($avg * 18)] start
            set low [calcIpHex [read $if 8]]
            set hi [calcIpHex [read $if 8]]
            set country [read $if 2]

            if {[expr $low.0 <= $nip.0] && [expr $hi.0 >= $nip.0]} {
                set found 1
                set return $country
                break
            } elseif {[expr $low.0 <= $nip.0]} {
                set myMin $avg
            } else {
                set myMax $avg
            }
        }
        close $if
        if {!$found} {
            return ""
        } else {
            return $return
        }

    }

 }

 opentarget::readDatabase opentarget.dat
 puts [time {
 set foo [::opentarget::lookupArray $args]
 } 1000]

RS has this version:

        proc ip2s ip {eval format %02x%02x%02x%02x [split $ip .]}
        set fp [open [file dirname [info script]]/opentarget.dat]
        read $fp 8 ;# skip record count
        set data {}
        while 1 {
            set from [read $fp 8]
            set to [read $fp 8]
            set country [read $fp 2]
            if [eof $fp] break
            lappend data s$from s$to $country
        }
        close $fp
        foreach arg $argv {
            set ips s[ip2s $arg]
            foreach {from to country} $data {
                if {$from<=$ips && $to>=$ips} {puts $arg:$country; break}
            }
        }

which may not be optimally fast, but responds fast enough when called from command line. Most of all, it is extremely short (compared to the original), so easyly maintained.

ulis, 2004-12-18. I don't know how slow is this script but I trust it.

MS did some performance tuning on the original algorithm presented here, modifying the list-based lookupArrayS to avoid runtime parsing (lookupArrayS1) and by a slight change in the loop algorithm (lookupArrayS2).

This produced a 17x speedup over the original (lookupArray) - so that this algorithm is now about 25 times slower than C.

     proc lookupArrayS1 {ip} {

        variable theList
        variable max
        variable dbLoaded
        if {!$dbLoaded} {error "database not loaded"}

        set nip [calcIpS $ip]
        set myMin 0
        set myMax $max
        set steps 0
        set avg 0
        while {$myMax > $myMin && $steps < 32} {
            incr steps
            if {$avg == 1} {
                set avg 0
            } else {
                set avg [expr {$myMax - (($myMax - $myMin) /2)}]
            }
            set current [lindex $theList $avg]
            set low [lindex $current 0]
            set hi [lindex $current 1]

            if {$low <= $nip && $hi >= $nip} {
                return [lindex $current 2]
            } elseif {$low <= $nip} {
                set myMin $avg
            } else {
                set myMax $avg
            }
        }
    }

    proc lookupArrayS2 {ip} {

        variable theList
        variable max
        variable dbLoaded
        if {!$dbLoaded} {error "database not loaded"}

        set nip [calcIpS $ip]
        set myMin 0
        set myMax $max

        while {$myMin <= $myMax} {
            set avg [expr {($myMax + $myMin) /2}]
            set current [lindex $theList $avg]

            # $current is now a 3-list of the form
            #    {low-value hi-value country-code}

            if {[lindex $current 0] > $nip} {
                set myMax [expr {$avg - 1}]
            } elseif {[lindex $current 1] < $nip} {
                set myMin [expr {$avg + 1}]
            } else {
                return [lindex $current 2]
            }
        }
        # no suitable range found: return ""
    }

 % time {opentarget::lookupArray 128.1.0.1} 1000
 2788 microseconds per iteration
 % time {opentarget::lookupArrayS 128.1.0.1} 1000
 2044 microseconds per iteration
 % time {opentarget::lookupArrayS1 128.1.0.1} 1000
 199 microseconds per iteration
 % time {opentarget::lookupArrayS2 128.1.0.1} 1000
 161 microseconds per iteration
 %
 % expr 430*161/2788.
 24.831420373

14May2003 PS Cool. Great improvement. I see a 13 fold increase between lookupArrayS and S2. Comparing with lookupArray is not fair because ArrayS only does a partial lookup, masking the final/LSB byte in the IP address. Is S2 still guaranteed to return? That is why I added the steps counter (which should actually start at 15, because there are only about 17000 entries in the list).

Now on to writing a TIP for [lsearch -(lower|upper)bound] with implementation and the Tcl version should be very close to the C version.

15May2003 MS I modified S2 slightly with an additional optimisation (having an esthetic rather than performance impact). S2 is indeed guaranteed to return, as (a) myMin or myMax converge at every non-returning iteration, (b) S2 returns "" when they cross over. The original S (and S1) can leave both myMin and myMax unmodified (when they differ by one). Note that this improved algorithm can also be used in the C implementation.

Category Internet