Updated 2010-06-20 07:17:46 by bob

A while ago my brother and I started playing with the IP to country code tables provided by ARIN and others.

The problem is this:

Given IP 123.4.5.6 and a list of ranges like this:
``` 0.0.0.0 to 4.255.255.255 us
5.0.0.1 to 5.10.255.255 nl
5.15.0.0 to 20.1.4.128 de
...
255.0.0.0 to 255.255.255.255 xx```

find the range, if any, that corresponds to the given IP.

If you sort the list, you can do a binary search and you can improve upon that by providing an index.

You can get a copy of the full list (compiled mid 2002) here [1] that the code below needs.

This code illustrates, I think it does anyway, that the way you do something in C is likely not the way to do it in Tcl. In Tcl, two separate lists that hold the IP ranges and corresponding country codes would likely be much, much faster. As it is, the Tcl (8.4.2 on linux) code is about 430 times slower than the C implementation.

16May2003 MS: Please see further down for a modified implementation that is about 25 times slower than C.

13May2003 PS.

This is the code in C:
```    #include <stdio.h>
#include <stdlib.h>

#define REC_OFFSET 8
#define REC_SIZE   18

struct record_t {
unsigned int low;
unsigned int high;
char country[3];
};

unsigned int IpToUInt(const char *ipStr)
{
unsigned int d1=0, d2=0, d3=0, d4=0;

sscanf(ipStr, "%u.%u.%u.%u", &d1, &d2, &d3, &d4);

return ( ((d1 & 0xFF) << 24) | \
((d2 & 0xFF) << 16) | \
((d3 & 0xFF) << 8)  | \
(d4 & 0xFF) );

}

unsigned int HexToUInt(const char *hexIp)
{
return (int)strtoul(hexIp, NULL, 16);
}

int GetRecord(struct record_t *r, int num, FILE *infile)
{
char data[9];
data[8]=0; // terminate data.

//seek to the selected record:
if ( fseek(infile, REC_OFFSET + num * REC_SIZE, SEEK_SET)==-1) return 0;

if( !fread(&data, 8, 1, infile) ) {
return 0;
}
r->low = HexToUInt(data);

if( !fread(&data, 8, 1, infile) ) {
return 0;
}
r->high = HexToUInt(data);

if( !fread(r->country, 2, 1, infile) ) {
return 0;
}

r->country[2]=0; //terminate string

return 1;

}

const char *lookup(unsigned int ip, int records, struct record_t *r, int *index, int *hindex)
{
int steps=0;
int found=0;

unsigned int myMin=0, myMax=0, avg;

myMin=index[ip >> 24];
myMax=hindex[ip >> 24];
while( myMax > myMin ) {
steps++;

avg = (myMax + myMin) /2;

if( r[avg].low <= ip && r[avg].high >= ip ) {
found=1;
break;
} else if (r[avg].low <= ip) {
myMin = avg;
} else {
myMax = avg;
}
if (steps > 32) {
//Infiniteloop
break;
}
}
if (found) {
return r[avg].country;
} else {

//check one above:
if ( avg < records )
if( r[avg+1].low <= ip && r[avg+1].high >= ip )
return r[avg+1].country;

//check one below:
if (avg > 0)
if( r[avg-1].low <= ip && r[avg-1].high >= ip )
return r[avg-1].country;

return 0;
}

}

int main (int argc, char *argv[])
{
unsigned int ip;
char *country;
FILE *infile=0;
int i;

char data[9];
struct record_t *r;
int index[256], hindex[256], indexptr;
int records=0;

if (argc < 2) {
fprintf(stderr, "Usage: lookup <dotted-decimal-ip> <...>\n");
return 2;
}

if( (infile=fopen("opentarget.dat", "r"))==NULL ) {
fprintf(stderr, "Could not open database\n");
return 2;
}

data[8]=0; // terminate data.

if ( fseek(infile, 0, SEEK_SET)==-1) return 0;
if( !fread(&data, 8, 1, infile) ) {
fprintf(stderr, "File read error while initialising\n");
return 2;
}
records = HexToUInt(data);

if ( (r = malloc(sizeof(struct record_t)*records)) == NULL) {
fprintf(stderr, "Insufficient memory\n");
return 2;
}

for (i=0; i<256; i++) index[i]=hindex[i]=0;
index[0]=0;
indexptr=0;

for (i=0; i < records; i++) {
if( !fread(&data, 8, 1, infile) ) {
fprintf(stderr, "Error reading database record %d\n", i);
return 2;
}
r[i].low = HexToUInt(data);

if( !fread(&data, 8, 1, infile) ) {
fprintf(stderr, "Error reading database record %d\n", i);
return 2;
}
r[i].high = HexToUInt(data);

if( !fread(r[i].country, 2, 1, infile) ) {
fprintf(stderr, "Error reading database record %d\n", i);
return 2;
}

r[i].country[2]=0; //terminate string

if (indexptr < (r[i].low >> 24)) {
indexptr=r[i].low >> 24;
index[indexptr]=i;
}
}
fclose(infile);

for (i=1; i<256; i++) {
if(index[i]==0) index[i]=index[i-1];
}
hindex[255]=records;
for (i=254; i>=0; i--) {
if (index[i+1]>index[i]) {
hindex[i]=index[i+1];
} else {
hindex[i]=hindex[i+1];
}
}

//for (i=0; i<256; i++) printf("%04x", index[i]);
//for (i=0; i<256; i++) printf("%04x", hindex[i]);
//return 0;
for (i=1; i<argc; i++) {
// uncomment the next loop to do a speed test:
//int j;
//for (j=0; j<1000000; j++) {
ip=IpToUInt(argv[i]);
country=lookup(ip, records, r, index, hindex);
//}
if ( country!=NULL ) {
printf("%s %s\n", argv[i], country);
} else {
printf("%s NOMATCH\n", argv[i]);
}
}
return 0;

}```

The same code in Tcl looks like this:
``` namespace eval ::opentarget {

set max 0
set theArray(0) foo
set theList [list]

proc calcIp {ip} {
set lIp [split \$ip .]
while {[llength \$lIp] < 4} {
lappend lIp 0
}
set res [expr ((((([lindex \$lIp 0].0 * 256.0) + [lindex \$lIp 1].0) * 256.0) + [lindex \$lIp 2].0) * 256.0) + [lindex \$lIp 3].0]
return [lindex [split \$res .] 0]
}

proc calcIpS {ip} {
#calc IP, but drop the LSB
set lIp [split \$ip .]
while {[llength \$lIp] < 4} {
lappend lIp 0
}
set res [expr (((( [lindex \$lIp 0]) * 256) + [lindex \$lIp 1]) * 256) + [lindex \$lIp 2]]
return \$res
}

proc calcIpHex {hex} {
return [format %u 0x\$hex]
}

proc openFile {fileName} {
regsub -all {\\} \$fileName {/} fileName
variable max
set if [open \$fileName r]
set len [format %i 0x\$lenHex]
set max [expr \$len - 1]
return \$if
}

regsub -all {\\} \$fileName {/} fileName
set key -1
variable theArray
variable theList
variable max
set if [open \$fileName r]
set max [expr 1 * 0x[read \$if 8]]
while {![eof \$if] && [string length [set entry [read \$if 18]]] == 18} {
lappend line [string range \$entry 0 7]
lappend line [string range \$entry 8 15]
lappend line [string range \$entry 16 17]
incr key
set theArray(\$key) \$line
unset line
if { ![string equal [string range \$entry 6 7] "00"] } {
#puts "\$key starts at [string range \$entry 6 7]"
}
if { ![string equal [string range \$entry 14 15] "ff"] } {
#puts "\$key ends at [string range \$entry 14 15]"
}
lappend line [calcIpHex [string range \$entry 0 5]]
lappend line [calcIpHex [string range \$entry 8 13]]
lappend line [string range \$entry 16 17]
lappend theList \$line
unset line

}

set max \$key
close \$if
}

proc lookup {args} {
variable theArray
if {[llength \$args] == 1 && !\$dbLoaded} {
} elseif {[llength \$args] == 1 && \$dbLoaded} {
lookupArray [lindex \$args 0]
} else {
lookupFile [lindex \$args 0] [lindex \$args 1]
}
}

proc lookupArray {ip} {

variable theArray
variable max

set nip [calcIp \$ip]
set myMin 0
set myMax \$max
set steps 0
set avg 0
while {[expr \$myMax - \$myMin] > 0 && \$steps < 32} {
incr steps
if {\$avg == 1} {
set avg 0
} else {
set avg [expr \$myMax - ((\$myMax - \$myMin) /2)]
}
set current \$theArray(\$avg)
set low [calcIpHex [lindex \$current 0]]
set hi [calcIpHex  [lindex \$current 1]]

if {[expr "\$low.0 <= \$nip.0"] && [expr "\$hi.0 >= \$nip.0"]} {
return [lindex \$current 2]
} elseif {[expr "\$low.0 <= \$nip.0"]} {
set myMin \$avg
} else {
set myMax \$avg
}
}

return ""
}

proc lookupArrayS {ip} {

variable theList
variable max

set nip [calcIpS \$ip]
set myMin 0
set myMax \$max
set steps 0
set avg 0
while {[expr \$myMax - \$myMin] > 0 && \$steps < 32} {
incr steps
if {\$avg == 1} {
set avg 0
} else {
set avg [expr \$myMax - ((\$myMax - \$myMin) /2)]
}
set current [lindex \$theList \$avg]
set low [lindex \$current 0]
set hi [lindex \$current 1]

if {[expr \$low <= \$nip] && [expr \$hi >= \$nip]} {
return [lindex \$current 2]
} elseif {[expr \$low <= \$nip]} {
set myMin \$avg
} else {
set myMax \$avg
}
}

return ""
}

proc lookupFile {fileName ip} {
variable max
set if [openFile \$fileName]
set found 0
set nip [calcIp \$ip]
set myMin 0
set myMax \$max
set steps 0
while {[expr \$myMax - \$myMin] > 0 && \$steps < 32} {
incr steps
if {\$myMax == 1} {
set avg 0
} else {
set avg [expr \$myMax - ((\$myMax - \$myMin) /2)]
}
seek \$if [expr 8 + (\$avg * 18)] start
set low [calcIpHex [read \$if 8]]
set hi [calcIpHex [read \$if 8]]

if {[expr \$low.0 <= \$nip.0] && [expr \$hi.0 >= \$nip.0]} {
set found 1
set return \$country
break
} elseif {[expr \$low.0 <= \$nip.0]} {
set myMin \$avg
} else {
set myMax \$avg
}
}
close \$if
if {!\$found} {
return ""
} else {
return \$return
}

}

}

puts [time {
set foo [::opentarget::lookupArray \$args]
} 1000]```

RS has this version:
```        proc ip2s ip {eval format %02x%02x%02x%02x [split \$ip .]}
set fp [open [file dirname [info script]]/opentarget.dat]
read \$fp 8 ;# skip record count
set data {}
while 1 {
if [eof \$fp] break
lappend data s\$from s\$to \$country
}
close \$fp
foreach arg \$argv {
set ips s[ip2s \$arg]
foreach {from to country} \$data {
if {\$from<=\$ips && \$to>=\$ips} {puts \$arg:\$country; break}
}
}```

which may not be optimally fast, but responds fast enough when called from command line. Most of all, it is extremely short (compared to the original), so easyly maintained.

ulis, 2004-12-18. I don't know how slow is this script but I trust it.

MS did some performance tuning on the original algorithm presented here, modifying the list-based lookupArrayS to avoid runtime parsing (lookupArrayS1) and by a slight change in the loop algorithm (lookupArrayS2).

This produced a 17x speedup over the original (lookupArray) - so that this algorithm is now about 25 times slower than C.
```     proc lookupArrayS1 {ip} {

variable theList
variable max

set nip [calcIpS \$ip]
set myMin 0
set myMax \$max
set steps 0
set avg 0
while {\$myMax > \$myMin && \$steps < 32} {
incr steps
if {\$avg == 1} {
set avg 0
} else {
set avg [expr {\$myMax - ((\$myMax - \$myMin) /2)}]
}
set current [lindex \$theList \$avg]
set low [lindex \$current 0]
set hi [lindex \$current 1]

if {\$low <= \$nip && \$hi >= \$nip} {
return [lindex \$current 2]
} elseif {\$low <= \$nip} {
set myMin \$avg
} else {
set myMax \$avg
}
}
}

proc lookupArrayS2 {ip} {

variable theList
variable max

set nip [calcIpS \$ip]
set myMin 0
set myMax \$max

while {\$myMin <= \$myMax} {
set avg [expr {(\$myMax + \$myMin) /2}]
set current [lindex \$theList \$avg]

# \$current is now a 3-list of the form
#    {low-value hi-value country-code}

if {[lindex \$current 0] > \$nip} {
set myMax [expr {\$avg - 1}]
} elseif {[lindex \$current 1] < \$nip} {
set myMin [expr {\$avg + 1}]
} else {
return [lindex \$current 2]
}
}
# no suitable range found: return ""
}

% time {opentarget::lookupArray 128.1.0.1} 1000
2788 microseconds per iteration
% time {opentarget::lookupArrayS 128.1.0.1} 1000
2044 microseconds per iteration
% time {opentarget::lookupArrayS1 128.1.0.1} 1000
199 microseconds per iteration
% time {opentarget::lookupArrayS2 128.1.0.1} 1000
161 microseconds per iteration
%
% expr 430*161/2788.
24.831420373```

14May2003 PS Cool. Great improvement. I see a 13 fold increase between lookupArrayS and S2. Comparing with lookupArray is not fair because ArrayS only does a partial lookup, masking the final/LSB byte in the IP address. Is S2 still guaranteed to return? That is why I added the steps counter (which should actually start at 15, because there are only about 17000 entries in the list).

Now on to writing a TIP for [lsearch -(lower|upper)bound] with implementation and the Tcl version should be very close to the C version.

15May2003 MS I modified S2 slightly with an additional optimisation (having an esthetic rather than performance impact). S2 is indeed guaranteed to return, as (a) myMin or myMax converge at every non-returning iteration, (b) S2 returns "" when they cross over. The original S (and S1) can leave both myMin and myMax unmodified (when they differ by one). Note that this improved algorithm can also be used in the C implementation.