Repository: luigirizzo/dummynet
Branch: master
Commit: e717cdd4bef7
Files: 120
Total size: 1.1 MB

Directory structure:
gitextract_r3ojrber/

├── 020-mips-hz1000.patch
├── Makefile
├── Makefile.inc
├── Makefile.openwrt
├── NOTES
├── README
├── binary/
│   ├── README.txt
│   ├── ipfw.sys
│   ├── netipfw.inf
│   ├── netipfw_m.inf
│   └── testme.bat
├── binary64/
│   └── ipfw.sys
├── configuration/
│   ├── README
│   ├── change_rules.sh
│   ├── change_rules_linux.sh
│   ├── ipfw.conf
│   ├── ipfw.rules
│   └── rc.firewall
├── glue.h
├── ipfw/
│   ├── Makefile
│   ├── add_rules
│   ├── dummynet.c
│   ├── expand_number.c
│   ├── glue.c
│   ├── humanize_number.c
│   ├── include/
│   │   ├── alias.h
│   │   ├── net/
│   │   │   ├── if_dl.h
│   │   │   └── pfvar.h
│   │   └── timeconv.h
│   ├── ipfw.8
│   ├── ipfw2.c
│   ├── ipfw2.h
│   ├── ipv6.c
│   ├── main.c
│   ├── qsort.c
│   ├── qsort_r.c
│   ├── rule_test.sh
│   └── ws2_32.def
├── kipfw/
│   ├── Makefile
│   ├── bsd_compat.c
│   ├── debug.c
│   ├── ipfw2_mod.c
│   ├── md_win.c
│   ├── missing.h
│   ├── mysetenv.sh
│   ├── netipfw.inf
│   ├── netipfw_m.inf
│   ├── sources
│   ├── win-passthru.diff
│   └── winmissing.h
├── kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk
├── planetlab/
│   ├── Makefile.planetlab
│   ├── check_planetlab_sync
│   ├── ipfw
│   ├── ipfw.cron
│   ├── ipfwroot.spec
│   ├── ipfwslice.spec
│   ├── netconfig
│   ├── planetlab-tags.mk
│   ├── planetlab.mk
│   └── sample_hook
├── sys/
│   ├── net/
│   │   ├── if.h
│   │   ├── pfil.h
│   │   ├── radix.c
│   │   └── radix.h
│   ├── netgraph/
│   │   └── ng_ipfw.h
│   ├── netinet/
│   │   ├── in_cksum.c
│   │   ├── ip.h
│   │   ├── ip6.h
│   │   ├── ip_dummynet.h
│   │   ├── ip_fw.h
│   │   ├── ip_icmp.h
│   │   ├── ipfw/
│   │   │   ├── dn_heap.c
│   │   │   ├── dn_heap.h
│   │   │   ├── dn_sched.h
│   │   │   ├── dn_sched_fifo.c
│   │   │   ├── dn_sched_prio.c
│   │   │   ├── dn_sched_qfq.c
│   │   │   ├── dn_sched_rr.c
│   │   │   ├── dn_sched_wf2q.c
│   │   │   ├── ip_dn_glue.c
│   │   │   ├── ip_dn_io.c
│   │   │   ├── ip_dn_private.h
│   │   │   ├── ip_dummynet.c
│   │   │   ├── ip_fw2.c
│   │   │   ├── ip_fw_dynamic.c
│   │   │   ├── ip_fw_log.c
│   │   │   ├── ip_fw_lookup.c
│   │   │   ├── ip_fw_nat.c
│   │   │   ├── ip_fw_pfil.c
│   │   │   ├── ip_fw_private.h
│   │   │   ├── ip_fw_sockopt.c
│   │   │   └── ip_fw_table.c
│   │   ├── tcp.h
│   │   ├── tcp_var.h
│   │   └── udp.h
│   └── sys/
│       ├── cdefs.h
│       ├── kernel.h
│       ├── malloc.h
│       ├── mbuf.h
│       ├── module.h
│       ├── param.h
│       ├── queue.h
│       ├── syslog.h
│       ├── systm.h
│       └── taskqueue.h
├── tcc_glue.h
└── test/
    ├── Makefile
    ├── basic_ipfw.sh
    ├── dn_test.h
    ├── dynrules.sh
    ├── interpolation.c
    ├── main.c
    ├── memory_leak.sh
    ├── mylist.h
    ├── profile_bench1
    ├── profile_bench2
    ├── profile_bench3
    ├── test_dn_heap.c
    └── test_dn_sched.c

================================================
FILE CONTENTS
================================================

================================================
FILE: 020-mips-hz1000.patch
================================================
--- include/asm-mips/param_orig.h	2010-02-23 12:45:58.000000000 +0100
+++ include/asm-mips/param.h	2010-02-23 12:00:31.000000000 +0100
@@ -41,7 +41,7 @@
    counter is increasing.  This value is independent from the external value
    and can be changed in order to suit the hardware and application
    requirements.  */
-#  define HZ 100
+#  define HZ 1000
 #  define hz_to_std(a) (a)
 
 #endif /* Not a DECstation  */


================================================
FILE: Makefile
================================================
# $Id: Makefile 11689 2012-08-12 21:07:34Z luigi $
#
# Top level makefile for building ipfw/dummynet (kernel and userspace).
# You can run it manually or also under the Planetlab build.
# Planetlab wants also the 'install' target.
#
# To build on system with non standard Kernel sources or userland files,
# you should run this with
#
#	make KERNELPATH=/path/to/linux-2.x.y.z USRDIR=/path/to/usr
#
# We assume that $(USRDIR) contains include/ and lib/ used to build userland.
#

include Makefile.inc

# Build date stamp used in the names of all release archives.
# It is computed once (simple assignment), so every archive produced by a
# single run carries the same stamp and `date` is forked only once.
# A value supplied on the command line or in the environment still wins,
# exactly as with a `?=` assignment.
ifeq ($(origin DATE),undefined)
DATE := $(shell date +%Y%m%d)
endif
SNAPSHOT_NAME=$(DATE)-ipfw3.tgz
BINDIST=$(DATE)-dummynet-linux.tgz
WINDIST=$(DATE)-dummynet-windows.zip

# Files and directories packed by the 'snapshot' target.
# The globs (binary*, *.h) are expanded by the shell when tar runs.
DISTFILES= Makefile Makefile.inc README binary* ipfw kipfw *.h sys

# None of the targets below produces a file carrying its own name: 'ipfw'
# and 'kipfw' are existing source directories, the others are pure commands.
# Declaring them all phony keeps a stray file (e.g. one named 'install' or
# 'diff') from silently turning a target into a no-op.
.PHONY: _all all clean distclean win64 ndis-glue kipfw-src
.PHONY: snapshot bindist windist ipfw kipfw openwrt_release install diff

###########################################
#  windows x86 and x64 specific variables #
###########################################
#  DRIVE    drive letter of the disk where the DDK is installed
#  DDKDIR   path of the DDK root directory, without the drive letter
#  DDK      full DDK location, i.e. DRIVE followed by DDKDIR
#  TARGETOS (x64 only) one of:
#             wnet -> windows server 2003
#             wlh  -> windows vista and windows server 2008
#             win7 -> windows 7
#           future versions must be added here
TARGETOS=win7
DRIVE ?= C:
DDKDIR ?= /WinDDK/7600.16385.1
DDK = $(DRIVE)$(DDKDIR)

# Hand the settings down to the sub-makes through the environment.
export WIN64 DDK DRIVE DDKDIR

# Entry point: simply forwards to 'all'.
_all: all

# Clean the userland tree, then drop the kernel staging directory and the
# files in binary64/ matched by the glob below.  Both steps are best effort:
# the leading '-' keeps going when one of them fails.
clean distclean:
	-@cd ipfw && $(MAKE) $@
	-@rm -rf kipfw-mod binary64/[A-hj-z]*

# Build the kernel module and the userland program.
# On Windows the results are then collected in binary/ (32 bit) or
# binary64/ (64 bit) together with the .inf installer files; the copies
# are best effort ('-'), so a missing file does not fail the build.
all: kipfw ipfw
	@# -- windows only
ifeq ($(OSARCH),Windows)	# copy files
ifeq ($(WIN64),)
	-@ cp ipfw/ipfw.exe kipfw-mod/$(OBJDIR)/ipfw.sys binary/
	-@ cp kipfw/*.inf binary/
else
# The name of the DDK object directory embeds the target OS, so follow
# TARGETOS (the value handed to mysetenv.sh) instead of assuming win7.
	-@ cp binary/* kipfw/*.inf binary64/
	-@ cp ipfw/ipfw.exe kipfw-mod/objchk_$(TARGETOS)_amd64/amd64/ipfw.sys binary64/
endif	# WIN64
endif	# Windows

# 64-bit Windows build: run the regular build again with WIN64 set.
win64:
	$(MAKE) WIN64=1 all

# kipfw-src prepares the sources for the kernel part.
# The windows files (passthru etc.) are modified version of the
# examples found in the $(DDK)/src/network/ndis/passthru/driver/
# They can be re-created using the 'ndis-glue' target
# We need a sed trick to remove carriage returns from the patchfile.

# Re-create the NDIS passthru glue in kipfw-mod from the DDK sample driver:
# copy the sample sources, strip the carriage returns from every file, then
# apply our patch (itself stripped of carriage returns).
# Every 'cd' is chained with '&&' so that, should kipfw-mod be unusable,
# sed -i and patch never run against the files of the current directory.
ndis-glue:
	-@mkdir -p kipfw-mod
	cp $(DDK)/src/network/ndis/passthru/driver/*.[ch] kipfw-mod
	(cd kipfw-mod && for i in $$(find . -type f); do sed -i.tmp "s/$$(printf '\r')//g" $$i; done )
	sed "s/$$(printf '\r')//g" kipfw/win-passthru.diff | (cd kipfw-mod && patch )

# Stage the kernel sources: start from a fresh kipfw-mod, copy in the glue
# code from kipfw/ and every .c file found under sys/, then let the staged
# Makefile run its 'include_e' target.  On Windows the NDIS glue is added.
# The staging steps are best effort ('-'); the ndis-glue step is not.
kipfw-src:
	-@rm -rf kipfw-mod
	-@mkdir -p kipfw-mod
	-@cp -Rp kipfw/* kipfw-mod
	-@cp `find sys -name \*.c` kipfw-mod
	-@(cd kipfw-mod && $(MAKE) include_e)
ifeq ($(OSARCH),Windows)
# Use $(MAKE), not a literal 'make', so that flags such as -n and -j and
# the make program actually in use are propagated to the sub-make.
	$(MAKE) ndis-glue
endif

# Source snapshot: clean the tree, then pack $(DISTFILES) into
# /tmp/$(SNAPSHOT_NAME) with every path rewritten to start with ipfw3-2012/.
# NOTE(review): -s looks like the bsdtar substitution option -- confirm
# which tar this target is expected to run with.
snapshot:
	$(MAKE) distclean
	tar cvzhf /tmp/$(SNAPSHOT_NAME) -s':^:ipfw3-2012/:' $(DISTFILES)

# Linux binary distribution: rebuild from scratch, then archive the
# userland program, its manual page and the kernel module.
bindist:
	$(MAKE) clean
	$(MAKE) all
	tar -c -v -z -f /tmp/$(BINDIST) ipfw/ipfw ipfw/ipfw.8 kipfw-mod/ipfw_mod.ko

# Windows binary distribution: rebuild (build errors are tolerated, the
# pre-built files in binary/ are shipped anyway) and zip the binary/
# directory, leaving out subversion metadata.
windist:
	$(MAKE) clean
	-$(MAKE) all
# $(RM) is 'rm -f': silent when there is no previous archive, and a stale
# archive that cannot be removed stops the build instead of being updated
# in place by zip.
	$(RM) /tmp/$(WINDIST)
	zip -r /tmp/$(WINDIST) binary -x \*.svn\*


# Userland program: delegate to the makefile in ipfw/, asking it for the
# target of the same name.
ipfw:
	@cd ipfw && $(MAKE) $@

# Kernel module, built from the sources staged by kipfw-src.
kipfw: kipfw-src
ifneq ($(WIN64),)	#--- windows 64 bit, we use build.exe and nmake
	rm -f kipfw-mod/Makefile
	mkdir kipfw-mod/tmpbuild		# check mysetenv.sh
	bash kipfw/mysetenv.sh $(DRIVE) $(DDKDIR) $(TARGETOS)
else	# linux or windows 32 bit
	@cd kipfw-mod && $(MAKE) $@
endif

# Package the sources for the OpenWrt port in a fresh temporary directory:
#   $(TMPDIR)/ipfw3-<date>.tar.gz   the source tarball
#   $(TMPDIR)/ipfw3/                the port files (Makefile, patches/)
# The $(eval ...) lines set make variables while the recipe is expanded,
# which lets the following lines refer to the directory made by mktemp.
# Every 'cd' is chained with '&&' so that nothing is cleaned or removed in
# the current directory if the destination cannot be entered.  The
# distclean step is best effort ('-'), and sits on its own line because a
# line containing $(MAKE) is executed even under 'make -n'.
# NOTE(review): ./OPENWRT/Makefile is read as the template for the port
# Makefile; confirm it exists in the tree this target is run from.
openwrt_release:
	# create a temporary directory
	$(eval TMPDIR := $(shell mktemp -d -p /tmp/ ipfw3_openwrt_XXXXX))
	# create the source destination directory
	$(eval IPFWDIR := ipfw3-$(DATE))
	$(eval DSTDIR := $(TMPDIR)/$(IPFWDIR))
	mkdir $(DSTDIR)
	# copy the package, clean objects and svn info
	cp -r ./ipfw ./kipfw-mod glue.h Makefile ./configuration README $(DSTDIR)
	-cd $(DSTDIR) && $(MAKE) -s distclean
	cd $(DSTDIR) && find . -name .svn | xargs rm -rf
	cd $(TMPDIR) && tar czf $(IPFWDIR).tar.gz $(IPFWDIR)

	# create the port files in the ipfw3 subdirectory
	$(eval PORTDIR := $(TMPDIR)/ipfw3)
	mkdir -p $(PORTDIR)/patches
	# generate the Makefile, PKG_VERSION and PKG_MD5SUM
	md5sum $(DSTDIR).tar.gz | cut -d ' ' -f 1 > $(TMPDIR)/md5sum
	sed -e s/PKG_VERSION:=/PKG_VERSION:=$(DATE)/ \
	    -e s/PKG_MD5SUM:=/PKG_MD5SUM:=`cat $(TMPDIR)/md5sum`/ \
		./OPENWRT/Makefile > $(PORTDIR)/Makefile

	@echo ""
	@echo "The openwrt port is in $(PORTDIR)"
	@echo "The source file should be copied to the public server:"
	@echo "scp $(DSTDIR).tar.gz marta@info.iet.unipi.it:~marta/public_html/dummynet"
	@echo "after this the temporary directory $(TMPDIR) can be removed."

# Empty placeholder: the header of this file notes that the Planetlab
# build also wants an 'install' target, so it exists with nothing to do.
install:

# Compare our copies of the userland and kernel sources with the FreeBSD
# tree rooted at $(BSD_HEAD).  Errors are ignored ('-') so that the second
# comparison always runs.
diff:
	-@diff -upr $(BSD_HEAD)/sbin/ipfw ipfw
	-@diff -upr $(BSD_HEAD)/sys sys


================================================
FILE: Makefile.inc
================================================
# $Id$
# GNU makefile header for ipfw/kipfw building
# Root of the FreeBSD source tree used as a reference.
BSD_HEAD ?= ~/FreeBSD/head

# Host OS: the output of uname when it is found in the list below,
# otherwise we assume Windows.
OSARCH := $(shell uname)
OSARCH := $(findstring $(OSARCH),FreeBSD Linux Darwin)
ifeq ($(OSARCH),)
    OSARCH := Windows
endif
OBJDIR=mia

# Kernel build directory of the running kernel, unless given by the user.
KSRC ?= /lib/modules/$(shell uname -r)/build

# Quiet build unless V=1: MSG prints a short tag for each step while HIDE
# suppresses the echo of the full command.  With V=1 it is the other way
# round: MSG turns into a shell comment and the command itself is shown.
ifeq ($V,1)
    MSG=@\#
    HIDE=
else # no echo
    MSG=@echo
    HIDE=@
endif

# Compile one C source into its object file (pattern-rule spelling of the
# classic .c.o suffix rule).  MSG and HIDE select quiet or verbose output.
%.o: %.c
	$(MSG) "   CC $<"
	$(HIDE) $(CC) $(CFLAGS) -c $< -o $@


================================================
FILE: Makefile.openwrt
================================================
# Makefile to build the package in openwrt.
# goes into package/network/utils/ipfw3/Makefile
#
# Edit IPFW_DIR to point to the directory with the sources for ipfw

# Location of the ipfw3 sources, expressed relative to $(TOPDIR).
IPFW_DIR := $(TOPDIR)/../qemu-misc/ipfw3

# OpenWrt build definitions.  NOTE(review): kernel.mk presumably provides
# the KERNEL and LINUX_DIR variables used further down -- confirm.
# The order of the three include lines is kept as is on purpose.
include $(TOPDIR)/rules.mk
include $(INCLUDE_DIR)/kernel.mk

# Package identity.
PKG_NAME:=ipfw3
PKG_RELEASE:=1

# MV is undefined, we use it in the internal Makefiles
MV ?= mv

include $(INCLUDE_DIR)/package.mk

# Settings that depend on the kernel version.
$(warning --- openwrt kernel version $(KERNEL) linux dir $(LINUX_DIR) -------)

# 2.4 kernels build ipfw_mod.o and are pointed at the module sources with
# SUBDIRS=; any other kernel builds ipfw_mod.ko and uses M= instead.
ifneq ($(KERNEL),2.4)
    #VERS:=2.6
    IPFW_MOD:=ipfw_mod.ko
    IPFW_SRC_DIR:=M
else
    VERS:=openwrt
    CFLAGS_WRT:=-DSYSCTL_NODE -DEMULATE_SYSCTL
    IPFW_MOD:=ipfw_mod.o
    IPFW_SRC_DIR:=SUBDIRS
endif

# Userland package: the ipfw control program built in $(PKG_BUILD_DIR)/ipfw.
# The $(warning ...) line inside the block is only a build-time trace of
# the build directory; it is printed whenever the definition is expanded.
define Package/ipfw3
  SECTION:=utils
  CATEGORY:=Utilities
  TITLE := /sbin/ipfw
  DEPENDS := +libc +libgcc
  FILES := $(PKG_BUILD_DIR)/ipfw/ipfw
  $(warning --- build dir is $(PKG_BUILD_DIR) ---)
endef

# Text shown as the description of the userland package.
define Package/ipfw3/description
  Control program for ipfw and dummynet
endef

# XXX not entirely clear why the install entry for userland works,
# given that /sbin/ipfw is in KernelPackage/ipfw3
# NOTE(review): as written, INSTALL_DIR receives two separate arguments,
# $(1) and /sbin (note the space between them).  If the intent is to
# create the sbin directory inside the package root, this should probably
# read $(1)/sbin -- confirm before changing.

define Package/ipfw3/install
	$(INSTALL_DIR) $(1) /sbin
endef

# Description for the package.
# The names KernelPackage/ipfw3 must match the arguments to the
# call $(eval $(call KernelPackage,ipfw3)) used to build it

define KernelPackage/ipfw3
  SUBMENU:=Other modules
  TITLE:= IPFW and dummynet
  # FILES is what makes up the module, both kernel and userland
  # It must be in the KernelPackage section XXX
  FILES := $(PKG_BUILD_DIR)/kipfw-mod/$(IPFW_MOD)
  # AUTOLOAD:=$(call AutoLoad,80,ipfw_mod)
endef

# The description must be filed under the same name as the definition
# above (KernelPackage/ipfw3), i.e. without the kmod- prefix that appears
# in the name of the generated package; under any other name it is not
# picked up and the package is left without a description.
define KernelPackage/ipfw3/description
  ipfw and dummynet kernel module
endef

# Standard entries for the openwrt builds: Build/Prepare and Build/Compile
# Remember that commands must start with a tab

# 'prepare' instructions for both kernel and userland
# We copy the entire subtree, then build include_e/ which
# contains empty headers used by the kernel sources.
# Steps, in order:
#  1. copy the whole source tree from $(IPFW_DIR) into $(PKG_BUILD_DIR);
#  2. gather kipfw/* and every .c file found under sys/ in kipfw-mod;
#  3. run the 'include_e' target in both ipfw/ and kipfw-mod.
# NOTE(review): the '# $(warning ...)' line sits inside the define, where
# '#' does not start a make comment, so the warning may still be emitted
# when the block is expanded -- confirm.
define Build/Prepare
  # $(warning --- Preparing ipfw sources ---)
	mkdir -p $(PKG_BUILD_DIR)
	$(CP) -Rp $(IPFW_DIR)/* $(PKG_BUILD_DIR)/
	# The kernel sources are spread in multiple places,
	# so we put everything in kipfw-mod
	mkdir -p $(PKG_BUILD_DIR)/kipfw-mod
	cp -Rp $(IPFW_DIR)/kipfw/* $(PKG_BUILD_DIR)/kipfw-mod
	cp `find $(IPFW_DIR)/sys -name \*.c` $(PKG_BUILD_DIR)/kipfw-mod
	# we do not need cross parameters
	(cd $(PKG_BUILD_DIR)/ipfw && $(MAKE) include_e )
	(cd $(PKG_BUILD_DIR)/kipfw-mod && $(MAKE) include_e )
endef

# 'compile' instructions: two recursive makes.
#  - userland: run make in $(PKG_BUILD_DIR)/ipfw with the settings in
#    $(TARGET_CONFIGURE_OPTS) and CFLAGS made of $(TARGET_CFLAGS),
#    $(CFLAGS_WRT) and the include paths and defines listed below;
#  - kernel module: run the 'modules' target of the kernel tree in
#    $(LINUX_DIR), pointing it at kipfw-mod through the variable named
#    by $(IPFW_SRC_DIR) (SUBDIRS or M, depending on the kernel version).
# The $(warning ...) lines are only progress traces.
define Build/Compile
	# XXX check whether we need all linux_dir etc.
	$(warning --- compile the user part for ipfw/openwrt ---)
	$(MAKE) -C $(PKG_BUILD_DIR)/ipfw \
		LINUX_DIR=$(LINUX_DIR) \
		$(TARGET_CONFIGURE_OPTS) \
		CFLAGS="$(TARGET_CFLAGS) $(CFLAGS_WRT) -I./include_e -I./include -include ../glue.h -DNO_ALTQ -D__BSD_VISIBLE" \
		_VER=$(VERS) all
	$(warning --- compile the kernel part for ipfw/openwrt ---)
	$(MAKE) -C "$(LINUX_DIR)" \
		CROSS_COMPILE="$(TARGET_CROSS)" \
		LINUX_DIR=$(LINUX_DIR) \
		KERNELPATH=$(LINUX_DIR) \
		ARCH="$(LINUX_KARCH)" \
		$(IPFW_SRC_DIR)="$(PKG_BUILD_DIR)/kipfw-mod" \
		IPFW3_ROOT="$(PKG_BUILD_DIR)" \
		_VER=$(VERS) modules
	$(warning +++ done compile the kernel part for ipfw/openwrt ---)
endef


# Instantiate the userland package and the kernel-module package; the
# argument must match the names used in the definitions in this file.
$(eval $(call BuildPackage,ipfw3))
$(eval $(call KernelPackage,ipfw3))


================================================
FILE: NOTES
================================================
#
# $Id: NOTES 6552 2010-06-15 11:24:59Z svn_panicucci $
#

---------------------------------------------------------------------
---  DEVELOPER NOTES ------------------------------------------------

Both the client and the kernel code use almost unmodified sources
from FreeBSD (just a very small number of sections #ifdef'ed out
for features not relevant or not implemented).

In both cases we provide two set of headers:
 - one set is made of empty files, automatically generated, to replace
   FreeBSD headers not available or conflicting on the ported platforms.
 - one set is made of custom files, sometimes copied verbatim
   from FreeBSD, sometimes containing only the minimal set of
   macros/ struct/ prototypes required by the port.

Additionally, we have a small set of .c files providing functions not
available in the port platforms, and hooks for the sockopt/packet
data.


TODO 20100205:
+ use an appropriate identifier instead of LINUX24
+ find the discharging module hook, in order to force a queue flush
+ better matching on interface names (case insensitive etc ?)
+ match by interface address
+ verify path
+ send keepalives (20100301 marta: implemented)
+ pullup of data in external buffers
+ O_TAG
+ O_DIVERT
+ O_TEE
+ O_SETFIB
+ kmem_cache_alloc 

TODO (OpenWRT) 20090622
+ add a module compilation for 2.6

TODO (FreeBSD, general)
+ New features related to IPv6 are missing, such as IPv6 support in
lookup tables, which currently support IPv4 addresses only.
One of the goals of this project is to extend the tables feature to
IPv6 addresses.

+ The current code implements rules listing requests as a single
request returning both static and dynamic rules as a whole block. This
operation requires a lock to be held for the time needed to get the
full list of rules, regardless of the requested rules.  I propose to
break up the rule request in two parts, for static and dynamic rules, in
order to avoid to lock the whole struct for a subset of rules required.

+ Finally, thanks to many improvements and contributions, the tool has
grown significantly over time with new functionality and features, at
the expense of the overall design. An example of a possible cleanup is the
use of a dispatch table instead of some very long switch statements, making
the resulting code more readable and hopefully faster to execute.

+ XXX can't find the ipfw_* indirection...

DETAILED PORTING INFO

--- ipfw (userland) on linux ---

The port is relatively trivial. Communication with the kernel occurs
through a raw socket using [gs]etsockopt(), and all is needed is the
availability of ip_fw.h and ip_dummynet.h headers to describe the
relevant data structures.

--- kernel ipfw on linux ---

Sources are mostly unmodified, except for commenting out
unsupported features (tables, in-kernel nat...).
The port requires a rather large number of empty headers.
Other porting issues are in ipfw2_mod.c

--- build as an Openwrt package

------ WINDOWS PORT ------

We started from the wipfw port available at [WIPFW] , but
most of the port is done from scratch using the most recent
version of ipfw+dummynet from HEAD/RELENG_7 as of March 2009

# WIPFW: wipfw.sourceforge.net
#binary:
http://downloads.sourceforge.net/wipfw/wipfw-0.3.2b.zip?use_mirror=mesh
http://downloads.sourceforge.net/wipfw/wipfw-0.2.8-source.zip

--- DEVELOPMENT TOOLS:

At least initially, to build the code you need a pc with
windows installed and the [WINDDK] from the microsoft site.
Other tools like the new WDK should work as well.

The 'standard' way used by WDK/WINDDK is to run a 'build'
script which in turn calls nmake and then the microsoft
compiler [CL] and linker [LINK]. See the documentation for
command line switches for these tools, they are similar but
not the same as the equivalent gcc switches. In particular,
a / is often used to replace - though both forms are accepted.

The steps to do in order to launch the build environment follows:

 + download winddk from microsoft.com 
 + install 
 + run the Free Build Environment from:

	Start -> All Program -> WINDDK ->
	[NT|XP|2000] -> Free Build Environment

 + change dir to .src and type `build' in command line

For our purposes, however, it is much more convenient to use
cygwin [CYGWIN] and invoke CL and LINK using gmake

A useful debugging tool is DbgView:
	http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx
It simply displays the kernel-mode debug output.
Use the DbgPrint() function, which is similar to printk(), to produce it.
It can be launched with dbgview.exe.

After a successful compilation and link, you can launch the userspace
program simply by executing the binary file, while for the kernel
part you need to perform the following steps:

cp ipfw.sys /cygdrive/c/WINDOWS/system32/drivers/
ipfw install_drv System32\DRIVERS\ip_fw.sys
net start ip_fw


=======
--- ARCHITECTURE ---

The main part of the userland program mostly works like the
unix equivalent; the only issue is to provide empty
header files to replace those not available in Windows,
and to include the winsock2 headers to access some
network-related functions and definitions.

Communication with the kernel module does not use a raw IP socket
as in the unix version. Instead, we inherit the same method
used in ipfw -- a replacement for socket() creates a handle
to access the control structure, and setsockopt/getsockopt
replacements are also used to communicate with the kernel
side. This is implemented in win32.c

In order to load the module and activate it, we also use
the same technique suggested in wipfw -- the main() is
extended (with a wrapper) so that it can handle additional
commands to install/control/deinstall the service and
call the appropriate actions. See svcmain.c for details.

--- PORTING ISSUES:

Most of the unix hierarchy of headers is not available so we
have to replicate them.

gcc attributes are also not present.

C99 types are not present, remapped in <sys/cdefs.h>
Also, we don't have C99 initializers which sometimes gives trouble.

--- USEFUL LINKS:

[WIPFW]
	http://wipfw.sourceforge.net/

[WINDDK]
	http://www.microsoft.com/whdc/devtools/ddk/default.mspx

[CL]
	http://msdn.microsoft.com/en-us/library/610ecb4h.aspx
	command line syntax

[CYGWIN]
	http://www.cygwin.com/setup.exe
Windows Driver Kit
http://www.microsoft.com/whdc/DevTools/WDK/WDKpkg.mspx

Debug Symbols for WinXP SP3
http://www.microsoft.com/whdc/devtools/debugging/symbolpkg.mspx#d

DbgView
http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx

Cygwin
http://www.cygwin.com/
(installazione pacchetti di default + categoria devel)

Winrar (il WDK e' distribuito in un file .iso)
http://www.rarlab.com/download.htm

puttycyg (terminale per cygwin)
http://code.google.com/p/puttycyg/

Tortoise SVN
http://tortoisesvn.net/downloads

EditPlus
http://www.editplus.com/

---------------------------------------------------------------------
--- OPEN ISSUES/TODO ------------------------------------------------

- Fix the build on OpenWRT for linux 2.6
  [Forum: https://forum.openwrt.org/viewtopic.php?id=24990]
- Building on 2.6 OpenWRT (target is MIPS Atheros 71xx) gives compilation
  errors; [Send updates to: https://forum.openwrt.org/viewtopic.php?id=24990]
- Windows stack corruption [a tricky bug in dummynet]
- Windows ipv6 port [RE: Windows port of ipv6 in ipfw+dummynet]

NOTE:
- To allow compilation on OpenWRT with kernel 2.6, only Makefile.openwrt
  is modified, to guess the kernel version (2.4/2.6)
- ipfw3 Makefile is not modified.
- Also compile on bigendian, but not tested yet...
- Little changes in source code.


================================================
FILE: README
================================================
#
# $Id: README 11691 2012-08-12 21:32:37Z luigi $
#

This directory contains a port of ipfw and dummynet to Linux and Windows.
This version of ipfw and dummynet is called "ipfw3" as it is the
third major rewrite of the code.  The source code here comes straight
from FreeBSD (roughly the version in HEAD as of February 2010),
plus some glue code and headers written from scratch.  Unless
specified otherwise, all the code here is under a BSD license.

Specific build instructions are below, and in general produce

	a kernel module,	ipfw_mod.ko (ipfw.sys on windows)
	a userland program,	/sbin/ipfw (ipfw.exe on windows)

which you need to install on your system.

CREDITS:
    Luigi Rizzo (main design and development)
    Marta Carbone (Linux and Planetlab ports)
    Riccardo Panicucci (modular scheduler support)
    Francesco Magno (Windows port)
    Fabio Checconi (the QFQ scheduler)
    Funding from Universita` di Pisa (NETOS project),
	European Commission (ONELAB2 project)
	ACM SIGCOMM (Sigcomm Community Projects Award, April 2012)
    
------ INSTALL/REMOVE INSTRUCTIONS ------

Linux
    INSTALL:
	# Do the following as root
	insmod ./kipfw-mod/ipfw_mod.ko
	cp ipfw/ipfw /usr/local/sbin
    REMOVE:
	rmmod ipfw_mod.ko

OpenWRT
    INSTALL:	# use the correct name for your system
	opkg install  kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk #install
	ls -l /lib/modules/2.4.35.4/ipfw*           # check
	insmod /lib/modules/2.4.35.4/ipfw_mod.o     # load the module
	/lib/modules/2.4.35.4/ipfw show             # launch the userspace tool
    REMOVE:
	rmmod ipfw_mod.o                            # remove the module

Windows:
    A pre-built version is in binary/ and binary64/ directories.

    INSTALL THE NDIS DRIVER
	- open the configuration panel for the network card in use
	  (right click on the icon on the SYSTRAY, or go to
	  Control Panel -> Network and select one card)
	- click on Properties->Install->Service->Add
	- click on 'Driver Disk' and select 'netipfw.inf' in this folder
	- select 'ipfw+dummynet' which is the only service you should see
	- click accept on the warnings for the installation of an unsigned
	  driver (roughly twice per existing network card)

	Now you are ready to use the emulator. To configure it, open a 'cmd'
	window (REMEMBER to run it as Administrator)
	and you can use the ipfw command from the command line.
	Otherwise click on the 'TESTME.bat' which is a batch program that
	runs various tests.
	REMEMBER: you need to run ipfw as administrator.

    REMOVE:
	- select a network card as above.
	- click on Properties
	- select 'ipfw+dummynet'
	- click on 'Remove'


------ BUILD INSTRUCTIONS ------

+ Windows 32 bit and 64 bit (XP, Windows7)

    To build your own version of the package you need:
	- cygwin, http://www.cygwin.com/ with base packages, make,
	  c compiler, possibly an editor and subversion.
	  This is used to build the userspace control program, ipfw.exe

	- Microsoft Windows Driver Kit Version 7.1.0, available from
	    http://www.microsoft.com/en-us/download/details.aspx?id=11800
	    (ISO image, GRMWDK_EN_7600_1.ISO)
	  This is used to build the kernel module.

	- optionally, DbgView if you want to see diagnostics coming from
	  the kernel module. You can find it at

	    http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx

    Check the Makefile in the root directory to make sure that the WDK is
    installed in the place indicated by DRIVE and DDKDIR variables
    (otherwise pass the correct values to the Makefile).
    Open a shell from cygwin, move to this directory, and run "make" for
    the 32-bit version, "make win64" for the 64 bit version.
    This will produce in the binary/ or binary64/ directory the
    following files:
	ipfw.exe (you also need cygwin1.dll)
	ipfw.sys (an NDIS intermediate filter driver)
	netipfw.inf and netipfw_m.inf (installer files)

    Cross compilation of the userland side under FreeBSD is possible with
	gmake TCC=`pwd`/tcc-0.9.25-bsd/win32 CC=`pwd`/tcc-0.9.25-bsd/win32/bin/wintcc
    (wintcc is a custom version of tcc which produces Windows code)

    NOTE: the 64-bit version is compiled as a 32-bit executable for userspace,
	with appropriate changes to produce 64-bit pointers.
	The kernel module is built using the MSC 'build' utility instead
	of 'make'. THE MODULE IS NOT SIGNED.
    IMPORTANT: Windows 64-bit will not load unsigned kernel modules unless
	you boot with 'F8' and disable checks for signed modules.

***** Linux 2.6 and above ******

	make [KSRC=/path/to/linux USRDIR=/path/to/usr]

    where the two variables are optional and point to the linux kernel
    sources and the /usr directory. Defaults are USRDIR=/usr and
    KSRC=/lib/modules/`uname -r`/build 	--- XXX check ?

    NOTE: make sure CONFIG_NETFILTER is enabled in the kernel
    configuration file. You need the ncurses development library,
    which can be installed, depending on your distro, with:
	apt-get install ncurses-dev	# for debian based distro
	yum -y install ncurses-devel	# for fedora based distro
    You can enable CONFIG_NETFILTER by doing:
    
	"(cd ${KSRC}; make menuconfig)"

    and enabling the option listed below:

        Networking --->
	    Networking options  --->
              [*] Network packet filtering framework (Netfilter)

    If you have not yet compiled your kernel source, you need to
    prepare the build environment:

	(cd ${KSRC}; make oldconfig; make prepare; make scripts)

***** Linux 2.4.x *****

    Almost as above, with an additional VER=2.4

	make VER=2.4 KSRC=...

    For 2.4, if KSRC is not specified then we use
    	KSRC ?= /usr/src/`uname -r`/build

    You need to follow the same instruction for the 2.6 kernel, enabling
    netfilter in the kernel options:

    Networking options  --->
      [*] Network packet filtering (replaces ipchains)

***** Openwrt package *****

    (Tested with kamikaze_8.09.1 and Linux 2.4)

    + Download and extract the OpenWrt package, e.g.

	wget http://downloads.openwrt.org/kamikaze/8.09.1/kamikaze_8.09.1_source.tar.bz2
	tar xvjf kamikaze_8.09.1_source.tar.bz2

    + move to the directory with the OpenWrt sources (the one that
      contains Config.in, rules.mk ...)

	cd kamikaze_8.09.1

    + Optional: Add support for 1ms resolution.

	By default OpenWRT kernel is compiled with HZ=100; this implies
        that all timeouts are rounded to 10ms, too coarse for dummynet.
        The file 020-mips-hz1000.patch contains a kernel patch to build
	a kernel with HZ=1000 (i.e. 1ms resolution) as in Linux/FreeBSD.
        To apply this patch, go in the kernel source directory and
        patch the kernel

		cd build_dir/linux-brcm-2.4/linux-2.4.35.4
		cat $IPFW3_SOURCES/020-mips-hz1000.patch | patch -p0

	where IPFW3_SOURCES contains the ipfw3 source code.
	Now, the next kernel recompilation will use the right HZ value

    + Optional: to be sure that the tools are working, make a first
      build as follows:

	- run "make menuconfig" and set the correct target device,
	  drivers, and so on;
	- run "make" to do the build

    + Add ipfw3 to the openwrt package, as follows:

      - copy the code from this directory to the place used for the build:

		cp -Rp /path_to_ipfw3 ../ipfw3; 

	If you want, you can fetch a newer version from the web
	(cd ..; rm -rf ipfw3; \
	wget http://info.iet.unipi.it/~luigi/dummynet/ipfw3-latest.tgz;\
	tar xvzf ipfw3-latest.tgz)

      - run the following commands:
	(mkdir package/ipfw3; \
	cp ../ipfw3/Makefile.openwrt package/ipfw3/Makefile)

	to create the package/ipfw3 directory in the OpenWrt source
	directory, and copy Makefile.openwrt to package/ipfw3/Makefile ;

      - if necessary, edit package/ipfw3/Makefile and set IPFW_DIR to point to
	the directory ipfw3, which contains the sources;

      - run "make menuconfig" and select kmod-ipfw3 as a module <M> in
	    Kernel Modules -> Other modules -> kmod-ipfw3 

      - run "make" to build the package, "make V=99" for verbose build.

      - to modify the code, assuming you are in directory "kamikaze_8.09.1"
	
	(cd ../ipfw3 && vi ...the files you are interested in )
	rm -rf build_dir/linux-brcm-2.4/kmod-ipfw3
	make package/ipfw3/compile V=99

    The resulting package is located in bin/packages/mipsel/kmod-ipfw3*,
    upload the file and install on the target system, as follows:

    opkg install  kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk #install
    ls -l /lib/modules/2.4.35.4/ipfw*           # check
    insmod /lib/modules/2.4.35.4/ipfw_mod.o     # load the module
    /lib/modules/2.4.35.4/ipfw show             # launch the userspace tool
    rmmod ipfw_mod.o                            # remove the module

***** PLANETLAB BUILD (within a slice) *****
These instructions can be used by PlanetLab developers to compile
the dummynet module on a node. To install the module on the node,
users need root access in root context.  PlanetLab users who want
to use the dummynet package should ask PlanetLab support for
nodes with dummynet emulation capabilities.

    Follow the instructions below. You can just cut&paste

	# install the various tools if not available
	sudo yum -y install subversion rpm-build rpm-devel m4 redhat-rpm-config make gcc
	# new build installation requires the gnupg package
	sudo yum -y install gnupg
	# the linux kernel and the ipfw source can be fetched by git
	sudo yum -y install git

	# create and move to a work directory
	mkdir -p test
	# extract a planetlab distribution to directory XYZ
	(cd test; git clone git://git.onelab.eu/build ./XYZ)
	# download the specfiles and do some patching.
	# Results are into SPEC/ (takes 5 minutes)
	(cd test/XYZ; make stage1=true PLDISTRO=onelab)
	# Building the slice code is fast, the root code takes longer
	# as it needs to rebuild the whole kernel
	(cd test/XYZ; sudo make ipfwslice PLDISTRO=onelab)
	(cd test/XYZ; sudo make ipfwroot PLDISTRO=onelab)

    The kernel dependency phase is a bit time consuming, but does not
    need to be redone if we are changing the ipfw sources only.
    To clean up the code do
	(cd test/XYZ; sudo make ipfwroot-clean ipfwslice-clean)
    then after you have updated the repository again
	(cd test/XYZ; sudo make ipfwslice ipfwroot)

--- References
[1] https://svn.planet-lab.org/wiki/VserverCentos
[2] http://wiki.linux-vserver.org/Installation_on_CentOS
[3] http://mirror.centos.org/centos/5/isos/
[4] More information is available in the /build/README* files


================================================
FILE: binary/README.txt
================================================
This directory contains the binaries to install and use IPFW and
DUMMYNET on a Windows Machine. The kernel part is an NDIS module,
whereas the user interface is a command line program.

1. INSTALL THE NDIS DRIVER

- open the configuration panel for the network card in use
  (either right click on the icon on the SYSTRAY, or go to
  Control Panel -> Network and select one card)

- click on Properties->Install->Service->Add
- click on 'Driver Disk' and select 'netipfw.inf' in this folder
- select 'ipfw+dummynet' which is the only service you should see
- click accept on the warnings for the installation of an unknown
  driver (roughly twice per existing network card)

Now you are ready to use the emulator. To configure it, open a 'cmd'
window and you can use the ipfw command from the command line.
Otherwise click on the 'TESTME.bat' which is a batch program that
runs various tests.

2. UNINSTALL THE DRIVER

- select a network card as above.
- click on Properties
- select 'ipfw+dummynet'
- click on 'Remove'


================================================
FILE: binary/netipfw.inf
================================================
; version section
[Version]
Signature  = "$Windows NT$"
Class      = NetService
ClassGUID  = {4D36E974-E325-11CE-BFC1-08002BE10318}
Provider   = %Unipi%
; DriverVer is month/day/year followed by the version: 26 February 2010
DriverVer  = 02/26/2010,3.0.0.1

; manufacturer section
[Manufacturer]
%Unipi% = UNIPI,NTx86,NTamd64

; control flags section
; optional, unused in netipfw.inf, used in netipfw_m.inf
[ControlFlags]

; models section
[UNIPI] ; Win2k
%Desc% = Ipfw.ndi, unipi_ipfw
[UNIPI.NTx86] ;For WinXP and later
%Desc% = Ipfw.ndi, unipi_ipfw
[UNIPI.NTamd64] ;For x64
%Desc% = Ipfw.ndi, unipi_ipfw

; ddinstall section
[Ipfw.ndi]
AddReg          = Ipfw.ndi.AddReg, Ipfw.AddReg
Characteristics = 0x4410 ;  NCF_FILTER | NCF_NDIS_PROTOCOL !--Filter Specific--!!
CopyFiles       = Ipfw.Files.Sys
CopyInf         = netipfw_m.inf

; remove section
[Ipfw.ndi.Remove]
DelFiles = Ipfw.Files.Sys

;ddinstall.services section
[Ipfw.ndi.Services]
AddService = Ipfw,,Ipfw.AddService

[Ipfw.AddService]
DisplayName    = %ServiceDesc%
ServiceType    = 1 ;SERVICE_KERNEL_DRIVER
StartType      = 3 ;SERVICE_DEMAND_START
ErrorControl   = 1 ;SERVICE_ERROR_NORMAL
ServiceBinary  = %12%\ipfw.sys
AddReg         = Ipfw.AddService.AddReg

[Ipfw.AddService.AddReg]

;file copy related sections
[SourceDisksNames]
1=%DiskDescription%,"",,

[SourceDisksFiles]
ipfw.sys=1

[DestinationDirs]
DefaultDestDir = 12
Ipfw.Files.Sys   = 12   ; %windir%\System32\drivers

; ddinstall->copyfiles points here
[Ipfw.Files.Sys]
ipfw.sys,,,2

; ddinstall->addreg points here
[Ipfw.ndi.AddReg]
HKR, Ndi,            HelpText,            , %HELP% ; this is displayed at the bottom of the General page of the Connection Properties dialog box
HKR, Ndi,            FilterClass,         , failover
HKR, Ndi,            FilterDeviceInfId,   , unipi_ipfwmp
HKR, Ndi,            Service,             , Ipfw
HKR, Ndi\Interfaces, UpperRange,          , noupper
HKR, Ndi\Interfaces, LowerRange,          , nolower
HKR, Ndi\Interfaces, FilterMediaTypes,    , "ethernet, tokenring, fddi, wan"

;strings section
[Strings]
Unipi = "Unipi"
DiskDescription = "Ipfw Driver Disk"
Desc = "ipfw+dummynet"
HELP = "This is ipfw and dummynet network emulator, developed by unipi.it"
ServiceDesc = "ipfw service"


================================================
FILE: binary/netipfw_m.inf
================================================
; version section
[Version]
Signature  = "$Windows NT$"
Class      = Net
ClassGUID  = {4D36E972-E325-11CE-BFC1-08002BE10318}
Provider   = %Unipi%
; DriverVer is month/day/year followed by the version: 26 February 2010
DriverVer  = 02/26/2010,3.0.0.1

; control flags section
; optional, unused in netipfw.inf, used in netipfw_m.inf
[ControlFlags]
ExcludeFromSelect = unipi_ipfwmp

; destinationdirs section, optional
[DestinationDirs]
DefaultDestDir=12
; No files to copy 

; manufacturer section
[Manufacturer]
%Unipi% = UNIPI,NTx86,NTamd64

; models section
[UNIPI] ; Win2k
%Desc% = IpfwMP.ndi, unipi_ipfwmp
[UNIPI.NTx86] ;For WinXP and later
%Desc% = IpfwMP.ndi, unipi_ipfwmp
[UNIPI.NTamd64] ;For x64
%Desc% = IpfwMP.ndi, unipi_ipfwmp

; ddinstall section
[IpfwMP.ndi]
AddReg  = IpfwMP.ndi.AddReg
Characteristics = 0x29 ;NCF_NOT_USER_REMOVABLE | NCF_VIRTUAL | NCF_HIDDEN

; ddinstall->addreg points here
[IpfwMP.ndi.AddReg]
HKR, Ndi, Service,  0,  IpfwMP

;ddinstall.services section
[IpfwMP.ndi.Services]
AddService = IpfwMP,0x2, IpfwMP.AddService

[IpfwMP.AddService]
ServiceType    = 1 ;SERVICE_KERNEL_DRIVER
StartType      = 3 ;SERVICE_DEMAND_START
ErrorControl   = 1 ;SERVICE_ERROR_NORMAL
ServiceBinary  = %12%\ipfw.sys
AddReg         = IpfwMP.AddService.AddReg

[IpfwMP.AddService.AddReg]
; None

[Strings]
Unipi = "Unipi"
Desc = "Ipfw Miniport"

================================================
FILE: binary/testme.bat
================================================
:: Demo/smoke test for the ipfw+dummynet Windows port.
:: Runs a sequence of ipfw configurations (delay, bandwidth, random loss,
:: rule parsing) and exercises each one with ping or wget.
:: Lines starting with @ are not echoed; the ipfw/ping/wget commands are
:: left without @ so they are shown before they run.
@echo on
@set CYGWIN=nodosfilewarning

:: Start from an empty ruleset and no pipes.
@ipfw -q flush
@ipfw -q pipe flush
@echo ######################################################################
@echo ## Setting delay to 100ms for both incoming and outgoing ip packets ##
@echo ## and sending 4 echo request to Google                             ##
@echo ######################################################################
:: Pipe 3 carries all IP traffic; its parameters are changed in the next
:: stages without touching the rule that feeds it.
ipfw pipe 3 config delay 100ms
ipfw add pipe 3 ip from any to any
ipfw pipe show
ping -n 4 www.google.it

@echo ##############################################
@echo ## Raising delay to 300ms and pinging again ##
@echo ##############################################
ipfw pipe 3 config delay 300ms
ipfw pipe show
ping -n 4 www.google.com

@echo ##################################
@echo ## Shaping bandwidth to 500kbps ##
@echo ##################################
ipfw pipe 3 config bw 500Kbit/s
ipfw pipe show
:: The download is only a traffic source; the fetched file "1m" is removed
:: right after.
wget http://info.iet.unipi.it/~luigi/1m
@del 1m

@echo ###################################
@echo ## Lowering bandwidth to 250kbps ##
@echo ###################################
ipfw pipe 3 config bw 250Kbit/s
ipfw pipe show
wget http://info.iet.unipi.it/~luigi/1m
@del 1m

@echo ###################################################################
@echo ## Simulating 50 percent packet loss and sending 15 echo request ##
@echo ###################################################################
:: Drop the pipe setup first so only the probabilistic deny rule is active.
@ipfw -q flush
@ipfw -q pipe flush
ipfw add prob 0.5 deny proto icmp in
ping -n 15 -w 300 www.google.it
@ipfw -q flush

@echo ##############################
@echo ## Showing SYSCTL variables ##
@echo ##############################
ipfw sysctl -a

@echo #############################################
@echo ## Inserting rules to test command parsing ##
@echo #############################################
:: The rules below are only inserted and listed, then flushed; no traffic
:: is generated against them.
@echo -- dropping all packets of a specific protocol --
ipfw add deny proto icmp
@echo -- dropping packets of all protocols except a specific one --
ipfw add deny not proto tcp
@echo -- dropping all packets from IP x to IP y --
ipfw add deny src-ip 1.2.3.4 dst-ip 5.6.7.8
@echo -- dropping all ssh outgoing connections --
ipfw add deny out dst-port 22
@echo -- allowing already opened browser connections --
@echo -- but preventing new ones from being opened   --
ipfw add deny out proto tcp dst-port 80 tcpflags syn
@echo -- another way to do the same thing --
ipfw add allow out proto tcp dst-port 80 established
ipfw add deny out proto tcp dst-port 80 setup
@echo -- checking what rules have been inserted --
ipfw -c show
@ipfw -q flush

@echo #################
@echo ## Cleaning up ##
@echo #################
ipfw -q flush
ipfw -q pipe flush

:: Keep the console window open until a key is pressed.
pause


================================================
FILE: configuration/README
================================================
This directory contains some ipfw configurations and a script
to safely change the firewall rules.

The firewall configuration comes from the FreeBSD initial script.
The change_rules_linux.sh script changes the ipfw rules and,
in case of a misconfiguration that prevents reaching the remote
host, restores the old ruleset.

To configure the firewall behavior, edit the ipfw.conf file and 
execute the ./change_rules_linux.sh script.

The ipfw program executable should be located in /sbin (XXX)

XXX seems we use something which is not compatible with dash


================================================
FILE: configuration/change_rules.sh
================================================
#!/bin/sh
#
# Copyright (c) 2000 Alexandre Peixoto
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
# $FreeBSD: src/share/examples/ipfw/change_rules.sh,v 1.6 2003/09/07 07:52:56 jmg Exp $

# Change ipfw(8) rules with safety guarantees for remote operation
#
# Invoke this script to edit ${firewall_script}. It will call ${EDITOR},
# or vi(1) if the environment variable is not set, for you to edit
# ${firewall_script}, ask for confirmation, and then run
# ${firewall_script}. You can then examine the output of ipfw list and
# confirm whether you want the new version or not.
#
# If no answer is received in 30 seconds, the previous
# ${firewall_script} is run, restoring the old rules (this assumes ipfw
# flush is present in it).
#
# If the new rules are confirmed, they'll replace ${firewall_script} and
# the previous ones will be copied to ${firewall_script}.{date}. Mail
# will also be sent to root with a unified diff of the rule change.
#
# Unapproved rules are kept in ${firewall_script}.new, and you are
# offered the option of changing them instead of the present rules when
# you call this script.
#
# This script could be improved by using version control
# software.

# XXX on linux /etc/rc.conf defines:
# firewall_type and firewall_script

# Load the system configuration. When /etc/defaults/rc.conf is readable it
# is sourced and source_rc_confs (expected to be defined by that file) is
# called; otherwise a plain /etc/rc.conf is sourced if present.
if [ -r /etc/defaults/rc.conf ]; then
	. /etc/defaults/rc.conf
	source_rc_confs
elif [ -r /etc/rc.conf ]; then
	. /etc/rc.conf
fi

# Fall back to vi and more when the environment does not choose them.
EDITOR=${EDITOR:-/usr/bin/vi}
PAGER=${PAGER:-/usr/bin/more}

# on linux the default mktemp invocation behavior
# is different, we should change the temporary file creation
# "$0" and the template are quoted so that a script path containing
# whitespace is not word-split before basename/mktemp see it.
tempfoo=`basename "$0"`
#TMPFILE=`mktemp -t ${tempfoo}` || exit 1
TMPFILE=`mktemp -t "${tempfoo}.XXXXX"` || exit 1

get_yes_no() {
	# Ask the question in $1 until a y/n answer arrives. The verdict is
	# left in the global variable "a" as "Yes" or "No". A failed read
	# (the 30 second timeout, or end of input) counts as "No".
	# NOTE(review): "read -t" is not available in every /bin/sh -- confirm
	# the shell in use supports it, otherwise every call answers "No".
	while :
	do
		echo -n "$1 (Y/N) ? "
		if ! read -t 30 a; then
			a="No"
			return
		fi
		case $a in
		[Yy])
			a="Yes"
			return
			;;
		[Nn])
			a="No"
			return
			;;
		esac
		# Any other answer: ask again.
	done
}

restore_rules() {
	# Re-run the installed firewall script to put the previous rules back.
	# nohup plus /dev/null on all stdio keeps it running even if the
	# session that started it is gone. Then remove the temporary output
	# file and abort with status 1. Also installed as the hangup handler.
	nohup sh "${firewall_script}" </dev/null >/dev/null 2>&1
	# -f: do not complain when the temp file is already gone; the quotes
	# keep a path containing whitespace in one piece.
	rm -f "${TMPFILE}"
	exit 1
}

# Decide which file the user edits: for the built-in firewall types the
# firewall script itself is edited; otherwise firewall_type is taken as
# the path of a rules file (rules_edit=yes).
case "${firewall_type}" in
[Cc][Ll][Ii][Ee][Nn][Tt]|\
[Cc][Ll][Oo][Ss][Ee][Dd]|\
[Oo][Pp][Ee][Nn]|\
[Ss][Ii][Mm][Pp][Ll][Ee]|\
[Uu][Nn][Kk][Nn][Oo][Ww][Nn])
	edit_file="${firewall_script}"
	rules_edit=no
	;;
*)
	if [ -r "${firewall_type}" ]; then
		edit_file="${firewall_type}"
		rules_edit=yes
	fi
	;;
esac

# Without a file to edit, every step below would operate on ".new" and
# similar bogus names, so stop here.
if [ -z "${edit_file}" ]; then
	echo "no rules file to edit: check firewall_type and firewall_script" >&2
	rm -f "${TMPFILE}"
	exit 1
fi

# Work on a ".new" copy; an existing one is reused only if the user agrees.
if [ -f "${edit_file}.new" ]; then
	get_yes_no "A new rules file already exists, do you want to use it"
	[ "$a" = 'No' ] && cp "${edit_file}" "${edit_file}.new"
else
	cp "${edit_file}" "${edit_file}.new"
fi

# If the session hangs up while the new rules are active, go back to the
# old ones. POSIX trap takes the signal name without the SIG prefix;
# "SIGHUP" is rejected by some /bin/sh implementations.
trap restore_rules HUP

# EDITOR is left unquoted on purpose so it may carry options.
${EDITOR} "${edit_file}.new"

get_yes_no "Do you want to install the new rules"

if [ "$a" = 'No' ]; then
	# The edited copy stays in ${edit_file}.new for a later run; only
	# the temporary output file is removed.
	rm -f "${TMPFILE}"
	exit 1
fi

cat <<!
The rules will be changed now. If the message 'Type y to keep the new
rules' does not appear on the screen or the y key is not pressed in 30
seconds, the original rules will be restored.
The TCP/IP connections might be broken during the change. If so, restore
the ssh/telnet connection being used.
!

# Apply the candidate rules, capturing all output in the temp file.
# With a rules file, the firewall script is run with the ".new" file as its
# argument; otherwise the edited copy of the script itself is run.
if [ "${rules_edit}" = yes ]; then
	nohup sh "${firewall_script}" "${firewall_type}.new" \
	    < /dev/null > "${TMPFILE}" 2>&1
else
	nohup sh "${firewall_script}.new" \
	    < /dev/null > "${TMPFILE}" 2>&1
fi
sleep 2;
get_yes_no "Would you like to see the resulting new rules"
[ "$a" = 'Yes' ] && ${PAGER} "${TMPFILE}"
# No answer within the timeout, or anything but yes, restores the old rules.
get_yes_no "Type y to keep the new rules"
[ "$a" != 'Yes' ] && restore_rules

# Confirmed: keep a dated copy of the old file and promote the new one.
DATE=`date "+%Y%m%d%H%M"`
cp "${edit_file}" "${edit_file}.$DATE"
mv "${edit_file}.new" "${edit_file}"
cat <<!
The new rules are now installed. The previous rules have been preserved in
the file ${edit_file}.$DATE
!
# Mail root a unified diff of the change.
diff -F "^# .*[A-Za-z]" -u "${edit_file}.$DATE" "${edit_file}" \
    | mail -s "`hostname` Firewall rule change" root
rm -f "${TMPFILE}"
exit 0


================================================
FILE: configuration/change_rules_linux.sh
================================================
#!/bin/sh
#
# marta
# linux wrapper for the FreeBSD change rules program
# This file loads the Linux configuration and then calls the
# original change rules program

# Pull in the Linux-specific settings when ./ipfw.conf is readable.
[ -r ./ipfw.conf ] && . ./ipfw.conf

# Hand over to the FreeBSD script; it is sourced, so it sees the variables
# set above and runs in this same shell.
. ./change_rules.sh


================================================
FILE: configuration/ipfw.conf
================================================
# ipfw and dummynet configuration file for linux
# XXX TO BE TESTED ON LINUX

# The firewall_type variable is used to configure the firewall behavior.
# A detailed description of how each of the following types works is in rc.firewall
#
#   open        - will allow anyone in
#   client      - will try to protect just this machine
#   simple      - will try to protect a whole network
#   closed      - totally disables IP services except via lo0 interface
#   workstation - will try to protect just this machine using statefull
#                 firewalling. See below for rc.conf variables used
#   UNKNOWN     - disables the loading of firewall rules.
#   filename    - will load the rules in the given filename (full path required)

# firewall_type=open

# The following file is an example of how to use a filename to define a firewall
# and how to configure a simple dummynet pipe to ... XXX shape traffic... etc...
# NOTE: this is an absolute path into one developer's checkout; change it to
# the full path of ipfw.rules on the local machine before use.
firewall_type=/home/marta/SVN/ports-luigi/dummynet-branches/ipfw3/configuration/ipfw.rules

# Environment variables expected by the change rules script
# (these assignments replace any EDITOR/PAGER already set in the environment)
EDITOR=/usr/bin/vi
PAGER=/bin/more

# The following variable should point to the rc.firewall script
# XXX TEST
# NOTE: same caveat as firewall_type above -- the path below is specific to
# one developer's checkout and must be edited.
#firewall_script=`echo "please edit the firewall_script variable in ipfw.conf"`;
firewall_script="/home/marta/SVN/ports-luigi/dummynet-branches/ipfw3/configuration/rc.firewall"


================================================
FILE: configuration/ipfw.rules
================================================
# This is a simple configuration file
# add dummynet pipes and a firewall section
# Each non-comment line is an ipfw command without the leading "ipfw".

# flush all rules ...
# (left commented out here; uncomment to have this file flush first)
# flush

# dummynet configuration
# (placeholder: no pipes are configured in this example)

# firewall configuration
# Rule 1 matches every packet and rule 65000 denies every packet; custom
# rules are meant to go in between, where the "..." placeholder is.
add 1 allow all from any to any
# ...
add 65000 deny all from any to any


================================================
FILE: configuration/rc.firewall
================================================
#!/bin/sh -
# Copyright (c) 1996  Poul-Henning Kamp
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
# $FreeBSD: src/etc/rc.firewall,v 1.52.4.1 2008/01/29 00:22:32 dougb Exp $
#

#
# Setup system for ipfw(4) firewall service.
#

# Load the configuration variables, unless the caller has already done so
# (signalled by a non-empty source_rc_confs_defined).
if [ -n "${source_rc_confs_defined}" ]; then
	:	# nothing to do, configuration is already in place
elif [ -r /etc/defaults/rc.conf ]; then
	. /etc/defaults/rc.conf
	source_rc_confs
elif [ -r /etc/rc.conf ]; then
	. /etc/rc.conf
fi

############
# Define the firewall type in /etc/rc.conf.  Valid values are:
#   open        - will allow anyone in
#   client      - will try to protect just this machine
#   simple      - will try to protect a whole network
#   closed      - totally disables IP services except via lo0 interface
#   workstation - will try to protect just this machine using statefull
#		  firewalling. See below for rc.conf variables used
#   UNKNOWN     - disables the loading of firewall rules.
#   filename    - will load the rules in the given filename (full path required)
#
# For ``client'' and ``simple'' the entries below should be customized
# appropriately.

############
#
# If you don't know enough about packet filtering, we suggest that you
# take time to read this book:
#
#	Building Internet Firewalls, 2nd Edition
#	Brent Chapman and Elizabeth Zwicky
#
#	O'Reilly & Associates, Inc
#	ISBN 1-56592-871-7
#	http://www.ora.com/
#	http://www.oreilly.com/catalog/fire2/
#
# For a more advanced treatment of Internet Security read:
#
#	Firewalls and Internet Security: Repelling the Wily Hacker, 2nd Edition
#	William R. Cheswick, Steven M. Bellowin, Aviel D. Rubin
#
#	Addison-Wesley / Prentice Hall
#	ISBN 0-201-63466-X
#	http://www.pearsonhighered.com/
#	http://www.pearsonhighered.com/educator/academic/product/0,3110,020163466X,00.html
#

# Install the three fixed loopback rules through ${fwcmd}, which must be
# set before this function is called.
setup_loopback () {
	############
	# Only in rare cases do you want to change these rules
	#
	# 100: pass everything that goes through the lo0 interface.
	# 200/300: deny anything else addressed to, or coming from, 127.0.0.0/8.
	${fwcmd} add 100 pass all from any to any via lo0
	${fwcmd} add 200 deny all from any to 127.0.0.0/8
	${fwcmd} add 300 deny ip from 127.0.0.0/8 to any
}

# A first command-line argument, when given, overrides firewall_type.
[ -z "${1}" ] || firewall_type="${1}"

############
# Choose the ipfw command line: -q is appended when firewall_quiet is
# "yes" in any letter case.
#
fwcmd="/sbin/ipfw"
case ${firewall_quiet} in
[Yy][Ee][Ss])
	fwcmd="${fwcmd} -q"
	;;
esac

############
# Start from an empty ruleset, then install the loopback rules.
#
${fwcmd} -f flush
setup_loopback

############
# Network Address Translation.  All packets are passed to natd(8)
# before they encounter your remaining rules.  The firewall rules
# will then be run again on each packet after translation by natd
# starting at the rule number following the divert rule.
#
# For ``simple'' firewall type the divert rule should be put to a
# different place to not interfere with address-checking rules.
#
# Only the open and client types get their NAT rule here, as rule 50,
# i.e. ahead of the loopback rules 100-300.
case ${firewall_type} in
[Oo][Pp][Ee][Nn]|[Cc][Ll][Ii][Ee][Nn][Tt])
	# natd(8): divert IPv4 traffic on natd_interface when natd_enable is
	# yes and an interface is configured.
	case ${natd_enable} in
	[Yy][Ee][Ss])
		if [ -n "${natd_interface}" ]; then
			${fwcmd} add 50 divert natd ip4 from any to any via ${natd_interface}
		fi
		;;
	esac
	# ipfw nat: configure instance 123 on firewall_nat_interface and send
	# IPv4 traffic on that interface through it.
	case ${firewall_nat_enable} in
	[Yy][Ee][Ss])
		if [ -n "${firewall_nat_interface}" ]; then
			${fwcmd} nat 123 config if ${firewall_nat_interface} log
			${fwcmd} add 50 nat 123 ip4 from any to any via ${firewall_nat_interface}
		fi
		;;
	esac
esac

############
# If you just configured ipfw in the kernel as a tool to solve network
# problems or you just want to disallow some particular kinds of traffic
# then you will want to change the default policy to open.  You can also
# do this as your only action by setting the firewall_type to ``open''.
#
# ${fwcmd} add 65000 pass all from any to any


# Prototype setups.
#
# Dispatch on the firewall type (matched case-insensitively); each arm
# installs the rules of one prototype setup.
case ${firewall_type} in
[Oo][Pp][Ee][Nn])
	# Open: a single catch-all pass rule at 65000.
	${fwcmd} add 65000 pass all from any to any
	;;

[Cc][Ll][Ii][Ee][Nn][Tt])
	############
	# This is a prototype setup that will protect your system somewhat
	# against people from outside your own network.
	############

	# set these to your network and netmask and ip
	# (the values below are placeholders and must be edited)
	net="192.0.2.0"
	mask="255.255.255.0"
	ip="192.0.2.1"

	# Allow any traffic to or from my own net.
	${fwcmd} add pass all from ${ip} to ${net}:${mask}
	${fwcmd} add pass all from ${net}:${mask} to ${ip}

	# Allow TCP through if setup succeeded
	${fwcmd} add pass tcp from any to any established

	# Allow IP fragments to pass through
	${fwcmd} add pass all from any to any frag

	# Allow setup of incoming email
	${fwcmd} add pass tcp from any to me 25 setup

	# Allow setup of outgoing TCP connections only
	${fwcmd} add pass tcp from me to any setup

	# Disallow setup of all other TCP connections
	${fwcmd} add deny tcp from any to any setup

	# Allow DNS queries out in the world
	${fwcmd} add pass udp from me to any 53 keep-state

	# Allow NTP queries out in the world
	${fwcmd} add pass udp from me to any 123 keep-state

	# Everything else is denied by default, unless the
	# IPFIREWALL_DEFAULT_TO_ACCEPT option is set in your kernel
	# config file.
	;;

[Ss][Ii][Mm][Pp][Ll][Ee])
	############
	# This is a prototype setup for a simple firewall.  Configure this
	# machine as a DNS and NTP server, and point all the machines
	# on the inside at this machine for those services.
	############

	# set these to your outside interface network and netmask and ip
	# (the values below are placeholders and must be edited)
	oif="ed0"
	onet="192.0.2.0"
	omask="255.255.255.240"
	oip="192.0.2.1"

	# set these to your inside interface network and netmask and ip
	# (the values below are placeholders and must be edited)
	iif="ed1"
	inet="192.0.2.16"
	imask="255.255.255.240"
	iip="192.0.2.17"

	# Stop spoofing: inside addresses must not arrive on the outside
	# interface, and outside addresses must not arrive on the inside one.
	${fwcmd} add deny all from ${inet}:${imask} to any in via ${oif}
	${fwcmd} add deny all from ${onet}:${omask} to any in via ${iif}

	# Stop RFC1918 nets on the outside interface
	# (this first group checks the destination address)
	${fwcmd} add deny all from any to 10.0.0.0/8 via ${oif}
	${fwcmd} add deny all from any to 172.16.0.0/12 via ${oif}
	${fwcmd} add deny all from any to 192.168.0.0/16 via ${oif}

	# Stop draft-manning-dsua-03.txt (1 May 2000) nets (includes RESERVED-1,
	# DHCP auto-configuration, NET-TEST, MULTICAST (class D), and class E)
	# on the outside interface
	${fwcmd} add deny all from any to 0.0.0.0/8 via ${oif}
	${fwcmd} add deny all from any to 169.254.0.0/16 via ${oif}
	${fwcmd} add deny all from any to 192.0.2.0/24 via ${oif}
	${fwcmd} add deny all from any to 224.0.0.0/4 via ${oif}
	${fwcmd} add deny all from any to 240.0.0.0/4 via ${oif}

	# Network Address Translation.  This rule is placed here deliberately
	# so that it does not interfere with the surrounding address-checking
	# rules.  If for example one of your internal LAN machines had its IP
	# address set to 192.0.2.1 then an incoming packet for it after being
	# translated by natd(8) would match the `deny' rule above.  Similarly
	# an outgoing packet originated from it before being translated would
	# match the `deny' rule below.
	case ${natd_enable} in
	[Yy][Ee][Ss])
		if [ -n "${natd_interface}" ]; then
			${fwcmd} add divert natd all from any to any via ${natd_interface}
		fi
		;;
	esac

	# Stop RFC1918 nets on the outside interface
	# (this second group checks the source address)
	${fwcmd} add deny all from 10.0.0.0/8 to any via ${oif}
	${fwcmd} add deny all from 172.16.0.0/12 to any via ${oif}
	${fwcmd} add deny all from 192.168.0.0/16 to any via ${oif}

	# Stop draft-manning-dsua-03.txt (1 May 2000) nets (includes RESERVED-1,
	# DHCP auto-configuration, NET-TEST, MULTICAST (class D), and class E)
	# on the outside interface
	${fwcmd} add deny all from 0.0.0.0/8 to any via ${oif}
	${fwcmd} add deny all from 169.254.0.0/16 to any via ${oif}
	${fwcmd} add deny all from 192.0.2.0/24 to any via ${oif}
	${fwcmd} add deny all from 224.0.0.0/4 to any via ${oif}
	${fwcmd} add deny all from 240.0.0.0/4 to any via ${oif}

	# Allow TCP through if setup succeeded
	${fwcmd} add pass tcp from any to any established

	# Allow IP fragments to pass through
	${fwcmd} add pass all from any to any frag

	# Allow setup of incoming email
	${fwcmd} add pass tcp from any to ${oip} 25 setup

	# Allow access to our DNS
	${fwcmd} add pass tcp from any to ${oip} 53 setup
	${fwcmd} add pass udp from any to ${oip} 53
	${fwcmd} add pass udp from ${oip} 53 to any

	# Allow access to our WWW
	${fwcmd} add pass tcp from any to ${oip} 80 setup

	# Reject&Log all setup of incoming connections from the outside
	${fwcmd} add deny log tcp from any to any in via ${oif} setup

	# Allow setup of any other TCP connection
	${fwcmd} add pass tcp from any to any setup

	# Allow DNS queries out in the world
	${fwcmd} add pass udp from ${oip} to any 53 keep-state

	# Allow NTP queries out in the world
	${fwcmd} add pass udp from ${oip} to any 123 keep-state

	# Everything else is denied by default, unless the
	# IPFIREWALL_DEFAULT_TO_ACCEPT option is set in your kernel
	# config file.
	;;

[Ww][Oo][Rr][Kk][Ss][Tt][Aa][Tt][Ii][Oo][Nn])
	# Stateful setup protecting just this machine.
	#
	# Configuration:
	#  firewall_myservices:		List of TCP ports on which this host
	#				 offers services.
	#  firewall_allowservices:	List of IPs which have access to
	#				 $firewall_myservices.
	#  firewall_trusted:		List of IPs which have full access
	#				 to this host. Be very careful
	#				 when setting this. This option can
	#				 seriously degrade the level of
	#				 protection provided by the firewall.
	#  firewall_logdeny:		Boolean (YES/NO) specifying if the
	#				 default denied packets should be
	#				 logged (in /var/log/security).
	#  firewall_nologports:		List of TCP/UDP ports for which
	#				 denied incoming packets are not
	#				 logged.

	# Allow packets for which a state has been built.
	${fwcmd} add check-state

	# For services permitted below.
	${fwcmd} add pass tcp  from me to any established

	# Allow any connection out, adding state for each.
	${fwcmd} add pass tcp  from me to any setup keep-state
	${fwcmd} add pass udp  from me to any       keep-state
	${fwcmd} add pass icmp from me to any       keep-state

	# Allow DHCP.
	${fwcmd} add pass udp  from 0.0.0.0 68 to 255.255.255.255 67 out
	${fwcmd} add pass udp  from any 67     to me 68 in
	${fwcmd} add pass udp  from any 67     to 255.255.255.255 68 in
	# Some servers will ping the IP while trying to decide if it's
	# still in use.
	${fwcmd} add pass icmp from any to any icmptype 8

	# Allow "mandatory" ICMP in.
	${fwcmd} add pass icmp from any to any icmptype 3,4,11

	# Add permits for this workstation's published services below.
	# Only IPs and nets in firewall_allowservices are allowed in.
	# If you really wish to let anyone use services on your
	# workstation, then set "firewall_allowservices='any'" in /etc/rc.conf
	#
	# Note: We don't use keep-state as that would allow DoS of
	#       our statetable.
	#       You can add 'keep-state' to the lines for slightly
	#       better performance if you feel that DoS of your
	#       workstation won't be a problem.
	#
	for i in ${firewall_allowservices} ; do
	  for j in ${firewall_myservices} ; do
	    ${fwcmd} add pass tcp from $i to me $j
	  done
	done

	# Allow all connections from trusted IPs.
	# Playing with the content of firewall_trusted could seriously
	# degrade the level of protection provided by the firewall.
	for i in ${firewall_trusted} ; do
	  ${fwcmd} add pass ip from $i to me
	done

	${fwcmd} add 65000 count ip from any to any

	# Drop packets to ports where we don't want logging
	for i in ${firewall_nologports} ; do
	  ${fwcmd} add deny { tcp or udp } from any to any $i in
	done

	# Broadcasts and multicasts
	${fwcmd} add deny ip  from any to 255.255.255.255
	${fwcmd} add deny ip  from any to 224.0.0.0/24 in	# XXX

	# Noise from routers
	${fwcmd} add deny udp from any to any 520 in

	# Noise from webbrowsing.
	# The stateful filter is a bit aggressive, and will cause some
	#  connection teardowns to be logged.
	${fwcmd} add deny tcp from any 80,443 to any 1024-65535 in

	# Deny and (if wanted) log the rest unconditionally.
	# firewall_logdeny is matched with the same case-insensitive "yes"
	# pattern as the other yes/no settings in this script; a case
	# statement also copes with an unset value or one containing blanks.
	log=""
	case ${firewall_logdeny} in
	[Yy][Ee][Ss])
	  log="log logamount 500"	# The default of 100 is too low.
	  sysctl net.inet.ip.fw.verbose=1 >/dev/null
	  ;;
	esac
	${fwcmd} add deny $log ip from any to any
	;;

[Cc][Ll][Oo][Ss][Ee][Dd])
	# Closed: a single catch-all deny rule at 65000.
	${fwcmd} add 65000 deny ip from any to any
	;;
[Uu][Nn][Kk][Nn][Oo][Ww][Nn])
	# UNKNOWN: no further rules are loaded.
	;;
*)
	# Anything else is taken as the path of a rules file, handed to ipfw
	# together with firewall_flags. The path is quoted, as in the
	# readability test, so a name containing blanks stays one argument;
	# firewall_flags stays unquoted because it may hold several options.
	if [ -r "${firewall_type}" ]; then
		${fwcmd} ${firewall_flags} "${firewall_type}"
	fi
	;;
esac


================================================
FILE: glue.h
================================================
/*
 * Copyright (c) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * $Id: glue.h 12501 2014-01-10 01:09:14Z luigi $
 *
 * glue code to adapt the FreeBSD version to linux and windows,
 * userland and kernel.
 * This is included before any other headers, so we do not have
 * a chance to override any #define that should appear in other
 * headers.
 * First handle headers for userland and kernel. Then common code
 * (including headers that require a specific order of inclusion),
 * then the user- and kernel- specific parts.
 */
 
#if defined __FreeBSD__
#define _GLUE_H
#endif /* __FreeBSD__ */
#ifndef _GLUE_H
#define	_GLUE_H


/*
 * common definitions to allow portability
 */
#ifndef __FBSDID
#define __FBSDID(x)
#endif  /* FBSDID */

/*
 * Part 1: headers. Userland builds (KERNEL_MODULE undefined) pull in the
 * libc headers; kernel-module builds define the feature macros and, on
 * linux, include the kernel headers in the order they require.
 */
#ifndef KERNEL_MODULE	/* Userland headers */

/* Cygwin builds are handled as Windows builds: make sure _WIN32 is set. */
#if defined(__CYGWIN32__) || defined(__CYGWIN__)
#if !defined(_WIN32)
#define _WIN32
#endif
#endif

#if defined(TCC) && defined(_WIN32)
#include <tcc_glue.h>
#endif /* TCC */

#include <stdint.h>	/* linux needs it in addition to sys/types.h */
#include <sys/types.h>	/* for size_t */
#include <sys/ioctl.h>
#include <time.h>
#include <errno.h>
#ifdef __linux__
#include <netinet/ether.h>	/* linux only 20111031 */
#endif

#else /* KERNEL_MODULE, kernel headers */

/*
 * INET, _KERNEL and KLD_MODULE are defined empty. Their descriptions are
 * C comments: text introduced by '#' after the macro name would become
 * part of the macro's replacement list.
 */
#define	INET		/* want inet support */
#ifdef __linux__

#include <linux/version.h>

#define ifnet		net_device	/* remap */
#define	_KERNEL		/* make kernel structure visible */
#define	KLD_MODULE	/* add the module glue */

#include <linux/stddef.h>	/* linux kernel */
#include <linux/types.h>	/* linux kernel */

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)	// or 2.4.x
#include <linux/linkage.h>	/* linux/msg.h require this */
#include <linux/netdevice.h>	/* just MAX_ADDR_LEN 8 on 2.4 32 on 2.6, also brings in byteorder */
#endif

/* on 2.6.22, msg.h requires spinlock_types.h */
/* XXX spinlock_type.h was introduced in 2.6.14 */
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,13) && \
	LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
#include <linux/spinlock_types.h>
#endif
/* XXX m_type define conflict with include/sys/mbuf.h,
 * so early include msg.h (to be solved)
*/
#include <linux/msg.h>

#include <linux/list.h>
#include <linux/in.h>		/* struct in_addr */
#include <linux/in6.h>		/* struct in6_addr */
#include <linux/icmp.h>
/*
 * LIST_HEAD in queue.h conflict with linux/list.h
 * some previous linux include need list.h definition
 */
#undef LIST_HEAD

#define	IF_NAMESIZE	(16)
typedef	uint32_t	in_addr_t;

/* Kernel code keeps calling printf(); route it to printk at KERN_ERR. */
#define printf(fmt, arg...) printk(KERN_ERR fmt, ##arg)
#endif	/* __linux__ */

#endif /* KERNEL_MODULE end of kernel headers */


/*
 * Part 2: common userland and kernel definitions
 */

#ifndef ETHER_ADDR_LEN
#define ETHER_ADDR_LEN (6+0)       /* length of an Ethernet address */
#endif

#define ICMP6_DST_UNREACH_NOROUTE       0       /* no route to destination */
#define ICMP6_DST_UNREACH_ADMIN         1       /* administratively prohibited */
#define ICMP6_DST_UNREACH_ADDR          3       /* address unreachable */
#define ICMP6_DST_UNREACH_NOPORT        4       /* port unreachable */

/*
 * linux: sysctl are mapped into /sys/module/ipfw_mod parameters
 * windows: they are emulated via get/setsockopt
 */
#define CTLFLAG_RD		1
#define CTLFLAG_RDTUN	1
#define CTLFLAG_RW		2
#define CTLFLAG_SECURE3	0 // unsupported
#define CTLFLAG_VNET    0	/* unsupported */

/* if needed, queue.h must be included here after list.h */

/*
 * struct thread is used in linux and windows kernel.
 * In windows, we need to emulate the sockopt interface
 * so also the userland needs to have the struct sockopt defined.
 * In order to achieve 64 bit compatibility, padding has been inserted.
 */
/* Minimal stand-in for the thread structure: two opaque pointers. */
struct thread {
	void	*sopt_td;
	void	*td_ucred;
};

/* Direction of a socket option request. */
enum sopt_dir {
	SOPT_GET,
	SOPT_SET
};

/*
 * Description of one [gs]etsockopt request.
 * When _X64EMU is defined, two pointer-sized pad members precede sopt_val
 * and two more follow sopt_valsize.
 */
struct sockopt {
	enum sopt_dir	sopt_dir;	/* is this a get or a set? */
	int		sopt_level;	/* second arg of [gs]etsockopt */
	int		sopt_name;	/* third arg of [gs]etsockopt */
#ifdef _X64EMU
	void		*pad1;
	void		*pad2;
#endif
	void		*sopt_val;	/* fourth arg of [gs]etsockopt */
	size_t		sopt_valsize;	/* (almost) fifth arg of [gs]etsockopt */
#ifdef _X64EMU
	void		*pad3;
	void		*pad4;
#endif
	struct thread	*sopt_td;	/* calling thread or null if kernel */
};


#define INET_ADDRSTRLEN		(16)	/* missing in netinet/in.h */

/*
 * List of values used for set/getsockopt options.
 * The base value on FreeBSD is defined as a macro,
 * if not available we will use our own enum.
 * The TABLE_BASE value is used in the kernel.
 */
#ifndef IP_FW_TABLE_ADD
#define _IPFW_SOCKOPT_BASE	100	/* 40 on freebsd */
/*
 * Socket option codes, each written as an explicit offset from
 * _IPFW_SOCKOPT_BASE. Offsets 6, 7 and 23 are not assigned.
 */
enum ipfw_msg_type {
	/* lookup tables */
	IP_FW_TABLE_ADD		= _IPFW_SOCKOPT_BASE + 0,
	IP_FW_TABLE_DEL		= _IPFW_SOCKOPT_BASE + 1,
	IP_FW_TABLE_FLUSH	= _IPFW_SOCKOPT_BASE + 2,
	IP_FW_TABLE_GETSIZE	= _IPFW_SOCKOPT_BASE + 3,
	IP_FW_TABLE_LIST	= _IPFW_SOCKOPT_BASE + 4,
	IP_FW_DYN_GET		= _IPFW_SOCKOPT_BASE + 5,	/* new addition */

	/* IP_FW3 and IP_DUMMYNET3 are the new API */
	IP_FW3			= _IPFW_SOCKOPT_BASE + 8,
	IP_DUMMYNET3		= _IPFW_SOCKOPT_BASE + 9,

	/* firewall rules */
	IP_FW_ADD		= _IPFW_SOCKOPT_BASE + 10,
	IP_FW_DEL		= _IPFW_SOCKOPT_BASE + 11,
	IP_FW_FLUSH		= _IPFW_SOCKOPT_BASE + 12,
	IP_FW_ZERO		= _IPFW_SOCKOPT_BASE + 13,
	IP_FW_GET		= _IPFW_SOCKOPT_BASE + 14,
	IP_FW_RESETLOG		= _IPFW_SOCKOPT_BASE + 15,

	/* nat */
	IP_FW_NAT_CFG		= _IPFW_SOCKOPT_BASE + 16,
	IP_FW_NAT_DEL		= _IPFW_SOCKOPT_BASE + 17,
	IP_FW_NAT_GET_CONFIG	= _IPFW_SOCKOPT_BASE + 18,
	IP_FW_NAT_GET_LOG	= _IPFW_SOCKOPT_BASE + 19,

	/* dummynet */
	IP_DUMMYNET_CONFIGURE	= _IPFW_SOCKOPT_BASE + 20,
	IP_DUMMYNET_DEL		= _IPFW_SOCKOPT_BASE + 21,
	IP_DUMMYNET_FLUSH	= _IPFW_SOCKOPT_BASE + 22,
	/* base + 23 is skipped (63 with the freebsd base of 40) */
	IP_DUMMYNET_GET		= _IPFW_SOCKOPT_BASE + 24,

	_IPFW_SOCKOPT_END	= _IPFW_SOCKOPT_BASE + 25
};
#endif /* IP_FW_TABLE_ADD */

/*
 * Part 3: userland stuff
 */

#ifndef KERNEL_MODULE

/*
 * internal names in struct in6_addr (netinet/in6.h) differ,
 * so we remap the FreeBSD names to the platform-specific ones.
 */
/*
 * Map the FreeBSD member names __u6_addr / __u6_addr32 onto the local
 * ones. In the non-Windows branch in6_u is itself remapped, so __u6_addr
 * ends up expanding to __in6_u there.
 */
#ifndef _WIN32
#define __u6_addr	in6_u
#define __u6_addr32	u6_addr32
#define in6_u __in6_u	/* missing type for ipv6 (linux 2.6.28) */
#else	/* _WIN32 uses different naming */
#define __u6_addr	__u6
#define __u6_addr32	__s6_addr32
#endif	/* _WIN32 */

/* missing in linux netinet/ip.h */
#define IPTOS_ECN_ECT0	0x02    /* ECN-capable transport (0) */
#define IPTOS_ECN_CE	0x03    /* congestion experienced */

/* defined in freebsd netinet/icmp6.h */
#define ICMP6_MAXTYPE	201

/* on freebsd sys/socket.h pf specific */
#define NET_RT_IFLIST	3               /* survey interface list */

#if defined(__linux__) || defined(__CYGWIN32__) || defined(__CYGWIN__)
/*
 * On FreeBSD this structure comes from net/if.h (XXX used).
 * Only the ifi_mtu member is declared in this replacement.
 */
struct if_data {
	/* ... */
        u_long ifi_mtu;	/* maximum transmission unit */
};

/*
 * Message format for use in obtaining information about interfaces
 * from getkerninfo and the routing socket.
 * This is used in nat.c.
 * The trailing ifm_data member uses the struct if_data declared in this
 * header when building on Linux/Cygwin.
 */
struct if_msghdr {
        u_short ifm_msglen;     /* to skip over unknown messages */
        u_char  ifm_version;    /* future binary compatibility */
        u_char  ifm_type;       /* message type */
        int     ifm_addrs;      /* like rtm_addrs */
        int     ifm_flags;      /* value of if_flags */
        u_short ifm_index;      /* index for associated ifp */
        struct  if_data ifm_data;/* stats and other ifdata */
};

/*
 * Message format for use in obtaining information about interface
 * addresses from getkerninfo and the routing socket.
 * The first six members mirror those of struct if_msghdr (with an
 * "ifam_" prefix); ifam_metric takes the place of the if_data member.
 */
struct ifa_msghdr {
        u_short ifam_msglen;    /* to skip over unknown messages */
        u_char  ifam_version;   /* future binary compatibility */
        u_char  ifam_type;      /* message type */
        int     ifam_addrs;     /* like rtm_addrs */
        int     ifam_flags;     /* value of ifa_flags */
        u_short ifam_index;     /* index for associated ifp */
        int     ifam_metric;    /* value of ifa_metric */
};

#ifndef NO_RTM	/* conflicting with netlink */
/* missing in net/route.h */
#define RTM_VERSION     5       /* Up the ante and ignore older versions */
#define RTM_IFINFO      0xe     /* iface going up/down etc. */
#define RTM_NEWADDR     0xc     /* address being added to iface */
#define RTA_IFA         0x20    /* interface addr sockaddr present */
#endif	/* NO_RTM */

/*
 * SA_SIZE is used in the userland nat.c (modified version).
 * It evaluates to sizeof(long) when 'sa' is NULL; otherwise it is
 * sizeof(struct sockaddr) rounded up to a multiple of sizeof(long)
 * (the "1 + ((n - 1) | (align - 1))" idiom, which assumes sizeof(long)
 * is a power of two). The contents of *sa are never examined.
 */
#define SA_SIZE(sa)                                             \
    (  (!(sa) ) ?      \
        sizeof(long)            :                               \
        1 + ( (sizeof(struct sockaddr) - 1) | (sizeof(long) - 1) ) )

/* sys/time.h */
/*
 * Getkerninfo clock information structure.
 * Declared here for Linux/Cygwin builds, inside the same platform
 * conditional as the interface message structures above.
 */
struct clockinfo {
        int     hz;             /* clock frequency */
        int     tick;           /* micro-seconds per hz tick */
        int     spare;
        int     stathz;         /* statistics clock frequency */
        int     profhz;         /* profiling clock frequency */
};

/* no sin_len in sockaddr, we only remap in userland */
#define	sin_len	sin_zero[0]

#endif /* Linux/Win */

/*
 * linux does not have a reentrant version of qsort,
 * so we use the FreeBSD stdlib version.
 */
void qsort_r(void *a, size_t n, size_t es, void *thunk,
	int cmp_t(void *, const void *, const void *));

/* prototypes from libutil */
/* humanize_number(3) */
#define HN_DECIMAL              0x01
#define HN_NOSPACE              0x02
#define HN_B                    0x04
#define HN_DIVISOR_1000         0x08

#define HN_GETSCALE             0x10
#define HN_AUTOSCALE            0x20

int     humanize_number(char *_buf, size_t _len, int64_t _number,
            const char *_suffix, int _scale, int _flags);
int     expand_number(const char *_buf, int64_t *_num);

#define setprogname(x)	/* not present in linux */

extern int optreset;	/* not present in linux */

size_t strlcpy(char * dst, const char * src, size_t siz);
long long int strtonum(const char *nptr, long long minval,
	long long maxval, const char **errstr);
 
int sysctlbyname(const char *name, void *oldp, size_t *oldlenp,
	void *newp, size_t newlen);
 

#else /* KERNEL_MODULE */

/*
 * Part 4: kernel stuff
 */

/* linux and windows kernel do not have bcopy ? */
#define bcopy(_s, _d, _l)	memcpy(_d, _s, _l)
/*
 * Definitions useful for the kernel side.
 * struct route_in6 is only a placeholder here: it has a single dummy
 * member so that code naming the type can compile.
 */
struct route_in6 {
	int dummy;
};

#ifdef __linux__

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)	// or 2.4.x
#include <linux/in6.h>
#endif

/* skb_dst() and skb_dst_set() was introduced from linux 2.6.31 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31)
void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst);
struct dst_entry *skb_dst(const struct sk_buff *skb);
#endif

/* The struct flowi changed */
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,38)	// check boundaries
#define flow_daddr fl.u.ip4
#else
#define flow_daddr fl.nl_u.ip4_u
#endif

#endif /* __linux__ */

/* 
 * Do not load prio_heap.h header because of conflicting names
 * with our heap functions defined in include/netinet/ipfw/dn_heap.h
 * However do define struct ptr_heap used in linux 3.12.7 etc.
 */
#define _LINUX_PRIO_HEAP_H
struct ptr_heap;

/* 
 * The following define prevent the ipv6.h header to be loaded.
 * Starting from the 2.6.38 kernel the ipv6.h file, which is included
 * by include/net/inetpeer.h in turn included by net/route.h
 * include the system tcp.h file while we want to include 
 * our include/net/tcp.h instead.
 */
#ifndef _NET_IPV6_H
#define _NET_IPV6_H
/* Copy the whole IPv6 address *a2 into *a1. */
static inline void ipv6_addr_copy(struct in6_addr *a1, const struct in6_addr *a2)
{
        *a1 = *a2;
}
#endif /* _NET_IPV6_H */

#endif	/* KERNEL_MODULE */

/*
 * Part 5: windows specific stuff
 */

#ifdef _WIN32
#ifndef KERNEL_MODULE
/*
 * Build a device I/O control code from its four components (userland
 * only; defined here when not compiling the kernel module).
 * Bit layout of the result: DeviceType from bit 16 up, Access at
 * bit 14, Function at bit 2, Method in the lowest bits.
 */
#define CTL_CODE( DeviceType, Function, Method, Access ) (                 \
    ((DeviceType) << 16) | ((Access) << 14) | ((Function) << 2) | (Method) \
)

#define METHOD_BUFFERED                 0
#define METHOD_IN_DIRECT                1
#define METHOD_OUT_DIRECT               2
#define METHOD_NEITHER                  3
#define FILE_ANY_ACCESS                 0
#define FILE_READ_DATA            ( 0x0001 )    // file & pipe
#define FILE_WRITE_DATA           ( 0x0002 )    // file & pipe
#endif /* !KERNEL_MODULE */

#define FILE_DEVICE_IPFW		0x00654324
#define IP_FW_BASE_CTL			0x840
#define IP_FW_SETSOCKOPT \
	CTL_CODE(FILE_DEVICE_IPFW, IP_FW_BASE_CTL + 1, METHOD_BUFFERED, FILE_WRITE_DATA)
#define IP_FW_GETSOCKOPT \
	CTL_CODE(FILE_DEVICE_IPFW, IP_FW_BASE_CTL + 2, METHOD_BUFFERED, FILE_ANY_ACCESS)

/*********************************
* missing declarations in altq.c *
**********************************/

#define _IOWR(x,y,t) _IOW(x,y,t)

/**********************************
* missing declarations in ipfw2.c *
***********************************/

#define	ICMP_UNREACH_NET		0	/* bad net */
#define	ICMP_UNREACH_HOST		1	/* bad host */
#define	ICMP_UNREACH_PROTOCOL		2	/* bad protocol */
#define	ICMP_UNREACH_PORT		3	/* bad port */
#define	ICMP_UNREACH_NEEDFRAG		4	/* IP_DF caused drop */
#define	ICMP_UNREACH_SRCFAIL		5	/* src route failed */
#define	ICMP_UNREACH_NET_UNKNOWN	6	/* unknown net */
#define	ICMP_UNREACH_HOST_UNKNOWN	7	/* unknown host */
#define	ICMP_UNREACH_ISOLATED		8	/* src host isolated */
#define	ICMP_UNREACH_NET_PROHIB		9	/* prohibited access */
#define	ICMP_UNREACH_HOST_PROHIB	10	/* ditto */
#define	ICMP_UNREACH_TOSNET		11	/* bad tos for net */
#define	ICMP_UNREACH_TOSHOST		12	/* bad tos for host */
#define	ICMP_UNREACH_FILTER_PROHIB	13	/* admin prohib */
#define	ICMP_UNREACH_HOST_PRECEDENCE	14	/* host prec vio. */
#define	ICMP_UNREACH_PRECEDENCE_CUTOFF	15	/* prec cutoff */


struct ether_addr;
struct ether_addr * ether_aton(const char *a);

/*********************************
* missing declarations in ipv6.c *
**********************************/

struct hostent* gethostbyname2(const char *name, int af);

#define strcasecmp strcmp // windows XXX ip_dummynet.c

/********************
* windows wrappings *
*********************/

int my_socket(int domain, int ty, int proto);
#define socket(_a, _b, _c)	my_socket(_a, _b, _c)

#endif /* _WIN32 */
/*******************
* SYSCTL emulation *
********************/
#if defined (_WIN32) || defined (EMULATE_SYSCTL)
#define STRINGIFY(x) #x

/*
 * Type codes of an emulated sysctl variable.
 * The 'flags' word of an entry keeps the access mode in its lowest
 * 2 bits (as defined in glue.h) and one of these type codes in the
 * remaining bits, i.e. flags = access | (type << 2).
 */
enum {
	SYSCTLTYPE_INT = 0,
	SYSCTLTYPE_UINT,
	SYSCTLTYPE_SHORT,
	SYSCTLTYPE_USHORT,
	SYSCTLTYPE_LONG,
	SYSCTLTYPE_ULONG,
	SYSCTLTYPE_STRING,
};

/*
 * Fixed-size header describing one emulated sysctl entry.
 * All members are 32-bit so the layout does not depend on pointer size.
 */
struct sysctlhead {
	uint32_t blocklen; /* total size of the entry */
	uint32_t namelen; /* strlen(name) + 1, i.e. including the '\0' */
	uint32_t flags; /* type and access, see the SYSCTLTYPE_* codes */
	uint32_t datalen; /* NOTE(review): presumably the size of the value in bytes */
};

#ifdef _KERNEL

#ifdef SYSCTL_NODE
#undef SYSCTL_NODE
#endif
#define SYSCTL_NODE(a,b,c,d,e,f)
#define SYSCTL_DECL(a)
#define SYSCTL_VNET_PROC(a,b,c,d,e,f,g,h,i)

#define GST_HARD_LIMIT 100

/*
 * In the module, GST is implemented as an array of
 * sysctlentry. Pointers are useless when passing data to
 * userland, so the buffer handed out is actually made of:
 * - sysctlhead (fixed size, containing lengths)
 * - data (typically 32 bit)
 * - name (zero-terminated and padded to mod4)
 */

struct sysctlentry {
	struct sysctlhead head;	/* lengths and flags of this entry */
	char* name;		/* name of the variable */
	void* data;		/* location of the variable's value */
};

/*
 * In-module table of emulated sysctl variables.
 * Holds at most GST_HARD_LIMIT entries.
 */
struct sysctltable {
	int count; /* number of valid tables (NOTE(review): likely the number of used entries -- confirm) */
	int totalsize; /* total size of valid entries of all the valid tables */
	void* namebuffer; /* a buffer for all chained names */
	struct sysctlentry entry[GST_HARD_LIMIT];
};

#ifdef SYSBEGIN
#undef SYSBEGIN
#endif
#define SYSBEGIN(x) void sysctl_addgroup_##x() {
#ifdef SYSEND
#undef SYSEND
#endif
#define SYSEND }

/*
 * XXX remove duplication
 *
 * Emulated SYSCTL_* declarations: each one expands to a call to
 * sysctl_pushback() registering variable 'e' under the name built by
 * stringifying 'a' and 'c' and joining them with a dot. The "+ 1" is
 * pointer arithmetic on the resulting string literal and skips its
 * first character (presumably the leading '_' of the parent name --
 * confirm against the callers).
 * 'd' supplies the access bits, the type code is stored shifted left
 * by 2, and sizeof(*e) is passed as the data length.
 * Arguments b, f and g are accepted for source compatibility and unused.
 * The _VNET_ variants expand to exactly the same text as the plain ones.
 */
#define SYSCTL_INT(a,b,c,d,e,f,g) 				\
	sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1,	\
		(d) | (SYSCTLTYPE_INT << 2), sizeof(*e), e)

#define SYSCTL_VNET_INT(a,b,c,d,e,f,g)				\
	sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1,	\
		(d) | (SYSCTLTYPE_INT << 2), sizeof(*e), e)

#define SYSCTL_UINT(a,b,c,d,e,f,g)				\
	sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1,	\
		(d) | (SYSCTLTYPE_UINT << 2), sizeof(*e), e)

#define SYSCTL_VNET_UINT(a,b,c,d,e,f,g)				\
	sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1,	\
		(d) | (SYSCTLTYPE_UINT << 2), sizeof(*e), e)

#define SYSCTL_LONG(a,b,c,d,e,f,g)				\
	sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1,	\
		(d) | (SYSCTLTYPE_LONG << 2), sizeof(*e), e)

#define SYSCTL_ULONG(a,b,c,d,e,f,g)				\
	sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1,	\
		(d) | (SYSCTLTYPE_ULONG << 2), sizeof(*e), e)
/* tunables are not supported by the emulation: expands to nothing */
#define TUNABLE_INT(a,b)

void keinit_GST(void);
void keexit_GST(void);
int kesysctl_emu_set(void* p, int l);
int kesysctl_emu_get(struct sockopt* sopt);
void sysctl_pushback(char* name, int flags, int datalen, void* data);

#endif /* _KERNEL */

int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp,
         size_t newlen);
#endif /* _WIN32" || EMULATE_SYSCTL */
#ifdef _WIN32
int do_cmd(int optname, void *optval, uintptr_t optlen);

#endif /* _WIN32 */

#define __PAST_END(v, idx)      v[idx]
#endif /* !_GLUE_H */


================================================
FILE: ipfw/Makefile
================================================
#
# $Id: Makefile 11688 2012-08-12 20:58:26Z luigi $
#
# GNUMakefile to build the userland part of ipfw on Linux and Windows
#
# Do not set with = or := so we can inherit from the caller

include ../Makefile.inc
TARGET := ipfw

# 'all' is a goal name, not a file: declare it phony so that a stray
# file called "all" in this directory cannot make the build a no-op.
.PHONY: all
all: $(TARGET)

#TCC=c:/path/to/tcc

# common flags, appended with += so values inherited from the caller
# are preserved.
EXTRA_CFLAGS += -O1
EXTRA_CFLAGS += -Wall
# glue.h is force-included in every compilation unit
EXTRA_CFLAGS += -include ../glue.h
# include_e (generated by the include_e rule) is searched before include
EXTRA_CFLAGS += -I ./include_e -I ./include

# Per-platform settings, selected by OSARCH. The whole section is
# skipped when VER is "openwrt".
ifneq ($(VER),openwrt)
ifeq ($(OSARCH),Linux)
    EXTRA_CFLAGS += -D__BSD_VISIBLE
    EXTRA_CFLAGS += -Werror
    # Required by GCC 4.6: with -Werror this warning would stop the build
    EXTRA_CFLAGS += -Wno-unused-but-set-variable
endif
ifeq ($(OSARCH),FreeBSD)
    EXTRA_CFLAGS += -D__BSD_VISIBLE
    EXTRA_CFLAGS += -Werror
endif
# Darwin uses the same flags as FreeBSD
ifeq ($(OSARCH),Darwin)
    EXTRA_CFLAGS += -D__BSD_VISIBLE
    EXTRA_CFLAGS += -Werror
endif

ifeq ($(OSARCH),Windows)
# we only support Cygwin and tcc as compilers.
# WIN64=1 enables the _X64EMU definitions in glue.h
ifeq ($(WIN64),1)
    EXTRA_CFLAGS += -D_X64EMU
endif

# An empty TCC selects the Cygwin toolchain
ifeq ($(TCC),)	# cygwin
    EXTRA_CFLAGS += -I/cygdrive/c/$(DDKDIR)/inc/ddk
    EXTRA_CFLAGS += -I .
    EXTRA_CFLAGS += -pipe -Wall
else		#-- build with tcc
    # TCC points to the root of tcc tree
    CC=$(TCC)/tcc.exe
    EXTRA_CFLAGS += -DTCC -I..
    EXTRA_CFLAGS += -I$(TCC)/include/winapi -I$(TCC)/include
    EXTRA_CFLAGS += -nostdinc

    # headers created as empty files under include_e/ (see the
    # include_e rule), one EFILES_<dir> variable per directory
    EFILES_. += err.h grp.h netdb.h pwd.h sysexits.h
    EFILES_arpa += inet.h
    EFILES_net += if.h
    EFILES_netinet += in.h in_systm.h ip.h ip_icmp.h
    EFILES_sys += cdefs.h wait.h ioctl.h socket.h

endif
    # EXTRA_CFLAGS += -D_WIN32 # see who defines it
    # socket option calls are redirected to wnd_* replacements
    EXTRA_CFLAGS += -Dsetsockopt=wnd_setsockopt
    EXTRA_CFLAGS += -Dgetsockopt=wnd_getsockopt
    EXTRA_CFLAGS += -DEMULATE_SYSCTL
    EFILES_net += ethernet.h route.h
    EFILES_netinet += ether.h icmp6.h
    EFILES_sys += sysctl.h
    # the binary is ipfw.exe on Windows; the rule below keeps "ipfw"
    # (which the earlier "all:" line already expanded to) usable as a goal
    TARGET = ipfw.exe
ipfw: $(TARGET)
endif # windows
endif # !openwrt

# Our own flags come first, so our include directories are searched
# before the system ones added below.
CFLAGS += $(EXTRA_CFLAGS)
# Location of OS headers and libraries. After our stuff.
# USRDIR can be overridden by the caller (e.g. for cross builds).
USRDIR?= /usr
ifeq ($(TCC),)
    CFLAGS += -I$(USRDIR)/include
    LDFLAGS += -L$(USRDIR)/lib
else
    # tcc build: use tcc's own library directory and link ws2_32
    LDFLAGS += -L. -L$(TCC)/lib -lws2_32
endif

# object files that make up the ipfw binary
OBJS = ipfw2.o dummynet.o main.o ipv6.o qsort_r.o
OBJS += expand_number.o humanize_number.o glue.o

# we don't use ALTQ
CFLAGS += -DNO_ALTQ
#OBJS += altq.o


# Link the final binary from all the object files.
$(TARGET): $(OBJS)
	$(MSG) "   LD  $@"
	$(HIDE)$(CC) $(LDFLAGS) -o $@ $^

# Every object is rebuilt when the shared headers change.
# include_e is a directory, so it is an order-only prerequisite (after
# the '|'): it must exist before compiling, but its timestamp, which
# changes whenever an entry is added or removed inside it, must not
# trigger a rebuild of all the objects.
$(OBJS) : ipfw2.h ../glue.h | include_e

# support to create empty dirs and files in include_e/
# EDIRS is the list of directories, EFILES is the list of files.
# Each EFILES_<dir> variable lists the (empty) headers wanted in <dir>;
# "EFILES_." is the top level of include_e.
EFILES_sys  += sockio.h
EFILES_.    += libutil.h
# NOTE(review): "__emtpy.h" looks like a misspelling of "__empty.h"; the
# entry at least guarantees that a netinet/ directory is listed in EDIRS.
EFILES_netinet += __emtpy.h

# M is the directory where include_e is built; defaults to the current
# directory unless set by the caller.
M ?= $(shell pwd)

# first make a list of directories from variable names
# (EDIRS is recursively expanded, so it sees every EFILES_<dir>
# variable defined anywhere in the makefile by the time it is used)
EDIRS= $(subst EFILES_,,$(filter EFILES_%,$(.VARIABLES)))
# then prepend the directory name to individual files.
#       $(empty) serves to interpret the following space literally,
#       and the ":  = " substitution packs spaces into one.
#       'empty' is never assigned here, so $(empty) expands to nothing.
EFILES = $(foreach i,$(EDIRS),$(subst $(empty) , $(i)/, $(EFILES_$(i):  = )))

# Build the include_e/ tree of replacement headers under $(M):
#  1. remove any previous include_e tree (and opt_* in the current dir);
#  2. recreate the directory, its $(EDIRS) subdirectories and the empty
#     $(EFILES) headers;
#  3. copy ip_fw.h, ip_dummynet.h and tcp.h from sys/netinet (three
#     levels up from include_e/netinet) into include_e/netinet.
# Every command carries the "-@" prefix: it is not echoed and a failure
# does not stop the build.
include_e:
	$(MSG) "building include_e in $M"
	-@rm -rf $(M)/include_e opt_*
	-@mkdir -p $(M)/include_e
	-@(cd $(M)/include_e; mkdir -p $(EDIRS); touch $(EFILES) )
	-@(cd $(M)/include_e/netinet; \
		for i in ip_fw.h ip_dummynet.h tcp.h; do \
		cp ../../../sys/netinet/$$i .; done; )

# Remove everything this makefile produces: objects, the binary and the
# generated include_e tree. Declared phony so that a file named "clean"
# or "distclean" cannot prevent the recipe from running.
.PHONY: clean distclean
clean distclean:
	-@rm -rf $(OBJS) $(TARGET) include_e

# Compare this directory with sbin/ipfw in the FreeBSD tree at
# $(BSD_HEAD). "diff" is a command, not a file, hence phony; the leading
# '-' is needed because diff exits non-zero whenever the trees differ.
.PHONY: diff
diff:
	-@(diff -upr $(BSD_HEAD)/sbin/ipfw .)


================================================
FILE: ipfw/add_rules
================================================
#!/bin/bash
#
# A test script to add rules

PRG=./ipfw

# One test cycle: install seven "count icmp" rules, wait a second, then
# delete them by number. Rule number 20 is used twice, both when adding
# and when deleting, exactly as listed below.
myfun() {
	local spec nr

	# "<rule number> <match clause>", in installation order
	for spec in \
		"10 from any to 131.114.9.128" \
		"20 from 131.114.9.128 to any" \
		"20 from any to 131.114.9.130" \
		"30 from 131.114.9.130 to any" \
		"40 from any to 131.114.9.129" \
		"50 from 131.114.9.129 to any" \
		"60 from 131.114.9.236 to any"
	do
		# first word is the rule number, the rest is the match clause;
		# both are left unquoted on purpose so they split into words
		$PRG add ${spec%% *} count icmp ${spec#* }
	done
	sleep 1
	for nr in 10 20 20 30 40 50 60; do
		$PRG del $nr
	done
}

# Run the add/delete cycle 100 times in a row.
i=0
while [ "$i" -lt 100 ]; do
	myfun
	i=$((i + 1))
done


================================================
FILE: ipfw/dummynet.c
================================================
/*
 * Copyright (c) 2002-2003,2010 Luigi Rizzo
 *
 * Redistribution and use in source forms, with and without modification,
 * are permitted provided that this entire comment appears intact.
 *
 * Redistribution in binary form may occur without any restrictions.
 * Obviously, it would be nice if you gave credit where credit is due
 * but requiring it would be too onerous.
 *
 * This software is provided ``AS IS'' without any warranties of any kind.
 *
 * $FreeBSD: head/sbin/ipfw/dummynet.c 206843 2010-04-19 15:11:45Z luigi $
 *
 * dummynet support
 */

#include <sys/types.h>
#include <sys/socket.h>
/* XXX there are several sysctl leftover here */
#include <sys/sysctl.h>

#include "ipfw2.h"

#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <libutil.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>

#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>
#include <arpa/inet.h>	/* inet_ntoa */


/*
 * Keywords accepted when configuring dummynet objects, each paired
 * with the TOK_* value it maps to. Several keywords are synonyms and
 * share the same token (e.g. "bw"/"bandwidth", "pri"/"priority",
 * "lmax"/"maxlen", "dst-ip6"/"dst-ipv6").
 * The table ends with a { NULL, 0 } entry.
 */
static struct _s_x dummynet_params[] = {
	{ "plr",		TOK_PLR },
	{ "noerror",		TOK_NOERROR },
	{ "buckets",		TOK_BUCKETS },
	{ "dst-ip",		TOK_DSTIP },
	{ "src-ip",		TOK_SRCIP },
	{ "dst-port",		TOK_DSTPORT },
	{ "src-port",		TOK_SRCPORT },
	{ "proto",		TOK_PROTO },
	{ "weight",		TOK_WEIGHT },
	{ "lmax",		TOK_LMAX },
	{ "maxlen",		TOK_LMAX },	/* synonym of "lmax" */
	{ "all",		TOK_ALL },
	{ "mask",		TOK_MASK }, /* alias for both */
	{ "sched_mask",		TOK_SCHED_MASK },
	{ "flow_mask",		TOK_FLOW_MASK },
	{ "droptail",		TOK_DROPTAIL },
	{ "red",		TOK_RED },
	{ "gred",		TOK_GRED },
	{ "bw",			TOK_BW },
	{ "bandwidth",		TOK_BW },	/* synonym of "bw" */
	{ "delay",		TOK_DELAY },
	{ "link",		TOK_LINK },
	{ "pipe",		TOK_PIPE },
	{ "queue",		TOK_QUEUE },
	{ "flowset",		TOK_FLOWSET },
	{ "sched",		TOK_SCHED },
	{ "pri",		TOK_PRI },
	{ "priority",		TOK_PRI },	/* synonym of "pri" */
	{ "type",		TOK_TYPE },
	{ "flow-id",		TOK_FLOWID},
	{ "dst-ipv6",		TOK_DSTIP6},
	{ "dst-ip6",		TOK_DSTIP6},
	{ "src-ipv6",		TOK_SRCIP6},
	{ "src-ip6",		TOK_SRCIP6},
	{ "profile",		TOK_PROFILE},
	{ "burst",		TOK_BURST},
	{ "dummynet-params",	TOK_NULL },
	{ NULL, 0 }	/* terminator */
};

/*
 * Address 'len' bytes past 'p' (before it when 'len' is negative), as
 * a void pointer. Both arguments are parenthesized so that expressions
 * such as O_NEXT(base + 1, n) are converted and offset as a whole.
 */
#define O_NEXT(p, len) ((void *)((char *)(p) + (len)))

/*
 * Initialize an object header: record its total length, its type and
 * its id, and reset the subtype to 0.
 */
static void
oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
{
	oid->id = id;
	oid->subtype = 0;
	oid->type = type;
	oid->len = len;
}

/*
 * Reserve 'len' bytes at the current position *o for an object of the
 * given type: its header is initialized (id 0), *o is advanced past it
 * and the start of the reserved area is returned.
 */
static void *
o_next(struct dn_id **o, int len, int type)
{
	struct dn_id *cur = *o;

	*o = O_NEXT(cur, len);
	oid_fill(cur, len, type, 0);
	return cur;
}

/*
 * Shrink the last reserved object for variable length structures:
 * step back 'len' bytes from *o to the start of that object, rewrite
 * its header with 'real_length' and 'type' (id 0), and leave *o right
 * after the real_length bytes. Returns the start of the object.
 */
static void *
o_compact(struct dn_id **o, int len, int real_length, int type)
{
	struct dn_id *start = O_NEXT(*o, -len);

	oid_fill(start, real_length, type, 0);
	*o = O_NEXT(start, real_length);
	return start;
}

#if 0
/*
 * Disabled code: comparison callback (in the three-argument style that
 * qsort_r() takes) ordering two dn_flow_queue entries.
 * The absolute value of co.do_sort selects the field (1 = len,
 * 2 = len_bytes, 3 = tot_pkts, 4 = tot_bytes); any other value leaves
 * the entries equal. The difference is reduced to -1/0/1 and negated
 * unless co.do_sort is negative. 'arg' is not used.
 */
static int
sort_q(void *arg, const void *pa, const void *pb)
{
	int rev = (co.do_sort < 0);
	int field = rev ? -co.do_sort : co.do_sort;
	long long res = 0;
	const struct dn_flow_queue *a = pa;
	const struct dn_flow_queue *b = pb;

	switch (field) {
	case 1: /* pkts */
		res = a->len - b->len;
		break;
	case 2: /* bytes */
		res = a->len_bytes - b->len_bytes;
		break;

	case 3: /* tot pkts */
		res = a->tot_pkts - b->tot_pkts;
		break;

	case 4: /* tot bytes */
		res = a->tot_bytes - b->tot_bytes;
		break;
	}
	/* clamp to -1 / 0 / 1 so the value fits an int */
	if (res < 0)
		res = -1;
	if (res > 0)
		res = 1;
	return (int)(rev ? res : -res);
}
#endif

/*
 * Print the "mask:" line describing a flow mask. IPv4 masks are shown
 * as hexadecimal proto, addresses and ports; IPv6 masks show the
 * addresses in textual form. "queue," is added when id->extra is set.
 */
static void
print_mask(struct ipfw_flow_id *id)
{
	const char *qtag = id->extra ? "queue," : "";
	char addr[255];

	if (IS_IP6_FLOW_ID(id)) {
		printf("\n        mask: %sproto: 0x%02x, flow_id: 0x%08x,  ",
		    qtag, id->proto, id->flow_id6);
		inet_ntop(AF_INET6, &(id->src_ip6), addr, sizeof(addr));
		printf("%s/0x%04x -> ", addr, id->src_port);
		inet_ntop(AF_INET6, &(id->dst_ip6), addr, sizeof(addr));
		printf("%s/0x%04x\n", addr, id->dst_port);
		return;
	}

	printf("    "
	    "mask: %s 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n",
	    qtag, id->proto,
	    id->src_ip, id->src_port,
	    id->dst_ip, id->dst_port);
}

/*
 * Print the column titles for a list of flows; the layout depends on
 * whether 'id' is an IPv6 flow id.
 */
static void
print_header(struct ipfw_flow_id *id)
{
	if (IS_IP6_FLOW_ID(id)) {
		printf("BKT ___Prot___ _flow-id_ "
		    "______________Source IPv6/port_______________ "
		    "_______________Dest. IPv6/port_______________ "
		    "Tot_pkt/bytes Pkt/Byte Drp\n");
		return;
	}
	printf("BKT Prot ___Source IP/port____ "
	    "____Dest. IP/port____ "
	    "Tot_pkt/bytes Pkt/Byte Drp\n");
}

/*
 * Print one line describing flow 'ni': bucket number, protocol,
 * source and destination address/port, then the counters.
 * When *print is non-zero the column header is printed first and
 * *print is cleared, so the header appears only once per listing.
 */
static void
list_flow(struct dn_flow *ni, int *print)
{
	struct ipfw_flow_id *fid = &ni->fid;
	struct protoent *proto;
	struct in_addr v4;
	char txt[255];
	const char *name;

	if (*print) {
		print_header(fid);
		*print = 0;
	}
	proto = getprotobynumber(fid->proto);
		/* XXX: Should check for IPv4 flows */
	printf("%3u%c", (ni->oid.id) & 0xff,
		fid->extra ? '*' : ' ');
	if (IS_IP6_FLOW_ID(fid)) {
		/* Print IPv6 flows */
		if (proto == NULL)
			printf("%9u ", fid->proto);
		else
			printf("%9s ", proto->p_name);
		name = inet_ntop(AF_INET6, &(fid->src_ip6), txt, sizeof(txt));
		printf("%7d  %39s/%-5d ", fid->flow_id6, name, fid->src_port);
		name = inet_ntop(AF_INET6, &(fid->dst_ip6), txt, sizeof(txt));
		printf(" %39s/%-5d ", name, fid->dst_port);
	} else {
		/* IPv4: protocol by name when known, else by number */
		if (proto == NULL)
			printf("%4u ", fid->proto);
		else
			printf("%-4s ", proto->p_name);
		v4.s_addr = htonl(fid->src_ip);
		printf("%15s/%-5d ", inet_ntoa(v4), fid->src_port);
		v4.s_addr = htonl(fid->dst_ip);
		printf("%15s/%-5d ", inet_ntoa(v4), fid->dst_port);
	}
	pr_u64(&ni->tot_pkts, 4);
	pr_u64(&ni->tot_bytes, 8);
	printf("%2u %4u %3u\n",
	    ni->length, ni->len_bytes, ni->drops);
}

/*
 * Print the parameters of flowset 'fs'.
 * With a non-empty 'prefix' a short line starting with that text is
 * printed and the prefix is then cleared (prefix[0] = '\0'); otherwise
 * the full "qNNNNN ..." line is printed, followed by the flow mask
 * when the flowset has one.
 */
static void
print_flowset_parms(struct dn_fs *fs, char *prefix)
{
	char qs[30];	/* queue size */
	char plr[30];	/* packet loss rate, empty when 0 */
	char red[90];	/* Display RED parameters */
	int qsize = fs->qsize;

	/* queue size, in slots or in bytes (KB from 8192 up) */
	if (!(fs->flags & DN_QSIZE_BYTES))
		sprintf(qs, "%3d sl.", qsize);
	else if (qsize >= 8192)
		sprintf(qs, "%d KB", qsize / 1024);
	else
		sprintf(qs, "%d B", qsize);

	plr[0] = '\0';
	if (fs->plr)
		sprintf(plr, "plr %f", 1.0 * fs->plr / (double)(0x7fffffff));

	if (!(fs->flags & DN_IS_RED)) {
		sprintf(red, "droptail");
	} else {
		/* RED parameters */
		char gentle = (fs->flags & DN_IS_GENTLE_RED) ? 'G' : ' ';

		sprintf(red,
		    "\n\t %cRED w_q %f min_th %d max_th %d max_p %f",
		    gentle,
		    1.0 * fs->w_q / (double)(1 << SCALE_RED),
		    fs->min_th,
		    fs->max_th,
		    1.0 * fs->max_p / (double)(1 << SCALE_RED));
	}

	if (!prefix[0]) {
		printf("q%05d %s%s %d flows (%d buckets) sched %d "
			"weight %d lmax %d pri %d %s\n",
		    fs->fs_nr, qs, plr, fs->oid.id, fs->buckets,
		    fs->sched_nr, fs->par[0], fs->par[1], fs->par[2], red);
		if (fs->flags & DN_HAVE_MASK)
			print_mask(&fs->flow_mask);
		return;
	}

	printf("%s %s%s %d queues (%d buckets) %s\n",
	    prefix, qs, plr, fs->oid.id, fs->buckets, red);
	prefix[0] = '\0';
}

/*
 * Print name, loss and number of samples of delay profile 'p'.
 * The loss is shown as loss_level divided by samples_no; nothing is
 * printed when the profile has no samples.
 */
static void
print_extra_delay_parms(struct dn_profile *p)
{
	if (p->samples_no > 0) {
		double loss = p->loss_level;

		printf("\t profile: name \"%s\" loss %f samples %d\n",
			p->name, loss / p->samples_no, p->samples_no);
	}
}

/*
 * Print the pending text in 'buf', if any, followed by a newline,
 * and leave the buffer empty.
 */
static void
flush_buf(char *buf)
{
	if (*buf != '\0')
		printf("%s\n", buf);
	*buf = '\0';
}

/*
 * generic list routine. We expect objects in a specific order, i.e.
 * PIPES AND SCHEDULERS:
 *	link; scheduler; internal flowset if any; instances
 * we can tell a pipe from the number.
 *
 * FLOWSETS:
 *	flowset; queues;
 * link i (int queue); scheduler i; si(i) { flowsets() : queues }
 *
 * The objects are laid out back to back between 'oid' and 'end'; each
 * one starts with a struct dn_id whose 'len' gives the distance to the
 * next object and whose 'type' selects how it is printed.
 * Exits through errx() if an object claims to be shorter than its own
 * header.
 */
static void
list_pipes(struct dn_id *oid, struct dn_id *end)
{
    char buf[160];	/* pending buffer */
    int toPrint = 1;	/* print header */

    buf[0] = '\0';
    for (; oid != end; oid = O_NEXT(oid, oid->len)) {
	/* a length below the header size would also stall the walk */
	if (oid->len < sizeof(*oid))
		errx(1, "invalid oid len %d\n", oid->len);

	switch (oid->type) {
	default:
	    flush_buf(buf);
	    printf("unrecognized object %d size %d\n", oid->type, oid->len);
	    break;
	case DN_TEXT: /* list of attached flowsets */
	    {
		int i, l;
		/* the payload after the header is read as 32-bit numbers */
		struct {
			struct dn_id id;
			uint32_t p[0];
		} *d = (void *)oid;
		l = (oid->len - sizeof(*oid))/sizeof(d->p[0]);
		if (l == 0)
		    break;
		printf("   Children flowsets: ");
		for (i = 0; i < l; i++)
			printf("%u ", d->p[i]);
		printf("\n");
		break;
	    }
	case DN_CMD_GET:
	    /* shown only in verbose mode; prints the type and id fields */
	    if (co.verbose)
		printf("answer for cmd %d, len %d\n", oid->type, oid->id);
	    break;
	case DN_SCH: {
	    struct dn_sch *s = (struct dn_sch *)oid;
	    flush_buf(buf);
	    printf(" sched %d type %s flags 0x%x %d buckets %d active\n",
			s->sched_nr,
			s->name, s->flags, s->buckets, s->oid.id);
	    if (s->flags & DN_HAVE_MASK)
		print_mask(&s->sched_mask);
	    }
	    break;

	case DN_FLOW:
	    /* toPrint makes list_flow emit the column header only once */
	    list_flow((struct dn_flow *)oid, &toPrint);
	    break;

	case DN_LINK: {
	    struct dn_link *p = (struct dn_link *)oid;
	    double b = p->bandwidth;
	    char bwbuf[30];
	    char burst[5 + 7];

	    /* This starts a new object so flush buffer */
	    flush_buf(buf);
	    /* data rate */
	    if (b == 0)
		sprintf(bwbuf, "unlimited     ");
	    else if (b >= 1000000)
		sprintf(bwbuf, "%7.3f Mbit/s", b/1000000);
	    else if (b >= 1000)
		sprintf(bwbuf, "%7.3f Kbit/s", b/1000);
	    else
		sprintf(bwbuf, "%7.3f bit/s ", b);

	    /* plain number when humanize_number fails or in verbose mode */
	    if (humanize_number(burst, sizeof(burst), p->burst,
		    "", HN_AUTOSCALE, 0) < 0 || co.verbose)
		sprintf(burst, "%d", (int)p->burst);
	    /* the link description is left pending in buf, not printed here */
	    sprintf(buf, "%05d: %s %4d ms burst %s",
		p->link_nr % DN_MAX_ID, bwbuf, p->delay, burst);
	    }
	    break;

	case DN_FS:
	    /* buf is passed as the prefix of the flowset line */
	    print_flowset_parms((struct dn_fs *)oid, buf);
	    break;
	case DN_PROFILE:
	    flush_buf(buf);
	    print_extra_delay_parms((struct dn_profile *)oid);
	}
	/*
	 * Unconditional flush at the end of every iteration: text left
	 * pending by the DN_LINK case is therefore printed before the
	 * next object is examined.
	 */
	flush_buf(buf); // XXX does it really go here ?
    }
}

/*
 * Delete pipe, queue or scheduler i
 *
 * do_pipe selects the kind of object: 1 = link (pipe), 2 = flowset
 * (queue), anything else = scheduler. A DN_CMD_DELETE request is sent
 * with the IP_DUMMYNET3 command.
 * Returns 0 on success; on failure a warning naming object 'i' is
 * printed and 1 is returned.
 */
int
ipfw_delete_pipe(int do_pipe, int i)
{
	struct {
		struct dn_id oid;
		uintptr_t a[1];	/* add more if we want a list */
	} cmd;
	/*
	 * 'i' is reused below for the result, so keep the object number
	 * for the warning (the message used to always report "rule 1").
	 */
	int nr = i;

	oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION);
	cmd.oid.subtype = (do_pipe == 1) ? DN_LINK :
		( (do_pipe == 2) ? DN_FS : DN_SCH);
	cmd.a[0] = i;
	i = do_cmd(IP_DUMMYNET3, &cmd, cmd.oid.len);
	if (i) {
		i = 1;
		warn("rule %u: setsockopt(IP_DUMMYNET_DEL)", (unsigned int)nr);
	}
	return i;
}

/*
 * Code to parse delay profiles.
 *
 * Some link types introduce extra delays in the transmission
 * of a packet, e.g. because of MAC level framing, contention on
 * the use of the channel, MAC level retransmissions and so on.
 * From our point of view, the channel is effectively unavailable
 * for this extra time, which is constant or variable depending
 * on the link type. Additionally, packets may be dropped after this
 * time (e.g. on a wireless link after too many retransmissions).
 * We can model the additional delay with an empirical curve
 * that represents its distribution.
 *
 *      cumulative probability
 *      1.0 ^
 *          |
 *      L   +-- loss-level          x
 *          |                 ******
 *          |                *
 *          |           *****
 *          |          *
 *          |        **
 *          |       *
 *          +-------*------------------->
 *                      delay
 *
 * The empirical curve may have both vertical and horizontal lines.
 * Vertical lines represent constant delay for a range of
 * probabilities; horizontal lines correspond to a discontinuity
 * in the delay distribution: the link will use the largest delay
 * for a given probability.
 *
 * To pass the curve to dummynet, we must store the parameters
 * in a file as described below, and issue the command
 *
 *      ipfw pipe <n> config ... bw XXX profile <filename> ...
 *
 * The file format is the following, with whitespace acting as
 * a separator and '#' indicating the beginning of a comment:
 *
 *	samples N
 *		the number of samples used in the internal
 *		representation (2..1024; default 100);
 *
 *	loss-level L
 *		The probability above which packets are lost.
 *	       (0.0 <= L <= 1.0, default 1.0 i.e. no loss);
 *
 *	name identifier
 *		An optional name (listed by "ipfw pipe show")
 *		to identify the distribution;
 *
 *	"delay prob" | "prob delay"
 *		One of these two lines is mandatory and defines
 *		the format of the following lines with data points.
 *
 *	XXX YYY
 *		2 or more lines representing points in the curve,
 *		with either delay or probability first, according
 *		to the chosen format.
 *		The unit for delay is milliseconds.
 *
 * Data points do not need to be ordered, and their number does not
 * need to match the one given in the "samples" line. ipfw will sort
 * and interpolate the curve as needed.
 *
 * Example of a profile file:

	name    bla_bla_bla
	samples 100
	loss-level    0.86
	prob    delay
	0       200	# minimum overhead is 200ms
	0.5     200
	0.5     300
	0.8     1000
	0.9     1300
	1       1300

 * Internally, we will convert the curve to a fixed number of
 * samples, and when it is time to transmit a packet we will
 * model the extra delay as extra bits in the packet.
 *
 */

/*
 * Constants for the delay profile parser.
 * ED_MAX_LINE_LEN is parenthesized so that it can be used safely
 * inside larger expressions (e.g. 2 * ED_MAX_LINE_LEN).
 */
#define ED_MAX_LINE_LEN	(256+ED_MAX_NAME_LEN)
/* keywords recognized at the beginning of a profile line */
#define ED_TOK_SAMPLES	"samples"
#define ED_TOK_LOSS	"loss-level"
#define ED_TOK_NAME	"name"
#define ED_TOK_DELAY	"delay"
#define ED_TOK_PROB	"prob"
#define ED_TOK_BW	"bw"
/* characters separating the tokens of a line */
#define ED_SEPARATORS	" \t\n"
#define ED_MIN_SAMPLES_NO	2

/*
 * returns 1 if s is a non-negative decimal number: one or more digits
 * with at most one '.', and nothing else (no sign, no exponent, no
 * blanks). Returns 0 otherwise, including for "" and for a lone ".".
 */
static int
is_valid_number(const char *s)
{
	int dots_found = 0;
	int digits_found = 0;

	for (; *s != '\0'; s++) {
		/*
		 * isdigit() is only defined for unsigned char values (and
		 * EOF), so convert: plain char may be negative.
		 */
		if (isdigit((unsigned char)*s))
			digits_found = 1;
		else if (*s != '.' || ++dots_found > 1)
			return 0;
	}
	return digits_found;
}

/*
 * Take as input a string describing a bandwidth value
 * and return the numeric bandwidth value.
 * set clocking interface or bandwidth value
 *
 * arg        text to parse;
 * bandwidth  in: -1 if not set yet (otherwise a "duplicate" warning is
 *            printed); out: the value in bit/s, or 0 when an interface
 *            name was given;
 * if_name    buffer receiving the interface name, or NULL when
 *            interface names are not supported by the caller;
 * namelen    size of the if_name buffer.
 *
 * An argument starting with a lowercase letter is taken as an
 * interface name. Anything else is a number, optionally followed by
 * K/k (x1000) or M/m (x1000000) and by a unit; byte units multiply the
 * value by 8. Exits through errx() when the result does not fit in a
 * non-negative int.
 */
static void
read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen)
{
	if (*bandwidth != -1)
		warnx("duplicate token, override bandwidth value!");

	if (arg[0] >= 'a' && arg[0] <= 'z') {
		if (!if_name) {
			errx(1, "no if support");
		}
		/*
		 * Warn only when the name really does not fit in the
		 * buffer. warnx() is used because no system call failed,
		 * so there is no meaningful errno text to append.
		 */
		if (strlen(arg) >= (size_t)namelen)
			warnx("interface name truncated");
		namelen--;
		/* interface name */
		strncpy(if_name, arg, namelen);
		if_name[namelen] = '\0';
		*bandwidth = 0;
	} else {	/* read bandwidth value */
		/* largest value an int can hold (UINT_MAX / 2) */
		const unsigned long long bw_max = ~0U >> 1;
		unsigned long long bw;
		char *end = NULL;

		bw = strtoul(arg, &end, 0);
		/*
		 * Check before scaling: this bounds the products below
		 * (at most bw_max * 8000000), which therefore cannot
		 * overflow the 64-bit accumulator.
		 */
		if (bw > bw_max)
			errx(EX_DATAERR, "bandwidth too large");
		if (*end == 'K' || *end == 'k') {
			end++;
			bw *= 1000;
		} else if (*end == 'M' || *end == 'm') {
			end++;
			bw *= 1000000;
		}
		if ((*end == 'B' &&
			_substrcmp2(end, "Bi", "Bit/s") != 0) ||
		    _substrcmp2(end, "by", "bytes") == 0)
			bw *= 8;

		if (bw > bw_max)
			errx(EX_DATAERR, "bandwidth too large");

		*bandwidth = (int)bw;
		if (if_name)
			if_name[0] = '\0';
	}
}

/* One point of the empirical delay curve. */
struct point {
	double prob;	/* cumulative probability */
	double delay;	/* delay at that probability */
};

/*
 * qsort() callback: order points by increasing probability, and by
 * increasing delay when the probabilities are equal.
 * Returns -1, 0 or 1.
 */
static int
compare_points(const void *vp1, const void *vp2)
{
	const struct point *a = vp1;
	const struct point *b = vp2;
	double diff = a->prob - b->prob;

	/* same probability: fall back to the delay */
	if (diff == 0)
		diff = a->delay - b->delay;

	return (diff > 0) - (diff < 0);
}

/*
 * Expands to the leading arguments of an errx() call reporting a
 * profile syntax error: the EX_DATAERR exit code, a format string
 * made of "error in <file> at line <n>: " followed by the text of 's',
 * and the 'filename' and 'lineno' variables, which must be in scope
 * where the macro is used. Arguments for conversions inside 's' are
 * appended by the caller after the macro.
 */
#define ED_EFMT(s) EX_DATAERR,"error in %s at line %d: "#s,filename,lineno

/*
 * Interpolate a set of probability-value tuples.
 *
 * This function takes as input an array of <prob, delay> points
 * and samples the piecewise-linear curve described by them.
 *
 * p           the user defined points; sorted in place by this function
 * points_no   number of entries in p (at least 3, else errx())
 * samples     output array; it must have room for samples_no + 1
 *             entries, because one extra sample is stored at the end
 * samples_no  number of equally spaced probability steps
 * filename    used in the error message only
 *
 * The curve is expected to end with a point whose probability is 1.
 * If it does not, the last segment is extended instead of reading
 * past the end of the array. When two neighbouring points have the
 * same probability the larger delay is used instead of dividing by
 * zero.
 */
static void
interpolate_samples(struct point *p, int points_no,
		int *samples, int samples_no, const char *filename)
{
	double dy;		/* delta on the y axis */
	double y;		/* current value of y */
	double x;		/* current value of x */
	double m;		/* the y slope */
	double dp;		/* probability span of the current segment */
	int i;			/* samples index */
	int curr;		/* points current index */

	/* make sure that there are enough points. */
	/* XXX Duplicated should be removed */
	if (points_no < 3)
		errx(EX_DATAERR, "%s too few samples, need at least %d",
		    filename, 3);

	qsort(p, points_no, sizeof(struct point), compare_points);

	dy = 1.0/samples_no;
	y = 0;

	for (i=0, curr = 0; i < samples_no; i++, y+=dy) {
		/*
		 * Move curr to the segment containing y, skipping the
		 * points with the same probability. The bound keeps
		 * p[curr+1] inside the array even when no point
		 * reaches the current y.
		 */
		while (curr + 2 < points_no && y >= p[curr+1].prob)
			curr++;

		dp = p[curr+1].prob - p[curr].prob;
		if (dp > 0) {
			/* compute the slope of the curve */
			m = (p[curr+1].delay - p[curr].delay) / dp;
			/* compute the x value starting from the current point */
			x = p[curr].delay + (y - p[curr].prob) * m;
		} else {
			/* vertical step: take the larger delay */
			x = p[curr+1].delay;
		}
		samples[i] = x;
	}

	/* add the last sample */
	samples[i] = p[curr+1].delay;
}

/*
 * Parse the delay profile stored in 'filename' and fill the profile 'p'.
 *
 * filename	text file holding one "name arg" pair per line; a word
 *		starting with '#' ends the line and empty lines are
 *		skipped.
 * p		the profile to fill: link_nr, bandwidth (when the file
 *		has an ED_TOK_BW line), samples[], samples_no, loss_level
 *		and name.
 * link		the link (old pipe) the profile belongs to; its bandwidth
 *		is overwritten by an ED_TOK_BW line.
 *
 * A file that cannot be opened terminates the program through err(),
 * any syntax error through errx().
 */
static void
load_extra_delays(const char *filename, struct dn_profile *p,
	struct dn_link *link)
{
	char    line[ED_MAX_LINE_LEN];
	FILE    *f;
	int     lineno = 0;

	int     samples = -1;
	double  loss = -1.0;
	char    profile_name[ED_MAX_NAME_LEN];
	int     delay_first = -1;
	int     do_points = 0;	/* set while reading the list of points */
	struct point    points[ED_MAX_SAMPLES_NO];
	int     points_no = 0;

	/* XXX link never NULL? */
	p->link_nr = link->link_nr;

	profile_name[0] = '\0';
	f = fopen(filename, "r");
	if (f == NULL)
		err(EX_UNAVAILABLE, "fopen: %s", filename);

	while (fgets(line, ED_MAX_LINE_LEN, f)) {	 /* read commands */
		char *s, *cur = line, *name = NULL, *arg = NULL;

		++lineno;

		/* parse the line: at most two words, name and arg */
		while (cur) {
			s = strsep(&cur, ED_SEPARATORS);
			if (s == NULL || *s == '#')
				break;
			if (*s == '\0')
				continue;
			if (arg)
				errx(ED_EFMT("too many arguments"));
			if (name == NULL)
				name = s;
			else
				arg = s;
		}

		if ((name == NULL) || (*name == '#'))   /* empty line */
			continue;
		if (arg == NULL)
			errx(ED_EFMT("missing arg for %s"), name);

		if (!strcasecmp(name, ED_TOK_SAMPLES)) {
		    if (samples > 0)
			errx(ED_EFMT("duplicate ``samples'' line"));
		    if (atoi(arg) <=0)
			errx(ED_EFMT("invalid number of samples"));
		    samples = atoi(arg);
		    if (samples>=ED_MAX_SAMPLES_NO-1)
			    errx(ED_EFMT("too many samples, maximum is %d"),
				ED_MAX_SAMPLES_NO-1);
		    do_points = 0;
		} else if (!strcasecmp(name, ED_TOK_BW)) {
		    char buf[IFNAMSIZ];
		    read_bandwidth(arg, &link->bandwidth, buf, sizeof(buf));
		    p->bandwidth = link->bandwidth;
		} else if (!strcasecmp(name, ED_TOK_LOSS)) {
		    if (loss != -1.0)
			errx(ED_EFMT("duplicated token: %s"), name);
		    if (!is_valid_number(arg))
			errx(ED_EFMT("invalid %s"), arg);
		    loss = atof(arg);
		    if (loss > 1)
			errx(ED_EFMT("%s greater than 1.0"), name);
		    do_points = 0;
		} else if (!strcasecmp(name, ED_TOK_NAME)) {
		    if (profile_name[0] != '\0')
			errx(ED_EFMT("duplicated token: %s"), name);
		    strncpy(profile_name, arg, sizeof(profile_name) - 1);
		    profile_name[sizeof(profile_name)-1] = '\0';
		    do_points = 0;
		} else if (!strcasecmp(name, ED_TOK_DELAY)) {
		    /* header of the point list, columns are "delay prob" */
		    if (do_points)
			errx(ED_EFMT("duplicated token: %s"), name);
		    delay_first = 1;
		    do_points = 1;
		} else if (!strcasecmp(name, ED_TOK_PROB)) {
		    /* header of the point list, columns are "prob delay" */
		    if (do_points)
			errx(ED_EFMT("duplicated token: %s"), name);
		    delay_first = 0;
		    do_points = 1;
		} else if (do_points) {
		    if (!is_valid_number(name) || !is_valid_number(arg))
			errx(ED_EFMT("invalid point found"));
		    /* points[] lives on the stack: never write past it */
		    if (points_no >= ED_MAX_SAMPLES_NO)
			errx(ED_EFMT("too many points, maximum is %d"),
			    ED_MAX_SAMPLES_NO);
		    if (delay_first) {
			points[points_no].delay = atof(name);
			points[points_no].prob = atof(arg);
		    } else {
			points[points_no].delay = atof(arg);
			points[points_no].prob = atof(name);
		    }
		    if (points[points_no].prob > 1.0)
			errx(ED_EFMT("probability greater than 1.0"));
		    ++points_no;
		} else {
		    errx(ED_EFMT("unrecognised command '%s'"), name);
		}
	}

	fclose (f);

	if (samples == -1) {
	    warnx("'%s' not found, assuming 100", ED_TOK_SAMPLES);
	    samples = 100;
	}

	if (loss == -1.0) {
	    warnx("'%s' not found, assuming no loss", ED_TOK_LOSS);
	    loss = 1;
	}

	interpolate_samples(points, points_no, p->samples, samples, filename);

	/*
	 * NOTE(review): samples_no records the requested number of
	 * samples while loss_level is scaled by that number plus one
	 * (the post-increment); kept as found -- confirm it is intended.
	 */
	p->samples_no = samples++;
	p->loss_level = loss * samples;
	strncpy(p->name, profile_name, sizeof(p->name));
	/* strncpy() does not terminate the destination when it is full */
	p->name[sizeof(p->name) - 1] = '\0';
}

/*
 * configuration of pipes, schedulers, flowsets.
 * When we configure a new scheduler, an empty pipe is created, so:
 *
 * do_pipe = 1 -> "pipe N config ..." only for backward compatibility
 *	sched N+Delta type fifo sched_mask ...
 *	pipe N+Delta <parameters>
 *	flowset N+Delta pipe N+Delta (no parameters)
 *	sched N type wf2q+ sched_mask ...
 *	pipe N <parameters>
 *
 * do_pipe = 2 -> flowset N config
 *	flowset N parameters
 *
 * do_pipe = 3 -> sched N config
 *	sched N parameters (default no pipe)
 *	optional Pipe N config ...
 * pipe ==>
 */
void
ipfw_config_pipe(int ac, char **av)
{
	/*
	 * av[0] is skipped, av[1] must be the (positive) number of the
	 * pipe/flowset/scheduler; the remaining words are options matched
	 * against dummynet_params. The request is assembled in a single
	 * buffer (command header followed by the objects selected by
	 * co.do_pipe) and handed to do_cmd(IP_DUMMYNET3, ...).
	 * Every error terminates the program through errx() or err().
	 */
	int i;
	u_int j;
	char *end;
	void *par = NULL;
	struct dn_id *buf, *base;
	struct dn_sch *sch = NULL;
	struct dn_link *p = NULL;
	struct dn_fs *fs = NULL;
	struct dn_profile *pf = NULL;
	struct ipfw_flow_id *mask = NULL;
	int lmax;
	uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo;
	size_t max_pf_size = sizeof(struct dn_profile) + ED_MAX_SAMPLES_NO * sizeof(int);

	/*
	 * allocate space for 1 header,
	 * 1 scheduler, 1 link, 1 flowset, 1 profile
	 */
	lmax = sizeof(struct dn_id);	/* command header */
	lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) +
		sizeof(struct dn_fs);
	lmax += max_pf_size;

	av++; ac--;
	/* Pipe number */
	if (ac && isdigit(**av)) {
		i = atoi(*av); av++; ac--;
	} else
		i = -1;
	if (i <= 0)
		errx(EX_USAGE, "need a pipe/flowset/sched number");
	base = buf = safe_calloc(1, lmax);
	/* all commands start with a 'CONFIGURE' and a version */
	o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG);
	base->id = DN_API_VERSION;

	switch (co.do_pipe) {
	case 1: /* "pipe N config ..." */
		/* Allocate space for the WF2Q+ scheduler, its link
		 * and the FIFO flowset. Set the number, but leave
		 * the scheduler subtype and other parameters to 0
		 * so the kernel will use appropriate defaults.
		 * XXX todo: add a flag to record if a parameter
		 * is actually configured.
		 * If we do a 'pipe config' mask -> sched_mask.
		 * The FIFO scheduler and link are derived from the
		 * WF2Q+ one in the kernel.
		 */
		sch = o_next(&buf, sizeof(*sch), DN_SCH);
		p = o_next(&buf, sizeof(*p), DN_LINK);
		fs = o_next(&buf, sizeof(*fs), DN_FS);

		sch->sched_nr = i;
		sch->oid.subtype = 0;	/* defaults to WF2Q+ */
		mask = &sch->sched_mask;
		flags = &sch->flags;
		buckets = &sch->buckets;
		*flags |= DN_PIPE_CMD;

		p->link_nr = i;

		/* This flowset is only for the FIFO scheduler */
		fs->fs_nr = i + 2*DN_MAX_ID;
		fs->sched_nr = i + DN_MAX_ID;
		break;

	case 2: /* "queue N config ... " */
		fs = o_next(&buf, sizeof(*fs), DN_FS);
		fs->fs_nr = i;
		mask = &fs->flow_mask;
		flags = &fs->flags;
		buckets = &fs->buckets;
		break;

	case 3: /* "sched N config ..." */
		sch = o_next(&buf, sizeof(*sch), DN_SCH);
		fs = o_next(&buf, sizeof(*fs), DN_FS);
		sch->sched_nr = i;
		mask = &sch->sched_mask;
		flags = &sch->flags;
		buckets = &sch->buckets;
		/* fs is used only with !MULTIQUEUE schedulers */
		fs->fs_nr = i + DN_MAX_ID;
		fs->sched_nr = i;
		break;
	}
	/* set to -1 those fields for which we want to reuse existing
	 * values from the kernel.
	 * Also, *_nr and subtype = 0 mean reuse the value from the kernel.
	 * XXX todo: support reuse of the mask.
	 */
	if (p)
		p->bandwidth = -1;
	for (j = 0; j < sizeof(fs->par)/sizeof(fs->par[0]); j++)
		fs->par[j] = -1;
	while (ac > 0) {
		double d;
		int tok = match_token(dummynet_params, *av);
		ac--; av++;

		switch(tok) {
		case TOK_NOERROR:
			NEED(fs, "noerror is only for pipes");
			fs->flags |= DN_NOERROR;
			break;

		case TOK_PLR:
			NEED(fs, "plr is only for pipes");
			NEED1("plr needs argument 0..1\n");
			d = strtod(av[0], NULL);
			/* clamp the loss rate into [0, 1] */
			if (d > 1)
				d = 1;
			else if (d < 0)
				d = 0;
			fs->plr = (int)(d*0x7fffffff);
			ac--; av++;
			break;

		case TOK_QUEUE:
			NEED(fs, "queue is only for pipes or flowsets");
			NEED1("queue needs queue size\n");
			end = NULL;
			fs->qsize = strtoul(av[0], &end, 0);
			/* a K/B/bytes suffix switches the size to bytes */
			if (*end == 'K' || *end == 'k') {
				fs->flags |= DN_QSIZE_BYTES;
				fs->qsize *= 1024;
			} else if (*end == 'B' ||
			    _substrcmp2(end, "by", "bytes") == 0) {
				fs->flags |= DN_QSIZE_BYTES;
			}
			ac--; av++;
			break;

		case TOK_BUCKETS:
			NEED(fs, "buckets is only for pipes or flowsets");
			NEED1("buckets needs argument\n");
			*buckets = strtoul(av[0], NULL, 0);
			ac--; av++;
			break;

		case TOK_FLOW_MASK:
		case TOK_SCHED_MASK:
		case TOK_MASK:
			NEED(mask, "tok_mask");
			NEED1("mask needs mask specifier\n");
			/*
			 * per-flow queue, mask is dst_ip, dst_port,
			 * src_ip, src_port, proto measured in bits
			 */
			par = NULL;

			bzero(mask, sizeof(*mask));
			end = NULL;

			while (ac >= 1) {
			    uint32_t *p32 = NULL;
			    uint16_t *p16 = NULL;
			    uint32_t *p20 = NULL;
			    struct in6_addr *pa6 = NULL;
			    uint32_t a;

			    tok = match_token(dummynet_params, *av);
			    ac--; av++;
			    switch(tok) {
			    case TOK_ALL:
				    /*
				     * special case, all bits significant
				     * except 'extra' (the queue number)
				     */
				    mask->dst_ip = ~0;
				    mask->src_ip = ~0;
				    mask->dst_port = ~0;
				    mask->src_port = ~0;
				    mask->proto = ~0;
				    n2mask(&mask->dst_ip6, 128);
				    n2mask(&mask->src_ip6, 128);
				    mask->flow_id6 = ~0;
				    *flags |= DN_HAVE_MASK;
				    goto end_mask;

			    case TOK_QUEUE:
				    mask->extra = ~0;
				    *flags |= DN_HAVE_MASK;
				    goto end_mask;

			    case TOK_DSTIP:
				    mask->addr_type = 4;
				    p32 = &mask->dst_ip;
				    break;

			    case TOK_SRCIP:
				    mask->addr_type = 4;
				    p32 = &mask->src_ip;
				    break;

			    case TOK_DSTIP6:
				    mask->addr_type = 6;
				    pa6 = &mask->dst_ip6;
				    break;

			    case TOK_SRCIP6:
				    mask->addr_type = 6;
				    pa6 = &mask->src_ip6;
				    break;

			    case TOK_FLOWID:
				    mask->addr_type = 6;
				    p20 = &mask->flow_id6;
				    break;

			    case TOK_DSTPORT:
				    p16 = &mask->dst_port;
				    break;

			    case TOK_SRCPORT:
				    p16 = &mask->src_port;
				    break;

			    case TOK_PROTO:
				    /* no pointer set: handled by the final else */
				    break;

			    default:
				    ac++; av--; /* backtrack */
				    goto end_mask;
			    }
			    if (ac < 1)
				    errx(EX_USAGE, "mask: value missing");
			    if (*av[0] == '/') {
				    /* "/N" is a prefix length */
				    a = strtoul(av[0]+1, &end, 0);
				    /*
				     * For non-IPv6 fields turn the length
				     * into a bit mask. Shifting a 32 bit
				     * value by 32 or more is undefined, so
				     * every length >= 32 selects all bits
				     * (the range checks below still reject
				     * it for the narrower fields).
				     */
				    if (pa6 == NULL)
					    a = (a >= 32) ? ~0U : (1U << a) - 1;
			    } else
				    a = strtoul(av[0], &end, 0);
			    if (p32 != NULL)
				    *p32 = a;
			    else if (p16 != NULL) {
				    if (a > 0xFFFF)
					    errx(EX_DATAERR,
						"port mask must be 16 bit");
				    *p16 = (uint16_t)a;
			    } else if (p20 != NULL) {
				    if (a > 0xfffff)
					errx(EX_DATAERR,
					    "flow_id mask must be 20 bit");
				    *p20 = (uint32_t)a;
			    } else if (pa6 != NULL) {
				    if (a > 128)
					errx(EX_DATAERR,
					    "in6addr invalid mask len");
				    else
					n2mask(pa6, a);
			    } else {
				    if (a > 0xFF)
					    errx(EX_DATAERR,
						"proto mask must be 8 bit");
				    mask->proto = (uint8_t)a;
			    }
			    if (a != 0)
				    *flags |= DN_HAVE_MASK;
			    ac--; av++;
			} /* end while, config masks */
end_mask:
			break;

		case TOK_RED:
		case TOK_GRED:
			NEED(fs, "red/gred is only for pipes or flowsets");
			NEED1("red/gred needs w_q/min_th/max_th/max_p\n");
			fs->flags |= DN_IS_RED;
			if (tok == TOK_GRED)
				fs->flags |= DN_IS_GENTLE_RED;
			/*
			 * the format for parameters is w_q/min_th/max_th/max_p
			 */
			if ((end = strsep(&av[0], "/"))) {
			    double w_q = strtod(end, NULL);
			    if (w_q > 1 || w_q <= 0)
				errx(EX_DATAERR, "0 < w_q <= 1");
			    fs->w_q = (int) (w_q * (1 << SCALE_RED));
			}
			if ((end = strsep(&av[0], "/"))) {
			    fs->min_th = strtoul(end, &end, 0);
			    if (*end == 'K' || *end == 'k')
				fs->min_th *= 1024;
			}
			if ((end = strsep(&av[0], "/"))) {
			    fs->max_th = strtoul(end, &end, 0);
			    if (*end == 'K' || *end == 'k')
				fs->max_th *= 1024;
			}
			if ((end = strsep(&av[0], "/"))) {
			    double max_p = strtod(end, NULL);
			    if (max_p > 1 || max_p <= 0)
				errx(EX_DATAERR, "0 < max_p <= 1");
			    fs->max_p = (int)(max_p * (1 << SCALE_RED));
			}
			ac--; av++;
			break;

		case TOK_DROPTAIL:
			NEED(fs, "droptail is only for flowsets");
			fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED);
			break;

		case TOK_BW:
			NEED(p, "bw is only for links");
			NEED1("bw needs bandwidth or interface\n");
			read_bandwidth(av[0], &p->bandwidth, NULL, 0);
			ac--; av++;
			break;

		case TOK_DELAY:
			NEED(p, "delay is only for links");
			NEED1("delay needs argument 0..10000ms\n");
			p->delay = strtoul(av[0], NULL, 0);
			ac--; av++;
			break;

		case TOK_TYPE: {
			int l;
			NEED(sch, "type is only for schedulers");
			NEED1("type needs a string");
			l = strlen(av[0]);
			/* the length check makes the strcpy() below safe */
			if (l == 0 || l > 15)
				errx(1, "type %s too long\n", av[0]);
			strcpy(sch->name, av[0]);
			sch->oid.subtype = 0; /* use string */
			ac--; av++;
			break;
		    }

		case TOK_WEIGHT:
			NEED(fs, "weight is only for flowsets");
			NEED1("weight needs argument\n");
			fs->par[0] = strtol(av[0], &end, 0);
			ac--; av++;
			break;

		case TOK_LMAX:
			NEED(fs, "lmax is only for flowsets");
			NEED1("lmax needs argument\n");
			fs->par[1] = strtol(av[0], &end, 0);
			ac--; av++;
			break;

		case TOK_PRI:
			NEED(fs, "priority is only for flowsets");
			NEED1("priority needs argument\n");
			fs->par[2] = strtol(av[0], &end, 0);
			ac--; av++;
			break;

		case TOK_SCHED:
		case TOK_PIPE:
			NEED(fs, "pipe/sched");
			NEED1("pipe/link/sched needs number\n");
			fs->sched_nr = strtoul(av[0], &end, 0);
			ac--; av++;
			break;

		case TOK_PROFILE:
		    {
			size_t real_length;

			NEED((!pf), "profile already set");
			NEED(p, "profile");
			NEED1("extra delay needs the file name\n");

			/* load the profile structure using the DN_API */
			pf = o_next(&buf, max_pf_size, DN_PROFILE);
			load_extra_delays(av[0], pf, p); //XXX can't fail?

			/* compact the dn_id structure */
			real_length = sizeof(struct dn_profile) +
				pf->samples_no * sizeof(int);
			o_compact(&buf, max_pf_size, real_length, DN_PROFILE);
			--ac; ++av;
		    }
			break;

		case TOK_BURST:
			NEED(p, "burst");
			NEED1("burst needs argument\n");
			/*
			 * errno tells a malformed number (anything but
			 * ERANGE) from an out of range one.
			 */
			errno = 0;
			if (expand_number(av[0], (int64_t *)&p->burst) < 0)
				if (errno != ERANGE)
					errx(EX_DATAERR,
					    "burst: invalid argument");
			if (errno || p->burst > (1ULL << 48) - 1)
				errx(EX_DATAERR,
				    "burst: out of range (0..2^48-1)");
			ac--; av++;
			break;

		default:
			errx(EX_DATAERR, "unrecognised option ``%s''", av[-1]);
		}
	}

	/* check validity of parameters */
	if (p) {
		if (p->delay > 10000)
			errx(EX_DATAERR, "delay must be < 10000");
		if (p->bandwidth == -1)
			p->bandwidth = 0;
	}
	if (fs) {
		/* XXX accept a 0 scheduler to keep the default */
	    if (fs->flags & DN_QSIZE_BYTES) {
		size_t len;
		long limit;

		len = sizeof(limit);
		/* fall back to a built-in limit if the sysctl is missing */
		if (sysctlbyname("net.inet.ip.dummynet.pipe_byte_limit",
			&limit, &len, NULL, 0) == -1)
			limit = 1024*1024;
		if (fs->qsize > limit)
			errx(EX_DATAERR, "queue size must be < %ldB", limit);
	    } else {
		size_t len;
		long limit;

		len = sizeof(limit);
		if (sysctlbyname("net.inet.ip.dummynet.pipe_slot_limit",
			&limit, &len, NULL, 0) == -1)
			limit = 100;
		if (fs->qsize > limit)
			errx(EX_DATAERR, "2 <= queue size <= %ld", limit);
	    }

	    if (fs->flags & DN_IS_RED) {
		size_t len;
		int lookup_depth, avg_pkt_size;
		double w_q;

		if (fs->min_th >= fs->max_th)
		    errx(EX_DATAERR, "min_th %d must be < than max_th %d",
			fs->min_th, fs->max_th);
		if (fs->max_th == 0)
		    errx(EX_DATAERR, "max_th must be > 0");

		len = sizeof(int);
		if (sysctlbyname("net.inet.ip.dummynet.red_lookup_depth",
			&lookup_depth, &len, NULL, 0) == -1)
			lookup_depth = 256;
		if (lookup_depth == 0)
		    errx(EX_DATAERR, "net.inet.ip.dummynet.red_lookup_depth"
			" must be greater than zero");

		len = sizeof(int);
		if (sysctlbyname("net.inet.ip.dummynet.red_avg_pkt_size",
			&avg_pkt_size, &len, NULL, 0) == -1)
			avg_pkt_size = 512;

		if (avg_pkt_size == 0)
			errx(EX_DATAERR,
			    "net.inet.ip.dummynet.red_avg_pkt_size must"
			    " be greater than zero");

		/*
		 * Ticks needed for sending a medium-sized packet.
		 * Unfortunately, when we are configuring a WF2Q+ queue, we
		 * do not have bandwidth information, because that is stored
		 * in the parent pipe, and also we have multiple queues
		 * competing for it. So we set s=0, which is not very
		 * correct. But on the other hand, why do we want RED with
		 * WF2Q+ ?
		 */
#if 0
		if (p.bandwidth==0) /* this is a WF2Q+ queue */
			s = 0;
		else
			s = (double)ck.hz * avg_pkt_size * 8 / p.bandwidth;
#endif
		/*
		 * max idle time (in ticks) before avg queue size becomes 0.
		 * NOTA:  (3/w_q) is approx the value x so that
		 * (1-w_q)^x < 10^-3.
		 */
		w_q = ((double)fs->w_q) / (1 << SCALE_RED);
#if 0 // go in kernel
		idle = s * 3. / w_q;
		fs->lookup_step = (int)idle / lookup_depth;
		if (!fs->lookup_step)
			fs->lookup_step = 1;
		weight = 1 - w_q;
		for (t = fs->lookup_step; t > 1; --t)
			weight *= 1 - w_q;
		fs->lookup_weight = (int)(weight * (1 << SCALE_RED));
#endif /* code moved in the kernel */
	    }
	}

	/* buf now points past the last object: send only the used part */
	i = do_cmd(IP_DUMMYNET3, base, (char *)buf - (char *)base);

	if (i)
		err(1, "setsockopt(%s)", "IP_DUMMYNET_CONFIGURE");

	/* the request has been delivered, release the buffer */
	free(base);
}

void
dummynet_flush(void)
{
	struct dn_id oid;
	oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION);
	do_cmd(IP_DUMMYNET3, &oid, oid.len);
}

/* Parse input for 'ipfw [pipe|sched|queue] show [range list]'
 * Returns the number of ranges, and possibly stores them
 * in the array v of size len.
 *
 * ac/av are the remaining command line words; every word may hold
 * several comma separated items, each one either 'n' or 'n1-n2'.
 * When v is NULL or len < 2 the ranges are only counted (a local
 * scratch pair is used); otherwise each accepted range is stored as
 * two consecutive entries of v, and ranges that do not fit in len
 * entries are parsed into the scratch pair and dropped.
 * Ranges with n2 < n1 or n2 >= DN_MAX_ID-1 are silently ignored.
 * For 'pipe' requests (co.do_pipe == 1) DN_MAX_ID is added to both
 * ends of each stored range.
 */
static int
parse_range(int ac, char *av[], uint32_t *v, int len)
{
	int n = 0;
	char *endptr, *s;
	uint32_t base[2];

	if (v == NULL || len < 2) {
		v = base;
		len = 2;
	}

	for (s = *av; s != NULL; av++, ac--) {
		v[0] = strtoul(s, &endptr, 10);
		v[1] = (*endptr != '-') ? v[0] :
			 strtoul(endptr+1, &endptr, 10);
		if (*endptr == '\0') { /* prepare for next round */
			s = (ac > 0) ? *(av+1) : NULL;
		} else {
			/*
			 * There is more text in the current word, so undo
			 * in advance the av++/ac-- done by the loop step:
			 * otherwise the word that follows would be skipped.
			 */
			ac++;
			av--;
			if (*endptr != ',') {
				/* warnx: errno has no meaning here */
				warnx("invalid number: %s", s);
				s = ++endptr;
				continue;
			}
			/* continue processing from here */
			s = ++endptr;
		}
		/* v[0] <= v[1] here, so one upper bound check is enough */
		if (v[1] < v[0] ||
			v[1] >= DN_MAX_ID-1) {
			continue; /* invalid entry */
		}
		n++;
		/* translate if 'pipe list' */
		if (co.do_pipe == 1) {
			v[0] += DN_MAX_ID;
			v[1] += DN_MAX_ID;
		}
		/* move to the next slot, or to the scratch pair when full */
		v = (n*2 < len) ? v + 2 : base;
	}
	return n;
}

/* main entry point for dummynet list functions. co.do_pipe indicates
 * which function we want to support.
 * av may contain filtering arguments, either individual entries
 * or ranges, or lists (space or commas are valid separators).
 * Format for a range can be n1-n2 or n3 n4 n5 ...
 * In a range n1 must be <= n2, otherwise the range is ignored.
 * A number 'n4' is translated into a range 'n4-n4'
 * All numbers must be > 0 and < DN_MAX_ID-1
 *
 * The request (a dn_id header followed by the ranges) is sent with
 * do_cmd(-IP_DUMMYNET3, ...) and the reply is printed by list_pipes().
 * Nothing is printed when the kernel reports an error or an empty
 * answer.
 */
void
dummynet_list(int ac, char *av[], int show_counters)
{
	struct dn_id *oid, *x = NULL;
	int ret, i;
	int n; 		/* # of ranges */
	u_int buflen, l;
	u_int max_size;	/* largest obj passed up */

	(void)show_counters;	// XXX unused, but we should use it.
	ac--;
	av++; 		/* skip 'list' | 'show' word */

	n = parse_range(ac, av, NULL, 0);	/* Count # of ranges. */

	/* Allocate space to store ranges */
	l = sizeof(*oid) + sizeof(uint32_t) * n * 2;
	oid = safe_calloc(1, l);
	oid_fill(oid, l, DN_CMD_GET, DN_API_VERSION);

	if (n > 0)	/* store ranges in idx */
		parse_range(ac, av, (uint32_t *)(oid + 1), n*2);
	/*
	 * Compute the size of the largest object returned. If the
	 * response leaves at least this much spare space in the
	 * buffer, then surely the response is complete; otherwise
	 * there might be a risk of truncation and we will need to
	 * retry with a larger buffer.
	 * XXX don't bother with smaller structs.
	 */
	max_size = sizeof(struct dn_fs);
	if (max_size < sizeof(struct dn_sch))
		max_size = sizeof(struct dn_sch);
	if (max_size < sizeof(struct dn_flow))
		max_size = sizeof(struct dn_flow);

	switch (co.do_pipe) {
	case 1:
		oid->subtype = DN_LINK;	/* list pipe */
		break;
	case 2:
		oid->subtype = DN_FS;	/* list queue */
		break;
	case 3:
		oid->subtype = DN_SCH;	/* list sched */
		break;
	}

	/*
	 * Ask the kernel an estimate of the required space (result
	 * in oid.id), unless we are requesting a subset of objects,
	 * in which case the kernel does not give an exact answer.
	 * In any case, space might grow in the meantime due to the
	 * creation of new queues, so we must be prepared to retry.
	 */
	if (n > 0) {
		buflen = 4*1024;
		/*
		 * The request itself (header + ranges) is copied into
		 * the reply buffer below, so the buffer can never be
		 * smaller than the request.
		 */
		if (buflen < l)
			buflen = 2 * l;
	} else {
		ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l);
		if (ret != 0 || oid->id <= sizeof(*oid))
			goto done;
		buflen = oid->id + max_size;
		oid->len = sizeof(*oid); /* restore */
	}
	/* Try a few times, until the buffer fits */
	for (i = 0; i < 20; i++) {
		l = buflen;
		x = safe_realloc(x, l);
		bcopy(oid, x, oid->len);
		ret = do_cmd(-IP_DUMMYNET3, x, (uintptr_t)&l);
		if (ret != 0 || x->id <= sizeof(*oid))
			goto done; /* no response */
		if (l + max_size <= buflen)
			break; /* ok */
		buflen *= 2;	 /* double for next attempt */
	}
	/*
	 * NOTE(review): if all the attempts above are used up the last
	 * (possibly truncated) reply is printed anyway.
	 */
	list_pipes(x, O_NEXT(x, l));
done:
	free(x);	/* free(NULL) is a no-op */
	free(oid);
}


================================================
FILE: ipfw/expand_number.c
================================================
/*-
 * Copyright (c) 2007 Eric Anderson <anderson@FreeBSD.org>
 * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

// #include <sys/cdefs.h>
__FBSDID("$FreeBSD: src/lib/libutil/expand_number.c,v 1.2.4.2 2009/06/10 14:52:34 des Exp $");

#include <sys/types.h>
#include <ctype.h>
#include <errno.h>
#include <inttypes.h>
//#include <libutil.h>
#include <stdint.h>

/*
 * Convert an expression of the following forms to a int64_t.
 * 	1) A positive decimal number.
 *	2) A positive decimal number followed by a 'b' or 'B' (mult by 1).
 *	3) A positive decimal number followed by a 'k' or 'K' (mult by 1 << 10).
 *	4) A positive decimal number followed by a 'm' or 'M' (mult by 1 << 20).
 *	5) A positive decimal number followed by a 'g' or 'G' (mult by 1 << 30).
 *	6) A positive decimal number followed by a 't' or 'T' (mult by 1 << 40).
 *	7) A positive decimal number followed by a 'p' or 'P' (mult by 1 << 50).
 *	8) A positive decimal number followed by a 'e' or 'E' (mult by 1 << 60).
 *
 * The number is parsed by strtoimax() with base 0, so a sign and the
 * 0x / 0 prefixes are accepted too. Anything that follows the unit
 * letter is ignored.
 *
 * Returns 0 and stores the result in *num on success. On failure
 * returns -1, leaves *num untouched and sets errno to EINVAL (no
 * digits or unknown unit) or to ERANGE (the value does not fit in an
 * int64_t). errno is preserved on success.
 */
int
expand_number(const char *buf, int64_t *num)
{
	static const char unit[] = "bkmgtpe";
	char *endptr, s;
	intmax_t parsed;
	int64_t number;
	int i, saved_errno;

	/* errno must be clear to detect an overflow inside strtoimax() */
	saved_errno = errno;
	errno = 0;
	parsed = strtoimax(buf, &endptr, 0);

	if (endptr == buf) {
		/* No valid digits. */
		errno = EINVAL;
		return (-1);
	}

	if (errno == ERANGE || parsed > INT64_MAX || parsed < INT64_MIN) {
		/* The plain number is already too large. */
		errno = ERANGE;
		return (-1);
	}
	errno = saved_errno;
	number = (int64_t)parsed;

	if (*endptr == '\0') {
		/* No unit. */
		*num = number;
		return (0);
	}

	/* the cast keeps tolower() defined for chars with the high bit set */
	s = tolower((unsigned char)*endptr);
	switch (s) {
	case 'b':
	case 'k':
	case 'm':
	case 'g':
	case 't':
	case 'p':
	case 'e':
		break;
	default:
		/* Unrecognized unit. */
		errno = EINVAL;
		return (-1);
	}

	/* multiply by 1024 once for every unit that precedes the given one */
	for (i = 0; unit[i] != '\0'; i++) {
		if (s == unit[i])
			break;
		/*
		 * Check the range before multiplying: comparing the
		 * shifted value with the original one misses the
		 * overflows that wrap to a larger value, and overflowing
		 * a signed integer is undefined in the first place.
		 */
		if (number > INT64_MAX / 1024 || number < INT64_MIN / 1024) {
			errno = ERANGE;
			return (-1);
		}
		number *= 1024;
	}

	*num = number;
	return (0);
}


================================================
FILE: ipfw/glue.c
================================================
/*
 * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id: glue.c 12264 2013-04-27 20:21:06Z luigi $
 *
 * Userland functions missing in linux/Windows
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#ifdef _WIN32
#include <netdb.h>
#include <windows.h>
#endif /* _WIN32 */

#ifndef HAVE_NAT
/* dummy nat functions */
void
ipfw_show_nat(int ac, char **av)
{
	/* stub used when HAVE_NAT is not defined: just report it */
	(void)ac;
	(void)av;

	fprintf(stderr, "%s unsupported\n", __FUNCTION__);
}

void
ipfw_config_nat(int ac, char **av)
{
	/* stub used when HAVE_NAT is not defined: just report it */
	(void)ac;
	(void)av;

	fprintf(stderr, "%s unsupported\n", __FUNCTION__);
}
#endif

#ifdef __linux__
int optreset;	/* missing in linux */
#endif

/*
 * Size-bounded string copy, not available on linux.
 *
 * Copies at most siz - 1 bytes of src into dst and, unless siz is 0,
 * always NUL-terminates dst. Returns the length of src, so the copy
 * was truncated when the returned value is >= siz.
 */
size_t
strlcpy(char *dst, const char *src, size_t siz)
{
        size_t srclen = strlen(src);

        if (siz != 0) {
                /* room for siz - 1 bytes plus the terminator */
                size_t ncopy = (srclen < siz - 1) ? srclen : siz - 1;

                memcpy(dst, src, ncopy);
                dst[ncopy] = '\0';
        }

        return(srclen);         /* count does not include NUL */
}


/*
 * missing in linux and windows
 *
 * Reduced version of strtonum(): converts nptr with strtoll() (strtol()
 * under TCC) using base 0 and returns the converted value.
 * minval and maxval are accepted for compatibility but NOT checked.
 * When errstr is not NULL, *errstr is set to NULL if nptr is exactly
 * one number, or to "invalid" if the conversion fails, finds no digits
 * or leaves trailing characters. errno is restored to its previous
 * value on success.
 */
long long int
strtonum(const char *nptr, long long minval, long long maxval,
         const char **errstr)
{
	long long ret;
	char *endp = NULL;	/* first character not converted */
	const char *status;
	int errno_c = errno;	/* save actual errno */

	(void)minval;		/* range errors aren't checked */
	(void)maxval;

	errno = 0;
#ifdef TCC
	ret = strtol(nptr, &endp, 0);
#else
	ret = strtoll(nptr, &endp, 0);
#endif
	/* We accept only a string that represent exactly a number (ie. start
	 * and end with a digit).
	 * FreeBSD version wants errstr==NULL if no error occurs, otherwise
	 * errstr should point to an error string.
	 * For our purpose, we implement only the invalid error, range
	 * errors aren't checked
	 */
	if (errno != 0 || endp == nptr || *endp != '\0')
		status = "invalid";
	else  {
		status = NULL;
		errno = errno_c;
	}
	/* callers that do not care about the reason may pass NULL */
	if (errstr != NULL)
		*errstr = status;
	return ret;
}

#if defined (_WIN32) || defined (EMULATE_SYSCTL)
//XXX missing prerequisites
#include <net/if.h> 		//openwrt
#include <netinet/ip.h> 	//openwrt
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>
#endif

/*
 * set or get system information
 * XXX lock acquisition/serialize calls
 *
 * we export this as sys/module/ipfw_mod/parameters/___
 * This function get or/and set the value of the sysctl passed by
 * the name parameter. If the old value is not desired,
 * oldp and oldlenp should be set to NULL.
 *
 * XXX
 * I do not know how this works in FreeBSD in the case
 * where there are no write permission on the sysctl var.
 * We read the value and set return variables in any way
 * but returns -1 on write failures, regardless the
 * read success.
 *
 * Since there is no information on types, in the following
 * code we assume a length of 4 is a int.
 *
 * Returns 0 on success, -1 on errors.
 */
int
sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp,
         size_t newlen)
{
#if defined (_WIN32) || defined (EMULATE_SYSCTL)
	/*
	 * we embed the sysctl request in the usual sockopt mechanics.
	 * the sockopt buffer is filled with a dn_id with IP_DUMMYNET3
	 * command, and the special DN_SYSCTL_GET and DN_SYSCTL_SET
	 * subcommands.
	 * the syntax of this function is fully compatible with
	 * POSIX sysctlby name:
	 * if newp and newlen are != 0 => this is a set
	 * else if oldp and oldlen are != 0 => this is a get
	 *		to avoid too much overhead in the module, the whole
	 *		sysctltable is returned, and the parsing is done in userland,
	 *		a probe request is done to retrieve the size needed to
	 *		transfer the table, before the real request
	 * if both old and new params = 0 => this is a print
	 *		this is a special request, done only by main()
	 *		to implement the extension './ipfw sysctl',
	 *		a command that bypasses the normal getopt, and that
	 *		is available on those platforms that use this
	 *		sysctl emulation.
	 *		in this case, a negative oldlen signals that *oldp
	 *		is actually a FILE* to print somewhere else than stdout
	 *
	 * Return value: 0 when the set is accepted, when a get finds
	 * the variable, or after a print; -1 on allocation or do_cmd()
	 * failures, when a get does not find the variable or when its
	 * buffer is too small.
	 */

	int l;
	int ret;
	int is_get;
	struct dn_id* oid;
	struct sysctlhead* entry;
	char* pstring;
	char* pdata;
	FILE* fp;

	/*
	 * NOTE(review): *oldlenp is a size_t, so this test can never be
	 * true and fp is always stdout; kept as found -- confirm how the
	 * "negative oldlen" convention is meant to be passed.
	 */
	if((oldlenp != NULL) && (*oldlenp < 0))
		fp = (FILE*)oldp;
	else
		fp = stdout;
	if(newp != NULL && newlen != 0)
	{
		//this is a set
		l = sizeof(struct dn_id) + sizeof(struct sysctlhead) + strlen(name)+1 + newlen;
		oid = malloc(l);
		if (oid == NULL)
			return -1;
		oid->len = l;
		oid->type = DN_SYSCTL_SET;
		oid->id = DN_API_VERSION;

		/* layout: dn_id | sysctlhead | new value | name */
		entry = (struct sysctlhead*)(oid+1);
		pdata = (char*)(entry+1);
		pstring = pdata + newlen;

		entry->blocklen = ((sizeof(struct sysctlhead) + strlen(name)+1 + newlen) + 3) & ~3;
		entry->namelen = strlen(name)+1;
		entry->flags = 0;
		entry->datalen = newlen;

		bcopy(newp, pdata, newlen);
		bcopy(name, pstring, strlen(name)+1);

		ret = do_cmd(IP_DUMMYNET3, oid, (uintptr_t)l);
		free(oid);
		/* a set that was accepted is a success */
		return (ret != 0) ? -1 : 0;
	}
	else
	{
		//this is a get or a print
		l = sizeof(struct dn_id);
		oid = malloc(l);
		if (oid == NULL)
			return -1;
		oid->len = l;
		oid->type = DN_SYSCTL_GET;
		oid->id = DN_API_VERSION;

		/* probe request: the needed size comes back in oid->id */
		ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l);
		if (ret != 0) {
			free(oid);
			return -1;
		}

		l=oid->id;
		free(oid);
		oid = malloc(l);
		if (oid == NULL)
			return -1;
		oid->len = l;
		oid->type = DN_SYSCTL_GET;
		oid->id = DN_API_VERSION;

		ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l);
		if (ret != 0) {
			free(oid);
			return -1;
		}

		//time to check if this is a get or a print
		is_get = (name != NULL && oldp != NULL &&
		    oldlenp != NULL && *oldlenp > 0);
		/* a get fails unless the variable is found in the table */
		ret = is_get ? -1 : 0;

		/* the table is a sequence of blocks closed by blocklen == 0 */
		entry = (struct sysctlhead*)(oid+1);
		while(entry->blocklen != 0)
		{
			pdata = (char*)(entry+1);
			pstring = pdata+entry->datalen;

			if(is_get)
			{
				//this is a get
				if(strcmp(name,pstring) == 0)
				{
					//match found, sanity check on len
					if(*oldlenp < entry->datalen)
					{
						printf("%s error: buffer too small\n",__FUNCTION__);
						break;	/* ret is still -1 */
					}
					*oldlenp = entry->datalen;
					bcopy(pdata, oldp, *oldlenp);
					ret = 0;
					break;
				}
			}
			else
			{
				//this is a print
				if( name == NULL )
					goto print;
				/* name matches a whole variable or a prefix up to a '.' */
				if ( (strncmp(pstring,name,strlen(name)) == 0) && ( pstring[strlen(name)]=='\0' || pstring[strlen(name)]=='.' ) )
						goto print;
				else
						goto skip;
print:
				fprintf(fp, "%s: ",pstring);
				switch( entry->flags >> 2 )
				{
					case SYSCTLTYPE_LONG:
						fprintf(fp, "%li ", *(long*)(pdata));
						break;
					case SYSCTLTYPE_UINT:
						fprintf(fp, "%u ", *(unsigned int*)(pdata));
						break;
					case SYSCTLTYPE_ULONG:
						fprintf(fp, "%lu ", *(unsigned long*)(pdata));
						break;
					case SYSCTLTYPE_INT:
					default:
						fprintf(fp, "%i ", *(int*)(pdata));
				}
				if( (entry->flags & 0x00000003) == CTLFLAG_RD )
					fprintf(fp, "\t(read only)\n");
				else
					fprintf(fp, "\n");
skip:			;
			}
			entry = (struct sysctlhead*)((unsigned char*)entry + entry->blocklen);
		}
		free(oid);
		return ret;
	}

#else /* __linux__ */
	/*
	 * The variables are module parameters: the last component of
	 * name (the text after the last '.') selects a file under
	 * /sys/module/ipfw_mod/parameters/ that is read and/or written
	 * as a decimal number.
	 */
	FILE *fp;
	char *basename = "/sys/module/ipfw_mod/parameters/";
	char filename[256];	/* full filename */
	char *varp;
	int ret = 0;		/* return value */
	long d;

	if (name == NULL) /* XXX set errno */
		return -1;

	/* locate the filename */
	varp = strrchr(name, '.');
	if (varp == NULL) /* XXX set errno */
		return -1;

	snprintf(filename, sizeof(filename), "%s%s", basename, varp+1);

	/*
	 * XXX we could open the file here, in rw mode
	 * but need to check if a file have write
	 * permissions.
	 */

	/* check parameters */
	if (oldp && oldlenp) { /* read mode */
		fp = fopen(filename, "r");
		if (fp == NULL) {
			fprintf(stderr, "%s fopen error reading filename %s\n", __FUNCTION__, filename);
			return -1;
		}
		/* no type information: the buffer length selects int or long */
		if (fscanf(fp, "%ld", &d) != 1) {
			ret = -1;
		} else if (*oldlenp == sizeof(int)) {
			int dst = d;
			memcpy(oldp, &dst, *oldlenp);
		} else if (*oldlenp == sizeof(long)) {
			memcpy(oldp, &d, *oldlenp);
		} else {
			fprintf(stderr, "unknown paramerer len %d\n",
				(int)*oldlenp);
		}
		fclose(fp);
	}

	if (newp && newlen) { /* write */
		fp = fopen(filename, "w");
		if (fp == NULL) {
			fprintf(stderr, "%s fopen error writing filename %s\n", __FUNCTION__, filename);
			return -1;
		}
		if (newlen == sizeof(int)) {
			if (fprintf(fp, "%d", *(int *)newp) < 1)
				ret = -1;
		} else if (newlen == sizeof(long)) {
			if (fprintf(fp, "%ld", *(long *)newp) < 1)
				ret = -1;
		} else {
			fprintf(stderr, "unknown paramerer len %d\n",
				(int)newlen);
		}

		fclose(fp);
	}

	return ret;
#endif /* __linux__ */
}

#ifdef _WIN32
/*
 * On windows, set/getsockopt are mapped to DeviceIoControl()
 */
/*
 * setsockopt() replacement for Windows: the request is packed into a
 * single heap buffer (struct sockopt header followed by a copy of the
 * option value) and handed to the driver with DeviceIoControl().
 *
 * s         device handle, carried around as an int (see my_socket())
 * level     accepted for API compatibility, not used
 * sopt_name option identifier stored in the request header
 * optval    option value, optlen bytes are copied into the request
 *
 * Returns 0 if DeviceIoControl() succeeds, -1 on allocation or ioctl
 * failure.
 */
int
wnd_setsockopt(int s, int level, int sopt_name, const void *optval,
                socklen_t optlen)
{
    const size_t req_size = sizeof (struct sockopt) + optlen;
    struct sockopt *req = malloc(req_size);
    DWORD returned;
    BOOL ok;

    if (req == NULL)
        return -1;

    /* header first; the payload lives right behind it in the same buffer */
    req->sopt_dir = SOPT_SET;
    req->sopt_name = sopt_name;
    req->sopt_valsize = optlen;
    req->sopt_val = (void *)(req + 1);
    memcpy(req->sopt_val, optval, optlen);

    /* no output buffer is supplied for a set operation */
    ok = DeviceIoControl ((HANDLE)s, IP_FW_SETSOCKOPT, req, req_size,
		NULL, 0, &returned, NULL);
    free (req);

    if (!ok)
        return -1;
    return 0;
}

/*
 * getsockopt() replacement for Windows: the request header and a copy of
 * the caller's buffer are packed into one heap block which is used both
 * as input and output buffer of DeviceIoControl().
 *
 * s         device handle, carried around as an int (see my_socket())
 * level     accepted for API compatibility, not used
 * sopt_name option identifier stored in the request header
 * optval    in: request data, out: data returned by the driver
 * optlen    in: capacity of optval, out: size reported by the driver
 *
 * Returns 0 if DeviceIoControl() succeeds, -1 on allocation or ioctl
 * failure.
 *
 * The number of bytes copied back never exceeds the capacity the caller
 * passed in *optlen, even if the driver reports a larger sopt_valsize:
 * neither optval nor the request buffer is large enough for more.
 * *optlen itself still carries the size reported by the driver.
 */
int
wnd_getsockopt(int s, int level, int sopt_name, void *optval,
                socklen_t *optlen)
{
    const size_t bufsize = *optlen;	/* capacity of the caller's buffer */
    size_t len = sizeof (struct sockopt) + bufsize;
    size_t ncopy;
    struct sockopt *sock;
    DWORD n;
    BOOL result;
    HANDLE _dev_h = (HANDLE)s;

    sock = malloc(len);
    if (sock == NULL)
        return -1;

    sock->sopt_dir = SOPT_GET;
    sock->sopt_name = sopt_name;
    sock->sopt_valsize = bufsize;
    sock->sopt_val = (void *)(sock+1);

    /* the caller's buffer may carry request parameters, pass it along */
    memcpy (sock->sopt_val, optval, bufsize);

    result = DeviceIoControl (_dev_h, IP_FW_GETSOCKOPT, sock, len,
		sock, len, &n, NULL);

    *optlen = sock->sopt_valsize;

    /* clamp: a reported size larger than the buffer must not overrun it */
    ncopy = (size_t)sock->sopt_valsize;
    if (ncopy > bufsize)
        ncopy = bufsize;
    memcpy (optval, sock->sopt_val, ncopy);

    free (sock);
    return (result ? 0 : -1);
}

/*
 * socket() replacement for Windows: opens the \\.\Ipfw device and returns
 * its handle cast to int, or -1 (after printing a diagnostic) if the
 * device cannot be opened.  The three arguments are not used.
 */
int
my_socket(int domain, int ty, int proto)
{
    HANDLE dev;

    /* Special Handling For Accessing Device On Windows 2000 Terminal Server
       See Microsoft KB Article 259131 */
    dev = CreateFile (TEXT("\\\\.\\Ipfw"),
		GENERIC_READ | GENERIC_WRITE,
		0, NULL,
		OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (dev != INVALID_HANDLE_VALUE)
        return (int)dev;

    printf("%s failed %u, cannot talk to kernel module\n",
		__FUNCTION__, (unsigned)GetLastError());
    return -1;
}

/*
 * Compatibility shim: forwards to gethostbyname(); the address family
 * argument is not consulted.
 */
struct hostent* gethostbyname2(const char *name, int af)
{
	struct hostent *found;

	(void)af;
	found = gethostbyname(name);
	return found;
}

/*
 * Placeholder: MAC address parsing is not implemented here.  Reports the
 * call on stderr and always returns NULL.
 */
struct ether_addr* ether_aton(const char *a)
{
	struct ether_addr *parsed = NULL;	/* nothing is ever parsed */

	(void)a;
	fprintf(stderr, "%s empty\n", __FUNCTION__);
	return parsed;
}

#ifdef TCC
/* State shared between getopt() below and its callers. */
int     opterr = 1,             /* if error message should be printed */
        optind = 1,             /* index into parent argv vector */
        optopt,                 /* character checked for validity */
        optreset;               /* reset getopt */
char    *optarg;                /* argument associated with option */

#define BADCH   (int)'?'        /* getopt(): unknown option or missing argument */
#define BADARG  (int)':'        /* getopt(): missing argument, ostr starts with ':' */
#define EMSG    ""              /* empty string getopt() parks its scan pointer on */

/* program name used as prefix of the messages printed by getopt()/verrx() */
#define PROGNAME	"ipfw"
/*
 * getopt --
 *      Parse argc/argv argument vector.
 *
 * nargc/nargv  argument vector to scan, starting at index optind
 * ostr         accepted option letters; a letter followed by ':' takes
 *              an argument; a leading ':' suppresses the error messages
 *              and makes a missing argument return BADARG
 *
 * Returns the option letter found, -1 when the next word is absent, is
 * not an option, or is "--"; BADCH for an unknown option or a missing
 * argument; BADARG for a missing argument when ostr starts with ':'.
 * The letter examined is left in optopt and the option argument, if any,
 * in optarg.  The scan position inside the current word is kept in a
 * static pointer between calls; setting optreset restarts the scan.
 */
int
getopt(nargc, nargv, ostr)
        int nargc;
        char * const nargv[];
        const char *ostr;
{
        static char *place = EMSG;              /* option letter processing */
        char *oli;                              /* option letter list index */

        if (optreset || *place == 0) {          /* update scanning pointer */
                optreset = 0;
                place = nargv[optind];
                /* the bounds test comes first, so place is only
                   dereferenced when optind is a valid index */
                if (optind >= nargc || *place++ != '-') {
                        /* Argument is absent or is not an option */
                        place = EMSG;
                        return (-1);
                }
                optopt = *place++;
                if (optopt == '-' && *place == 0) {
                        /* "--" => end of options */
                        ++optind;
                        place = EMSG;
                        return (-1);
                }
                if (optopt == 0) {
                        /* Solitary '-', treat as a '-' option
                           if the program (eg su) is looking for it. */
                        place = EMSG;
                        if (strchr(ostr, '-') == NULL)
                                return (-1);
                        optopt = '-';
                }
        } else
                optopt = *place++;              /* next letter of a cluster */

        /* See if option letter is one the caller wanted... */
        if (optopt == ':' || (oli = strchr(ostr, optopt)) == NULL) {
                /* word exhausted: the next call starts on the next word */
                if (*place == 0)
                        ++optind;
                if (opterr && *ostr != ':')
                        (void)fprintf(stderr,
                            "%s: illegal option -- %c\n", PROGNAME,
                            optopt);
                return (BADCH);
        }

        /* Does this option need an argument? */
        if (oli[1] != ':') {
                /* don't need argument */
                optarg = NULL;
                if (*place == 0)
                        ++optind;
        } else {
                /* Option-argument is either the rest of this argument or the
                   entire next argument. */
                if (*place)
                        optarg = place;
                else if (nargc > ++optind)
                        optarg = nargv[optind];
                else {
                        /* option-argument absent */
                        place = EMSG;
                        if (*ostr == ':')
                                return (BADARG);
                        if (opterr)
                                (void)fprintf(stderr,
                                    "%s: option requires an argument -- %c\n",
                                    PROGNAME, optopt);
                        return (BADCH);
                }
                /* the argument consumed the word: restart on the next one */
                place = EMSG;
                ++optind;
        }
        return (optopt);                        /* return option letter */
}

//static FILE *err_file = stderr;
/*
 * Print "PROGNAME: <formatted message>\n" on stderr.
 *
 * ex    when non-zero the process exits with status eval afterwards
 * eval  exit status, only used when ex is non-zero
 * fmt   printf-style format, may be NULL to print the prefix only
 * ap    arguments for fmt
 */
void
verrx(int ex, int eval, const char *fmt, va_list ap)
{
	fprintf(stderr, "%s: ", PROGNAME);
	if (fmt)
		vfprintf(stderr, fmt, ap);
	fprintf(stderr, "\n");

	if (!ex)
		return;
	exit(eval);
}
/*
 * Print a formatted message prefixed by the program name on stderr and
 * terminate the process with exit status eval.
 */
void
errx(int eval, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	/* first argument 1: verrx() exits after printing */
	verrx(1, eval, fmt, args);
	va_end(args);
}

/*
 * Print a formatted message prefixed by the program name on stderr and
 * return to the caller.
 */
void
warnx(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	/* first argument 0: verrx() only prints, the exit value is unused */
	verrx(0, 0, fmt, args);
	va_end(args);
}

/*
 * Split off the next token of *stringp.
 *
 * The token starts at *stringp and ends at the first character that is
 * contained in delim (or at the end of the string).  The delimiter is
 * overwritten with NUL and *stringp is advanced past it; when the end of
 * the string is reached *stringp is set to NULL instead.
 *
 * Returns the start of the token, or NULL if *stringp was already NULL.
 */
char *
strsep(char **stringp, const char *delim)
{
	char *start = *stringp;
	char *cur;
	const char *d;

	if (start == NULL)
		return (NULL);

	for (cur = start; ; cur++) {
		/*
		 * Compare against every delimiter including the terminating
		 * NUL of delim, so the end of the string ends the token too.
		 */
		for (d = delim; ; d++) {
			if (*d == *cur) {
				if (*cur == '\0') {
					*stringp = NULL;
				} else {
					*cur = '\0';
					*stringp = cur + 1;
				}
				return (start);
			}
			if (*d == '\0')
				break;
		}
	}
	/* NOTREACHED */
}

/* Map 'A'..'Z' to 'a'..'z'; every other character is returned unchanged. */
static unsigned char
tolower(unsigned char c)
{
	if (c < 'A' || c > 'Z')
		return c;
	return c + ('a' - 'A');
}

/* Return 1 if c is one of '0'..'9', 0 otherwise. */
static int isdigit(unsigned char c)
{
	if (c < '0')
		return 0;
	return (c <= '9') ? 1 : 0;
}

/*
 * Return 1 if c is a hexadecimal digit (0-9, A-F, a-f), 0 otherwise.
 *
 * NUL is rejected explicitly: strchr() also matches the terminator of the
 * digit string, which would otherwise classify '\0' as a hex digit and
 * let parsers run past the end of their input.
 */
static int isxdigit(unsigned char c)
{
	if (c == '\0')
		return 0;
	return (strchr("0123456789ABCDEFabcdef", c) ? 1 : 0);
}

/*
 * Return 1 if c is white space in the sense of the C library's isspace()
 * for the "C" locale (space, \t, \n, \v, \f, \r), 0 otherwise.
 *
 * NUL is rejected explicitly: strchr() also matches the terminator of the
 * character list, which would otherwise classify '\0' as white space.
 */
static int isspace(unsigned char c)
{
	if (c == '\0')
		return 0;
	return (strchr(" \t\n\v\f\r", c) ? 1 : 0);
}

/* Return 1 if c is a 7-bit character (high bit clear), 0 otherwise. */
static int isascii(unsigned char c)
{
	return ((c & 0x80) == 0);
}

/* Return 1 if c is one of 'a'..'z', 0 otherwise. */
static int islower(unsigned char c)
{
	return !(c < 'a' || c > 'z');
}

/*
 * Compare two strings ignoring the case of letters (as folded by
 * tolower()).  Returns 0 if they are equal, otherwise the difference of
 * the first pair of folded characters that differ, taken as unsigned
 * char values.
 */
int
strcasecmp(const char *s1, const char *s2)
{
	const unsigned char *a = (const unsigned char *)s1;
	const unsigned char *b = (const unsigned char *)s2;

	for (;; a++, b++) {
		int ca = tolower(*a);
		int cb = tolower(*b);

		if (ca != cb)
			return (ca - cb);
		/* equal so far; both strings end here */
		if (*a == '\0')
			return (0);
	}
}

/*
 * Minimal strtoimax(): delegates to strtol() and widens the result.
 * nptr, endptr and base have the same meaning as for strtol().
 * NOTE(review): the conversion itself is done in type long, so values
 * outside the range of long are presumably not representable - confirm
 * this is acceptable for the callers.
 */
intmax_t
strtoimax(const char * restrict nptr, char ** restrict endptr, int base)
{
	long parsed = strtol(nptr, endptr, base);

	return (intmax_t)parsed;
}

/* No-op stand-in for setservent(); the argument is ignored. */
void
setservent(int a)
{
	(void)a;
}

#define NS_INADDRSZ 4	/* size in bytes of an IPv4 address */

/*
 * Convert a dotted-decimal IPv4 address ("a.b.c.d") to binary form.
 *
 * af   address family, only AF_INET is supported
 * src  NUL-terminated text; exactly four decimal octets (0..255, no
 *      leading zeros) separated by single dots
 * dst  receives NS_INADDRSZ (4) bytes in network order on success and is
 *      left untouched otherwise
 *
 * Returns 1 on success, 0 if src is not a valid address, -1 with errno
 * set to EINVAL if af is not AF_INET.
 */
int
inet_pton(int af, const char *src, void *dst)
{
        static const char digits[] = "0123456789";
        int saw_digit, octets, ch;
        u_char tmp[NS_INADDRSZ], *tp;

	if (af != AF_INET) {
		errno = EINVAL;
		return -1;
	}

        saw_digit = 0;
        octets = 0;
        *(tp = tmp) = 0;
        while ((ch = *src++) != '\0') {
                const char *pch;

                if ((pch = strchr(digits, ch)) != NULL) {
                        u_int new = *tp * 10 + (pch - digits);

                        /* a second digit after a leading zero is rejected */
                        if (saw_digit && *tp == 0)
                                return (0);
                        if (new > 255)
                                return (0);
                        *tp = new;
                        if (!saw_digit) {
                                if (++octets > 4)
                                        return (0);
                                saw_digit = 1;
                        }
                } else if (ch == '.' && saw_digit) {
                        /* at most three dots, so tp stays inside tmp[] */
                        if (octets == 4)
                                return (0);
                        *++tp = 0;
                        saw_digit = 0;
                } else
                        return (0);
        }
        if (octets < 4)
                return (0);
        /* all four octets have been written; copy exactly one address */
        memcpy(dst, tmp, NS_INADDRSZ);
        return (1);
}

/*
 * Format a binary IPv4 address as dotted-decimal text.
 *
 * af    address family, only AF_INET is supported
 * _src  the four address bytes
 * dst   output buffer
 * size  capacity of dst in bytes
 *
 * Returns dst on success.  Returns NULL with errno set to EINVAL if af is
 * not AF_INET, or to ENOSPC if the text (plus terminator) does not fit
 * into size bytes.
 */
const char *
inet_ntop(int af, const void *_src, char *dst, socklen_t size)
{
	const u_char *octet = _src;
	char text[sizeof "255.255.255.255"];
	int n;

	if (af != AF_INET) {
		errno = EINVAL;
		return NULL;
	}

	/* format into a scratch buffer first so dst is only touched on success */
	n = snprintf(text, sizeof(text), "%u.%u.%u.%u",
	    octet[0], octet[1], octet[2], octet[3]);
	if (n <= 0 || (socklen_t) n >= size) {
		errno = ENOSPC;
		return (NULL);
	}
	strlcpy(dst, text, size);
	return (dst);
}

/*%
 * Check whether "cp" is a valid ascii representation
 * of an Internet address and convert to a binary address.
 * Returns 1 if the address is valid, 0 if not.
 * This replaces inet_addr, the return value from which
 * cannot distinguish between failure and a local broadcast address.
 *
 * cp    text to parse: one to four numbers separated by dots, each in
 *       decimal, octal (leading 0) or hexadecimal (leading 0x/0X);
 *       the address may be followed by a white-space character
 * addr  if not NULL, receives the address in network byte order
 */
int
inet_aton(const char *cp, struct in_addr *addr) {
        u_long val;
        int base, n;
        char c;
        u_int8_t parts[4];      /* values of the dot-terminated parts */
        u_int8_t *pp = parts;   /* next free slot in parts[] */
        int digit;              /* set once the current number has a digit */

        c = *cp;
        for (;;) {
                /*
                 * Collect number up to ``.''.
                 * Values are specified as for C:
                 * 0x=hex, 0=octal, isdigit=decimal.
                 */
                if (!isdigit((unsigned char)c))
                        return (0);
                val = 0; base = 10; digit = 0;
                if (c == '0') {
                        c = *++cp;
                        if (c == 'x' || c == 'X')
                                base = 16, c = *++cp;
                        else {
                                /* the leading 0 itself counts as a digit */
                                base = 8;
                                digit = 1 ;
                        }
                }
                for (;;) {
                        if (isascii(c) && isdigit((unsigned char)c)) {
                                if (base == 8 && (c == '8' || c == '9'))
                                        return (0);
                                val = (val * base) + (c - '0');
                                c = *++cp;
                                digit = 1;
                        } else if (base == 16 && isascii(c) &&
                                   isxdigit((unsigned char)c)) {
                                /* 'a'/'A' map to 10, 'b'/'B' to 11, ... */
                                val = (val << 4) |
                                        (c + 10 - (islower((unsigned char)c) ? 'a' : 'A'));
                                c = *++cp;
                                digit = 1;
                        } else
                                break;
                }
                if (c == '.') {
                        /*
                         * Internet format:
                         *      a.b.c.d
                         *      a.b.c   (with c treated as 16 bits)
                         *      a.b     (with b treated as 24 bits)
                         */
                        /* at most three dot-terminated parts, one byte each */
                        if (pp >= parts + 3 || val > 0xffU)
                                return (0);
                        *pp++ = val;
                        c = *++cp;
                } else
                        break;
        }
        /*
         * Check for trailing characters.
         */
        if (c != '\0' && (!isascii(c) || !isspace((unsigned char)c)))
                return (0);
        /*
         * Did we get a valid digit?
         */
        if (!digit)
                return (0);
        /*
         * Concoct the address according to
         * the number of parts specified.
         */
        /* n counts the stored parts plus the final number still in val */
        n = pp - parts + 1;
        switch (n) {
        case 1:                         /*%< a -- 32 bits */
                break;

        case 2:                         /*%< a.b -- 8.24 bits */
                if (val > 0xffffffU)
                        return (0);
                val |= parts[0] << 24;
                break;

        case 3:                         /*%< a.b.c -- 8.8.16 bits */
                if (val > 0xffffU)
                        return (0);
                val |= (parts[0] << 24) | (parts[1] << 16);
                break;

        case 4:                         /*%< a.b.c.d -- 8.8.8.8 bits */
                if (val > 0xffU)
                        return (0);
                val |= (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8);
                break;
        }
        if (addr != NULL)
                addr->s_addr = htonl(val);
        return (1);
}

#endif /* TCC */

#endif /* _WIN32 */


================================================
FILE: ipfw/humanize_number.c
================================================
/*	$NetBSD: humanize_number.c,v 1.13 2007/12/14 17:26:19 christos Exp $	*/

/*
 * Copyright (c) 1997, 1998, 1999, 2002 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Luke Mewburn and by Tomas Svensson.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the NetBSD
 *      Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

// #include <sys/cdefs.h>
__FBSDID("$FreeBSD: src/lib/libutil/humanize_number.c,v 1.2.10.1 2008/04/20 16:29:01 antoine Exp $");

#include <sys/types.h>
#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// #include <locale.h>
//#include <libutil.h>

/*
 * Format a number in a compact, human readable form such as "12 K" or
 * "1.5M", dividing it by 1000 or 1024 until it fits the buffer or the
 * requested scale has been reached.
 *
 * buf     output buffer (must not be NULL)
 * len     size of buf in bytes
 * bytes   value to format; may be negative
 * suffix  text appended after the unit prefix (must not be NULL)
 * scale   number of divisions to apply, or HN_AUTOSCALE to pick the
 *         smallest scale that fits, or HN_GETSCALE to only compute it
 * flags   HN_DIVISOR_1000 (divide by 1000 instead of 1024), HN_B (use
 *         "B" as the prefix of the unscaled value), HN_NOSPACE (no blank
 *         between number and prefix), HN_DECIMAL (one decimal digit for
 *         small scaled values)
 *
 * Returns -1 on invalid arguments or if buf is too small, the scale
 * index if HN_GETSCALE is set in scale, otherwise the return value of
 * snprintf().
 *
 * NOTE(review): the HN_* macros are not defined by the headers included
 * above (libutil.h is commented out); presumably they are supplied by the
 * build - confirm.
 */
int
humanize_number(char *buf, size_t len, int64_t bytes,
    const char *suffix, int scale, int flags)
{
	const char *prefixes, *sep;
	int	b, i, r, maxscale, s1, s2, sign;
	int64_t	divisor, max;
	size_t	baselen;

	assert(buf != NULL);
	assert(suffix != NULL);
	assert(scale >= 0);

	/*
	 * prefixes is a packed table of two-byte entries (one character
	 * plus NUL), indexed through SCALE2PREFIX() below.
	 */
	if (flags & HN_DIVISOR_1000) {
		/* SI for decimal multiplies */
		divisor = 1000;
		if (flags & HN_B)
			prefixes = "B\0k\0M\0G\0T\0P\0E";
		else
			prefixes = "\0\0k\0M\0G\0T\0P\0E";
	} else {
		/*
		 * binary multiplies
		 * XXX IEC 60027-2 recommends Ki, Mi, Gi...
		 */
		divisor = 1024;
		if (flags & HN_B)
			prefixes = "B\0K\0M\0G\0T\0P\0E";
		else
			prefixes = "\0\0K\0M\0G\0T\0P\0E";
	}

#define	SCALE2PREFIX(scale)	(&prefixes[(scale) << 1])
	maxscale = 7;

	/* an explicit scale must select one of the table entries */
	if (scale >= maxscale &&
	    (scale & (HN_AUTOSCALE | HN_GETSCALE)) == 0)
		return (-1);

	/* still checked here for builds where assert() is compiled out */
	if (buf == NULL || suffix == NULL)
		return (-1);

	if (len > 0)
		buf[0] = '\0';
	/*
	 * From here on bytes holds the magnitude multiplied by 100, so two
	 * decimal places survive the integer divisions below.
	 */
	if (bytes < 0) {
		sign = -1;
		bytes *= -100;
		baselen = 3;		/* sign, digit, prefix */
	} else {
		sign = 1;
		bytes *= 100;
		baselen = 2;		/* digit, prefix */
	}
	if (flags & HN_NOSPACE)
		sep = "";
	else {
		sep = " ";
		baselen++;
	}
	baselen += strlen(suffix);

	/* Check if enough room for `x y' + suffix + `\0' */
	if (len < baselen + 1)
		return (-1);

	if (scale & (HN_AUTOSCALE | HN_GETSCALE)) {
		/* See if there is additional columns can be used. */
		for (max = 100, i = len - baselen; i-- > 0;)
			max *= 10;

		/*
		 * Divide the number until it fits the given column.
		 * If there will be an overflow by the rounding below,
		 * divide once more.
		 */
		for (i = 0; bytes >= max - 50 && i < maxscale; i++)
			bytes /= divisor;

		if (scale & HN_GETSCALE)
			return (i);
	} else
		for (i = 0; i < scale && i < maxscale; i++)
			bytes /= divisor;

	/* If a value <= 9.9 after rounding and ... */
	if (bytes < 995 && i > 0 && flags & HN_DECIMAL) {
		/* baselen + \0 + .N */
		if (len < baselen + 1 + 2)
			return (-1);
		/* round to tenths: b is the value times 10 */
		b = ((int)bytes + 5) / 10;
		s1 = b / 10;
		s2 = b % 10;
		r = snprintf(buf, len, "%d%s%d%s%s%s",
		    sign * s1, ".", s2,
		    sep, SCALE2PREFIX(i), suffix);
	} else
		/* round to the nearest integer and drop the factor 100 */
		r = snprintf(buf, len, "%" PRId64 "%s%s%s",
		    sign * ((bytes + 50) / 100),
		    sep, SCALE2PREFIX(i), suffix);

	return (r);
}


================================================
FILE: ipfw/include/alias.h
================================================
#ifndef _ALIAS_H_
#define	_ALIAS_H_

#define LIBALIAS_BUF_SIZE 128

/*
 * The PKT_ALIAS_* values below are single-bit flags and can be or-ed
 * together.
 */

/*
 * If PKT_ALIAS_LOG is set, a message will be printed to /var/log/alias.log
 * every time a link is created or deleted.  This is useful for debugging.
 */
#define	PKT_ALIAS_LOG			0x01

/*
 * If PKT_ALIAS_DENY_INCOMING is set, then incoming connections (e.g. to ftp,
 * telnet or web servers) will be prevented by the aliasing mechanism.
 */
#define	PKT_ALIAS_DENY_INCOMING		0x02

/*
 * If PKT_ALIAS_SAME_PORTS is set, packets will be attempted sent from the
 * same port as they originated on.  This allows e.g. rsh to work *99% of the
 * time*, but _not_ 100% (it will be slightly flakey instead of not working
 * at all).  This mode bit is set by PacketAliasInit(), so it is a default
 * mode of operation.
 */
#define	PKT_ALIAS_SAME_PORTS		0x04

/*
 * If PKT_ALIAS_USE_SOCKETS is set, then when partially specified links (e.g.
 * destination port and/or address is zero), the packet aliasing engine will
 * attempt to allocate a socket for the aliasing port it chooses.  This will
 * avoid interference with the host machine.  Fully specified links do not
 * require this.  This bit is set after a call to PacketAliasInit(), so it is
 * a default mode of operation.
 * The flag is only defined when NO_USE_SOCKETS is not defined.
 */
#ifndef	NO_USE_SOCKETS
#define	PKT_ALIAS_USE_SOCKETS		0x08
#endif
/*-
 * If PKT_ALIAS_UNREGISTERED_ONLY is set, then only packets with
 * unregistered source addresses will be aliased.  Private
 * addresses are those in the following ranges:
 *
 *		10.0.0.0     ->   10.255.255.255
 *		172.16.0.0   ->   172.31.255.255
 *		192.168.0.0  ->   192.168.255.255
 */
#define	PKT_ALIAS_UNREGISTERED_ONLY	0x10

/*
 * If PKT_ALIAS_RESET_ON_ADDR_CHANGE is set, then the table of dynamic
 * aliasing links will be reset whenever PacketAliasSetAddress() changes the
 * default aliasing address.  If the default aliasing address is left
 * unchanged by this function call, then the table of dynamic aliasing links
 * will be left intact.  This bit is set after a call to PacketAliasInit().
 */
#define	PKT_ALIAS_RESET_ON_ADDR_CHANGE	0x20


/*
 * If PKT_ALIAS_PROXY_ONLY is set, then NAT will be disabled and only
 * transparent proxying is performed.
 */
#define	PKT_ALIAS_PROXY_ONLY		0x40

/*
 * If PKT_ALIAS_REVERSE is set, the actions of PacketAliasIn() and
 * PacketAliasOut() are reversed.
 */
#define	PKT_ALIAS_REVERSE		0x80

#endif				/* !_ALIAS_H_ */


================================================
FILE: ipfw/include/net/if_dl.h
================================================
/*-
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)if_dl.h	8.1 (Berkeley) 6/10/93
 * $FreeBSD: src/sys/net/if_dl.h,v 1.14 2005/01/07 01:45:34 imp Exp $
 */

#ifndef _NET_IF_DL_H_
#define _NET_IF_DL_H_

/*
 * A Link-Level Sockaddr may specify the interface in one of two
 * ways: either by means of a system-provided index number (computed
 * anew and possibly differently on every reboot), or by a human-readable
 * string such as "il0" (for managerial convenience).
 *
 * Census taking actions, such as something akin to SIOCGCONF would return
 * both the index and the human name.
 *
 * High volume transactions (such as giving a link-level ``from'' address
 * in a recvfrom or recvmsg call) may be likely only to provide the indexed
 * form, (which requires fewer copy operations and less space).
 *
 * The form and interpretation  of the link-level address is purely a matter
 * of convention between the device driver and its consumers; however, it is
 * expected that all drivers for an interface of a given if_type will agree.
 */

/*
 * Structure of a Link-Level sockaddr:
 */
struct sockaddr_dl {
	u_char	sdl_len;	/* Total length of sockaddr */
	u_char	sdl_family;	/* AF_LINK */
	u_short	sdl_index;	/* if != 0, system given index for interface */
	u_char	sdl_type;	/* interface type */
	u_char	sdl_nlen;	/* interface name length, no trailing 0 reqd. */
	u_char	sdl_alen;	/* link level address length */
	u_char	sdl_slen;	/* link layer selector length */
	char	sdl_data[46];	/* minimum work area, can be larger;
				   contains both if name and ll address */
};

/*
 * Address of the link-level address inside sdl_data: it starts sdl_nlen
 * bytes into the work area, i.e. right after the interface name.
 */
#define LLADDR(s) ((caddr_t)((s)->sdl_data + (s)->sdl_nlen))

#ifndef _KERNEL

#include <sys/cdefs.h>

__BEGIN_DECLS
void	link_addr(const char *, struct sockaddr_dl *);
char	*link_ntoa(const struct sockaddr_dl *);
__END_DECLS

#endif /* !_KERNEL */

#endif


================================================
FILE: ipfw/include/net/pfvar.h
================================================
#ifndef _PF_VAR_H_
#define _PF_VAR_H_

/*
 * replacement for FreeBSD's pfqueue.h
 */
#include <sys/queue.h>

#define DIOCSTARTALTQ   _IO  ('D', 42)
#define DIOCSTOPALTQ    _IO  ('D', 43)

/*
 * Cut-down description of an ALTQ queue: only the members listed here
 * are declared (the "..." marks members left out of this replacement
 * header).
 */
struct pf_altq {
	TAILQ_ENTRY(pf_altq)     entries;	/* tail queue linkage */
	/* ... */
        u_int32_t                qid;           /* return value */

#define PF_QNAME_SIZE            64
        char                     qname[PF_QNAME_SIZE];  /* queue name */

};

/* Argument of the DIOCGETALTQS / DIOCGETALTQ ioctls defined below. */
struct pfioc_altq {
        u_int32_t        action;
        u_int32_t        ticket;
        u_int32_t        nr;
        struct pf_altq   altq;
};

#define DIOCGETALTQS    _IOWR('D', 47, struct pfioc_altq)
#define DIOCGETALTQ    _IOWR('D', 48, struct pfioc_altq)

#endif /* !_PF_VAR_H_ */


================================================
FILE: ipfw/include/timeconv.h
================================================
/*
 * simple override for _long_to_time()
 */
#ifndef _TIMECONV_H_
#define _TIMECONV_H_
/*
 * Convert a long to time_t.  When long is as wide as __int32_t the value
 * is first narrowed to __int32_t, otherwise it is converted directly.
 */
static __inline time_t
_long_to_time(long tlong)
{
    const int long_is_32bit = (sizeof(long) == sizeof(__int32_t));

    return long_is_32bit ? (time_t)(__int32_t)(tlong) : (time_t)tlong;
}

#endif /* _TIMECONV_H_ */


================================================
FILE: ipfw/ipfw.8
================================================
.\"
.\" $FreeBSD$
.\"
.Dd October 25, 2012
.Dt IPFW 8
.Os
.Sh NAME
.Nm ipfw
.Nd User interface for firewall, traffic shaper, packet scheduler,
in-kernel NAT.
.Sh SYNOPSIS
.Ss FIREWALL CONFIGURATION
.Nm
.Op Fl cq
.Cm add
.Ar rule
.Nm
.Op Fl acdefnNStT
.Op Cm set Ar N
.Brq Cm list | show
.Op Ar rule | first-last ...
.Nm
.Op Fl f | q
.Op Cm set Ar N
.Cm flush
.Nm
.Op Fl q
.Op Cm set Ar N
.Brq Cm delete | zero | resetlog
.Op Ar number ...
.Pp
.Nm
.Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ...
.Nm
.Cm set move
.Op Cm rule
.Ar number Cm to Ar number
.Nm
.Cm set swap Ar number number
.Nm
.Cm set show
.Ss SYSCTL SHORTCUTS
.Nm
.Cm enable
.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive
.Nm
.Cm disable
.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive
.Ss LOOKUP TABLES
.Nm
.Cm table Ar number Cm add Ar addr Ns Oo / Ns Ar masklen Oc Op Ar value
.Nm
.Cm table Ar number Cm delete Ar addr Ns Op / Ns Ar masklen
.Nm
.Cm table
.Brq Ar number | all
.Cm flush
.Nm
.Cm table
.Brq Ar number | all
.Cm list
.Ss DUMMYNET CONFIGURATION (TRAFFIC SHAPER AND PACKET SCHEDULER)
.Nm
.Brq Cm pipe | queue | sched
.Ar number
.Cm config
.Ar config-options
.Nm
.Op Fl s Op Ar field
.Brq Cm pipe | queue | sched
.Brq Cm delete | list | show
.Op Ar number ...
.Ss IN-KERNEL NAT
.Nm
.Op Fl q
.Cm nat
.Ar number
.Cm config
.Ar config-options
.Pp
.Nm
.Op Fl cfnNqS
.Oo
.Fl p Ar preproc
.Oo
.Ar preproc-flags
.Oc
.Oc
.Ar pathname
.Sh DESCRIPTION
The
.Nm
utility is the user interface for controlling the
.Xr ipfw 4
firewall, the
.Xr dummynet 4
traffic shaper/packet scheduler, and the
in-kernel NAT services.
.Pp
A firewall configuration, or
.Em ruleset ,
is made of a list of
.Em rules
numbered from 1 to 65535.
Packets are passed to the firewall
from a number of different places in the protocol stack
(depending on the source and destination of the packet,
it is possible for the firewall to be
invoked multiple times on the same packet).
The packet passed to the firewall is compared
against each of the rules in the
.Em ruleset ,
in rule-number order
(multiple rules with the same number are permitted, in which case
they are processed in order of insertion).
When a match is found, the action corresponding to the
matching rule is performed.
.Pp
Depending on the action and certain system settings, packets
can be reinjected into the firewall at some rule after the
matching one for further processing.
.Pp
A ruleset always includes a
.Em default
rule (numbered 65535) which cannot be modified or deleted,
and matches all packets.
The action associated with the
.Em default
rule can be either
.Cm deny
or
.Cm allow
depending on how the kernel is configured.
.Pp
If the ruleset includes one or more rules with the
.Cm keep-state
or
.Cm limit
option,
the firewall will have a
.Em stateful
behaviour, i.e., upon a match it will create
.Em dynamic rules ,
i.e., rules that match packets with the same 5-tuple
(protocol, source and destination addresses and ports)
as the packet which caused their creation.
Dynamic rules, which have a limited lifetime, are checked
at the first occurrence of a
.Cm check-state ,
.Cm keep-state
or
.Cm limit
rule, and are typically used to open the firewall on-demand to
legitimate traffic only.
See the
.Sx STATEFUL FIREWALL
and
.Sx EXAMPLES
Sections below for more information on the stateful behaviour of
.Nm .
.Pp
All rules (including dynamic ones) have a few associated counters:
a packet count, a byte count, a log count and a timestamp
indicating the time of the last match.
Counters can be displayed or reset with
.Nm
commands.
.Pp
Each rule belongs to one of 32 different
.Em sets ,
and there are
.Nm
commands to atomically manipulate sets, such as enable,
disable, swap sets, move all rules in a set to another
one, delete all rules in a set.
These can be useful to
install temporary configurations, or to test them.
See Section
.Sx SETS OF RULES
for more information on
.Em sets .
.Pp
Rules can be added with the
.Cm add
command; deleted individually or in groups with the
.Cm delete
command, and globally (except those in set 31) with the
.Cm flush
command; displayed, optionally with the content of the
counters, using the
.Cm show
and
.Cm list
commands.
Finally, counters can be reset with the
.Cm zero
and
.Cm resetlog
commands.
.Pp
.Ss COMMAND OPTIONS
The following general options are available when invoking
.Nm :
.Bl -tag -width indent
.It Fl a
Show counter values when listing rules.
The
.Cm show
command implies this option.
.It Fl b
Only show the action and the comment, not the body of a rule.
Implies
.Fl c .
.It Fl c
When entering or showing rules, print them in compact form,
i.e., omitting the "ip from any to any" string
when this does not carry any additional information.
.It Fl d
When listing, show dynamic rules in addition to static ones.
.It Fl e
When listing and
.Fl d
is specified, also show expired dynamic rules.
.It Fl f
Do not ask for confirmation for commands that can cause problems
if misused, i.e.,
.Cm flush .
If there is no tty associated with the process, this is implied.
.It Fl i
When listing a table (see the
.Sx LOOKUP TABLES
section below for more information on lookup tables), format values
as IP addresses.
By default, values are shown as integers.
.It Fl n
Only check syntax of the command strings, without actually passing
them to the kernel.
.It Fl N
Try to resolve addresses and service names in output.
.It Fl q
Be quiet when executing the
.Cm add ,
.Cm nat ,
.Cm zero ,
.Cm resetlog
or
.Cm flush
commands;
(implies
.Fl f ) .
This is useful when updating rulesets by executing multiple
.Nm
commands in a script
(e.g.,
.Ql sh\ /etc/rc.firewall ) ,
or by processing a file with many
.Nm
rules across a remote login session.
It also stops a table add or delete
from failing if the entry already exists or is not present.
.Pp
The reason why this option may be important is that
for some of these actions,
.Nm
may print a message; if the action results in blocking the
traffic to the remote client,
the remote login session will be closed
and the rest of the ruleset will not be processed.
Access to the console would then be required to recover.
.It Fl S
When listing rules, show the
.Em set
each rule belongs to.
If this flag is not specified, disabled rules will not be
listed.
.It Fl s Op Ar field
When listing pipes, sort according to one of the four
counters (total or current packets or bytes).
.It Fl t
When listing, show last match timestamp converted with ctime().
.It Fl T
When listing, show last match timestamp as seconds from the epoch.
This form can be more convenient for postprocessing by scripts.
.El
.Ss LIST OF RULES AND PREPROCESSING
To ease configuration, rules can be put into a file which is
processed using
.Nm
as shown in the last synopsis line.
An absolute
.Ar pathname
must be used.
The file will be read line by line and applied as arguments to the
.Nm
utility.
.Pp
Optionally, a preprocessor can be specified using
.Fl p Ar preproc
where
.Ar pathname
is to be piped through.
Useful preprocessors include
.Xr cpp 1
and
.Xr m4 1 .
If
.Ar preproc
does not start with a slash
.Pq Ql /
as its first character, the usual
.Ev PATH
name search is performed.
Care should be taken with this in environments where not all
file systems are mounted (yet) by the time
.Nm
is being run (e.g.\& when they are mounted over NFS).
Once
.Fl p
has been specified, any additional arguments are passed on to the preprocessor
for interpretation.
This allows for flexible configuration files (like conditionalizing
them on the local hostname) and the use of macros to centralize
frequently required arguments like IP addresses.
.Ss TRAFFIC SHAPER CONFIGURATION
The
.Nm
.Cm pipe , queue
and
.Cm sched
commands are used to configure the traffic shaper and packet scheduler.
See the
.Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION
Section below for details.
.Pp
If the world and the kernel get out of sync the
.Nm
ABI may break, preventing you from being able to add any rules.
This can adversely affect the booting process.
You can use
.Nm
.Cm disable
.Cm firewall
to temporarily disable the firewall to regain access to the network,
allowing you to fix the problem.
.Sh PACKET FLOW
A packet is checked against the active ruleset in multiple places
in the protocol stack, under control of several sysctl variables.
These places and variables are shown below, and it is important to
have this picture in mind in order to design a correct ruleset.
.Bd -literal -offset indent
       ^    to upper layers    V
       |                       |
       +----------->-----------+
       ^                       V
 [ip(6)_input]           [ip(6)_output]     net.inet(6).ip(6).fw.enable=1
       |                       |
       ^                       V
 [ether_demux]        [ether_output_frame]  net.link.ether.ipfw=1
       |                       |
       +-->--[bdg_forward]-->--+            net.link.bridge.ipfw=1
       ^                       V
       |      to devices       |
.Ed
.Pp
The number of
times the same packet goes through the firewall can
vary between 0 and 4 depending on packet source and
destination, and system configuration.
.Pp
Note that as packets flow through the stack, headers can be
stripped or added to it, and so they may or may not be available
for inspection.
E.g., incoming packets will include the MAC header when
.Nm
is invoked from
.Cm ether_demux() ,
but the same packets will have the MAC header stripped off when
.Nm
is invoked from
.Cm ip_input()
or
.Cm ip6_input() .
.Pp
Also note that each packet is always checked against the complete ruleset,
irrespective of the place where the check occurs, or the source of the packet.
If a rule contains some match patterns or actions which are not valid
for the place of invocation (e.g.\& trying to match a MAC header within
.Cm ip_input
or
.Cm ip6_input ),
the match pattern will not match, but a
.Cm not
operator in front of such patterns
.Em will
cause the pattern to
.Em always
match on those packets.
It is thus the responsibility of
the programmer, if necessary, to write a suitable ruleset to
differentiate among the possible places.
.Cm skipto
rules can be useful here, as an example:
.Bd -literal -offset indent
# packets from ether_demux or bdg_forward
ipfw add 10 skipto 1000 all from any to any layer2 in
# packets from ip_input
ipfw add 10 skipto 2000 all from any to any not layer2 in
# packets from ip_output
ipfw add 10 skipto 3000 all from any to any not layer2 out
# packets from ether_output_frame
ipfw add 10 skipto 4000 all from any to any layer2 out
.Ed
.Pp
(yes, at the moment there is no way to differentiate between
ether_demux and bdg_forward).
.Sh SYNTAX
In general, each keyword or argument must be provided as
a separate command line argument, with no leading or trailing
spaces.
Keywords are case-sensitive, whereas arguments may
or may not be case-sensitive depending on their nature
(e.g.\& uid's are, hostnames are not).
.Pp
Some arguments (e.g., port or address lists) are comma-separated
lists of values.
In this case, spaces after commas ',' are allowed to make
the line more readable.
You can also put the entire
command (including flags) into a single argument.
E.g., the following forms are equivalent:
.Bd -literal -offset indent
ipfw -q add deny src-ip 10.0.0.0/24,127.0.0.1/8
ipfw -q add deny src-ip 10.0.0.0/24, 127.0.0.1/8
ipfw "-q add deny src-ip 10.0.0.0/24, 127.0.0.1/8"
.Ed
.Sh RULE FORMAT
The format of firewall rules is the following:
.Bd -ragged -offset indent
.Bk -words
.Op Ar rule_number
.Op Cm set Ar set_number
.Op Cm prob Ar match_probability
.Ar action
.Op Cm log Op Cm logamount Ar number
.Op Cm altq Ar queue
.Oo
.Bro Cm tag | untag
.Brc Ar number
.Oc
.Ar body
.Ek
.Ed
.Pp
where the body of the rule specifies which information is used
for filtering packets, among the following:
.Pp
.Bl -tag -width "Source and dest. addresses and ports" -offset XXX -compact
.It Layer-2 header fields
When available
.It IPv4 and IPv6 Protocol
TCP, UDP, ICMP, etc.
.It Source and dest. addresses and ports
.It Direction
See Section
.Sx PACKET FLOW
.It Transmit and receive interface
By name or address
.It Misc. IP header fields
Version, type of service, datagram length, identification,
fragment flag (non-zero IP offset),
Time To Live
.It IP options
.It IPv6 Extension headers
Fragmentation, Hop-by-Hop options,
Routing Headers, Source routing rthdr0, Mobile IPv6 rthdr2, IPSec options.
.It IPv6 Flow-ID
.It Misc. TCP header fields
TCP flags (SYN, FIN, ACK, RST, etc.),
sequence number, acknowledgment number,
window
.It TCP options
.It ICMP types
for ICMP packets
.It ICMP6 types
for ICMP6 packets
.It User/group ID
When the packet can be associated with a local socket.
.It Divert status
Whether a packet came from a divert socket (e.g.,
.Xr natd 8 ) .
.It Fib annotation state
Whether a packet has been tagged for using a specific FIB (routing table)
in future forwarding decisions.
.El
.Pp
Note that some of the above information, e.g.\& source MAC or IP addresses and
TCP/UDP ports, can be easily spoofed, so filtering on those fields
alone might not guarantee the desired results.
.Bl -tag -width indent
.It Ar rule_number
Each rule is associated with a
.Ar rule_number
in the range 1..65535, with the latter reserved for the
.Em default
rule.
Rules are checked sequentially by rule number.
Multiple rules can have the same number, in which case they are
checked (and listed) according to the order in which they have
been added.
If a rule is entered without specifying a number, the kernel will
assign one in such a way that the rule becomes the last one
before the
.Em default
rule.
Automatic rule numbers are assigned by incrementing the last
non-default rule number by the value of the sysctl variable
.Va net.inet.ip.fw.autoinc_step
which defaults to 100.
If this is not possible (e.g.\& because we would go beyond the
maximum allowed rule number), the number of the last
non-default rule is used instead.
.It Cm set Ar set_number
Each rule is associated with a
.Ar set_number
in the range 0..31.
Sets can be individually disabled and enabled, so this parameter
is of fundamental importance for atomic ruleset manipulation.
It can also be used to simplify deletion of groups of rules.
If a rule is entered without specifying a set number,
set 0 will be used.
.br
Set 31 is special in that it cannot be disabled,
and rules in set 31 are not deleted by the
.Nm ipfw flush
command (but you can delete them with the
.Nm ipfw delete set 31
command).
Set 31 is also used for the
.Em default
rule.
.It Cm prob Ar match_probability
A match is only declared with the specified probability
(floating point number between 0 and 1).
This can be useful for a number of applications such as
random packet drop or
(in conjunction with
.Nm dummynet )
to simulate the effect of multiple paths leading to out-of-order
packet delivery.
.Pp
Note: this condition is checked before any other condition, including
ones such as keep-state or check-state which might have side effects.
.It Cm log Op Cm logamount Ar number
Packets matching a rule with the
.Cm log
keyword will be made available for logging in two ways:
if the sysctl variable
.Va net.inet.ip.fw.verbose
is set to 0 (default), one can use
.Xr bpf 4
attached to the
.Li ipfw0
pseudo interface.
This pseudo interface can be created after a boot
manually by using the following command:
.Bd -literal -offset indent
# ifconfig ipfw0 create
.Ed
.Pp
Or, automatically at boot time by adding the following
line to the
.Xr rc.conf 5
file:
.Bd -literal -offset indent
firewall_logif="YES"
.Ed
.Pp
There is no overhead if no
.Xr bpf 4
is attached to the pseudo interface.
.Pp
If
.Va net.inet.ip.fw.verbose
is set to 1, packets will be logged to
.Xr syslogd 8
with a
.Dv LOG_SECURITY
facility up to a maximum of
.Cm logamount
packets.
If no
.Cm logamount
is specified, the limit is taken from the sysctl variable
.Va net.inet.ip.fw.verbose_limit .
In both cases, a value of 0 means unlimited logging.
.Pp
Once the limit is reached, logging can be re-enabled by
clearing the logging counter or the packet counter for that entry, see the
.Cm resetlog
command.
.Pp
Note: logging is done after all other packet matching conditions
have been successfully verified, and before performing the final
action (accept, deny, etc.) on the packet.
.It Cm tag Ar number
When a packet matches a rule with the
.Cm tag
keyword, the numeric tag for the given
.Ar number
in the range 1..65534 will be attached to the packet.
The tag acts as an internal marker (it is not sent out over
the wire) that can be used to identify these packets later on.
This can be used, for example, to provide trust between interfaces
and to start doing policy-based filtering.
A packet can have multiple tags at the same time.
Tags are "sticky", meaning once a tag is applied to a packet by a
matching rule it exists until explicit removal.
Tags are kept with the packet everywhere within the kernel, but are
lost when a packet leaves the kernel, for example, on transmitting the
packet out to the network or sending the packet to a
.Xr divert 4
socket.
.Pp
To check for previously applied tags, use the
.Cm tagged
rule option.
To delete a previously applied tag, use the
.Cm untag
keyword.
.Pp
Note: since tags are kept with the packet everywhere in kernelspace,
they can be set and unset anywhere in the kernel network subsystem
(using the
.Xr mbuf_tags 9
facility), not only by means of the
.Xr ipfw 4
.Cm tag
and
.Cm untag
keywords.
For example, there can be a specialized
.Xr netgraph 4
node doing traffic analysis and tagging for later inspection
in the firewall.
.It Cm untag Ar number
When a packet matches a rule with the
.Cm untag
keyword, the tag with the number
.Ar number
is searched among the tags attached to this packet and,
if found, removed from it.
Other tags bound to packet, if present, are left untouched.
.It Cm altq Ar queue
When a packet matches a rule with the
.Cm altq
keyword, the ALTQ identifier for the given
.Ar queue
(see
.Xr altq 4 )
will be attached.
Note that this ALTQ tag is only meaningful for packets going "out" of IPFW,
and not being rejected or going to divert sockets.
Note that if there is insufficient memory at the time the packet is
processed, it will not be tagged, so it is wise to make your ALTQ
"default" queue policy account for this.
If multiple
.Cm altq
rules match a single packet, only the first one adds the ALTQ classification
tag.
In doing so, traffic may be shaped by using
.Cm count Cm altq Ar queue
rules for classification early in the ruleset, then later applying
the filtering decision.
For example,
.Cm check-state
and
.Cm keep-state
rules may come later and provide the actual filtering decisions in
addition to the fallback ALTQ tag.
.Pp
You must run
.Xr pfctl 8
to set up the queues before IPFW will be able to look them up by name,
and if the ALTQ disciplines are rearranged, the rules containing the
queue identifiers in the kernel will likely have gone stale and need
to be reloaded.
Stale queue identifiers will probably result in misclassification.
.Pp
All system ALTQ processing can be turned on or off via
.Nm
.Cm enable Ar altq
and
.Nm
.Cm disable Ar altq .
The usage of
.Va net.inet.ip.fw.one_pass
is irrelevant to ALTQ traffic shaping, as the actual rule action is followed
always after adding an ALTQ tag.
.El
.Ss RULE ACTIONS
A rule can be associated with one of the following actions, which
will be executed when the packet matches the body of the rule.
.Bl -tag -width indent
.It Cm allow | accept | pass | permit
Allow packets that match rule.
The search terminates.
.It Cm check-state
Checks the packet against the dynamic ruleset.
If a match is found, execute the action associated with
the rule which generated this dynamic rule, otherwise
move to the next rule.
.br
.Cm Check-state
rules do not have a body.
If no
.Cm check-state
rule is found, the dynamic ruleset is checked at the first
.Cm keep-state
or
.Cm limit
rule.
.It Cm count
Update counters for all packets that match rule.
The search continues with the next rule.
.It Cm deny | drop
Discard packets that match this rule.
The search terminates.
.It Cm divert Ar port
Divert packets that match this rule to the
.Xr divert 4
socket bound to port
.Ar port .
The search terminates.
.It Cm fwd | forward Ar ipaddr | tablearg Ns Op , Ns Ar port
Change the next-hop on matching packets to
.Ar ipaddr ,
which can be an IP address or a host name.
For IPv4, the next hop can also be supplied by the last table
looked up for the packet by using the
.Cm tablearg
keyword instead of an explicit address.
The search terminates if this rule matches.
.Pp
If
.Ar ipaddr
is a local address, then matching packets will be forwarded to
.Ar port
(or the port number in the packet if one is not specified in the rule)
on the local machine.
.br
If
.Ar ipaddr
is not a local address, then the port number
(if specified) is ignored, and the packet will be
forwarded to the remote address, using the route as found in
the local routing table for that IP.
.br
A
.Cm fwd
rule will not match layer-2 packets (those received
on ether_input, ether_output, or bridged).
.br
The
.Cm fwd
action does not change the contents of the packet at all.
In particular, the destination address remains unmodified, so
packets forwarded to another system will usually be rejected by that system
unless there is a matching rule on that system to capture them.
For packets forwarded locally,
the local address of the socket will be
set to the original destination address of the packet.
This makes the
.Xr netstat 1
entry look rather weird but is intended for
use with transparent proxy servers.
.It Cm nat Ar nat_nr | tablearg
Pass packet to a
nat instance
(for network address translation, address redirect, etc.):
see the
.Sx NETWORK ADDRESS TRANSLATION (NAT)
Section for further information.
.It Cm pipe Ar pipe_nr
Pass packet to a
.Nm dummynet
.Dq pipe
(for bandwidth limitation, delay, etc.).
See the
.Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION
Section for further information.
The search terminates; however, on exit from the pipe and if
the
.Xr sysctl 8
variable
.Va net.inet.ip.fw.one_pass
is not set, the packet is passed again to the firewall code
starting from the next rule.
.It Cm queue Ar queue_nr
Pass packet to a
.Nm dummynet
.Dq queue
(for bandwidth limitation using WF2Q+).
.It Cm reject
(Deprecated).
Synonym for
.Cm unreach host .
.It Cm reset
Discard packets that match this rule, and if the
packet is a TCP packet, try to send a TCP reset (RST) notice.
The search terminates.
.It Cm reset6
Discard packets that match this rule, and if the
packet is a TCP packet, try to send a TCP reset (RST) notice.
The search terminates.
.It Cm skipto Ar number | tablearg
Skip all subsequent rules numbered less than
.Ar number .
The search continues with the first rule numbered
.Ar number
or higher.
It is possible to use the
.Cm tablearg
keyword with a skipto for a
.Em computed
skipto, but care should be used, as no destination caching
is possible in this case so the rules are always walked to find it,
starting from the
.Cm skipto .
.It Cm call Ar number | tablearg
The current rule number is saved in the internal stack and
ruleset processing continues with the first rule numbered
.Ar number
or higher.
If later a rule with the
.Cm return
action is encountered, the processing returns to the first rule
with number of this
.Cm call
rule plus one or higher
(the same behaviour as with packets returning from
.Xr divert 4
socket after a
.Cm divert
action).
This could be used to make something like assembly language
.Dq subroutine
calls to rules with common checks for different interfaces, etc.
.Pp
Rule with any number could be called, not just forward jumps as with
.Cm skipto .
So, to prevent endless loops in case of mistakes, both
.Cm call
and
.Cm return
actions don't do any jumps and simply go to the next rule if memory
cannot be allocated or stack overflowed/underflowed.
.Pp
Internally, the stack for rule numbers is implemented using the
.Xr mbuf_tags 9
facility and currently has a size of 16 entries.
As mbuf tags are lost when a packet leaves the kernel,
.Cm divert
should not be used in subroutines to avoid endless loops
and other undesired effects.
.It Cm return
Takes rule number saved to internal stack by the last
.Cm call
action and returns ruleset processing to the first rule
with number greater than number of corresponding
.Cm call
rule.
See description of the
.Cm call
action for more details.
.Pp
Note that
.Cm return
rules usually end a
.Dq subroutine
and thus are unconditional, but
.Nm
command-line utility currently requires every action except
.Cm check-state
to have body.
While it is sometimes useful to return only on some packets,
usually you want to print just
.Dq return
for readability.
A workaround for this is to use new syntax and
.Fl c
switch:
.Bd -literal -offset indent
# Add a rule without actual body
ipfw add 2999 return via any

# List rules without "from any to any" part
ipfw -c list
.Ed
.Pp
This cosmetic annoyance may be fixed in future releases.
.It Cm tee Ar port
Send a copy of packets matching this rule to the
.Xr divert 4
socket bound to port
.Ar port .
The search continues with the next rule.
.It Cm unreach Ar code
Discard packets that match this rule, and try to send an ICMP
unreachable notice with code
.Ar code ,
where
.Ar code
is a number from 0 to 255, or one of these aliases:
.Cm net , host , protocol , port ,
.Cm needfrag , srcfail , net-unknown , host-unknown ,
.Cm isolated , net-prohib , host-prohib , tosnet ,
.Cm toshost , filter-prohib , host-precedence
or
.Cm precedence-cutoff .
The search terminates.
.It Cm unreach6 Ar code
Discard packets that match this rule, and try to send an ICMPv6
unreachable notice with code
.Ar code ,
where
.Ar code
is a number from 0, 1, 3 or 4, or one of these aliases:
.Cm no-route , admin-prohib , address
or
.Cm port .
The search terminates.
.It Cm netgraph Ar cookie
Divert packet into netgraph with given
.Ar cookie .
The search terminates.
If packet is later returned from netgraph it is either
accepted or continues with the next rule, depending on
.Va net.inet.ip.fw.one_pass
sysctl variable.
.It Cm ngtee Ar cookie
A copy of packet is diverted into netgraph, original
packet continues with the next rule.
See
.Xr ng_ipfw 4
for more information on
.Cm netgraph
and
.Cm ngtee
actions.
.It Cm setfib Ar fibnum | tablearg
The packet is tagged so as to use the FIB (routing table)
.Ar fibnum
in any subsequent forwarding decisions.
In the current implementation, this is limited to the values 0 through 15, see
.Xr setfib 2 .
Processing continues at the next rule.
It is possible to use the
.Cm tablearg
keyword with setfib.
If the tablearg value is not within the compiled range of fibs,
the packet's fib is set to 0.
.It Cm setdscp Ar DSCP | number | tablearg
Set specified DiffServ codepoint for an IPv4/IPv6 packet.
Processing continues at the next rule.
Supported values are:
.Pp
.Cm CS0
.Pq Dv 000000 ,
.Cm CS1
.Pq Dv 001000 ,
.Cm CS2
.Pq Dv 010000 ,
.Cm CS3
.Pq Dv 011000 ,
.Cm CS4
.Pq Dv 100000 ,
.Cm CS5
.Pq Dv 101000 ,
.Cm CS6
.Pq Dv 110000 ,
.Cm CS7
.Pq Dv 111000 ,
.Cm AF11
.Pq Dv 001010 ,
.Cm AF12
.Pq Dv 001100 ,
.Cm AF13
.Pq Dv 001110 ,
.Cm AF21
.Pq Dv 010010 ,
.Cm AF22
.Pq Dv 010100 ,
.Cm AF23
.Pq Dv 010110 ,
.Cm AF31
.Pq Dv 011010 ,
.Cm AF32
.Pq Dv 011100 ,
.Cm AF33
.Pq Dv 011110 ,
.Cm AF41
.Pq Dv 100010 ,
.Cm AF42
.Pq Dv 100100 ,
.Cm AF43
.Pq Dv 100110 ,
.Cm EF
.Pq Dv 101110 ,
.Cm BE
.Pq Dv 000000 .
Additionally, DSCP value can be specified by number (0..63).
It is also possible to use the
.Cm tablearg
keyword with setdscp.
If the tablearg value is not within the 0..63 range, lower 6 bits of supplied
value are used.
.It Cm reass
Queue and reassemble IP fragments.
If the packet is not fragmented, counters are updated and
processing continues with the next rule.
If the packet is the last logical fragment, the packet is reassembled and, if
.Va net.inet.ip.fw.one_pass
is set to 0, processing continues with the next rule.
Otherwise, the packet is allowed to pass and the search terminates.
If the packet is a fragment in the middle of a logical group of fragments,
it is consumed and
processing stops immediately.
.Pp
Fragment handling can be tuned via
.Va net.inet.ip.maxfragpackets
and
.Va net.inet.ip.maxfragsperpacket
which limit, respectively, the maximum number of processable
fragments (default: 800) and
the maximum number of fragments per packet (default: 16).
.Pp
NOTA BENE: since fragments do not contain port numbers,
port-based match patterns should be avoided in the
.Cm reass
rule.
Alternatively, direction-based (like
.Cm in Ns / Ns Cm out )
and source-based (like
.Cm via )
match patterns can be used to select fragments.
.Pp
Usually a simple rule like:
.Bd -literal -offset indent
# reassemble incoming fragments
ipfw add reass all from any to any in
.Ed
.Pp
is all you need at the beginning of your ruleset.
.El
.Ss RULE BODY
The body of a rule contains zero or more patterns (such as
specific source and destination addresses or ports,
protocol options, incoming or outgoing interfaces, etc.)
that the packet must match in order to be recognised.
In general, the patterns are connected by (implicit)
.Cm and
operators -- i.e., all must match in order for the
rule to match.
Individual patterns can be prefixed by the
.Cm not
operator to reverse the result of the match, as in
.Pp
.Dl "ipfw add 100 allow ip from not 1.2.3.4 to any"
.Pp
Additionally, sets of alternative match patterns
.Pq Em or-blocks
can be constructed by putting the patterns in
lists enclosed between parentheses ( ) or braces { }, and
using the
.Cm or
operator as follows:
.Pp
.Dl "ipfw add 100 allow ip from { x or not y or z } to any"
.Pp
Only one level of parentheses is allowed.
Beware that most shells have special meanings for parentheses
or braces, so it is advisable to put a backslash \\ in front of them
to prevent such interpretations.
.Pp
The body of a rule must in general include a source and destination
address specifier.
The keyword
.Ar any
can be used in various places to specify that the content of
a required field is irrelevant.
.Pp
The rule body has the following format:
.Bd -ragged -offset indent
.Op Ar proto Cm from Ar src Cm to Ar dst
.Op Ar options
.Ed
.Pp
The first part (proto from src to dst) is for backward
compatibility with earlier versions of
.Fx .
In modern
.Fx
any match pattern (including MAC headers, IP protocols,
addresses and ports) can be specified in the
.Ar options
section.
.Pp
Rule fields have the following meaning:
.Bl -tag -width indent
.It Ar proto : protocol | Cm { Ar protocol Cm or ... }
.It Ar protocol : Oo Cm not Oc Ar protocol-name | protocol-number
An IP protocol specified by number or name
(for a complete list see
.Pa /etc/protocols ) ,
or one of the following keywords:
.Bl -tag -width indent
.It Cm ip4 | ipv4
Matches IPv4 packets.
.It Cm ip6 | ipv6
Matches IPv6 packets.
.It Cm ip | all
Matches any packet.
.El
.Pp
The
.Cm ipv6
in
.Cm proto
option will be treated as inner protocol.
And, the
.Cm ipv4
is not available in
.Cm proto
option.
.Pp
The
.Cm { Ar protocol Cm or ... }
format (an
.Em or-block )
is provided for convenience only but its use is deprecated.
.It Ar src No and Ar dst : Bro Cm addr | Cm { Ar addr Cm or ... } Brc Op Oo Cm not Oc Ar ports
An address (or a list, see below)
optionally followed by
.Ar ports
specifiers.
.Pp
The second format
.Em ( or-block
with multiple addresses) is provided for convenience only and
its use is discouraged.
.It Ar addr : Oo Cm not Oc Bro
.Cm any | me | me6 |
.Cm table Ns Pq Ar number Ns Op , Ns Ar value
.Ar | addr-list | addr-set
.Brc
.Bl -tag -width indent
.It Cm any
matches any IP address.
.It Cm me
matches any IP address configured on an interface in the system.
.It Cm me6
matches any IPv6 address configured on an interface in the system.
The address list is evaluated at the time the packet is
analysed.
.It Cm table Ns Pq Ar number Ns Op , Ns Ar value
Matches any IPv4 address for which an entry exists in the lookup table
.Ar number .
If an optional 32-bit unsigned
.Ar value
is also specified, an entry will match only if it has this value.
See the
.Sx LOOKUP TABLES
section below for more information on lookup tables.
.El
.It Ar addr-list : ip-addr Ns Op Ns , Ns Ar addr-list
.It Ar ip-addr :
A host or subnet address specified in one of the following ways:
.Bl -tag -width indent
.It Ar numeric-ip | hostname
Matches a single IPv4 address, specified as dotted-quad or a hostname.
Hostnames are resolved at the time the rule is added to the firewall list.
.It Ar addr Ns / Ns Ar masklen
Matches all addresses with base
.Ar addr
(specified as an IP address, a network number, or a hostname)
and mask width of
.Cm masklen
bits.
As an example, 1.2.3.4/25 or 1.2.3.0/25 will match
all IP numbers from 1.2.3.0 to 1.2.3.127 .
.It Ar addr Ns : Ns Ar mask
Matches all addresses with base
.Ar addr
(specified as an IP address, a network number, or a hostname)
and the mask of
.Ar mask ,
specified as a dotted quad.
As an example, 1.2.3.4:255.0.255.0 or 1.0.3.0:255.0.255.0 will match
1.*.3.*.
This form is advised only for non-contiguous
masks.
It is better to resort to the
.Ar addr Ns / Ns Ar masklen
format for contiguous masks, which is more compact and less
error-prone.
.El
.It Ar addr-set : addr Ns Oo Ns / Ns Ar masklen Oc Ns Cm { Ns Ar list Ns Cm }
.It Ar list : Bro Ar num | num-num Brc Ns Op Ns , Ns Ar list
Matches all addresses with base address
.Ar addr
(specified as an IP address, a network number, or a hostname)
and whose last byte is in the list between braces { } .
Note that there must be no spaces between braces and
numbers (spaces after commas are allowed).
Elements of the list can be specified as single entries
or ranges.
The
.Ar masklen
field is used to limit the size of the set of addresses,
and can have any value between 24 and 32.
If not specified,
it will be assumed as 24.
.br
This format is particularly useful to handle sparse address sets
within a single rule.
Because the matching occurs using a
bitmask, it takes constant time and dramatically reduces
the complexity of rulesets.
.br
As an example, an address specified as 1.2.3.4/24{128,35-55,89}
or 1.2.3.0/24{128,35-55,89}
will match the following IP addresses:
.br
1.2.3.128, 1.2.3.35 to 1.2.3.55, 1.2.3.89 .
.It Ar addr6-list : ip6-addr Ns Op Ns , Ns Ar addr6-list
.It Ar ip6-addr :
A host or subnet specified in one of the following ways:
.Bl -tag -width indent
.It Ar numeric-ip | hostname
Matches a single IPv6 address as allowed by
.Xr inet_pton 3
or a hostname.
Hostnames are resolved at the time the rule is added to the firewall
list.
.It Ar addr Ns / Ns Ar masklen
Matches all IPv6 addresses with base
.Ar addr
(specified as allowed by
.Xr inet_pton 3
or a hostname)
and mask width of
.Cm masklen
bits.
.El
.Pp
No support for sets of IPv6 addresses is provided because IPv6 addresses
are typically random past the initial prefix.
.It Ar ports : Bro Ar port | port Ns \&- Ns Ar port Ns Brc Ns Op , Ns Ar ports
For protocols which support port numbers (such as TCP and UDP), optional
.Cm ports
may be specified as one or more ports or port ranges, separated
by commas but no spaces, and an optional
.Cm not
operator.
The
.Ql \&-
notation specifies a range of ports (including boundaries).
.Pp
Service names (from
.Pa /etc/services )
may be used instead of numeric port values.
The length of the port list is limited to 30 ports or ranges,
though one can specify larger ranges by using an
.Em or-block
in the
.Cm options
section of the rule.
.Pp
A backslash
.Pq Ql \e
can be used to escape the dash
.Pq Ql -
character in a service name (from a shell, the backslash must be
typed twice to avoid the shell itself interpreting it as an escape
character).
.Pp
.Dl "ipfw add count tcp from any ftp\e\e-data-ftp to any"
.Pp
Fragmented packets which have a non-zero offset (i.e., not the first
fragment) will never match a rule which has one or more port
specifications.
See the
.Cm frag
option for details on matching fragmented packets.
.El
.Ss RULE OPTIONS (MATCH PATTERNS)
Additional match patterns can be used within
rules.
Zero or more of these so-called
.Em options
can be present in a rule, optionally prefixed by the
.Cm not
operator, and possibly grouped into
.Em or-blocks .
.Pp
The following match patterns can be used (listed in alphabetical order):
.Bl -tag -width indent
.It Cm // this is a comment.
Inserts the specified text as a comment in the rule.
Everything following // is considered as a comment and stored in the rule.
You can have comment-only rules, which are listed as having a
.Cm count
action followed by the comment.
.It Cm bridged
Alias for
.Cm layer2 .
.It Cm diverted
Matches only packets generated by a divert socket.
.It Cm diverted-loopback
Matches only packets coming from a divert socket back into the IP stack
input for delivery.
.It Cm diverted-output
Matches only packets going from a divert socket back outward to the IP
stack output for delivery.
.It Cm dst-ip Ar ip-address
Matches IPv4 packets whose destination IP is one of the address(es)
specified as argument.
.It Bro Cm dst-ip6 | dst-ipv6 Brc Ar ip6-address
Matches IPv6 packets whose destination IP is one of the address(es)
specified as argument.
.It Cm dst-port Ar ports
Matches IP packets whose destination port is one of the port(s)
specified as argument.
.It Cm established
Matches TCP packets that have the RST or ACK bits set.
.It Cm ext6hdr Ar header
Matches IPv6 packets containing the extended header given by
.Ar header .
Supported headers are:
.Pp
Fragment
.Pq Cm frag ,
Hop-by-Hop options
.Pq Cm hopopt ,
any type of Routing Header
.Pq Cm route ,
Source routing Routing Header Type 0
.Pq Cm rthdr0 ,
Mobile IPv6 Routing Header Type 2
.Pq Cm rthdr2 ,
Destination options
.Pq Cm dstopt ,
IPSec authentication headers
.Pq Cm ah ,
and IPsec encapsulated security payload headers
.Pq Cm esp .
.It Cm fib Ar fibnum
Matches a packet that has been tagged to use
the given FIB (routing table) number.
.It Cm flow-id Ar labels
Matches IPv6 packets containing any of the flow labels given in
.Ar labels .
.Ar labels
is a comma separated list of numeric flow labels.
.It Cm frag
Matches packets that are fragments and not the first
fragment of an IP datagram.
Note that these packets will not have
the next protocol header (e.g.\& TCP, UDP) so options that look into
these headers cannot match.
.It Cm gid Ar group
Matches all TCP or UDP packets sent by or received for a
.Ar group .
A
.Ar group
may be specified by name or number.
.It Cm jail Ar prisonID
Matches all TCP or UDP packets sent by or received for the
jail whose prison ID is
.Ar prisonID .
.It Cm icmptypes Ar types
Matches ICMP packets whose ICMP type is in the list
.Ar types .
The list may be specified as any combination of
individual types (numeric) separated by commas.
.Em Ranges are not allowed .
The supported ICMP types are:
.Pp
echo reply
.Pq Cm 0 ,
destination unreachable
.Pq Cm 3 ,
source quench
.Pq Cm 4 ,
redirect
.Pq Cm 5 ,
echo request
.Pq Cm 8 ,
router advertisement
.Pq Cm 9 ,
router solicitation
.Pq Cm 10 ,
time-to-live exceeded
.Pq Cm 11 ,
IP header bad
.Pq Cm 12 ,
timestamp request
.Pq Cm 13 ,
timestamp reply
.Pq Cm 14 ,
information request
.Pq Cm 15 ,
information reply
.Pq Cm 16 ,
address mask request
.Pq Cm 17
and address mask reply
.Pq Cm 18 .
.It Cm icmp6types Ar types
Matches ICMP6 packets whose ICMP6 type is in the list of
.Ar types .
The list may be specified as any combination of
individual types (numeric) separated by commas.
.Em Ranges are not allowed .
.It Cm in | out
Matches incoming or outgoing packets, respectively.
.Cm in
and
.Cm out
are mutually exclusive (in fact,
.Cm out
is implemented as
.Cm not in Ns No ).
.It Cm ipid Ar id-list
Matches IPv4 packets whose
.Cm ip_id
field has value included in
.Ar id-list ,
which is either a single value or a list of values or ranges
specified in the same way as
.Ar ports .
.It Cm iplen Ar len-list
Matches IP packets whose total length, including header and data, is
in the set
.Ar len-list ,
which is either a single value or a list of values or ranges
specified in the same way as
.Ar ports .
.It Cm ipoptions Ar spec
Matches packets whose IPv4 header contains the comma separated list of
options specified in
.Ar spec .
The supported IP options are:
.Pp
.Cm ssrr
(strict source route),
.Cm lsrr
(loose source route),
.Cm rr
(record packet route) and
.Cm ts
(timestamp).
The absence of a particular option may be denoted
with a
.Ql \&! .
.It Cm ipprecedence Ar precedence
Matches IPv4 packets whose precedence field is equal to
.Ar precedence .
.It Cm ipsec
Matches packets that have IPSEC history associated with them
(i.e., the packet comes encapsulated in IPSEC, the kernel
has IPSEC support and IPSEC_FILTERTUNNEL option, and can correctly
decapsulate it).
.Pp
Note that specifying
.Cm ipsec
is different from specifying
.Cm proto Ar ipsec
as the latter will only look at the specific IP protocol field,
irrespective of IPSEC kernel support and the validity of the IPSEC data.
.Pp
Further note that this flag is silently ignored in kernels without
IPSEC support.
It does not affect rule processing when given and the
rules are handled as if with no
.Cm ipsec
flag.
.It Cm iptos Ar spec
Matches IPv4 packets whose
.Cm tos
field contains the comma separated list of
service types specified in
.Ar spec .
The supported IP types of service are:
.Pp
.Cm lowdelay
.Pq Dv IPTOS_LOWDELAY ,
.Cm throughput
.Pq Dv IPTOS_THROUGHPUT ,
.Cm reliability
.Pq Dv IPTOS_RELIABILITY ,
.Cm mincost
.Pq Dv IPTOS_MINCOST ,
.Cm congestion
.Pq Dv IPTOS_ECN_CE .
The absence of a particular type may be denoted
with a
.Ql \&! .
.It Cm dscp spec Ns Op , Ns Ar spec
Matches IPv4/IPv6 packets whose
.Cm DS
field value is contained in
.Ar spec
mask.
Multiple values can be specified via
the comma separated list.
Value can be one of keywords used in
.Cm setdscp
action or exact number.
.It Cm ipttl Ar ttl-list
Matches IPv4 packets whose time to live is included in
.Ar ttl-list ,
which is either a single value or a list of values or ranges
specified in the same way as
.Ar ports .
.It Cm ipversion Ar ver
Matches IP packets whose IP version field is
.Ar ver .
.It Cm keep-state
Upon a match, the firewall will create a dynamic rule, whose
default behaviour is to match bidirectional traffic between
source and destination IP/port using the same protocol.
The rule has a limited lifetime (controlled by a set of
.Xr sysctl 8
variables), and the lifetime is refreshed every time a matching
packet is found.
.It Cm layer2
Matches only layer2 packets, i.e., those passed to
.Nm
from ether_demux() and ether_output_frame().
.It Cm limit Bro Cm src-addr | src-port | dst-addr | dst-port Brc Ar N
The firewall will only allow
.Ar N
connections with the same
set of parameters as specified in the rule.
One or more
of source and destination addresses and ports can be
specified.
Currently,
only IPv4 flows are supported.
.It Cm lookup Bro Cm dst-ip | dst-port | src-ip | src-port | uid | jail Brc Ar N
Search an entry in lookup table
.Ar N
that matches the field specified as argument.
If not found, the match fails.
Otherwise, the match succeeds and
.Cm tablearg
is set to the value extracted from the table.
.Pp
This option can be useful to quickly dispatch traffic based on
certain packet fields.
See the
.Sx LOOKUP TABLES
section below for more information on lookup tables.
.It Cm { MAC | mac } Ar dst-mac src-mac
Match packets with a given
.Ar dst-mac
and
.Ar src-mac
addresses, specified as the
.Cm any
keyword (matching any MAC address), or six groups of hex digits
separated by colons,
and optionally followed by a mask indicating the significant bits.
The mask may be specified using either of the following methods:
.Bl -enum -width indent
.It
A slash
.Pq /
followed by the number of significant bits.
For example, an address with 33 significant bits could be specified as:
.Pp
.Dl "MAC 10:20:30:40:50:60/33 any"
.Pp
.It
An ampersand
.Pq &
followed by a bitmask specified as six groups of hex digits separated
by colons.
For example, an address in which the last 16 bits are significant could
be specified as:
.Pp
.Dl "MAC 10:20:30:40:50:60&00:00:00:00:ff:ff any"
.Pp
Note that the ampersand character has a special meaning in many shells
and should generally be escaped.
.Pp
.El
Note that the order of MAC addresses (destination first,
source second) is
the same as on the wire, but the opposite of the one used for
IP addresses.
.It Cm mac-type Ar mac-type
Matches packets whose Ethernet Type field
corresponds to one of those specified as argument.
.Ar mac-type
is specified in the same way as
.Cm port numbers
(i.e., one or more comma-separated single values or ranges).
You can use symbolic names for known values such as
.Em vlan , ipv4, ipv6 .
Values can be entered as decimal or hexadecimal (if prefixed by 0x),
and they are always printed as hexadecimal (unless the
.Cm -N
option is used, in which case symbolic resolution will be attempted).
.It Cm proto Ar protocol
Matches packets with the corresponding IP protocol.
.It Cm recv | xmit | via Brq Ar ifX | Ar if Ns Cm * | Ar table Ns Pq Ar number Ns Op , Ns Ar value | Ar ipno | Ar any
Matches packets received, transmitted or going through,
respectively, the interface specified by exact name
.Po Ar ifX Pc ,
by device name
.Po Ar if* Pc ,
by IP address, or through some interface.
.Pp
The
.Cm via
keyword causes the interface to always be checked.
If
.Cm recv
or
.Cm xmit
is used instead of
.Cm via ,
then only the receive or transmit interface (respectively)
is checked.
By specifying both, it is possible to match packets based on
both receive and transmit interface, e.g.:
.Pp
.Dl "ipfw add deny ip from any to any out recv ed0 xmit ed1"
.Pp
The
.Cm recv
interface can be tested on either incoming or outgoing packets,
while the
.Cm xmit
interface can only be tested on outgoing packets.
So
.Cm out
is required (and
.Cm in
is invalid) whenever
.Cm xmit
is used.
.Pp
A packet might not have a receive or transmit interface: packets
originating from the local host have no receive interface,
while packets destined for the local host have no transmit
interface.
.It Cm setup
Matches TCP packets that have the SYN bit set but no ACK bit.
This is the short form of
.Dq Li tcpflags\ syn,!ack .
.It Cm sockarg
Matches packets that are associated to a local socket and
for which the SO_USER_COOKIE socket option has been set
to a non-zero value.
As a side effect, the value of the
option is made available as
.Cm tablearg
value, which in turn can be used as
.Cm skipto
or
.Cm pipe
number.
.It Cm src-ip Ar ip-address
Matches IPv4 packets whose source IP is one of the address(es)
specified as an argument.
.It Cm src-ip6 Ar ip6-address
Matches IPv6 packets whose source IP is one of the address(es)
specified as an argument.
.It Cm src-port Ar ports
Matches IP packets whose source port is one of the port(s)
specified as argument.
.It Cm tagged Ar tag-list
Matches packets whose tags are included in
.Ar tag-list ,
which is either a single value or a list of values or ranges
specified in the same way as
.Ar ports .
Tags can be applied to the packet using
.Cm tag
rule action parameter (see its description for details on tags).
.It Cm tcpack Ar ack
TCP packets only.
Match if the TCP header acknowledgment number field is set to
.Ar ack .
.It Cm tcpdatalen Ar tcpdatalen-list
Matches TCP packets whose length of TCP data is
.Ar tcpdatalen-list ,
which is either a single value or a list of values or ranges
specified in the same way as
.Ar ports .
.It Cm tcpflags Ar spec
TCP packets only.
Match if the TCP header contains the comma separated list of
flags specified in
.Ar spec .
The supported TCP flags are:
.Pp
.Cm fin ,
.Cm syn ,
.Cm rst ,
.Cm psh ,
.Cm ack
and
.Cm urg .
The absence of a particular flag may be denoted
with a
.Ql \&! .
A rule which contains a
.Cm tcpflags
specification can never match a fragmented packet which has
a non-zero offset.
See the
.Cm frag
option for details on matching fragmented packets.
.It Cm tcpseq Ar seq
TCP packets only.
Match if the TCP header sequence number field is set to
.Ar seq .
.It Cm tcpwin Ar tcpwin-list
Matches TCP packets whose header window field is set to
.Ar tcpwin-list ,
which is either a single value or a list of values or ranges
specified in the same way as
.Ar ports .
.It Cm tcpoptions Ar spec
TCP packets only.
Match if the TCP header contains the comma separated list of
options specified in
.Ar spec .
The supported TCP options are:
.Pp
.Cm mss
(maximum segment size),
.Cm window
(tcp window advertisement),
.Cm sack
(selective ack),
.Cm ts
(rfc1323 timestamp) and
.Cm cc
(rfc1644 t/tcp connection count).
The absence of a particular option may be denoted
with a
.Ql \&! .
.It Cm uid Ar user
Match all TCP or UDP packets sent by or received for a
.Ar user .
A
.Ar user
may be matched by name or identification number.
.It Cm verrevpath
For incoming packets,
a routing table lookup is done on the packet's source address.
If the interface on which the packet entered the system matches the
outgoing interface for the route,
the packet matches.
If the interfaces do not match up,
the packet does not match.
All outgoing packets or packets with no incoming interface match.
.Pp
The name and functionality of the option is intentionally similar to
the Cisco IOS command:
.Pp
.Dl ip verify unicast reverse-path
.Pp
This option can be used to make anti-spoofing rules to reject all
packets with source addresses not from this interface.
See also the option
.Cm antispoof .
.It Cm versrcreach
For incoming packets,
a routing table lookup is done on the packet's source address.
If a route to the source address exists, but not the default route
or a blackhole/reject route, the packet matches.
Otherwise, the packet does not match.
All outgoing packets match.
.Pp
The name and functionality of the option is intentionally similar to
the Cisco IOS command:
.Pp
.Dl ip verify unicast source reachable-via any
.Pp
This option can be used to make anti-spoofing rules to reject all
packets whose source address is unreachable.
.It Cm antispoof
For incoming packets, the packet's source address is checked if it
belongs to a directly connected network.
If the network is directly connected, then the interface the packet
came on in is compared to the interface the network is connected to.
When incoming interface and directly connected interface are not the
same, the packet does not match.
Otherwise, the packet does match.
All outgoing packets match.
.Pp
This option can be used to make anti-spoofing rules to reject all
packets that pretend to be from a directly connected network but do
not come in through that interface.
This option is similar to but more restricted than
.Cm verrevpath
because it engages only on packets with source addresses of directly
connected networks instead of all source addresses.
.El
.Sh LOOKUP TABLES
Lookup tables are useful to handle large sparse sets of
addresses or other search keys (e.g., ports, jail IDs, interface names).
In the rest of this section we will use the term ``address''.
There may be up to 65535 different lookup tables, numbered 0 to 65534.
.Pp
Each entry is represented by an
.Ar addr Ns Op / Ns Ar masklen
and will match all addresses with base
.Ar addr
(specified as an IPv4/IPv6 address, a hostname or an unsigned integer)
and mask width of
.Ar masklen
bits.
If
.Ar masklen
is not specified, it defaults to 32 for IPv4 and 128 for IPv6.
When looking up an IP address in a table, the most specific
entry will match.
Associated with each entry is a 32-bit unsigned
.Ar value ,
which can optionally be checked by a rule matching code.
When adding an entry, if
.Ar value
is not specified, it defaults to 0.
.Pp
An entry can be added to a table
.Pq Cm add ,
or removed from a table
.Pq Cm delete .
A table can be examined
.Pq Cm list
or flushed
.Pq Cm flush .
.Pp
Internally, each table is stored in a Radix tree, the same way as
the routing table (see
.Xr route 4 ) .
.Pp
Lookup tables currently support only ports, jail IDs, IPv4/IPv6  addresses
and interface names.
Wildcards are not supported for interface names.
.Pp
The
.Cm tablearg
feature provides the ability to use a value, looked up in the table, as
the argument for a rule action, action parameter or rule option.
This can significantly reduce number of rules in some configurations.
If two tables are used in a rule, the result of the second (destination)
is used.
The
.Cm tablearg
argument can be used with the following actions:
.Cm nat, pipe , queue, divert, tee, netgraph, ngtee, fwd, skipto, setfib,
action parameters:
.Cm tag, untag,
rule options:
.Cm limit, tagged.
.Pp
When used with
.Cm fwd
it is possible to supply table entries with values
that are in the form of IP addresses or hostnames.
See the
.Sx EXAMPLES
Section for example usage of tables and the tablearg keyword.
.Pp
When used with the
.Cm skipto
action, the user should be aware that the code will walk the ruleset
up to a rule equal to, or past, the given number,
and should therefore try to keep the
ruleset compact between the skipto and the target rules.
.Sh SETS OF RULES
Each rule belongs to one of 32 different
.Em sets ,
numbered 0 to 31.
Set 31 is reserved for the default rule.
.Pp
By default, rules are put in set 0, unless you use the
.Cm set N
attribute when entering a new rule.
Sets can be individually and atomically enabled or disabled,
so this mechanism permits an easy way to store multiple configurations
of the firewall and quickly (and atomically) switch between them.
The command to enable/disable sets is
.Bd -ragged -offset indent
.Nm
.Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ...
.Ed
.Pp
where multiple
.Cm enable
or
.Cm disable
sections can be specified.
Command execution is atomic on all the sets specified in the command.
By default, all sets are enabled.
.Pp
When you disable a set, its rules behave as if they do not exist
in the firewall configuration, with only one exception:
.Bd -ragged -offset indent
dynamic rules created from a rule before it had been disabled
will still be active until they expire.
In order to delete
dynamic rules you have to explicitly delete the parent rule
which generated them.
.Ed
.Pp
The set number of rules can be changed with the command
.Bd -ragged -offset indent
.Nm
.Cm set move
.Brq Cm rule Ar rule-number | old-set
.Cm to Ar new-set
.Ed
.Pp
Also, you can atomically swap two rulesets with the command
.Bd -ragged -offset indent
.Nm
.Cm set swap Ar first-set second-set
.Ed
.Pp
See the
.Sx EXAMPLES
Section on some possible uses of sets of rules.
.Sh STATEFUL FIREWALL
Stateful operation is a way for the firewall to dynamically
create rules for specific flows when packets that
match a given pattern are detected.
Support for stateful
operation comes through the
.Cm check-state , keep-state
and
.Cm limit
options of
.Nm rules .
.Pp
Dynamic rules are created when a packet matches a
.Cm keep-state
or
.Cm limit
rule, causing the creation of a
.Em dynamic
rule which will match all and only packets with
a given
.Em protocol
between a
.Em src-ip/src-port dst-ip/dst-port
pair of addresses
.Em ( src
and
.Em dst
are used here only to denote the initial match addresses, but they
are completely equivalent afterwards).
Dynamic rules will be checked at the first
.Cm check-state, keep-state
or
.Cm limit
occurrence, and the action performed upon a match will be the same
as in the parent rule.
.Pp
Note that no additional attributes other than protocol and IP addresses
and ports are checked on dynamic rules.
.Pp
The typical use of dynamic rules is to keep a closed firewall configuration,
but let the first TCP SYN packet from the inside network install a
dynamic rule for the flow so that packets belonging to that session
will be allowed through the firewall:
.Pp
.Dl "ipfw add check-state"
.Dl "ipfw add allow tcp from my-subnet to any setup keep-state"
.Dl "ipfw add deny tcp from any to any"
.Pp
A similar approach can be used for UDP, where a UDP packet coming
from the inside will install a dynamic rule to let the response through
the firewall:
.Pp
.Dl "ipfw add check-state"
.Dl "ipfw add allow udp from my-subnet to any keep-state"
.Dl "ipfw add deny udp from any to any"
.Pp
Dynamic rules expire after some time, which depends on the status
of the flow and the setting of some
.Cm sysctl
variables.
See Section
.Sx SYSCTL VARIABLES
for more details.
For TCP sessions, dynamic rules can be instructed to periodically
send keepalive packets to refresh the state of the rule when it is
about to expire.
.Pp
See Section
.Sx EXAMPLES
for more examples on how to use dynamic rules.
.Sh TRAFFIC SHAPER (DUMMYNET) CONFIGURATION
.Nm
is also the user interface for the
.Nm dummynet
traffic shaper, packet scheduler and network emulator, a subsystem that
can artificially queue, delay or drop packets
emulating the behaviour of certain network links
or queueing systems.
.Pp
.Nm dummynet
operates by first using the firewall to select packets
using any match pattern that can be used in
.Nm
rules.
Matching packets are then passed to either of two
different objects, which implement the traffic regulation:
.Bl -hang -offset XXXX
.It Em pipe
A
.Em pipe
emulates a
.Em link
with given bandwidth and propagation delay,
driven by a FIFO scheduler and a single queue with programmable
queue size and packet loss rate.
Packets are appended to the queue as they come out from
.Nm ipfw ,
and then transferred in FIFO order to the link at the desired rate.
.It Em queue
A
.Em queue
is an abstraction used to implement packet scheduling
using one of several packet scheduling algorithms.
Packets sent to a
.Em queue
are first grouped into flows according to a mask on the 5-tuple.
Flows are then passed to the scheduler associated to the
.Em queue ,
and each flow uses scheduling parameters (weight and others)
as configured in the
.Em queue
itself.
A scheduler in turn is connected to an emulated link,
and arbitrates the link's bandwidth among backlogged flows according to
weights and to the features of the scheduling algorithm in use.
.El
.Pp
In practice,
.Em pipes
can be used to set hard limits to the bandwidth that a flow can use, whereas
.Em queues
can be used to determine how different flows share the available bandwidth.
.Pp
A graphical representation of the binding of queues,
flows, schedulers and links is below.
.Bd -literal -offset indent
                 (flow_mask|sched_mask)  sched_mask
         +---------+   weight Wx  +-------------+
         |         |->-[flow]-->--|             |-+
    -->--| QUEUE x |   ...        |             | |
         |         |->-[flow]-->--| SCHEDuler N | |
         +---------+              |             | |
             ...                  |             +--[LINK N]-->--
         +---------+   weight Wy  |             | +--[LINK N]-->--
         |         |->-[flow]-->--|             | |
    -->--| QUEUE y |   ...        |             | |
         |         |->-[flow]-->--|             | |
         +---------+              +-------------+ |
                                    +-------------+
.Ed
It is important to understand the role of the SCHED_MASK
and FLOW_MASK, which are configured through the commands
.Dl "ipfw sched N config mask SCHED_MASK ..."
and
.Dl "ipfw queue X config mask FLOW_MASK ..." .
.Pp
The SCHED_MASK is used to assign flows to one or more
scheduler instances, one for each
value of the packet's 5-tuple after applying SCHED_MASK.
As an example, using ``src-ip 0xffffff00'' creates one instance
for each /24 source subnet.
.Pp
The FLOW_MASK, together with the SCHED_MASK, is used to split
packets into flows.
As an example, using
``src-ip 0x000000ff''
together with the previous SCHED_MASK makes a flow for
each individual source address.
In turn, flows for each /24
subnet will be sent to the same scheduler instance.
.Pp
The above diagram holds even for the
.Em pipe
case, with the only restriction that a
.Em pipe
only supports a SCHED_MASK, and forces the use of a FIFO
scheduler (these are for backward compatibility reasons;
in fact, internally, a
.Nm dummynet's
pipe is implemented exactly as above).
.Pp
There are two modes of
.Nm dummynet
operation:
.Dq normal
and
.Dq fast .
The
.Dq normal
mode tries to emulate a real link: the
.Nm dummynet
scheduler ensures that the packet will not leave the pipe faster than it
would on the real link with a given bandwidth.
The
.Dq fast
mode allows certain packets to bypass the
.Nm dummynet
scheduler (if packet flow does not exceed pipe's bandwidth).
This is the reason why the
.Dq fast
mode requires fewer CPU cycles per packet (on average) and packet latency
can be significantly lower in comparison to a real link with the same
bandwidth.
The default mode is
.Dq normal .
The
.Dq fast
mode can be enabled by setting the
.Va net.inet.ip.dummynet.io_fast
.Xr sysctl 8
variable to a non-zero value.
.Pp
.Ss PIPE, QUEUE AND SCHEDULER CONFIGURATION
The
.Em pipe ,
.Em queue
and
.Em scheduler
configuration commands are the following:
.Bd -ragged -offset indent
.Cm pipe Ar number Cm config Ar pipe-configuration
.Pp
.Cm queue Ar number Cm config Ar queue-configuration
.Pp
.Cm sched Ar number Cm config Ar sched-configuration
.Ed
.Pp
The following parameters can be configured for a pipe:
.Pp
.Bl -tag -width indent -compact
.It Cm bw Ar bandwidth | device
Bandwidth, measured in
.Sm off
.Op Cm K | M
.Brq Cm bit/s | Byte/s .
.Sm on
.Pp
A value of 0 (default) means unlimited bandwidth.
The unit must immediately follow the number, as in
.Pp
.Dl "ipfw pipe 1 config bw 300Kbit/s"
.Pp
If a device name is specified instead of a numeric value, as in
.Pp
.Dl "ipfw pipe 1 config bw tun0"
.Pp
then the transmit clock is supplied by the specified device.
At the moment only the
.Xr tun 4
device supports this
functionality, for use in conjunction with
.Xr ppp 8 .
.Pp
.It Cm delay Ar ms-delay
Propagation delay, measured in milliseconds.
The value is rounded to the next multiple of the clock tick
(typically 10ms, but it is a good practice to run kernels
with
.Dq "options HZ=1000"
to reduce
the granularity to 1ms or less).
The default value is 0, meaning no delay.
.Pp
.It Cm burst Ar size
If the data to be sent exceeds the pipe's bandwidth limit
(and the pipe was previously idle), up to
.Ar size
bytes of data are allowed to bypass the
.Nm dummynet
scheduler, and will be sent as fast as the physical link allows.
Any additional data will be transmitted at the rate specified
by the
.Nm pipe
bandwidth.
The burst size depends on how long the pipe has been idle;
the effective burst size is calculated as follows:
MAX(
.Ar size
,
.Nm bw
* pipe_idle_time).
.Pp
.It Cm profile Ar filename
A file specifying the additional overhead incurred in the transmission
of a packet on the link.
.Pp
Some link types introduce extra delays in the transmission
of a packet, e.g., because of MAC level framing, contention on
the use of the channel, MAC level retransmissions and so on.
From our point of view, the channel is effectively unavailable
for this extra time, which is constant or variable depending
on the link type.
Additionally, packets may be dropped after this
time (e.g., on a wireless link after too many retransmissions).
We can model the additional delay with an empirical curve
that represents its distribution.
.Bd -literal -offset indent
      cumulative probability
      1.0 ^
          |
      L   +-- loss-level          x
          |                 ******
          |                *
          |           *****
          |          *
          |        **
          |       *
          +-------*------------------->
                      delay
.Ed
The empirical curve may have both vertical and horizontal lines.
Vertical lines represent constant delay for a range of
probabilities.
Horizontal lines correspond to a discontinuity in the delay
distribution: the pipe will use the largest delay for a
given probability.
.Pp
The file format is the following, with whitespace acting as
a separator and '#' indicating the beginning of a comment:
.Bl -tag -width indent
.It Cm name Ar identifier
optional name (listed by "ipfw pipe show")
to identify the delay distribution;
.It Cm bw Ar value
the bandwidth used for the pipe.
If not specified here, it must be present
explicitly as a configuration parameter for the pipe;
.It Cm loss-level Ar L
the probability above which packets are lost.
(0.0 <= L <= 1.0, default 1.0 i.e., no loss);
.It Cm samples Ar N
the number of samples used in the internal
representation of the curve (2..1024; default 100);
.It Cm "delay prob" | "prob delay"
One of these two lines is mandatory and defines
the format of the following lines with data points.
.It Ar XXX Ar YYY
2 or more lines representing points in the curve,
with either delay or probability first, according
to the chosen format.
The unit for delay is milliseconds.
Data points do not need to be sorted.
Also, the number of actual lines can be different
from the value of the "samples" parameter:
.Nm
utility will sort and interpolate
the curve as needed.
.El
.Pp
Example of a profile file:
.Bd -literal -offset indent
name    bla_bla_bla
samples 100
loss-level    0.86
prob    delay
0       200	# minimum overhead is 200ms
0.5     200
0.5     300
0.8     1000
0.9     1300
1       1300
#configuration file end
.Ed
.El
.Pp
The following parameters can be configured for a queue:
.Pp
.Bl -tag -width indent -compact
.It Cm pipe Ar pipe_nr
Connects a queue to the specified pipe.
Multiple queues (with the same or different weights) can be connected to
the same pipe, which specifies the aggregate rate for the set of queues.
.Pp
.It Cm weight Ar weight
Specifies the weight to be used for flows matching this queue.
The weight must be in the range 1..100, and defaults to 1.
.El
.Pp
The following case-insensitive parameters can be configured for a
scheduler:
.Pp
.Bl -tag -width indent -compact
.It Cm type Ar {fifo | wf2q+ | rr | qfq}
specifies the scheduling algorithm to use.
.Bl -tag -width indent -compact
.It Cm fifo
is just a FIFO scheduler (which means that all packets
are stored in the same queue as they arrive to the scheduler).
FIFO has O(1) per-packet time complexity, with very low
constants (estimate 60-80ns on a 2GHz desktop machine)
but gives no service guarantees.
.It Cm wf2q+
implements the WF2Q+ algorithm, which is a Weighted Fair Queueing
algorithm which permits flows to share bandwidth according to
their weights.
Note that weights are not priorities; even a flow
with a minuscule weight will never starve.
WF2Q+ has O(log N) per-packet processing cost, where N is the number
of flows, and is the default algorithm used by previous versions
of dummynet's queues.
.It Cm rr
implements the Deficit Round Robin algorithm, which has O(1) processing
costs (roughly, 100-150ns per packet)
and permits bandwidth allocation according to weights, but
with poor service guarantees.
.It Cm qfq
implements the QFQ algorithm, which is a very fast variant of
WF2Q+, with similar service guarantees and O(1) processing
costs (roughly, 200-250ns per packet).
.El
.El
.Pp
In addition to the type, all parameters allowed for a pipe can also
be specified for a scheduler.
.Pp
Finally, the following parameters can be configured for both
pipes and queues:
.Pp
.Bl -tag -width XXXX -compact
.It Cm buckets Ar hash-table-size
Specifies the size of the hash table used for storing the
various queues.
Default value is 64, controlled by the
.Xr sysctl 8
variable
.Va net.inet.ip.dummynet.hash_size ,
allowed range is 16 to 65536.
.Pp
.It Cm mask Ar mask-specifier
Packets sent to a given pipe or queue by an
.Nm
rule can be further classified into multiple flows, each of which is then
sent to a different
.Em dynamic
pipe or queue.
A flow identifier is constructed by masking the IP addresses,
ports and protocol types as specified with the
.Cm mask
options in the configuration of the pipe or queue.
For each different flow identifier, a new pipe or queue is created
with the same parameters as the original object, and matching packets
are sent to it.
.Pp
Thus, when
.Em dynamic pipes
are used, each flow will get the same bandwidth as defined by the pipe,
whereas when
.Em dynamic queues
are used, each flow will share the parent's pipe bandwidth evenly
with other flows generated by the same queue (note that other queues
with different weights might be connected to the same pipe).
.br
Available mask specifiers are a combination of one or more of the following:
.Pp
.Cm dst-ip Ar mask ,
.Cm dst-ip6 Ar mask ,
.Cm src-ip Ar mask ,
.Cm src-ip6 Ar mask ,
.Cm dst-port Ar mask ,
.Cm src-port Ar mask ,
.Cm flow-id Ar mask ,
.Cm proto Ar mask
or
.Cm all ,
.Pp
where the latter means all bits in all fields are significant.
.Pp
.It Cm noerror
When a packet is dropped by a
.Nm dummynet
queue or pipe, the error
is normally reported to the caller routine in the kernel, in the
same way as it happens when a device queue fills up.
Setting this
option reports the packet as successfully delivered, which can be
needed for some experimental setups where you want to simulate
loss or congestion at a remote router.
.Pp
.It Cm plr Ar packet-loss-rate
Packet loss rate.
Argument
.Ar packet-loss-rate
is a floating-point number between 0 and 1, with 0 meaning no
loss, 1 meaning 100% loss.
The loss rate is internally represented on 31 bits.
.Pp
.It Cm queue Brq Ar slots | size Ns Cm Kbytes
Queue size, in
.Ar slots
or
.Cm KBytes .
Default value is 50 slots, which
is the typical queue size for Ethernet devices.
Note that for slow speed links you should keep the queue
size short or your traffic might be affected by a significant
queueing delay.
E.g., 50 max-sized ethernet packets (1500 bytes) mean 600Kbit
or 20s of queue on a 30Kbit/s pipe.
Even worse effects can result if you get packets from an
interface with a much larger MTU, e.g.\& the loopback interface
with its 16KB packets.
The
.Xr sysctl 8
variables
.Em net.inet.ip.dummynet.pipe_byte_limit
and
.Em net.inet.ip.dummynet.pipe_slot_limit
control the maximum lengths that can be specified.
.Pp
.It Cm red | gred Ar w_q Ns / Ns Ar min_th Ns / Ns Ar max_th Ns / Ns Ar max_p
Make use of the RED (Random Early Detection) queue management algorithm.
.Ar w_q
and
.Ar max_p
are floating
point numbers between 0 and 1 (0 not included), while
.Ar min_th
and
.Ar max_th
are integer numbers specifying thresholds for queue management
(thresholds are computed in bytes if the queue has been defined
in bytes, in slots otherwise).
The
.Nm dummynet
also supports the gentle RED variant (gred).
Three
.Xr sysctl 8
variables can be used to control the RED behaviour:
.Bl -tag -width indent
.It Va net.inet.ip.dummynet.red_lookup_depth
specifies the accuracy in computing the average queue
when the link is idle (defaults to 256, must be greater than zero)
.It Va net.inet.ip.dummynet.red_avg_pkt_size
specifies the expected average packet size (defaults to 512, must be
greater than zero)
.It Va net.inet.ip.dummynet.red_max_pkt_size
specifies the expected maximum packet size, only used when queue
thresholds are in bytes (defaults to 1500, must be greater than zero).
.El
.El
.Pp
When used with IPv6 data,
.Nm dummynet
currently has several limitations.
Information necessary to route link-local packets to an
interface is not available after processing by
.Nm dummynet
so those packets are dropped in the output path.
Care should be taken to ensure that link-local packets are not passed to
.Nm dummynet .
.Sh CHECKLIST
Here are some important points to consider when designing your
rules:
.Bl -bullet
.It
Remember that you filter both packets going
.Cm in
and
.Cm out .
Most connections need packets going in both directions.
.It
Remember to test very carefully.
It is a good idea to be near the console when doing this.
If you cannot be near the console,
use an auto-recovery script such as the one in
.Pa /usr/share/examples/ipfw/change_rules.sh .
.It
Do not forget the loopback interface.
.El
.Sh FINE POINTS
.Bl -bullet
.It
There are circumstances where fragmented datagrams are unconditionally
dropped.
TCP packets are dropped if they do not contain at least 20 bytes of
TCP header, UDP packets are dropped if they do not contain a full 8
byte UDP header, and ICMP packets are dropped if they do not contain
4 bytes of ICMP header, enough to specify the ICMP type, code, and
checksum.
These packets are simply logged as
.Dq pullup failed
since there may not be enough good data in the packet to produce a
meaningful log entry.
.It
Another type of packet is unconditionally dropped, a TCP packet with a
fragment offset of one.
This is a valid packet, but it only has one use, to try
to circumvent firewalls.
When logging is enabled, these packets are
reported as being dropped by rule -1.
.It
If you are logged in over a network, loading the
.Xr kld 4
version of
.Nm
is probably not as straightforward as you would think.
The following command line is recommended:
.Bd -literal -offset indent
kldload ipfw && \e
ipfw add 32000 allow ip from any to any
.Ed
.Pp
Along the same lines, doing an
.Bd -literal -offset indent
ipfw flush
.Ed
.Pp
in similar surroundings is also a bad idea.
.It
The
.Nm
filter list may not be modified if the system security level
is set to 3 or higher
(see
.Xr init 8
for information on system security levels).
.El
.Sh PACKET DIVERSION
A
.Xr divert 4
socket bound to the specified port will receive all packets
diverted to that port.
If no socket is bound to the destination port, or if the divert module is
not loaded, or if the kernel was not compiled with divert socket support,
the packets are dropped.
.Sh NETWORK ADDRESS TRANSLATION (NAT)
.Nm
supports in-kernel NAT using the kernel version of
.Xr libalias 3 .
.Pp
The nat configuration command is the following:
.Bd -ragged -offset indent
.Bk -words
.Cm nat
.Ar nat_number
.Cm config
.Ar nat-configuration
.Ek
.Ed
.Pp
The following parameters can be configured:
.Bl -tag -width indent
.It Cm ip Ar ip_address
Define an ip address to use for aliasing.
.It Cm if Ar nic
Use ip address of NIC for aliasing, dynamically changing
it if NIC's ip address changes.
.It Cm log
Enable logging on this nat instance.
.It Cm deny_in
Deny any incoming connection from outside world.
.It Cm same_ports
Try to leave the alias port numbers unchanged from
the actual local port numbers.
.It Cm unreg_only
Traffic on the local network not originating from an
unregistered address space will be ignored.
.It Cm reset
Reset table of the packet aliasing engine on address change.
.It Cm reverse
Reverse the way libalias handles aliasing.
.It Cm proxy_only
Obey transparent proxy rules only, packet aliasing is not performed.
.It Cm skip_global
Skip instance in case of global state lookup (see below).
.El
.Pp
Some special values can be supplied instead of
.Va nat_number :
.Bl -tag -width indent
.It Cm global
Looks up translation state in all configured nat instances.
If an entry is found, packet is aliased according to that entry.
If no entry was found in any of the instances, packet is passed unchanged,
and no new entry will be created.
See section
.Sx MULTIPLE INSTANCES
in
.Xr natd 8
for more information.
.It Cm tablearg
Uses argument supplied in lookup table.
See
.Sx LOOKUP TABLES
section below for more information on lookup tables.
.El
.Pp
To let the packet continue after being (de)aliased, set the sysctl variable
.Va net.inet.ip.fw.one_pass
to 0.
For more information about aliasing modes, refer to
.Xr libalias 3 .
See Section
.Sx EXAMPLES
for some examples about nat usage.
.Ss REDIRECT AND LSNAT SUPPORT IN IPFW
Redirect and LSNAT support follow closely the syntax used in
.Xr natd 8 .
See Section
.Sx EXAMPLES
for some examples on how to do redirect and lsnat.
.Ss SCTP NAT SUPPORT
SCTP nat can be configured in a similar manner to TCP through the
.Nm
command line tool.
The main difference is that
.Nm sctp nat
does not do port translation.
Since the local and global side ports will be the same,
there is no need to specify both.
Ports are redirected as follows:
.Bd -ragged -offset indent
.Bk -words
.Cm nat
.Ar nat_number
.Cm config if
.Ar nic
.Cm redirect_port sctp
.Ar ip_address [,addr_list] {[port | port-port] [,ports]}
.Ek
.Ed
.Pp
Most
.Nm sctp nat
configuration can be done in real-time through the
.Xr sysctl 8
interface.
All may be changed dynamically, though the hash_table size will only
change for new
.Nm nat
instances.
See
.Sx SYSCTL VARIABLES
for more info.
.Sh LOADER TUNABLES
Tunables can be set in
.Xr loader 8
prompt,
.Xr loader.conf 5
or
.Xr kenv 1
before ipfw module gets loaded.
.Bl -tag -width indent
.It Va net.inet.ip.fw.default_to_accept: No 0
Defines ipfw last rule behavior.
This value overrides
.Cd "options IPFW_DEFAULT_TO_(ACCEPT|DENY)"
from kernel configuration file.
.It Va net.inet.ip.fw.tables_max: No 128
Defines number of tables available in ipfw.
Number cannot exceed 65534.
.El
.Sh SYSCTL VARIABLES
A set of
.Xr sysctl 8
variables controls the behaviour of the firewall and
associated modules
.Pq Nm dummynet , bridge , sctp nat .
These are shown below together with their default value
(but always check with the
.Xr sysctl 8
command what value is actually in use) and meaning:
.Bl -tag -width indent
.It Va net.inet.ip.alias.sctp.accept_global_ootb_addip: No 0
Defines how the
.Nm nat
responds to receipt of global OOTB ASCONF-AddIP:
.Bl -tag -width indent
.It Cm 0
No response (unless a partially matching association exists -
ports and vtags match but global address does not)
.It Cm 1
.Nm nat
will accept and process all OOTB global AddIP messages.
.El
.Pp
Option 1 should never be selected as this forms a security risk.
An attacker can
establish multiple fake associations by sending AddIP messages.
.It Va net.inet.ip.alias.sctp.chunk_proc_limit: No 5
Defines the maximum number of chunks in an SCTP packet that will be
parsed for a
packet that matches an existing association.
This value is enforced to be greater than or equal to
.Cm net.inet.ip.alias.sctp.initialising_chunk_proc_limit .
A high value is
a DoS risk yet setting too low a value may result in
important control chunks in
the packet not being located and parsed.
.It Va net.inet.ip.alias.sctp.error_on_ootb: No 1
Defines when the
.Nm nat
responds to any Out-of-the-Blue (OOTB) packets with ErrorM packets.
An OOTB packet is a packet that arrives with no existing association
registered in the
.Nm nat
and is not an INIT or ASCONF-AddIP packet:
.Bl -tag -width indent
.It Cm 0
ErrorM is never sent in response to OOTB packets.
.It Cm 1
ErrorM is only sent to OOTB packets received on the local side.
.It Cm 2
ErrorM is sent to the local side and on the global side ONLY if there is a
partial match (ports and vtags match but the source global IP does not).
This value is only useful if the
.Nm nat
is tracking global IP addresses.
.It Cm 3
ErrorM is sent in response to all OOTB packets on both
the local and global side
(DoS risk).
.El
.Pp
At the moment the default is 0, since the ErrorM packet is not yet
supported by most SCTP stacks.
When it is supported, and if not tracking
global addresses, we recommend setting this value to 1 to allow
multi-homed local hosts to function with the
.Nm nat .
To track global addresses, we recommend setting this value to 2 to
allow global hosts to be informed when they need to (re)send an
ASCONF-AddIP.
Value 3 should never be chosen (except for debugging) as the
.Nm nat
will respond to all OOTB global packets (a DoS risk).
.It Va net.inet.ip.alias.sctp.hashtable_size: No 2003
Size of hash tables used for
.Nm nat
lookups (100 < prime_number < 1000001).
This value sets the
.Nm hash table
size for any future created
.Nm nat
instance and therefore must be set prior to creating a
.Nm nat
instance.
The table sizes may be changed to suit specific needs.
If there will be few
concurrent associations, and memory is scarce, you may make these smaller.
If there will be many thousands (or millions) of concurrent associations, you
should make these larger.
A prime number is best for the table size.
The sysctl
update function will adjust your input value to the next highest prime number.
.It Va net.inet.ip.alias.sctp.holddown_time:  No 0
Hold association in table for this many seconds after receiving a
SHUTDOWN-COMPLETE.
This allows endpoints to correct shutdown gracefully if a
shutdown_complete is lost and retransmissions are required.
.It Va net.inet.ip.alias.sctp.init_timer: No 15
Timeout value while waiting for (INIT-ACK|AddIP-ACK).
This value cannot be 0.
.It Va net.inet.ip.alias.sctp.initialising_chunk_proc_limit: No 2
Defines the maximum number of chunks in an SCTP packet that will be parsed when
no existing association exists that matches that packet.
Ideally this packet
will only be an INIT or ASCONF-AddIP packet.
A higher value may become a DoS
risk as malformed packets can consume processing resources.
.It Va net.inet.ip.alias.sctp.param_proc_limit: No 25
Defines the maximum number of parameters within a chunk that will be
parsed in a
packet.
As for other similar sysctl variables, larger values pose a DoS risk.
.It Va net.inet.ip.alias.sctp.log_level: No 0
Level of detail in the system log messages (0 \- minimal, 1 \- event,
2 \- info, 3 \- detail, 4 \- debug, 5 \- max debug).
May be a good
option in high loss environments.
.It Va net.inet.ip.alias.sctp.shutdown_time: No 15
Timeout value while waiting for SHUTDOWN-COMPLETE.
This value cannot be 0.
.It Va net.inet.ip.alias.sctp.track_global_addresses: No 0
Enables/disables global IP address tracking within the
.Nm nat
and places an
upper limit on the number of addresses tracked for each association:
.Bl -tag -width indent
.It Cm 0
Global tracking is disabled
.It Cm >1
Enables tracking, the maximum number of addresses tracked for each
association is limited to this value
.El
.Pp
This variable is fully dynamic, the new value will be adopted for all newly
arriving associations, existing associations are treated
as they were previously.
Global tracking will decrease the number of collisions within the
.Nm nat
at a cost
of increased processing load, memory usage, complexity, and possible
.Nm nat
state
problems in complex networks with multiple
.Nm nats .
We recommend not tracking
global IP addresses, this will still result in a fully functional
.Nm nat .
.It Va net.inet.ip.alias.sctp.up_timer: No 300
Timeout value to keep an association up with no traffic.
This value cannot be 0.
.It Va net.inet.ip.dummynet.expire : No 1
Lazily delete dynamic pipes/queues once they have no pending traffic.
You can disable this by setting the variable to 0, in which case
the pipes/queues will only be deleted when the threshold is reached.
.It Va net.inet.ip.dummynet.hash_size : No 64
Default size of the hash table used for dynamic pipes/queues.
This value is used when no
.Cm buckets
option is specified when configuring a pipe/queue.
.It Va net.inet.ip.dummynet.io_fast : No 0
If set to a non-zero value,
the
.Dq fast
mode of
.Nm dummynet
operation (see above) is enabled.
.It Va net.inet.ip.dummynet.io_pkt
Number of packets passed to
.Nm dummynet .
.It Va net.inet.ip.dummynet.io_pkt_drop
Number of packets dropped by
.Nm dummynet .
.It Va net.inet.ip.dummynet.io_pkt_fast
Number of packets bypassed by the
.Nm dummynet
scheduler.
.It Va net.inet.ip.dummynet.max_chain_len : No 16
Target value for the maximum number of pipes/queues in a hash bucket.
The product
.Cm max_chain_len*hash_size
is used to determine the threshold over which empty pipes/queues
will be expired even when
.Cm net.inet.ip.dummynet.expire=0 .
.It Va net.inet.ip.dummynet.red_lookup_depth : No 256
.It Va net.inet.ip.dummynet.red_avg_pkt_size : No 512
.It Va net.inet.ip.dummynet.red_max_pkt_size : No 1500
Parameters used in the computations of the drop probability
for the RED algorithm.
.It Va net.inet.ip.dummynet.pipe_byte_limit : No 1048576
.It Va net.inet.ip.dummynet.pipe_slot_limit : No 100
The maximum queue size that can be specified in bytes or packets.
These limits prevent accidental exhaustion of resources such as mbufs.
If you raise these limits,
you should make sure the system is configured so that sufficient resources
are available.
.It Va net.inet.ip.fw.autoinc_step : No 100
Delta between rule numbers when auto-generating them.
The value must be in the range 1..1000.
.It Va net.inet.ip.fw.curr_dyn_buckets : Va net.inet.ip.fw.dyn_buckets
The current number of buckets in the hash table for dynamic rules
(read-only).
.It Va net.inet.ip.fw.debug : No 1
Controls debugging messages produced by
.Nm .
.It Va net.inet.ip.fw.default_rule : No 65535
The default rule number (read-only).
By the design of
.Nm ,
the default rule is the last one, so its number
can also serve as the highest number allowed for a rule.
.It Va net.inet.ip.fw.dyn_buckets : No 256
The number of buckets in the hash table for dynamic rules.
Must be a power of 2, up to 65536.
It only takes effect when all dynamic rules have expired, so you
are advised to use a
.Cm flush
command to make sure that the hash table is resized.
.It Va net.inet.ip.fw.dyn_count : No 3
Current number of dynamic rules
(read-only).
.It Va net.inet.ip.fw.dyn_keepalive : No 1
Enables generation of keepalive packets for
.Cm keep-state
rules on TCP sessions.
A keepalive is generated to both
sides of the connection every 5 seconds for the last 20
seconds of the lifetime of the rule.
.It Va net.inet.ip.fw.dyn_max : No 8192
Maximum number of dynamic rules.
When you hit this limit, no more dynamic rules can be
installed until old ones expire.
.It Va net.inet.ip.fw.dyn_ack_lifetime : No 300
.It Va net.inet.ip.fw.dyn_syn_lifetime : No 20
.It Va net.inet.ip.fw.dyn_fin_lifetime : No 1
.It Va net.inet.ip.fw.dyn_rst_lifetime : No 1
.It Va net.inet.ip.fw.dyn_udp_lifetime : No 5
.It Va net.inet.ip.fw.dyn_short_lifetime : No 30
These variables control the lifetime, in seconds, of dynamic
rules.
Upon the initial SYN exchange the lifetime is kept short,
then increased after both SYN have been seen, then decreased
again during the final FIN exchange or when a RST is received.
Both
.Em dyn_fin_lifetime
and
.Em dyn_rst_lifetime
must be strictly lower than 5 seconds, the period of
repetition of keepalives.
The firewall enforces that.
.It Va net.inet.ip.fw.dyn_keep_states: No 0
Keep dynamic states on rule/set deletion.
States are relinked to default rule (65535).
This can be handy for ruleset reload.
Turned off by default.
.It Va net.inet.ip.fw.enable : No 1
Enables the firewall.
Setting this variable to 0 lets you run your machine without
firewall even if compiled in.
.It Va net.inet6.ip6.fw.enable : No 1
provides the same functionality as above for the IPv6 case.
.It Va net.inet.ip.fw.one_pass : No 1
When set, the packet exiting from the
.Nm dummynet
pipe or from
.Xr ng_ipfw 4
node is not passed through the firewall again.
Otherwise, after an action, the packet is
reinjected into the firewall at the next rule.
.It Va net.inet.ip.fw.tables_max : No 128
Maximum number of tables.
.It Va net.inet.ip.fw.verbose : No 1
Enables verbose messages.
.It Va net.inet.ip.fw.verbose_limit : No 0
Limits the number of messages produced by a verbose firewall.
.It Va net.inet6.ip6.fw.deny_unknown_exthdrs : No 1
If enabled packets with unknown IPv6 Extension Headers will be denied.
.It Va net.link.ether.ipfw : No 0
Controls whether layer-2 packets are passed to
.Nm .
Default is no.
.It Va net.link.bridge.ipfw : No 0
Controls whether bridged packets are passed to
.Nm .
Default is no.
.El
.Sh EXAMPLES
There are far too many possible uses of
.Nm
so this Section will only give a small set of examples.
.Pp
.Ss BASIC PACKET FILTERING
This command adds an entry which denies all tcp packets from
.Em cracker.evil.org
to the telnet port of
.Em wolf.tambov.su
from being forwarded by the host:
.Pp
.Dl "ipfw add deny tcp from cracker.evil.org to wolf.tambov.su telnet"
.Pp
This one disallows any connection from the entire cracker's
network to my host:
.Pp
.Dl "ipfw add deny ip from 123.45.67.0/24 to my.host.org"
.Pp
A first and efficient way to limit access (not using dynamic rules)
is the use of the following rules:
.Pp
.Dl "ipfw add allow tcp from any to any established"
.Dl "ipfw add allow tcp from net1 portlist1 to net2 portlist2 setup"
.Dl "ipfw add allow tcp from net3 portlist3 to net3 portlist3 setup"
.Dl "..."
.Dl "ipfw add deny tcp from any to any"
.Pp
The first rule will be a quick match for normal TCP packets,
but it will not match the initial SYN packet, which will be
matched by the
.Cm setup
rules only for selected source/destination pairs.
All other SYN packets will be rejected by the final
.Cm deny
rule.
.Pp
If you administer one or more subnets, you can take advantage
of the address sets and or-blocks and write extremely
compact rulesets which selectively enable services to blocks
of clients, as below:
.Pp
.Dl "goodguys=\*q{ 10.1.2.0/24{20,35,66,18} or 10.2.3.0/28{6,3,11} }\*q"
.Dl "badguys=\*q10.1.2.0/24{8,38,60}\*q"
.Dl ""
.Dl "ipfw add allow ip from ${goodguys} to any"
.Dl "ipfw add deny ip from ${badguys} to any"
.Dl "... normal policies ..."
.Pp
The
.Cm verrevpath
option could be used to do automated anti-spoofing by adding the
following to the top of a ruleset:
.Pp
.Dl "ipfw add deny ip from any to any not verrevpath in"
.Pp
This rule drops all incoming packets that appear to be coming to the
system on the wrong interface.
For example, a packet with a source
address belonging to a host on a protected internal network would be
dropped if it tried to enter the system from an external interface.
.Pp
The
.Cm antispoof
option could be used to do similar but more restricted anti-spoofing
by adding the following to the top of a ruleset:
.Pp
.Dl "ipfw add deny ip from any to any not antispoof in"
.Pp
This rule drops all incoming packets that appear to be coming from another
directly connected system but on the wrong interface.
For example, a packet with a source address of
.Li 192.168.0.0/24 ,
configured on
.Li fxp0 ,
but coming in on
.Li fxp1
would be dropped.
.Pp
The
.Cm setdscp
option could be used to (re)mark user traffic,
by adding the following to the appropriate place in ruleset:
.Pp
.Dl "ipfw add setdscp be ip from any to any dscp af11,af21"
.Ss DYNAMIC RULES
In order to protect a site from flood attacks involving fake
TCP packets, it is safer to use dynamic rules:
.Pp
.Dl "ipfw add check-state"
.Dl "ipfw add deny tcp from any to any established"
.Dl "ipfw add allow tcp from my-net to any setup keep-state"
.Pp
This will let the firewall install dynamic rules only for
those connections which start with a regular SYN packet coming
from the inside of our network.
Dynamic rules are checked when encountering the first
occurrence of a
.Cm check-state ,
.Cm keep-state
or
.Cm limit
rule.
A
.Cm check-state
rule should usually be placed near the beginning of the
ruleset to minimize the amount of work scanning the ruleset.
Your mileage may vary.
.Pp
To limit the number of connections a user can open
you can use the following type of rules:
.Pp
.Dl "ipfw add allow tcp from my-net/24 to any setup limit src-addr 10"
.Dl "ipfw add allow tcp from any to me setup limit src-addr 4"
.Pp
The former (assuming it runs on a gateway) will allow each host
on a /24 network to open at most 10 TCP connections.
The latter can be placed on a server to make sure that a single
client does not use more than 4 simultaneous connections.
.Pp
.Em BEWARE :
stateful rules can be subject to denial-of-service attacks
by a SYN-flood which opens a huge number of dynamic rules.
The effects of such attacks can be partially limited by
acting on a set of
.Xr sysctl 8
variables which control the operation of the firewall.
.Pp
Here is a good usage of the
.Cm list
command to see accounting records and timestamp information:
.Pp
.Dl ipfw -at list
.Pp
or in short form without timestamps:
.Pp
.Dl ipfw -a list
.Pp
which is equivalent to:
.Pp
.Dl ipfw show
.Pp
Next rule diverts all incoming packets from 192.168.2.0/24
to divert port 5000:
.Pp
.Dl ipfw divert 5000 ip from 192.168.2.0/24 to any in
.Ss TRAFFIC SHAPING
The following rules show some of the applications of
.Nm
and
.Nm dummynet
for simulations and the like.
.Pp
This rule drops random incoming packets with a probability
of 5%:
.Pp
.Dl "ipfw add prob 0.05 deny ip from any to any in"
.Pp
A similar effect can be achieved making use of
.Nm dummynet
pipes:
.Pp
.Dl "ipfw add pipe 10 ip from any to any"
.Dl "ipfw pipe 10 config plr 0.05"
.Pp
We can use pipes to artificially limit bandwidth, e.g.\& on a
machine acting as a router, if we want to limit traffic from
local clients on 192.168.2.0/24 we do:
.Pp
.Dl "ipfw add pipe 1 ip from 192.168.2.0/24 to any out"
.Dl "ipfw pipe 1 config bw 300Kbit/s queue 50KBytes"
.Pp
note that we use the
.Cm out
modifier so that the rule is not used twice.
Remember in fact that
.Nm
rules are checked both on incoming and outgoing packets.
.Pp
Should we want to simulate a bidirectional link with bandwidth
limitations, the correct way is the following:
.Pp
.Dl "ipfw add pipe 1 ip from any to any out"
.Dl "ipfw add pipe 2 ip from any to any in"
.Dl "ipfw pipe 1 config bw 64Kbit/s queue 10Kbytes"
.Dl "ipfw pipe 2 config bw 64Kbit/s queue 10Kbytes"
.Pp
The above can be very useful, e.g.\& if you want to see how
your fancy Web page will look for a residential user who
is connected only through a slow link.
You should not use only one pipe for both directions, unless
you want to simulate a half-duplex medium (e.g.\& AppleTalk,
Ethernet, IRDA).
It is not necessary that both pipes have the same configuration,
so we can also simulate asymmetric links.
.Pp
Should we want to verify network performance with the RED queue
management algorithm:
.Pp
.Dl "ipfw add pipe 1 ip from any to any"
.Dl "ipfw pipe 1 config bw 500Kbit/s queue 100 red 0.002/30/80/0.1"
.Pp
Another typical application of the traffic shaper is to
introduce some delay in the communication.
This can significantly affect applications which do a lot of Remote
Procedure Calls, and where the round-trip-time of the
connection often becomes a limiting factor much more than
bandwidth:
.Pp
.Dl "ipfw add pipe 1 ip from any to any out"
.Dl "ipfw add pipe 2 ip from any to any in"
.Dl "ipfw pipe 1 config delay 250ms bw 1Mbit/s"
.Dl "ipfw pipe 2 config delay 250ms bw 1Mbit/s"
.Pp
Per-flow queueing can be useful for a variety of purposes.
A very simple one is counting traffic:
.Pp
.Dl "ipfw add pipe 1 tcp from any to any"
.Dl "ipfw add pipe 1 udp from any to any"
.Dl "ipfw add pipe 1 ip from any to any"
.Dl "ipfw pipe 1 config mask all"
.Pp
The above set of rules will create queues (and collect
statistics) for all traffic.
Because the pipes have no limitations, the only effect is
collecting statistics.
Note that we need 3 rules, not just the last one, because
when
.Nm
tries to match IP packets it will not consider ports, so we
would not see connections on separate ports as different
ones.
.Pp
A more sophisticated example is limiting the outbound traffic
on a net with per-host limits, rather than per-network limits:
.Pp
.Dl "ipfw add pipe 1 ip from 192.168.2.0/24 to any out"
.Dl "ipfw add pipe 2 ip from any to 192.168.2.0/24 in"
.Dl "ipfw pipe 1 config mask src-ip 0x000000ff bw 200Kbit/s queue 20Kbytes"
.Dl "ipfw pipe 2 config mask dst-ip 0x000000ff bw 200Kbit/s queue 20Kbytes"
.Ss LOOKUP TABLES
In the following example, we need to create several traffic bandwidth
classes and we need different hosts/networks to fall into different classes.
We create one pipe for each class and configure them accordingly.
Then we create a single table and fill it with IP subnets and addresses.
For each subnet/host we set the argument equal to the number of the pipe
that it should use.
Then we classify traffic using a single rule:
.Pp
.Dl "ipfw pipe 1 config bw 1000Kbyte/s"
.Dl "ipfw pipe 4 config bw 4000Kbyte/s"
.Dl "..."
.Dl "ipfw table 1 add 192.168.2.0/24 1"
.Dl "ipfw table 1 add 192.168.0.0/27 4"
.Dl "ipfw table 1 add 192.168.0.2 1"
.Dl "..."
.Dl "ipfw add pipe tablearg ip from table(1) to any"
.Pp
Using the
.Cm fwd
action, the table entries may include hostnames and IP addresses.
.Pp
.Dl "ipfw table 1 add 192.168.2.0/24 10.23.2.1"
.Dl "ipfw table 1 add 192.168.0.0/27 router1.dmz"
.Dl "..."
.Dl "ipfw add 100 fwd tablearg ip from any to table(1)"
.Pp
In the following example per-interface firewall is created:
.Pp
.Dl "ipfw table 10 add vlan20 12000"
.Dl "ipfw table 10 add vlan30 13000"
.Dl "ipfw table 20 add vlan20 22000"
.Dl "ipfw table 20 add vlan30 23000"
.Dl ".."
.Dl "ipfw add 100 skipto tablearg ip from any to any recv 'table(10)' in"
.Dl "ipfw add 200 skipto tablearg ip from any to any xmit 'table(10)' out"
.Ss SETS OF RULES
To add a set of rules atomically, e.g.\& set 18:
.Pp
.Dl "ipfw set disable 18"
.Dl "ipfw add NN set 18 ...         # repeat as needed"
.Dl "ipfw set enable 18"
.Pp
To delete a set of rules atomically the command is simply:
.Pp
.Dl "ipfw delete set 18"
.Pp
To test a ruleset and disable it and regain control if something goes wrong:
.Pp
.Dl "ipfw set disable 18"
.Dl "ipfw add NN set 18 ...         # repeat as needed"
.Dl "ipfw set enable 18; echo done; sleep 30 && ipfw set disable 18"
.Pp
Here if everything goes well, you press control-C before the "sleep"
terminates, and your ruleset will be left active.
Otherwise, e.g.\& if
you cannot access your box, the ruleset will be disabled after
the sleep terminates thus restoring the previous situation.
.Pp
To show rules of the specific set:
.Pp
.Dl "ipfw set 18 show"
.Pp
To show rules of the disabled set:
.Pp
.Dl "ipfw -S set 18 show"
.Pp
To clear the counters of a specific rule of the specific set:
.Pp
.Dl "ipfw set 18 zero NN"
.Pp
To delete a specific rule of the specific set:
.Pp
.Dl "ipfw set 18 delete NN"
.Ss NAT, REDIRECT AND LSNAT
First redirect all the traffic to nat instance 123:
.Pp
.Dl "ipfw add nat 123 all from any to any"
.Pp
Then to configure nat instance 123 to alias all the outgoing traffic with ip
192.168.0.123, blocking all incoming connections, trying to keep
same ports on both sides, clearing aliasing table on address change
and keeping a log of traffic/link statistics:
.Pp
.Dl "ipfw nat 123 config ip 192.168.0.123 log deny_in reset same_ports"
.Pp
Or to change address of instance 123, aliasing table will be cleared (see
reset option):
.Pp
.Dl "ipfw nat 123 config ip 10.0.0.1"
.Pp
To see configuration of nat instance 123:
.Pp
.Dl "ipfw nat 123 show config"
.Pp
To show logs of all the instances in range 111-999:
.Pp
.Dl "ipfw nat 111-999 show"
.Pp
To see configurations of all instances:
.Pp
.Dl "ipfw nat show config"
.Pp
Or a redirect rule with mixed modes could look like:
.Pp
.Dl "ipfw nat 123 config redirect_addr 10.0.0.1 10.0.0.66"
.Dl "			 redirect_port tcp 192.168.0.1:80 500"
.Dl "			 redirect_proto udp 192.168.1.43 192.168.1.1"
.Dl "			 redirect_addr 192.168.0.10,192.168.0.11"
.Dl "			 	    10.0.0.100	# LSNAT"
.Dl "			 redirect_port tcp 192.168.0.1:80,192.168.0.10:22"
.Dl "			 	    500		# LSNAT"
.Pp
or it could be split in:
.Pp
.Dl "ipfw nat 1 config redirect_addr 10.0.0.1 10.0.0.66"
.Dl "ipfw nat 2 config redirect_port tcp 192.168.0.1:80 500"
.Dl "ipfw nat 3 config redirect_proto udp 192.168.1.43 192.168.1.1"
.Dl "ipfw nat 4 config redirect_addr 192.168.0.10,192.168.0.11,192.168.0.12"
.Dl "				         10.0.0.100"
.Dl "ipfw nat 5 config redirect_port tcp"
.Dl "			192.168.0.1:80,192.168.0.10:22,192.168.0.20:25 500"
.Sh SEE ALSO
.Xr cpp 1 ,
.Xr m4 1 ,
.Xr altq 4 ,
.Xr divert 4 ,
.Xr dummynet 4 ,
.Xr if_bridge 4 ,
.Xr ip 4 ,
.Xr ipfirewall 4 ,
.Xr ng_ipfw 4 ,
.Xr protocols 5 ,
.Xr services 5 ,
.Xr init 8 ,
.Xr kldload 8 ,
.Xr reboot 8 ,
.Xr sysctl 8 ,
.Xr syslogd 8
.Sh HISTORY
The
.Nm
utility first appeared in
.Fx 2.0 .
.Nm dummynet
was introduced in
.Fx 2.2.8 .
Stateful extensions were introduced in
.Fx 4.0 .
.Nm ipfw2
was introduced in Summer 2002.
.Sh AUTHORS
.An Ugen J. S. Antsilevich ,
.An Poul-Henning Kamp ,
.An Alex Nash ,
.An Archie Cobbs ,
.An Luigi Rizzo .
.Pp
.An -nosplit
API based upon code written by
.An Daniel Boulet
for BSDI.
.Pp
Dummynet has been introduced by Luigi Rizzo in 1997-1998.
.Pp
Some early work (1999-2000) on the
.Nm dummynet
traffic shaper supported by Akamba Corp.
.Pp
The ipfw core (ipfw2) has been completely redesigned and
reimplemented by Luigi Rizzo in summer 2002.
Further
actions and
options have been added by various developers over the years.
.Pp
.An -nosplit
In-kernel NAT support written by
.An Paolo Pisati Aq piso@FreeBSD.org
as part of a Summer of Code 2005 project.
.Pp
SCTP
.Nm nat
support has been developed by
.An The Centre for Advanced Internet Architectures (CAIA) Aq http://www.caia.swin.edu.au .
The primary developers and maintainers are David Hayes and Jason But.
For further information visit:
.Aq http://www.caia.swin.edu.au/urp/SONATA
.Pp
Delay profiles have been developed by Alessandro Cerri and
Luigi Rizzo, supported by the
European Commission within Projects Onelab and Onelab2.
.Sh BUGS
The syntax has grown over the years and sometimes it might be confusing.
Unfortunately, backward compatibility prevents cleaning up mistakes
made in the definition of the syntax.
.Pp
.Em !!! WARNING !!!
.Pp
Misconfiguring the firewall can put your computer in an unusable state,
possibly shutting down network services and requiring console access to
regain control of it.
.Pp
Incoming packet fragments diverted by
.Cm divert
are reassembled before delivery to the socket.
The action used on those packets is the one from the
rule which matches the first fragment of the packet.
.Pp
Packets diverted to userland, and then reinserted by a userland process
may lose various packet attributes.
The packet source interface name
will be preserved if it is shorter than 8 bytes and the userland process
saves and reuses the sockaddr_in
(as does
.Xr natd 8 ) ;
otherwise, it may be lost.
If a packet is reinserted in this manner, later rules may be incorrectly
applied, making the order of
.Cm divert
rules in the rule sequence very important.
.Pp
Dummynet drops all packets with IPv6 link-local addresses.
.Pp
Rules using
.Cm uid
or
.Cm gid
may not behave as expected.
In particular, incoming SYN packets may
have no uid or gid associated with them since they do not yet belong
to a TCP connection, and the uid/gid associated with a packet may not
be as expected if the associated process calls
.Xr setuid 2
or similar system calls.
.Pp
Rule syntax is subject to the command line environment and some patterns
may need to be escaped with the backslash character
or quoted appropriately.
.Pp
Due to the architecture of
.Xr libalias 3 ,
ipfw nat is not compatible with the TCP segmentation offloading (TSO).
Thus, to reliably nat your network traffic, please disable TSO
on your NICs using
.Xr ifconfig 8 .
.Pp
ICMP error messages are not implicitly matched by dynamic rules
for the respective conversations.
To avoid failures of network error detection and path MTU discovery,
ICMP error messages may need to be allowed explicitly through static
rules.
.Pp
Rules using
.Cm call
and
.Cm return
actions may lead to confusing behaviour if the ruleset has mistakes,
and/or interaction with other subsystems (netgraph, dummynet, etc.) is used.
One possible case for this is a packet leaving
.Nm
in subroutine on the input pass, while later on output encountering unpaired
.Cm return
first.
As the call stack is kept intact after input pass, packet will suddenly
return to the rule number used on input pass, not on output one.
Order of processing should be checked carefully to avoid such mistakes.


================================================
FILE: ipfw/ipfw2.c
================================================
/*
 * Copyright (c) 2002-2003 Luigi Rizzo
 * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp
 * Copyright (c) 1994 Ugen J.S.Antsilevich
 *
 * Idea and grammar partially left from:
 * Copyright (c) 1993 Daniel Boulet
 *
 * Redistribution and use in source forms, with and without modification,
 * are permitted provided that this entire comment appears intact.
 *
 * Redistribution in binary form may occur without any restrictions.
 * Obviously, it would be nice if you gave credit where credit is due
 * but requiring it would be too onerous.
 *
 * This software is provided ``AS IS'' without any warranties of any kind.
 *
 * NEW command line interface for IP firewall facility
 *
 * $FreeBSD: head/sbin/ipfw/ipfw2.c 206843 2010-04-19 15:11:45Z luigi $
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>

#include "ipfw2.h"

#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <grp.h>
#include <netdb.h>
#include <pwd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>
#include <time.h>	/* ctime */
#include <timeconv.h>	/* _long_to_time */
#include <unistd.h>
#include <fcntl.h>
#include <stddef.h>	/* offsetof */

#include <net/ethernet.h>
#include <net/if.h>		/* only IFNAMSIZ */
#include <netinet/in.h>
#include <netinet/in_systm.h>	/* only n_short, n_long */
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_fw.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>

struct cmdline_opts co;	/* global options */

/* Initialised from RESVD_SET. */
int resvd_set_number = RESVD_SET;

/* Socket used by do_cmd(); stays -1 until do_cmd() opens it on first use. */
int ipfw_socket = -1;

#ifndef s6_addr32
#define s6_addr32 __u6_addr.__u6_addr32
#endif

/*
 * GET_UINT_ARG - parse the decimal argument in *av and store it in 'arg'.
 *
 * Expects a 'char **av' in the expanding scope.  'tok' and 's_x' are only
 * used to name the keyword in error messages (via match_value()).
 *  - a missing argument exits with EX_USAGE;
 *  - the keyword "tablearg" stores IP_FW_TABLEARG and leaves the macro;
 *  - otherwise the argument must be a plain decimal number within
 *    [min, max] and different from IP_FW_TABLEARG, or we exit with
 *    EX_DATAERR.
 *
 * errno is cleared before strtol(): strtol() only sets errno on failure,
 * so a stale ERANGE/EINVAL left by an earlier call would otherwise make
 * a perfectly valid argument look invalid.
 */
#define GET_UINT_ARG(arg, min, max, tok, s_x) do {			\
	if (!av[0])							\
		errx(EX_USAGE, "%s: missing argument", match_value(s_x, tok)); \
	if (_substrcmp(*av, "tablearg") == 0) {				\
		arg = IP_FW_TABLEARG;					\
		break;							\
	}								\
									\
	{								\
	long _xval;							\
	char *end;							\
									\
	errno = 0;							\
	_xval = strtol(*av, &end, 10);					\
									\
	/* <ctype.h> functions need an unsigned char value */		\
	if (!isdigit((unsigned char)**av) || *end != '\0' ||		\
	    (_xval == 0 && errno == EINVAL))				\
		errx(EX_DATAERR, "%s: invalid argument: %s",		\
		    match_value(s_x, tok), *av);			\
									\
	if (errno == ERANGE || _xval < min || _xval > max)		\
		errx(EX_DATAERR, "%s: argument is out of range (%u..%u): %s", \
		    match_value(s_x, tok), min, max, *av);		\
									\
	if (_xval == IP_FW_TABLEARG)					\
		errx(EX_DATAERR, "%s: illegal argument value: %s",	\
		    match_value(s_x, tok), *av);			\
	arg = _xval;							\
	}								\
} while (0)

/*
 * Print the optional prefix 'str' (skipped when NULL) followed by 'arg'.
 * The special value IP_FW_TABLEARG is shown as the keyword "tablearg",
 * anything else as an unsigned decimal number.
 */
static void
PRINT_UINT_ARG(const char *str, uint32_t arg)
{
	if (str != NULL)
		fputs(str, stdout);

	if (arg != IP_FW_TABLEARG) {
		printf("%u", arg);
		return;
	}
	fputs("tablearg", stdout);
}

/*
 * Name <-> value tables for bit-flag style fields.
 * Every table ends with a { NULL, 0 } entry.  The four f_* tables also
 * carry a descriptive entry with value 0 just before it; print_flags()
 * stops scanning a table at the first entry whose value is 0.
 */

/* Names of the TCP header flags (TH_*). */
static struct _s_x f_tcpflags[] = {
	{ "syn", TH_SYN },
	{ "fin", TH_FIN },
	{ "ack", TH_ACK },
	{ "psh", TH_PUSH },
	{ "rst", TH_RST },
	{ "urg", TH_URG },
	{ "tcp flag", 0 },
	{ NULL,	0 }
};

/* Names of TCP options; "mss"/"maxseg" and "ts"/"timestamp" are aliases. */
static struct _s_x f_tcpopts[] = {
	{ "mss",	IP_FW_TCPOPT_MSS },
	{ "maxseg",	IP_FW_TCPOPT_MSS },
	{ "window",	IP_FW_TCPOPT_WINDOW },
	{ "sack",	IP_FW_TCPOPT_SACK },
	{ "ts",		IP_FW_TCPOPT_TS },
	{ "timestamp",	IP_FW_TCPOPT_TS },
	{ "cc",		IP_FW_TCPOPT_CC },
	{ "tcp option",	0 },
	{ NULL,	0 }
};

/*
 * IP options span the range 0 to 255 so we need to remap them
 * (though in fact only the low 5 bits are significant).
 */
static struct _s_x f_ipopts[] = {
	{ "ssrr",	IP_FW_IPOPT_SSRR},
	{ "lsrr",	IP_FW_IPOPT_LSRR},
	{ "rr",		IP_FW_IPOPT_RR},
	{ "ts",		IP_FW_IPOPT_TS},
	{ "ip option",	0 },
	{ NULL,	0 }
};

/* Names of the IP type-of-service bits (IPTOS_*). */
static struct _s_x f_iptos[] = {
	{ "lowdelay",	IPTOS_LOWDELAY},
	{ "throughput",	IPTOS_THROUGHPUT},
	{ "reliability", IPTOS_RELIABILITY},
	{ "mincost",	IPTOS_MINCOST},
	{ "congestion",	IPTOS_ECN_CE},
	{ "ecntransport", IPTOS_ECN_ECT0},
	{ "ip tos option", 0},
	{ NULL,	0 }
};

/* Names of the DYN_* mask bits; "all" is the OR of the four single bits. */
static struct _s_x limit_masks[] = {
	{"all",		DYN_SRC_ADDR|DYN_SRC_PORT|DYN_DST_ADDR|DYN_DST_PORT},
	{"src-addr",	DYN_SRC_ADDR},
	{"src-port",	DYN_SRC_PORT},
	{"dst-addr",	DYN_DST_ADDR},
	{"dst-port",	DYN_DST_PORT},
	{NULL,		0}
};

/*
 * we use IPPROTO_ETHERTYPE as a fake protocol id to call the print routines
 * This is only used in this code.
 */
#define IPPROTO_ETHERTYPE	0x1000
/*
 * Symbolic names for ethernet types.  Several names share one value
 * (e.g. "ip"/"ipv4", "at"/"atalk"); match_value() returns the first
 * name listed for a value, so the order of aliases decides which
 * spelling is printed.
 */
static struct _s_x ether_types[] = {
    /*
     * Note, we cannot use "-:&/" in the names because they are field
     * separators in the type specifications. Also, we use s = NULL as
     * end-delimiter, because a type of 0 can be legal.
     */
	{ "ip",		0x0800 },
	{ "ipv4",	0x0800 },
	{ "ipv6",	0x86dd },
	{ "arp",	0x0806 },
	{ "rarp",	0x8035 },
	{ "vlan",	0x8100 },
	{ "loop",	0x9000 },
	{ "trail",	0x1000 },
	{ "at",		0x809b },
	{ "atalk",	0x809b },
	{ "aarp",	0x80f3 },
	{ "pppoe_disc",	0x8863 },
	{ "pppoe_sess",	0x8864 },
	{ "ipx_8022",	0x00E0 },
	{ "ipx_8023",	0x0000 },
	{ "ipx_ii",	0x8137 },
	{ "ipx_snap",	0x8137 },
	{ "ipx",	0x8137 },
	{ "ns",		0x0600 },
	{ NULL,		0 }
};


/*
 * Keywords naming a rule action, mapped to their TOK_* token.
 * Several spellings may map to the same token (e.g. "accept", "pass",
 * "allow" and "permit" all yield TOK_ACCEPT).
 */
static struct _s_x rule_actions[] = {
	{ "accept",		TOK_ACCEPT },
	{ "pass",		TOK_ACCEPT },
	{ "allow",		TOK_ACCEPT },
	{ "permit",		TOK_ACCEPT },
	{ "count",		TOK_COUNT },
	{ "pipe",		TOK_PIPE },
	{ "queue",		TOK_QUEUE },
	{ "divert",		TOK_DIVERT },
	{ "tee",		TOK_TEE },
	{ "netgraph",		TOK_NETGRAPH },
	{ "ngtee",		TOK_NGTEE },
	{ "fwd",		TOK_FORWARD },
	{ "forward",		TOK_FORWARD },
	{ "skipto",		TOK_SKIPTO },
	{ "deny",		TOK_DENY },
	{ "drop",		TOK_DENY },
	{ "reject",		TOK_REJECT },
	{ "reset6",		TOK_RESET6 },
	{ "reset",		TOK_RESET },
	{ "unreach6",		TOK_UNREACH6 },
	{ "unreach",		TOK_UNREACH },
	{ "check-state",	TOK_CHECKSTATE },
	{ "//",			TOK_COMMENT },
	{ "nat",		TOK_NAT },
	{ "reass",		TOK_REASS },
	{ "setfib",		TOK_SETFIB },
	{ "call",		TOK_CALL },
	{ "return",		TOK_RETURN },
	{ NULL, 0 }	/* terminator */
};

/* Keywords naming an action parameter (altq, log, tag, untag). */
static struct _s_x rule_action_params[] = {
	{ "altq",		TOK_ALTQ },
	{ "log",		TOK_LOG },
	{ "tag",		TOK_TAG },
	{ "untag",		TOK_UNTAG },
	{ NULL, 0 }	/* terminator */
};

/*
 * The 'lookup' instruction accepts one of the following arguments.
 * -1 is a terminator for the list.
 * Arguments are passed as v[1] in O_DST_LOOKUP options.
 *
 * print_ip() uses the stored value as an index into this array and
 * turns the token back into its keyword through rule_options[].
 */
static int lookup_key[] = {
	TOK_DSTIP, TOK_SRCIP, TOK_DSTPORT, TOK_SRCPORT,
	TOK_UID, TOK_JAIL, TOK_DSCP, -1 };

/*
 * Keywords naming a rule option, mapped to their TOK_* token.
 * Alternative spellings share a token (e.g. "frag"/"fragment",
 * "estab"/"established").  The entries marked "pseudo option" at the
 * end are the negation, "or" and brace keywords rather than match options.
 */
static struct _s_x rule_options[] = {
	{ "tagged",		TOK_TAGGED },
	{ "uid",		TOK_UID },
	{ "gid",		TOK_GID },
	{ "jail",		TOK_JAIL },
	{ "in",			TOK_IN },
	{ "limit",		TOK_LIMIT },
	{ "keep-state",		TOK_KEEPSTATE },
	{ "bridged",		TOK_LAYER2 },
	{ "layer2",		TOK_LAYER2 },
	{ "out",		TOK_OUT },
	{ "diverted",		TOK_DIVERTED },
	{ "diverted-loopback",	TOK_DIVERTEDLOOPBACK },
	{ "diverted-output",	TOK_DIVERTEDOUTPUT },
	{ "xmit",		TOK_XMIT },
	{ "recv",		TOK_RECV },
	{ "via",		TOK_VIA },
	{ "fragment",		TOK_FRAG },
	{ "frag",		TOK_FRAG },
	{ "fib",		TOK_FIB },
	{ "ipoptions",		TOK_IPOPTS },
	{ "ipopts",		TOK_IPOPTS },
	{ "iplen",		TOK_IPLEN },
	{ "ipid",		TOK_IPID },
	{ "ipprecedence",	TOK_IPPRECEDENCE },
	{ "dscp",		TOK_DSCP },
	{ "iptos",		TOK_IPTOS },
	{ "ipttl",		TOK_IPTTL },
	{ "ipversion",		TOK_IPVER },
	{ "ipver",		TOK_IPVER },
	{ "estab",		TOK_ESTAB },
	{ "established",	TOK_ESTAB },
	{ "setup",		TOK_SETUP },
	{ "sockarg",		TOK_SOCKARG },
	{ "tcpdatalen",		TOK_TCPDATALEN },
	{ "tcpflags",		TOK_TCPFLAGS },
	{ "tcpflgs",		TOK_TCPFLAGS },
	{ "tcpoptions",		TOK_TCPOPTS },
	{ "tcpopts",		TOK_TCPOPTS },
	{ "tcpseq",		TOK_TCPSEQ },
	{ "tcpack",		TOK_TCPACK },
	{ "tcpwin",		TOK_TCPWIN },
	{ "icmptype",		TOK_ICMPTYPES },
	{ "icmptypes",		TOK_ICMPTYPES },
	{ "dst-ip",		TOK_DSTIP },
	{ "src-ip",		TOK_SRCIP },
	{ "dst-port",		TOK_DSTPORT },
	{ "src-port",		TOK_SRCPORT },
	{ "proto",		TOK_PROTO },
	{ "MAC",		TOK_MAC },
	{ "mac",		TOK_MAC },
	{ "mac-type",		TOK_MACTYPE },
	{ "verrevpath",		TOK_VERREVPATH },
	{ "versrcreach",	TOK_VERSRCREACH },
	{ "antispoof",		TOK_ANTISPOOF },
	{ "ipsec",		TOK_IPSEC },
	{ "icmp6type",		TOK_ICMP6TYPES },
	{ "icmp6types",		TOK_ICMP6TYPES },
	{ "ext6hdr",		TOK_EXT6HDR},
	{ "flow-id",		TOK_FLOWID},
	{ "ipv6",		TOK_IPV6},
	{ "ip6",		TOK_IPV6},
	{ "ipv4",		TOK_IPV4},
	{ "ip4",		TOK_IPV4},
	{ "dst-ipv6",		TOK_DSTIP6},
	{ "dst-ip6",		TOK_DSTIP6},
	{ "src-ipv6",		TOK_SRCIP6},
	{ "src-ip6",		TOK_SRCIP6},
	{ "lookup",		TOK_LOOKUP},
	{ "//",			TOK_COMMENT },

	{ "not",		TOK_NOT },		/* pseudo option */
	{ "!", /* escape ? */	TOK_NOT },		/* pseudo option */
	{ "or",			TOK_OR },		/* pseudo option */
	{ "|", /* escape */	TOK_OR },		/* pseudo option */
	{ "{",			TOK_STARTBRACE },	/* pseudo option */
	{ "(",			TOK_STARTBRACE },	/* pseudo option */
	{ "}",			TOK_ENDBRACE },		/* pseudo option */
	{ ")",			TOK_ENDBRACE },		/* pseudo option */
	{ NULL, 0 }	/* terminator */
};

/*
 * Print a 64-bit counter that may live at an unaligned address.
 *
 * pd     points to the value; it is copied out byte-wise, never
 *        dereferenced directly.
 * width  > 0: print the value right-aligned in 'width' columns,
 *        followed by one space, and return printf()'s result.
 *        <= 0: print nothing and return the number of characters the
 *        value needs.
 */
int
pr_u64(uint64_t *pd, int width)
{
#ifdef TCC
#define U64_FMT "I64"
#else
#define U64_FMT "llu"
#endif
	unsigned long long value;
	uint64_t raw;

	memcpy(&raw, pd, sizeof(raw));
	value = raw;

	if (width <= 0)
		return snprintf(NULL, 0, "%" U64_FMT, value);
	return printf("%*" U64_FMT " ", width, value);
#undef U64_FMT
}

/*
 * calloc() wrapper that never returns NULL: when the allocation fails
 * the program exits through err() with status EX_OSERR.
 */
void *
safe_calloc(size_t number, size_t size)
{
	void *mem;

	mem = calloc(number, size);
	if (!mem)
		err(EX_OSERR, "calloc");

	return mem;
}

/*
 * realloc() wrapper that never returns NULL: when realloc() yields NULL
 * the program exits through err() with status EX_OSERR.
 */
void *
safe_realloc(void *ptr, size_t size)
{
	void *mem;

	mem = realloc(ptr, size);
	if (!mem)
		err(EX_OSERR, "realloc");

	return mem;
}

/*
 * Issue one control request on the ipfw socket.
 *
 * Nothing is done (and 0 is returned) when co.test_only is set.
 * The socket is opened on first use; failure to open it is fatal.
 *
 * A request goes through getsockopt() when optname is negative (the
 * sign is dropped first) or is one of the "get" style options listed
 * below; in that case optlen carries a pointer to a socklen_t.
 * Every other request goes through setsockopt(), with optlen being
 * the length itself.
 *
 * Returns the result of getsockopt()/setsockopt().
 */
int
do_cmd(int optname, void *optval, uintptr_t optlen)
{
	int use_get;

	if (co.test_only)
		return 0;

	if (ipfw_socket == -1)
		ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
	if (ipfw_socket < 0)
		err(EX_UNAVAILABLE, "socket");

	use_get = (optname < 0 ||
	    optname == IP_FW_GET ||
	    optname == IP_DUMMYNET_GET ||
	    optname == IP_FW_ADD ||
	    optname == IP_FW_TABLE_LIST ||
	    optname == IP_FW_TABLE_GETSIZE ||
	    optname == IP_FW_NAT_GET_CONFIG ||
	    optname == IP_FW_NAT_GET_LOG);

	if (!use_get)
		return setsockopt(ipfw_socket, IPPROTO_IP, optname, optval,
		    optlen);

	if (optname < 0)
		optname = -optname;
	return getsockopt(ipfw_socket, IPPROTO_IP, optname, optval,
	    (socklen_t *)optlen);
}

#if 0 // XXX still unused
/*
 * do_setcmd3 - pass ipfw control cmd to kernel
 * @optname: option name
 * @optval: pointer to option data
 * @optlen: option length
 *
 * Function encapsulates option value in IP_FW3 socket option
 * and calls setsockopt().
 * Function returns 0 on success or -1 otherwise.
 *
 * Like do_cmd(), it does nothing and returns 0 when co.test_only is
 * set, and opens the shared ipfw socket on first use.
 */
static int
do_setcmd3(int optname, void *optval, socklen_t optlen)
{
	socklen_t len;
	ip_fw3_opheader *op3;

	if (co.test_only)
		return (0);

	if (ipfw_socket == -1)
		ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
	if (ipfw_socket < 0)
		err(EX_UNAVAILABLE, "socket");

	/* header and payload are sent as one contiguous buffer */
	len = sizeof(ip_fw3_opheader) + optlen;
	op3 = alloca(len);
	/* Zero reserved fields */
	memset(op3, 0, sizeof(ip_fw3_opheader));
	/* the caller's data is placed right after the header */
	memcpy(op3 + 1, optval, optlen);
	op3->opcode = optname;

	return setsockopt(ipfw_socket, IPPROTO_IP, IP_FW3, op3, len);
}
#endif // XXX still unused

/**
 * match_token looks up 'string' in a NULL-terminated keyword table and
 * returns the value of the first entry whose name is exactly equal to
 * it.  Returns -1 when there is no such entry; an empty string never
 * matches.
 */
int
match_token(struct _s_x *table, char *string)
{
	struct _s_x *entry = table;
	uint len = strlen(string);

	if (len == 0)
		return -1;

	while (entry->s != NULL) {
		if (strlen(entry->s) == len &&
		    memcmp(string, entry->s, len) == 0)
			return entry->x;
		entry++;
	}
	return -1;
}

/**
 * match_value is the reverse of match_token: it scans a NULL-terminated
 * table and returns the name of the first entry holding 'value', or
 * NULL when no entry has that value.
 */
char const *
match_value(struct _s_x *p, int value)
{
	/* stop on the first hit or on the terminator, whose name is NULL */
	while (p->s != NULL && p->x != value)
		p++;
	return p->s;
}

/*
 * _substrcmp compares str1 against str2.
 * Returns 0 when str1 equals str2 or is a prefix of it, 1 otherwise.
 * A prefix-only match (the lengths differ) is accepted but reported on
 * stderr as deprecated.
 *
 * This function will be removed in the future through the usual
 * deprecation process.
 */
int
_substrcmp(const char *str1, const char* str2)
{
	size_t len1 = strlen(str1);

	if (strncmp(str1, str2, len1) != 0)
		return 1;

	if (len1 != strlen(str2))
		warnx("DEPRECATED: '%s' matched '%s' as a sub-string",
		    str1, str2);
	return 0;
}

/*
 * _substrcmp2 checks whether str1 begins with str2.
 * Returns 1 when it does not, 0 when it does.  On a match, a warning
 * is printed to stderr unless str1 is exactly str3, the full spelling
 * of the keyword.
 *
 * This function exists to warn about the bizarre construction
 * strncmp(str, "by", 2) which is used to allow people to use a shortcut
 * for "bytes".  The problem is that in addition to accepting "by",
 * "byt", "byte", and "bytes", it also accepts "by_rabid_dogs" and any
 * other string beginning with "by".
 *
 * This function will be removed in the future through the usual
 * deprecation process.
 */
int
_substrcmp2(const char *str1, const char* str2, const char* str3)
{
	size_t prefix_len = strlen(str2);

	if (strncmp(str1, str2, prefix_len) != 0)
		return 1;

	if (strcmp(str1, str3) != 0)
		warnx("DEPRECATED: '%s' matched '%s'",
		    str1, str3);
	return 0;
}

/*
 * Print one port (or ethernet type), symbolic or numeric.
 *
 * With proto == IPPROTO_ETHERTYPE the value is an ethernet type: it is
 * shown by name when co.do_resolv is set and ether_types[] knows it,
 * otherwise as a 4-digit hex number.
 * For any other proto the value is a port: it is shown by service name
 * when co.do_resolv is set and the lookup succeeds, otherwise in decimal.
 */
static void
print_port(int proto, uint16_t port)
{
	struct servent *se = NULL;
	char const *name = NULL;

	if (proto == IPPROTO_ETHERTYPE) {
		if (co.do_resolv)
			name = match_value(ether_types, port);
		if (name != NULL)
			printf("%s", name);
		else
			printf("0x%04x", port);
		return;
	}

	if (co.do_resolv) {
		struct protoent *pe = getprotobynumber(proto);

		se = getservbyport(htons(port), pe ? pe->p_name : NULL);
	}
	if (se != NULL)
		printf("%s", se->s_name);
	else
		printf("%d", port);
}

/*
 * Keyword printed by print_newports() for each opcode whose payload is
 * a list of 16-bit values or ranges.
 */
static struct _s_x _port_name[] = {
	{"dst-port",	O_IP_DSTPORT},
	{"src-port",	O_IP_SRCPORT},
	{"ipid",	O_IPID},
	{"iplen",	O_IPLEN},
	{"ipttl",	O_IPTTL},
	{"mac-type",	O_MAC_TYPE},
	{"tcpdatalen",	O_TCPDATALEN},
	{"tcpwin",	O_TCPWIN},
	{"tagged",	O_TAGGED},
	{NULL,		0}
};

/*
 * Print the list of 16-bit values/ranges stored in 'cmd'.
 *
 * When opcode is non-zero the list is preceded by the matching keyword
 * from _port_name[] ("???" if the opcode is not in the table).
 * The entries follow, comma separated; each is a (low, high) pair and
 * is printed through print_port() as "low" when both ends are equal,
 * as "low-high" otherwise.
 * XXX todo: add support for mask.
 */
static void
print_newports(ipfw_insn_u16 *cmd, int proto, int opcode)
{
	uint16_t *range = cmd->ports;
	int left = F_LEN((ipfw_insn *)cmd) - 1;	/* pairs after the header */
	char const *sep = " ";

	if (opcode != 0) {
		char const *name = match_value(_port_name, opcode);

		printf (" %s", name != NULL ? name : "???");
	}

	for (; left > 0; left--, range += 2) {
		printf("%s", sep);
		sep = ",";
		print_port(proto, range[0]);
		if (range[0] == range[1])
			continue;
		printf("-");
		print_port(proto, range[1]);
	}
}

/*
 * Like strtol, but also translates service names into port numbers
 * for some protocols.
 * In particular:
 *	proto == -1 disables the protocol check;
 *	proto == IPPROTO_ETHERTYPE looks up an internal table
 *	proto == <some value in /etc/protocols> matches the values there.
 *
 * s     string to parse; a leading digit selects the numeric path,
 *       anything else is treated as a name made of alphanumeric
 *       characters, where '\\' escapes the next character.
 * end   set to the first character after the parsed item, or to s when
 *       nothing was recognised.
 * base  passed to strtol() on the numeric path.
 *
 * Returns the value found, or 0 with *end == s in case the parameter
 * is not found.
 *
 * The <ctype.h> classifiers are given an unsigned char: passing a plain
 * char with the high bit set (a negative value) is undefined behaviour.
 */
static int
strtoport(char *s, char **end, int base, int proto)
{
	char *p, *buf;
	char *s1;
	int i;

	*end = s;		/* default - not found */
	if (*s == '\0')
		return 0;	/* not found */

	if (isdigit((unsigned char)*s))
		return strtol(s, end, base);

	/*
	 * find separator. '\\' escapes the next char.
	 */
	for (s1 = s; *s1 && (isalnum((unsigned char)*s1) || *s1 == '\\') ; s1++)
		if (*s1 == '\\' && s1[1] != '\0')
			s1++;

	buf = safe_calloc(s1 - s + 1, 1);

	/*
	 * copy into a buffer skipping backslashes
	 */
	for (p = s, i = 0; p != s1 ; p++)
		if (*p != '\\')
			buf[i++] = *p;
	buf[i++] = '\0';

	if (proto == IPPROTO_ETHERTYPE) {
		i = match_token(ether_types, buf);
		free(buf);
		if (i != -1) {	/* found */
			*end = s1;
			return i;
		}
	} else {
		struct protoent *pe = NULL;
		struct servent *se;

		if (proto != 0)
			pe = getprotobynumber(proto);
		setservent(1);
		se = getservbyname(buf, pe ? pe->p_name : NULL);
		free(buf);
		if (se != NULL) {
			*end = s1;
			return ntohs(se->s_port);
		}
	}
	return 0;	/* not found */
}

/*
 * Fill the body of the command with the list of port ranges.
 *
 * av is a comma separated list of items, each either a single value or
 * a "low-high" range; values are parsed with strtoport() for 'proto'.
 * Each item is stored as a (low, high) pair in cmd->ports.
 *
 * Returns the number of pairs stored, or 0 if the list is empty or
 * malformed.  On success the instruction length (pairs + 1) is or-ed
 * into cmd->o.len.
 *
 * The instruction length must fit in F_LEN_MASK, so the count is
 * checked before every store rather than once at the end: an over-long
 * list is rejected without first writing all of its entries into the
 * caller's buffer.
 */
static int
fill_newports(ipfw_insn_u16 *cmd, char *av, int proto)
{
	uint16_t a, b, *p = cmd->ports;
	int i = 0;
	char *s = av;

	while (*s) {
		a = strtoport(av, &s, 0, proto);
		if (s == av) 			/* empty or invalid argument */
			return (0);

		switch (*s) {
		case '-':			/* a range */
			av = s + 1;
			b = strtoport(av, &s, 0, proto);
			/* Reject expressions like '1-abc' or '1-2-3'. */
			if (s == av || (*s != ',' && *s != '\0'))
				return (0);
			break;
		case ',':			/* comma separated list */
		case '\0':
			b = a;
			break;
		default:
			warnx("port list: invalid separator <%c> in <%s>",
				*s, av);
			return (0);
		}

		/* storing this pair would make the length i + 2 */
		if (i + 2 > F_LEN_MASK)
			errx(EX_DATAERR, "too many ports/ranges\n");
		p[0] = a;
		p[1] = b;

		i++;
		p += 2;
		av = s + 1;
	}
	if (i > 0)
		cmd->o.len |= i + 1;	/* leave F_NOT and F_OR untouched */
	return (i);
}

/*
 * Symbolic names of the ICMP "unreachable" codes, used by
 * fill_reject_code() to parse them and by print_reject_code() to
 * print them.
 */
static struct _s_x icmpcodes[] = {
      { "net",			ICMP_UNREACH_NET },
      { "host",			ICMP_UNREACH_HOST },
      { "protocol",		ICMP_UNREACH_PROTOCOL },
      { "port",			ICMP_UNREACH_PORT },
      { "needfrag",		ICMP_UNREACH_NEEDFRAG },
      { "srcfail",		ICMP_UNREACH_SRCFAIL },
      { "net-unknown",		ICMP_UNREACH_NET_UNKNOWN },
      { "host-unknown",		ICMP_UNREACH_HOST_UNKNOWN },
      { "isolated",		ICMP_UNREACH_ISOLATED },
      { "net-prohib",		ICMP_UNREACH_NET_PROHIB },
      { "host-prohib",		ICMP_UNREACH_HOST_PROHIB },
      { "tosnet",		ICMP_UNREACH_TOSNET },
      { "toshost",		ICMP_UNREACH_TOSHOST },
      { "filter-prohib",	ICMP_UNREACH_FILTER_PROHIB },
      { "host-precedence",	ICMP_UNREACH_HOST_PRECEDENCE },
      { "precedence-cutoff",	ICMP_UNREACH_PRECEDENCE_CUTOFF },
      { NULL, 0 }
};

/*
 * Parse an ICMP unreachable code given either as a number below 0x100
 * or as one of the names in icmpcodes[], and store it in *codep.
 * Exits with EX_DATAERR when str is neither.
 */
static void
fill_reject_code(u_short *codep, char *str)
{
	char *end;
	int code;
	int is_number;

	code = strtoul(str, &end, 0);
	/* a usable number consumes the whole string and fits in 8 bits */
	is_number = (end != str && *end == '\0' && code < 0x100);
	if (!is_number)
		code = match_token(icmpcodes, str);

	if (code < 0)
		errx(EX_DATAERR, "unknown ICMP unreachable code ``%s''", str);
	*codep = code;
}

/*
 * Print "unreach <code>", using the name from icmpcodes[] when the
 * code has one and the plain number otherwise.
 */
static void
print_reject_code(uint16_t code)
{
	char const *name = match_value(icmpcodes, code);

	if (name == NULL) {
		printf("unreach %u", code);
		return;
	}
	printf("unreach %s", name);
}

/*
 * Returns the number of leading 1 bits in a contiguous bitmask, or -1
 * if the mask is not contiguous (a 1 bit follows a 0 bit).
 * XXX this needs a proper fix.
 * The mask is read as a byte string, i.e. in big-endian (network)
 * order regardless of the host byte order.
 *
 * First bit is bit 7 of the first byte -- note, for MAC addresses,
 * the first bit on the wire is bit 0 of the first byte.
 * len is the max length in bits.
 */
int
contigmask(uint8_t *p, int len)
{
	int ones = 0;
	int bit;

	/* length of the leading run of 1 bits */
	while (ones < len && (p[ones / 8] & (0x80 >> (ones % 8))) != 0)
		ones++;

	/* bit 'ones' is known to be 0; any later 1 bit breaks contiguity */
	for (bit = ones + 1; bit < len; bit++) {
		if ((p[bit / 8] & (0x80 >> (bit % 8))) != 0)
			return -1;
	}
	return ones;
}

/*
 * Print the flags encoded in cmd->arg1: the low byte holds the bits
 * that must be set, the high byte the bits that must be clear (these
 * are printed with a leading '!').  Names come from 'list', which is
 * scanned up to its first entry with value 0.
 * The f_tcpflags combination "SYN set, ACK clear" is printed as the
 * single keyword "setup".
 */
static void
print_flags(char const *name, ipfw_insn *cmd, struct _s_x *list)
{
	uint8_t set = cmd->arg1 & 0xff;
	uint8_t clear = (cmd->arg1 >> 8) & 0xff;
	char const *sep = "";
	struct _s_x *f;

	if (list == f_tcpflags && set == TH_SYN && clear == TH_ACK) {
		printf(" setup");
		return;
	}

	printf(" %s ", name);
	for (f = list; f->x != 0; f++) {
		/* drop each bit once printed so aliases are not repeated */
		if (set & f->x) {
			set &= ~f->x;
			printf("%s%s", sep, f->s);
			sep = ",";
		}
		if (clear & f->x) {
			clear &= ~f->x;
			printf("%s!%s", sep, f->s);
			sep = ",";
		}
	}
}

/*
 * Print the ip address contained in a command.
 *
 * cmd  the instruction to print; its opcode selects the format:
 *      - O_IP_DST_LOOKUP longer than an ipfw_insn_u32:
 *        "lookup <key> <table>";
 *      - O_IP_SRC_ME / O_IP_DST_ME: "me";
 *      - O_IP_{SRC,DST}_LOOKUP: "table(N[,value])";
 *      - O_IP_{SRC,DST}_SET: "addr/len{bits and ranges}";
 *      - anything else: a comma separated list of addresses, printed as
 *        host name (when co.do_resolv is set and the lookup succeeds),
 *        "any", or a numeric address with an optional "/len" or ":mask".
 * s    keyword printed in front of the address (may be empty).
 */
static void
print_ip(ipfw_insn_ip *cmd, char const *s)
{
	struct hostent *he;
	uint32_t len = F_LEN((ipfw_insn *)cmd);
	uint32_t *a = ((ipfw_insn_u32 *)cmd)->d;

	if (cmd->o.opcode == O_IP_DST_LOOKUP && len > F_INSN_SIZE(ipfw_insn_u32)) {
		uint32_t d = a[1];
		const char *arg = NULL;

		if (d < sizeof(lookup_key)/sizeof(lookup_key[0]))
			arg = match_value(rule_options, lookup_key[d]);
		/*
		 * Out of range index, or the -1 list terminator, for which
		 * match_value() has no name: never hand NULL to "%s".
		 */
		if (arg == NULL)
			arg = "<invalid>";
		printf("%s lookup %s %d", cmd->o.len & F_NOT ? " not": "",
			arg, cmd->o.arg1);
		return;
	}
	printf("%s%s ", cmd->o.len & F_NOT ? " not": "", s);

	if (cmd->o.opcode == O_IP_SRC_ME || cmd->o.opcode == O_IP_DST_ME) {
		printf("me");
		return;
	}
	if (cmd->o.opcode == O_IP_SRC_LOOKUP ||
	    cmd->o.opcode == O_IP_DST_LOOKUP) {
		printf("table(%u", ((ipfw_insn *)cmd)->arg1);
		if (len == F_INSN_SIZE(ipfw_insn_u32))
			printf(",%u", *a);
		printf(")");
		return;
	}
	if (cmd->o.opcode == O_IP_SRC_SET || cmd->o.opcode == O_IP_DST_SET) {
		uint32_t x, *map = (uint32_t *)&(cmd->mask);
		int i, j;
		char comma = '{';

		/* arg1 is the set size; derive the matching prefix mask */
		x = cmd->o.arg1 - 1;
		x = htonl( ~x );
		/* byte-swap for inet_ntoa(), swapped back right below */
		cmd->addr.s_addr = htonl(cmd->addr.s_addr);
		printf("%s/%d", inet_ntoa(cmd->addr),
			contigmask((uint8_t *)&x, 32));
		x = cmd->addr.s_addr = htonl(cmd->addr.s_addr);
		x &= 0xff; /* base */
		/*
		 * Print bits and ranges.
		 * Locate first bit set (i), then locate first bit unset (j).
		 * If we have 3+ consecutive bits set, then print them as a
		 * range, otherwise only print the initial bit and rescan.
		 */
		for (i=0; i < cmd->o.arg1; i++)
			if (map[i/32] & (1<<(i & 31))) {
				for (j=i+1; j < cmd->o.arg1; j++)
					if (!(map[ j/32] & (1<<(j & 31))))
						break;
				printf("%c%d", comma, i+x);
				if (j>i+2) { /* range has at least 3 elements */
					printf("-%d", j-1+x);
					i = j-1;
				}
				comma = ',';
			}
		printf("}");
		return;
	}
	/*
	 * len == 2 indicates a single IP, whereas lists of 1 or more
	 * addr/mask pairs have len = (2n+1). We convert len to n so we
	 * use that to count the number of entries.
	 */
	for (len = len / 2; len > 0; len--, a += 2) {
		int mb =	/* mask length */
		    (cmd->o.opcode == O_IP_SRC || cmd->o.opcode == O_IP_DST) ?
			32 : contigmask((uint8_t *)&(a[1]), 32);

		/*
		 * Start every entry without a resolved name, otherwise the
		 * result of an earlier lookup would be printed again for
		 * entries that are not resolved themselves.
		 */
		he = NULL;
		/*
		 * An AF_INET address is 4 bytes long; sizeof(u_long) is 8
		 * on LP64 systems and makes the lookup fail.
		 */
		if (mb == 32 && co.do_resolv)
			he = gethostbyaddr((char *)&(a[0]),
			    sizeof(struct in_addr), AF_INET);
		if (he != NULL)		/* resolved to name */
			printf("%s", he->h_name);
		else if (mb == 0)	/* any */
			printf("any");
		else {		/* numeric IP followed by some kind of mask */
			printf("%s", inet_ntoa( *((struct in_addr *)&a[0]) ) );
			if (mb < 0)
				printf(":%s", inet_ntoa( *((struct in_addr *)&a[1]) ) );
			else if (mb < 32)
				printf("/%d", mb);
		}
		if (len > 1)
			printf(",");
	}
}

/*
 * Print a MAC address/mask pair (6 bytes each).
 * An all-zero mask prints " any"; otherwise the address is printed,
 * followed by "&mask" for a non-contiguous mask, by "/len" for a
 * contiguous mask shorter than 48 bits, and by nothing for a full mask.
 */
static void
print_mac(uint8_t *addr, uint8_t *mask)
{
	int bits = contigmask(mask, 48);

	if (bits == 0) {
		printf(" any");
		return;
	}

	printf(" %02x:%02x:%02x:%02x:%02x:%02x",
	    addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]);
	if (bits == -1) {
		printf("&%02x:%02x:%02x:%02x:%02x:%02x",
		    mask[0], mask[1], mask[2],
		    mask[3], mask[4], mask[5]);
	} else if (bits < 48) {
		printf("/%d", bits);
	}
}

/*
 * Build an O_ICMPTYPE instruction from a comma separated list of ICMP
 * type numbers: bit N of cmd->d[0] is set for every type N listed.
 * Only types 0..31 can be represented.
 *
 * Exits with EX_DATAERR on a malformed list or an out-of-range type.
 * The value is range-checked before it is narrowed, so e.g. 256 is
 * rejected instead of being silently taken as 0, and an empty item
 * (such as the one after a trailing comma) is rejected instead of
 * silently adding type 0.
 */
static void
fill_icmptypes(ipfw_insn_u32 *cmd, char *av)
{
	unsigned long type;
	char *end;

	cmd->d[0] = 0;
	while (*av) {
		if (*av == ',')
			av++;

		type = strtoul(av, &end, 0);

		/* no digits at all, or junk right after the number */
		if (end == av || (*end != ',' && *end != '\0'))
			errx(EX_DATAERR, "invalid ICMP type");

		if (type > 31)
			errx(EX_DATAERR, "ICMP type out of range");

		/* unsigned shift: 1 << 31 would overflow a signed int */
		cmd->d[0] |= 1U << type;
		av = end;
	}
	cmd->o.opcode = O_ICMPTYPE;
	cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32);
}

/*
 * Print " icmptypes" followed by the comma separated numbers of the
 * bits set in cmd->d[0] (bit N stands for ICMP type N).
 */
static void
print_icmptypes(ipfw_insn_u32 *cmd)
{
	char sep = ' ';
	int type = 0;

	printf(" icmptypes");
	while (type < 32) {
		if ((cmd->d[0] >> type) & 1) {
			printf("%c%d", sep, type);
			sep = ',';
		}
		type++;
	}
}

/*
 * show_ipfw() prints the body of an ipfw rule.
 * Because the standard rule has at least proto src_ip dst_ip, we use
 * a helper function to produce these entries if not provided explicitly.
 * The first argument is the list of fields we have, the second is
 * the list of fields we want to be printed.
 *
 * Special cases if we have provided a MAC header:
 *   + if the rule does not contain IP addresses/ports, do not print them;
 *   + if the rule does not contain an IP proto, print "all" instead of "ip";
 *
 * Once we have 'have_options', IP header fields are printed as options.
 */
#define	HAVE_PROTO	0x0001
#define	HAVE_SRCIP	0x0002
#define	HAVE_DSTIP	0x0004
#define	HAVE_PROTO4	0x0008
#define	HAVE_PROTO6	0x0010
#define	HAVE_IP		0x0100
#define	HAVE_OPTIONS	0x8000

/*
 * Print the default text for every field in 'want' that has not been
 * produced yet, then record all of 'want' in *flags.
 *
 * flags  in/out set of HAVE_* bits describing what was already printed.
 * want   HAVE_* bits that must be present before the caller goes on.
 * cmd    unused.
 *
 * Nothing is printed once HAVE_OPTIONS is set (HAVE_IP implies it).
 * In comment-only mode the function returns without touching *flags.
 */
static void
show_prerequisites(int *flags, int want, int cmd)
{
	int missing;

	(void)cmd;	/* UNUSED */
	if (co.comment_only)
		return;
	if ((*flags & HAVE_IP) == HAVE_IP)
		*flags |= HAVE_OPTIONS;

	if ((*flags & HAVE_OPTIONS) == 0) {
		/* wanted fields that have not been printed so far */
		missing = want & ~*flags;

		if (missing & HAVE_PROTO) {
			const char *proto = " ip";

			if (*flags & HAVE_PROTO4)
				proto = " ip4";
			else if (*flags & HAVE_PROTO6)
				proto = " ip6";
			printf("%s", proto);
		}
		if (missing & HAVE_SRCIP)
			printf(" from any");
		if (missing & HAVE_DSTIP)
			printf(" to any");
	}
	*flags |= want;
}

static void
show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth)
{
	static int twidth = 0;
	int l;
	ipfw_insn *cmd, *tagptr = NULL;
	const char *comment = NULL;	/* ptr to comment if we have one */
	int proto = 0;		/* default */
	int flags = 0;	/* prerequisites */
	ipfw_insn_log *logptr = NULL; /* set if we find an O_LOG */
	ipfw_insn_altq *altqptr = NULL; /* set if we find an O_ALTQ */
	int or_block = 0;	/* we are in an or block */
	uint32_t set_disable;

	bcopy(&rule->next_rule, &set_disable, sizeof(set_disable));

	if (set_disable & (1 << rule->set)) { /* disabled */
		if (!co.show_sets)
			return;
		else
			printf("# DISABLED ");
	}
	printf("%05u ", rule->rulenum);

	if (pcwidth > 0 || bcwidth > 0) {
		pr_u64(&rule->pcnt, pcwidth);
		pr_u64(&rule->bcnt, bcwidth);
	}

	if (co.do_time == 2)
		printf("%10u ", rule->timestamp);
	else if (co.do_time == 1) {
		char timestr[30];
		time_t t = (time_t)0;

		if (twidth == 0) {
			strcpy(timestr, ctime(&t));
			*strchr(timestr, '\n') = '\0';
			twidth = strlen(timestr);
		}
		if (rule->timestamp) {
			t = _long_to_time(rule->timestamp);

			strcpy(timestr, ctime(&t));
			*strchr(timestr, '\n') = '\0';
			printf("%s ", timestr);
		} else {
			printf("%*s", twidth, " ");
		}
	}

	if (co.show_sets)
		printf("set %d ", rule->set);

	/*
	 * print the optional "match probability"
	 */
	if (rule->cmd_len > 0) {
		cmd = rule->cmd ;
		if (cmd->opcode == O_PROB) {
			ipfw_insn_u32 *p = (ipfw_insn_u32 *)cmd;
			double d = 1.0 * p->d[0];

			d = (d / 0x7fffffff);
			printf("prob %f ", d);
		}
	}

	/*
	 * first print actions
	 */
	for (l = rule->cmd_len - rule->act_ofs, cmd = ACTION_PTR(rule);
			l > 0 ; l -= F_LEN(cmd), cmd += F_LEN(cmd)) {
		switch(cmd->opcode) {
		case O_CHECK_STATE:
			printf("check-state");
			/* avoid printing anything else */
			flags = HAVE_PROTO | HAVE_SRCIP |
				HAVE_DSTIP | HAVE_IP;
			break;

		case O_ACCEPT:
			printf("allow");
			break;

		case O_COUNT:
			printf("count");
			break;

		case O_DENY:
			printf("deny");
			break;

		case O_REJECT:
			if (cmd->arg1 == ICMP_REJECT_RST)
				printf("reset");
			else if (cmd->arg1 == ICMP_UNREACH_HOST)
				printf("reject");
			else
				print_reject_code(cmd->arg1);
			break;

		case O_UNREACH6:
			if (cmd->arg1 == ICMP6_UNREACH_RST)
				printf("reset6");
			else
				print_unreach6_code(cmd->arg1);
			break;

		case O_SKIPTO:
			PRINT_UINT_ARG("skipto ", cmd->arg1);
			break;

		case O_PIPE:
			PRINT_UINT_ARG("pipe ", cmd->arg1);
			break;

		case O_QUEUE:
			PRINT_UINT_ARG("queue ", cmd->arg1);
			break;

		case O_DIVERT:
			PRINT_UINT_ARG("divert ", cmd->arg1);
			break;

		case O_TEE:
			PRINT_UINT_ARG("tee ", cmd->arg1);
			break;

		case O_NETGRAPH:
			PRINT_UINT_ARG("netgraph ", cmd->arg1);
			break;

		case O_NGTEE:
			PRINT_UINT_ARG("ngtee ", cmd->arg1);
			break;

		case O_FORWARD_IP:
		    {
			ipfw_insn_sa *s = (ipfw_insn_sa *)cmd;

			if (s->sa.sin_addr.s_addr == INADDR_ANY) {
				printf("fwd tablearg");
			} else {
				printf("fwd %s", inet_ntoa(s->sa.sin_addr));
			}
			if (s->sa.sin_port)
				printf(",%d", s->sa.sin_port);
		    }
			break;

#if 0 // XXX unused yet
		case O_FORWARD_IP6:
		    {
			char buf[4 + INET6_ADDRSTRLEN + 1];
			ipfw_insn_sa6 *s = (ipfw_insn_sa6 *)cmd;

			printf("fwd %s", inet_ntop(AF_INET6, &s->sa.sin6_addr,
			    buf, sizeof(buf)));
			if (s->sa.sin6_port)
				printf(",%d", s->sa.sin6_port);
		    }
			break;
#endif // XXX unused yet


		case O_LOG: /* O_LOG is printed last */
			logptr = (ipfw_insn_log *)cmd;
			break;

		case O_ALTQ: /* O_ALTQ is printed after O_LOG */
			altqptr = (ipfw_insn_altq *)cmd;
			break;

		case O_TAG:
			tagptr = cmd;
			break;

		case O_NAT:
			if (cmd->arg1 != 0)
				PRINT_UINT_ARG("nat ", cmd->arg1);
			else
				printf("nat global");
			break;

		case O_SETFIB:
			PRINT_UINT_ARG("setfib ", cmd->arg1);
			break;

		case O_REASS:
			printf("reass");
			break;

		case O_CALLRETURN:
			if (cmd->len & F_NOT)
				printf("return");
			else
				PRINT_UINT_ARG("call ", cmd->arg1);
			break;

		default:
			printf("** unrecognized action %d len %d ",
				cmd->opcode, cmd->len);
		}
	}
	if (logptr) {
		if (logptr->max_log > 0)
			printf(" log logamount %d", logptr->max_log);
		else
			printf(" log");
	}
#ifndef NO_ALTQ
	if (altqptr) {
		print_altq_cmd(altqptr);
	}
#endif
	if (tagptr) {
		if (tagptr->len & F_NOT)
			PRINT_UINT_ARG(" untag ", tagptr->arg1);
		else
			PRINT_UINT_ARG(" tag ", tagptr->arg1);
	}

	/*
	 * then print the body.
	 */
	for (l = rule->act_ofs, cmd = rule->cmd ;
			l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) {
		if ((cmd->len & F_OR) || (cmd->len & F_NOT))
			continue;
		if (cmd->opcode == O_IP4) {
			flags |= HAVE_PROTO4;
			break;
		} else if (cmd->opcode == O_IP6) {
			flags |= HAVE_PROTO6;
			break;
		}
	}
	if (rule->_pad & 1) {	/* empty rules before options */
		if (!co.do_compact) {
			show_prerequisites(&flags, HAVE_PROTO, 0);
			printf(" from any to any");
		}
		flags |= HAVE_IP | HAVE_OPTIONS | HAVE_PROTO |
			 HAVE_SRCIP | HAVE_DSTIP;
	}

	if (co.comment_only)
		comment = "...";

	for (l = rule->act_ofs, cmd = rule->cmd ;
			l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) {
		/* useful alias */
		ipfw_insn_u32 *cmd32 = (ipfw_insn_u32 *)cmd;

		if (co.comment_only) {
			if (cmd->opcode != O_NOP)
				continue;
			printf(" // %s\n", (char *)(cmd + 1));
			return;
		}

		show_prerequisites(&flags, 0, cmd->opcode);

		switch(cmd->opcode) {
		case O_PROB:
			break;	/* done already */

		case O_PROBE_STATE:
			break; /* no need to print anything here */

		case O_IP_SRC:
		case O_IP_SRC_LOOKUP:
		case O_IP_SRC_MASK:
		case O_IP_SRC_ME:
		case O_IP_SRC_SET:
			show_prerequisites(&flags, HAVE_PROTO, 0);
			if (!(flags & HAVE_SRCIP))
				printf(" from");
			if ((cmd->len & F_OR) && !or_block)
				printf(" {");
			print_ip((ipfw_insn_ip *)cmd,
				(flags & HAVE_OPTIONS) ? " src-ip" : "");
			flags |= HAVE_SRCIP;
			break;

		case O_IP_DST:
		case O_IP_DST_LOOKUP:
		case O_IP_DST_MASK:
		case O_IP_DST_ME:
		case O_IP_DST_SET:
			show_prerequisites(&flags, HAVE_PROTO|HAVE_SRCIP, 0);
			if (!(flags & HAVE_DSTIP))
				printf(" to");
			if ((cmd->len & F_OR) && !or_block)
				printf(" {");
			print_ip((ipfw_insn_ip *)cmd,
				(flags & HAVE_OPTIONS) ? " dst-ip" : "");
			flags |= HAVE_DSTIP;
			break;

		case O_IP6_SRC:
		case O_IP6_SRC_MASK:
		case O_IP6_SRC_ME:
			show_prerequisites(&flags, HAVE_PROTO, 0);
			if (!(flags & HAVE_SRCIP))
				printf(" from");
			if ((cmd->len & F_OR) && !or_block)
				printf(" {");
			print_ip6((ipfw_insn_ip6 *)cmd,
			    (flags & HAVE_OPTIONS) ? " src-ip6" : "");
			flags |= HAVE_SRCIP | HAVE_PROTO;
			break;

		case O_IP6_DST:
		case O_IP6_DST_MASK:
		case O_IP6_DST_ME:
			show_prerequisites(&flags, HAVE_PROTO|HAVE_SRCIP, 0);
			if (!(flags & HAVE_DSTIP))
				printf(" to");
			if ((cmd->len & F_OR) && !or_block)
				printf(" {");
			print_ip6((ipfw_insn_ip6 *)cmd,
			    (flags & HAVE_OPTIONS) ? " dst-ip6" : "");
			flags |= HAVE_DSTIP;
			break;

		case O_FLOW6ID:
		print_flow6id( (ipfw_insn_u32 *) cmd );
		flags |= HAVE_OPTIONS;
		break;

		case O_IP_DSTPORT:
			show_prerequisites(&flags,
				HAVE_PROTO | HAVE_SRCIP |
				HAVE_DSTIP | HAVE_IP, 0);
		case O_IP_SRCPORT:
			if (flags & HAVE_DSTIP)
				flags |= HAVE_IP;
			show_prerequisites(&flags,
				HAVE_PROTO | HAVE_SRCIP, 0);
			if ((cmd->len & F_OR) && !or_block)
				printf(" {");
			if (cmd->len & F_NOT)
				printf(" not");
			print_newports((ipfw_insn_u16 *)cmd, proto,
				(flags & HAVE_OPTIONS) ? cmd->opcode : 0);
			break;

		case O_PROTO: {
			struct protoent *pe = NULL;

			if ((cmd->len & F_OR) && !or_block)
				printf(" {");
			if (cmd->len & F_NOT)
				printf(" not");
			proto = cmd->arg1;
			pe = getprotobynumber(cmd->arg1);
			if ((flags & (HAVE_PROTO4 | HAVE_PROTO6)) &&
			    !(flags & HAVE_PROTO))
				show_prerequisites(&flags,
				    HAVE_PROTO | HAVE_IP | HAVE_SRCIP |
				    HAVE_DSTIP | HAVE_OPTIONS, 0);
			if (flags & HAVE_OPTIONS)
				printf(" proto");
			if (pe)
				printf(" %s", pe->p_name);
			else
				printf(" %u", cmd->arg1);
			}
			flags |= HAVE_PROTO;
			break;

		default: /*options ... */
			if (!(cmd->len & (F_OR|F_NOT)))
				if (((cmd->opcode == O_IP6) &&
				    (flags & HAVE_PROTO6)) ||
				    ((cmd->opcode == O_IP4) &&
				    (flags & HAVE_PROTO4)))
					break;
			show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP |
				    HAVE_DSTIP | HAVE_IP | HAVE_OPTIONS, 0);
			if ((cmd->len & F_OR) && !or_block)
				printf(" {");
			if (cmd->len & F_NOT && cmd->opcode != O_IN)
				printf(" not");
			switch(cmd->opcode) {
			case O_MACADDR2: {
				ipfw_insn_mac *m = (ipfw_insn_mac *)cmd;

				printf(" MAC");
				print_mac(m->addr, m->mask);
				print_mac(m->addr + 6, m->mask + 6);
				}
				break;

			case O_MAC_TYPE:
				print_newports((ipfw_insn_u16 *)cmd,
						IPPROTO_ETHERTYPE, cmd->opcode);
				break;


			case O_FRAG:
				printf(" frag");
				break;

			case O_FIB:
				printf(" fib %u", cmd->arg1 );
				break;
			case O_SOCKARG:
				printf(" sockarg");
				break;

			case O_IN:
				printf(cmd->len & F_NOT ? " out" : " in");
				break;

			case O_DIVERTED:
				switch (cmd->arg1) {
				case 3:
					printf(" diverted");
					break;
				case 1:
					printf(" diverted-loopback");
					break;
				case 2:
					printf(" diverted-output");
					break;
				default:
					printf(" diverted-?<%u>", cmd->arg1);
					break;
				}
				break;

			case O_LAYER2:
				printf(" layer2");
				break;
			case O_XMIT:
			case O_RECV:
			case O_VIA:
			    {
				char const *s;
				ipfw_insn_if *cmdif = (ipfw_insn_if *)cmd;

				if (cmd->opcode == O_XMIT)
					s = "xmit";
				else if (cmd->opcode == O_RECV)
					s = "recv";
				else /* if (cmd->opcode == O_VIA) */
					s = "via";
				if (cmdif->name[0] == '\0')
					printf(" %s %s", s,
					    inet_ntoa(cmdif->p.ip));
				else
					printf(" %s %s", s, cmdif->name);

				break;
			    }
			case O_IPID:
				if (F_LEN(cmd) == 1)
				    printf(" ipid %u", cmd->arg1 );
				else
				    print_newports((ipfw_insn_u16 *)cmd, 0,
					O_IPID);
				break;

			case O_IPTTL:
				if (F_LEN(cmd) == 1)
				    printf(" ipttl %u", cmd->arg1 );
				else
				    print_newports((ipfw_insn_u16 *)cmd, 0,
					O_IPTTL);
				break;

			case O_IPVER:
				printf(" ipver %u", cmd->arg1 );
				break;

			case O_IPPRECEDENCE:
				printf(" ipprecedence %u", (cmd->arg1) >> 5 );
				break;

			case O_IPLEN:
				if (F_LEN(cmd) == 1)
				    printf(" iplen %u", cmd->arg1 );
				else
				    print_newports((ipfw_insn_u16 *)cmd, 0,
					O_IPLEN);
				break;

			case O_IPOPT:
				print_flags("ipoptions", cmd, f_ipopts);
				break;

			case O_IPTOS:
				print_flags("iptos", cmd, f_iptos);
				break;

			case O_ICMPTYPE:
				print_icmptypes((ipfw_insn_u32 *)cmd);
				break;

			case O_ESTAB:
				printf(" established");
				break;

			case O_TCPDATALEN:
				if (F_LEN(cmd) == 1)
				    printf(" tcpdatalen %u", cmd->arg1 );
				else
				    print_newports((ipfw_insn_u16 *)cmd, 0,
					O_TCPDATALEN);
				break;

			case O_TCPFLAGS:
				print_flags("tcpflags", cmd, f_tcpflags);
				break;

			case O_TCPOPTS:
				print_flags("tcpoptions", cmd, f_tcpopts);
				break;

			case O_TCPWIN:
				printf(" tcpwin %d", ntohs(cmd->arg1));
				break;

			case O_TCPACK:
				printf(" tcpack %d", ntohl(cmd32->d[0]));
				break;

			case O_TCPSEQ:
				printf(" tcpseq %d", ntohl(cmd32->d[0]));
				break;

			case O_UID:
			    {
				struct passwd *pwd = getpwuid(cmd32->d[0]);

				if (pwd)
					printf(" uid %s", pwd->pw_name);
				else
					printf(" uid %u", cmd32->d[0]);
			    }
				break;

			case O_GID:
			    {
				struct group *grp = getgrgid(cmd32->d[0]);

				if (grp)
					printf(" gid %s", grp->gr_name);
				else
					printf(" gid %u", cmd32->d[0]);
			    }
				break;

			case O_JAIL:
				printf(" jail %d", cmd32->d[0]);
				break;

			case O_VERREVPATH:
				printf(" verrevpath");
				break;

			case O_VERSRCREACH:
				printf(" versrcreach");
				break;

			case O_ANTISPOOF:
				printf(" antispoof");
				break;

			case O_IPSEC:
				printf(" ipsec");
				break;

			case O_NOP:
				comment = (char *)(cmd + 1);
				break;

			case O_KEEP_STATE:
				printf(" keep-state");
				break;

			case O_LIMIT: {
				struct _s_x *p = limit_masks;
				ipfw_insn_limit *c = (ipfw_insn_limit *)cmd;
				uint8_t x = c->limit_mask;
				char const *comma = " ";

				printf(" limit");
				for (; p->x != 0 ; p++)
					if ((x & p->x) == p->x) {
						x &= ~p->x;
						printf("%s%s", comma, p->s);
						comma = ",";
					}
				PRINT_UINT_ARG(" ", c->conn_limit);
				break;
			}

			case O_IP6:
				printf(" ip6");
				break;

			case O_IP4:
				printf(" ip4");
				break;

			case O_ICMP6TYPE:
				print_icmp6types((ipfw_insn_u32 *)cmd);
				break;

			case O_EXT_HDR:
				print_ext6hdr( (ipfw_insn *) cmd );
				break;

			case O_TAGGED:
				if (F_LEN(cmd) == 1)
					PRINT_UINT_ARG(" tagged ", cmd->arg1);
				else
					print_newports((ipfw_insn_u16 *)cmd, 0,
					    O_TAGGED);
				break;

			default:
				printf(" [opcode %d len %d]",
				    cmd->opcode, cmd->len);
			}
		}
		if (cmd->len & F_OR) {
			printf(" or");
			or_block = 1;
		} else if (or_block) {
			printf(" }");
			or_block = 0;
		}
	}
	show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP
					      | HAVE_IP, 0);
	if (comment)
		printf(" // %s", comment);
	printf("\n");
}

/*
 * Print one dynamic rule on a single line.
 *
 * d        dynamic rule as returned by the kernel; the first 16 bits of
 *          d->rule hold the number of the parent static rule.
 * pcwidth  column width for the packet counter.
 * bcwidth  column width for the byte counter.
 *
 * Counters and the expire time are printed only when at least one of the
 * two widths is positive.  Expired entries are skipped unless
 * co.do_expired is set; O_LIMIT_PARENT entries are always shown.
 */
static void
show_dyn_ipfw(ipfw_dyn_rule *d, int pcwidth, int bcwidth)
{
	struct protoent *pe;
	struct in_addr a;
	uint16_t rulenum;
	char buf[INET6_ADDRSTRLEN];

	if (!co.do_expired) {
		if (!d->expire && !(d->dyn_type == O_LIMIT_PARENT))
			return;
	}
	/* d->rule is not a real pointer here: copy the rule number out */
	bcopy(&d->rule, &rulenum, sizeof(rulenum));
	printf("%05d", rulenum);
	if (pcwidth > 0 || bcwidth > 0) {
		printf(" ");
		pr_u64(&d->pcnt, pcwidth);
		pr_u64(&d->bcnt, bcwidth);
		printf("(%ds)", d->expire);
	}
	switch (d->dyn_type) {
	case O_LIMIT_PARENT:
		printf(" PARENT %d", d->count);
		break;
	case O_LIMIT:
		printf(" LIMIT");
		break;
	case O_KEEP_STATE: /* bidir, no mask */
		printf(" STATE");
		break;
	}

	if ((pe = getprotobynumber(d->id.proto)) != NULL)
		printf(" %s", pe->p_name);
	else
		printf(" proto %u", d->id.proto);

	if (d->id.addr_type == 4) {
		/* inet_ntoa() uses a static buffer: one call per printf */
		a.s_addr = htonl(d->id.src_ip);
		printf(" %s %d", inet_ntoa(a), d->id.src_port);

		a.s_addr = htonl(d->id.dst_ip);
		printf(" <-> %s %d", inet_ntoa(a), d->id.dst_port);
	} else if (d->id.addr_type == 6) {
		printf(" %s %d", inet_ntop(AF_INET6, &d->id.src_ip6, buf,
		    sizeof(buf)), d->id.src_port);
		printf(" <-> %s %d", inet_ntop(AF_INET6, &d->id.dst_ip6, buf,
		    sizeof(buf)), d->id.dst_port);
	} else
		/* the common newline below terminates this line as well */
		printf(" UNKNOWN <-> UNKNOWN");

	printf("\n");
}

/*
 * This one handles all set-related commands
 * 	ipfw set { show | enable | disable }
 * 	ipfw set swap X Y
 * 	ipfw set move X to Y
 * 	ipfw set move rule X to Y
 *
 * av[0] is the "set" keyword itself, av[1] the sub-command.
 * Swap and move are sent to the kernel through IP_FW_DEL as a single
 * 32-bit word: (command << 24) | (destination set << 16) | source,
 * with command 4 = swap, 3 = move set, 2 = move rule.
 * Enable/disable pass two bitmasks: masks[0] = sets to disable,
 * masks[1] = sets to enable.
 */
void
ipfw_sets_handler(char *av[])
{
	uint32_t set_disable, masks[2];
	int i, nbytes;
	uint16_t rulenum;
	uint8_t cmd, new_set;

	av++;

	if (av[0] == NULL)
		errx(EX_USAGE, "set needs command");
	if (_substrcmp(*av, "show") == 0) {
		void *data = NULL;
		char const *msg;
		int nalloc;

		/* grow the buffer until the whole ruleset fits */
		nalloc = nbytes = sizeof(struct ip_fw);
		while (nbytes >= nalloc) {
			if (data)
				free(data);
			nalloc = nalloc * 2 + 200;
			nbytes = nalloc;
			data = safe_calloc(1, nbytes);
			if (do_cmd(IP_FW_GET, data, (uintptr_t)&nbytes) < 0)
				err(EX_OSERR, "getsockopt(IP_FW_GET)");
		}

		/* the disabled-set mask is read from next_rule of rule 0 */
		bcopy(&((struct ip_fw *)data)->next_rule,
			&set_disable, sizeof(set_disable));

		for (i = 0, msg = "disable" ; i < RESVD_SET; i++)
			if ((set_disable & (1U << i))) {
				printf("%s %d", msg, i);
				msg = "";
			}
		msg = (set_disable) ? " enable" : "enable";
		for (i = 0; i < RESVD_SET; i++)
			if (!(set_disable & (1U << i))) {
				printf("%s %d", msg, i);
				msg = "";
			}
		printf("\n");
		free(data);	/* was leaked */
	} else if (_substrcmp(*av, "swap") == 0) {
		av++;
		if ( av[0] == NULL || av[1] == NULL )
			errx(EX_USAGE, "set swap needs 2 set numbers\n");
		rulenum = atoi(av[0]);
		new_set = atoi(av[1]);
		if (!isdigit((unsigned char)*(av[0])) || rulenum > RESVD_SET)
			errx(EX_DATAERR, "invalid set number %s\n", av[0]);
		if (!isdigit((unsigned char)*(av[1])) || new_set > RESVD_SET)
			errx(EX_DATAERR, "invalid set number %s\n", av[1]);
		masks[0] = (4 << 24) | (new_set << 16) | (rulenum);
		i = do_cmd(IP_FW_DEL, masks, sizeof(uint32_t));
	} else if (_substrcmp(*av, "move") == 0) {
		av++;
		if (av[0] && _substrcmp(*av, "rule") == 0) {
			cmd = 2;
			av++;
		} else
			cmd = 3;
		if (av[0] == NULL || av[1] == NULL || av[2] == NULL ||
				av[3] != NULL ||  _substrcmp(av[1], "to") != 0)
			errx(EX_USAGE, "syntax: set move [rule] X to Y\n");
		rulenum = atoi(av[0]);
		new_set = atoi(av[2]);
		if (!isdigit((unsigned char)*(av[0])) ||
			(cmd == 3 && rulenum > RESVD_SET) ||
			(cmd == 2 && rulenum == IPFW_DEFAULT_RULE) )
			errx(EX_DATAERR, "invalid source number %s\n", av[0]);
		/* the destination is av[2]; av[1] is the literal "to" */
		if (!isdigit((unsigned char)*(av[2])) || new_set > RESVD_SET)
			errx(EX_DATAERR, "invalid dest. set %s\n", av[2]);
		masks[0] = (cmd << 24) | (new_set << 16) | (rulenum);
		i = do_cmd(IP_FW_DEL, masks, sizeof(uint32_t));
	} else if (_substrcmp(*av, "disable") == 0 ||
		   _substrcmp(*av, "enable") == 0 ) {
		int which = _substrcmp(*av, "enable") == 0 ? 1 : 0;

		av++;
		masks[0] = masks[1] = 0;

		/* "enable"/"disable" may be interleaved with set numbers */
		while (av[0]) {
			if (isdigit((unsigned char)**av)) {
				i = atoi(*av);
				if (i < 0 || i > RESVD_SET)
					errx(EX_DATAERR,
					    "invalid set number %d\n", i);
				/* unsigned shift: i may be 31 */
				masks[which] |= (1U << i);
			} else if (_substrcmp(*av, "disable") == 0)
				which = 0;
			else if (_substrcmp(*av, "enable") == 0)
				which = 1;
			else
				errx(EX_DATAERR,
					"invalid set command %s\n", *av);
			av++;
		}
		if ( (masks[0] & masks[1]) != 0 )
			errx(EX_DATAERR,
			    "cannot enable and disable the same set\n");

		i = do_cmd(IP_FW_DEL, masks, sizeof(masks));
		if (i)
			warn("set enable/disable: setsockopt(IP_FW_DEL)");
	} else
		errx(EX_USAGE, "invalid set command %s\n", *av);
}

/*
 * Handle "ipfw enable|disable <keyword>".
 *
 * av[1] is the keyword; `which` is the value written to the matching
 * sysctl node(s) (or handed to altq_set_enabled() for "altq").
 * A missing or unknown keyword only produces a warning.
 */
void
ipfw_sysctl_handler(char *av[], int which)
{
	/* keyword -> sysctl node(s); an unused second slot is NULL */
	static const struct {
		const char *keyword;
		const char *node[2];
	} knobs[] = {
		{ "firewall",
		    { "net.inet.ip.fw.enable", "net.inet6.ip6.fw.enable" } },
		{ "one_pass",	   { "net.inet.ip.fw.one_pass", NULL } },
		{ "debug",	   { "net.inet.ip.fw.debug", NULL } },
		{ "verbose",	   { "net.inet.ip.fw.verbose", NULL } },
		{ "dyn_keepalive", { "net.inet.ip.fw.dyn_keepalive", NULL } },
	};
	const int nknobs = sizeof(knobs) / sizeof(knobs[0]);
	int k, j;

	av++;

	if (av[0] == NULL) {
		warnx("missing keyword to enable/disable\n");
		return;
	}

	/* keywords are tried in table order, first match wins */
	for (k = 0; k < nknobs; k++) {
		if (_substrcmp(*av, knobs[k].keyword) != 0)
			continue;
		for (j = 0; j < 2 && knobs[k].node[j] != NULL; j++)
			sysctlbyname(knobs[k].node[j], NULL, 0,
			    &which, sizeof(which));
		return;
	}

#ifndef NO_ALTQ
	if (_substrcmp(*av, "altq") == 0) {
		altq_set_enabled(which);
		return;
	}
#endif
	warnx("unrecognize enable/disable keyword: %s\n", *av);
}

/*
 * List static (and optionally dynamic) firewall rules.
 *
 * ac/av         command line; av[0] is the list/show keyword, the
 *               remaining words are rule numbers or "first-last" ranges.
 *               With no numbers every rule is listed.
 * show_counters when non-zero, counter column widths are computed first
 *               and handed to the per-rule printers.
 *
 * Pipe listings are delegated to dummynet_list().  On a bad or unknown
 * rule number the function keeps going, then exits with EX_USAGE or
 * EX_UNAVAILABLE after printing what it could.
 */
void
ipfw_list(int ac, char *av[], int show_counters)
{
	struct ip_fw *r;
	ipfw_dyn_rule *dynrules, *d;

#define NEXT(r)	((struct ip_fw *)((char *)r + RULESIZE(r)))
	char *lim;
	void *data = NULL;
	int bcwidth, n, nbytes, nstat, ndyn, pcwidth, width;
	int exitval = EX_OK;
	int lac;
	char **lav;
	u_long rnum, last;
	char *endptr;
	int seen = 0;
	uint8_t set;

	const int ocmd = co.do_pipe ? IP_DUMMYNET_GET : IP_FW_GET;
	int nalloc = 1024;	/* start somewhere... */

	last = 0;

	if (co.test_only) {
		fprintf(stderr, "Testing only, list disabled\n");
		return;
	}
	if (co.do_pipe) {
		dummynet_list(ac, av, show_counters);
		return;
	}

	ac--;
	av++;

	/* get rules or pipes from kernel, resizing array as necessary */
	nbytes = nalloc;

	while (nbytes >= nalloc) {
		nalloc = nalloc * 2 + 200;
		nbytes = nalloc;
		data = safe_realloc(data, nbytes);
		if (do_cmd(ocmd, data, (uintptr_t)&nbytes) < 0)
			err(EX_OSERR, "getsockopt(IP_%s_GET)",
				co.do_pipe ? "DUMMYNET" : "FW");
	}

	/*
	 * Count static rules. They have variable size so we
	 * need to scan the list to count them.
	 * The bounds check comes first so that we never read a rule
	 * header located past the end of the returned data.
	 */
	for (nstat = 1, r = data, lim = (char *)data + nbytes;
		    (char *)r < lim && r->rulenum < IPFW_DEFAULT_RULE;
		    ++nstat, r = NEXT(r) )
		; /* nothing */

	/*
	 * Count dynamic rules. This is easier as they have
	 * fixed size.
	 */
	if ((char *)r < lim) {
		/* r is the default rule; dynamic rules follow it */
		r = NEXT(r);
		dynrules = (ipfw_dyn_rule *)r ;
		n = (char *)r - (char *)data;
		ndyn = (nbytes - n) / sizeof *dynrules;
	} else {
		/*
		 * Ran off the buffer without meeting the default rule:
		 * nstat counted one rule too many and there is no
		 * dynamic section to look at.
		 */
		nstat--;
		dynrules = NULL;
		ndyn = 0;
	}

	/* if showing stats, figure out column widths ahead of time */
	bcwidth = pcwidth = 0;
	if (show_counters) {
		for (n = 0, r = data; n < nstat; n++, r = NEXT(r)) {
			/* skip rules from another set */
			if (co.use_set && r->set != co.use_set - 1)
				continue;

			/* packet counter */
			width = pr_u64(&r->pcnt, 0);
			if (width > pcwidth)
				pcwidth = width;

			/* byte counter */
			width = pr_u64(&r->bcnt, 0);
			if (width > bcwidth)
				bcwidth = width;
		}
	}
	if (co.do_dynamic && ndyn) {
		for (n = 0, d = dynrules; n < ndyn; n++, d++) {
			if (co.use_set) {
				/* skip rules from another set */
				bcopy((char *)&d->rule + sizeof(uint16_t),
				      &set, sizeof(uint8_t));
				if (set != co.use_set - 1)
					continue;
			}
			width = pr_u64(&d->pcnt, 0);
			if (width > pcwidth)
				pcwidth = width;

			width = pr_u64(&d->bcnt, 0);
			if (width > bcwidth)
				bcwidth = width;
		}
	}
	/* if no rule numbers were specified, list all rules */
	if (ac == 0) {
		for (n = 0, r = data; n < nstat; n++, r = NEXT(r)) {
			if (co.use_set && r->set != co.use_set - 1)
				continue;
			show_ipfw(r, pcwidth, bcwidth);
		}

		if (co.do_dynamic && ndyn) {
			printf("## Dynamic rules (%d):\n", ndyn);
			for (n = 0, d = dynrules; n < ndyn; n++, d++) {
				if (co.use_set) {
					bcopy((char *)&d->rule + sizeof(uint16_t),
					      &set, sizeof(uint8_t));
					if (set != co.use_set - 1)
						continue;
				}
				show_dyn_ipfw(d, pcwidth, bcwidth);
			}
		}
		goto done;
	}

	/* display specific rules requested on command line */

	for (lac = ac, lav = av; lac != 0; lac--) {
		/* convert command line rule # */
		last = rnum = strtoul(*lav++, &endptr, 10);
		if (*endptr == '-')
			last = strtoul(endptr+1, &endptr, 10);
		if (*endptr) {
			exitval = EX_USAGE;
			warnx("invalid rule number: %s", *(lav - 1));
			continue;
		}
		for (n = seen = 0, r = data; n < nstat; n++, r = NEXT(r) ) {
			if (r->rulenum > last)
				break;
			if (co.use_set && r->set != co.use_set - 1)
				continue;
			if (r->rulenum >= rnum && r->rulenum <= last) {
				show_ipfw(r, pcwidth, bcwidth);
				seen = 1;
			}
		}
		if (!seen) {
			/* give precedence to other error(s) */
			if (exitval == EX_OK)
				exitval = EX_UNAVAILABLE;
			warnx("rule %lu does not exist", rnum);
		}
	}

	if (co.do_dynamic && ndyn) {
		printf("## Dynamic rules:\n");
		for (lac = ac, lav = av; lac != 0; lac--) {
			last = rnum = strtoul(*lav++, &endptr, 10);
			if (*endptr == '-')
				last = strtoul(endptr+1, &endptr, 10);
			if (*endptr)
				/* already warned */
				continue;
			for (n = 0, d = dynrules; n < ndyn; n++, d++) {
				uint16_t rulenum;

				bcopy(&d->rule, &rulenum, sizeof(rulenum));
				/*
				 * Filter on the dynamic rule's own parent
				 * number.  We skip rather than stop, so no
				 * ordering of the dynamic rules is assumed.
				 */
				if (rulenum < rnum || rulenum > last)
					continue;
				if (co.use_set) {
					bcopy((char *)&d->rule + sizeof(uint16_t),
					      &set, sizeof(uint8_t));
					if (set != co.use_set - 1)
						continue;
				}
				show_dyn_ipfw(d, pcwidth, bcwidth);
			}
		}
	}

	ac = 0;

done:
	free(data);

	if (exitval != EX_OK)
		exit(exitval);
#undef NEXT
}

/*
 * Translate `host` into an IPv4 address stored in *ipaddr.
 *
 * A dotted-quad string is parsed directly with inet_aton(); anything
 * else is handed to the resolver.  Returns 0 on success, -1 when the
 * name cannot be resolved to an IPv4 address.
 */
static int
lookup_host (char *host, struct in_addr *ipaddr)
{
	struct hostent *he;

	if (inet_aton(host, ipaddr))
		return(0);

	he = gethostbyname(host);
	/*
	 * Only accept a real IPv4 answer: the resolver may hand back
	 * another address family, or an empty address list.
	 */
	if (he == NULL || he->h_addrtype != AF_INET ||
	    (size_t)he->h_length != sizeof(*ipaddr) ||
	    he->h_addr_list[0] == NULL)
		return(-1);
	/* h_addr_list entries are plain char *: copy, do not cast */
	memcpy(ipaddr, he->h_addr_list[0], sizeof(*ipaddr));
	return(0);
}

/*
 * fills the addr and mask fields in the instruction as appropriate from av.
 * Update length as appropriate.
 * The following formats are allowed:
 *	any	leaves the instruction with length 0
 *	me	returns O_IP_*_ME
 *	table(N[,V])	table lookup, stored as O_IP_DST_LOOKUP
 *	1.2.3.4		single IP address
 *	1.2.3.4:5.6.7.8	address:mask
 *	1.2.3.4/24	address/mask
 *	1.2.3.4/26{1,6,5,4,23}	set of addresses in a subnet
 * We can have multiple comma-separated address/mask entries.
 *
 * The string in av is modified in place while parsing.  Only the
 * "table" and "set" forms write cmd->o.opcode; for the others the
 * caller derives the opcode from the resulting length.
 */
static void
fill_ip(ipfw_insn_ip *cmd, char *av)
{
	int len = 0;
	uint32_t *d = ((ipfw_insn_u32 *)cmd)->d;

	cmd->o.len &= ~F_LEN_MASK;	/* zero len */

	if (_substrcmp(av, "any") == 0)
		return;

	if (_substrcmp(av, "me") == 0) {
		cmd->o.len |= F_INSN_SIZE(ipfw_insn);
		return;
	}

	if (strncmp(av, "table(", 6) == 0) {
		char *p = strchr(av + 6, ',');

		if (p)
			*p++ = '\0';
		cmd->o.opcode = O_IP_DST_LOOKUP;
		cmd->o.arg1 = strtoul(av + 6, NULL, 0);
		if (p) {
			cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32);
			d[0] = strtoul(p, NULL, 0);
		} else
			cmd->o.len |= F_INSN_SIZE(ipfw_insn);
		return;
	}

    while (av) {
	/*
	 * After the address we can have '/' or ':' indicating a mask,
	 * ',' indicating another address follows, '{' indicating a
	 * set of addresses of unspecified size.
	 */
	char *t = NULL, *p = strpbrk(av, "/:,{");
	int masklen;
	char md, nd = '\0';

	if (p) {
		md = *p;
		*p++ = '\0';
		/* temporarily cut the string after the mask as well */
		if ((t = strpbrk(p, ",{")) != NULL) {
			nd = *t;
			*t = '\0';
		}
	} else
		md = '\0';

	if (lookup_host(av, (struct in_addr *)&d[0]) != 0)
		errx(EX_NOHOST, "hostname ``%s'' unknown", av);
	switch (md) {
	case ':':
		if (!inet_aton(p, (struct in_addr *)&d[1]))
			errx(EX_DATAERR, "bad netmask ``%s''", p);
		break;
	case '/':
		masklen = atoi(p);
		if (masklen == 0)
			d[1] = htonl(0);	/* mask */
		else if (masklen < 0 || masklen > 32)
			/* a negative width would shift out of range */
			errx(EX_DATAERR, "bad width ``%s''", p);
		else
			/* unsigned shift: shifting ~0 (signed) is undefined */
			d[1] = htonl(~0U << (32 - masklen));
		break;
	case '{':	/* no mask, assume /24 and put back the '{' */
		d[1] = htonl(~0U << (32 - 24));
		*(--p) = md;
		break;

	case ',':	/* single address plus continuation */
		*(--p) = md;
		/* FALLTHROUGH */
	case 0:		/* initialization value */
	default:
		d[1] = htonl(~0U);	/* force /32 */
		break;
	}
	d[0] &= d[1];		/* mask base address with mask */
	if (t)
		*t = nd;	/* undo the temporary cut */
	/* find next separator */
	if (p)
		p = strpbrk(p, ",{");
	if (p && *p == '{') {
		/*
		 * We have a set of addresses. They are stored as follows:
		 *   arg1	is the set size (powers of 2, 2..256)
		 *   addr	is the base address IN HOST FORMAT
		 *   mask..	is an array of arg1 bits (rounded up to
		 *		the next multiple of 32) with bits set
		 *		for each host in the map.
		 */
		uint32_t *map = (uint32_t *)&cmd->mask;
		int low, high;
		int i = contigmask((uint8_t *)&(d[1]), 32);

		if (len > 0)
			errx(EX_DATAERR, "address set cannot be in a list");
		if (i < 24 || i > 31)
			errx(EX_DATAERR, "invalid set with mask %d\n", i);
		cmd->o.arg1 = 1<<(32-i);	/* map length		*/
		d[0] = ntohl(d[0]);		/* base addr in host format */
		cmd->o.opcode = O_IP_DST_SET;	/* default */
		cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + (cmd->o.arg1+31)/32;
		for (i = 0; i < (cmd->o.arg1+31)/32 ; i++)
			map[i] = 0;	/* clear map */

		av = p + 1;
		low = d[0] & 0xff;
		high = low + cmd->o.arg1 - 1;
		/*
		 * Here, i stores the previous value when we specify a range
		 * of addresses within a mask, e.g. 45-63. i = -1 means we
		 * have no previous value.
		 */
		i = -1;	/* previous value in a range */
		while (isdigit(*av)) {
			char *s;
			int a = strtol(av, &s, 0);

			if (s == av) { /* no parameter */
			    if (*av != '}')
				errx(EX_DATAERR, "set not closed\n");
			    if (i != -1)
				errx(EX_DATAERR, "incomplete range %d-", i);
			    break;
			}
			if (a < low || a > high)
			    errx(EX_DATAERR, "addr %d out of range [%d-%d]\n",
				a, low, high);
			a -= low;
			if (i == -1)	/* no previous in range */
			    i = a;
			else {		/* check that range is valid */
			    if (i > a)
				errx(EX_DATAERR, "invalid range %d-%d",
					i+low, a+low);
			    if (*s == '-')
				errx(EX_DATAERR, "double '-' in range");
			}
			for (; i <= a; i++)
			    map[i/32] |= 1U<<(i & 31);	/* bit 31 needs 1U */
			i = -1;
			if (*s == '-')
			    i = a;
			else if (*s == '}')
			    break;
			av = s+1;
		}
		return;
	}
	av = p;
	if (av)			/* then *av must be a ',' */
		av++;

	/* Check this entry */
	if (d[1] == 0) { /* "any", specified as x.x.x.x/0 */
		/*
		 * 'any' turns the entire list into a NOP.
		 * 'not any' never matches, so it is removed from the
		 * list unless it is the only item, in which case we
		 * report an error.
		 */
		if (cmd->o.len & F_NOT) {	/* "not any" never matches */
			if (av == NULL && len == 0) /* only this entry */
				errx(EX_DATAERR, "not any never matches");
		}
		/* else do nothing and skip this entry */
		return;
	}
	/* A single IP can be stored in an optimized format */
	if (d[1] == (uint32_t)~0 && av == NULL && len == 0) {
		cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32);
		return;
	}
	len += 2;	/* two words... */
	d += 2;
    } /* end while */
    if (len + 1 > F_LEN_MASK)
	errx(EX_DATAERR, "address list too long");
    cmd->o.len |= len+1;
}


/*
 * n2mask sets the n most significant bits of the mask and clears the
 * rest.  A non-positive n yields an all-zero mask; n larger than the
 * mask width (128 bits) is clamped so that we never write past *mask.
 */
void
n2mask(struct in6_addr *mask, int n)
{
	/* minimask[k] has the top k bits of a byte set */
	static int	minimask[9] =
	    { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff };
	const int	maxbits = 8 * (int)sizeof(struct in6_addr);
	u_char		*p;

	memset(mask, 0, sizeof(struct in6_addr));
	if (n > maxbits)
		n = maxbits;
	p = (u_char *) mask;
	for (; n > 0; p++, n -= 8) {
		if (n >= 8)
			*p = 0xff;
		else
			*p = minimask[n];
	}
	return;
}

/*
 * Parse a comma-separated list of flag names into a single one-word
 * instruction.  A name prefixed with '!' goes into the "must be clear"
 * byte, any other name into the "must be set" byte; arg1 ends up as
 * set | (clear << 8).  The string p is chopped in place at the commas.
 * F_NOT and F_OR already present in cmd->len are kept.
 */
static void
fill_flags(ipfw_insn *cmd, enum ipfw_opcodes opcode,
	struct _s_x *flags, char *p)
{
	uint8_t acc[2] = { 0, 0 };	/* [0] = set, [1] = clear */
	char *tok, *rest;

	for (tok = p; tok != NULL && *tok != '\0'; tok = rest) {
		int negated = (*tok == '!');
		int bit;

		if (negated)
			tok++;
		/* isolate the current name, remember where the next starts */
		rest = strchr(tok, ',');
		if (rest != NULL)
			*rest++ = '\0';
		bit = match_token(flags, tok);
		if (bit <= 0)
			errx(EX_DATAERR, "invalid flag %s", tok);
		acc[negated] |= (uint8_t)bit;
	}

	cmd->opcode = opcode;
	cmd->len =  (cmd->len & (F_NOT | F_OR)) | 1;
	cmd->arg1 = (acc[0] & 0xff) | ( (acc[1] & 0xff) << 8);
}


/*
 * Handle "ipfw [set N] delete [set] NUMBER ...".
 *
 * av[0] is the "delete" keyword; the following numeric words are rule
 * numbers, set numbers (after "set"), or NAT / pipe identifiers when
 * co.do_nat / co.do_pipe is active.  Every number is attempted even if
 * an earlier one failed; the process then exits with the last non-zero
 * status recorded, so a later success cannot hide an earlier failure.
 *
 * For rules, the request word sent with IP_FW_DEL is
 *	number | (1 << 24)			delete whole set
 *	number | (5 << 24) | (set << 16)	delete rule only if in set
 *	number					plain rule delete
 */
void
ipfw_delete(char *av[])
{
	uint32_t rulenum;
	int i;
	int rc;
	int exitval = EX_OK;
	int do_set = 0;

	av++;
	NEED1("missing rule specification");
	if ( *av && _substrcmp(*av, "set") == 0) {
		/* Do not allow using the following syntax:
		 *	ipfw set N delete set M
		 */
		if (co.use_set)
			errx(EX_DATAERR, "invalid syntax");
		do_set = 1;	/* delete set */
		av++;
	}

	/* Rule number */
	while (*av && isdigit((unsigned char)**av)) {
		i = atoi(*av); av++;
		if (co.do_nat) {
			if (do_cmd(IP_FW_NAT_DEL, &i, sizeof i) != 0) {
				exitval = EX_UNAVAILABLE;
				warn("rule %u not available", i);
			}
		} else if (co.do_pipe) {
			/* keep a previous failure if this one succeeds */
			rc = ipfw_delete_pipe(co.do_pipe, i);
			if (rc != EX_OK)
				exitval = rc;
		} else {
			if (co.use_set)
				rulenum = (i & 0xffff) | (5 << 24) |
				    ((co.use_set - 1) << 16);
			else
				rulenum =  (i & 0xffff) | (do_set << 24);
			rc = do_cmd(IP_FW_DEL, &rulenum, sizeof rulenum);
			if (rc) {
				exitval = EX_UNAVAILABLE;
				/* report the number, not the encoded word */
				warn("rule %u: setsockopt(IP_FW_DEL)",
				    rulenum & 0xffff);
			}
		}
	}
	if (exitval != EX_OK)
		exit(exitval);
}


/*
 * Fill an interface-match instruction from `arg`.
 *   "any"               the instruction length is set to 0, which makes
 *                       the instruction a no-op
 *   starts with a digit parsed as an IPv4 address
 *   anything else       copied as an interface name; names containing
 *                       '*', '?' or '[' are flagged as shell patterns
 * Names are not checked against existing interfaces, since interfaces
 * can be created dynamically after the rule is inserted.
 */
static void
fill_iface(ipfw_insn_if *cmd, char *arg)
{
	cmd->name[0] = '\0';

	if (strcmp(arg, "any") == 0) {
		cmd->o.len = 0;		/* effectively ignore this command */
		return;
	}

	cmd->o.len |= F_INSN_SIZE(ipfw_insn_if);

	/* Parse the interface or address */
	if (isdigit(*arg)) {
		if (!inet_aton(arg, &cmd->p.ip))
			errx(EX_DATAERR, "bad ip address ``%s''", arg);
		return;
	}

	strlcpy(cmd->name, arg, sizeof(cmd->name));
	cmd->p.glob = (strpbrk(arg, "*?[") != NULL);
}

static void
get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask)
{
	int i;
	size_t l;
	char *ap, *ptr, *optr;
	struct ether_addr *mac;
	const char *macset = "0123456789abcdefABCDEF:";

	if (strcmp(p, "any") == 0) {
		for (i = 0; i < ETHER_ADDR_LEN; i++)
			addr[i] = mask[i] = 0;
		return;
	}

	optr = ptr = strdup(p);
	if ((ap = strsep(&ptr, "&/")) != NULL && *ap != 0) {
		l = strlen(ap);
		if (strspn(ap, macset) != l || (mac = ether_aton(ap)) == NULL)
			errx(EX_DATAERR, "Incorrect MAC address");
		bcopy(mac, addr, ETHER_ADDR_LEN);
	} else
		errx(EX_DATAERR, "Incorrect MAC address");

	if (ptr != NULL) { /* we have mask? */
		if (p[ptr - optr - 1] == '/') { /* mask len */
			long ml = strtol(ptr, &ap, 10);
			if (*ap != 0 || ml > ETHER_ADDR_LEN * 8 || ml < 0)
				errx(EX_DATAERR, "Incorrect mask length");
			for (i = 0; ml > 0 && i < ETHER_ADDR_LEN; ml -= 8, i++)
				mask[i] = (ml >= 8) ? 0xff: (~0) << (8 - ml);
		} else { /* mask */
			l = strlen(ptr);
			if (strspn(ptr, macset) != l ||
			    (mac = ether_aton(ptr)) == NULL)
				errx(EX_DATAERR, "Incorrect mask");
			bcopy(mac, mask, ETHER_ADDR_LEN);
		}
	} else { /* default mask: ff:ff:ff:ff:ff:ff */
		for (i = 0; i < ETHER_ADDR_LEN; i++)
			mask[i] = 0xff;
	}
	for (i = 0; i < ETHER_ADDR_LEN; i++)
		addr[i] &= mask[i];

	free(optr);
}

/*
 * Step past the instruction at `cmd` (using its own length field) and
 * return a pointer to the slot that follows.  The first word of that
 * slot is zeroed, in case an earlier, longer instruction left data there.
 */
static ipfw_insn *
next_cmd(ipfw_insn *cmd)
{
	ipfw_insn *slot = cmd + F_LEN(cmd);

	bzero(slot, sizeof(*slot));
	return slot;
}

/*
 * Build an O_NOP instruction carrying a comment.
 * The words in the NULL-terminated vector av are joined with single
 * blanks and stored as a C string right after the instruction header.
 * With no words at all the instruction is left with length 0.
 * F_NOT and F_OR already present in cmd->len are kept.
 */
static void
fill_comment(ipfw_insn *cmd, char **av)
{
	char *out = (char *)(cmd + 1);
	char **word;
	int kept = cmd->len & (F_NOT | F_OR);
	int nbytes = 0;

	cmd->opcode = O_NOP;
	cmd->len = kept;

	/* each word costs its length plus one separator/terminator */
	for (word = av; *word != NULL; word++)
		nbytes += strlen(*word) + 1;
	if (nbytes == 0)
		return;
	if (nbytes > 84)
		errx(EX_DATAERR,
		    "comment too long (max 80 chars)");

	/* one word for the header plus the text rounded up to 32 bits */
	cmd->len = kept | (1 + (nbytes + 3) / 4);

	for (word = av; *word != NULL; word++) {
		size_t n = strlen(*word);

		memcpy(out, *word, n);
		out[n] = ' ';
		out += n + 1;
	}
	out[-1] = '\0';		/* the last blank becomes the terminator */
}

/*
 * Set up a one-word instruction: store opcode and argument, force the
 * length to 1.  The F_NOT / F_OR bits end up as the union of those
 * already in cmd->len and those passed in `flags`.
 */
static void
fill_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, int flags, uint16_t arg)
{
	int modifiers = (cmd->len | flags) & (F_NOT | F_OR);

	cmd->arg1 = arg;
	cmd->len = modifiers | 1;
	cmd->opcode = opcode;
}

/*
 * Build an O_MACADDR2 instruction from two words: av[0] is the
 * destination MAC specification, av[1] the source one.  Each is parsed
 * by get_mac_addr_mask() into its half of the addr/mask arrays
 * (destination first).  Returns cmd; exits if either word is missing.
 */
static ipfw_insn *
add_mac(ipfw_insn *cmd, char *av[])
{
	ipfw_insn_mac *m = (ipfw_insn_mac *)cmd;

	if (av[0] == NULL || av[1] == NULL)
		errx(EX_DATAERR, "MAC dst src");

	cmd->len = (cmd->len & (F_NOT | F_OR)) | F_INSN_SIZE(ipfw_insn_mac);
	cmd->opcode = O_MACADDR2;

	/* dst */
	get_mac_addr_mask(av[0], m->addr, m->mask);
	/* src */
	get_mac_addr_mask(av[1], m->addr + ETHER_ADDR_LEN,
	    m->mask + ETHER_ADDR_LEN);
	return cmd;
}

/*
 * Build an O_MAC_TYPE instruction from the ethertype list in av.
 * Returns cmd, or NULL when av is "any" (nothing to match).
 * A missing argument is a fatal error.
 */
static ipfw_insn *
add_mactype(ipfw_insn *cmd, char *av)
{
	if (av == NULL)
		errx(EX_DATAERR, "missing MAC type");

	if (strcmp(av, "any") == 0)
		return NULL;

	/* we have a non-null type */
	fill_newports((ipfw_insn_u16 *)cmd, av, IPPROTO_ETHERTYPE);
	cmd->opcode = O_MAC_TYPE;
	return cmd;
}

/*
 * Build an O_PROTO instruction from a protocol given either as a
 * decimal number or as a name known to getprotobyname().
 *
 * On success the protocol number is also stored in *protop and cmd is
 * returned.  Returns NULL, leaving cmd and *protop untouched, when av
 * is neither a number in 1..255 nor a known protocol name.
 */
static ipfw_insn *
add_proto0(ipfw_insn *cmd, char *av, u_char *protop)
{
	struct protoent *pe;
	char *ep;
	long num;
	int proto;

	num = strtol(av, &ep, 10);
	/*
	 * The result is handed back through a u_char, so a number
	 * above 255 cannot be represented: treat it like any other
	 * non-numeric word instead of silently truncating it.
	 */
	if (*ep != '\0' || num <= 0 || num > 255) {
		if ((pe = getprotobyname(av)) == NULL)
			return NULL;
		proto = pe->p_proto;
	} else
		proto = (int)num;

	fill_cmd(cmd, O_PROTO, 0, proto);
	*protop = proto;
	return cmd;
}

/*
 * Translate the protocol word of a rule.
 *   "all" / "ip"  no instruction is written, *protop = IPPROTO_IP
 *   "ip4"         O_IP4 instruction, *protop = IPPROTO_IP
 *   "ip6"         O_IP6 instruction, *protop = IPPROTO_IPV6
 *   anything else handed to add_proto0()
 * Returns cmd, or whatever add_proto0() returns.
 */
static ipfw_insn *
add_proto(ipfw_insn *cmd, char *av, u_char *protop)
{
	if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) {
		/* do not set O_IP4 nor O_IP6 */
		*protop = IPPROTO_IP;
		return cmd;
	}
	if (strcmp(av, "ip4") == 0) {
		/* explicit "just IPv4" rule */
		fill_cmd(cmd, O_IP4, 0, 0);
		*protop = IPPROTO_IP;
		return cmd;
	}
	if (strcmp(av, "ip6") == 0) {
		/* explicit "just IPv6" rule */
		fill_cmd(cmd, O_IP6, 0, 0);
		*protop = IPPROTO_IPV6;
		return cmd;
	}
	return add_proto0(cmd, av, protop);
}

/*
 * Same as add_proto(), but the spellings "ipv4" and "ipv6" are
 * accepted as well as "ip4" and "ip6".
 */
static ipfw_insn *
add_proto_compat(ipfw_insn *cmd, char *av, u_char *protop)
{
	if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) {
		/* do not set O_IP4 nor O_IP6 */
		*protop = IPPROTO_IP;
		return cmd;
	}
	if (strcmp(av, "ipv4") == 0 || strcmp(av, "ip4") == 0) {
		/* explicit "just IPv4" rule */
		fill_cmd(cmd, O_IP4, 0, 0);
		*protop = IPPROTO_IP;
		return cmd;
	}
	if (strcmp(av, "ipv6") == 0 || strcmp(av, "ip6") == 0) {
		/* explicit "just IPv6" rule */
		fill_cmd(cmd, O_IP6, 0, 0);
		*protop = IPPROTO_IPV6;
		return cmd;
	}
	return add_proto0(cmd, av, protop);
}

/*
 * Parse an IPv4 source specification.  fill_ip() assembles the instruction
 * in its destination flavour; here the opcode is switched to the matching
 * source variant, chosen from the opcode fill_ip() left or from the
 * instruction length.  Always returns 'cmd'.
 */
static ipfw_insn *
add_srcip(ipfw_insn *cmd, char *av)
{
	fill_ip((ipfw_insn_ip *)cmd, av);

	if (cmd->opcode == O_IP_DST_SET) {			/* set */
		cmd->opcode = O_IP_SRC_SET;
		return cmd;
	}
	if (cmd->opcode == O_IP_DST_LOOKUP) {			/* table */
		cmd->opcode = O_IP_SRC_LOOKUP;
		return cmd;
	}

	if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn))		/* me */
		cmd->opcode = O_IP_SRC_ME;
	else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32))	/* one IP */
		cmd->opcode = O_IP_SRC;
	else							/* addr/mask */
		cmd->opcode = O_IP_SRC_MASK;
	return cmd;
}

/*
 * Parse an IPv4 destination specification.  Set and table opcodes produced
 * by fill_ip() are kept as they are; otherwise the opcode is derived from
 * the instruction length.  Always returns 'cmd'.
 */
static ipfw_insn *
add_dstip(ipfw_insn *cmd, char *av)
{
	fill_ip((ipfw_insn_ip *)cmd, av);

	/* set and table lookups already carry the right opcode */
	if (cmd->opcode == O_IP_DST_SET || cmd->opcode == O_IP_DST_LOOKUP)
		return cmd;

	if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn))		/* me */
		cmd->opcode = O_IP_DST_ME;
	else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32))	/* one IP */
		cmd->opcode = O_IP_DST;
	else							/* addr/mask */
		cmd->opcode = O_IP_DST_MASK;
	return cmd;
}

/*
 * Parse a port (or generic 16-bit value) list into 'cmd' and tag it with
 * 'opcode'.  Returns 'cmd' on success, NULL for "any" or when
 * fill_newports() reports nothing was parsed.
 */
static ipfw_insn *
add_ports(ipfw_insn *cmd, char *av, u_char proto, int opcode)
{
	/* XXX "any" is trapped before. Perhaps "to" */
	if (_substrcmp(av, "any") == 0)
		return NULL;

	if (!fill_newports((ipfw_insn_u16 *)cmd, av, proto))
		return NULL;

	/* XXX todo: check that we have a protocol with ports */
	cmd->opcode = opcode;
	return cmd;
}

/*
 * Parse a source address that may be either IPv4 or IPv6.
 * The IPv6 parser is tried when the protocol is IPv6, the argument is
 * "me6", or the host part (text before the last '/') is an IPv6 literal;
 * the IPv4 parser is tried if that produced nothing.  Returns NULL only
 * for "any" with no parser output, or when strdup() fails.
 */
static ipfw_insn *
add_src(ipfw_insn *cmd, char *av, u_char proto)
{
	struct in6_addr a;
	char *host, *slash;
	ipfw_insn *ret = NULL;
	int looks_v6;

	host = strdup(av);
	if (host == NULL)
		return NULL;
	/* strip the mask so that only the address part is classified */
	slash = strrchr(host, '/');
	if (slash != NULL)
		*slash = '\0';
	looks_v6 = (inet_pton(AF_INET6, host, &a) == 1);
	free(host);

	if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || looks_v6)
		ret = add_srcip6(cmd, av);
	/* XXX: should check for IPv4, not !IPv6 */
	if (ret == NULL &&
	    (proto == IPPROTO_IP || strcmp(av, "me") == 0 || !looks_v6))
		ret = add_srcip(cmd, av);
	if (ret == NULL && strcmp(av, "any") != 0)
		ret = cmd;

	return ret;
}

/*
 * Parse a destination address that may be either IPv4 or IPv6.
 * The IPv6 parser is tried when the protocol is IPv6, the argument is
 * "me6", or the host part (text before the last '/') is an IPv6 literal;
 * the IPv4 parser is tried if that produced nothing.  Returns NULL only
 * for "any" with no parser output, or when strdup() fails.
 */
static ipfw_insn *
add_dst(ipfw_insn *cmd, char *av, u_char proto)
{
	struct in6_addr a;
	char *host, *slash;
	ipfw_insn *ret = NULL;
	int looks_v6;

	host = strdup(av);
	if (host == NULL)
		return NULL;
	/* strip the mask so that only the address part is classified */
	slash = strrchr(host, '/');
	if (slash != NULL)
		*slash = '\0';
	looks_v6 = (inet_pton(AF_INET6, host, &a) == 1);
	free(host);

	if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || looks_v6)
		ret = add_dstip6(cmd, av);
	/* XXX: should check for IPv4, not !IPv6 */
	if (ret == NULL &&
	    (proto == IPPROTO_IP || strcmp(av, "me") == 0 || !looks_v6))
		ret = add_dstip(cmd, av);
	if (ret == NULL && strcmp(av, "any") != 0)
		ret = cmd;

	return ret;
}

/*
 * Parse arguments and assemble the microinstructions which make up a rule.
 * Rules are added into the 'rulebuf' and then copied in the correct order
 * into the actual rule.
 *
 * The syntax for a rule starts with the action, followed by
 * optional action parameters, and the various match patterns.
 * In the assembled microcode, the first opcode must be an O_PROBE_STATE
 * (generated if the rule includes a keep-state option), then the
 * various match patterns, log/altq actions, and the actual action.
 *
 * av[] is a NULL-terminated argument vector; its first entry is skipped
 * (presumably the command keyword -- confirm against the caller).
 * Any syntax error terminates the program through errx().  On success the
 * rule is submitted with IP_FW_ADD and, unless co.do_quiet is set, printed.
 */
void
ipfw_add(char *av[])
{
	/*
	 * rules are added into the 'rulebuf' and then copied in
	 * the correct order into the actual rule.
	 * Some things that need to go out of order (prob, action etc.)
	 * go into actbuf[].
	 */
	static uint32_t rulebuf[255], actbuf[255], cmdbuf[255];

	ipfw_insn *src, *dst, *cmd, *action, *prev=NULL;
	ipfw_insn *first_cmd;	/* first match pattern */

	struct ip_fw *rule;

	/*
	 * various flags used to record that we entered some fields.
	 */
	ipfw_insn *have_state = NULL;	/* check-state or keep-state */
	ipfw_insn *have_log = NULL, *have_altq = NULL, *have_tag = NULL;
	size_t len;

	int i;

	int open_par = 0;	/* open parenthesis ( */

	/* proto is here because it is used to fetch ports */
	u_char proto = IPPROTO_IP;	/* default protocol */

	double match_prob = 1; /* match probability, default is always match */

	bzero(actbuf, sizeof(actbuf));		/* actions go here */
	bzero(cmdbuf, sizeof(cmdbuf));
	bzero(rulebuf, sizeof(rulebuf));

	rule = (struct ip_fw *)rulebuf;
	cmd = (ipfw_insn *)cmdbuf;
	action = (ipfw_insn *)actbuf;

	av++;

	/* [rule N]	-- Rule number optional */
	/* the cast keeps isdigit() defined for bytes with the high bit set */
	if (av[0] && isdigit((unsigned char)**av)) {
		rule->rulenum = atoi(*av);
		av++;
	}

	/* [set N]	-- set number (0..RESVD_SET), optional */
	if (av[0] && av[1] && _substrcmp(*av, "set") == 0) {
		int set = strtoul(av[1], NULL, 10);
		if (set < 0 || set > RESVD_SET)
			errx(EX_DATAERR, "illegal set %s", av[1]);
		rule->set = set;
		av += 2;
	}

	/* [prob D]	-- match probability, optional */
	if (av[0] && av[1] && _substrcmp(*av, "prob") == 0) {
		match_prob = strtod(av[1], NULL);

		if (match_prob <= 0 || match_prob > 1)
			errx(EX_DATAERR, "illegal match prob. %s", av[1]);
		av += 2;
	}

	/* action	-- mandatory */
	NEED1("missing action");
	i = match_token(rule_actions, *av);
	av++;
	action->len = 1;	/* default */
	switch(i) {
	case TOK_CHECKSTATE:
		have_state = action;
		action->opcode = O_CHECK_STATE;
		break;

	case TOK_ACCEPT:
		action->opcode = O_ACCEPT;
		break;

	case TOK_DENY:
		action->opcode = O_DENY;
		action->arg1 = 0;
		break;

	case TOK_REJECT:
		action->opcode = O_REJECT;
		action->arg1 = ICMP_UNREACH_HOST;
		break;

	case TOK_RESET:
		action->opcode = O_REJECT;
		action->arg1 = ICMP_REJECT_RST;
		break;

	case TOK_RESET6:
		action->opcode = O_UNREACH6;
		action->arg1 = ICMP6_UNREACH_RST;
		break;

	case TOK_UNREACH:
		action->opcode = O_REJECT;
		NEED1("missing reject code");
		fill_reject_code(&action->arg1, *av);
		av++;
		break;

	case TOK_UNREACH6:
		action->opcode = O_UNREACH6;
		NEED1("missing unreach code");
		fill_unreach6_code(&action->arg1, *av);
		av++;
		break;

	case TOK_COUNT:
		action->opcode = O_COUNT;
		break;

	case TOK_NAT:
		action->opcode = O_NAT;
		action->len = F_INSN_SIZE(ipfw_insn_nat);
		goto chkarg;

	case TOK_QUEUE:
		action->opcode = O_QUEUE;
		goto chkarg;
	case TOK_PIPE:
		action->opcode = O_PIPE;
		goto chkarg;
	case TOK_SKIPTO:
		action->opcode = O_SKIPTO;
		goto chkarg;
	case TOK_NETGRAPH:
		action->opcode = O_NETGRAPH;
		goto chkarg;
	case TOK_NGTEE:
		action->opcode = O_NGTEE;
		goto chkarg;
	case TOK_DIVERT:
		action->opcode = O_DIVERT;
		goto chkarg;
	case TOK_TEE:
		action->opcode = O_TEE;
		goto chkarg;
	case TOK_CALL:
		action->opcode = O_CALLRETURN;
chkarg:
		/* shared argument parsing for all actions taking a number */
		if (!av[0])
			errx(EX_USAGE, "missing argument for %s", *(av - 1));
		if (isdigit((unsigned char)**av)) {
			action->arg1 = strtoul(*av, NULL, 10);
			if (action->arg1 <= 0 || action->arg1 >= IP_FW_TABLEARG)
				errx(EX_DATAERR, "illegal argument for %s",
				    *(av - 1));
		} else if (_substrcmp(*av, "tablearg") == 0) {
			action->arg1 = IP_FW_TABLEARG;
		} else if (i == TOK_DIVERT || i == TOK_TEE) {
			struct servent *s;
			setservent(1);
			s = getservbyname(av[0], "divert");
			if (s != NULL)
				action->arg1 = ntohs(s->s_port);
			else
				errx(EX_DATAERR, "illegal divert/tee port");
		} else
			errx(EX_DATAERR, "illegal argument for %s", *(av - 1));
		av++;
		break;

	case TOK_FORWARD: {
		ipfw_insn_sa *p = (ipfw_insn_sa *)action;
		char *s, *end;

		NEED1("missing forward address[:port]");

		action->opcode = O_FORWARD_IP;
		action->len = F_INSN_SIZE(ipfw_insn_sa);

		/*
		 * In the kernel we assume AF_INET and use only
		 * sin_port and sin_addr. Remember to set sin_len as
		 * the routing code seems to use it too.
		 */
		p->sa.sin_family = AF_INET;
		p->sa.sin_len = sizeof(struct sockaddr_in);
		p->sa.sin_port = 0;
		/*
		 * locate the address-port separator (':' or ',')
		 */
		s = strchr(*av, ':');
		if (s == NULL)
			s = strchr(*av, ',');
		if (s != NULL) {
			*(s++) = '\0';
			i = strtoport(s, &end, 0 /* base */, 0 /* proto */);
			if (s == end)
				errx(EX_DATAERR,
				    "illegal forwarding port ``%s''", s);
			p->sa.sin_port = (u_short)i;
		}
		if (_substrcmp(*av, "tablearg") == 0)
			p->sa.sin_addr.s_addr = INADDR_ANY;
		else
			lookup_host(*av, &(p->sa.sin_addr));
		av++;
		break;
	    }
	case TOK_COMMENT:
		/* pretend it is a 'count' rule followed by the comment */
		action->opcode = O_COUNT;
		av--;		/* go back... */
		break;

	case TOK_SETFIB:
	    {
		int numfibs;
		size_t intsize = sizeof(int);

		action->opcode = O_SETFIB;
		NEED1("missing fib number");
		action->arg1 = strtoul(*av, NULL, 10);
		if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1)
			errx(EX_DATAERR, "fibs not suported.\n");
		if (action->arg1 >= numfibs)  /* Temporary */
			errx(EX_DATAERR, "fib too large.\n");
		av++;
		break;
	    }

	case TOK_REASS:
		action->opcode = O_REASS;
		break;

	case TOK_RETURN:
		fill_cmd(action, O_CALLRETURN, F_NOT, 0);
		break;

	default:
		errx(EX_DATAERR, "invalid action %s\n", av[-1]);
	}
	action = next_cmd(action);

	/*
	 * [altq queuename] -- altq tag, optional
	 * [log [logamount N]]	-- log, optional
	 *
	 * If they exist, they go first in the cmdbuf, but then they are
	 * skipped in the copy section to the end of the buffer.
	 */
	while (av[0] != NULL && (i = match_token(rule_action_params, *av)) != -1) {
		av++;
		switch (i) {
		case TOK_LOG:
		    {
			ipfw_insn_log *c = (ipfw_insn_log *)cmd;
			int l;

			if (have_log)
				errx(EX_DATAERR,
				    "log cannot be specified more than once");
			have_log = (ipfw_insn *)c;
			cmd->len = F_INSN_SIZE(ipfw_insn_log);
			cmd->opcode = O_LOG;
			if (av[0] && _substrcmp(*av, "logamount") == 0) {
				av++;
				NEED1("logamount requires argument");
				l = atoi(*av);
				if (l < 0)
					errx(EX_DATAERR,
					    "logamount must be positive");
				c->max_log = l;
				av++;
			} else {
				len = sizeof(c->max_log);
				if (sysctlbyname("net.inet.ip.fw.verbose_limit",
				    &c->max_log, &len, NULL, 0) == -1)
					errx(1, "sysctlbyname(\"%s\")",
					    "net.inet.ip.fw.verbose_limit");
			}
		    }
			break;

#ifndef NO_ALTQ
		case TOK_ALTQ:
		    {
			ipfw_insn_altq *a = (ipfw_insn_altq *)cmd;

			NEED1("missing altq queue name");
			if (have_altq)
				errx(EX_DATAERR,
				    "altq cannot be specified more than once");
			have_altq = (ipfw_insn *)a;
			cmd->len = F_INSN_SIZE(ipfw_insn_altq);
			cmd->opcode = O_ALTQ;
			a->qid = altq_name_to_qid(*av);
			av++;
		    }
			break;
#endif

		case TOK_TAG:
		case TOK_UNTAG: {
			uint16_t tag;

			if (have_tag)
				errx(EX_USAGE, "tag and untag cannot be "
				    "specified more than once");
			GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX, i,
			   rule_action_params);
			have_tag = cmd;
			fill_cmd(cmd, O_TAG, (i == TOK_TAG) ? 0: F_NOT, tag);
			av++;
			break;
		}

		default:
			abort();
		}
		cmd = next_cmd(cmd);
	}

	if (have_state)	/* must be a check-state, we are done */
		goto done;

#define OR_START(target)					\
	if (av[0] && (*av[0] == '(' || *av[0] == '{')) { 	\
		if (open_par)					\
			errx(EX_USAGE, "nested \"(\" not allowed\n"); \
		prev = NULL;					\
		open_par = 1;					\
		if ( (av[0])[1] == '\0') {			\
			av++;					\
		} else						\
			(*av)++;				\
	}							\
	target:							\


#define	CLOSE_PAR						\
	if (open_par) {						\
		if (av[0] && (					\
		    strcmp(*av, ")") == 0 ||			\
		    strcmp(*av, "}") == 0)) {			\
			prev = NULL;				\
			open_par = 0;				\
			av++;					\
		} else						\
			errx(EX_USAGE, "missing \")\"\n");	\
	}

#define NOT_BLOCK						\
	if (av[0] && _substrcmp(*av, "not") == 0) {		\
		if (cmd->len & F_NOT)				\
			errx(EX_USAGE, "double \"not\" not allowed\n"); \
		cmd->len |= F_NOT;				\
		av++;						\
	}

#define OR_BLOCK(target)					\
	if (av[0] && _substrcmp(*av, "or") == 0) {		\
		if (prev == NULL || open_par == 0)		\
			errx(EX_DATAERR, "invalid OR block");	\
		prev->len |= F_OR;				\
		av++;					\
		goto target;					\
	}							\
	CLOSE_PAR;

	first_cmd = cmd;

#if 0
	/*
	 * MAC addresses, optional.
	 * If we have this, we skip the part "proto from src to dst"
	 * and jump straight to the option parsing.
	 */
	NOT_BLOCK;
	NEED1("missing protocol");
	if (_substrcmp(*av, "MAC") == 0 ||
	    _substrcmp(*av, "mac") == 0) {
		av++;			/* the "MAC" keyword */
		add_mac(cmd, av);	/* exits in case of errors */
		cmd = next_cmd(cmd);
		av += 2;		/* dst-mac and src-mac */
		NOT_BLOCK;
		NEED1("missing mac type");
		if (add_mactype(cmd, av[0]))
			cmd = next_cmd(cmd);
		av++;			/* any or mac-type */
		goto read_options;
	}
#endif

	/*
	 * protocol, mandatory
	 */
    OR_START(get_proto);
	NOT_BLOCK;
	NEED1("missing protocol");
	if (add_proto_compat(cmd, *av, &proto)) {
		av++;
		if (F_LEN(cmd) != 0) {
			prev = cmd;
			cmd = next_cmd(cmd);
		}
	} else if (first_cmd != cmd) {
		errx(EX_DATAERR, "invalid protocol ``%s''", *av);
	} else
		goto read_options;
    OR_BLOCK(get_proto);

	/*
	 * "from", mandatory
	 */
	if ((av[0] == NULL) || _substrcmp(*av, "from") != 0)
		errx(EX_USAGE, "missing ``from''");
	av++;

	/*
	 * source IP, mandatory
	 */
    OR_START(source_ip);
	NOT_BLOCK;	/* optional "not" */
	NEED1("missing source address");
	if (add_src(cmd, *av, proto)) {
		av++;
		if (F_LEN(cmd) != 0) {	/* ! any */
			prev = cmd;
			cmd = next_cmd(cmd);
		}
	} else
		errx(EX_USAGE, "bad source address %s", *av);
    OR_BLOCK(source_ip);

	/*
	 * source ports, optional
	 */
	NOT_BLOCK;	/* optional "not" */
	if ( av[0] != NULL ) {
		if (_substrcmp(*av, "any") == 0 ||
		    add_ports(cmd, *av, proto, O_IP_SRCPORT)) {
			av++;
			if (F_LEN(cmd) != 0)
				cmd = next_cmd(cmd);
		}
	}

	/*
	 * "to", mandatory
	 */
	if ( (av[0] == NULL) || _substrcmp(*av, "to") != 0 )
		errx(EX_USAGE, "missing ``to''");
	av++;

	/*
	 * destination, mandatory
	 */
    OR_START(dest_ip);
	NOT_BLOCK;	/* optional "not" */
	NEED1("missing dst address");
	if (add_dst(cmd, *av, proto)) {
		av++;
		if (F_LEN(cmd) != 0) {	/* ! any */
			prev = cmd;
			cmd = next_cmd(cmd);
		}
	} else
		errx( EX_USAGE, "bad destination address %s", *av);
    OR_BLOCK(dest_ip);

	/*
	 * dest. ports, optional
	 */
	NOT_BLOCK;	/* optional "not" */
	if (av[0]) {
		if (_substrcmp(*av, "any") == 0 ||
		    add_ports(cmd, *av, proto, O_IP_DSTPORT)) {
			av++;
			if (F_LEN(cmd) != 0)
				cmd = next_cmd(cmd);
		}
	}

read_options:
	if (av[0] && first_cmd == cmd) {
		/*
		 * nothing specified so far, store in the rule to ease
		 * printout later.
		 */
		 rule->_pad = 1;
	}
	prev = NULL;
	while ( av[0] != NULL ) {
		char *s;
		ipfw_insn_u32 *cmd32;	/* alias for cmd */

		s = *av;
		cmd32 = (ipfw_insn_u32 *)cmd;

		if (*s == '!') {	/* alternate syntax for NOT */
			if (cmd->len & F_NOT)
				errx(EX_USAGE, "double \"not\" not allowed\n");
			cmd->len = F_NOT;
			s++;
		}
		i = match_token(rule_options, s);
		av++;
		switch(i) {
		case TOK_NOT:
			if (cmd->len & F_NOT)
				errx(EX_USAGE, "double \"not\" not allowed\n");
			cmd->len = F_NOT;
			break;

		case TOK_OR:
			if (open_par == 0 || prev == NULL)
				errx(EX_USAGE, "invalid \"or\" block\n");
			prev->len |= F_OR;
			break;

		case TOK_STARTBRACE:
			if (open_par)
				errx(EX_USAGE, "+nested \"(\" not allowed\n");
			open_par = 1;
			break;

		case TOK_ENDBRACE:
			if (!open_par)
				errx(EX_USAGE, "+missing \")\"\n");
			open_par = 0;
			prev = NULL;
			break;

		case TOK_IN:
			fill_cmd(cmd, O_IN, 0, 0);
			break;

		case TOK_OUT:
			cmd->len ^= F_NOT; /* toggle F_NOT */
			fill_cmd(cmd, O_IN, 0, 0);
			break;

		case TOK_DIVERTED:
			fill_cmd(cmd, O_DIVERTED, 0, 3);
			break;

		case TOK_DIVERTEDLOOPBACK:
			fill_cmd(cmd, O_DIVERTED, 0, 1);
			break;

		case TOK_DIVERTEDOUTPUT:
			fill_cmd(cmd, O_DIVERTED, 0, 2);
			break;

		case TOK_FRAG:
			fill_cmd(cmd, O_FRAG, 0, 0);
			break;

		case TOK_LAYER2:
			fill_cmd(cmd, O_LAYER2, 0, 0);
			break;

		case TOK_XMIT:
		case TOK_RECV:
		case TOK_VIA:
			NEED1("recv, xmit, via require interface name"
				" or address");
			fill_iface((ipfw_insn_if *)cmd, av[0]);
			av++;
			if (F_LEN(cmd) == 0)	/* not a valid address */
				break;
			if (i == TOK_XMIT)
				cmd->opcode = O_XMIT;
			else if (i == TOK_RECV)
				cmd->opcode = O_RECV;
			else if (i == TOK_VIA)
				cmd->opcode = O_VIA;
			break;

		case TOK_ICMPTYPES:
			NEED1("icmptypes requires list of types");
			fill_icmptypes((ipfw_insn_u32 *)cmd, *av);
			av++;
			break;

		case TOK_ICMP6TYPES:
			NEED1("icmptypes requires list of types");
			fill_icmp6types((ipfw_insn_icmp6 *)cmd, *av);
			av++;
			break;

		case TOK_IPTTL:
			NEED1("ipttl requires TTL");
			if (strpbrk(*av, "-,")) {
			    if (!add_ports(cmd, *av, 0, O_IPTTL))
				errx(EX_DATAERR, "invalid ipttl %s", *av);
			} else
			    fill_cmd(cmd, O_IPTTL, 0, strtoul(*av, NULL, 0));
			av++;
			break;

		case TOK_IPID:
			NEED1("ipid requires id");
			if (strpbrk(*av, "-,")) {
			    if (!add_ports(cmd, *av, 0, O_IPID))
				errx(EX_DATAERR, "invalid ipid %s", *av);
			} else
			    fill_cmd(cmd, O_IPID, 0, strtoul(*av, NULL, 0));
			av++;
			break;

		case TOK_IPLEN:
			NEED1("iplen requires length");
			if (strpbrk(*av, "-,")) {
			    if (!add_ports(cmd, *av, 0, O_IPLEN))
				errx(EX_DATAERR, "invalid ip len %s", *av);
			} else
			    fill_cmd(cmd, O_IPLEN, 0, strtoul(*av, NULL, 0));
			av++;
			break;

		case TOK_IPVER:
			NEED1("ipver requires version");
			fill_cmd(cmd, O_IPVER, 0, strtoul(*av, NULL, 0));
			av++;
			break;

		case TOK_IPPRECEDENCE:
			NEED1("ipprecedence requires value");
			fill_cmd(cmd, O_IPPRECEDENCE, 0,
			    (strtoul(*av, NULL, 0) & 7) << 5);
			av++;
			break;

		case TOK_IPOPTS:
			NEED1("missing argument for ipoptions");
			fill_flags(cmd, O_IPOPT, f_ipopts, *av);
			av++;
			break;

		case TOK_IPTOS:
			NEED1("missing argument for iptos");
			fill_flags(cmd, O_IPTOS, f_iptos, *av);
			av++;
			break;

		case TOK_UID:
			NEED1("uid requires argument");
		    {
			char *end;
			uid_t uid;
			struct passwd *pwd;

			cmd->opcode = O_UID;
			uid = strtoul(*av, &end, 0);
			pwd = (*end == '\0') ? getpwuid(uid) : getpwnam(*av);
			if (pwd == NULL)
				errx(EX_DATAERR, "uid \"%s\" nonexistent", *av);
			cmd32->d[0] = pwd->pw_uid;
			cmd->len |= F_INSN_SIZE(ipfw_insn_u32);
			av++;
		    }
			break;

		case TOK_GID:
			NEED1("gid requires argument");
		    {
			char *end;
			gid_t gid;
			struct group *grp;

			cmd->opcode = O_GID;
			gid = strtoul(*av, &end, 0);
			grp = (*end == '\0') ? getgrgid(gid) : getgrnam(*av);
			if (grp == NULL)
				errx(EX_DATAERR, "gid \"%s\" nonexistent", *av);
			cmd32->d[0] = grp->gr_gid;
			cmd->len |= F_INSN_SIZE(ipfw_insn_u32);
			av++;
		    }
			break;

		case TOK_JAIL:
			NEED1("jail requires argument");
		    {
			char *end;
			int jid;

			cmd->opcode = O_JAIL;
			jid = (int)strtol(*av, &end, 0);
			if (jid < 0 || *end != '\0')
				errx(EX_DATAERR, "jail requires prison ID");
			cmd32->d[0] = (uint32_t)jid;
			cmd->len |= F_INSN_SIZE(ipfw_insn_u32);
			av++;
		    }
			break;

		case TOK_ESTAB:
			fill_cmd(cmd, O_ESTAB, 0, 0);
			break;

		case TOK_SETUP:
			fill_cmd(cmd, O_TCPFLAGS, 0,
				(TH_SYN) | ( (TH_ACK) & 0xff) <<8 );
			break;

		case TOK_TCPDATALEN:
			NEED1("tcpdatalen requires length");
			if (strpbrk(*av, "-,")) {
			    if (!add_ports(cmd, *av, 0, O_TCPDATALEN))
				errx(EX_DATAERR, "invalid tcpdata len %s", *av);
			} else
			    fill_cmd(cmd, O_TCPDATALEN, 0,
				    strtoul(*av, NULL, 0));
			av++;
			break;

		case TOK_TCPOPTS:
			NEED1("missing argument for tcpoptions");
			fill_flags(cmd, O_TCPOPTS, f_tcpopts, *av);
			av++;
			break;

		case TOK_TCPSEQ:
		case TOK_TCPACK:
			NEED1("tcpseq/tcpack requires argument");
			cmd->len = F_INSN_SIZE(ipfw_insn_u32);
			cmd->opcode = (i == TOK_TCPSEQ) ? O_TCPSEQ : O_TCPACK;
			cmd32->d[0] = htonl(strtoul(*av, NULL, 0));
			av++;
			break;

		case TOK_TCPWIN:
			NEED1("tcpwin requires length");
			fill_cmd(cmd, O_TCPWIN, 0,
			    htons(strtoul(*av, NULL, 0)));
			av++;
			break;

		case TOK_TCPFLAGS:
			NEED1("missing argument for tcpflags");
			cmd->opcode = O_TCPFLAGS;
			fill_flags(cmd, O_TCPFLAGS, f_tcpflags, *av);
			av++;
			break;

		case TOK_KEEPSTATE:
			if (open_par)
				errx(EX_USAGE, "keep-state cannot be part "
				    "of an or block");
			if (have_state)
				errx(EX_USAGE, "only one of keep-state "
					"and limit is allowed");
			have_state = cmd;
			fill_cmd(cmd, O_KEEP_STATE, 0, 0);
			break;

		case TOK_LIMIT: {
			ipfw_insn_limit *c = (ipfw_insn_limit *)cmd;
			int val;

			if (open_par)
				errx(EX_USAGE,
				    "limit cannot be part of an or block");
			if (have_state)
				errx(EX_USAGE, "only one of keep-state and "
				    "limit is allowed");
			have_state = cmd;

			cmd->len = F_INSN_SIZE(ipfw_insn_limit);
			cmd->opcode = O_LIMIT;
			c->limit_mask = c->conn_limit = 0;

			while ( av[0] != NULL ) {
				if ((val = match_token(limit_masks, *av)) <= 0)
					break;
				c->limit_mask |= val;
				av++;
			}

			if (c->limit_mask == 0)
				errx(EX_USAGE, "limit: missing limit mask");

			GET_UINT_ARG(c->conn_limit, IPFW_ARG_MIN, IPFW_ARG_MAX,
			    TOK_LIMIT, rule_options);

			av++;
			break;
		}

		case TOK_PROTO:
			NEED1("missing protocol");
			if (add_proto(cmd, *av, &proto)) {
				av++;
			} else
				errx(EX_DATAERR, "invalid protocol ``%s''",
				    *av);
			break;

		case TOK_SRCIP:
			NEED1("missing source IP");
			if (add_srcip(cmd, *av)) {
				av++;
			}
			break;

		case TOK_DSTIP:
			NEED1("missing destination IP");
			if (add_dstip(cmd, *av)) {
				av++;
			}
			break;

		case TOK_SRCIP6:
			NEED1("missing source IP6");
			if (add_srcip6(cmd, *av)) {
				av++;
			}
			break;

		case TOK_DSTIP6:
			NEED1("missing destination IP6");
			if (add_dstip6(cmd, *av)) {
				av++;
			}
			break;

		case TOK_SRCPORT:
			NEED1("missing source port");
			if (_substrcmp(*av, "any") == 0 ||
			    add_ports(cmd, *av, proto, O_IP_SRCPORT)) {
				av++;
			} else
				errx(EX_DATAERR, "invalid source port %s", *av);
			break;

		case TOK_DSTPORT:
			NEED1("missing destination port");
			if (_substrcmp(*av, "any") == 0 ||
			    add_ports(cmd, *av, proto, O_IP_DSTPORT)) {
				av++;
			} else
				errx(EX_DATAERR, "invalid destination port %s",
				    *av);
			break;

		case TOK_MAC:
			if (add_mac(cmd, av))
				av += 2;
			break;

		case TOK_MACTYPE:
			NEED1("missing mac type");
			if (!add_mactype(cmd, *av))
				errx(EX_DATAERR, "invalid mac type %s", *av);
			av++;
			break;

		case TOK_VERREVPATH:
			fill_cmd(cmd, O_VERREVPATH, 0, 0);
			break;

		case TOK_VERSRCREACH:
			fill_cmd(cmd, O_VERSRCREACH, 0, 0);
			break;

		case TOK_ANTISPOOF:
			fill_cmd(cmd, O_ANTISPOOF, 0, 0);
			break;

		case TOK_IPSEC:
			fill_cmd(cmd, O_IPSEC, 0, 0);
			break;

		case TOK_IPV6:
			fill_cmd(cmd, O_IP6, 0, 0);
			break;

		case TOK_IPV4:
			fill_cmd(cmd, O_IP4, 0, 0);
			break;

		case TOK_EXT6HDR:
			/*
			 * Without this check a trailing "ext6hdr" would pass
			 * NULL to the helper and step av past its terminator.
			 */
			NEED1("missing extension header");
			fill_ext6hdr( cmd, *av );
			av++;
			break;

		case TOK_FLOWID:
			if (proto != IPPROTO_IPV6 )
				errx( EX_USAGE, "flow-id filter is active "
				    "only for ipv6 protocol\n");
			/* same reasoning as for ext6hdr above */
			NEED1("missing flow-id");
			fill_flow6( (ipfw_insn_u32 *) cmd, *av );
			av++;
			break;

		case TOK_COMMENT:
			fill_comment(cmd, av);
			av[0]=NULL;	/* the comment swallows the rest */
			break;

		case TOK_TAGGED:
			if (av[0] && strpbrk(*av, "-,")) {
				if (!add_ports(cmd, *av, 0, O_TAGGED))
					errx(EX_DATAERR, "tagged: invalid tag"
					    " list: %s", *av);
			}
			else {
				uint16_t tag;

				GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX,
				    TOK_TAGGED, rule_options);
				fill_cmd(cmd, O_TAGGED, 0, tag);
			}
			av++;
			break;

		case TOK_FIB:
			NEED1("fib requires fib number");
			fill_cmd(cmd, O_FIB, 0, strtoul(*av, NULL, 0));
			av++;
			break;
		case TOK_SOCKARG:
			fill_cmd(cmd, O_SOCKARG, 0, 0);
			break;

		case TOK_LOOKUP: {
			ipfw_insn_u32 *c = (ipfw_insn_u32 *)cmd;
			char *p;
			int j;

			if (!av[0] || !av[1])
				errx(EX_USAGE, "format: lookup argument tablenum");
			cmd->opcode = O_IP_DST_LOOKUP;
			cmd->len |= F_INSN_SIZE(ipfw_insn) + 2;
			i = match_token(rule_options, *av);
			for (j = 0; lookup_key[j] >= 0 ; j++) {
				if (i == lookup_key[j])
					break;
			}
			if (lookup_key[j] <= 0)
				errx(EX_USAGE, "format: cannot lookup on %s", *av);
			__PAST_END(c->d, 1) = j; // i converted to option
			av++;
			cmd->arg1 = strtoul(*av, &p, 0);
			if (p && *p)
				errx(EX_USAGE, "format: lookup argument tablenum");
			av++;
		    }
			break;

		default:
			errx(EX_USAGE, "unrecognised option [%d] %s\n", i, s);
		}
		if (F_LEN(cmd) > 0) {	/* prepare to advance */
			prev = cmd;
			cmd = next_cmd(cmd);
		}
	}

done:
	/*
	 * Now copy stuff into the rule.
	 * If we have a keep-state option, the first instruction
	 * must be a PROBE_STATE (which is generated here).
	 * If we have a LOG option, it was stored as the first command,
	 * and now must be moved to the top of the action part.
	 */
	dst = (ipfw_insn *)rule->cmd;

	/*
	 * First thing to write into the command stream is the match probability.
	 */
	if (match_prob != 1) { /* 1 means always match */
		dst->opcode = O_PROB;
		dst->len = 2;
		*((int32_t *)(dst+1)) = (int32_t)(match_prob * 0x7fffffff);
		dst += dst->len;
	}

	/*
	 * generate O_PROBE_STATE if necessary
	 */
	if (have_state && have_state->opcode != O_CHECK_STATE) {
		fill_cmd(dst, O_PROBE_STATE, 0, 0);
		dst = next_cmd(dst);
	}

	/* copy all commands but O_LOG, O_KEEP_STATE, O_LIMIT, O_ALTQ, O_TAG */
	for (src = (ipfw_insn *)cmdbuf; src != cmd; src += i) {
		i = F_LEN(src);

		switch (src->opcode) {
		case O_LOG:
		case O_KEEP_STATE:
		case O_LIMIT:
		case O_ALTQ:
		case O_TAG:
			break;
		default:
			bcopy(src, dst, i * sizeof(uint32_t));
			dst += i;
		}
	}

	/*
	 * put back the have_state command as last opcode
	 */
	if (have_state && have_state->opcode != O_CHECK_STATE) {
		i = F_LEN(have_state);
		bcopy(have_state, dst, i * sizeof(uint32_t));
		dst += i;
	}
	/*
	 * start action section
	 */
	rule->act_ofs = dst - rule->cmd;

	/* put back O_LOG, O_ALTQ, O_TAG if necessary */
	if (have_log) {
		i = F_LEN(have_log);
		bcopy(have_log, dst, i * sizeof(uint32_t));
		dst += i;
	}
	if (have_altq) {
		i = F_LEN(have_altq);
		bcopy(have_altq, dst, i * sizeof(uint32_t));
		dst += i;
	}
	if (have_tag) {
		i = F_LEN(have_tag);
		bcopy(have_tag, dst, i * sizeof(uint32_t));
		dst += i;
	}
	/*
	 * copy all other actions
	 */
	for (src = (ipfw_insn *)actbuf; src != action; src += i) {
		i = F_LEN(src);
		bcopy(src, dst, i * sizeof(uint32_t));
		dst += i;
	}

	rule->cmd_len = (uint32_t *)dst - (uint32_t *)(rule->cmd);
	i = (char *)dst - (char *)rule;	/* total rule size in bytes */
	if (do_cmd(IP_FW_ADD, rule, (uintptr_t)&i) == -1)
		err(EX_UNAVAILABLE, "getsockopt(%s)", "IP_FW_ADD");
	if (!co.do_quiet)
		show_ipfw(rule, 0, 0);
}

/*
 * clear the counters or the log counters.
 *
 * ac/av: argument count and vector; the first entry is skipped and every
 *	remaining entry must be a rule number in the range 0..0xffff.
 *	With no rule numbers all entries are cleared.
 * optname: 0 selects IP_FW_ZERO, non-zero selects IP_FW_RESETLOG.
 *
 * A failure on an individual rule is reported with warn() and processing
 * continues; the program then exits with EX_UNAVAILABLE at the end.
 */
void
ipfw_zero(int ac, char *av[], int optname /* 0 = IP_FW_ZERO, 1 = IP_FW_RESETLOG */)
{
	uint32_t arg, saved_arg;
	int failed = EX_OK;
	char const *errstr;
	char const *name = optname ? "RESETLOG" : "ZERO";

	optname = optname ? IP_FW_RESETLOG : IP_FW_ZERO;

	av++; ac--;

	if (!ac) {
		/* clear all entries */
		if (do_cmd(optname, NULL, 0) < 0)
			err(EX_UNAVAILABLE, "setsockopt(IP_FW_%s)", name);
		if (!co.do_quiet)
			printf("%s.\n", optname == IP_FW_ZERO ?
			    "Accounting cleared":"Logging counts reset");

		return;
	}

	while (ac) {
		/* Rule number */
		/* the cast keeps isdigit() defined for high-bit bytes */
		if (isdigit((unsigned char)**av)) {
			arg = strtonum(*av, 0, 0xffff, &errstr);
			if (errstr)
				errx(EX_DATAERR,
				    "invalid rule number %s\n", *av);
			/* keep the plain number for the messages below */
			saved_arg = arg;
			if (co.use_set)
				arg |= (1 << 24) | ((co.use_set - 1) << 16);
			av++;
			ac--;
			if (do_cmd(optname, &arg, sizeof(arg))) {
				warn("rule %u: setsockopt(IP_FW_%s)",
				    saved_arg, name);
				failed = EX_UNAVAILABLE;
			} else if (!co.do_quiet)
				/* %u: saved_arg is unsigned, as in warn() */
				printf("Entry %u %s.\n", saved_arg,
				    optname == IP_FW_ZERO ?
					"cleared" : "logging count reset");
		} else {
			errx(EX_USAGE, "invalid rule number ``%s''", *av);
		}
	}
	if (failed != EX_OK)
		exit(failed);
}

/*
 * Flush all rules (or all pipes when co.do_pipe is set).
 * Unless 'force' or co.do_quiet is set, the user is asked to confirm on
 * stdin first; EOF or an answer of 'n' leaves everything untouched.
 * When co.use_set is set only that rule set is deleted.
 */
void
ipfw_flush(int force)
{
	int opt = co.do_pipe ? IP_DUMMYNET_FLUSH : IP_FW_FLUSH;

	if (!force && !co.do_quiet) { /* need to ask user */
		int answer;

		printf("Are you sure? [yn] ");
		fflush(stdout);
		for (;;) {
			answer = toupper(getc(stdin));
			/* swallow the remainder of the input line */
			while (answer != '\n' && getc(stdin) != '\n') {
				if (feof(stdin))
					return; /* and do not flush */
			}
			if (answer == 'Y' || answer == 'N')
				break;
		}
		printf("\n");
		if (answer == 'N')	/* user said no */
			return;
	}

	if (co.do_pipe) {
		dummynet_flush();
		return;
	}

	/* `ipfw set N flush` - is the same that `ipfw delete set N` */
	if (co.use_set) {
		uint32_t arg = ((co.use_set - 1) & 0xffff) | (1 << 24);

		if (do_cmd(IP_FW_DEL, &arg, sizeof(arg)) < 0)
			err(EX_UNAVAILABLE, "setsockopt(IP_FW_DEL)");
	} else {
		if (do_cmd(opt, NULL, 0) < 0)
			err(EX_UNAVAILABLE, "setsockopt(IP_%s_FLUSH)",
			    co.do_pipe ? "DUMMYNET" : "FW");
	}

	if (!co.do_quiet)
		printf("Flushed all %s.\n", co.do_pipe ? "pipes" : "rules");
}


static void table_list(ipfw_table_entry ent, int need_header);

/*
 * This one handles all table-related commands
 * 	ipfw table N add addr[/masklen] [value]
 * 	ipfw table N delete addr[/masklen]
 * 	ipfw table {N | all} flush
 * 	ipfw table {N | all} list
 *
 * ac/av: argument count and vector; the first entry is skipped.
 * Malformed input terminates the program through errx()/err().
 */
void
ipfw_table_handler(int ac, char *av[])
{
	ipfw_table_entry ent;
	int do_add;
	int is_all;
	size_t len;
	char *p;
	uint32_t a;
	uint32_t tables_max;
	unsigned long tblnum = 0;	/* table number before narrowing */

	/* the entry is handed to the kernel: do not leak stack garbage */
	bzero(&ent, sizeof(ent));

	len = sizeof(tables_max);
	if (sysctlbyname("net.inet.ip.fw.tables_max", &tables_max, &len,
		NULL, 0) == -1) {
#ifdef IPFW_TABLES_MAX
		warn("Warn: Failed to get the max tables number via sysctl. "
		     "Using the compiled in defaults. \nThe reason was");
		tables_max = IPFW_TABLES_MAX;
#else
		errx(1, "Failed sysctlbyname(\"net.inet.ip.fw.tables_max\")");
#endif
	}

	ac--; av++;
	if (ac && isdigit((unsigned char)**av)) {
		tblnum = strtoul(*av, NULL, 10);
		is_all = 0;
		ac--; av++;
	} else if (ac && _substrcmp(*av, "all") == 0) {
		tblnum = 0;
		is_all = 1;
		ac--; av++;
	} else
		errx(EX_USAGE, "table number or 'all' keyword required");
	/*
	 * Range-check the full parsed value before storing it, so that an
	 * oversized number cannot wrap into a valid table index.
	 */
	if (tblnum >= tables_max)
		errx(EX_USAGE, "The table number exceeds the maximum allowed "
			"value (%u)", tables_max - 1);
	ent.tbl = tblnum;
	NEED1("table needs command");
	if (is_all && _substrcmp(*av, "list") != 0
		   && _substrcmp(*av, "flush") != 0)
		errx(EX_USAGE, "table number required");

	if (_substrcmp(*av, "add") == 0 ||
	    _substrcmp(*av, "delete") == 0) {
		do_add = **av == 'a';
		ac--; av++;
		if (!ac)
			errx(EX_USAGE, "IP address required");
		p = strchr(*av, '/');
		if (p) {
			char *end;
			unsigned long width;

			*p++ = '\0';
			/*
			 * Validate in a wide type: empty, non-numeric or
			 * out-of-range widths must not silently turn into
			 * a different (possibly /0) mask.
			 */
			width = strtoul(p, &end, 10);
			if (end == p || *end != '\0' || width > 32)
				errx(EX_DATAERR, "bad width ``%s''", p);
			ent.masklen = width;
		} else
			ent.masklen = 32;
		if (lookup_host(*av, (struct in_addr *)&ent.addr) != 0)
			errx(EX_NOHOST, "hostname ``%s'' unknown", *av);
		ac--; av++;
		if (do_add && ac) {
			unsigned int tval;
			/* isdigit is a bit of a hack here.. */
			if (strchr(*av, (int)'.') == NULL &&
			    isdigit((unsigned char)**av))  {
				ent.value = strtoul(*av, NULL, 0);
			} else {
				if (lookup_host(*av, (struct in_addr *)&tval) == 0) {
					/* The value must be stored in host order	 *
					 * so that the values < 65k can be distinguished */
					ent.value = ntohl(tval);
				} else {
					errx(EX_NOHOST, "hostname ``%s'' unknown", *av);
				}
			}
		} else
			ent.value = 0;
		if (do_cmd(do_add ? IP_FW_TABLE_ADD : IP_FW_TABLE_DEL,
		    &ent, sizeof(ent)) < 0) {
			/* If running silent, don't bomb out on these errors. */
			if (!(co.do_quiet && (errno == (do_add ? EEXIST : ESRCH))))
				err(EX_OSERR, "setsockopt(IP_FW_TABLE_%s)",
				    do_add ? "ADD" : "DEL");
			/* In silent mode, react to a failed add by deleting */
			if (do_add) {
				do_cmd(IP_FW_TABLE_DEL, &ent, sizeof(ent));
				if (do_cmd(IP_FW_TABLE_ADD,
				    &ent, sizeof(ent)) < 0)
					err(EX_OSERR,
					    "setsockopt(IP_FW_TABLE_ADD)");
			}
		}
	} else if (_substrcmp(*av, "flush") == 0) {
		/* 'a' is one past the last table to process */
		a = is_all ? tables_max : (uint32_t)(ent.tbl + 1);
		do {
			if (do_cmd(IP_FW_TABLE_FLUSH, &ent.tbl,
			    sizeof(ent.tbl)) < 0)
				err(EX_OSERR, "setsockopt(IP_FW_TABLE_FLUSH)");
		} while (++ent.tbl < a);
	} else if (_substrcmp(*av, "list") == 0) {
		a = is_all ? tables_max : (uint32_t)(ent.tbl + 1);
		do {
			table_list(ent, is_all);
		} while (++ent.tbl < a);
	} else
		errx(EX_USAGE, "invalid table command %s", *av);
}

/*
 * Print the contents of the table selected by ent.tbl, one entry per line.
 * If need_header is non-zero and the table has entries, a
 * "---table(N)---" banner precedes them. The value column is rendered
 * as a dotted-quad address when co.do_value_as_ip is set and as an
 * unsigned number otherwise.
 */
static void
table_list(ipfw_table_entry ent, int need_header)
{
	ipfw_table *tbl;
	socklen_t optlen;
	uint32_t count, i;

	/* Pass the table number in; the kernel answers with its size. */
	count = ent.tbl;
	optlen = sizeof(count);
	if (do_cmd(IP_FW_TABLE_GETSIZE, &count, (uintptr_t)&optlen) < 0)
		err(EX_OSERR, "getsockopt(IP_FW_TABLE_GETSIZE)");
	if (count == 0)		/* empty table: nothing to show */
		return;

	/* Room for the table header followed by 'count' entries. */
	optlen = sizeof(*tbl) + count * sizeof(ipfw_table_entry);
	tbl = safe_calloc(1, optlen);
	tbl->tbl = ent.tbl;
	if (do_cmd(IP_FW_TABLE_LIST, tbl, (uintptr_t)&optlen) < 0)
		err(EX_OSERR, "getsockopt(IP_FW_TABLE_LIST)");

	if (need_header && tbl->cnt)
		printf("---table(%d)---\n", tbl->tbl);

	for (i = 0; i < tbl->cnt; i++) {
		struct in_addr *key = (struct in_addr *)&tbl->ent[i].addr;
		unsigned int tval = tbl->ent[i].value;
		char tbuf[128];

		if (!co.do_value_as_ip) {
			printf("%s/%u %u\n", inet_ntoa(*key),
				tbl->ent[i].masklen, tval);
			continue;
		}

		/*
		 * inet_ntoa() is called twice for this line, so keep a
		 * private copy of the first result before asking again.
		 */
		strncpy(tbuf, inet_ntoa(*key), 127);
		/* inet_ntoa expects network order */
		tval = htonl(tval);
		printf("%s/%u %s\n", tbuf, tbl->ent[i].masklen,
			inet_ntoa(*(struct in_addr *)&tval));
	}
	free(tbl);
}


================================================
FILE: ipfw/ipfw2.h
================================================
/*
 * Copyright (c) 2002-2003 Luigi Rizzo
 * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp
 * Copyright (c) 1994 Ugen J.S.Antsilevich
 *
 * Idea and grammar partially left from:
 * Copyright (c) 1993 Daniel Boulet
 *
 * Redistribution and use in source forms, with and without modification,
 * are permitted provided that this entire comment appears intact.
 *
 * Redistribution in binary form may occur without any restrictions.
 * Obviously, it would be nice if you gave credit where credit is due
 * but requiring it would be too onerous.
 *
 * This software is provided ``AS IS'' without any warranties of any kind.
 *
 * NEW command line interface for IP firewall facility
 *
 * $FreeBSD: head/sbin/ipfw/ipfw2.h 206843 2010-04-19 15:11:45Z luigi $
 */

/*
 * Options that can be set on the command line.
 * When reading commands from a file, a subset of the options can also
 * be applied globally by specifying them before the file name.
 * After that, each line can contain its own option that changes
 * the global value.
 * XXX The context is not restored after each line.
 */

/*
 * Command line options, collected in one place.
 * A single instance, 'co', is shared by the whole program (see the
 * extern declaration right after the structure).
 */
struct cmdline_opts {
	/* boolean options: */
	int	do_value_as_ip;	/* show table value as IP */
	int	do_resolv;	/* try to resolve all ip to names */
	int	do_time;	/* Show time stamps */
	int	do_quiet;	/* Be quiet in add and flush */
	int	do_pipe;	/* this cmd refers to a pipe/queue/sched */
	int	do_nat; 	/* this cmd refers to a nat config */
	int	do_dynamic;	/* display dynamic rules */
	int	do_expired;	/* display expired dynamic rules */
	int	do_compact;	/* show rules in compact mode */
	int	do_force;	/* do not ask for confirmation */
	int	show_sets;	/* display the set each rule belongs to */
	int	test_only;	/* only check syntax */
	int	comment_only;	/* only print action and comment */
	int	verbose;	/* be verbose on some commands */

	/* The options below can have multiple values. */

	int	do_sort;	/* field to sort results (0 = no) */
		/* valid fields are 1 and above */

	int	use_set;	/* work with specified set number */
		/* 0 means all sets, otherwise apply to set use_set - 1 */

};

/* The one global instance of the options; defined outside this header. */
extern struct cmdline_opts co;

/*
 * _s_x is a structure that stores a string <-> token pair, used in
 * various places in the parser. Entries are stored in arrays,
 * terminated by an entry with s == NULL.
 * The search routines are match_token() (string to value) and
 * match_value() (value to string).
 * Often, an element with x=0 contains an error string.
 */
struct _s_x {
	char const *s;	/* the keyword; NULL in the terminating entry */
	int x;		/* the value associated with the keyword */
};

/*
 * Identifiers for the keywords recognised by the parser.
 * TOK_NULL is explicitly 0; every other value is assigned implicitly in
 * declaration order, so inserting or reordering entries renumbers the
 * ones that follow.
 * NOTE(review): the keyword tables mapping strings to these values are
 * not in this header - presumably they are struct _s_x arrays in the
 * .c files; confirm there before relying on a particular grouping.
 */
enum tokens {
	TOK_NULL=0,

	TOK_OR,
	TOK_NOT,
	TOK_STARTBRACE,
	TOK_ENDBRACE,

	TOK_ACCEPT,
	TOK_COUNT,
	TOK_PIPE,
	TOK_LINK,
	TOK_QUEUE,
	TOK_FLOWSET,
	TOK_SCHED,
	TOK_DIVERT,
	TOK_TEE,
	TOK_NETGRAPH,
	TOK_NGTEE,
	TOK_FORWARD,
	TOK_SKIPTO,
	TOK_DENY,
	TOK_REJECT,
	TOK_RESET,
	TOK_UNREACH,
	TOK_CHECKSTATE,
	TOK_NAT,
	TOK_REASS,
	TOK_CALL,
	TOK_RETURN,

	TOK_ALTQ,
	TOK_LOG,
	TOK_TAG,
	TOK_UNTAG,

	TOK_TAGGED,
	TOK_UID,
	TOK_GID,
	TOK_JAIL,
	TOK_IN,
	TOK_LIMIT,
	TOK_KEEPSTATE,
	TOK_LAYER2,
	TOK_OUT,
	TOK_DIVERTED,
	TOK_DIVERTEDLOOPBACK,
	TOK_DIVERTEDOUTPUT,
	TOK_XMIT,
	TOK_RECV,
	TOK_VIA,
	TOK_FRAG,
	TOK_IPOPTS,
	TOK_IPLEN,
	TOK_IPID,
	TOK_IPPRECEDENCE,
	TOK_DSCP,
	TOK_IPTOS,
	TOK_IPTTL,
	TOK_IPVER,
	TOK_ESTAB,
	TOK_SETUP,
	TOK_TCPDATALEN,
	TOK_TCPFLAGS,
	TOK_TCPOPTS,
	TOK_TCPSEQ,
	TOK_TCPACK,
	TOK_TCPWIN,
	TOK_ICMPTYPES,
	TOK_MAC,
	TOK_MACTYPE,
	TOK_VERREVPATH,
	TOK_VERSRCREACH,
	TOK_ANTISPOOF,
	TOK_IPSEC,
	TOK_COMMENT,

	TOK_PLR,
	TOK_NOERROR,
	TOK_BUCKETS,
	TOK_DSTIP,
	TOK_SRCIP,
	TOK_DSTPORT,
	TOK_SRCPORT,
	TOK_ALL,
	TOK_MASK,
	TOK_FLOW_MASK,
	TOK_SCHED_MASK,
	TOK_BW,
	TOK_DELAY,
	TOK_PROFILE,
	TOK_BURST,
	TOK_RED,
	TOK_GRED,
	TOK_DROPTAIL,
	TOK_PROTO,
	/* dummynet tokens */
	TOK_WEIGHT,
	TOK_LMAX,
	TOK_PRI,
	TOK_TYPE,
	TOK_SLOTSIZE,

	TOK_IP,
	TOK_IF,
 	TOK_ALOG,
 	TOK_DENY_INC,
 	TOK_SAME_PORTS,
 	TOK_UNREG_ONLY,
	TOK_SKIP_GLOBAL,
 	TOK_RESET_ADDR,
 	TOK_ALIAS_REV,
 	TOK_PROXY_ONLY,
	TOK_REDIR_ADDR,
	TOK_REDIR_PORT,
	TOK_REDIR_PROTO,

	TOK_IPV6,
	TOK_FLOWID,
	TOK_ICMP6TYPES,
	TOK_EXT6HDR,
	TOK_DSTIP6,
	TOK_SRCIP6,

	TOK_IPV4,
	TOK_UNREACH6,
	TOK_RESET6,

	TOK_FIB,
	TOK_SETFIB,
	TOK_LOOKUP,
	TOK_SOCKARG,
};
/*
 * The following macros terminate the program with a usage error
 * (errx(EX_USAGE, ...)) if we run out of arguments.
 * NEED tests the expression given as _p; NEED1 tests *av, so it can
 * only be used where a variable named 'av' is in scope.
 * Both expand to a brace-enclosed block and pass msg to errx() as the
 * format string.
 */
#define NEED(_p, msg)      {if (!_p) errx(EX_USAGE, msg);}
#define NEED1(msg)      {if (!(*av)) errx(EX_USAGE, msg);}

int pr_u64(uint64_t *pd, int width);

/* memory allocation support */
void *safe_calloc(size_t number, size_t size);
void *safe_realloc(void *ptr, size_t size);

/* string comparison functions used for historical compatibility */
int _substrcmp(const char *str1, const char* str2);
int _substrcmp2(const char *str1, const char* str2, const char* str3);

/* utility functions: lookups in NULL-terminated struct _s_x arrays */
int match_token(struct _s_x *table, char *string);
char const *match_value(struct _s_x *p, int value);

int do_cmd(int optname, void *optval, uintptr_t optlen);

struct in6_addr;
void n2mask(struct in6_addr *mask, int n);
int contigmask(uint8_t *p, int len);

/*
 * Forward declarations to avoid including way too many headers.
 * C does not allow duplicated typedefs, so we use the base struct
 * that the typedef points to.
 * Should the typedefs use a different type, the compiler will
 * still detect the change when compiling the body of the
 * functions involved, so we do not lose error checking.
 */
struct _ipfw_insn;
struct _ipfw_insn_altq;
struct _ipfw_insn_u32;
struct _ipfw_insn_ip6;
struct _ipfw_insn_icmp6;

/*
 * The reserved set number. This is a constant in ip_fw.h
 * but we store it in a variable so other files do not depend
 * on that header just for one constant.
 */
extern int resvd_set_number;

/*
 * first-level command handlers
 * These are dispatched from ipfw_main() according to the first
 * word of the command.
 */
void ipfw_add(char *av[]);
void ipfw_show_nat(int ac, char **av);
void ipfw_config_pipe(int ac, char **av);
void ipfw_config_nat(int ac, char **av);
void ipfw_sets_handler(char *av[]);
void ipfw_table_handler(int ac, char *av[]);
/* ipfw_main() passes which = 1 for "enable" and which = 0 for "disable" */
void ipfw_sysctl_handler(char *av[], int which);
void ipfw_delete(char *av[]);
void ipfw_flush(int force);
/* ipfw_main() passes optname = 0 for "zero" and optname = 1 for "resetlog" */
void ipfw_zero(int ac, char *av[], int optname);
void ipfw_list(int ac, char *av[], int show_counters);

/* altq.c */
void altq_set_enabled(int enabled);
u_int32_t altq_name_to_qid(const char *name);

void print_altq_cmd(struct _ipfw_insn_altq *altqptr);

/* dummynet.c */
void dummynet_list(int ac, char *av[], int show_counters);
void dummynet_flush(void);
int ipfw_delete_pipe(int pipe_or_queue, int n);

/* ipv6.c: printing of IPv6-related instructions */
void print_unreach6_code(uint16_t code);
void print_ip6(struct _ipfw_insn_ip6 *cmd, char const *s);
void print_flow6id(struct _ipfw_insn_u32 *cmd);
void print_icmp6types(struct _ipfw_insn_u32 *cmd);
void print_ext6hdr(struct _ipfw_insn *cmd );

/* ipv6.c: build source/destination address instructions from a string */
struct _ipfw_insn *add_srcip6(struct _ipfw_insn *cmd, char *av);
struct _ipfw_insn *add_dstip6(struct _ipfw_insn *cmd, char *av);

/* ipv6.c: parse an argument string into an instruction */
void fill_flow6(struct _ipfw_insn_u32 *cmd, char *av );
void fill_unreach6_code(u_short *codep, char *str);
void fill_icmp6types(struct _ipfw_insn_icmp6 *cmd, char *av);
int fill_ext6hdr(struct _ipfw_insn *cmd, char *av);


================================================
FILE: ipfw/ipv6.c
================================================
/*
 * Copyright (c) 2002-2003 Luigi Rizzo
 * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp
 * Copyright (c) 1994 Ugen J.S.Antsilevich
 *
 * Idea and grammar partially left from:
 * Copyright (c) 1993 Daniel Boulet
 *
 * Redistribution and use in source forms, with and without modification,
 * are permitted provided that this entire comment appears intact.
 *
 * Redistribution in binary form may occur without any restrictions.
 * Obviously, it would be nice if you gave credit where credit is due
 * but requiring it would be too onerous.
 *
 * This software is provided ``AS IS'' without any warranties of any kind.
 *
 * NEW command line interface for IP firewall facility
 *
 * $FreeBSD: user/luigi/ipfw3-head/sbin/ipfw/ipv6.c 187770 2009-01-27 12:01:30Z luigi $
 *
 * ipv6 support
 */

#include <sys/types.h>
#include <sys/socket.h>

#include "ipfw2.h"

#include <err.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>

#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/icmp6.h>
#include <netinet/ip_fw.h>
#include <arpa/inet.h>

/*
 * Symbolic names for the ICMPv6 "destination unreachable" codes,
 * used both when parsing (fill_unreach6_code) and when printing
 * (print_unreach6_code). The list ends with the NULL entry.
 */
static struct _s_x icmp6codes[] = {
      { "no-route",		ICMP6_DST_UNREACH_NOROUTE },
      { "admin-prohib",		ICMP6_DST_UNREACH_ADMIN },
      { "address",		ICMP6_DST_UNREACH_ADDR },
      { "port",			ICMP6_DST_UNREACH_NOPORT },
      { NULL, 0 }
};

/*
 * Convert str into an ICMPv6 unreachable code and store it in *codep.
 * A string that is entirely a number below 0x100 is taken as is;
 * anything else is looked up by name in icmp6codes.
 * An unknown name is reported through errx().
 */
void
fill_unreach6_code(u_short *codep, char *str)
{
	char *end;
	int code;
	int numeric;

	code = strtoul(str, &end, 0);
	/* numeric only if something was parsed, nothing is left, and it fits */
	numeric = (end != str && *end == '\0' && code < 0x100);
	if (!numeric)
		code = match_token(icmp6codes, str);
	if (code < 0)
		errx(EX_DATAERR, "unknown ICMPv6 unreachable code ``%s''", str);
	*codep = code;
}

/*
 * Print "unreach6" followed by the symbolic name of the code when
 * icmp6codes has one, or by the number otherwise.
 */
void
print_unreach6_code(uint16_t code)
{
	char const *name;

	name = match_value(icmp6codes, code);
	if (name == NULL) {
		printf("unreach6 %u", code);
		return;
	}
	printf("unreach6 %s", name);
}

/*
 * Print the IPv6 address(es) contained in a command.
 * The output starts with an optional " not" and the keyword s supplied
 * by the caller, followed by "me6", " ip6", or a comma-separated list
 * of addresses. Each address is shown as a host name (only when
 * co.do_resolv is set and the entry is a full /128), as "any" for a
 * zero-length mask, or numerically with an optional /len or :mask.
 */
void
print_ip6(ipfw_insn_ip6 *cmd, char const *s)
{
	struct hostent *he;
	int len = F_LEN((ipfw_insn *) cmd) - 1;
	struct in6_addr *a = &(cmd->addr6);
	char trad[255];

	printf("%s%s ", cmd->o.len & F_NOT ? " not": "", s);

	if (cmd->o.opcode == O_IP6_SRC_ME || cmd->o.opcode == O_IP6_DST_ME) {
		printf("me6");
		return;
	}
	if (cmd->o.opcode == O_IP6) {
		printf(" ip6");
		return;
	}

	/*
	 * Each in6_addr accounts for 4 units of len: len == 4 is a single
	 * address, while a list stores an (address, mask) pair per entry.
	 * len / 4 is therefore the number of in6_addr present, and the
	 * loop consumes them two at a time.
	 */
	for (len = len / 4; len > 0; len -= 2, a += 2) {
		int mb =	/* mask length */
		    (cmd->o.opcode == O_IP6_SRC || cmd->o.opcode == O_IP6_DST) ?
		    128 : contigmask((uint8_t *)&(a[1]), 128);

		/*
		 * Start every entry without a name, otherwise the name
		 * resolved for a previous entry would be printed again.
		 */
		he = NULL;
		if (mb == 128 && co.do_resolv)
			he = gethostbyaddr((char *)a, sizeof(*a), AF_INET6);

		if (he != NULL)			/* resolved to name */
			printf("%s", he->h_name);
		else if (mb == 0)		/* any */
			printf("any");
		else {		/* numeric IP followed by some kind of mask */
			if (inet_ntop(AF_INET6, a, trad, sizeof(trad)) == NULL) {
				printf("Error ntop in print_ip6\n");
				trad[0] = '\0';	/* do not print garbage */
			}
			printf("%s", trad);
			if (mb < 0) {	/* XXX not really legal... */
				/* non contiguous mask, print it in full */
				if (inet_ntop(AF_INET6, &a[1], trad,
				    sizeof(trad)) == NULL)
					trad[0] = '\0';
				printf(":%s", trad);
			} else if (mb < 128)
				printf("/%d", mb);
		}
		if (len > 2)
			printf(",");
	}
}

/*
 * Build an O_ICMP6TYPE instruction from a comma-separated list of
 * numeric ICMPv6 types. cmd->d[] is used as a bitmap, one bit per type.
 * Malformed, empty or out of range elements are reported through errx().
 */
void
fill_icmp6types(ipfw_insn_icmp6 *cmd, char *av)
{
	bzero(cmd, sizeof(*cmd));
	while (*av) {
		unsigned long type;	/* wide, so the range check sees the real value */
		char *end;

		if (*av == ',')
			av++;
		type = strtoul(av, &end, 0);
		/* reject trailing junk and elements with no digits at all */
		if (end == av || (*end != ',' && *end != '\0'))
			errx(EX_DATAERR, "invalid ICMP6 type");
		/*
		 * XXX: shouldn't this be 0xFF?  I can't see any reason why
		 * we shouldn't be able to filter all possible values
		 * regardless of the ability of the rest of the kernel to do
		 * anything useful with them.
		 */
		if (type > ICMP6_MAXTYPE)
			errx(EX_DATAERR, "ICMP6 type out of range");
		/* unsigned constant: shifting a signed 1 by 31 is undefined */
		cmd->d[type / 32] |= (1U << (type % 32));
		av = end;
	}
	cmd->o.opcode = O_ICMP6TYPE;
	cmd->o.len |= F_INSN_SIZE(ipfw_insn_icmp6);
}


/*
 * Print the ICMPv6 types selected in the bitmap held in cmd->d[]:
 * seven 32-bit words are scanned and the index of every bit that is
 * set is printed, the first after a blank and the others after a comma.
 */
void
print_icmp6types(ipfw_insn_u32 *cmd)
{
	int i, j;
	char sep= ' ';

	printf(" ip6 icmp6types");
	for (i = 0; i < 7; i++)
		for (j=0; j < 32; ++j) {
			/* unsigned constant: 1 << 31 overflows a signed int */
			if ( (cmd->d[i] & (1U << (j))) == 0)
				continue;
			printf("%c%d", sep, (i*32 + j));
			sep = ',';
		}
}

/*
 * Print the list of flow-ids stored in cmd->d[]; cmd->o.arg1 holds
 * how many there are. Items are separated by commas and the last one
 * is followed by a blank.
 */
void
print_flow6id( ipfw_insn_u32 *cmd)
{
	uint16_t count = cmd->o.arg1;
	uint16_t idx;

	printf(" flow-id ");
	for (idx = 0; idx < count; idx++)
		printf("%d%c", cmd->d[idx], (idx == count - 1) ? ' ' : ',');
}

/*
 * Names of the IPv6 extension headers accepted by fill_ext6hdr(),
 * each paired with the EXT_* flag that gets OR-ed into the instruction.
 * The list ends with the NULL entry.
 */
static struct _s_x ext6hdrcodes[] = {
       { "frag",       EXT_FRAGMENT },
       { "hopopt",     EXT_HOPOPTS },
       { "route",      EXT_ROUTING },
       { "dstopt",     EXT_DSTOPTS },
       { "ah",		EXT_AH },
       { "esp",        EXT_ESP },
       { "rthdr0",     EXT_RTHDR0 },
       { "rthdr2",     EXT_RTHDR2 },
       { NULL,         0 }
};

/*
 * Build an O_EXT_HDR instruction from av, a comma-separated list of
 * extension header names (see ext6hdrcodes). The flags of all the
 * names are accumulated in cmd->arg1; av is split in place.
 * Returns 1 if the instruction was filled, 0 if no flag ended up set.
 * A name that is not recognised is reported through errx().
 */
int
fill_ext6hdr( ipfw_insn *cmd, char *av)
{
	char *rest, *name;

	cmd->arg1 = 0;

	for (rest = av; rest != NULL; ) {
		int flag;

		name = strsep(&rest, ",");
		flag = match_token(ext6hdrcodes, name);
		switch (flag) {
		case EXT_FRAGMENT:
		case EXT_HOPOPTS:
		case EXT_ROUTING:
		case EXT_DSTOPTS:
		case EXT_AH:
		case EXT_ESP:
		case EXT_RTHDR0:
		case EXT_RTHDR2:
			/* a known header: its code is the flag to set */
			cmd->arg1 |= flag;
			break;

		default:
			errx( EX_DATAERR, "invalid option for ipv6 exten header" );
			break;
		}
	}

	if (cmd->arg1 == 0)
		return 0;
	cmd->opcode = O_EXT_HDR;
	cmd->len |= F_INSN_SIZE( ipfw_insn );
	return 1;
}

/*
 * Print the extension headers selected in cmd->arg1, in a fixed order,
 * the first after a blank and the following ones after a comma.
 */
void
print_ext6hdr( ipfw_insn *cmd )
{
	/* flag/description pairs, in the order they are printed */
	static const struct {
		int		flag;
		const char	*descr;
	} hdrs[] = {
		{ EXT_FRAGMENT,	"fragmentation" },
		{ EXT_HOPOPTS,	"hop options" },
		{ EXT_ROUTING,	"routing options" },
		{ EXT_RTHDR0,	"rthdr0" },
		{ EXT_RTHDR2,	"rthdr2" },
		{ EXT_DSTOPTS,	"destination options" },
		{ EXT_AH,	"authentication header" },
		{ EXT_ESP,	"encapsulated security payload" },
	};
	char sep = ' ';
	unsigned int i;

	printf(" extension header:");
	for (i = 0; i < sizeof(hdrs) / sizeof(hdrs[0]); i++) {
		if ((cmd->arg1 & hdrs[i].flag) == 0)
			continue;
		printf("%c%s", sep, hdrs[i].descr);
		sep = ',';
	}
}

/*
 * Try to find the IPv6 address for host, which can be either a
 * numeric address or a host name, and store it in *ip6addr.
 * Returns 0 on success, -1 if the name cannot be resolved.
 */
static int
lookup_host6 (char *host, struct in6_addr *ip6addr)
{
	struct hostent *he;

	/*
	 * inet_pton() returns 1 only when it has stored an address; both
	 * 0 (not a numeric address) and -1 (error) leave *ip6addr unset,
	 * so in either case fall back to a name lookup.
	 */
	if (inet_pton(AF_INET6, host, ip6addr) != 1) {
		if ((he = gethostbyname2(host, AF_INET6)) == NULL)
			return(-1);
		memcpy(ip6addr, he->h_addr_list[0], sizeof( struct in6_addr));
	}
	return(0);
}


/*
 * Fill the addr and mask fields in the instruction as appropriate from av.
 * Update length as appropriate.
 * The following formats are allowed:
 *     any     matches any IP6. Actually leaves an empty instruction.
 *     me, me6 leave an instruction of size F_INSN_SIZE(ipfw_insn);
 *             the caller selects the O_IP6_*_ME opcode.
 *
 *     03f1::234:123:0342                single IP6 address
 *     03f1::234:123:0342/24            address/mask
 *     03f1::234:123:0342/24,03f1::234:123:0343/               List of addresses
 *
 * Address sets (the ip/bits{x,y,z} form accepted for IPv4) are not
 * supported because ipv6 addresses are typically random past the
 * initial prefix.
 * Always returns 1; invalid input terminates the program through errx().
 * The string passed in av is not modified: parsing works on a copy.
 */
static int
fill_ip6(ipfw_insn_ip6 *cmd, char *av)
{
	int len = 0;
	/*
	 * Needed for multiple addresses.
	 * Note d[1] points to the struct in6_addr mask6 of cmd.
	 */
	struct in6_addr *d = &(cmd->addr6);
	char *buf;	/* start of the private copy of av, for free() */

	cmd->o.len &= ~F_LEN_MASK;	/* zero len */

	if (strcmp(av, "any") == 0)
		return (1);

	/* "me" and "me6" are handled in the same way */
	if (strcmp(av, "me") == 0 || strcmp(av, "me6") == 0) {
		cmd->o.len |= F_INSN_SIZE(ipfw_insn);
		return (1);
	}

	/*
	 * av is advanced while parsing and is NULL at the end of the loop,
	 * so remember where the copy starts to be able to release it.
	 */
	buf = strdup(av);
	if (buf == NULL)
		err(EX_OSERR, "strdup");
	av = buf;
	while (av) {
		/*
		 * After the address we can have '/' indicating a mask,
		 * or ',' indicating another address follows.
		 */

		char *p;
		int masklen;
		char md = '\0';

		if ((p = strpbrk(av, "/,")) ) {
			md = *p;	/* save the separator */
			*p = '\0';	/* terminate address string */
			p++;		/* and skip past it */
		}
		/* now p points to NULL, mask or next entry */

		/* lookup stores address in *d as a side effect */
		if (lookup_host6(av, d) != 0)
			errx(EX_DATAERR, "bad address \"%s\"", av);

		/* next, look at the mask, if any */
		masklen = (md == '/') ? atoi(p) : 128;
		if (masklen > 128 || masklen < 0)
			errx(EX_DATAERR, "bad width \"%s\''", p);
		n2mask(&d[1], masklen);

		APPLY_MASK(d, &d[1])   /* mask base address with mask */

		/* find next separator */

		if (md == '/') {	/* find separator past the mask */
			p = strpbrk(p, ",");
			if (p != NULL)
				p++;
		}
		av = p;

		/* Check this entry */
		if (masklen == 0) {
			/*
			 * A zero-length mask ('any') is not stored: the
			 * entry is skipped and d is reused for the next one.
			 * 'not any' never matches, so if it is the only
			 * item we report an error.
			 */
			if (cmd->o.len & F_NOT && av == NULL && len == 0)
				errx(EX_DATAERR, "not any never matches");
			continue;
		}

		/*
		 * A single IP can be stored alone
		 */
		if (masklen == 128 && av == NULL && len == 0) {
			len = F_INSN_SIZE(struct in6_addr);
			break;
		}

		/* Update length and pointer to arguments */
		len += F_INSN_SIZE(struct in6_addr)*2;
		d += 2;
	} /* end while */

	/*
	 * Total length of the command, remember that 1 is the size of
	 * the base command.
	 */
	if (len + 1 > F_LEN_MASK)
		errx(EX_DATAERR, "address list too long");
	cmd->o.len |= len+1;
	free(buf);
	return (1);
}

/*
 * Fills the command for ipv6 flow-id filtering.
 * av is a comma-separated list of flow numbers, each at most 0xfffff
 * (20 bits); they are stored in order in the cmd->d[] array of
 * u_int32_t and their count is stored in o.arg1.
 * av is split in place. Malformed, empty or out of range elements are
 * reported through errx().
 * NOTE(review): nothing here bounds the number of entries against the
 * space available after cmd - presumably the caller's buffer is large
 * enough; confirm at the call site.
 */
void
fill_flow6( ipfw_insn_u32 *cmd, char *av )
{
	u_int16_t nflow = 0;	/* number of flow-ids stored so far */
	char *s = av;		/* part of the list still to parse */

	while (s) {
		char *tok = strsep(&s, ",");	/* current element */
		char *end;
		/* wide, so the range check sees the real value */
		unsigned long flow = strtoul(tok, &end, 0);

		/* the element must be a number, with nothing after it */
		if (end == tok || *end != '\0')
			errx(EX_DATAERR, "invalid ipv6 flow number %s", tok);
		if (flow > 0xfffff)
			errx(EX_DATAERR, "flow number out of range %s", tok);
		/* plain store: slots past d[0] are not initialized here */
		cmd->d[nflow] = flow;
		nflow++;
	}
	if (nflow == 0)		/* only possible when av is NULL */
		errx(EX_DATAERR, "invalid ipv6 flow number %s", av ? av : "");

	cmd->o.opcode = O_FLOW6ID;
	cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + nflow;
	cmd->o.arg1 = nflow;
}

/*
 * Parse av as an IPv6 source address specification into cmd and
 * choose the opcode from the length that fill_ip6() left in it.
 * For "any" (length 0) the opcode is not touched. Returns cmd.
 */
ipfw_insn *
add_srcip6(ipfw_insn *cmd, char *av)
{
	int insn_len;

	fill_ip6((ipfw_insn_ip6 *)cmd, av);
	insn_len = F_LEN(cmd);

	if (insn_len == 0)			/* any */
		return cmd;

	if (insn_len == F_INSN_SIZE(ipfw_insn))	/* "me" */
		cmd->opcode = O_IP6_SRC_ME;
	else if (insn_len ==
	    (F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn)))
		cmd->opcode = O_IP6_SRC;	/* single IP, no mask */
	else
		cmd->opcode = O_IP6_SRC_MASK;	/* addr/mask opt */

	return cmd;
}

/*
 * Parse av as an IPv6 destination address specification into cmd and
 * choose the opcode from the length that fill_ip6() left in it.
 * For "any" (length 0) the opcode is not touched. Returns cmd.
 */
ipfw_insn *
add_dstip6(ipfw_insn *cmd, char *av)
{
	int insn_len;

	fill_ip6((ipfw_insn_ip6 *)cmd, av);
	insn_len = F_LEN(cmd);

	if (insn_len == 0)			/* any */
		return cmd;

	if (insn_len == F_INSN_SIZE(ipfw_insn))	/* "me" */
		cmd->opcode = O_IP6_DST_ME;
	else if (insn_len ==
	    (F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn)))
		cmd->opcode = O_IP6_DST;	/* single IP, no mask */
	else
		cmd->opcode = O_IP6_DST_MASK;	/* addr/mask opt */

	return cmd;
}


================================================
FILE: ipfw/main.c
================================================
/*
 * Copyright (c) 2002-2003,2010 Luigi Rizzo
 * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp
 * Copyright (c) 1994 Ugen J.S.Antsilevich
 *
 * Idea and grammar partially left from:
 * Copyright (c) 1993 Daniel Boulet
 *
 * Redistribution and use in source forms, with and without modification,
 * are permitted provided that this entire comment appears intact.
 *
 * Redistribution in binary form may occur without any restrictions.
 * Obviously, it would be nice if you gave credit where credit is due
 * but requiring it would be too onerous.
 *
 * This software is provided ``AS IS'' without any warranties of any kind.
 *
 * Command line interface for IP firewall facility
 *
 * $FreeBSD: head/sbin/ipfw/main.c 206494 2010-04-12 08:27:53Z luigi $
 */

#include <sys/wait.h>
#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>
#include <unistd.h>

#include "ipfw2.h"

/*
 * Print the syntax summary on stderr and terminate the program with
 * exit status 0. This function does not return.
 */
static void
help(void)
{
	fprintf(stderr,
"ipfw syntax summary (but please do read the ipfw(8) manpage):\n\n"
"\tipfw [-abcdefhnNqStTv] <command>\n\n"
"where <command> is one of the following:\n\n"
"add [num] [set N] [prob x] RULE-BODY\n"
"{pipe|queue} N config PIPE-BODY\n"
"[pipe|queue] {zero|delete|show} [N{,N}]\n"
"nat N config {ip IPADDR|if IFNAME|log|deny_in|same_ports|unreg_only|reset|\n"
"		reverse|proxy_only|redirect_addr linkspec|\n"
"		redirect_port linkspec|redirect_proto linkspec}\n"
"set [disable N... enable N...] | move [rule] X to Y | swap X Y | show\n"
"set N {show|list|zero|resetlog|delete} [N{,N}] | flush\n"
"table N {add ip[/bits] [value] | delete ip[/bits] | flush | list}\n"
"table all {flush | list}\n"
"\n"
"RULE-BODY:	check-state [PARAMS] | ACTION [PARAMS] ADDR [OPTION_LIST]\n"
"ACTION:	check-state | allow | count | deny | unreach{,6} CODE |\n"
"               skipto N | {divert|tee} PORT | forward ADDR |\n"
"               pipe N | queue N | nat N | setfib FIB | reass\n"
"PARAMS: 	[log [logamount LOGLIMIT]] [altq QUEUE_NAME]\n"
"ADDR:		[ MAC dst src ether_type ] \n"
"		[ ip from IPADDR [ PORT ] to IPADDR [ PORTLIST ] ]\n"
"		[ ipv6|ip6 from IP6ADDR [ PORT ] to IP6ADDR [ PORTLIST ] ]\n"
"IPADDR:	[not] { any | me | ip/bits{x,y,z} | table(t[,v]) | IPLIST }\n"
"IP6ADDR:	[not] { any | me | me6 | ip6/bits | IP6LIST }\n"
"IP6LIST:	{ ip6 | ip6/bits }[,IP6LIST]\n"
"IPLIST:	{ ip | ip/bits | ip:mask }[,IPLIST]\n"
"OPTION_LIST:	OPTION [OPTION_LIST]\n"
"OPTION:	bridged | diverted | diverted-loopback | diverted-output |\n"
"	{dst-ip|src-ip} IPADDR | {dst-ip6|src-ip6|dst-ipv6|src-ipv6} IP6ADDR |\n"
"	{dst-port|src-port} LIST |\n"
"	estab | frag | {gid|uid} N | icmptypes LIST | in | out | ipid LIST |\n"
"	iplen LIST | ipoptions SPEC | ipprecedence | ipsec | iptos SPEC |\n"
"	ipttl LIST | ipversion VER | keep-state | layer2 | limit ... |\n"
"	icmp6types LIST | ext6hdr LIST | flow-id N[,N] | fib FIB |\n"
"	mac ... | mac-type LIST | proto LIST | {recv|xmit|via} {IF|IPADDR} |\n"
"	setup | {tcpack|tcpseq|tcpwin} NN | tcpflags SPEC | tcpoptions SPEC |\n"
"	tcpdatalen LIST | verrevpath | versrcreach | antispoof\n"
);

	/* asking for help is not an error, hence the 0 exit status */
	exit(0);
}

/*
 * Called with the arguments, including program name because getopt
 * wants it to be present.
 * Returns 0 if successful, 1 if the command is empty or an unknown
 * option is found; other errors terminate the program through errx().
 * First thing we do is process parameters creating an argv[] array
 * which includes the program name and a NULL entry at the end.
 * If we are called with a single string, we split it on whitespace.
 * Also, arguments with a trailing ',' are joined to the next one.
 * The pointers (av[]) and data are in a single chunk of memory.
 * av[0] points to the original program name, all other entries
 * point into the allocated chunk.
 */
static int
ipfw_main(int oldac, char **oldav)
{
	int ch, ac;
	const char *errstr;
	char **av, **save_av;
	int do_acct = 0;		/* Show packet/byte count */
	int try_next = 0;		/* set if pipe cmd not found */
	int av_size;			/* compute the av size */
	char *av_p;			/* used to build the av list */

#define WHITESP		" \t\f\v\n\r"
	if (oldac < 2)
		return 1;	/* need at least one argument */

	if (oldac == 2) {
		/*
		 * If we are called with one argument, try to split it into
		 * words for subsequent parsing. Spaces after a ',' are
		 * removed by copying the string in-place.
		 */
		char *arg = oldav[1];	/* The string is the first arg. */
		int l = strlen(arg);
		int copy = 0;		/* 1 if we need to copy, 0 otherwise */
		int i, j;

		for (i = j = 0; i < l; i++) {
			if (arg[i] == '#')	/* comment marker */
				break;
			if (copy) {
				arg[j++] = arg[i];
				copy = !strchr("," WHITESP, arg[i]);
			} else {
				copy = !strchr(WHITESP, arg[i]);
				if (copy)
					arg[j++] = arg[i];
			}
		}
		if (!copy && j > 0)	/* last char was a 'blank', remove it */
			j--;
		l = j;			/* the new argument length */
		arg[j++] = '\0';
		if (l == 0)		/* empty string! */
			return 1;

		/*
		 * First, count number of arguments. Because of the previous
		 * processing, this is just the number of blanks plus 1.
		 */
		for (i = 0, ac = 1; i < l; i++)
			if (strchr(WHITESP, arg[i]) != NULL)
				ac++;

		/*
		 * Allocate the argument list structure as a single block
		 * of memory, containing pointers and the argument
		 * strings. We include one entry for the program name
		 * because getopt expects it, and a NULL at the end
		 * to simplify further parsing.
		 */
		ac++;		/* add 1 for the program name */
		av_size = (ac+1) * sizeof(char *) + l + 1;
		av = safe_calloc(av_size, 1);

		/*
		 * Init the argument pointer to the end of the array
		 * and copy arguments from arg[] to av[]. For each one,
		 * j is the initial character, i is the one past the end.
		 */
		av_p = (char *)&av[ac+1];
		for (ac = 1, i = j = 0; i < l; i++) {
			if (strchr(WHITESP, arg[i]) != NULL || i == l-1) {
				if (i == l-1)
					i++;
				bcopy(arg+j, av_p, i-j);
				av[ac] = av_p;
				av_p += i-j;	/* the length of the string */
				*av_p++ = '\0';
				ac++;
				j = i + 1;
			}
		}
	} else {
		/*
		 * If an argument ends with ',' join with the next one.
		 */
		int first, i, l=0;

		/*
		 * Allocate the argument list structure as a single block
		 * of memory, containing both pointers and the argument
		 * strings. We include some space for the program name
		 * because getopt expects it.
		 * We add an extra pointer to the end of the array,
		 * to make simpler further parsing.
		 */
		for (i=0; i<oldac; i++)
			l += strlen(oldav[i]);

		av_size = (oldac+1) * sizeof(char *) + l + oldac;
		av = safe_calloc(av_size, 1);

		/*
		 * Init the argument pointer to the end of the array
		 * and copy arguments from oldav[] to av[].
		 * 'first' is the first old argument not copied yet.
		 */
		av_p = (char *)&av[oldac+1];
		for (first = i = ac = 1; i < oldac; i++) {
			char *arg = oldav[i];
			int k = strlen(arg);

			/*
			 * An empty argument (k == 0) has no last character:
			 * it cannot end with ',' so it closes the group.
			 */
			if (k == 0 || arg[k-1] != ',' || i == oldac-1) {
				/* Time to copy. */
				av[ac] = av_p;
				for (; first <= i; first++) {
					/* the block is zeroed, strcat appends */
					strcat(av_p, oldav[first]);
					av_p += strlen(oldav[first]);
				}
				*av_p++ = '\0';
				ac++;
				first = i+1;
			}
		}
	}

	/*
	 * set the progname pointer to the original string
	 * and terminate the array with null
	 */
	av[0] = oldav[0];
	av[ac] = NULL;

	/* Set the force flag for non-interactive processes */
	if (!co.do_force)
		co.do_force = !isatty(STDIN_FILENO);

#ifdef EMULATE_SYSCTL /* sysctl emulation */
	if ( ac >= 2 && !strcmp(av[1], "sysctl")) {
		char *s;
		int i;

		if (ac != 3) {
			printf(	"sysctl emulation usage:\n"
				"	ipfw sysctl name[=value]\n"
				"	ipfw sysctl -a\n");
			free(av);
			return 0;
		}
		s = strchr(av[2], '=');
		if (s == NULL) {
			s = !strcmp(av[2], "-a") ? NULL : av[2];
			sysctlbyname(s, NULL, NULL, NULL, 0);
		} else {	/* ipfw sysctl x.y.z=value */
			/* assume an INT value, will extend later */
			if (s[1] == '\0') {
				printf("ipfw sysctl: missing value\n\n");
				free(av);
				return 0;
			}
			*s = '\0';
			i = strtol(s+1, NULL, 0);
			sysctlbyname(av[2], NULL, NULL, &i, sizeof(int));
		}
		free(av);
		return 0;
	}
#endif

	/* Save arguments for final freeing of memory. */
	save_av = av;

	optind = optreset = 1;	/* restart getopt() */
	while ((ch = getopt(ac, av, "abcdefhinNp:qs:STtv")) != -1)
		switch (ch) {
		case 'a':
			do_acct = 1;
			break;

		case 'b':
			co.comment_only = 1;
			co.do_compact = 1;
			break;

		case 'c':
			co.do_compact = 1;
			break;

		case 'd':
			co.do_dynamic = 1;
			break;

		case 'e':
			co.do_expired = 1;
			break;

		case 'f':
			co.do_force = 1;
			break;

		case 'h': /* help */
			free(save_av);
			help();
			break;	/* NOTREACHED */

		case 'i':
			co.do_value_as_ip = 1;
			break;

		case 'n':
			co.test_only = 1;
			break;

		case 'N':
			co.do_resolv = 1;
			break;

		case 'q':
			co.do_quiet = 1;
			break;

		case 'p':
			errx(EX_USAGE, "An absolute pathname must be used "
			    "with -p option.");
			/* NOTREACHED */

		case 's': /* sort */
			co.do_sort = atoi(optarg);
			break;

		case 'S':
			co.show_sets = 1;
			break;

		case 't':
			co.do_time = 1;
			break;

		case 'T':
			co.do_time = 2;	/* numeric timestamp */
			break;

		case 'v': /* verbose */
			co.verbose = 1;
			break;

		default:
			free(save_av);
			return 1;
		}

	ac -= optind;
	av += optind;
	NEED1("bad arguments, for usage summary ``ipfw''");

	/*
	 * An undocumented behaviour of ipfw1 was to allow rule numbers first,
	 * e.g. "100 add allow ..." instead of "add 100 allow ...".
	 * In case, swap first and second argument to get the normal form.
	 * (The cast is needed because isdigit() is only defined for
	 * values representable as unsigned char, and for EOF.)
	 */
	if (ac > 1 && isdigit((unsigned char)*av[0])) {
		char *p = av[0];

		av[0] = av[1];
		av[1] = p;
	}

	/*
	 * Optional: pipe, queue or nat.
	 */
	co.do_nat = 0;
	co.do_pipe = 0;
	co.use_set = 0;
	if (!strncmp(*av, "nat", strlen(*av)))
	        co.do_nat = 1;
	else if (!strncmp(*av, "pipe", strlen(*av)))
		co.do_pipe = 1;
	else if (_substrcmp(*av, "queue") == 0)
		co.do_pipe = 2;
	else if (_substrcmp(*av, "flowset") == 0)
		co.do_pipe = 2;
	else if (_substrcmp(*av, "sched") == 0)
		co.do_pipe = 3;
	else if (!strncmp(*av, "set", strlen(*av))) {
		if (ac > 1 && isdigit((unsigned char)av[1][0])) {
			co.use_set = strtonum(av[1], 0, resvd_set_number,
					&errstr);
			if (errstr)
				errx(EX_DATAERR,
				    "invalid set number %s\n", av[1]);
			/* use_set is stored as set number + 1, 0 is 'all' */
			ac -= 2; av += 2; co.use_set++;
		}
	}

	if (co.do_pipe || co.do_nat) {
		ac--;
		av++;
	}
	NEED1("missing command");

	/*
	 * For pipes, queues and nats we normally say 'nat|pipe NN config'
	 * but the code is easier to parse as 'nat|pipe config NN'
	 * so we swap the two arguments.
	 */
	if ((co.do_pipe || co.do_nat) && ac > 1 &&
	    isdigit((unsigned char)*av[0])) {
		char *p = av[0];

		av[0] = av[1];
		av[1] = p;
	}

	if (co.use_set == 0) {
		if (_substrcmp(*av, "add") == 0)
			ipfw_add(av);
		else if (co.do_nat && _substrcmp(*av, "show") == 0)
			ipfw_show_nat(ac, av);
		else if (co.do_pipe && _substrcmp(*av, "config") == 0)
			ipfw_config_pipe(ac, av);
		else if (co.do_nat && _substrcmp(*av, "config") == 0)
			ipfw_config_nat(ac, av);
		else if (_substrcmp(*av, "set") == 0)
			ipfw_sets_handler(av);
		else if (_substrcmp(*av, "table") == 0)
			ipfw_table_handler(ac, av);
		else if (_substrcmp(*av, "enable") == 0)
			ipfw_sysctl_handler(av, 1);
		else if (_substrcmp(*av, "disable") == 0)
			ipfw_sysctl_handler(av, 0);
		else
			try_next = 1;
	}

	if (co.use_set || try_next) {
		if (_substrcmp(*av, "delete") == 0)
			ipfw_delete(av);
		else if (_substrcmp(*av, "flush") == 0)
			ipfw_flush(co.do_force);
		else if (_substrcmp(*av, "zero") == 0)
			ipfw_zero(ac, av, 0 /* IP_FW_ZERO */);
		else if (_substrcmp(*av, "resetlog") == 0)
			ipfw_zero(ac, av, 1 /* IP_FW_RESETLOG */);
		else if (_substrcmp(*av, "print") == 0 ||
		         _substrcmp(*av, "list") == 0)
			ipfw_list(ac, av, do_acct);
		else if (_substrcmp(*av, "show") == 0)
			ipfw_list(ac, av, 1 /* show counters */);
		else
			errx(EX_USAGE, "bad command `%s'", *av);
	}

	/* Free memory allocated in the argument parsing. */
	free(save_av);
	return 0;
}


/*
 * Read ipfw commands from a file, one command per line, and hand each
 * line to ipfw_main().
 *
 * ac, av: the program's command line.  The last argument (av[ac-1]) is
 *	the file to read.  Options c, f, N, n, q and S set the matching
 *	flags in the global `co'; "-p cmd [args]" runs `cmd' as a
 *	preprocessor with the file on its stdin, and the commands are
 *	then read from the preprocessor's stdout.
 *
 * Errors are reported through err()/errx(), which terminate the program.
 */
static void
ipfw_readfile(int ac, char *av[])
{
#define MAX_ARGS	32
	char buf[4096];
	/*
	 * Buffer handed to setprogname() below.  It lives at function scope
	 * (not inside the read loop) because setprogname() may keep the
	 * pointer, and err()/errx() are still called after the loop ends.
	 */
	char linename[20];
	char *progname = av[0];		/* original program name */
	const char *cmd = NULL;		/* preprocessor name, if any */
	const char *filename = av[ac-1]; /* file to read */
	int	c, lineno=0;
	FILE	*f = NULL;
	pid_t	preproc = 0;

	while ((c = getopt(ac, av, "cfNnp:qS")) != -1) {
		switch(c) {
		case 'c':
			co.do_compact = 1;
			break;

		case 'f':
			co.do_force = 1;
			break;

		case 'N':
			co.do_resolv = 1;
			break;

		case 'n':
			co.test_only = 1;
			break;

		case 'p':
			/*
			 * ipfw -p cmd [args] filename
			 *
			 * We are done with getopt(). All arguments
			 * except the filename go to the preprocessor,
			 * so we need to do the following:
			 * - check that a filename is actually present;
			 * - advance av by optind-1 to skip arguments
			 *   already processed;
			 * - decrease ac by optind, to remove the args
			 *   already processed and the final filename;
			 * - set the last entry in av[] to NULL so
			 *   execvp() can detect the end of the array;
			 * - set optind=ac to let getopt() terminate.
			 */
			if (optind == ac)
				errx(EX_USAGE, "no filename argument");
			cmd = optarg;
			av[ac-1] = NULL;
			av += optind - 1;
			ac -= optind;
			optind = ac;
			break;

		case 'q':
			co.do_quiet = 1;
			break;

		case 'S':
			co.show_sets = 1;
			break;

		default:
			errx(EX_USAGE, "bad arguments, for usage"
			     " summary ``ipfw''");
		}

	}

	/* Without a preprocessor the filename must be the only non-option. */
	if (cmd == NULL && ac != optind + 1)
		errx(EX_USAGE, "extraneous filename arguments %s", av[ac-1]);

	if ((f = fopen(filename, "r")) == NULL)
		err(EX_UNAVAILABLE, "fopen: %s", filename);

	if (cmd != NULL) {			/* pipe through preprocessor */
		int pipedes[2];

		if (pipe(pipedes) == -1)
			err(EX_OSERR, "cannot create pipe");

		preproc = fork();
		if (preproc == -1)
			err(EX_OSERR, "cannot fork");

		if (preproc == 0) {
			/*
			 * Child, will run the preprocessor with the
			 * file on stdin and the pipe on stdout.
			 */
			if (dup2(fileno(f), 0) == -1
			    || dup2(pipedes[1], 1) == -1)
				err(EX_OSERR, "dup2()");
			fclose(f);
			close(pipedes[1]);
			close(pipedes[0]);
			execvp(cmd, av);
			/* Only reached when execvp() failed. */
			err(EX_OSERR, "execvp(%s) failed", cmd);
		} else { /* parent, will reopen f as the pipe */
			fclose(f);
			close(pipedes[1]);
			if ((f = fdopen(pipedes[0], "r")) == NULL) {
				/* kill() may clobber errno; keep fdopen()'s. */
				int savederrno = errno;

				(void)kill(preproc, SIGTERM);
				errno = savederrno;
				err(EX_OSERR, "fdopen()");
			}
		}
	}

	while (fgets(buf, sizeof(buf), f)) {		/* read commands */
		char *args[2];

		lineno++;
		/* Diagnostics for this command are prefixed with "Line N". */
		snprintf(linename, sizeof(linename), "Line %d", lineno);
		setprogname(linename); /* XXX */
		args[0] = progname;
		args[1] = buf;
		ipfw_main(2, args);
	}
	fclose(f);
	if (cmd != NULL) {
		int status;

		/* err(), not errx(): report why waitpid() failed. */
		if (waitpid(preproc, &status, 0) == -1)
			err(EX_OSERR, "waitpid()");
		if (WIFEXITED(status) && WEXITSTATUS(status) != EX_OK)
			errx(EX_UNAVAILABLE,
			    "preprocessor exited with status %d",
			    WEXITSTATUS(status));
		else if (WIFSIGNALED(status))
			errx(EX_UNAVAILABLE,
			    "preprocessor exited with signal %d",
			    WTERMSIG(status));
	}
}

/*
 * Program entry point.
 *
 * When the last argument starts with '/', it is taken as a file of
 * commands and passed to ipfw_readfile() (after checking that it is
 * readable); otherwise the whole command line goes to ipfw_main().
 * Returns EX_OK unless one of the err()/errx() calls terminates first.
 */
int
main(int ac, char *av[])
{
	const char *last;

#if defined(_WIN32) && defined(TCC)
	{
		WSADATA wsaData;
		int ret = WSAStartup(MAKEWORD(2, 2), &wsaData);

		if (ret != 0) {
			/* No usable Winsock DLL could be found. */
			printf("WSAStartup failed with error: %d\n", ret);
			return 1;
		}
	}
#endif
	last = (ac > 1) ? av[ac - 1] : NULL;

	/* Ordinary invocation: the arguments are a single command. */
	if (last == NULL || last[0] != '/') {
		if (ipfw_main(ac, av) != 0)
			errx(EX_USAGE,
			    "usage: ipfw [options]\n"
			    "do \"ipfw -h\" or \"man ipfw\" for details");
		return EX_OK;
	}

	/* Absolute pathname: it must be readable, then it is processed. */
	if (access(last, R_OK) != 0)
		err(EX_USAGE, "pathname: %s", last);
	ipfw_readfile(ac, av);
	return EX_OK;
}


================================================
FILE: ipfw/qsort.c
================================================
/*-
 * Copyright (c) 1992, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#if defined(LIBC_SCCS) && !defined(lint)
static char sccsid[] = "@(#)qsort.c	8.1 (Berkeley) 6/4/93";
#endif /* LIBC_SCCS and not lint */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: src/lib/libc/stdlib/qsort.c,v 1.15 2008/01/14 09:21:34 das Exp $");

#include <stdlib.h>

#ifdef I_AM_QSORT_R
typedef int		 cmp_t(void *, const void *, const void *);
#else
typedef int		 cmp_t(const void *, const void *);
#endif
static inline char	*med3(char *, char *, char *, cmp_t *, void *);
static inline void	 swapfunc(char *, char *, int, int);

/*
 * Smaller of a and b.  The whole expansion and both arguments are
 * parenthesized so the macro can be used inside larger expressions.
 * Each argument may be evaluated twice: do not pass expressions with
 * side effects.
 */
#define min(a, b)	(((a) < (b)) ? (a) : (b))

/*
 * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
 */
/*
 * Exchange n bytes between parmi and parmj, moving one TYPE at a time.
 * The loop is a do/while, so at least one TYPE is always exchanged.
 */
#define swapcode(TYPE, parmi, parmj, n) { 		\
	long i = (n) / sizeof (TYPE); 			\
	TYPE *pi = (TYPE *) (parmi); 		\
	TYPE *pj = (TYPE *) (parmj); 		\
	do { 						\
		TYPE	t = *pi;		\
		*pi++ = *pj;				\
		*pj++ = t;				\
        } while (--i > 0);				\
}

/*
 * Set the caller's local `swaptype' from the array base a and the
 * element size es:
 *   2  a is not aligned to sizeof(long), or es is not a multiple of it
 *   0  es is exactly sizeof(long)
 *   1  any other multiple of sizeof(long)
 * The expansion already ends with a semicolon.
 */
#define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \
	es % sizeof(long) ? 2 : es == sizeof(long)? 0 : 1;

/*
 * Exchange the n bytes at a with the n bytes at b.
 * swaptype <= 1 copies in units of long, anything else byte by byte.
 */
static inline void
swapfunc(char *a, char *b, int n, int swaptype)
{
	if(swaptype <= 1)
		swapcode(long, a, b, n)
	else
		swapcode(char, a, b, n)
}

/*
 * Exchange the elements at a and b.  Uses the caller's local variables
 * `swaptype' and `es': when swaptype is 0 a single long is exchanged
 * inline, otherwise swapfunc() exchanges es bytes.
 * The expansion is a bare if/else statement, so it must not be used as
 * the unbraced body of an `if' that has its own `else'.
 */
#define swap(a, b)					\
	if (swaptype == 0) {				\
		long t = *(long *)(a);			\
		*(long *)(a) = *(long *)(b);		\
		*(long *)(b) = t;			\
	} else						\
		swapfunc(a, b, es, swaptype)

/* Exchange n bytes between a and b; nothing is done unless n > 0. */
#define vecswap(a, b, n) 	if ((n) > 0) swapfunc(a, b, n, swaptype)

/*
 * Call the comparison function `cmp' found in the enclosing scope.
 * The first argument t is only passed on when building qsort_r.
 */
#ifdef I_AM_QSORT_R
#define	CMP(t, x, y) (cmp((t), (x), (y)))
#else
#define	CMP(t, x, y) (cmp((x), (y)))
#endif

/*
 * Return whichever of a, b, c holds the median value according to cmp.
 * thunk is forwarded to cmp only in the qsort_r build.
 */
static inline char *
med3(char *a, char *b, char *c, cmp_t *cmp, void *thunk
#ifndef I_AM_QSORT_R
__unused /* thunk is not referenced by CMP() in the plain qsort build */
#endif
)
{
	if (CMP(thunk, a, b) < 0) {
		/* a < b */
		if (CMP(thunk, b, c) < 0)
			return b;
		return CMP(thunk, a, c) < 0 ? c : a;
	}
	/* a >= b */
	if (CMP(thunk, b, c) > 0)
		return b;
	return CMP(thunk, a, c) < 0 ? a : c;
}

/*
 * Sort the n elements of es bytes each that start at a, ordering them
 * with cmp.  Compiled as qsort_r() when I_AM_QSORT_R is defined (cmp then
 * receives thunk as its first argument), as qsort() otherwise.
 *
 * Outline: small arrays are insertion-sorted; otherwise a pivot is
 * chosen, the array is partitioned around it, the left part is sorted
 * by a recursive call and the right part by jumping back to `loop'.
 */
#ifdef I_AM_QSORT_R
void
qsort_r(void *a, size_t n, size_t es, void *thunk, cmp_t *cmp)
#else
#define thunk NULL
void
qsort(void *a, size_t n, size_t es, cmp_t *cmp)
#endif
{
	char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
	size_t d, r;
	int cmp_result;
	int swaptype, swap_cnt;

	/* Re-entered with a new (a, n) for the right-hand partition. */
loop:	SWAPINIT(a, es);
	swap_cnt = 0;
	if (n < 7) {
		/* Fewer than 7 elements: plain insertion sort. */
		for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es)
			for (pl = pm;
			     pl > (char *)a && CMP(thunk, pl - es, pl) > 0;
			     pl -= es)
				swap(pl, pl - es);
		return;
	}
	/* Pivot candidate: the middle element (used as is when n == 7). */
	pm = (char *)a + (n / 2) * es;
	if (n > 7) {
		pl = a;
		pn = (char *)a + (n - 1) * es;
		if (n > 40) {
			/* Large array: each candidate is itself a median of 3. */
			d = (n / 8) * es;
			pl = med3(pl, pl + d, pl + 2 * d, cmp, thunk);
			pm = med3(pm - d, pm, pm + d, cmp, thunk);
			pn = med3(pn - 2 * d, pn - d, pn, cmp, thunk);
		}
		pm = med3(pl, pm, pn, cmp, thunk);
	}
	/* Move the pivot to the first slot; comparisons below are against a. */
	swap(a, pm);
	pa = pb = (char *)a + es;

	pc = pd = (char *)a + (n - 1) * es;
	for (;;) {
		/*
		 * Scan up from the left while elements are <= pivot;
		 * elements equal to the pivot are parked at the left end
		 * (below pa).
		 */
		while (pb <= pc && (cmp_result = CMP(thunk, pb, a)) <= 0) {
			if (cmp_result == 0) {
				swap_cnt = 1;
				swap(pa, pb);
				pa += es;
			}
			pb += es;
		}
		/*
		 * Scan down from the right while elements are >= pivot;
		 * elements equal to the pivot are parked at the right end
		 * (above pd).
		 */
		while (pb <= pc && (cmp_result = CMP(thunk, pc, a)) >= 0) {
			if (cmp_result == 0) {
				swap_cnt = 1;
				swap(pc, pd);
				pd -= es;
			}
			pc -= es;
		}
		if (pb > pc)
			break;
		/* pb is > pivot and pc is < pivot: exchange them. */
		swap(pb, pc);
		swap_cnt = 1;
		pb += es;
		pc -= es;
	}
	/* No exchange was needed while partitioning. */
	if (swap_cnt == 0) {  /* Switch to insertion sort */
		for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es)
			for (pl = pm;
			     pl > (char *)a && CMP(thunk, pl - es, pl) > 0;
			     pl -= es)
				swap(pl, pl - es);
		return;
	}

	/* Move the pivot-equal runs from both ends into the middle. */
	pn = (char *)a + n * es;
	r = min(pa - (char *)a, pb - pa);
	vecswap(a, pb - r, r);
	r = min(pd - pc, pn - pd - es);
	vecswap(pb, pn - r, r);
	/* Left part (pb - pa bytes, at a): sorted by a recursive call. */
	if ((r = pb - pa) > es)
#ifdef I_AM_QSORT_R
		qsort_r(a, r / es, es, thunk, cmp);
#else
		qsort(a, r / es, es, cmp);
#endif
	/* Right part (pd - pc bytes, ending at pn). */
	if ((r = pd - pc) > es) {
		/* Iterate rather than recurse to save stack space */
		a = pn - r;
		n = r / es;
		goto loop;
	}
/* The jump above stands in for: qsort(pn - r, r / es, es, cmp); */
}


================================================
FILE: ipfw/qsort_r.c
================================================
/*
 * This file is in the public domain.  Originally written by Garrett
 * A. Wollman.
 *
 * $FreeBSD: src/lib/libc/stdlib/qsort_r.c,v 1.1 2002/09/10 02:04:49 wollman Exp $
 */
#define I_AM_QSORT_R
#include "qsort.c"


================================================
FILE: ipfw/rule_test.sh
================================================
#!/bin/bash
#
# Smoke test for the ipfw command line: adds, lists, deletes and moves
# rules, tables and sets, printing a banner before each step so the
# output can be checked by eye.  It modifies the running ruleset.

# Program under test.
COMMAND=ipfw


echo .########## Set $COMMAND mode .##########
$COMMAND add allow ip from any to any
$COMMAND -q flush

echo .########## empty rules .##########
$COMMAND list
$COMMAND add allow ip from any to any
$COMMAND add allow ip from any to { 1.2.3.4 or 2.3.4.5 }
$COMMAND add allow { dst-ip 1.2.3.4 or dst-ip 2.3.4.5 }

echo .########## listing 3 rules .##########
$COMMAND list

$COMMAND delete 200
echo .########## listing 2 rules .##########
$COMMAND list

# --- lookup tables ---
$COMMAND table 10 add 1.2.3.4
$COMMAND table 10 add 1.2.3.5
$COMMAND table 10 add 1.2.3.6
$COMMAND table 10 add 1.2.3.7/13
$COMMAND table 10 add 1.2.3.7/20
$COMMAND table 10 add 1.2.3.7/28

echo .########## listing table 10 with 6 elements .##########
$COMMAND table 10 list
$COMMAND table 10 delete 1.2.3.6

echo .########## listing table 10 with 5 elements .##########
$COMMAND table 10 list
$COMMAND table 10 flush

echo .########## table 10 empty .##########
$COMMAND table 10 list

# --- rule sets ---
echo .########## move rule 100 to set 1 300 to 3 .##########
$COMMAND set move rule 100 to 1
$COMMAND set move rule 300 to 3
$COMMAND -S show

echo .########## move rule 200 to 2 but 200 do not exist .######
$COMMAND set move rule 200 to 2

echo .########## add some rules .##########
$COMMAND add 200 queue 2 proto ip
$COMMAND add 300 queue 5 proto ip
$COMMAND add 400 queue 40 proto ip
$COMMAND add 400 queue 50 proto ip

echo .########## move rule 200 to 2 .######
$COMMAND set move rule 200 to 2

echo .########## move rule 400 to 5 .######
$COMMAND set move rule 400 to 5

echo .########## set 5 show 2 rules .######
$COMMAND set 5 show

echo .########## flush set 5 .######
$COMMAND -q set 5 flush

echo .########## set 5 show 0 rule .######
$COMMAND set 5 show

echo .########## disable set 1 .######
$COMMAND set disable 1

echo .########## show all rules except set 1 .######
$COMMAND -S show

echo .########## enable set 1 .######
$COMMAND set enable 1

echo .########## show all rules .######
$COMMAND -S show


================================================
FILE: ipfw/ws2_32.def
================================================
LIBRARY ws2_32.dll

EXPORTS
FreeAddrInfoW
GetAddrInfoW
GetNameInfoW
WEP
WPUCompleteOverlappedRequest
WSAAccept
WSAAddressToStringA
WSAAddressToStringW
WSAAsyncGetHostByAddr
WSAAsyncGetHostByName
WSAAsyncGetProtoByName
WSAAsyncGetProtoByNumber
WSAAsyncGetServByName
WSAAsyncGetServByPort
WSAAsyncSelect
WSACancelAsyncRequest
WSACancelBlockingCall
WSACleanup
WSACloseEvent
WSAConnect
WSACreateEvent
WSADuplicateSocketA
WSADuplicateSocketW
WSAEnumNameSpaceProvidersA
WSAEnumNameSpaceProvidersW
WSAEnumNetworkEvents
WSAEnumProtocolsA
WSAEnumProtocolsW
WSAEventSelect
WSAGetLastError
WSAGetOverlappedResult
WSAGetQOSByName
WSAGetServiceClassInfoA
WSAGetServiceClassInfoW
WSAGetServiceClassNameByClassIdA
WSAGetServiceClassNameByClassIdW
WSAHtonl
WSAHtons
WSAInstallServiceClassA
WSAInstallServiceClassW
WSAIoctl
WSAIsBlocking
WSAJoinLeaf
WSALookupServiceBeginA
WSALookupServiceBeginW
WSALookupServiceEnd
WSALookupServiceNextA
WSALookupServiceNextW
WSANSPIoctl
WSANtohl
WSANtohs
WSAProviderConfigChange
WSARecv
WSARecvDisconnect
WSARecvFrom
WSARemoveServiceClass
WSAResetEvent
WSASend
WSASendDisconnect
WSASendTo
WSASetBlockingHook
WSASetEvent
WSASetLastError
WSASetServiceA
WSASetServiceW
WSASocketA
WSASocketW
WSAStartup
WSAStringToAddressA
WSAStringToAddressW
WSAUnhookBlockingHook
WSAWaitForMultipleEvents
WSApSetPostRoutine
WSCDeinstallProvider
WSCEnableNSProvider
WSCEnumProtocols
WSCGetProviderPath
WSCInstallNameSpace
WSCInstallProvider
WSCUnInstallNameSpace
WSCUpdateProvider
WSCWriteNameSpaceOrder
WSCWriteProviderOrder
__WSAFDIsSet
accept
bind
closesocket
connect
freeaddrinfo
getaddrinfo
gethostbyaddr
gethostbyname
gethostname
getnameinfo
getpeername
getprotobyname
getprotobynumber
getservbyname
getservbyport
getsockname
getsockopt
htonl
htons
inet_addr
inet_ntoa
ioctlsocket
listen
ntohl
ntohs
recv
recvfrom
select
send
sendto
setsockopt
shutdown
socket


================================================
FILE: kipfw/Makefile
================================================
# $Id: Makefile 12257 2013-04-26 21:13:24Z luigi $
# gnu Makefile to build linux/Windows module for ipfw+dummynet.
#
# The defaults are set to build without modifications on PlanetLab
# and possibly 2.6 versions.
# On Windows, we use gnu-make and MSC

# Some variables need to have specific names, because they are used
# by the build infrastructure on Linux and OpenWrt. They are:
# 
#   ccflags-y	additional $(CC) flags
#   M		used by Kbuild, we must set it to `pwd`
#   obj-m	list of .o modules to build
#   $(MOD)-y	for each $MOD in obj-m, the list of objects
#   obj-y	same as above, for openwrt
#   O_TARGET	the link target, for openwrt
#   EXTRA_CFLAGS as the name says... in openwrt
#   EXTRA_CFLAGS is used in 2.6.22 module kernel compilation too
#   KERNELPATH	the path to the kernel sources or headers
#	(on planetlab it is set already by the build system,
#	for other systems we take KSRC which is either guessed
#	or taken from the command line.
#
# Not sure about this (the name might be reserved)
#   ipfw-cflags		our flags for building the module
#
# Other variables are only private and can be renamed. They include:
#
#   VER		linux version we are building for (2.4 2.6 or openwrt)
#
#---
#
# The windows files (passthru etc.) are modified version of the
# examples found in the $(DDK)/src/network/ndis/passthru/driver/
# They can be re-created using the 'ndis-glue' target in the 

# Root of the source tree.  The default uses PWD as inherited from the
# environment, i.e. the directory make was started from.
# Several variables used below (OSARCH, OBJDIR, MSG, HIDE, KSRC, ...) are
# not assigned in this file; presumably Makefile.inc or the caller
# provides them -- verify against Makefile.inc.
IPFW3_ROOT ?= $(PWD)/..
include $(IPFW3_ROOT)/Makefile.inc

# Name of the main build target in the Windows and Linux 2.6+ sections.
TARGET = kipfw

# lets default for 2.6 for planetlab builds
VER ?= 2.6

# $(warning ########## linux dir is $(LINUX_DIR) ###########)
# $(warning ########## KERNELPATH is $(KERNELPATH) ###########)
#--- General values for all types of build ---
# obj-m is the target module
obj-m := ipfw_mod.o

#-- the list of source files. IPFW_SRCS is our own name.
# Original ipfw and dummynet sources + FreeBSD stuff,
IPFW_SRCS := ip_fw2.c ip_fw_pfil.c ip_fw_sockopt.c
IPFW_SRCS += ip_fw_dynamic.c ip_fw_table.c ip_fw_log.c
IPFW_SRCS += radix.c in_cksum.c
IPFW_SRCS += ip_dummynet.c ip_dn_io.c ip_dn_glue.c
IPFW_SRCS += dn_heap.c
IPFW_SRCS += dn_sched_fifo.c dn_sched_wf2q.c
IPFW_SRCS += dn_sched_rr.c dn_sched_qfq.c
IPFW_SRCS += dn_sched_prio.c
# Module glue and functions missing in linux
IPFW_SRCS += ipfw2_mod.c bsd_compat.c

# generic cflags used on all systems
#ipfw-cflags += -DIPFW_HASHTABLES
ipfw-cflags += -DIPFIREWALL_DEFAULT_TO_ACCEPT
# _BSD_SOURCE enables __FAVOR_BSD (udp/tcp bsd structs instead of posix)
ipfw-cflags += -D_BSD_SOURCE
ipfw-cflags += -DKERNEL_MODULE	# build linux kernel module
# the two header trees for empty and override files
# (include_e is generated by the include_e target further down)
ipfw-cflags += -I $(M)/include_e
ipfw-cflags += -I $(M)/../sys
# glue.h and missing.h are force-included into every compilation unit
ipfw-cflags += -include $(M)/../glue.h	# headers
ipfw-cflags += -include $(M)/missing.h	# headers

# Printed every time this makefile is parsed.
$(warning ------ arch $(OSARCH) goals $(MAKECMDGOALS) -----------)

ifeq ($(OSARCH),Windows)	#--- {  Windows block
  # NOTE(review): win_arch is assigned here but not referenced anywhere in
  # the visible part of this file; PREPROC below always passes -Di386=1.
  ifeq ($(VER),win64)
    $(warning ---- building for 64-bit windows ---)
    win_arch= -DAMD64=1
  else
    win_arch= -Di386=1
  endif
    M ?= $(shell pwd)
    WIN_SRCS += md_win.c
    WIN_SRCS += miniport.c protocol.c passthru.c debug.c
    #compiler, linker, target, sources and objects
    #DDK is exported from the root makefile
    #DDK = C:/WinDDK/7600.16385.1

    CSOURCES = $(IPFW_SRCS) $(WIN_SRCS)

    # one .obj per source file, all placed under $(OBJDIR)
    COBJS := $(CSOURCES:.c=.obj)
    COBJS := $(addprefix $(OBJDIR)/,$(COBJS))

    #include paths
    INCLUDE_PATHS = -Ii386 -I../sys -Iinclude_e -I.
    # INCLUDE_PATHS += -I$(OBJDIR)
    INCLUDE_PATHS += -I$(DDK)/inc/api
    INCLUDE_PATHS += -I$(DDK)/inc/ddk
    INCLUDE_PATHS += -I$(DDK)/inc/crt

    # preprocessor defines expected by the MS headers
    PREPROC  = -D_X86_=1 -Di386=1 -DSTD_CALL -DCONDITION_HANDLING=1
    PREPROC += -DNT_UP=0 -DNT_INST=0 -DWIN32=100 -D_NT1X_=100 -DWINNT=1
    PREPROC += -D_WIN32_WINNT=0x0501 -DWINVER=0x0501 -D_WIN32_IE=0x0603
    PREPROC += -DWIN32_LEAN_AND_MEAN=1 
    PREPROC += -D__BUILDMACHINE__=WinDDK -DFPO=0 -D_DLL=1
    PREPROC += -DNDIS_MINIPORT_DRIVER -DNDIS_WDM=1
    PREPROC += -DNDIS51_MINIPORT=1 -DNDIS51=1
    PREPROC += -DMSC_NOOPT -DNTDDI_VERSION=0x05010200
    PREPROC += -DKMDF_MAJOR_VERSION_STRING=01 -DKMDF_MINOR_VERSION_STRING=009
    #PREPROC += -DDBG=1 #debug
    PREPROC += -DNDEBUG #always up, seems no effect, possibly no debug?
    PREPROC += -DDEVL=1 #always up, seems no effect
    #macroing module name, WARNING: must match the one in .inf files
    PREPROC += -DMODULENAME=Ipfw 

    #our defines
    OUR_PREPROC  = -D_KERNEL -DKERNEL_MODULE -DKLD_MODULE
    OUR_PREPROC += -D__BSD_VISIBLE -DIPFIREWALL_DEFAULT_TO_ACCEPT
    OUR_PREPROC += -D__LITTLE_ENDIAN -DSYSCTL_NODE -DEMULATE_SYSCTL

  ifeq ($(TCC),)	# Microsoft C compiler
    CC = $(DDK)/bin/x86/x86/cl.exe
    LD = $(DDK)/bin/x86/x86/link.exe
    # compiler options
    CFLAGS  = -Fo$(OBJDIR)/  -c -FC -Zc:wchar_t-
    CFLAGS += -Zl -Zp8 -Gy -Gm- -GF -cbstring -Gz -hotpatch -EHs-c-
    CFLAGS += -W2 # -W3 gives too many conversion errors
    CFLAGS += -GR- -GF -GS -Zi	# XXX do we need this ?
    CFLAGS += -Fd$(OBJDIR)/
    CFLAGS += -wd4603 -wd4627 -typedil-
    # headers force-included into every compilation unit
    CFLAGS += -FI $(DDK)/inc/api/warning.h
    CFLAGS += -FI winmissing.h
    CFLAGS += -FI missing.h	# headers
    CFLAGS += -FI ../glue.h	# headers

    #optimization options
    OPTIMIZE = -Od -Oi -Oy-

    #linker options
    LDFLAGS  = /MERGE:_PAGE=PAGE /MERGE:_TEXT=.text
    LDFLAGS += /SECTION:INIT,d /OPT:REF /OPT:ICF
    LDFLAGS += /IGNORE:4198,4010,4037,4039,4065,4070,4078,4087,4089,4221
    LDFLAGS += /INCREMENTAL:NO /release /NODEFAULTLIB /WX
    LDFLAGS += /debug /debugtype:cv,fixup,pdata
    LDFLAGS += /version:6.1 /osversion:6.1 /functionpadmin:5
    LDFLAGS += /safeseh /pdbcompress
    LDFLAGS += /STACK:0x40000,0x1000 /driver /base:0x10000 /align:0x80
    LDFLAGS += /stub:$(DDK)\\lib\\wxp\\stub512.com
    LDFLAGS += /subsystem:native,5.01 /entry:GsDriverEntry@8
    LDFLAGS += /out:$(OBJDIR)/ipfw.sys

    #libraries to build against
    LIBS  = $(DDK)/lib/wxp/i386/BufferOverflowK.lib
    LIBS += $(DDK)/lib/wxp/i386/ntoskrnl.lib
    LIBS += $(DDK)/lib/wxp/i386/hal.lib
    LIBS += $(DDK)/lib/wxp/i386/wmilib.lib
    LIBS += $(DDK)/lib/wxp/i386/ndis.lib
    LIBS += $(DDK)/lib/wxp/i386/sehupd.lib
  else	# use tcc. not working yet for the kernel module.
    # TCC points to the root of tcc tree
    CC=$(TCC)/bin/wintcc
    EXTRA_CFLAGS += -DTCC -I..
    EXTRA_CFLAGS += -I$(TCC)/include/winapi -I$(TCC)/include
    EXTRA_CFLAGS += -nostdinc

    CFLAGS += -include winmissing.h -include missing.h -include ../glue.h
    CFLAGS += -I../../inc/api -I../../inc/ddk -I../../inc/crt
    CFLAGS += -DRC_INVOKED
  endif # use tcc

    #empty include directory to be built
    # (M was already given the same default at the top of this block)
    M ?= $(shell pwd)
    # extra empty headers for the Windows build; the common EFILES_*
    # lists are near the end of this file
    EFILES_asm += div64.h
    EFILES_linux += if.h random.h errno.h
    EFILES_net += if_types.h inet_hashtables.h route.h

    #targets
all: $(TARGET)

# Compile all sources with a single compiler invocation, then link the
# resulting objects.
$(TARGET): include_e
	# XXX dangerous rm -rf $(OBJDIR)
	mkdir -p $(OBJDIR)
	$(MSG) "  CC [$(CC)] $(CSOURCES)"
	$(HIDE) $(CC) $(INCLUDE_PATHS) $(PREPROC) $(OUR_PREPROC) $(CFLAGS) $(OPTIMIZE) $(CSOURCES)
	$(MSG) "  LD [$(LD)] $(COBJS)"
	$(HIDE) $(LD) $(LDFLAGS) $(COBJS) $(LIBS)

else # } { linux variables and targets

  # extract version number (hex, aXXYY). Newer linuxes have a different dir
  # if not set, use the version from the installed system
  # The shell snippet reads LINUX_VERSION_CODE from
  # include/linux/version.h, falling back to
  # include/generated/uapi/linux/version.h when the former is missing,
  # and prints it as hex digits: code/256 (at least 3 digits) followed by
  # code%256 (at least 2 digits), e.g. 2.6.24 gives 20618.
  # LIN_VER is empty when no version.h is found.
  KERNELPATH ?= $(KSRC)
  LIN_VER := $(shell V=linux/version.h; G=. ; \
        [ -f $(KERNELPATH)/include/$${V} ] || G=generated/uapi ;\
        grep LINUX_VERSION_CODE $(KERNELPATH)/include/$${G}/linux/version.h | \
        awk '{printf "%03x%02x", $$3/256, $$3%256} ')

  $(warning ------------- linux version $(LIN_VER) (hex) ------------)
# We have three sections: OpenWrt, Linux 2.4 and Linux 2.6

ifeq ($(LIN_VER),openwrt)	#--- { The Makefile section for openwrt ---
  # this was used on openwrt, but not anymore
  # The $(error) below aborts as soon as this branch is taken, so the
  # rest of the section is never evaluated.
  $(error ------ build on openwrt ---------- )
  # We do not include a dependency on include_e as it is called
  # by Makefile.openwrt in Build/Prepare
  M=.
  obj-y := $(IPFW_SRCS:%.c=%.o)
  O_TARGET := $(obj-m)

  # xcflags-y is a temporary variable where we store build options
  xcflags-y += -O1 -DLINUX_24
  xcflags-y += -g

  EXTRA_CFLAGS := $(xcflags-y) $(ipfw-cflags) -DSYSCTL_NODE -DEMULATE_SYSCTL

  # we should not export anything
  #export-objs := ipfw2_mod.o
-include $(TOPDIR)/Rules.make
endif # ---- } end openwrt version


# LIN_VER is the kernel version as hex digits (e.g. 20423 for 2.4.35), so
# a 2.4 kernel is exactly one whose LIN_VER starts with "204".  Match that
# prefix explicitly: an unanchored regex search for '2.4' also hits
# unrelated versions such as 3.2.4 (30204).
ifneq ($(filter 204%,$(LIN_VER)),)	#--- {
  # Makefile section for the linux 2.4 version
  # tested on linux-2.4.35.4, does not work with 2.4.37
  #
  # guess the kernel path -- or is it under /lib/modules ?
  KERNELPATH ?= $(KSRC)

  # We need to figure out the gcc include directory, if not
  # set by the user through MYGCC_INCLUDE
  # Find compiler version (3rd field in last line returned by gcc -v)
  # e.g.	gcc version 4.3.2 (Debian 4.3.2-1.1)
  MYGCC_VER ?= $(shell $(CC) -v 2>&1 |tail -n 1 | cut -d " " -f 3)
  # We don't know the exact directory under /usr/lib/gcc so we guess
  MYGCC_INCLUDE ?= $(shell echo /usr/lib/gcc/*/$(MYGCC_VER) | cut -d " " -f 1)/include
  $(warning "---- gcc includes guessed to $(MYGCC_INCLUDE)")

  # additional warning
  WARN += -Wall -Wundef
  WARN += -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing
  WARN += -fno-common -Werror-implicit-function-declaration
  # WARN += -O2  -fno-stack-protector -m32 -msoft-float -mregparm=3
  # -mregparm=3 gives a printk error
  WARN += -m32 -msoft-float # -mregparm=3
  #WARN += -freg-struct-return -mpreferred-stack-boundary=2
  WARN += -Wno-sign-compare
  WARN += -Wdeclaration-after-statement
  ifneq ($(MYGCC_VER),3.4.6)
        WARN += -Wno-pointer-sign
  endif

  ccflags-y += -O1 -DLINUX_24
  # Recursive (=) on purpose: ccflags-y is still extended further down.
  CFLAGS = -DMODULE -D__KERNEL__ -nostdinc \
	-isystem ${KERNELPATH}/include -isystem $(MYGCC_INCLUDE) \
	${ccflags-y}
  # The Main target
all: mod24

else # --- } {  linux 2.6 and newer
  $(warning --- build 2.6 and newer target $(TARGET) ----)

  # This is the Makefile section for Linux 2.6.x including planetlab

  ifeq ($(IPFW_PLANETLAB),1)
    $(warning "---- Building for PlanetLab")
    ipfw-cflags += -DIPFW_PLANETLAB        # PlanetLab compilation
  endif

  WARN := -O1 -Wall -Werror -DDEBUG_SPINLOCK -DDEBUG_MUTEXES
  # The main target

  # Required by GCC 4.6
  ccflags-y += -Wno-unused-but-set-variable


  # An empty LIN_VER means no version.h was found under KERNELPATH.
  ifeq ($(shell if [ -z $(LIN_VER) ] ; then echo "true"; fi),true)
    $(warning "---- Perhaps you miss a (cd $(KERNELPATH); make oldconfig; make prepare; make scripts)");
  endif

  # Required by kernel < 2.6.23, ccflags-y is used on newer version
  ifeq ($(shell [ "$(LIN_VER)" \< "20617" ] && echo "true"),true)
    EXTRA_CFLAGS += $(ccflags-y)
  endif

  $(warning $(shell [ "$(LIN_VER)" \< "2061c" ] && \
	[ `$(MAKE) -version | head -1 | cut -d " " -f 3` != '3.81' ] && \
	echo "****   need make 3.81 *****") )
  # $(warning make is $(MAKE) version is $(shell $(MAKE) -version | head -1) )

  #--- openwrt ?
  ifeq ($(_VER),xx-openwrt)
    $(warning ----------------------- compiling for openwrt -----)
    M=.
    obj-y := $(IPFW_SRCS:%.c=%.o)
    O_TARGET := $(obj-m)

    # xcflags-y is a temporary variable where we store build options
    xcflags-y += -O1
    xcflags-y += -g

    EXTRA_CFLAGS := $(xcflags-y) $(ipfw-cflags) -DSYSCTL_NODE -DEMULATE_SYSCTL
  endif #---- end openwrt

# Build the module by re-invoking make in the kernel tree with M set to
# this directory.
all: $(TARGET)
$(TARGET):	include_e
	echo "xxxxxxxxxxxxx $(MAKE) -C $(KERNELPATH) V=$(V) M=`pwd` modules"
	$(MAKE) -C $(KERNELPATH) V=$(V) M=`pwd` modules


endif # } --- linux 2.6 and newer

#-- back to the common section for linux

# the list of objects used to build the module
ipfw_mod-y = $(IPFW_SRCS:%.c=%.o)

# additional $(CC) flags
ccflags-y += $(WARN)
ccflags-y += $(ipfw-cflags)
# if we really want debug symbols...
ccflags-y += -g

# mod24 is what `all' depends on in the linux 2.4 section.
mod24: include_e $(obj-m)

# Combine all objects into the module file with a relocatable (-r) link
# in elf_i386 format.
$(obj-m): $(ipfw_mod-y)
	$(LD) $(LDFLAGS) -m elf_i386 -r -o $@ $^

# M is the current directory, used in recursive builds
# so we allow it to be overridden
M ?= $(shell pwd)

endif # } ----- end of the non-Windows block

# Abort while parsing unless OBJDIR is exactly "mia".
# NOTE(review): this looks like a leftover debugging check; OBJDIR is not
# assigned in this file, so confirm what Makefile.inc sets before relying
# on it or removing it.
ifneq ($(OBJDIR),mia)
    $(error objdir set to $(OBJDIR))
endif

#--- various common targets
# clean and distclean are commands, not files: declare them phony so a
# stray file with one of these names cannot make them look up to date.
.PHONY: clean distclean

# Remove build products and the generated include_e tree.  Every line is
# prefixed with '-' so a failing rm does not stop the cleanup.
clean:
	-@rm -f *.o *.ko Module.symvers *.mod.c
	-@# rm -rf $(OBJDIR)
	-@rm -rf include_e

# clean, plus further generated files and directories.
distclean: clean
	-@rm -f .*cmd modules.order opt_*
	-@rm -rf .tmp_versions .*.o.d _CL_*

# support to create empty dirs and files in include_e/
# EFILES_foo/bar is the list of files to be created in foo/bar
# (/ and . are allowed in gmake variable names)

EFILES_. += opt_inet.h opt_inet6.h opt_ipfw.h opt_ipsec.h opt_mpath.h
EFILES_. += opt_mbuf_stress_test.h opt_param.h opt_ipdivert.h

EFILES_altq += if_altq.h
EFILES_arpa += inet.h
EFILES_machine += in_cksum.h
EFILES_net += ethernet.h netisr.h pf_mtag.h bpf.h if_types.h vnet.h

EFILES_netinet += ether.h icmp6.h if_ether.h in.h in_pcb.h in_var.h
EFILES_netinet += in_systm.h ip_carp.h ip_var.h pim.h
EFILES_netinet += sctp.h tcp_timer.h tcpip.h udp_var.h
EFILES_netinet6 += ip6_var.h

EFILES_sys += _lock.h _rwlock.h rmlock.h _mutex.h jail.h
EFILES_sys += condvar.h eventhandler.h domain.h
EFILES_sys += limits.h lock.h mutex.h priv.h
EFILES_sys += proc.h rwlock.h socket.h socketvar.h
EFILES_sys += sysctl.h time.h ucred.h

# first make a list of directories from variable names
EDIRS= $(subst EFILES_,,$(filter EFILES_%,$(.VARIABLES)))
# then prepend the directory name to individual files.
#       $(empty) serves to interpret the following space literally,
#       and the ":  = " substitution packs spaces into one.
EFILES = $(foreach i,$(EDIRS),$(subst $(empty) , $(i)/, $(EFILES_$(i):  = )))

# Recreate the tree of empty headers from scratch.
# The commands in the last line are chained with && so that nothing is
# created in the current directory if the cd fails.
include_e:
	-@rm -rf $(M)/include_e opt_*
	-@mkdir -p $(M)/include_e
	-@(cd $(M)/include_e && mkdir -p $(EDIRS) && touch $(EFILES))

#--- some other targets for testing purposes
# No recipes are given here, so these depend on make's built-in rules;
# the last line sets CFLAGS for these two targets only.
test_radix: test_radix.o radix.o
test_lookup: ip_fw_lookup.o
test_radix test_lookup: CFLAGS=-Wall -Werror -O1


================================================
FILE: kipfw/bsd_compat.c
================================================
/*
 * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id: bsd_compat.c 11530 2012-08-01 10:29:32Z luigi $
 *
 * kernel variables and functions that are not available in linux.
 */

#include <sys/cdefs.h>
#include <asm/div64.h>	/* do_div on 2.4 */
#include <linux/random.h>	/* get_random_bytes on 2.4 */
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>
#include <sys/malloc.h>

/*
 * gettimeofday would be in sys/time.h but it is not
 * visible if _KERNEL is defined
 */
int gettimeofday(struct timeval *, struct timezone *);

int ticks;		/* kernel ticks counter */
int hz = 1000;		/* default clock time */
long tick = 1000;	/* XXX is this 100000/hz ? */
int bootverbose = 0;
struct timeval boottime;

int     ip_defttl = 64;	/* XXX set default value */
int	max_linkhdr = 16;
int fw_one_pass = 1;
u_long  in_ifaddrhmask;                         /* mask for hash table */
struct  in_ifaddrhashhead *in_ifaddrhashtbl;    /* inet addr hash table  */

u_int rt_numfibs = RT_NUMFIBS;

/*
 * pfil hook support.
 * We make pfil_head_get return a non-null pointer, which is then ignored
 * in our 'add-hook' routines.
 */
struct pfil_head;
typedef int (pfil_hook_t)
	(void *, struct mbuf **, struct ifnet *, int, struct inpcb *);

/*
 * Return a placeholder, non-NULL head: the address of a static dummy
 * variable.  Both arguments are ignored.
 */
struct pfil_head *
pfil_head_get(int proto, u_long flags)
{
	static int dummy;
	return (struct pfil_head *)&dummy;
}
 
/* Stub: registers nothing and always returns 0. */
int
pfil_add_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h)
{
	return 0;
}

/* Stub: removes nothing and always returns 0. */
int
pfil_remove_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h)
{
	return 0;
}

/*
 * Empty bodies for kernel functions that have no counterpart here.
 * Every function below ignores its arguments.
 */

/* Stub: always returns 0. */
int
priv_check(struct thread *td, int priv)
{
	return 0;
}

/* Stub: always returns 0. */
int
securelevel_ge(struct ucred *cr, int level)
{
	return 0;
}

/* Stub: does nothing and returns 0. */
int
sysctl_handle_int(SYSCTL_HANDLER_ARGS)
{
	return 0;
}

/* Stub: does nothing and returns 0. */
int
sysctl_handle_long(SYSCTL_HANDLER_ARGS)
{
	return 0;
}

/* Stub: the mbuf is left untouched. */
void
ether_demux(struct ifnet *ifp, struct mbuf *m)
{
	return;
}

/* Stub: nothing is sent; always returns 0. */
int
ether_output_frame(struct ifnet *ifp, struct mbuf *m)
{
	return 0;
}

/* Stub: the route structure is left untouched. */
void
in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum)
{
	return;
}

/* Stub: no ICMP error is generated. */
void
icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu)
{
	return;
}

/* Stub: no checksum is computed; always returns 0. */
u_short
in_cksum_skip(struct mbuf *m, int len, int skip)
{
	return 0;
}

/* Stub: no checksum is computed; always returns 0. */
u_short
in_cksum_hdr(struct ip *ip)
{
	return 0;
}

/*
 * we don't really reassemble, just return whatever we had.
 */
struct mbuf *
ip_reass(struct mbuf *clone)
{
	return clone;
}
#ifdef INP_LOCK_ASSERT
#undef INP_LOCK_ASSERT
#define INP_LOCK_ASSERT(a)
#endif

/* credentials check */
#include <netinet/ip_fw.h>
#ifdef __linux__
/*
 * Match the credentials cached in *u against the uid/gid/jail value
 * stored in the instruction _insn.
 *
 * When *ugid_lookupp is 0 the cache is filled first through
 * linux_lookup(), and its result is stored back in *ugid_lookupp.
 * A negative cached result means "no credentials", so nothing matches.
 * Returns 1 on a match and 0 otherwise (including unknown opcodes).
 */
int
cred_check(void *_insn,  int proto, struct ifnet *oif,
    struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
    u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp,
    struct sk_buff *skb)
{
	ipfw_insn_u32 *cmd = (ipfw_insn_u32 *)_insn;

	/* actively lookup and copy in cache, only if not done already */
	if (*ugid_lookupp == 0) {
		/* returns null if any element of the chain up to file is null.
		 * if sk != NULL then we also have a reference
		 */
		*ugid_lookupp = linux_lookup(proto,
			src_ip.s_addr, htons(src_port),
			dst_ip.s_addr, htons(dst_port),
			skb, oif ? 1 : 0, u);
	}
	if (*ugid_lookupp < 0)
		return 0;

	switch (cmd->o.opcode) {
	case O_UID:
		return (u->uid == (uid_t)cmd->d[0]);
	case O_JAIL:
		return (u->xid == (uid_t)cmd->d[0]);
	case O_GID:
		return (u->gid == (uid_t)cmd->d[0]);
	default:
		return 0;
	}
}
#endif	/* __linux__ */

/* Stub: the credential is not examined; always returns 0. */
int
jailed(struct ucred *cred)
{
	(void)cred;
	return 0;
}

/*
 * In the original kernel this reports whether an internet address
 * belongs to a ``local'' host (one to which we have a connection).
 * This replacement does not look at the address at all and reports
 * every address as local by returning 1.
 */
int
in_localaddr(struct in_addr in)
{
	(void)in;
	return 1;
}

/*
 * Copy up to len bytes from buf into the buffer described by sopt.
 * If len is smaller than sopt->sopt_valsize, sopt_valsize is shrunk
 * to len so the caller can see how much was written. Always returns 0.
 */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	size_t nbytes = sopt->sopt_valsize;

	if (nbytes > len) {
		nbytes = len;
		sopt->sopt_valsize = nbytes;
	}
	bcopy(buf, sopt->sopt_val, nbytes);
	return 0;
}

/*
 * Copy the option value described by sopt into buf.
 * Fails with EINVAL when fewer than minlen bytes were supplied.
 * At most len bytes are copied; when the supplied value is larger,
 * sopt->sopt_valsize is clamped to len. Returns 0 on success.
 */
int
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	size_t nbytes = sopt->sopt_valsize;

	if (nbytes < minlen)
		return EINVAL;
	if (nbytes > len) {
		nbytes = len;
		sopt->sopt_valsize = nbytes;
	}
	bcopy(sopt->sopt_val, buf, nbytes);
	return 0;
}

/*
 * Fill *tv by calling do_gettimeofday().
 * NOTE(review): the name suggests time since boot, while
 * do_gettimeofday() presumably returns wall-clock time -- confirm
 * that callers only use differences between readings.
 */
void
getmicrouptime(struct timeval *tv)
{
	do_gettimeofday(tv);
}


#include <arpa/inet.h>

/*
 * Write the dotted-quad form of ina into buf and return buf.
 * The four bytes are printed in memory order. The caller supplies
 * the buffer; no length check is done here. On _WIN32 builds
 * nothing is written and buf is returned untouched.
 */
char *
inet_ntoa_r(struct in_addr ina, char *buf)
{
#ifndef _WIN32
	const unsigned char *octet = (const unsigned char *)&ina;
	int b0 = octet[0] & 0xff;
	int b1 = octet[1] & 0xff;
	int b2 = octet[2] & 0xff;
	int b3 = octet[3] & 0xff;

	sprintf(buf, "%d.%d.%d.%d", b0, b1, b2, b3);
#endif
	return buf;
}

/*
 * Format ina through inet_ntoa_r() into a single static buffer and
 * return it; each call overwrites the result of the previous one.
 */
char *
inet_ntoa(struct in_addr ina)
{
	static char text[16];
	char *result = inet_ntoa_r(ina, text);

	return result;
}

/*
 * Return a non-negative random number (the top bit is masked off).
 * Windows: RtlRandomEx() on a static seed that is set from the
 * system time while it is still zero.
 * Elsewhere: fresh bytes from get_random_bytes() on every call.
 */
int
random(void)
{
#ifdef _WIN32
	static unsigned long state;

	if (state == 0) {
		LARGE_INTEGER now;

		KeQuerySystemTime(&now);
		state = now.LowPart;
	}
	return RtlRandomEx(&state) & 0x7fffffff;
#else
	int bits;

	get_random_bytes(&bits, sizeof(bits));
	return bits & 0x7fffffff;
#endif
}


/*
 * Signed 64-bit division helper.
 * do_div really does a u64 / u32 bit division, so we strip the signs,
 * divide the magnitudes and put the sign back afterwards.
 * We are safe just because we always call it with small operands.
 * On Windows both operands are narrowed to int before dividing.
 */
int64_t
div64(int64_t a, int64_t b)
{
#ifdef _WIN32
	int num = a, den = b;

	return num / den;
#else
	/* negative result iff exactly one operand is strictly positive */
	int negate = ((a > 0) != (b > 0));
	uint64_t quot = (a > 0) ? a : -a;
	uint64_t den = (b > 0) ? b : -b;

	do_div(quot, den);	/* quot now holds the quotient */
	if (negate)
		quot = 0 - quot;
	return quot;
#endif
}

#ifdef __MIPSEL__
/*
 * Copy src into dst, writing at most siz bytes including the
 * terminating NUL. dst is NUL-terminated whenever siz != 0.
 * Returns the length of src, not counting its NUL.
 * Only compiled when __MIPSEL__ is defined.
 */
size_t
strlcpy(char *dst, const char *src, size_t siz)
{
        char *d = dst;
        const char *s = src;
        size_t n = siz;
 
        /* Copy as many bytes as will fit */
        if (n != 0 && --n != 0) {
                do {
                        /* stop right after the NUL has been copied */
                        if ((*d++ = *s++) == 0)
                                break;
                } while (--n != 0);
        }

        /* Not enough room in dst, add NUL and traverse rest of src */
        if (n == 0) {
                if (siz != 0)
                        *d = '\0';              /* NUL-terminate dst */
                /* advance s past the NUL so the length below is correct */
                while (*s++)
                        ;
        }

        return(s - src - 1);    /* count does not include NUL */
}
#endif // __MIPSEL__

/*
 * Compact version of fnmatch. Returns 0 on a match, 1 otherwise.
 *
 * The two strings are walked in parallel:
 *  - '*' in the pattern accepts the rest of the string immediately;
 *  - '.' in the pattern accepts any single character;
 *  - any other character must be equal in both.
 * A NULL pattern or string never matches. flags is ignored.
 */
int
fnmatch(const char *pattern, const char *string, int flags)
{
	const char *pp, *sp;

	(void)flags;
	if (pattern == NULL || string == NULL)
		return 1;	/* no match */

	for (pp = pattern, sp = string; *sp != '\0'; pp++, sp++) {
		if (*pp == '\0')	/* pattern is over, no match */
			return 1;
		if (*pp == '*')		/* wildcard, match */
			return 0;
		if (*pp != '.' && *pp != *sp)
			return 1;	/* no match */
	}
	/* end of string, make sure the pattern is over too */
	return (*pp == '\0' || *pp == '*') ? 0 : 1;
}


/*
 * Newer linux kernels provide skb_dst_set()/skb_dst() to access
 * skbuff internal structures. Define the missing functions for the
 * previous versions too, where the dst entry is the plain skb->dst field.
 * NOTE(review): this comment used to say 2.6.33 while the guard
 * below tests for 2.6.31 -- confirm the exact boundary.
 */
#ifdef linux
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31)
/* Store dst in the skb. */
inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
{
        skb->dst = dst;
}

/* Fetch the dst entry stored in the skb. */
inline struct dst_entry *skb_dst(const struct sk_buff *skb)
{
        return (struct dst_entry *)skb->dst;
}
#endif /* < 2.6.31 */
#endif /* linux */


/* support for sysctl emulation.
 * XXX this is actually MI code that should be enabled also on openwrt
 */
#ifdef EMULATE_SYSCTL
static struct sysctltable GST;

/*
 * Dump the whole Global Sysctl Table into the buffer at sopt->sopt_val.
 *
 * If the buffer is too small the call is treated as a size probe:
 * the required size is stored in the id field of the leading dn_id
 * and sopt_valsize is set to sizeof(struct dn_id).
 * Otherwise, after the leading dn_id, each entry is written as
 *     struct sysctlhead | data | name
 * and the next entry starts blocklen bytes after the current one.
 * Always returns 0.
 */
int
kesysctl_emu_get(struct sockopt* sopt)
{
	struct dn_id *reply = sopt->sopt_val;
	int needed = sizeof(struct dn_id) + GST.totalsize +
		sizeof(struct sysctlhead);
	unsigned char *cursor;
	int idx;

	if (sopt->sopt_valsize < needed) {
		/* probe: just tell the caller how much room a dump takes */
		reply->id = needed;
		sopt->sopt_valsize = sizeof(struct dn_id);
		return 0;
	}

	cursor = (unsigned char *)(reply + 1);
	for (idx = 0; idx < GST.count; idx++) {
		struct sysctlhead *hdr = (struct sysctlhead *)cursor;
		unsigned char *value = (unsigned char *)(hdr + 1);
		unsigned char *label = value + GST.entry[idx].head.datalen;

		hdr->blocklen = GST.entry[idx].head.blocklen;
		hdr->namelen = GST.entry[idx].head.namelen;
		hdr->flags = GST.entry[idx].head.flags;
		hdr->datalen = GST.entry[idx].head.datalen;
		bcopy(GST.entry[idx].data, value, GST.entry[idx].head.datalen);
		bcopy(GST.entry[idx].name, label, GST.entry[idx].head.namelen);
		cursor += GST.entry[idx].head.blocklen;
	}
	sopt->sopt_valsize = needed;
	return 0;
}

int
kesysctl_emu_set(void* p, int l)
{
	struct sysctlhead* entry;
	unsigned char* pdata;
	unsigned char* pstring;
	int i = 0;
	
	entry = (struct sysctlhead*)(((struct dn_id*)p)+1);
	pdata = (unsigned char*)(entry+1);
	pstring = pdata + entry->datalen;
	
	for (i=0; i<GST.count; i++) {
		if (strcmp(GST.entry[i].name, pstring) != 0)
			continue;
		printf("%s: match found! %s\n",__FUNCTION__,pstring);
		//sanity check on len, not really useful now since
		//we only accept int32
		if (entry->datalen != GST.entry[i].head.datalen) {
			printf("%s: len mismatch, user %d vs kernel %d\n",
				__FUNCTION__, entry->datalen,
				GST.entry[i].head.datalen);
			return -1;
		}
		// check access (at the moment flags handles only the R/W rights
		//later on will be type + access
		if( (GST.entry[i].head.flags & 3) == CTLFLAG_RD) {
			printf("%s: the entry %s is read only\n",
				__FUNCTION__,GST.entry[i].name);
			return -1;
		}
		bcopy(pdata, GST.entry[i].data, GST.entry[i].head.datalen);
		return 0;
	}
	printf("%s: match not found\n",__FUNCTION__);
	return 0;
}

/*
 * Rewrite s in place: every '_' before the first '.' that was already
 * in the string becomes a '.'. Nothing after that '.' is modified.
 */
static void
underscoretopoint(char* s)
{
	while (*s != '\0' && *s != '.') {
		if (*s == '_')
			*s = '.';
		s++;
	}
}

/*
 * Give every table entry a private copy of its name with the leading
 * underscores turned into dots (see underscoretopoint()).
 * All copies live back to back in one buffer, GST.namebuffer, sized
 * as the sum of the namelen fields.
 * Returns 0 on success, -1 if the buffer cannot be allocated; in that
 * case the entries keep their original name pointers.
 */
static int
formatnames()
{
	int idx = 0;
	int total = 0;
	char *slot;

	while (idx < GST.count) {
		total += GST.entry[idx].head.namelen;
		idx++;
	}

	GST.namebuffer = malloc(total, 0, 0);
	if (GST.namebuffer == NULL)
		return -1;

	slot = GST.namebuffer;
	for (idx = 0; idx < GST.count; idx++) {
		bcopy(GST.entry[idx].name, slot, GST.entry[idx].head.namelen);
		underscoretopoint(slot);
		GST.entry[idx].name = slot;
		slot += GST.entry[idx].head.namelen;
	}
	return 0;
}

/*
 * Debugging aid: print every entry of the Global Sysctl Table.
 * The low two bits of flags are shown as "access", the rest as "type".
 * The data is printed as an int.
 */
static void
dumpGST()
{
	int idx = 0;

	while (idx < GST.count) {
		int type = GST.entry[idx].head.flags >> 2;
		int access = GST.entry[idx].head.flags & 0x00000003;

		printf("SYSCTL: entry %i\n", idx);
		printf("name %s\n", GST.entry[idx].name);
		printf("namelen %i\n", GST.entry[idx].head.namelen);
		printf("type %i access %i\n", type, access);
		printf("data %i\n", *(int*)(GST.entry[idx].data));
		printf("datalen %i\n", GST.entry[idx].head.datalen);
		printf("blocklen %i\n", GST.entry[idx].head.blocklen);
		idx++;
	}
}

/* Table population routines, defined elsewhere. */
void sysctl_addgroup_f1();
void sysctl_addgroup_f2();
void sysctl_addgroup_f3();
void sysctl_addgroup_f4();

/*
 * Build the Global Sysctl Table: run the four sysctl_addgroup_f*
 * routines, then convert the names with formatnames().
 * A failed name conversion is only reported, not propagated.
 */
void
keinit_GST()
{
	sysctl_addgroup_f1();
	sysctl_addgroup_f2();
	sysctl_addgroup_f3();
	sysctl_addgroup_f4();

	if (formatnames() != 0)
		printf("conversion of names failed for some reason\n");
	//dumpGST();
	printf("*** Global Sysctl Table entries = %i, total size = %i ***\n",
		GST.count, GST.totalsize);
}

/*
 * Tear down the Global Sysctl Table: release the name buffer, if
 * one was allocated, and zero the whole table.
 */
void
keexit_GST()
{
	if (GST.namebuffer != NULL) {
		free(GST.namebuffer,0);
	}

	bzero(&GST, sizeof(GST));
}

/*
 * Append one variable to the Global Sysctl Table.
 *
 * name and data are stored by reference, not copied. blocklen is the
 * size of header + name (with its NUL) + data, rounded up to a
 * multiple of 4, and is added to GST.totalsize.
 * When the table already holds GST_HARD_LIMIT entries a warning is
 * printed and the variable is not added.
 */
void
sysctl_pushback(char* name, int flags, int datalen, void* data)
{
	int slot;

	if (GST.count >= GST_HARD_LIMIT) {
		printf("WARNING: global sysctl table full, this entry will not be added,"
				"please recompile the module increasing the table size\n");
		return;
	}

	slot = GST.count;
	GST.entry[slot].name = name;
	GST.entry[slot].data = data;
	GST.entry[slot].head.flags = flags;
	GST.entry[slot].head.datalen = datalen;
	GST.entry[slot].head.namelen = strlen(name)+1; //add space for '\0'
	/* round the block size up to the next multiple of 4 */
	GST.entry[slot].head.blocklen =
		((sizeof(struct sysctlhead) + GST.entry[slot].head.namelen +
			GST.entry[slot].head.datalen)+3) & ~3;
	GST.totalsize += GST.entry[slot].head.blocklen;
	GST.count++;
}
#endif /* EMULATE_SYSCTL */


================================================
FILE: kipfw/debug.c
================================================
#include <ntddk.h>

/*
 * Map a sockopt command number to its symbolic name, for debugging.
 * Unknown numbers yield "BOH".
 */
const char* texify_cmd(int i)
{
	switch (i) {
	case 108:	return("IP_FW3");
	case 109:	return("IP_DUMMYNET3");
	case 110:	return("IP_FW_ADD");
	case 111:	return("IP_FW_DEL");
	case 112:	return("IP_FW_FLUSH");
	case 113:	return("IP_FW_ZERO");
	case 114:	return("IP_FW_GET");
	case 115:	return("IP_FW_RESETLOG");
	case 116:	return("IP_FW_NAT_CFG");
	case 117:	return("IP_FW_NAT_DEL");
	case 118:	return("IP_FW_NAT_GET_CONFIG");
	case 119:	return("IP_FW_NAT_GET_LOG");
	case 120:	return("IP_DUMMYNET_CONFIGURE");
	case 121:	return("IP_DUMMYNET_DEL");
	case 122:	return("IP_DUMMYNET_FLUSH");
	case 124:	return("IP_DUMMYNET_GET");
	default:	return ("BOH");
	}
}

/*
 * Name an IP protocol number for debugging output.
 * Only 1, 6 and 17 are recognised; anything else is "OTHER".
 */
const char* texify_proto(unsigned int p)
{
	switch (p) {
	case 1:
		return("ICMP");
	case 6:
		return("TCP");
	case 17:
		return("UDP");
	default:
		return("OTHER");
	}
}

/*
 * Print len bytes starting at addr through DbgPrint, eight per row,
 * each row prefixed by its offset. msg labels the dump.
 *
 * A final partial row is padded with 00 up to eight columns.
 * Nothing is read when addr is NULL or len is not positive: the old
 * code handed a negative byte count to bcopy for len in -7..-1, and
 * printed an extra all-zero row whenever len was a multiple of 8.
 */
void hexdump(unsigned char* addr, int len, const char *msg)
{
	int off;
	unsigned char d[8];

	DbgPrint("%s at %p len %d\n", msg, addr, len);
	if (addr != NULL && len > 0) {
		for (off = 0; off < len; off += 8) {
			/* bytes available for this row, at most 8 */
			int chunk = (len - off < 8) ? (len - off) : 8;

			bzero(d, 8);
			bcopy(addr + off, d, chunk);
			DbgPrint("%04X %02X %02X %02X %02X %02X %02X %02X %02X\n",
				off, d[0], d[1], d[2], d[3], d[4],
				d[5], d[6], d[7]);
		}
	}
	DbgPrint("\n");
}


================================================
FILE: kipfw/ipfw2_mod.c
================================================
/*
 * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id: ipfw2_mod.c 12501 2014-01-10 01:09:14Z luigi $
 *
 * The main interface to build ipfw+dummynet as a linux module.
 * (and possibly as a windows module as well, though that part
 * is not complete yet).
 *
 * The control interface uses the sockopt mechanism
 * on a socket(AF_INET, SOCK_RAW, IPPROTO_RAW).
 *
 * The data interface uses the netfilter interface, at the moment
 * hooked to the PRE_ROUTING and POST_ROUTING hooks.
 * Unfortunately the netfilter interface is a moving target,
 * so we need a set of macros to adapt to the various cases.
 *
 * In the netfilter hook we just mark packet as 'QUEUE' and then
 * let the queue handler to do the whole work (filtering and
 * possibly emulation).
 * As we receive packets, we wrap them with an mbuf descriptor
 * so the existing ipfw+dummynet code runs unmodified.
 */

#include <sys/cdefs.h>
#include <sys/mbuf.h>			/* sizeof struct mbuf */
#include <sys/param.h>			/* NGROUPS */

/*
 * Debugging macros, defined only if D is not provided elsewhere.
 * ND() and D1() expand to nothing; D() prints the calling function
 * name (left-aligned in 10 columns), the formatted message and a newline.
 */
#ifndef D
#define ND(fmt, ...) do {} while (0)
#define D1(fmt, ...) do {} while (0)
#define D(fmt, ...) printf("%-10s " fmt "\n",      \
        __FUNCTION__, ## __VA_ARGS__)
#endif

#ifdef __linux__
#include <linux/module.h>
#include <linux/kernel.h>

#ifndef CONFIG_NETFILTER
#error should configure netfilter (broken on 2.6.26 and below ?)
#endif

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>	/* NF_IP_PRI_FILTER */

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25)
#include <net/netfilter/nf_queue.h>	/* nf_queue */
#endif

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14)
#define __read_mostly
#endif

#endif /* !__linux__ */

#include <netinet/in.h>			/* in_addr */
#include <netinet/ip_fw.h>		/* ip_fw_ctl_t, ip_fw_chk_t */
#include <netinet/ipfw/ip_fw_private.h>		/* ip_fw_ctl_t, ip_fw_chk_t */
#include <netinet/ip_dummynet.h>	/* ip_dn_ctl_t, ip_dn_io_t */
#include <net/pfil.h>			/* PFIL_IN, PFIL_OUT */

#ifdef __linux__

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,13)
/* XXX was < 2.6.0:  inet_hashtables.h is introduced in 2.6.14 */
// #warning --- inet_hashtables not present on 2.4
#include <linux/tcp.h>
#include <net/route.h>
#include <net/sock.h>
static inline int inet_iif(const struct sk_buff *skb)
{
        return ((struct rtable *)skb->dst)->rt_iif;
}

#else
#include <net/inet_hashtables.h>	/* inet_lookup */
#endif
#endif /* __linux__ */

#include <net/route.h>			/* inet_iif */

/*
 * Here we allocate some global variables used in the firewall.
 */
/*
 * Hook pointers shared between this glue layer and the ipfw/dummynet
 * code. ipfw_ctl_h() below checks the two control pointers for NULL
 * before calling through them.
 */
//ip_dn_ctl_t    *ip_dn_ctl_ptr;
int (*ip_dn_ctl_ptr)(struct sockopt *);	/* dummynet sockopt handler */

ip_fw_ctl_t    *ip_fw_ctl_ptr;		/* ipfw sockopt handler */

int	(*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa);
ip_fw_chk_t    *ip_fw_chk_ptr;

void		(*bridge_dn_p)(struct mbuf *, struct ifnet *);

/* Divert hooks. */
void (*ip_divert_ptr)(struct mbuf *m, int incoming);

/* ng_ipfw hooks. */
ng_ipfw_input_t *ng_ipfw_input_p = NULL;

/*---
 * Glue code to implement the registration of children with the parent.
 * Each child should call my_mod_register() when linking, so that
 * module_init() and module_exit() can call init_children() and
 * fini_children() to provide the necessary initialization.
 * We use the same mechanism for MODULE_ and SYSINIT_.
 * The former only get a pointer to the moduledata,
 * the latter have two function pointers (init/uninit)
 */
#include <sys/module.h>
/*
 * One registered child. Either mod is set (MODULE_ style, driven
 * through mod->evhand) or the init/uninit pair is used (SYSINIT_ style).
 */
struct mod_args {
        const char *name;	/* label used in the log messages */
        int order;		/* recorded and printed only; not used for sorting here */
        struct moduledata *mod;	/* NULL for SYSINIT-style children */
	void (*init)(void), (*uninit)(void);
};

static unsigned int mod_idx;	/* number of slots of mods[] in use */
static struct mod_args mods[10];	/* hard limit to 10 modules */

int
my_mod_register(const char *name, int order,
	struct moduledata *mod, void *init, void *uninit);
/*
 * my_mod_register should be called automatically as the init
 * functions in the submodules. Unfortunately this compiler/linker
 * trick is not supported yet so we call it manually.
 *
 * Record one child in mods[] so that init_children() and
 * fini_children() can start and stop it later.
 * mod is the moduledata for MODULE_-style children (NULL otherwise);
 * init/uninit are the function pointers for SYSINIT_-style children.
 *
 * Returns 0 on success. When the table is full the child is NOT
 * registered: this used to happen silently while still reporting
 * success, now it is logged and -1 is returned.
 */
int
my_mod_register(const char *name, int order,
	struct moduledata *mod, void *init, void *uninit)
{
	struct mod_args m;

	m.name = name;
	m.order = order;
	m.mod = mod;
	m.init = init;
	m.uninit = uninit;

	printf("%s %s called\n", __FUNCTION__, name);
	if (mod_idx >= sizeof(mods) / sizeof(mods[0])) {
		printf("%s: table full, %s NOT registered\n",
			__FUNCTION__, name);
		return -1;
	}
	mods[mod_idx++] = m;
	return 0;
}

/*
 * Start every registered child, in registration order.
 * A child with a moduledata event handler receives MOD_LOAD;
 * otherwise its init function, if any, is called.
 */
static void
init_children(void)
{
	unsigned int i = 0;

        /* Call the functions registered at init time. */
	printf("%s mod_idx value %d\n", __FUNCTION__, mod_idx);
	while (i < mod_idx) {
		struct mod_args *child = &mods[i];

		printf("+++ start module %d %s %s at %p order 0x%x\n",
			i, child->name,
			child->mod ? child->mod->name : "SYSINIT",
			child->mod, child->order);
		if (child->mod != NULL && child->mod->evhand != NULL) {
			child->mod->evhand(NULL, MOD_LOAD, child->mod->priv);
		} else if (child->init != NULL) {
			child->init();
		}
		i++;
	}
}

/*
 * Stop every registered child, in reverse registration order.
 * A child with a moduledata event handler receives MOD_UNLOAD;
 * otherwise its uninit function, if any, is called.
 */
static void
fini_children(void)
{
	int i = mod_idx;

        /* Walk the table backwards, from the last registered child. */
	while (i-- > 0) {
		struct mod_args *child = &mods[i];

		printf("+++ end module %d %s %s at %p order 0x%x\n",
			i, child->name,
			child->mod ? child->mod->name : "SYSINIT",
			child->mod, child->order);
		if (child->mod != NULL && child->mod->evhand != NULL) {
			child->mod->evhand(NULL, MOD_UNLOAD, child->mod->priv);
		} else if (child->uninit != NULL) {
			child->uninit();
		}
	}
}
/*--- end of module binding helper functions ---*/

/*---
 * Control hooks:
 * ipfw_ctl_h() is a wrapper for linux to FreeBSD sockopt call convention.
 * then call the ipfw handler in order to manage requests.
 * In turn this is called by the linux set/get handlers.
 */
/*
 * Fill the caller-supplied sockopt *s from the linux-style arguments
 * and hand it to the ipfw or the dummynet control handler:
 *  - IP_FW3 and commands below IP_DUMMYNET_CONFIGURE (other than
 *    IP_DUMMYNET3) go to ip_fw_ctl_ptr;
 *  - IP_DUMMYNET3 and commands from IP_DUMMYNET_CONFIGURE up go to
 *    ip_dn_ctl_ptr.
 * If the selected handler is not installed the result is EINVAL.
 * The handler's error code is returned negated (errors are < 0 on linux).
 */
static int
ipfw_ctl_h(struct sockopt *s, int cmd, int dir, int len, void __user *user)
{
	struct thread dummy_td;
	int err = EINVAL;
	int for_dummynet = (cmd == IP_DUMMYNET3 ||
	    cmd >= IP_DUMMYNET_CONFIGURE);
	int for_ipfw = (cmd != IP_DUMMYNET3) &&
	    (cmd == IP_FW3 || cmd < IP_DUMMYNET_CONFIGURE);

	/*
	 * sopt_td is not used but it is referenced. dummy_td lives on
	 * this stack frame, so sopt_td is only valid during this call.
	 */
	memset(&dummy_td, 0, sizeof(dummy_td));

	memset(s, 0, sizeof(*s));
	s->sopt_name = cmd;
	s->sopt_dir = dir;
	s->sopt_valsize = len;
	s->sopt_val = user;
	s->sopt_td = &dummy_td;

	if (for_ipfw && ip_fw_ctl_ptr != NULL) {
		err = ip_fw_ctl_ptr(s);
	} else if (for_dummynet && ip_dn_ctl_ptr != NULL) {
		err = ip_dn_ctl_ptr(s);
	}

	return -err;	/* errors are < 0 on linux */
}

#ifdef linux
/*
 * Convert an mbuf into an skbuff
 * At the moment this only works for ip packets fully contained
 * in a single mbuf. We assume that on entry ip_len and ip_off are
 * in host format, and the ip checksum is not computed.
 *
 * Returns a newly allocated skb holding a copy of the packet, with
 * the route to the destination attached, or NULL if no route is
 * found or the skb cannot be allocated. The mbuf is not freed here.
 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) /* check boundary */
/*
 * Stubs for old kernels: nothing is converted or sent.
 * NOTE(review): the parameter type is spelled "struct skbuff", not
 * "struct sk_buff" -- left untouched, confirm whether it is intended.
 */
int dst_output(struct skbuff *s)
{
	return 0;
}

struct sk_buff *
mbuf2skbuff(struct mbuf* m)
{
	return NULL;
}
#else
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,38)
#include <linux/err.h>			/* IS_ERR */
#endif

struct sk_buff *
mbuf2skbuff(struct mbuf* m)
{
	struct sk_buff *skb;
	size_t len = m->m_pkthdr.len;

	/* used to lookup the routing table */
	struct rtable *r = NULL;
	struct flowi fl;
	int ret = 0;	/* success for ip_route_output_key() */

	struct ip *ip = mtod(m, struct ip *);

	/*
	 * Byte-swap ip_len and ip_off before the packet is handed to
	 * linux. NOTE(review): an older note here said the fields arrive
	 * in network format, while the comment above the function says
	 * host format; the swap is the same operation either way.
	 */
	ip->ip_len = ntohs(ip->ip_len);
	ip->ip_off = ntohs(ip->ip_off);

	ip->ip_sum = 0;
	ip->ip_sum = in_cksum(m, ip->ip_hl<<2);

	/* fill flowi struct, we need just the dst addr, see XXX */
	bzero(&fl, sizeof(fl));
	flow_daddr.daddr = ip->ip_dst.s_addr;

	/*
	 * ip_route_output_key() should increment
	 * r->u.dst.__use and call a dst_hold(dst)
	 * XXX verify how we release the resources.
	 */
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,38) /* check boundary */
	r = ip_route_output_key(&init_net, &fl.u.ip4);
	/*
	 * This variant reports failure through an ERR_PTR value, not
	 * NULL: normalize it so the check below catches it instead of
	 * dereferencing an error pointer later on.
	 */
	if (IS_ERR(r))
		r = NULL;
#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26) /* check boundary */
	ret = ip_route_output_key(&init_net, &r, &fl);
#else
	ret = ip_route_output_key(&r, &fl);
#endif
	if (ret != 0 || r == NULL ) {
		printf("NO ROUTE FOUND\n");
		return NULL;
	}

	/* allocate the skbuff and the data */
	skb = alloc_skb(len + sizeof(struct ethhdr), GFP_ATOMIC);
	if (skb == NULL) {
		printf("%s: can not allocate SKB buffers.\n", __FUNCTION__);
		/* drop the route reference taken by the lookup above */
		ip_rt_put(r);
		return NULL;
	}

	skb->protocol = htons(ETH_P_IP); // XXX 8 or 16 bit ?
	/* sk_dst_set XXX take the lock (?) */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
	skb_dst_set(skb, &r->u.dst);
#else
	skb_dst_set(skb, &r->dst);
#endif
	skb->dev = skb_dst(skb)->dev;

	/* reserve space for ethernet header */
	skb_reserve(skb, sizeof(struct ethhdr));

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
	skb_reset_network_header(skb); // skb->network_header = skb->data - skb->head
#else
	skb->nh.raw = skb->data;
#endif
	/* set skbuff tail pointers and copy content */
	skb_put(skb, len);
	memcpy(skb->data, m->m_data, len);

	return skb;
}
#endif /* linux 2.6+ */
#endif /* linux */


/*
 * This function is called to reinject packets to the
 * kernel stack within the linux netfilter system
 * or to send a new created mbuf.
 * In the first case we have a valid sk_buff pointer
 * encapsulated within the fake mbuf, so we can call
 * the reinject function trough netisr_dispatch.
 * In the last case we need to build a sk_buff from scratch,
 * before sending out the packet.
 */
/*
 * Send out a packet described by m. Always returns 0.
 * An mbuf that wraps an sk_buff is a reinjected packet and goes
 * through netisr_dispatch(). Otherwise the packet was generated
 * locally: on linux it is converted with mbuf2skbuff() and passed to
 * dst_output() if the conversion worked, then the mbuf is released
 * with FREE_PKT(). All other arguments are ignored.
 */
int
ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
    struct ip_moptions *imo, struct inpcb *inp)
{
	(void)opt; (void)ro; (void)flags; (void)imo; (void)inp;	/* UNUSED */

	if (m->m_skb == NULL) {
		/* self-generated packet, wrap as appropriate and send */
#ifdef __linux__
		struct sk_buff *skb = mbuf2skbuff(m);

		if (skb != NULL)
			dst_output(skb);
#else /* Windows */
		D("unimplemented.");
#endif
		FREE_PKT(m);
		return 0;
	}

	/* reinjected packet, just call dispatch */
	ND("sending... ");
	netisr_dispatch(0, m);
	return 0;
}

/*
 * setsockopt hook: forwards the request to ipfw_ctl_h() with
 * SOPT_SET. There is no return value other than the error code.
 */
int
do_ipfw_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
	struct sockopt sopt;	/* pass arguments */
	int err;

	(void)sk;		/* UNUSED */
	err = ipfw_ctl_h(&sopt, cmd, SOPT_SET, len, user);
	return err;
}

/*
 * getsockopt hook: forwards the request to ipfw_ctl_h() with
 * SOPT_GET. getsockopt can return a block of data in response, so
 * the resulting size is written back through len.
 */
int
do_ipfw_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
	struct sockopt sopt;	/* pass arguments */
	int err;

	(void)sk;		/* UNUSED */
	err = ipfw_ctl_h(&sopt, cmd, SOPT_GET, *len, user);
	*len = sopt.sopt_valsize;	/* return length back to the caller */
	return err;
}

#ifdef __linux__

/*
 * declare our [get|set]sockopt hooks
 */
/*
 * Sockopt registration record: set and get requests on PF_INET in the
 * range _IPFW_SOCKOPT_BASE .. _IPFW_SOCKOPT_END are routed to
 * do_ipfw_set_ctl() and do_ipfw_get_ctl(). The owner field is only
 * filled in on 2.6.24 and above.
 */
static struct nf_sockopt_ops ipfw_sockopts = {
	.pf		= PF_INET,
	.set_optmin	= _IPFW_SOCKOPT_BASE,
	.set_optmax	= _IPFW_SOCKOPT_END,
	.set		= do_ipfw_set_ctl,
	.get_optmin	= _IPFW_SOCKOPT_BASE,
	.get_optmax	= _IPFW_SOCKOPT_END,
	.get		= do_ipfw_get_ctl,
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
	.owner		= THIS_MODULE,
#endif
};

/*----
 * We need a number of macros to adapt to the various APIs in
 * different linux versions. Among them:
 *
 * - the hook names change between macros (NF_IP*) and enum NF_INET_*
 *
 * - the second argument to the netfilter hook is
 *	struct sk_buff **	in kernels <= 2.6.22
 *	struct sk_buff *	in kernels > 2.6.22
 *
 * - NF_STOP is not defined before 2.6 so we remap it to NF_ACCEPT
 *
 * - the packet descriptor passed to the queue handler is
 *	struct nf_info		in kernels <= 2.6.24
 *	struct nf_queue_entry	in kernels > 2.6.24
 *
 * - the arguments to the queue handler also change;
 */

/*
 * declare hook to grab packets from the netfilter interface.
 * The NF_* names change in different versions of linux, in some
 * cases they are #defines, in others they are enum, so we
 * need to adapt.
 */
#ifndef NF_IP_PRE_ROUTING
#define NF_IP_PRE_ROUTING	NF_INET_PRE_ROUTING
#endif
#ifndef NF_IP_POST_ROUTING
#define NF_IP_POST_ROUTING	NF_INET_POST_ROUTING
#endif

/*
 * ipfw hooks into the POST_ROUTING and the PRE_ROUTING chains.
 * PlanetLab sets skb_tag to the slice id in the LOCAL_INPUT and
 * POST_ROUTING chains, so if we want to use that information we
 * need to hook the LOCAL_INPUT chain instead of the PRE_ROUTING.
 * However at the moment the skb_tag info is not reliable so
 * we stay with the standard hooks.
 */
#if 0 // defined(IPFW_PLANETLAB)
#define IPFW_HOOK_IN NF_IP_LOCAL_IN
#else
#define IPFW_HOOK_IN NF_IP_PRE_ROUTING
#endif

/*
 * The main netfilter hook.
 * To make life simple, we queue everything and then do all the
 * decision in the queue handler.
 *
 * XXX note that in 2.4 and up to 2.6.22 the skbuf is passed as sk_buff**
 * so we have an #ifdef to set the proper argument type.
 */
/*
 * Netfilter hook body: every packet gets the NF_QUEUE verdict and
 * none of the arguments is examined. Only the argument types change
 * with the kernel version.
 */
static unsigned int
call_ipfw(
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0)
	unsigned int hooknum,
#else
	/* from 3.13 the first argument is the hook descriptor itself */
	const struct nf_hook_ops *hooknum,
#endif

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) // in 2.6.22 we have **
	struct sk_buff  **skb,
#else
	struct sk_buff  *skb,
#endif
	const struct net_device *in, const struct net_device *out,
	int (*okfn)(struct sk_buff *))
{
	(void)hooknum; (void)skb; (void)in; (void)out; (void)okfn; /* UNUSED */
	return NF_QUEUE;
}

/* Kernels before 2.6.12 have no NF_STOP verdict: fall back to NF_ACCEPT. */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12)	/* XXX was 2.6.0 */
#define	NF_STOP		NF_ACCEPT
#endif

/*
 * Queue-handler adaptation layer. It provides three macros:
 *   QH_ARGS     the argument list of the queue handler;
 *   DEFINE_SKB  declares the local 'skb' when it is not an argument;
 *   REINJECT    calls nf_reinject() with the right argument list.
 * Before 2.6.25 REINJECT expands to a reference to a variable named
 * 'skb', so every function using it must have one in scope.
 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)

/*
 * nf_queue_entry is a recent addition, in previous versions
 * of the code the struct is called nf_info.
 */
#define nf_queue_entry	nf_info	/* for simplicity */

/* also, 2.4 and perhaps something else have different arguments */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14)	/* XXX unsure */
/* on 2.4 we use nf_info */
#define QH_ARGS		struct sk_buff *skb, struct nf_info *info, void *data
#else	/* 2.6.14. 2.6.24 */
#define QH_ARGS		struct sk_buff *skb, struct nf_info *info, unsigned int qnum, void *data
#endif

#define DEFINE_SKB	/* nothing, already an argument */
#define	REINJECT(_inf, _verd)	nf_reinject(skb, _inf, _verd)

#else	/* 2.6.25 and above */

#define QH_ARGS		struct nf_queue_entry *info, unsigned int queuenum
#define DEFINE_SKB	struct sk_buff *skb = info->skb;
#define	REINJECT(_inf, _verd)	nf_reinject(_inf, _verd)
#endif

/*
 * used by dummynet when dropping packets
 * XXX use dummynet_send()
 *
 * Give the packet queued behind m back to netfilter with an NF_DROP
 * verdict. The mbuf itself is not freed here.
 *
 * On kernels before 2.6.25 REINJECT() also needs the sk_buff, which
 * it picks up from a local variable named 'skb'. That must be the
 * sk_buff wrapped by the mbuf (m->m_skb, as set by the queue
 * handler); the mbuf pointer itself used to be cast to an sk_buff.
 */
void
reinject_drop(struct mbuf* m)
{
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)	/* unsure on the exact boundary */
	struct sk_buff *skb = m->m_skb;	/* referenced by REINJECT() */
#endif
	REINJECT(m->queue_entry, NF_DROP);
}

/*
 * The real call to the firewall. nf_queue_entry points to the skbuf,
 * and eventually we need to return both through nf_reinject().
 */
/*
 * Queue handler: wrap the queued skb in a freshly allocated mbuf, run
 * it through ipfw_check_hook() and act on the outcome. Always returns 0;
 * the verdict is delivered through REINJECT().
 * If the mbuf cannot be allocated the packet is accepted unfiltered.
 */
static int
ipfw2_queue_handler(QH_ARGS)
{
	DEFINE_SKB	/* no semicolon here, goes in the macro */
	int ret = 0;	/* return value */
	struct mbuf *m;

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
	/* no ip header to look at: accept right away */
	if (skb->nh.iph == NULL) {
		printf("null dp, len %d reinject now\n", skb->len);
		REINJECT(info, NF_ACCEPT);
		return 0;
	}
#endif
	m = malloc(sizeof(*m), 0, 0);
	if (m == NULL) {
		printf("malloc fail, len %d reinject now\n", skb->len);
		REINJECT(info, NF_ACCEPT);
		return 0;
	}

	/* describe the skb through the mbuf; only these fields are set here */
	m->m_skb = skb;
	m->m_len = skb->len;		/* len from ip header to end */
	m->m_pkthdr.len = skb->len;	/* total packet len */
	m->m_pkthdr.rcvif = info->indev;
	m->queue_entry = info;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)	/* XXX was 2.6.0 */
	m->m_data = (char *)skb->nh.iph;
#else
	m->m_data = (char *)skb_network_header(skb);	// XXX unsigned ? */
#endif

	/* XXX add the interface */
	/* the hook the packet was queued from selects the direction */
	if (info->hook == IPFW_HOOK_IN) {
		ret = ipfw_check_hook(NULL, &m, info->indev, PFIL_IN, NULL);
	} else {
		ret = ipfw_check_hook(NULL, &m, info->outdev, PFIL_OUT, NULL);
	}

	/* ipfw_check_hook() leaves m non-NULL only when the packet passes */
	if (m != NULL) {	/* Accept. reinject and free the mbuf */
		REINJECT(info, NF_ACCEPT);
		m_freem(m);
	} else if (ret == 0) {
		/* dummynet has kept the packet, will reinject later. */
	} else {
		/*
		 * Packet dropped by ipfw or dummynet. Nothing to do as
		 * FREE_PKT already did a reinject as NF_DROP
		 */
	}
	return 0;
}

struct route;
struct ip_moptions;
struct inpcb;

/* XXX should include prototypes for netisr_dispatch and ip_output */
/*
 * The reinjection routine after a packet comes out from dummynet.
 * We must update the skb timestamp so ping reports the right time.
 * This routine is also used (with num == -1) as FREE_PKT. XXX
 */
/*
 * Release the mbuf and, if it wrapped a queued packet, hand that
 * packet back to netfilter: NF_DROP when num == -1, NF_STOP otherwise.
 */
void
netisr_dispatch(int num, struct mbuf *m)
{
	/* both fields are read before m is freed below */
	struct nf_queue_entry *info = m->queue_entry;
	struct sk_buff *skb = m->m_skb;	/* always used */

	/*
	 * This function can be called by the FREE_PKT()
	 * used when ipfw generate their own mbuf packets
	 * or by the mbuf2skbuff() function.
	 */
	m_freem(m);

	/* XXX check
	 * info is null in the case of a real mbuf
	 * (one created by the ipfw code without a
	 * valid sk_buff pointer
	 */
	if (info == NULL)
		return;

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)	// XXX above 2.6.x ?
	__net_timestamp(skb);	/* update timestamp */
#endif

	/* XXX to obey one-pass, possibly call the queue handler here */
	REINJECT(info, ((num == -1)?NF_DROP:NF_STOP));	/* accept but no more firewall */
}

/*
 * socket lookup function for linux.
 * This code is used to associate uid, gid, jail/xid to packets,
 * and store the info in a cache *ugp where they can be accessed quickly.
 * The function returns 1 if the info is found, -1 otherwise.
 *
 * We do this only on selected protocols: TCP, ...
 *
 * The chain is the following
 *   sk_buff*  sock*  socket*    file*
 *	skb  ->  sk ->sk_socket->file ->f_owner    ->pid
 *	skb  ->  sk ->sk_socket->file ->f_uid (direct)
 *	skb  ->  sk ->sk_socket->file ->f_cred->fsuid (2.6.29+)
 *
 * Related headers:
 * linux/skbuff.h	struct skbuff
 * net/sock.h		struct sock
 * linux/net.h		struct socket
 * linux/fs.h		struct file
 *
 * With vserver we may have sk->sk_xid and sk->sk_nid that
 * which we store in fw_groups[1] (matches O_JAIL) and fw_groups[2]
 * (no matches yet)
 *
 * Note- for locally generated, outgoing packets we should not need
 * need a lookup because the sk_buff already points to the socket where
 * the info is.
 */
extern struct inet_hashinfo tcp_hashinfo;
/*
 * Find the socket a packet belongs to and fill the credential cache *u.
 *
 * proto	only IPPROTO_TCP is handled, anything else returns -1
 * saddr/sport, daddr/dport
 *		the packet's addresses and ports; they are swapped for
 *		the lookup when dir != 0
 * skb		the packet; skb->sk is used directly when set
 * dir		non-zero on output
 * u		receives xid/nid (0 without CONFIG_VSERVER)
 *
 * Returns 1 when a socket is found, -1 otherwise. On kernels up to
 * 2.6.13 the function is a stub that always returns -1.
 */
int
linux_lookup(const int proto, const __be32 saddr, const __be16 sport,
		const __be32 daddr, const __be16 dport,
		struct sk_buff *skb, int dir, struct bsd_ucred *u)
{
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,13) 	/* XXX was 2.6.0 */
	return -1;
#else
	struct sock *sk;
	int ret = -1;	/* default return value */
	int st = -1;	/* state */


	if (proto != IPPROTO_TCP)	/* XXX extend for UDP */
		return -1;

	/* on output we need the dst entry, on input the device */
	if ((dir ? (void *)skb_dst(skb) : (void *)skb->dev) == NULL) {
		panic(" -- this should not happen\n");
		return -1;
	}

	if (skb->sk) {
		sk = skb->sk;
	} else {
		/*
		 * Try a lookup. On a match, sk has a refcount that we must
		 * release on exit (we know it because skb->sk = NULL).
		 *
		 * inet_lookup above 2.6.24 has an additional 'net' parameter
		 * so we use a macro to conditionally supply it.
		 * swap dst and src depending on the direction.
		 */
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,24)
#define _OPT_NET_ARG
#else
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
/* there is no dev_net() on 2.6.25 */
#define _OPT_NET_ARG (skb->dev->nd_net),
#else	/* 2.6.26 and above */
#define _OPT_NET_ARG dev_net(skb->dev),
#endif
#endif
		sk =  (dir) ? /* dir != 0 on output */
		    inet_lookup(_OPT_NET_ARG &tcp_hashinfo,
			daddr, dport, saddr, sport,	// match outgoing
			inet_iif(skb)) :
		    inet_lookup(_OPT_NET_ARG &tcp_hashinfo,
			saddr, sport, daddr, dport,	// match incoming
			skb->dev->ifindex);
#undef _OPT_NET_ARG

		if (sk == NULL) /* no match, nothing to be done */
			return -1;
	}
	ret = 1;	/* retrying won't make things better */
	st = sk->sk_state;
#ifdef CONFIG_VSERVER
	u->xid = sk->sk_xid;
	u->nid = sk->sk_nid;
#else
	u->xid = u->nid = 0;
#endif
	/*
	 * Exclude tcp states where sk points to a inet_timewait_sock which
	 * has no sk_socket field (surely TCP_TIME_WAIT, perhaps more).
	 * To be safe, use a whitelist and not a blacklist.
	 * Before dereferencing sk_socket grab a lock on sk_callback_lock.
	 *
	 * Once again we need conditional code because the UID and GID
	 * location changes between kernels.
	 */
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,28)
/* use the current's real uid/gid */
#define _CURR_UID f_uid
#define _CURR_GID f_gid
#else /* 2.6.29 and above */
/* use the current's file access real uid/gid */
#define _CURR_UID f_cred->fsuid
#define _CURR_GID f_cred->fsgid
#endif

#define GOOD_STATES (	\
	(1<<TCP_LISTEN) | (1<<TCP_SYN_RECV)   | (1<<TCP_SYN_SENT)   | \
	(1<<TCP_ESTABLISHED)  | (1<<TCP_FIN_WAIT1) | (1<<TCP_FIN_WAIT2) )
	// surely exclude TCP_CLOSE, TCP_TIME_WAIT, TCP_LAST_ACK
	// uncertain TCP_CLOSE_WAIT and TCP_CLOSING

	if ((1<<st) & GOOD_STATES) {
		read_lock_bh(&sk->sk_callback_lock);
		if (sk->sk_socket && sk->sk_socket->file) {
			/*
			 * NOTE(review): both assignments are commented
			 * out, so in the good states u->uid and u->gid
			 * are left as the caller passed them in.
			 */
			//u->uid = sk->sk_socket->file->_CURR_UID;
			//u->gid = sk->sk_socket->file->_CURR_GID;
		}
		read_unlock_bh(&sk->sk_callback_lock);
	} else {
		u->uid = u->gid = 0;
	}
	if (!skb->sk) /* return the reference that came from the lookup */
		sock_put(sk);
#undef GOOD_STATES
#undef _CURR_UID
#undef _CURR_GID
	return ret;

#endif /* LINUX > 2.4 */
}

/*
 * Now prepare to hook the various functions.
 * Linux 2.4 has a different API so we need some adaptation
 * for register and unregister hooks
 *
 * the unregister function changed arguments between 2.6.22 and 2.6.24
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14)
/*
 * Descriptor passed to nf_register_queue_handler() on kernels from
 * 2.6.14 on: queued packets are handed to ipfw2_queue_handler().
 * The .name member is only initialized on kernels before 3.8.2.
 */
struct nf_queue_handler ipfw2_queue_handler_desc = {
        .outfn = ipfw2_queue_handler,
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,2)
        .name = "ipfw2 dummynet queue",
#endif
};
/*
 * REG_QH_ARG(pf, fn) expands to the argument list of
 * nf_register_queue_handler(): "pf, &fn_desc" before 3.8.2,
 * only "&fn_desc" from 3.8.2 on.
 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,2)
#define REG_QH_ARG(pf, fn)	pf, &(fn ## _desc)
#else
#define REG_QH_ARG(pf, fn)	&(fn ## _desc)
#endif
#endif

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) /* XXX was 2.6.0 */
/*
 * Compatibility version of nf_register_hooks() for kernels that only
 * provide nf_register_hook(): register the 'n' entries of ops[] one
 * at a time, in array order.
 *
 * Returns the (non-negative) result of the last registration on
 * success. If one registration fails, the entries registered so far
 * are unregistered again, so the caller gets an all-or-nothing result,
 * and the negative error from nf_register_hook() is returned.
 */
static int
nf_register_hooks(struct nf_hook_ops *ops, int n)
{
	int i, ret = 0;

	for (i = 0; i < n; i++) {
		ret = nf_register_hook(ops + i);
		if (ret < 0)
			break;
	}
	if (ret < 0) {
		/* ops[i] failed: undo ops[i-1] .. ops[0], newest first */
		while (--i >= 0)
			nf_unregister_hook(ops + i);
	}
	return ret;
}

static void
nf_unregister_hooks(struct nf_hook_ops *ops, int n)
{
	int i;
	for (i = 0; i < n; i++) {
		nf_unregister_hook(ops + i);
	}
}
/* kernels before 2.6.14 take the handler function and a NULL data arg */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) /* XXX was 2.6.0 */
#define REG_QH_ARG(pf, fn)	pf, fn, NULL
#endif
/* the text after '//' is a comment, so this expands to nothing */
#define UNREG_QH_ARG(pf, fn) //fn	/* argument for nf_[un]register_queue_handler */
/* no .owner initializer is emitted in the hook descriptors */
#define SET_MOD_OWNER

#else /* linux >= 2.6.17 */

/*
 * UNREG_QH_ARG(pf, fn) expands to the argument list of
 * nf_unregister_queue_handler(): nothing before 2.6.24 (the '//fn'
 * is a comment), "pf, &fn_desc" from 2.6.24 up to but excluding 3.8.2,
 * nothing again from 3.8.2 on.
 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
#define UNREG_QH_ARG(pf, fn) //fn
#elif LINUX_VERSION_CODE < KERNEL_VERSION(3,8,2)
#define UNREG_QH_ARG(pf, fn)	pf, &(fn ## _desc)
#else
#define UNREG_QH_ARG(pf, fn)
#endif /* UNREG_QH_ARG variants for linux >= 2.6.17 */

/* initializer for the .owner member of the hook descriptors */
#define SET_MOD_OWNER	.owner = THIS_MODULE,

#endif	/* LINUX < 2.6.17 */

/*
 * Netfilter hook descriptors, registered as a group with
 * nf_register_hooks(). Both entries call call_ipfw() for PF_INET at
 * priority NF_IP_PRI_FILTER: the first at IPFW_HOOK_IN, the second
 * at NF_IP_POST_ROUTING.
 */
static struct nf_hook_ops ipfw_ops[] __read_mostly = {
        {
                .hook           = call_ipfw,
                .pf             = PF_INET,
                .hooknum        = IPFW_HOOK_IN,
                .priority       = NF_IP_PRI_FILTER,
                SET_MOD_OWNER	/* expands to ".owner = ...," or nothing */
        },
        {
                .hook           = call_ipfw,
                .pf             = PF_INET,
                .hooknum        = NF_IP_POST_ROUTING,
                .priority       = NF_IP_PRI_FILTER,
		SET_MOD_OWNER
        },
};
#endif /* __linux__ */

/* descriptors for the children, until i find a way for the
 * linker to produce them
 */
extern moduledata_t *moddesc_ipfw;
extern moduledata_t *moddesc_dummynet;
extern moduledata_t *moddesc_dn_fifo;
extern moduledata_t *moddesc_dn_wf2qp;
extern moduledata_t *moddesc_dn_rr;
extern moduledata_t *moddesc_dn_qfq;
extern moduledata_t *moddesc_dn_prio;
extern void *sysinit_ipfw_init;
extern void *sysuninit_ipfw_destroy;
extern void *sysinit_vnet_ipfw_init;
extern void *sysuninit_vnet_ipfw_uninit;

/*
 * Module glue - init and exit function.
 */
/*
 * Module load entry point.
 *
 * Initializes the radix code, registers the ipfw/dummynet components
 * and schedulers with my_mod_register() (the second argument is the
 * registration order) and runs init_children().
 * On Windows it then sets the timer resolution; with EMULATE_SYSCTL it
 * calls keinit_GST(). On Linux it finally registers, in this order,
 * the sockopt interface, the queue handler and the netfilter hooks.
 *
 * Returns 0 on success. On Linux, if a registration fails, only what
 * was registered before the failing step is undone (in reverse order),
 * fini_children() is called and the negative error is returned.
 */
int __init
ipfw_module_init(void)
{
	int ret = 0;
#ifdef _WIN32
	unsigned long resolution;
#endif

	rn_init(64);
	my_mod_register("ipfw",  1, moddesc_ipfw, NULL, NULL);
	my_mod_register("sy_ipfw",  2, NULL,
		sysinit_ipfw_init, sysuninit_ipfw_destroy);
	my_mod_register("sy_Vnet_ipfw",  3, NULL,
		sysinit_vnet_ipfw_init, sysuninit_vnet_ipfw_uninit);
	my_mod_register("dummynet",  4, moddesc_dummynet, NULL, NULL);
	my_mod_register("dn_fifo",  5, moddesc_dn_fifo, NULL, NULL);
	my_mod_register("dn_wf2qp",  6, moddesc_dn_wf2qp, NULL, NULL);
	my_mod_register("dn_rr",  7, moddesc_dn_rr, NULL, NULL);
	my_mod_register("dn_qfq",  8, moddesc_dn_qfq, NULL, NULL);
	my_mod_register("dn_prio",  9, moddesc_dn_prio, NULL, NULL);
	init_children();

#ifdef _WIN32
	resolution = ExSetTimerResolution(1, TRUE);
	/* 'resolution' is an unsigned long, so it must be printed with %lu */
	printf("*** ExSetTimerResolution: resolution set to %lu n-sec ***\n",resolution);
#endif
#ifdef EMULATE_SYSCTL
	keinit_GST();
#endif

#ifdef __linux__
	/* sockopt register, in order to talk with user space */
	ret = nf_register_sockopt(&ipfw_sockopts);
	if (ret < 0) {
		printf("error %d in nf_register_sockopt\n", ret);
		goto clean_modules;
	}

	/* queue handler registration, in order to get network
	 * packet under a private queue.
	 * From 3.8.2 on the call does not return a value, so 'ret' keeps
	 * the non-negative result of the sockopt registration.
	 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,2)
	ret =
#endif
	    nf_register_queue_handler(REG_QH_ARG(PF_INET, ipfw2_queue_handler) );
	if (ret < 0)	/* queue busy: our handler was NOT installed */
		goto unregister_sockopt;

	ret = nf_register_hooks(ipfw_ops, ARRAY_SIZE(ipfw_ops));
	if (ret < 0)
		goto unregister_queue;

	printf("%s loaded\n", __FUNCTION__);
	return 0;


/* handle errors on load: each label undoes one successful step */
unregister_queue:
	nf_unregister_queue_handler(UNREG_QH_ARG(PF_INET, ipfw2_queue_handler) );

unregister_sockopt:
	nf_unregister_sockopt(&ipfw_sockopts);

clean_modules:
	fini_children();
	printf("%s error\n", __FUNCTION__);

#endif	/* __linux__ */
	return ret;
}

/*
 * Module shutdown: undo what ipfw_module_init() set up.
 * With EMULATE_SYSCTL, keexit_GST() runs first. On Windows the timer
 * resolution request is withdrawn; otherwise the netfilter hooks, the
 * queue handler and the sockopt interface are unregistered, in the
 * reverse order of their registration. fini_children() runs last.
 */
void __exit
ipfw_module_exit(void)
{
#ifdef EMULATE_SYSCTL
	keexit_GST();
#endif
#ifdef _WIN32
	ExSetTimerResolution(0,FALSE);

#else  /* linux hook */
        nf_unregister_hooks(ipfw_ops, ARRAY_SIZE(ipfw_ops));
	/* maybe drain the queue before unregistering ? */
	nf_unregister_queue_handler(UNREG_QH_ARG(PF_INET, ipfw2_queue_handler) );
	nf_unregister_sockopt(&ipfw_sockopts);
#endif	/* !_WIN32 */

	fini_children();

	printf("%s unloaded\n", __FUNCTION__);
}

#ifdef __linux__
module_init(ipfw_module_init)
module_exit(ipfw_module_exit)
MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */
#endif


================================================
FILE: kipfw/md_win.c
================================================
/*
 * Copyright (C) 2010 Luigi Rizzo, Francesco Magno, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * kernel variables and functions that are not available in Windows.
 */

#include <net/pfil.h> /* provides PFIL_IN and PFIL_OUT */
#include <arpa/inet.h>
#include <netinet/in.h>			/* in_addr */
#include <ndis.h>
#include <sys/mbuf.h>
#include <passthru.h>

/*
 * Credentials check, Windows version.
 * This is a stub: none of the arguments is examined and the result
 * is always 0.
 */
int
cred_check(void *_insn,  int proto, struct ifnet *oif,
    struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
    u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp,
    struct sk_buff *skb)
{
	int rv = 0;

	return rv;
}

/*
 * as good as anywhere, place here the missing calls
 */

void *
my_alloc(int size)
{
	void *_ret = ExAllocatePoolWithTag(NonPagedPool, size, 'wfpi');
	if (_ret)
		memset(_ret, 0, size);
	return _ret;
}

/*
 * Minimal panic() replacement: print the format string itself (the
 * variadic arguments are not consumed, so conversions in 'fmt' are
 * not expanded) and then spin forever. The function never returns.
 */
void
panic(const char *fmt, ...)
{
	printf("%s", fmt);
	for (;;);	/* busy loop, no exit */
}

int securelevel = 0;

/*
 * Find first set: return the 1-based position of the least
 * significant bit set in 'bits', or 0 when no bit is set.
 */
int ffs(int bits)
{
	int pos = 1;

	if (!bits)
		return (0);
	/* shift right until the lowest bit is set, counting the steps */
	while (!(bits & 1)) {
		bits >>= 1;
		pos++;
	}
	return (pos);
}

/*
 * Fill *tv with the current time, combining KeQuerySystemTime() with
 * the performance counter: while the system time has not changed since
 * the last recorded sample, the elapsed performance-counter ticks are
 * converted to 100-nsec units and added to it.
 *
 * NOTE(review): the static state below is read and written without
 * any locking visible in this function - confirm the callers
 * serialize access.
 * NOTE(review): the result is derived directly from KeQuerySystemTime,
 * so the epoch is presumably the Windows one, not the Unix one.
 */
void
do_gettimeofday(struct timeval *tv)
{
	static LARGE_INTEGER prevtime; //system time in 100-nsec resolution
	static LARGE_INTEGER prevcount; //RTC counter value
	static LARGE_INTEGER freq; //frequency

	LARGE_INTEGER currtime;
	LARGE_INTEGER currcount;
	if (prevtime.QuadPart == 0) { //first time we ask for system time
		KeQuerySystemTime(&prevtime);
		prevcount = KeQueryPerformanceCounter(&freq);
		currtime.QuadPart = prevtime.QuadPart;
	} else {
		KeQuerySystemTime(&currtime);
		currcount = KeQueryPerformanceCounter(&freq);
		if (currtime.QuadPart == prevtime.QuadPart) {
			//time has NOT changed, calculate time using ticks and DO NOT update
			LONGLONG difftime = 0; //difference in 100-nsec
			LONGLONG diffcount = 0; //clock count difference
			//printf("time has NOT changed\n");
			diffcount = currcount.QuadPart - prevcount.QuadPart;
			diffcount *= 10000000; //ticks * (100-nsec units per second)
			difftime = diffcount / freq.QuadPart;
			currtime.QuadPart += difftime;
		} else {	
			//time has changed, update and return SystemTime
			//printf("time has changed\n");
			prevtime.QuadPart = currtime.QuadPart;
			prevcount.QuadPart = currcount.QuadPart;
		}
	}
	currtime.QuadPart /= 10; //convert in usec
	tv->tv_sec = currtime.QuadPart / (LONGLONG)1000000;
	tv->tv_usec = currtime.QuadPart % (LONGLONG)1000000;
	//printf("sec %d usec %d\n",tv->tv_sec, tv->tv_usec);
}

/*
 * Return the system time in seconds, truncated to an int.
 *
 * KeQuerySystemTime() reports the time in 100-nanosecond units, so one
 * second corresponds to 10,000,000 units (the same scale that
 * do_gettimeofday() uses when it converts 100-nsec units to usec and
 * then to seconds).
 *
 * NOTE(review): the value is the system time, not the time since
 * boot, and the conversion to int discards the high bits; callers
 * should presumably rely only on differences between two readings.
 */
int time_uptime_w32()
{
	LARGE_INTEGER tm;

	KeQuerySystemTime(&tm);
	/* 100-nsec units -> seconds */
	return (int)(tm.QuadPart / (LONGLONG)10000000);
}


/*
 * Windows version of firewall hook. We receive a partial copy of
 * the packet which points to the original buffers. In output,
 * the refcount has been already incremented.
 * The function reconstructs
 * the whole packet in a contiguous memory area, builds a fake mbuf,
 * calls the firewall, does the eventual cleaning and returns
 * to MiniportSend or ProtocolReceive, which will silently return
 * (dropping packet) or continue its execution (allowing packet).
 * The memory area contains:
 * - the fake mbuf, filled with data needed by ipfw, and information
 *   for reinjection
 * - the packet data
 */
void hexdump(PUCHAR,int, const char *);
static char _if_in[] = "incoming";
static char _if_out[] = "outgoing";

/*
 * Firewall hook for packets that arrive as an NDIS_PACKET.
 *
 * pNdisPacket	packet to be examined
 * direction	INCOMING or OUTGOING
 * Context	adapter context (PADAPT), saved in the mbuf for reinjection
 *
 * Returns PASS if the packet must continue along its path (also used
 * for non-IP frames and for frames shorter than a MAC header),
 * DUMMYNET if dummynet kept the packet and will reinject it later,
 * DROP if the packet was dropped or could not be processed.
 */
int
ipfw2_qhandler_w32(PNDIS_PACKET pNdisPacket, int direction,
	NDIS_HANDLE Context)
{
	unsigned int		BufferCount = 0;
	unsigned			TotalPacketLength = 0;
	PNDIS_BUFFER		pCurrentBuffer = NULL;
	PNDIS_BUFFER		pNextBuffer = NULL;
	struct mbuf*		m;
	unsigned char*		payload = NULL;
	unsigned int		ofs, l;
	unsigned short		EtherType = 0;
	unsigned int		i = 0;
	int					ret = 0;
	PNDIS_BUFFER		pNdisBuffer, old_head, old_tail;
	NDIS_HANDLE			PacketPool;
	PADAPT				pAdapt;
	NDIS_STATUS			Status;

	/* In NDIS, packets are a chain of NDIS_BUFFER. We query
	 * the packet to get a pointer of chain's head, the length
	 * of the chain, and the length of the packet itself.
	 * Then allocate a buffer for the mbuf and the payload.
	 */
	NdisQueryPacket(pNdisPacket, NULL, &BufferCount,
		&pCurrentBuffer, &TotalPacketLength);

	/*
	 * A frame without a complete 14-byte MAC header cannot be
	 * identified as IP, so treat it like any other non-IP frame.
	 * This also keeps the "TotalPacketLength-14" below from wrapping
	 * and the EtherType read from running past the copied data.
	 */
	if (TotalPacketLength < 14)
		return PASS;

	m = malloc(sizeof(struct mbuf) + TotalPacketLength, 0, 0 );
	if (m == NULL) //resource shortage, drop the packet
		goto drop_pkt;

	/* set mbuf fields to point past the MAC header.
	 * Also set additional W32 info
	 */
	payload = (unsigned char*)(m + 1);
	m->m_len = m->m_pkthdr.len = TotalPacketLength-14;
	m->m_pkthdr.rcvif = (void *)((direction==INCOMING) ? _if_in : NULL);
	m->m_data = payload + 14; /* past the MAC header */
	m->direction = direction;
	m->context = Context;
	m->pkt = pNdisPacket;

	/* m_skb != NULL is used in the ip_output routine to check
	 * for packets that come from the stack and differentiate
	 * from those internally generated by ipfw.
	 * The pointer is not used, just needs to be non-null.
	 */
	m->m_skb = (void *)pNdisPacket;
	/*
	 * Now copy the data from the Windows buffers to the mbuf.
	 */
	for (i=0, ofs = 0; i < BufferCount; i++) {
		unsigned char* src = NULL;

		l = 0;
		NdisQueryBufferSafe(pCurrentBuffer, &src, &l,
			NormalPagePriority);
		/*
		 * The "Safe" query can fail and leave src NULL; also
		 * never copy more than the space we allocated.
		 */
		if ((src == NULL && l != 0) || l > TotalPacketLength - ofs) {
			free(m, 0);
			goto drop_pkt;
		}
		if (l != 0)
			bcopy(src, payload + ofs, l);
		ofs += l;
		NdisGetNextBuffer(pCurrentBuffer, &pNextBuffer);
		pCurrentBuffer = pNextBuffer;
	}
	/*
	 * Identify EtherType. If the packet is not IP, simply allow
	 * and don't bother the firewall. XXX should be done before.
	 */
	EtherType = *(unsigned short*)(payload + 12);
	EtherType = RtlUshortByteSwap(EtherType);
	if (EtherType != 0x0800) {
		//DbgPrint("ethertype = %X, skipping ipfw\n",EtherType);
		free(m, 0);
		return PASS;
	}

	/*
	 * Now build a buffer descriptor to replace the original chain.
	 */
	pAdapt = Context;
	PacketPool = direction == OUTGOING ?
		pAdapt->SendPacketPoolHandle : pAdapt->RecvPacketPoolHandle;
	NdisAllocateBuffer(&Status, &pNdisBuffer,
		PacketPool, payload, m->m_pkthdr.len+14);
	if (Status != NDIS_STATUS_SUCCESS) {
		/* the mbuf is still ours here, do not leak it */
		free(m, 0);
		goto drop_pkt;
	}
	/*
	 * Save the old buffer pointers, and put the new one
	 * into the chain.
	 */
	pNdisBuffer->Next = NULL;
	old_head = NDIS_PACKET_FIRST_NDIS_BUFFER(pNdisPacket);
	old_tail = NDIS_PACKET_LAST_NDIS_BUFFER(pNdisPacket);
	NdisReinitializePacket(pNdisPacket);
	NdisChainBufferAtFront(pNdisPacket, pNdisBuffer);
#if 0
	if (direction == INCOMING) {
		DBGPRINT(("incoming: proto %u (%s), src %08X, dst %08X, sport %u, dport %u, len %u\n", *(payload+14+9), texify_proto(*(payload+14+9)), *(unsigned int*)(payload+14+12), *(unsigned int*)(payload+14+16), ntohs((*((unsigned short int*)(payload+14+20)))), ntohs((*((unsigned short int*)(payload+14+22)))), TotalPacketLength));
	} else {
		DBGPRINT(("outgoing: proto %u (%s), src %08X, dst %08X, sport %u, dport %u, len %u\n", *(payload+14+9), texify_proto(*(payload+14+9)), *(unsigned int*)(payload+14+12), *(unsigned int*)(payload+14+16), ntohs((*((unsigned short int*)(payload+14+20)))), ntohs((*((unsigned short int*)(payload+14+22)))), TotalPacketLength));
	}
#endif
	if (direction == INCOMING)
		ret = ipfw_check_hook(NULL, &m, NULL, PFIL_IN, NULL);
	else
		ret = ipfw_check_hook(NULL, &m, (struct ifnet*)_if_out, PFIL_OUT, NULL);

	if (m != NULL) {
		/* Accept. Restore the old buffer chain, free
		 * the mbuf and return PASS.
		 */
		//DBGPRINT(("accepted\n"));
		NdisReinitializePacket(pNdisPacket);
		NDIS_PACKET_FIRST_NDIS_BUFFER(pNdisPacket) = old_head;
		NDIS_PACKET_LAST_NDIS_BUFFER(pNdisPacket) = old_tail;
		NdisFreeBuffer(pNdisBuffer);
		m_freem(m);
		return PASS;
	} else if (ret == 0) {
		/* dummynet has kept the packet, will reinject later. */
		//DBGPRINT(("kept by dummynet\n"));
		return DUMMYNET;
	} else {
		/*
		 * Packet dropped by ipfw or dummynet. Nothing to do as
		 * FREE_PKT already freed the fake mbuf
		 */
		//DBGPRINT(("dropped by dummynet, ret = %i\n", ret));
		return DROP;
	}
drop_pkt:
	/* for some reason we cannot proceed. Free any resources
	 * including those received from above, and return
	 * faking success. XXX this must be fixed later.
	 * The mbuf, if any, has already been released by the caller
	 * of this label.
	 */
	NdisFreePacket(pNdisPacket);
	return DROP;
}

/*
 * Windows reinjection function.
 * The packet is already available as m->pkt, so we only
 * need to send it to the right place.
 * Normally a ndis intermediate driver allocates
 * a fresh descriptor, while the actual data's ownership is
 * retained by the protocol, or the miniport below.
 * Since an intermediate driver behaves as a miniport driver
 * at the upper edge (towards the protocol), and as a protocol
 * driver at the lower edge (towards the NIC), when we handle a
 * packet we have a reserved area in both directions (we can use
 * only one for each direction at our own discretion).
 * Normally this area is used to save a pointer to the original
 * packet, so when the driver is done with it, the original descriptor
 * can be retrieved, and the resources freed (packet descriptor,
 * buffer descriptor(s) and the actual data). In our driver this
 * area is used to mark the reinjected packets as 'orphan', because
 * the original descriptor is gone long ago. This way we can handle
 * correctly the resource freeing when the callback function
 * is called by NDIS.
 */

/*
 * Reinject, or discard, a packet held in the fake mbuf 'm'.
 *
 * num	a negative value means "discard": resources are released and
 *	nothing is sent. Other values are not examined further.
 * m	mbuf built by one of the qhandler functions; m->direction,
 *	m->context and m->pkt tell where and how to reinject.
 *
 * Outgoing packets are passed to NdisSend() and are released by
 * PtSendComplete(). Incoming packets are indicated upwards and then
 * released right here, falling through to the drop_pkt label.
 */
void 
netisr_dispatch(int num, struct mbuf *m)
{
	unsigned char*		payload = (unsigned char*)(m+1);
	PADAPT				pAdapt = m->context;
	NDIS_STATUS			Status;
	PNDIS_PACKET		pPacket = m->pkt;
	PNDIS_BUFFER		pNdisBuffer;
	NDIS_HANDLE			PacketPool;

	if (num < 0)
		goto drop_pkt;

	//debug print
#if 0
	DbgPrint("reinject %s\n", m->direction == OUTGOING ?
		"outgoing" : "incoming");
#endif
	/* check the adapter state under its lock before reinjecting */
	NdisAcquireSpinLock(&pAdapt->Lock);
	if (m->direction == OUTGOING) {
		//we must first check if the adapter is going down,
		// in this case abort the reinjection
		if (pAdapt->PTDeviceState > NdisDeviceStateD0) {
			pAdapt->OutstandingSends--;
			// XXX should we notify up ?
			NdisReleaseSpinLock(&pAdapt->Lock);
			goto drop_pkt;
		}
	} else {
		/* if the upper miniport edge is not initialized or
		 * the miniport edge is in low power state, abort
		 * XXX we should notify the error.
		 */
		if (!pAdapt->MiniportHandle ||
		    pAdapt->MPDeviceState > NdisDeviceStateD0) {
			NdisReleaseSpinLock(&pAdapt->Lock);
			goto drop_pkt;
		}
	}
	NdisReleaseSpinLock(&pAdapt->Lock);

	if (m->direction == OUTGOING) {
		PSEND_RSVD	SendRsvd;
		/* use the 8-byte protocol reserved area: the first
		 * field is used to mark the packet as 'orphan', the
		 * second stores the pointer to the mbuf, so in
		 * the SendComplete handler we know that this is a
		 * reinjected packet and can free it correctly.
		 */
		SendRsvd = (PSEND_RSVD)(pPacket->ProtocolReserved);
		SendRsvd->OriginalPkt = NULL;
		SendRsvd->pMbuf = m;
		//do the actual send
		NdisSend(&Status, pAdapt->BindingHandle, pPacket);
		if (Status != NDIS_STATUS_PENDING) {
			/* done, call the callback now */
			PtSendComplete(m->context, m->pkt, Status);
		}
		return; /* unconditional return here. */
	} else {
		/* There's no need to check the 8-byte miniport
		 * reserved area since the path going up will always be
		 * synchronous, and all the cleanup will be done inline.
		 * If the reinjected packet comes from a PtReceivePacket,
		 * there will be no callback.
		 * Otherwise PtReceiveComplete will be called but will just
		 * return since all the cleaning is already done */
		// do the actual receive.
		ULONG Proc = KeGetCurrentProcessorNumber();
		pAdapt->ReceivedIndicationFlags[Proc] = TRUE;
		/* header is the first 14 bytes, the rest is the lookahead */
		NdisMEthIndicateReceive(pAdapt->MiniportHandle, NULL, payload, 14, payload+14, m->m_len, m->m_len);
		NdisMEthIndicateReceiveComplete(pAdapt->MiniportHandle);
		pAdapt->ReceivedIndicationFlags[Proc] = FALSE;
	}
	/* the incoming path falls through: cleanup is done inline */
drop_pkt:
	/* NDIS_PACKET exists and must be freed only if
	 * the packet comes from a PtReceivePacket, otherwise
	 * m->pkt will be null.
	 */
	if (m->pkt != NULL)
	{
		NdisUnchainBufferAtFront(m->pkt, &pNdisBuffer);
		NdisFreeBuffer(pNdisBuffer);
		NdisFreePacket(m->pkt);
	}
	m_freem(m);
}

void win_freem(void *);	/* wrapper for m_freem() for protocol.c */

/* Release with m_freem() the mbuf passed in as an untyped pointer. */
void
win_freem(void *_m)
{
	m_freem((struct mbuf *)_m);
}

/*
 * strlcpy() is not provided by the environment this file is built
 * for; the logic follows /usr/src/lib/libc/string/strlcpy.c
 *
 * Copy src to dst, writing at most siz bytes including the
 * terminating NUL (nothing is written when siz is 0).
 * Returns the length of src, so a result >= siz means that the copy
 * was truncated.
 */
size_t
strlcpy(char *dst, const char *src, size_t siz)
{
        const char *from = src;
        size_t room;

        /* copy up to siz-1 bytes; stop right after copying the NUL */
        for (room = siz; room > 1; room--) {
                if ((*dst++ = *from++) == 0)
                        return(from - src - 1);
        }

        /* out of room: terminate dst (if it has any space at all) */
        if (siz != 0)
                *dst = '\0';

        /* walk the rest of src, the result is its full length */
        while (*from++)
                ;

        return(from - src - 1);    /* count does not include NUL */
}

/*
 * Release the resources of a reinjected packet: unchain and free the
 * first buffer descriptor of 'Packet', free the mbuf 'm', free the
 * packet descriptor and finally decrement the pending-send counter of
 * 'pAdapt' through ADAPT_DECR_PENDING_SENDS().
 */
void CleanupReinjected(PNDIS_PACKET Packet, struct mbuf* m, PADAPT pAdapt)
{
	PNDIS_BUFFER pNdisBuffer;

	/*
	 * NOTE(review): the buffer pointer obtained by this query is
	 * overwritten by the unchain call right below, so the query
	 * result looks unused.
	 */
	NdisQueryPacket(Packet, NULL, NULL, &pNdisBuffer, NULL);
	NdisUnchainBufferAtFront(Packet, &pNdisBuffer);
	NdisFreeBuffer(pNdisBuffer);
	win_freem(m);
	NdisFreePacket(Packet);
	ADAPT_DECR_PENDING_SENDS(pAdapt);
}

/*
 * Firewall hook for packets signalled with the old-style receive
 * call, i.e. as a MAC header buffer plus a lookahead buffer and
 * without an NDIS_PACKET.
 *
 * direction			INCOMING or OUTGOING, saved in the mbuf
 * ProtocolBindingContext	adapter context, saved in the mbuf
 * HeaderBuffer/Size		MAC header
 * LookAheadBuffer/Size		rest of the packet
 * PacketSize			not used here
 *
 * Returns PASS if the packet must continue (also for non-IP frames
 * and for frames shorter than a MAC header), DUMMYNET if dummynet
 * kept the packet, DROP if it was dropped or memory was short.
 */
int
ipfw2_qhandler_w32_oldstyle(int direction,
	NDIS_HANDLE         ProtocolBindingContext,
    unsigned char*      HeaderBuffer,
    unsigned int        HeaderBufferSize,
    unsigned char*      LookAheadBuffer,
    unsigned int        LookAheadBufferSize,
    unsigned int        PacketSize)
{
	struct mbuf* m;
	unsigned char*		payload = NULL;
	unsigned short		EtherType = 0;
	int					ret = 0;

	/*
	 * Without a complete 14-byte MAC header the frame cannot be
	 * identified as IP: let it through like any other non-IP frame.
	 * This also keeps the "-14" below from wrapping and the
	 * EtherType read from running past the copied data.
	 */
	if (HeaderBufferSize + LookAheadBufferSize < 14)
		return PASS;

	/* We are in a special case when NIC signals an incoming
	 * packet using old style calls. This is done passing
	 * a pointer to the MAC header and a pointer to the
	 * rest of the packet.
	 * We simply allocate space for the mbuf and the
	 * subsequent payload section.
	 */
	m = malloc(sizeof(struct mbuf) + HeaderBufferSize + LookAheadBufferSize, 0, 0 );
	if (m == NULL) //resource shortage, drop the packet
		return DROP;

	/* set mbuf fields to point past the MAC header.
	 * Also set additional W32 info.
	 * m->pkt here is set to null because the notification
	 * from the NIC has come with a header+lookahead buffer,
	 * no NDIS_PACKET has been provided.
	 * rcvif is set in the same way as in ipfw2_qhandler_w32(), so
	 * that both receive paths present the same interface to ipfw.
	 */
	payload = (unsigned char*)(m + 1);
	m->m_len = m->m_pkthdr.len = HeaderBufferSize+LookAheadBufferSize-14;
	m->m_pkthdr.rcvif = (void *)((direction==INCOMING) ? _if_in : NULL);
	m->m_data = payload + 14; /* past the MAC header */
	m->direction = direction;
	m->context = ProtocolBindingContext;
	m->pkt = NULL;

	/*
	 * Now copy the data from the Windows buffers to the mbuf.
	 */
	bcopy(HeaderBuffer, payload, HeaderBufferSize);
	bcopy(LookAheadBuffer, payload+HeaderBufferSize, LookAheadBufferSize);
	//hexdump(payload,HeaderBufferSize+LookAheadBufferSize,"qhandler");
	/*
	 * Identify EtherType. If the packet is not IP, simply allow
	 * and don't bother the firewall. XXX should be done before.
	 */
	EtherType = *(unsigned short*)(payload + 12);
	EtherType = RtlUshortByteSwap(EtherType);
	if (EtherType != 0x0800) {
		//DbgPrint("ethertype = %X, skipping ipfw\n",EtherType);
		free(m, 0);
		return PASS;
	}

	//DbgPrint("incoming_raw: proto %u (%s), src %08X, dst %08X, sport %u, dport %u, len %u\n", *(payload+14+9), texify_proto(*(payload+14+9)), *(unsigned int*)(payload+14+12), *(unsigned int*)(payload+14+16), ntohs((*((unsigned short int*)(payload+14+20)))), ntohs((*((unsigned short int*)(payload+14+22)))), HeaderBufferSize+LookAheadBufferSize);

	/* Query the firewall */
	ret = ipfw_check_hook(NULL, &m, NULL, PFIL_IN, NULL);

	if (m != NULL) {
		/* Accept. Free the mbuf and return PASS. */
		//DbgPrint("accepted\n");
		m_freem(m);
		return PASS;
	} else if (ret == 0) {
		/* dummynet has kept the packet, will reinject later. */
		//DbgPrint("kept by dummynet\n");
		return DUMMYNET;
	} else {
		/*
		 * Packet dropped by ipfw or dummynet. Nothing to do as
		 * FREE_PKT already freed the fake mbuf
		 */
		//DbgPrint("dropped by dummynet, ret = %i\n", ret);
		return DROP;
	}
}

/* forward declaration because those functions are used only here,
 * no point to make them visible in passthru/protocol/miniport */
int do_ipfw_set_ctl(struct sock *sk, int cmd,
	void __user *user, unsigned int len);
int do_ipfw_get_ctl(struct sock *sk, int cmd,
	void __user *user, int *len);

NTSTATUS
DevIoControl(
    IN PDEVICE_OBJECT    pDeviceObject,
    IN PIRP              pIrp
    )
/*++

Routine Description:

    This is the dispatch routine for handling device ioctl requests.
    It emulates the set/getsockopt interface of ipfw: the request
    buffer starts with a struct sockopt, followed by the option data.

    The IRP is completed on every path, including the error ones.

Arguments:

    pDeviceObject - Pointer to the device object (not used).

    pIrp - Pointer to the request packet.

Return Value:

    Status is returned: STATUS_NOT_SUPPORTED if the buffer cannot hold
    a struct sockopt or the ioctl code is unknown,
    STATUS_INVALID_PARAMETER if the option length announced in the
    struct sockopt does not fit in the buffer, STATUS_SUCCESS otherwise.

--*/
{
    PIO_STACK_LOCATION  pIrpSp;
    NTSTATUS            NtStatus = STATUS_SUCCESS;
    unsigned long       BytesReturned = 0;
    unsigned long       FunctionCode;
    unsigned long       len;
    unsigned long       InLen;      /* size of the caller's input buffer */
    unsigned long       OutLen;     /* size of the caller's output buffer */
    unsigned long       MaxOptLen;  /* room for data after the sockopt */
    struct sockopt		*sopt;
    int					ret = 0;

    UNREFERENCED_PARAMETER(pDeviceObject);

    pIrpSp = IoGetCurrentIrpStackLocation(pIrp);

    /*
     * Using METHOD_BUFFERED as communication method, the userland
     * side calls DeviceIoControl passing an input buffer and an output
     * and their respective length (ipfw uses the same length for both).
     * The system creates a single I/O buffer, with len=max(inlen,outlen).
     * In the kernel we can read information from this buffer (which is
     * directly accessible), overwrite it with our results, and set
     * IoStatus.Information with the number of bytes that the system must
     * copy back to userland.
     * In our sockopt emulation, the initial part of the buffer contains
     * a struct sockopt, followed by the data area.
     */

    InLen = pIrpSp->Parameters.DeviceIoControl.InputBufferLength;
    OutLen = pIrpSp->Parameters.DeviceIoControl.OutputBufferLength;
    if (InLen < sizeof(struct sockopt))
    {
	/* still falls through to IoCompleteRequest() below */
	NtStatus = STATUS_NOT_SUPPORTED; // XXX find better value
	goto done;
    }
    sopt = pIrp->AssociatedIrp.SystemBuffer;

    FunctionCode = pIrpSp->Parameters.DeviceIoControl.IoControlCode;

    /*
     * sopt_valsize comes from userland: never let the handlers touch
     * more than the data area that really follows the struct sockopt.
     */
    len = sopt->sopt_valsize;
    MaxOptLen = InLen - sizeof(struct sockopt);

    /*
     * NOTE(review): the values returned by do_ipfw_set_ctl() and
     * do_ipfw_get_ctl() are stored in 'ret' but not reported to the
     * caller, as in the original code.
     */
    switch (FunctionCode)
    {
		case IP_FW_SETSOCKOPT:
			if (len > MaxOptLen) {
				NtStatus = STATUS_INVALID_PARAMETER;
				break;
			}
			ret = do_ipfw_set_ctl(NULL, sopt->sopt_name, sopt+1, len);
			break;

		case IP_FW_GETSOCKOPT:
			if (len > MaxOptLen) {
				NtStatus = STATUS_INVALID_PARAMETER;
				break;
			}
			ret = do_ipfw_get_ctl(NULL, sopt->sopt_name, sopt+1, &len);
			sopt->sopt_valsize = len;
			//sanity check on len
			if (len <= MaxOptLen)
				BytesReturned = len + sizeof(struct sockopt);
			else
				BytesReturned = InLen;
			/* never report more than the caller's output buffer holds */
			if (BytesReturned > OutLen)
				BytesReturned = OutLen;
			break;

		default:
				NtStatus = STATUS_NOT_SUPPORTED;
				break;
    }

done:
    pIrp->IoStatus.Information = BytesReturned;
    pIrp->IoStatus.Status = NtStatus;
    IoCompleteRequest(pIrp, IO_NO_INCREMENT);

    return NtStatus;
} 

void dummynet(void * unused);
void ipfw_tick(void * vnetx);

/*
 * Callback with the signature of a DPC routine: all four arguments
 * are ignored and dummynet() is called with a NULL argument.
 */
VOID dummynet_dpc(
    __in struct _KDPC  *Dpc,
    __in_opt PVOID  DeferredContext,
    __in_opt PVOID  SystemArgument1,
    __in_opt PVOID  SystemArgument2
    )
{
	UNREFERENCED_PARAMETER(Dpc);
	UNREFERENCED_PARAMETER(DeferredContext);
	UNREFERENCED_PARAMETER(SystemArgument1);
	UNREFERENCED_PARAMETER(SystemArgument2);

	dummynet(NULL);
}

/*
 * Callback with the signature of a DPC routine: DeferredContext is
 * handed to ipfw_tick(), the other arguments are ignored.
 */
VOID ipfw_dpc(
    __in struct _KDPC  *Dpc,
    __in_opt PVOID  DeferredContext,
    __in_opt PVOID  SystemArgument1,
    __in_opt PVOID  SystemArgument2
    )
{
	UNREFERENCED_PARAMETER(Dpc);
	UNREFERENCED_PARAMETER(SystemArgument1);
	UNREFERENCED_PARAMETER(SystemArgument2);

	ipfw_tick(DeferredContext);
}


================================================
FILE: kipfw/missing.h
================================================
/*
 * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id: missing.h 12256 2013-04-26 21:12:44Z luigi $
 *
 * Header for kernel variables and functions that are not available in
 * userland.
 */

#ifndef _MISSING_H_
#define _MISSING_H_

#include <sys/cdefs.h>
#ifdef linux
#include <linux/sysctl.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#endif /* linux */

/* portability features, to be set before the rest: */
#define HAVE_NET_IPLEN		/* iplen/ipoff in net format */
#define WITHOUT_BPF		/* do not use bpf logging */

#ifdef _WIN32

#ifndef DEFINE_SPINLOCK
#define DEFINE_SPINLOCK(x)	FAST_MUTEX x
#endif
/* spinlock --> Guarded Mutex KGUARDED_MUTEX */
/* http://www.reactos.org/wiki/index.php/Guarded_Mutex */
#define spin_lock_init(_l)
#define spin_lock_bh(_l)
#define spin_unlock_bh(_l)

#include <sys/socket.h>		/* bsd-compat.c */
#include <netinet/in.h>		/* bsd-compat.c */
#include <netinet/ip.h>		/* local version */
#define INADDR_TO_IFP(a, b) b = NULL

#else	/* __linux__ */

#define MALLOC_DECLARE(x)	/* nothing */
#include <linux/time.h>		/* do_gettimeofday */
#include <netinet/ip.h>		/* local version */
struct inpcb;

/*
 * Kernel locking support.
 * FreeBSD uses mtx in dummynet.c and struct rwlock ip_fw2.c
 *
 * In linux we use spinlock_bh to implement both.
 * For 'struct rwlock' we need an #ifdef to change it to spinlock_t
 */

#ifndef DEFINE_SPINLOCK	/* this is for linux 2.4 */
#define DEFINE_SPINLOCK(x)   spinlock_t x = SPIN_LOCK_UNLOCKED
#endif


#define rw_assert(a, b)
#define rw_destroy(_l)
#define rw_init(_l, msg)	spin_lock_init(_l)
#define rw_rlock(_l)		spin_lock_bh(_l)
#define rw_runlock(_l)		spin_unlock_bh(_l)
#define rw_wlock(_l)		spin_lock_bh(_l)
#define rw_wunlock(_l)		spin_unlock_bh(_l)
#define rw_init_flags(_l, s, v)

#define mtx_assert(a, b)
#define	mtx_destroy(m)
#define mtx_init(m, a,b,c) 	spin_lock_init(m)
#define mtx_lock(_l)		spin_lock_bh(_l)
#define mtx_unlock(_l)		spin_unlock_bh(_l)

#endif	/* __linux__ */
/* end of locking support */

/*
 * Reference to an ipfw rule that can be carried outside critical sections.
 * A rule is identified by rulenum:rule_id which is ordered.
 * In version chain_id the rule can be found in slot 'slot', so
 * we don't need a lookup if chain_id == chain->id.
 *
 * On exit from the firewall this structure refers to the rule after
 * the matching one (slot points to the new rule; rulenum:rule_id-1
 * is the matching rule), and additional info (e.g. info often contains
 * the insn argument or tablearg in the low 16 bits, in host format).
 * On entry, the structure is valid if slot>0, and refers to the starting
 * rules. 'info' contains the reason for reinject, e.g. divert port,
 * divert direction, and so on.
 */
struct ipfw_rule_ref {
	uint32_t	slot;		/* slot for matching rule	*/
	uint32_t	rulenum;	/* matching rule number		*/
	uint32_t	rule_id;	/* matching rule id		*/
	uint32_t	chain_id;	/* ruleset id			*/
	uint32_t	info;		/* flags + 16-bit argument, see the enum below */
};

/* layout of the 'info' field of struct ipfw_rule_ref */
enum {
	IPFW_INFO_MASK	= 0x0000ffff,	/* low 16 bits: the argument */
	IPFW_INFO_OUT	= 0x00000000,	/* outgoing, just for convenience */
	IPFW_INFO_IN	= 0x80000000,	/* incoming, overloads dir */
	IPFW_ONEPASS	= 0x40000000,	/* One-pass, do not reinject */
	IPFW_IS_MASK	= 0x30000000,	/* which source ? */
	IPFW_IS_DIVERT	= 0x20000000,	/* one of the IPFW_IS_MASK values */
	IPFW_IS_DUMMYNET =0x10000000,	/* one of the IPFW_IS_MASK values */
	IPFW_IS_PIPE	= 0x08000000,	/* pipe=1, queue = 0 */
};

/* in netinet/in.h: true if the address x (a struct in_addr) is INADDR_ANY */
#define        in_nullhost(x)  ((x).s_addr == INADDR_ANY)

/* bzero not present on linux, but this should go in glue.h */
#define bzero(s, n) memset(s, 0, n)
/* BSD bcmp() expressed with memcmp(): the result is 0 if the areas match */
#define bcmp(p1, p2, n) memcmp(p1, p2, n)

/*
 * Ethernet definitions: the IP ethertype, the frame header layout
 * and the derived header length.
 */
#define	ETHERTYPE_IP		0x0800	/* IP protocol */
/*
 * The local definition of ETHER_ADDR_LEN below is disabled, so the
 * value used by struct ether_header and ETHER_HDR_LEN must be provided
 * by a header included before this point.
 */
//#define	ETHER_ADDR_LEN		6	/* length of an Ethernet address */
/* Ethernet header: destination address, source address, then type. */
struct ether_header {
        u_char  ether_dhost[ETHER_ADDR_LEN];
        u_char  ether_shost[ETHER_ADDR_LEN];
        u_short ether_type;
};

#define ETHER_TYPE_LEN          2       /* length of the Ethernet type field */
/* header length: two addresses plus the type field */
#define ETHER_HDR_LEN           (ETHER_ADDR_LEN*2+ETHER_TYPE_LEN)

/*
 * Historically, BSD keeps ip_len and ip_off in host format
 * when doing layer 3 processing, and this often requires
 * to translate the format back and forth.
 * To make the process explicit, we define a couple of macros
 * that also take into account the fact that at some point
 * we may want to keep those fields always in net format.
 */

#if (BYTE_ORDER == BIG_ENDIAN) || defined(HAVE_NET_IPLEN)
#define SET_NET_IPLEN(p)        do {} while (0)
#define SET_HOST_IPLEN(p)       do {} while (0)
#else /* never on linux */
#define SET_NET_IPLEN(p)        do {            \
        struct ip *h_ip = (p);                  \
        h_ip->ip_len = htons(h_ip->ip_len);     \
        h_ip->ip_off = htons(h_ip->ip_off);     \
        } while (0)

#define SET_HOST_IPLEN(p)       do {            \
        struct ip *h_ip = (p);                  \
        h_ip->ip_len = ntohs(h_ip->ip_len);     \
        h_ip->ip_off = ntohs(h_ip->ip_off);     \
        } while (0)
#endif /* !HAVE_NET_IPLEN */

/* ip_dummynet.c */
#define __FreeBSD_version 500035

#ifdef __linux__
/* module registration hook; struct moduledata is only used as a pointer */
struct moduledata;
int my_mod_register(const char *name,
	int order, struct moduledata *mod, void *init, void *uninit);

/* glue macros needed by ip_dummynet */

/* empty stand-in for the BSD malloc type descriptor */
struct malloc_type {
};

/*
 * Define a one-element malloc_type array called 'tag' and a pointer
 * md_dummy_<tag> that refers to it; both description strings are dropped.
 */
#define MALLOC_DEFINE(tag, sdesc, ldesc)			\
	struct malloc_type tag[1]; void *md_dummy_ ## tag = tag

/* compile-time assertions expand to nothing */
#define CTASSERT(expr)

/*
 * log(): the level argument is evaluated but otherwise unused;
 * the message is always handed to printk() with KERN_ERR.
 */
#define	LOG_ERR		0x100
#define	LOG_INFO	0x200
#define log(lvl, fmt, arg...)  do {				\
	int lvl_unused_ = lvl;					\
	(void)lvl_unused_;					\
	printk(KERN_ERR fmt, ##arg);				\
} while (0)

/*
 * gettimeofday would normally come from sys/time.h, but that
 * declaration is not visible when _KERNEL is defined.
 */
int gettimeofday(struct timeval *, struct timezone *);

#else  /* !__linux__, i.e. _WIN32 */
/* malloc types are not used on Windows */
#define MALLOC_DEFINE(a,b,c)
#endif /* __linux__ / _WIN32 */

extern int	hz;
extern long	tick;		/* exists in 2.4 but not in 2.6 */
extern int	bootverbose;
extern struct timeval boottime;

/*
 * time_uptime is a FreeBSD variable that is incremented once per second.
 * Here it is mapped onto whatever the platform offers.
 * NOTE(review): get_seconds() and CURRENT_TIME look like wall-clock
 * sources rather than uptime counters -- confirm this is acceptable.
 */
#ifndef __linux__	/* WIN32 */
#define time_uptime	time_uptime_w32()
#else			/* linux */
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,37) /* revise boundaries */
#define time_uptime	get_seconds()
#else			/* OpenWRT */
#define time_uptime	CURRENT_TIME
#endif
#endif

extern int	max_linkhdr;
extern int	ip_defttl;
extern u_long	in_ifaddrhmask;                         /* mask for hash table */
extern struct in_ifaddrhashhead *in_ifaddrhashtbl;    /* inet addr hash table  */

/*-------------------------------------------------*/

/* define, includes and functions missing in linux */
/* include and define */
#include <arpa/inet.h>		/* inet_ntoa */

struct mbuf;

/* used by ip_dummynet.c */
void reinject_drop(struct mbuf* m);

#include <linux/errno.h>	/* error define */
#include <linux/if.h>		/* IFNAMESIZ */

void rn_init(int);
/*
 * some network structure can be defined in the bsd way
 * by using the _FAVOR_BSD definition. This is not true
 * for icmp structure.
 * XXX struct icmp contains bsd names in 
 * /usr/include/netinet/ip_icmp.h
 */
#ifdef __linux__
#define icmp_code code
#define icmp_type type

/* linux in6_addr has no member __u6_addr
 * replace the whole structure ?
 */
#define __u6_addr       in6_u
#define __u6_addr32     u6_addr32
#endif /* __linux__ */

/*
 * SCTP common header. Linux has its own definition in linux/sctp.h,
 * but without the BSD field names used here.
 */
struct sctphdr {
	uint16_t	src_port;	/* source port */
	uint16_t	dest_port;	/* destination port */
	uint32_t	v_tag;		/* verification tag of packet */
	uint32_t	checksum;	/* packet checksum */
	/* chunks follow... */
};

/* BSD constants that have no counterpart in the headers we include */

/* TCP header flags */
#define TH_FIN			0x01
#define TH_SYN			0x02
#define TH_RST			0x04
#define TH_ACK			0x10

/* routing flags */
#define RTF_CLONING		0x100	/* generate new routes on use */

/* IP protocol numbers */
#define IPPROTO_OSPFIGP		89	/* OSPFIGP */
#define IPPROTO_CARP		112	/* CARP */
#ifndef _WIN32
#define IPPROTO_IPV4		IPPROTO_IPIP	/* for compatibility */
#endif

/* CARP */
#define CARP_VERSION		2
#define CARP_ADVERTISEMENT	0x01

/* privileges checked through priv_check() */
#define PRIV_NETINET_IPFW	491	/* Administer IPFW firewall. */
#define PRIV_NETINET_DUMMYNET	494	/* Administer DUMMYNET. */

/* ip_output() flags */
#define IP_FORWARDING		0x1	/* most of ip header exists */

/* netisr queues */
#define NETISR_IP		2	/* same as AF_INET */

extern int securelevel;

/*
 * First byte of a CARP header: two 4-bit fields whose declaration
 * order depends on the byte order, so that both fields keep the
 * same position in the byte.
 */
struct carp_header {
#if BYTE_ORDER == BIG_ENDIAN
	u_int8_t	carp_version:4,
			carp_type:4;
#endif
#if BYTE_ORDER == LITTLE_ENDIAN
	u_int8_t	carp_type:4,
			carp_version:4;
#endif
};

struct pim {
	int dummy;      /* windows compiler does not like empty definition */
};

#ifndef _WIN32
struct route {
	struct  rtentry *ro_rt;
	struct  sockaddr ro_dst;
};
#endif

struct ifaltq {
	void *ifq_head;
};

/*
 * ifnet->if_snd is used in ip_dummynet.c to take the transmission
 * clock.
 */
#if defined( __linux__)
#define	if_xname	name
#define	if_snd		XXX
/* search local the ip addresses, used for the "me" keyword */
#include <linux/inetdevice.h>

/*
 * INADDR_TO_IFP(ip, b): assign to 'b' the result of ip_dev_find() for
 * the address held in 'ip' (any object with an s_addr member).
 * Kernels before 2.6.25 take only the address; later ones also take a
 * network namespace, for which init_net is passed.
 *
 * Both arguments and the whole assignment are parenthesized so that the
 * macro parses as intended with arbitrary argument expressions and
 * inside larger expressions. Each argument is evaluated exactly once.
 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
#define INADDR_TO_IFP(ip, b)	\
	((b) = ip_dev_find((ip).s_addr))
#else
#define INADDR_TO_IFP(ip, b)	\
	((b) = ip_dev_find((struct net *)&init_net, (ip).s_addr))
#endif

#elif defined( _WIN32 )
/* used in ip_dummynet.c */
struct ifnet {
	char    if_xname[IFNAMSIZ];     /* external name (name + unit) */
//        struct ifaltq if_snd;          /* output queue (includes altq) */
};

struct net_device {
	char    if_xname[IFNAMSIZ];     /* external name (name + unit) */
};
#endif

/* involves mbufs */
int in_cksum(struct mbuf *m, int len);
#define divert_cookie(mtag) 0
#define divert_info(mtag) 0
#define pf_find_mtag(a) NULL
#define pf_get_mtag(a) NULL
#ifndef _WIN32
#define AF_LINK AF_ASH	/* ? our sys/socket.h */
#endif

/*
 * m_pullup(m, x): we never actually pull data up.
 * If the mbuf already holds at least 'x' bytes (m_len >= x) the mbuf
 * itself is returned; otherwise it is released with FREE_PKT() and
 * the result is NULL.
 *
 * 'x' is parenthesized so that a length expression containing
 * operators of lower precedence than '>=' is compared as a whole.
 * Note that 'm' is evaluated more than once: do not pass an
 * expression with side effects.
 */
#define m_pullup(m, x)					\
	((m)->m_len >= (x) ? (m) : (FREE_PKT(m), NULL))

struct pf_mtag {
	void            *hdr;           /* saved hdr pos in mbuf, for ECN */
	sa_family_t      af;            /* for ECN */
        u_int32_t        qid;           /* queue id */
};

#if 0 // ndef radix
/* radix stuff in radix.h and radix.c */
struct radix_node {
	caddr_t rn_key;         /* object of search */
	caddr_t rn_mask;        /* netmask, if present */
};
#endif /* !radix */

/* missing kernel functions */
char *inet_ntoa(struct in_addr ina);
int random(void);

/*
 * Return the result of a/b
 *
 * this is used in linux kernel space,
 * since the 64bit division needs to
 * be done using a macro
 */
int64_t
div64(int64_t a, int64_t b);

char *
inet_ntoa_r(struct in_addr ina, char *buf);

/*
 * Removal-safe list traversal, from bsd sys/queue.h: the successor is
 * saved in 'nxt' before the loop body runs, so the body may unlink or
 * free the current element.
 */
#define TAILQ_FOREACH_SAFE(elem, hd, link, nxt)				\
	for ((elem) = TAILQ_FIRST((hd));				\
	    (elem) && ((nxt) = TAILQ_NEXT((elem), link), 1);		\
	    (elem) = (nxt))

#define SLIST_FOREACH_SAFE(elem, hd, link, nxt)				\
	for ((elem) = SLIST_FIRST((hd));				\
	    (elem) && ((nxt) = SLIST_NEXT((elem), link), 1);		\
	    (elem) = (nxt))

/* may already be defined, depending on the linux version */
#ifndef ETHERTYPE_IPV6
#define ETHERTYPE_IPV6          0x86dd          /* IP protocol version 6 */
#endif

/*-------------------------------------------------*/
#define RT_NUMFIBS 1
extern u_int rt_numfibs;

/*
 * RTFREE involves kernel locking functions, so when a previous header
 * has already defined it, the definition is replaced by one that only
 * prints a message and ignores its argument.
 * NOTE(review): the replacement ends with ';', so "RTFREE(x);" expands
 * to two statements and cannot be used as the body of an if/else
 * without braces. It also relies on fprintf/stderr being available --
 * confirm whether this branch is ever compiled.
 */
#ifdef RTFREE
#undef RTFREE
#define RTFREE(a) fprintf(stderr, "RTFREE: commented out locks\n");
#endif

void getmicrouptime(struct timeval *tv);

/* from sys/netinet/ip_output.c */
struct ip_moptions;
struct route;
struct ip;

struct mbuf *ip_reass(struct mbuf *);
u_short in_cksum_hdr(struct ip *);
int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
    struct ip_moptions *imo, struct inpcb *inp);

/* from net/netisr.c */
void netisr_dispatch(int num, struct mbuf *m);

/* definition moved in missing.c */
int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len);

int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen);

/* defined in session.c */
int priv_check(struct thread *td, int priv);

/*
 * Linux already has a struct ucred (linux/socket.h) carrying pid, uid
 * and gid. 'bsd_ucred' is our own credential record, which also has
 * room for the extra identifiers we need.
 */
struct bsd_ucred {
	uid_t		uid;	/* user id */
	gid_t		gid;	/* group id */
	uint32_t	xid;	/* extra id -- presumably a context id, TODO confirm */
	uint32_t	nid;	/* extra id -- presumably a network context id, TODO confirm */
};

int
cred_check(void *insn, int proto, struct ifnet *oif,
    struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
    u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp,
    struct sk_buff *skb);

int securelevel_ge(struct ucred *cr, int level);

struct sysctl_oid;
struct sysctl_req;

#ifdef _WIN32
#define module_param_named(_name, _var, _ty, _perm)
#else /* !_WIN32 */

/* Linux 2.4 is mostly for openwrt */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
#include <linux/bitops.h>	 /* generic_ffs() used in ip_fw2.c */
typedef uint32_t __be32;
typedef uint16_t __be16;
struct sock;
struct net;
struct inet_hashinfo;
struct sock *inet_lookup(
	struct inet_hashinfo *hashinfo,
        const __be32 saddr, const __be16 sport,
        const __be32 daddr, const __be16 dport,
        const int dif);
struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif);
#endif /* Linux < 2.6 */

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) &&	\
	LINUX_VERSION_CODE > KERNEL_VERSION(2,6,16)	/* XXX NOT sure, in 2.6.9 give an error */
#define module_param_named(_name, _var, _ty, _perm)	\
	//module_param(_name, _ty, 0644)
#endif

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
typedef unsigned long uintptr_t;

#ifdef __i386__
/*
 * __fls - local fallback for kernels that lack it (i386 only).
 * Executes the x86 "bsr" (bit scan reverse) instruction on 'word' and
 * returns what it leaves in the output register, i.e. the index of the
 * most significant set bit.
 * NOTE(review): bsr leaves its destination undefined when the source
 * is 0, so callers are presumably expected to pass a non-zero word.
 */
static inline unsigned long __fls(unsigned long word)
{
        /* operand 0 (output, register) and operand 1 (input) are both 'word' */
        asm("bsr %1,%0"
            : "=r" (word)
            : "rm" (word));
        return word;
}
#endif

#endif /* LINUX < 2.6.24 */

#endif /* !_WIN32 so maybe __linux__ */

#if defined (__linux__) && !defined (EMULATE_SYSCTL)
/*
 * Without sysctl emulation, the scalar SYSCTL_* declarations become
 * module parameters; everything else expands to nothing.
 */

/* declarations with no module-parameter equivalent */
#define SYSCTL_DECL(_1)
#define SYSCTL_OID(_1, _2, _3, _4, _5, _6, _7, _8)
#define SYSCTL_NODE(_1, _2, _3, _4, _5, _6)
#define SYSCTL_PROC(_base, _oid, _name, _mode, _var, _val, _desc, _a, _b)
#define TUNABLE_INT(_name, _ptr)

/*
 * Common expansion: export the variable pointed to by 'ptr_' as module
 * parameter 'name_' of type 'type_'. The permission is 0444 when the
 * mode is exactly CTLFLAG_RD, and 0644 in every other case.
 */
#define _SYSCTL_BASE(name_, ptr_, type_, mode_)		\
	module_param_named(name_, *(ptr_), type_,	\
		( (mode_) == CTLFLAG_RD) ? 0444: 0644 )

/* one wrapper per scalar type */
#define SYSCTL_INT(base_, oid_, name_, mode_, ptr_, val_, desc_)	\
	_SYSCTL_BASE(name_, ptr_, int, mode_)

#define SYSCTL_UINT(base_, oid_, name_, mode_, ptr_, val_, desc_)	\
	_SYSCTL_BASE(name_, ptr_, uint, mode_)

#define SYSCTL_LONG(base_, oid_, name_, mode_, ptr_, val_, desc_)	\
	_SYSCTL_BASE(name_, ptr_, long, mode_)

#define SYSCTL_ULONG(base_, oid_, name_, mode_, ptr_, val_, desc_)	\
	_SYSCTL_BASE(name_, ptr_, ulong, mode_)

/* the VNET variants are plain aliases */
#define SYSCTL_VNET_PROC		SYSCTL_PROC
#define SYSCTL_VNET_INT			SYSCTL_INT
#define SYSCTL_VNET_UINT		SYSCTL_UINT

#endif

#define SYSCTL_HANDLER_ARGS 		\
	struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req
int sysctl_handle_int(SYSCTL_HANDLER_ARGS);
int sysctl_handle_long(SYSCTL_HANDLER_ARGS); 


void ether_demux(struct ifnet *ifp, struct mbuf *m);

int ether_output_frame(struct ifnet *ifp, struct mbuf *m);

void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum);

void icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu);

void rtfree(struct rtentry *rt);

u_short in_cksum_skip(struct mbuf *m, int len, int skip);

#ifdef INP_LOCK_ASSERT
#undef INP_LOCK_ASSERT
#define INP_LOCK_ASSERT(a)
#endif

int jailed(struct ucred *cred);

/*
* Return 1 if an internet address is for a ``local'' host
* (one to which we have a connection).  If subnetsarelocal
* is true, this includes other subnets of the local net.
* Otherwise, it includes only the directly-connected (sub)nets.
*/
int in_localaddr(struct in_addr in);

/* the prototype is already in the headers */
//int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); 

int fnmatch(const char *pattern, const char *string, int flags);

int
linux_lookup(const int proto, const __be32 saddr, const __be16 sport,
	const __be32 daddr, const __be16 dport,
	struct sk_buff *skb, int dir, struct bsd_ucred *u);

/* vnet wrappers, in vnet.h and ip_var.h */
//int ipfw_init(void);
//void ipfw_destroy(void);

#define	MTAG_IPFW	1148380143	/* IPFW-tagged cookie */
#define	MTAG_IPFW_RULE	1262273568	/* rule reference */

struct ip_fw_args;
extern int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa);

/*
 * VNET shims: there is a single network stack instance here, so the
 * "current vnet" is NULL, the set/restore/assert macros expand to
 * nothing, and every virtualized name maps to the plain global with
 * the same name.
 */
#define curvnet                 NULL
#define	CURVNET_SET(_v)
#define	CURVNET_RESTORE()
#define VNET_ASSERT(condition)

/* name, declaration and definition of a "virtualized" global */
#define VNET_NAME(n)            n
#define VNET_DECLARE(t, n)      extern t n
#define VNET_DEFINE(t, n)       t n
#define _VNET_PTR(b, n)         &VNET_NAME(n)
/*
 * Virtualized global variable accessor macros.
 * The vnet argument is ignored.
 */
#define VNET_VNET_PTR(vnet, n)          (&(n))
#define VNET_VNET(vnet, n)              (n)

#define VNET_PTR(n)             (&(n))
#define VNET(n)                 (n)

VNET_DECLARE(int, ip_defttl);
/*
 * No trailing ';' in the expansion: V_ip_defttl must be usable as an
 * ordinary expression (function argument, operand, condition), not
 * only as a complete statement.
 */
#define V_ip_defttl    VNET(ip_defttl)

int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp,
	int dir, struct inpcb *inp);

/* hooks for divert */
extern void (*ip_divert_ptr)(struct mbuf *m, int incoming);

extern int (*ip_dn_ctl_ptr)(struct sockopt *);
typedef int ip_fw_ctl_t(struct sockopt *);
extern ip_fw_ctl_t *ip_fw_ctl_ptr;

/* netgraph prototypes */
typedef int ng_ipfw_input_t(struct mbuf **, int, struct ip_fw_args *, int);
extern  ng_ipfw_input_t *ng_ipfw_input_p;

/* For kernel ipfw_ether and ipfw_bridge. */
struct ip_fw_args;
typedef int ip_fw_chk_t(struct ip_fw_args *args);
extern  ip_fw_chk_t     *ip_fw_chk_ptr;

#define V_ip_fw_chk_ptr         VNET(ip_fw_chk_ptr)
#define V_ip_fw_ctl_ptr         VNET(ip_fw_ctl_ptr)
#define	V_tcbinfo		VNET(tcbinfo)
#define	V_udbinfo		VNET(udbinfo)

#endif /* !_MISSING_H_ */


================================================
FILE: kipfw/mysetenv.sh
================================================
#!/bin/bash

# bash script to set a suitable environment to call MSVC's build
# to build a 64-bit version of the kernel.
#
# inspired by C:/winddk/7600.16385.1/bin/setenv.bat
# see http://www.osronline.com/ddkx/ddtools/build_ref_0kqb.htm

#############################################################
#  edit these variables to meet your configuration          #
#  - DRIVE is the hard drive letter where DDK is installed  #
#  - DDK is the path to the DDK's root directory            #
#  - CYGDDK is the complete cygwin path to DDK              #
#############################################################
# Exactly three positional arguments are required:
#   $1 drive, $2 DDK path, $3 target OS (wnet, wlh or win7).
[ $# -eq 3 ] || { echo "invalid params" && exit 1; }

DRIVE="$1"
DDK="$2"
TARGETOS="$3"
# NOTE(review): the cygwin path always uses /cygdrive/c, whatever DRIVE is.
CYGDDK="/cygdrive/c/${DDK}"
MYDIR="$(pwd)"	# XXX luigi

# Map the target OS name onto the DDK target and NT version.
# Any other name leaves both variables untouched.
case "$TARGETOS" in
wnet)
	export DDK_TARGET_OS=WinNET
	export _NT_TARGET_VERSION=0x502
	;;
wlh)
	export DDK_TARGET_OS=WinLH
	export _NT_TARGET_VERSION=0x600
	;;
win7)
	export DDK_TARGET_OS=Win7
	export _NT_TARGET_VERSION=0x601
	;;
esac


#############################################################
#  don't edit anything else below this point                #
#############################################################

# Locations inside the DDK tree, used by the exports below:
# D is the DDK root (drive followed by the DDK path),
# DB, DI and DL are its bin, inc and lib subdirectories.
D=${DRIVE}${DDK}
DB=${D}/bin
DI=${D}/inc
DL=${D}/lib


export AMD64=1
export ATL_INC_PATH=$DI				# defaults to DDKROOT/inc
export ATL_INC_ROOT=$DI				# XXX redundant ?
export ATL_LIB_PATH=${DL}/atl/*
export BASEDIR=$D				# default
export BUFFER_OVERFLOW_CHECKS=1
export BUILD_ALLOW_COMPILER_WARNINGS=1
export BUILD_ALT_DIR=chk_${TARGETOS}_AMD64
export BUILD_DEFAULT="-ei -nmake -i -nosqm"	# can go on the command line
export BUILD_DEFAULT_TARGETS="-amd64"		# can also go on the command line
export BUILD_MAKE_PROGRAM=nmake.exe		# default to nmake
export BUILD_MULTIPROCESSOR=1			# parallel make, same as -M
export BUILD_OPTIONS=" ~imca ~toastpkg"
export COFFBASE_TXT_FILE=${DB}/coffbase.txt
export CPU=AMD64
export CRT_INC_PATH=${DI}/crt			# default
export CRT_LIB_PATH=${DL}/crt/*			# not default, it seems uses lib/{wnet,win7}/*
export DDKBUILDENV=chk				# checked or free
export DDK_INC_PATH=${DI}/ddk
export DDK_LIB_DEST=${DL}/${TARGETOS}
export DDK_LIB_PATH=${DL}/${TARGETOS}/*
export DEPRECATE_DDK_FUNCTIONS=1
export DRIVER_INC_PATH=${DI}/ddk
export HALKIT_INC_PATH=${DI}/ddk
export HALKIT_LIB_PATH=${DL}/${TARGETOS}/*
export IFSKIT_INC_PATH=${DI}/ddk
export IFSKIT_LIB_DEST=${DL}/${TARGETOS}
export IFSKIT_LIB_PATH=${DL}/${TARGETOS}/*
export Include=${DI}/api
export KMDF_INC_PATH=${DI}/wdf/kmdf
export KMDF_LIB_PATH=${DL}/wdf/kmdf/*
export LANGUAGE_NEUTRAL=0
export Lib=${DL}
export LINK_LIB_IGNORE=4198
export MFC_INC_PATH=${DI}/mfc42
export MFC_LIB_PATH=${DL}/mfc/*
export MSC_OPTIMIZATION="/Od /Oi" 
export NEW_CRTS=1
export NO_BINPLACE=TRUE
export NO_BROWSER_FILE=TRUE
export NTDBGFILES=1
export NTDEBUG=ntsd
export NTDEBUGTYPE=both
# need NTMAKEENV to point to the binary dir
export NTMAKEENV=${DB}
export OAK_INC_PATH=${DI}/api

export PATH="${CYGDDK}/bin/amd64:${CYGDDK}/tools/sdv/bin:${CYGDDK}/tools/pfd/bin/bin/x86_AMD64\
:${CYGDDK}/bin/SelfSign:${CYGDDK}/bin/x86/amd64:${CYGDDK}/bin/x86\
:${CYGDDK}/tools/pfd/bin/bin/AMD64:${CYGDDK}/tools/tracing/amd64:$PATH"

export PATHEXT=".COM;.EXE;.BAT;.CMD;.VBS;.VBE;.JS;.JSE;.WSF;.WSH;.MSC"
export PROJECT_ROOT=${D}/src
export PUBLIC_ROOT=${D}
export RAZZLETOOLPATH=${DB}
export RCNOFONTMAP=1
export SDK_INC_PATH=${DI}/api
export SDK_LIB_DEST=${DL}/${TARGETOS}
export SDK_LIB_PATH=${DL}/${TARGETOS}/*
export SDV=${D}/tools/sdv
export separate_object_root=FALSE
export TEMP=tmpbuild
export TMP=tmpbuild
export UMDF_INC_PATH=${DI}/wdf/umdf
export USE_OBJECT_ROOT=1
export WDM_INC_PATH=${DI}/ddk
export WPP_CONFIG_PATH=${DB}/wppconfig
export _AMD64bit=true
export _BUILDARCH=AMD64
export _BuildType=chk
export _NTDRIVE=${DRIVE}
export _NTROOT=${DDK}
#
# --- XXX note, it spams  C:/winddk/7600.16385.1/build.dat
# build flags: -c delete objs, -e generate build.* logfiles,
# -f rescan sources, -g color errors
#
# MAKEFLAGS is removed from the environment before build runs
# (presumably so that the calling make's flags do not reach nmake --
# confirm); as a consequence the echo below always prints it empty.
unset MAKEFLAGS
echo "emv ${MAKE} flags ${MAKEFLAGS}"
# build only runs if the cd into kipfw-mod succeeds
cd kipfw-mod && build -cefg 
# NOTE(review): "done" is printed even when cd or build fails, and the
# echo is the last command, so its status becomes the script's.
echo "done"
#cp objchk_${TARGETOS}_amd64/amd64/ipfw.sys ../binary/ipfw.sys


================================================
FILE: kipfw/netipfw.inf
================================================
; version section
[Version]
Signature  = "$Windows NT$"
Class      = NetService
ClassGUID  = {4D36E974-E325-11CE-BFC1-08002BE10318}
Provider   = %Unipi%
DriverVer  = 08/12/2012,3.0.1.1

; manufacturer section
[Manufacturer]
%Unipi% = UNIPI,NTx86,NTamd64

; control flags section
; optional, unused in netipfw.inf, used in netipfw_m.inf
[ControlFlags]

; models section
[UNIPI] ; Win2k
%Desc% = Ipfw.ndi, unipi_ipfw
[UNIPI.NTx86] ;For WinXP and later
%Desc% = Ipfw.ndi, unipi_ipfw
[UNIPI.NTamd64] ;For x64
%Desc% = Ipfw.ndi, unipi_ipfw

; ddinstall section
[Ipfw.ndi]
AddReg          = Ipfw.ndi.AddReg, Ipfw.AddReg
Characteristics = 0x4410 ;  NCF_FILTER | NCF_NDIS_PROTOCOL !--Filter Specific--!!
CopyFiles       = Ipfw.Files.Sys
CopyInf         = netipfw_m.inf

; remove section
[Ipfw.ndi.Remove]
DelFiles = Ipfw.Files.Sys

;ddinstall.services section
[Ipfw.ndi.Services]
AddService = Ipfw,,Ipfw.AddService

[Ipfw.AddService]
DisplayName    = %ServiceDesc%
ServiceType    = 1 ;SERVICE_KERNEL_DRIVER
StartType      = 3 ;SERVICE_DEMAND_START
ErrorControl   = 1 ;SERVICE_ERROR_NORMAL
ServiceBinary  = %12%\ipfw.sys
AddReg         = Ipfw.AddService.AddReg

[Ipfw.AddService.AddReg]

;file copy related sections
[SourceDisksNames]
1=%DiskDescription%,"",,

[SourceDisksFiles]
ipfw.sys=1

[DestinationDirs]
DefaultDestDir = 12
Ipfw.Files.Sys   = 12   ; %windir%\System32\drivers

; ddinstall->copyfiles points here
[Ipfw.Files.Sys]
ipfw.sys,,,2

; ddinstall->addreg points here
[Ipfw.ndi.AddReg]
HKR, Ndi,            HelpText,            , %HELP% ; this is displayed at the bottom of the General page of the Connection Properties dialog box
HKR, Ndi,            FilterClass,         , failover
HKR, Ndi,            FilterDeviceInfId,   , unipi_ipfwmp
HKR, Ndi,            Service,             , Ipfw
HKR, Ndi\Interfaces, UpperRange,          , noupper
HKR, Ndi\Interfaces, LowerRange,          , nolower
HKR, Ndi\Interfaces, FilterMediaTypes,    , "ethernet, tokenring, fddi, wan"

;strings section
[Strings]
Unipi = "Unipi"
DiskDescription = "Ipfw Driver Disk"
Desc = "ipfw+dummynet"
HELP = "This is ipfw and dummynet network emulator, developed by unipi.it"
ServiceDesc = "ipfw service"


================================================
FILE: kipfw/netipfw_m.inf
================================================
; version section
[Version]
Signature  = "$Windows NT$"
Class      = Net
ClassGUID  = {4D36E972-E325-11CE-BFC1-08002BE10318}
Provider   = %Unipi%
DriverVer  = 08/12/2012,3.0.1.1

; control flags section
; optional, unused in netipfw.inf, used in netipfw_m.inf
[ControlFlags]
ExcludeFromSelect = unipi_ipfwmp

; destinationdirs section, optional
[DestinationDirs]
DefaultDestDir=12
; No files to copy 

; manufacturer section
[Manufacturer]
%Unipi% = UNIPI,NTx86,NTamd64

; models section
[UNIPI] ; Win2k
%Desc% = IpfwMP.ndi, unipi_ipfwmp
[UNIPI.NTx86] ;For WinXP and later
%Desc% = IpfwMP.ndi, unipi_ipfwmp
[UNIPI.NTamd64] ;For x64
%Desc% = IpfwMP.ndi, unipi_ipfwmp

; ddinstall section
[IpfwMP.ndi]
AddReg  = IpfwMP.ndi.AddReg
Characteristics = 0x29 ;NCF_NOT_USER_REMOVABLE | NCF_VIRTUAL | NCF_HIDDEN

; ddinstall->addreg points here
[IpfwMP.ndi.AddReg]
HKR, Ndi, Service,  0,  IpfwMP

;ddinstall.services section
[IpfwMP.ndi.Services]
AddService = IpfwMP,0x2, IpfwMP.AddService

[IpfwMP.AddService]
ServiceType    = 1 ;SERVICE_KERNEL_DRIVER
StartType      = 3 ;SERVICE_DEMAND_START
ErrorControl   = 1 ;SERVICE_ERROR_NORMAL
ServiceBinary  = %12%\ipfw.sys
AddReg         = IpfwMP.AddService.AddReg

[IpfwMP.AddService.AddReg]
; None

[Strings]
Unipi = "Unipi"
Desc = "Ipfw Miniport"


================================================
FILE: kipfw/sources
================================================
TARGETNAME=ipfw
TARGETTYPE=DRIVER

C_DEFINES=$(C_DEFINES) -DNDIS_MINIPORT_DRIVER -DNDIS_WDM=1

MSC_WARNING_LEVEL=/W2

# The driver is built in the XP or .NET build environment
# So let us build NDIS 5.1 version.
C_DEFINES=$(C_DEFINES) -DNDIS51_MINIPORT=1
C_DEFINES=$(C_DEFINES) -DNDIS51=1

# Enable dummynet preprocessing macros
C_DEFINES=$(C_DEFINES) /D_WIN32 /DMODULENAME=Ipfw /D_BSD_SOURCE /DKERNEL_MODULE /D_KERNEL /DKLD_MODULE /D__BSD_VISIBLE /DIPFIREWALL_DEFAULT_TO_ACCEPT /D__LITTLE_ENDIAN /DSYSCTL_NODE /DEMULATE_SYSCTL -FIwinmissing.h -FImissing.h -FI../glue.h /DWIN32_LEAN_AND_MEAN=1

TARGETLIBS=$(DDK_LIB_PATH)\ndis.lib

INCLUDES= include_e ; ../sys

SOURCES= ip_fw2.c ip_fw_pfil.c ip_fw_sockopt.c ip_fw_dynamic.c ip_fw_table.c ip_fw_log.c radix.c in_cksum.c ip_dummynet.c ip_dn_io.c ip_dn_glue.c dn_heap.c dn_sched_fifo.c dn_sched_wf2q.c dn_sched_rr.c dn_sched_qfq.c dn_sched_prio.c ipfw2_mod.c bsd_compat.c md_win.c miniport.c protocol.c passthru.c debug.c


================================================
FILE: kipfw/win-passthru.diff
================================================
diff -ubwrp original_passthru/miniport.c kipfw/miniport.c
--- original_passthru/miniport.c	2012-08-01 14:34:15.096679600 +0200
+++ kipfw/miniport.c	2012-08-01 14:34:11.377929600 +0200
@@ -223,6 +223,7 @@ Return Value:
     //
     // Use NDIS 5.1 packet stacking:
     //
+    if (0)	// XXX IPFW - make sure we don't go in here
     {
         PNDIS_PACKET_STACK        pStack;
         BOOLEAN                   Remaining;
@@ -347,6 +348,25 @@ Return Value:
                                                 MediaSpecificInfo,
                                                 MediaSpecificInfoSize);
         }
+#if 1	/* IPFW: query the firewall */
+	/* if dummynet keeps the packet, we mimic success.
+	 * otherwise continue as usual.
+	 */
+		{
+			int ret = ipfw2_qhandler_w32(MyPacket, OUTGOING,
+					MiniportAdapterContext);
+			if (ret != PASS) {
+				if (ret == DROP)
+					return NDIS_STATUS_FAILURE;
+				else {  //dummynet kept the packet
+#ifndef WIN9X
+					NdisIMCopySendCompletePerPacketInfo (Packet, MyPacket);
+#endif
+					return NDIS_STATUS_SUCCESS; //otherwise simply continue
+				}
+			}
+		}
+#endif	/* end of IPFW code */
 
         NdisSend(&Status,
                  pAdapt->BindingHandle,
diff -ubwrp original_passthru/passthru.c kipfw/passthru.c
--- original_passthru/passthru.c	2012-08-01 14:34:15.268554600 +0200
+++ kipfw/passthru.c	2012-08-01 14:34:11.534179600 +0200
@@ -47,8 +47,15 @@ NDIS_HANDLE        NdisWrapperHandle;
 // To support ioctls from user-mode:
 //
 
-#define LINKNAME_STRING     L"\\DosDevices\\Passthru"
-#define NTDEVICE_STRING     L"\\Device\\Passthru"
+#define STR2(x) #x
+#define STR(x) STR2(x)
+#define DOSPREFIX "\\DosDevices\\"
+#define NTPREFIX "\\Device\\"
+#define WIDEN2(x) L ## x
+#define WIDEN(x) WIDEN2(x)
+#define LINKNAME_STRING			WIDEN(DOSPREFIX) WIDEN(STR(MODULENAME))
+#define NTDEVICE_STRING			WIDEN(NTPREFIX) WIDEN(STR(MODULENAME))
+#define PROTOCOLNAME_STRING		WIDEN(STR(MODULENAME))
 
 NDIS_HANDLE     NdisDeviceHandle = NULL;
 PDEVICE_OBJECT  ControlDeviceObject = NULL;
@@ -136,8 +143,8 @@ Return Value:
         // Either the Send or the SendPackets handler should be specified.
         // If SendPackets handler is specified, SendHandler is ignored
         //
-        MChars.SendHandler = NULL;    // MPSend;
-        MChars.SendPacketsHandler = MPSendPackets;
+        MChars.SendHandler = MPSend;    // IPFW: use MPSend, not SendPackets
+        MChars.SendPacketsHandler = NULL;
 
         Status = NdisIMRegisterLayeredMiniport(NdisWrapperHandle,
                                                   &MChars,
@@ -165,7 +172,7 @@ Return Value:
         // This is needed to ensure that NDIS can correctly determine
         // the binding and call us to bind to miniports below.
         //
-        NdisInitUnicodeString(&Name, L"Passthru");    // Protocol name
+        NdisInitUnicodeString(&Name, PROTOCOLNAME_STRING);    // Protocol name
         PChars.Name = Name;
         PChars.OpenAdapterCompleteHandler = PtOpenAdapterComplete;
         PChars.CloseAdapterCompleteHandler = PtCloseAdapterComplete;
@@ -205,6 +212,8 @@ Return Value:
         NdisTerminateWrapper(NdisWrapperHandle, NULL);
     }
 
+    ipfw_module_init();	// IPFW - start the system
+
     return(Status);
 }
 
@@ -276,7 +285,8 @@ Return Value:
         DispatchTable[IRP_MJ_CREATE] = PtDispatch;
         DispatchTable[IRP_MJ_CLEANUP] = PtDispatch;
         DispatchTable[IRP_MJ_CLOSE] = PtDispatch;
-        DispatchTable[IRP_MJ_DEVICE_CONTROL] = PtDispatch;
+	// IPFW we use DevIoControl ?
+        DispatchTable[IRP_MJ_DEVICE_CONTROL] = DevIoControl;
         
 
         NdisInitUnicodeString(&DeviceName, NTDEVICE_STRING);
@@ -453,6 +463,7 @@ PtUnload(
     
     NdisFreeSpinLock(&GlobalLock);
 
+    ipfw_module_exit(); // IPFW unloading dummynet
+
     DBGPRINT(("PtUnload: done!\n"));
 }
-
diff -ubwrp original_passthru/passthru.h kipfw/passthru.h
--- original_passthru/passthru.h	2012-08-01 14:34:15.049804600 +0200
+++ kipfw/passthru.h	2012-08-01 14:34:11.362304600 +0200
@@ -61,6 +61,13 @@ PtDispatch(
     IN PIRP                      Irp
     );
 
+DRIVER_DISPATCH DevIoControl;
+NTSTATUS
+DevIoControl(
+    IN PDEVICE_OBJECT            pDeviceObject,
+    IN PIRP                      pIrp
+    );
+
 NDIS_STATUS
 PtRegisterDevice(
     VOID
@@ -366,6 +373,7 @@ PtDereferenceAdapt(
 typedef struct _SEND_RSVD
 {
     PNDIS_PACKET    OriginalPkt;
+    struct mbuf*    pMbuf; // IPFW extension, reference to the mbuf
 } SEND_RSVD, *PSEND_RSVD;
 
 //
@@ -376,6 +384,7 @@ typedef struct _SEND_RSVD
 typedef struct _RECV_RSVD
 {
     PNDIS_PACKET    OriginalPkt;
+    struct mbuf*    pMbuf; // IPFW extension, reference to the mbuf
 } RECV_RSVD, *PRECV_RSVD;
 
 C_ASSERT(sizeof(RECV_RSVD) <= sizeof(((PNDIS_PACKET)0)->MiniportReserved));
@@ -475,3 +484,17 @@ IsIMDeviceStateOn(
 */
 #define IsIMDeviceStateOn(_pP)        ((_pP)->MPDeviceState == NdisDeviceStateD0 && (_pP)->PTDeviceState == NdisDeviceStateD0 ) 
 
+#include "winmissing.h"
+
+int ipfw_module_init(void);
+void ipfw_module_exit(void);
+int ipfw2_qhandler_w32(PNDIS_PACKET pNdisPacket, int direction,
+	NDIS_HANDLE Context);
+int ipfw2_qhandler_w32_oldstyle(int direction, NDIS_HANDLE ProtocolBindingContext,
+		unsigned char* HeaderBuffer, unsigned int HeaderBufferSize,
+		unsigned char* LookAheadBuffer, unsigned int LookAheadBufferSize,
+	    unsigned int PacketSize);
+void CleanupReinjected(PNDIS_PACKET Packet, struct mbuf* m, PADAPT pAdapt);
+void hexdump(PUCHAR,int, const char *);
+void my_init();
+void my_exit();
\ Manca newline alla fine del file
Solo in original_passthru: passthru.htm
Solo in original_passthru: passthru.rc
diff -ubwrp original_passthru/protocol.c kipfw/protocol.c
--- original_passthru/protocol.c	2012-08-01 14:34:15.112304600 +0200
+++ kipfw/protocol.c	2012-08-01 14:34:11.409179600 +0200
@@ -841,6 +841,14 @@ Return Value:
         SendRsvd = (PSEND_RSVD)(Packet->ProtocolReserved);
         Pkt = SendRsvd->OriginalPkt;
     
+#if 1	// IPFW - new code
+	//DbgPrint("SendComplete: packet %p pkt %p\n", Packet, Pkt);
+	if (Pkt == NULL) { //this is a reinjected packet, with no 'father'
+		CleanupReinjected(Packet, SendRsvd->pMbuf, pAdapt);
+		return;
+	}
+#endif /* IPFW */
+    
 #ifndef WIN9X
         NdisIMCopySendCompletePerPacketInfo (Pkt, Packet);
 #endif
@@ -1021,6 +1029,13 @@ Return Value:
 
                 if (pAdapt->MiniportHandle != NULL)
                 {
+#if 1	/* IPFW: query the firewall */
+					int	ret;
+					ret = ipfw2_qhandler_w32(MyPacket, INCOMING,
+						ProtocolBindingContext);
+					if (ret != PASS)
+					return 0; //otherwise simply continue
+#endif /* end of IPFW code */
                     NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1);
                 }
 
@@ -1055,6 +1070,13 @@ Return Value:
         {
             case NdisMedium802_3:
             case NdisMediumWan:
+				//DbgPrint("EthIndicateReceive context %p, header at %p len %u, lookahead at %p len %u, packetsize %u\n",ProtocolBindingContext,HeaderBuffer,HeaderBufferSize,LookAheadBuffer,LookAheadBufferSize,PacketSize);
+				//hexdump(HeaderBuffer,HeaderBufferSize+LookAheadBufferSize,"EthIndicateReceive");
+			{
+				int ret = ipfw2_qhandler_w32_oldstyle(INCOMING, ProtocolBindingContext, HeaderBuffer, HeaderBufferSize, LookAheadBuffer, LookAheadBufferSize, PacketSize);
+				if (ret != PASS)
+					return NDIS_STATUS_SUCCESS;
+			}
                 NdisMEthIndicateReceive(pAdapt->MiniportHandle,
                                              MacReceiveContext,
                                              HeaderBuffer,
@@ -1120,6 +1142,21 @@ Return Value:
     PADAPT        pAdapt =(PADAPT)ProtocolBindingContext;
     ULONG         Proc = KeGetCurrentProcessorNumber();      
 
+	/* Warning: this is a poor implementation of the PtReceiveComplete
+	 * made by MS, and it's a well known (but never fixed) issue.
+	 * Since the ProcessorNumber here can be different from the one
+	 * that processed the PtReceive, sometimes NdisMEthIndicateReceiveComplete
+	 * will not be called, causing poor performance in the incoming traffic.
+	 * In our driver, PtReceive is called for IP packets ONLY by particulary 
+	 * old NIC drivers, and the poor performance can be seen even 
+	 * in traffic not handled by ipfw or dummynet.
+	 * Fortunately, this is quite rare, all the incoming IP packets
+	 * will arrive through PtReceivePacket, and this callback will never
+	 * be called. For reinjected traffic, a workaround is done
+	 * commuting the ReceivedIndicationFlag and calling
+	 * NdisMEthIndicateReceiveComplete manually for each packet.
+	 */
+
     if (((pAdapt->MiniportHandle != NULL)
                 && (pAdapt->MPDeviceState == NdisDeviceStateD0))
                 && (pAdapt->ReceivedIndicationFlags[Proc]))
@@ -1199,7 +1236,7 @@ Return Value:
     // See also: PtReceive(). 
     //
     (VOID)NdisIMGetCurrentPacketStack(Packet, &Remaining);
-    if (Remaining)
+    if (0 && Remaining)
     {
         //
         // We can reuse "Packet". Indicate it up and be done with it.
@@ -1247,6 +1284,13 @@ Return Value:
 
         if (pAdapt->MiniportHandle != NULL)
         {
+#if 1	/* IPFW: query the firewall */
+	    int	ret;
+	    ret = ipfw2_qhandler_w32(MyPacket, INCOMING,
+			ProtocolBindingContext);
+	    if (ret != PASS)
+			return 0; //otherwise simply continue
+#endif /* end of IPFW code */
             NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1);
         }
 

================================================
FILE: kipfw/winmissing.h
================================================
/*
 * Copyright (c) 2010 Francesco Magno, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id: winmissing.h 11647 2012-08-06 23:20:21Z luigi $
 * definitions and other things needed to build freebsd kernel
 * modules in Windows (with the MSVC compiler)
 */

#ifndef _WINMISSING_H_
#define _WINMISSING_H_

#include <ntifs.h>
#include <ntddk.h>
#include <basetsd.h>
#include <windef.h>
#include <stdio.h>
#include <ndis.h>

typedef UCHAR	u_char;
typedef UCHAR	u_int8_t;
typedef UCHAR	uint8_t;
typedef USHORT	u_short;
typedef USHORT	u_int16_t;
typedef USHORT	uint16_t;
typedef USHORT	n_short;
typedef UINT	u_int;
typedef INT32	int32_t;
typedef UINT32	u_int32_t;
typedef UINT32	uint32_t;
typedef ULONG	u_long;
typedef ULONG	n_long;
typedef UINT64	uint64_t;
typedef UINT64	u_int64_t;
typedef INT64	int64_t;

typedef UINT32	in_addr_t;
typedef UCHAR	sa_family_t;
typedef	USHORT	in_port_t;
typedef UINT32	__gid_t;
typedef UINT32	gid_t;
typedef UINT32	__uid_t;
typedef UINT32	uid_t;
typedef ULONG	n_time;
typedef char*	caddr_t;

/* linux_lookup uses __be32 and __be16 in the prototype */
typedef uint32_t __be32; /* XXX __u32 __bitwise __be32 */
typedef uint16_t __be16; /* XXX */

//*** DEBUG STUFF ***
/*
 * To see the debugging messages you need DbgView
http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx
 */
#define printf		DbgPrint
#define log(lev, ...)	DbgPrint(__VA_ARGS__)
const char* texify_cmd(int i);
const char* texify_proto(unsigned int p);
//*** end DEBUG STUFF ***

/* the C runtime used here spells snprintf with a leading underscore */
#define snprintf _snprintf
/*
 * struct timespec is aliased to struct timeval: code written against
 * timespec ends up with the tv_sec/tv_usec members declared below.
 */
#define timespec timeval
struct timeval {
	long tv_sec;
	long tv_usec;
};

struct in_addr {
	in_addr_t s_addr;
};

struct sockaddr_in {
	uint8_t	sin_len;
	sa_family_t	sin_family;
	in_port_t	sin_port;
	struct	in_addr sin_addr;
	char	sin_zero[8];
};

/* XXX watch out, windows names are actually longer */
#define IFNAMSIZ	16
#define IF_NAMESIZE	16

#define ETHER_ADDR_LEN 6

/* we do not include the windows headers for in6_addr so
 * we need to provide our own definition for the kernel.
 */
struct in6_addr {
        union {
                uint8_t         __u6_addr8[16];
                uint16_t        __u6_addr16[8]; 
                uint32_t        __u6_addr32[4];
        } __u6_addr;                    /* 128-bit IP6 address */
};

#define	htons(x) RtlUshortByteSwap(x)
#define	ntohs(x) RtlUshortByteSwap(x)
#define	htonl(x) RtlUlongByteSwap(x)
#define	ntohl(x) RtlUlongByteSwap(x)

#define ENOSPC          28      /* No space left on device */
#define	EOPNOTSUPP	45	/* Operation not supported */
#define	EACCES		13	/* Permission denied */
#define	ENOENT		2	/* No such file or directory */
#define EINVAL          22      /* Invalid argument */
#define	EPROTONOSUPPORT	43	/* Protocol not supported */
#define	ENOMEM		12	/* Cannot allocate memory */
#define	EEXIST		17	/* File exists */
#define ESRCH		3
#define	ENOBUFS		55	/* No buffer space available */
#define	EBUSY		16	/* Module busy */


/*
 * Compiler/annotation compatibility shims.  The attribute-style macros
 * (__packed, __aligned, __user, __init, __exit) are defined away, so they
 * have no effect on layout or placement in this build.
 */
#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
#define __packed
/*
 * Must expand to nothing: with a ';' as replacement text every use such as
 * "int x __aligned(8);" would grow a stray empty declaration.
 */
#define __aligned(x)
#define __user
#define __init
#define __exit
#define __func__ __FUNCTION__
#define inline __inline

/*
 * Placeholder only: it gives code that mentions struct sockaddr_in6 a
 * complete type, but carries a single dummy member and no address fields.
 */
struct sockaddr_in6 {
	int dummy;
};

/*
 * Locking shims: every lock flavour named here (mtx_*, rw_*, spinlock_t,
 * rwlock_t) is mapped onto an NDIS spin lock.  Read and write acquisitions
 * of an rwlock therefore expand to the same acquire/release calls, and the
 * *_assert() checks expand to nothing.  The extra arguments of mtx_init(),
 * rw_init() and rw_init_flags() are ignored.
 */
//SPINLOCKS
#define DEFINE_SPINLOCK(x)		NDIS_SPIN_LOCK x
#define mtx_init(m,a,b,c)		NdisAllocateSpinLock(m)
#define mtx_lock(_l)			NdisAcquireSpinLock(_l)
#define mtx_unlock(_l)			NdisReleaseSpinLock(_l)
#define	mtx_destroy(m)			NdisFreeSpinLock(m)
#define mtx_assert(a, b)

#define rw_rlock(_l)			NdisAcquireSpinLock(_l)
#define rw_runlock(_l)			NdisReleaseSpinLock(_l)
#define rw_assert(a, b)
#define rw_wlock(_l)			NdisAcquireSpinLock(_l)
#define rw_wunlock(_l)			NdisReleaseSpinLock(_l)
#define rw_destroy(_l)			NdisFreeSpinLock(_l)
#define rw_init(_l, msg)		NdisAllocateSpinLock(_l)
#define rw_init_flags(_l, s, v)		NdisAllocateSpinLock(_l)

#define rwlock_t NDIS_SPIN_LOCK
#define spinlock_t NDIS_SPIN_LOCK

#define s6_addr   __u6_addr.__u6_addr8


struct icmphdr {
	u_char	icmp_type;		/* type of message, see below */
	u_char	icmp_code;		/* type sub code */
	u_short	icmp_cksum;		/* ones complement cksum of struct */
};

#define	ICMP_ECHO		8		/* echo service */

#define IPOPT_OPTVAL            0               /* option ID */
#define IPOPT_OLEN              1               /* option length */
#define IPOPT_EOL               0               /* end of option list */
#define IPOPT_NOP               1               /* no operation */
#define IPOPT_LSRR              131             /* loose source route */
#define IPOPT_SSRR              137             /* strict source route */
#define IPOPT_RR                7               /* record packet route */
#define IPOPT_TS                68              /* timestamp */

#define	IPPROTO_ICMP	1		/* control message protocol */
#define	IPPROTO_TCP		6		/* tcp */
#define	IPPROTO_UDP		17		/* user datagram protocol */
#define	IPPROTO_ICMPV6		58		/* ICMP6 */
#define	IPPROTO_SCTP		132		/* SCTP */
#define	IPPROTO_HOPOPTS		0		/* IP6 hop-by-hop options */
#define	IPPROTO_ROUTING		43		/* IP6 routing header */
#define	IPPROTO_FRAGMENT	44		/* IP6 fragmentation header */
#define	IPPROTO_DSTOPTS		60		/* IP6 destination option */
#define	IPPROTO_AH		51		/* IP6 Auth Header */
#define	IPPROTO_ESP		50		/* IP6 Encap Sec. Payload */
#define	IPPROTO_NONE		59		/* IP6 no next header */
#define	IPPROTO_PIM		103		/* Protocol Independent Mcast */

#define IPPROTO_IPV6		41
#define	IPPROTO_IPV4		4		/* IPv4 encapsulation */


#define	INADDR_ANY		(uint32_t)0x00000000

#define	AF_INET		2		/* internetwork: UDP, TCP, etc. */
#define	AF_LINK		18		/* Link layer interface */

#define	IN_CLASSD(i)		(((uint32_t)(i) & 0xf0000000) == 0xe0000000)
#define	IN_MULTICAST(i)		IN_CLASSD(i)

/*
 * Verdict and direction codes shared with the NDIS glue.
 * NOTE(review): presumably DROP/PASS/DUMMYNET are the possible results of
 * the firewall query and INCOMING/OUTGOING its direction argument --
 * confirm against the ipfw2_qhandler_w32() implementation.
 */
#define DROP 0
#define PASS 1
#define DUMMYNET 2
#define INCOMING 0
#define OUTGOING 1

/* helpers declared here and implemented elsewhere in the Windows port */
size_t strlcpy(char *dst, const char *src, size_t siz);
void do_gettimeofday(struct timeval *tv);
int ffs(int bits);
int time_uptime_w32();

#endif /* _WINMISSING_H_ */


================================================
FILE: planetlab/Makefile.planetlab
================================================
# $Id: Makefile 11687 2012-08-12 20:51:25Z luigi $
#
# Top level makefile for building ipfw/dummynet (kernel and userspace).
# You can run it manually or also under the Planetlab build.
# Planetlab wants also the 'install' target.
#
# To build on system with non standard Kernel sources or userland files,
# you should run this with
#
#	make KERNELPATH=/path/to/linux-2.x.y.z USRDIR=/path/to/usr
#
# We assume that $(USRDIR) contains include/ and lib/ used to build userland.
#

include Makefile.inc

DATE ?= $(shell date +%Y%m%d)
SNAPSHOT_NAME=$(DATE)-ipfw3.tgz
BINDIST=$(DATE)-dummynet-linux.tgz
WINDIST=$(DATE)-dummynet-windows.zip

# None of these targets produces a file of the same name.  Declaring them
# phony keeps an existing file or directory (ipfw/ and kipfw/ do exist)
# from making the target look up to date.
.PHONY: _all all clean distclean win64 ndis-glue kipfw-src \
        snapshot bindist windist ipfw kipfw \
        planetlab_update openwrt_release install diff

###########################################
#  windows x86 and x64 specific variables #
###########################################
#  DRIVE must be the hard drive letter where DDK is installed
#  DDKDIR must be the path to the DDK root directory, without drive letter
#  TARGETOS (x64 only) must be one of the following:
#  wnet   -> windows server 2003
#  wlh    -> windows vista and windows server 2008
#  win7   -> windows 7
#  future version must be added here
export WIN64
export DDK
export DRIVE
export DDKDIR
DRIVE ?= C:
DDKDIR ?= /WinDDK/7600.16385.1
DDK = $(DRIVE)$(DDKDIR)

TARGETOS=win7

_all: all

# Remove build products.  The requested goal (clean or distclean) is
# forwarded to the makefile in ipfw/, then the scratch tree kipfw-mod/ is
# deleted together with the entries of binary64/ whose names start with a
# character in A-h or j-z (names starting with 'i' are kept).
# Both steps are silent and their failures are ignored.
clean distclean:
	-@cd ipfw && $(MAKE) $@
	-@rm -rf kipfw-mod binary64/[A-hj-z]*

# Default build: kernel part first, then the userland tool.
# When OSARCH is "Windows" the results are also collected for distribution:
#   - WIN64 empty: ipfw.exe, the driver from kipfw-mod/$(OBJDIR) and the
#     .inf files from kipfw/ are copied into binary/
#   - WIN64 set:   binary/ is copied into binary64/, then ipfw.exe and the
#     amd64 driver overwrite the 32 bit ones there
# The copies are best-effort (errors ignored).
# NOTE(review): OSARCH and OBJDIR are not set in this file; presumably they
# come from Makefile.inc.
all: kipfw ipfw
	@# -- windows only
ifeq ($(OSARCH),Windows)	# copy files
ifeq ($(WIN64),)
	-@ cp ipfw/ipfw.exe kipfw-mod/$(OBJDIR)/ipfw.sys binary/
	-@ cp kipfw/*.inf binary/
else
	-@ cp binary/* binary64/
	-@ cp ipfw/ipfw.exe kipfw-mod/objchk_win7_amd64/amd64/ipfw.sys binary64/
endif	# WIN64
endif	# Windows

# Convenience entry point: re-run the default goal with WIN64=1, which
# selects the 64 bit branches of the 'all' and 'kipfw' rules.
win64:
	$(MAKE) WIN64=1

# kipfw-src prepares the sources for the kernel part.
# The windows files (passthru etc.) are modified version of the
# examples found in the $(DDK)/src/network/ndis/passthru/driver/
# They can be re-created using the 'ndis-glue' target
# We need a sed trick to remove carriage returns (\r) from the patchfile.

# Recreate the NDIS passthru glue: copy the DDK sample sources (*.c, *.h)
# into kipfw-mod/ and apply kipfw/win-passthru.diff on top of them.
# The patch text is piped through sed to strip carriage returns first.
ndis-glue:
	-@mkdir -p kipfw-mod
	cp $(DDK)/src/network/ndis/passthru/driver/*.[ch] kipfw-mod
	sed "s/$$(printf '\r')//g" kipfw/win-passthru.diff | (cd kipfw-mod; patch)

# Rebuild the scratch tree kipfw-mod/ from nothing: copy the contents of
# kipfw/, flatten every .c file found under sys/ into the same directory,
# then run the 'include_e' target of the makefile that was just copied.
# All of these steps are best-effort (errors ignored).
# On Windows the NDIS passthru sources are added as well; the sub-make is
# started through $(MAKE) so that flags such as -n and -j are propagated.
kipfw-src:
	-@rm -rf kipfw-mod
	-@mkdir -p kipfw-mod
	-@cp -Rp kipfw/* kipfw-mod
	-@cp `find sys -name \*.c` kipfw-mod
	-@(cd kipfw-mod && $(MAKE) include_e)
ifeq ($(OSARCH),Windows)
	$(MAKE) ndis-glue
endif

# Source snapshot: after a distclean, archive the directory ipfw3-2012 (as
# seen from the parent directory) into /tmp/$(SNAPSHOT_NAME).  Symbolic
# links are followed (tar 'h') and the listed scratch/VCS entries are left
# out.
# NOTE(review): the directory name ipfw3-2012 is hard-coded, so the checkout
# must live in a directory with exactly that name.
snapshot:
	$(MAKE) distclean
	(cd ..; tar cvzhf /tmp/$(SNAPSHOT_NAME) --exclude .svn \
		--exclude README.openwrt --exclude tags --exclude NOTES \
		--exclude tcc-0.9.25-bsd \
		--exclude original_passthru \
		--exclude ipfw3.diff --exclude add_rules \
		--exclude test --exclude test_ \
		ipfw3-2012 )

# Linux binary distribution: clean rebuild, then pack the userland binary,
# its man page and the kernel module into /tmp/$(BINDIST).
bindist:
	$(MAKE) clean
	$(MAKE) all
	tar cvzf /tmp/$(BINDIST) ipfw/ipfw ipfw/ipfw.8 kipfw-mod/ipfw_mod.ko

# Windows binary distribution: clean rebuild (build errors are tolerated),
# remove any previous archive, then zip the binary/ directory into
# /tmp/$(WINDIST) leaving out .svn entries.
windist:
	$(MAKE) clean
	-$(MAKE) all
	-rm /tmp/$(WINDIST)
	zip -r /tmp/$(WINDIST) binary -x \*.svn\*


# Userland tool: hand the goal name ("ipfw") to the makefile in ipfw/.
ipfw:
	@cd ipfw && $(MAKE) $@

# Kernel part, built inside the tree prepared by kipfw-src.
#   - WIN64 empty: run the goal "kipfw" of the makefile in kipfw-mod/
#   - WIN64 set:   drop that makefile, create kipfw-mod/tmpbuild and hand
#     over to kipfw/mysetenv.sh with the DDK location and target OS
kipfw: kipfw-src
ifeq ($(WIN64),)	# linux or windows 32 bit
	@(cd kipfw-mod && $(MAKE) $(@) )
else	#--- windows 64 bit, we use build.exe and nmake
	rm -f kipfw-mod/Makefile
	mkdir kipfw-mod/tmpbuild		# check mysetenv.sh
	bash kipfw/mysetenv.sh $(DRIVE) $(DDKDIR) $(TARGETOS)
endif

# Location of the main ipfw repository; override on the command line.
# The name must match the one expanded in the recipe below (IPFW3_REPO),
# otherwise 'svn export' is invoked without a URL.
IPFW3_REPO ?= svn+ssh://some.host/some/path/ipfw3-2012

# Prepare, under /tmp/pl-tmp, a PlanetLab working copy updated with the
# contents of the main repository.  Nothing is committed: the commands to
# review and commit the result are printed at the end.
# NOTE(review): the copy and diff steps expect the export to land in
# ol2/ipfw3, i.e. a repository URL whose last component is "ipfw3"; the
# default URL above ends in "ipfw3-2012" -- confirm which one is intended.
planetlab_update:
	# clean and create a local working directory
	rm -rf /tmp/pl-tmp
	mkdir -p /tmp/pl-tmp/pl
	mkdir -p /tmp/pl-tmp/ol2
	# get the trunk version of the PlanetLab repository
	# to specify the sshkey use the .ssh/config file
	(cd /tmp/pl-tmp/pl; \
		svn co svn+ssh://svn.planet-lab.org/svn/ipfw/trunk)
	# get an updated copy of the main ipfw repository
	(cd /tmp/pl-tmp/ol2; svn export $(IPFW3_REPO) )
	# copy the new version over the old one
	(cd /tmp/pl-tmp; cp -rP ol2/ipfw3/* pl/trunk)
	# files cleanup in the old version
	(cd /tmp/pl-tmp; diff -r ol2/ipfw3 pl/trunk | \
		grep -v "svn" | awk '{print $$3 $$4}' | \
		sed 's/:/\//' | xargs rm -rf)
	# local adjustments here
	rm -rf /tmp/pl-tmp/pl/trunk/planetlab/check_planetlab_sync
	# commit to the remote repo
	@echo "Please, revise the update with the commands:"
	@echo "(cd /tmp/pl-tmp/pl/trunk; svn diff)"
	@echo "(cd /tmp/pl-tmp/pl/trunk; svn status)"
	@echo "and commit with:"
	@echo "(cd /tmp/pl-tmp/pl/trunk; svn ci -m 'Update from the mail ipfw repo.')"

# Build an OpenWrt release in a fresh temporary directory:
#   $(TMPDIR)/ipfw3-$(DATE).tar.gz   cleaned source tarball
#   $(TMPDIR)/ipfw3/                 port directory (Makefile with version
#                                    and md5sum filled in, empty patches/)
# TMPDIR, IPFWDIR, DSTDIR and PORTDIR are set with $(eval) while the recipe
# is expanded, so they only exist once this target is being built.
openwrt_release:
	# create a temporary directory
	$(eval TMPDIR := $(shell mktemp -d -p /tmp/ ipfw3_openwrt_XXXXX))
	# create the source destination directory
	$(eval IPFWDIR := ipfw3-$(DATE))
	$(eval DSTDIR := $(TMPDIR)/$(IPFWDIR))
	mkdir $(DSTDIR)
	# copy the package, clean objects and svn info
	cp -r ./ipfw ./kipfw-mod glue.h Makefile ./configuration README $(DSTDIR)
	(cd $(DSTDIR); make -s distclean; find . -name .svn | xargs rm -rf)
	(cd $(TMPDIR); tar czf $(IPFWDIR).tar.gz $(IPFWDIR))

	# create the port files in $(TMPDIR)/ipfw3
	$(eval PORTDIR := $(TMPDIR)/ipfw3)
	mkdir -p $(PORTDIR)/patches
	# generate the Makefile, PKG_VERSION and PKG_MD5SUM
	md5sum $(DSTDIR).tar.gz | cut -d ' ' -f 1 > $(TMPDIR)/md5sum
	cat ./OPENWRT/Makefile | \
		sed s/PKG_VERSION:=/PKG_VERSION:=$(DATE)/ | \
		sed s/PKG_MD5SUM:=/PKG_MD5SUM:=`cat $(TMPDIR)/md5sum`/ \
		> $(PORTDIR)/Makefile

	@echo ""
	@echo "The openwrt port is in $(PORTDIR)"
	@echo "The source file should be copied to the public server:"
	@echo "scp $(DSTDIR).tar.gz marta@info.iet.unipi.it:~marta/public_html/dummynet"
	@echo "after this the temporary directory $(TMPDIR) can be removed."

# Intentionally empty: the target exists because the PlanetLab build asks
# for it (see the header of this file); it installs nothing.
install:

# Show how the local sources differ from a FreeBSD tree rooted at BSD_HEAD
# (userland ipfw, then sys/).  diff's non-zero exit status is ignored.
# NOTE(review): BSD_HEAD is not set in this file; presumably it comes from
# Makefile.inc or the command line.
diff:
	-@(diff -upr $(BSD_HEAD)/sbin/ipfw ipfw)
	-@(diff -upr $(BSD_HEAD)/sys sys)


================================================
FILE: planetlab/check_planetlab_sync
================================================
#!/bin/sh

#
# This script is used to check the sync of the local repo
# with the remote planetlab repository

tmpfile=/tmp/chech_planetlab_sync.tmp

# check for local copy sync: any output from "svn diff" means the working
# copy has uncommitted changes
svn diff > "$tmpfile"
if [ -s "$tmpfile" ] ; then
	echo "Local repo unsynced, can not continue"
	# clean up before leaving; 255 is what "exit -1" yields in bash
	rm -f "$tmpfile"
	exit 255
fi
rm -f "$tmpfile"

# export remote copy
svn --force export http://svn.planet-lab.org/svn/ipfw/trunk ./ >> /dev/null

# check diffs again, output to the user
svn diff 
svn status | grep -v check_planetlab_sync


================================================
FILE: planetlab/ipfw
================================================
#!/bin/sh
#
# ipfw	init the emulation service
#
# chkconfig: 2345 09 91
# description: ipfw init and shutdown
#

# Source function library.
. /etc/init.d/functions

IPFW=ipfw
IPFW_BACKEND=/vsys/ipfw-be
IPFW_MOD=ipfw_mod

# Both the ipfw binary and the vsys backend must be executable; if either
# one is missing, name it in the warning and exit without error.
for required in /sbin/$IPFW ${IPFW_BACKEND}; do
    if [ ! -x "$required" ]; then
        echo -n "$required does not exist."; warning; echo
        exit 0
    fi
done

# Load the ipfw module and (re)initialise the netconfig state.
# Returns the exit status of modprobe; the backend calls do not affect it.
start() {
	# load the module, discarding all output
	modprobe $IPFW_MOD > /dev/null 2>&1
	ret=$?
	[ $ret -eq 0 ] && success || failure

	# init netconfig: wipe stale state first, then initialise
	echo "super dbcleanup" | ${IPFW_BACKEND} root > /dev/null 2>&1
	echo "super init" | ${IPFW_BACKEND} root > /dev/null 2>&1

	return $ret
}

# Clean the netconfig state and unload the module.
# Returns the exit status of rmmod.
stop() {
	# clean netconfig stuff, discarding all output
	echo "super dbcleanup" | ${IPFW_BACKEND} root > /dev/null 2>&1
	echo "Unloading $IPFW_MOD module: "

	# unload the ipfw module
	rmmod ${IPFW_MOD}
	ret=$?
	[ $ret -eq 0 ] && success || failure

	return $ret
}

# echo the ipfw status; always returns 0
status() {
	# check for module presence: each /proc/modules line starts with the
	# module name followed by a space, so anchor on that instead of on the
	# whole line
	if ! grep -q "^${IPFW_MOD} " /proc/modules 2> /dev/null; then
		echo "ipfw not loaded"
		return 0
	fi

	# Show active users
	# NOTE(review): /tmp/ff looks like a leftover debugging path -- confirm
	# where the list of active emulations is really kept.
	USERS=$(grep BLOCK /tmp/ff 2> /dev/null | wc -l)
	echo "ipfw is loaded and there are currently ${USERS} with active emulation."
	return 0
}

# main: dispatch on the first argument and exit with the status of the
# selected action.  For "restart" only the status of start is reported;
# an unknown argument prints the usage line and exits with 1.
case "$1" in
    start)
	start
	RETVAL=$?
	;;
    stop)
	stop
	RETVAL=$?
	;;
    restart)
	stop
	start
	RETVAL=$?
	;;
    status)
	status
	RETVAL=$?
	;;
    *)
	echo $"Usage: $0 {start|stop|restart|status}"
	exit 1
	;;
esac

exit $RETVAL


================================================
FILE: planetlab/ipfw.cron
================================================
# Runs every 5 minutes and cleans up expired ipfw rules
# $Id: ipfw.cron 6069 2010-04-15 09:35:33Z marta $
*/5 * * * * root     echo "super killexpired" | /vsys/ipfw-be root > /dev/null 2>&1


================================================
FILE: planetlab/ipfwroot.spec
================================================
#
# Marta Carbone <marta.carbone@iet.unipi.it>
# 2009 - Universita` di Pisa
# License is BSD.

# kernel_release, kernel_version and kernel_arch are expected to be set by the build to e.g.
# kernel_release : 24.onelab  (24 is then the planetlab taglevel)
# kernel_version : 2.6.27.57 | 2.6.32  (57 in the 27 case is the patch level)
# kernel_arch :    i686 | x86_64

# the 2012 release was pulled from http://info.iet.unipi.it/~marta/dummynet/ipfw3-20120610.tar.gz
# see also           http://sourceforge.net/p/dummynet/code
# in 2013 Marta has moved to sourceforge at
# git clone git://git.code.sf.net/p/dummynet/code your read-only code
%define name ipfwroot
%define version 3
%define taglevel 1

# when no planetlab kernel is being built, kernel_version is defined but empty
%define _with_planetlab_kernel %{?kernel_version:1}%{!?kernel_version:0}
# we need to make sure that this rpm gets upgraded when the kernel release changes
%if %{_with_planetlab_kernel}
# with the planetlab kernel
%define pl_kernel_taglevel %( echo %{kernel_release} | cut -d. -f1 )
%define ipfw_release %{kernel_version}.%{pl_kernel_taglevel}
%else
# with the stock kernel
# this line below
#%define ipfw_release %( rpm -q --qf "%{version}" kernel-headers )
# causes recursive macro definition no matter how much you quote
%define percent %
%define braop \{
%define bracl \}
%define kernel_version %( rpm -q --qf %{percent}%{braop}version%{bracl} kernel-headers )
%define kernel_release %( rpm -q --qf %{percent}%{braop}release%{bracl} kernel-headers )
%define kernel_arch %( rpm -q --qf %{percent}%{braop}arch%{bracl} kernel-headers )
%define ipfw_release %{kernel_version}.%{kernel_release}
%endif

%define release %{ipfw_release}.%{taglevel}%{?pldistro:.%{pldistro}}%{?date:.%{date}}

# guess which convention is used; k27 and before used dash, k32 uses dot
%define kernelpath_dash /usr/src/kernels/%{kernel_version}-%{kernel_release}-%{kernel_arch}
%define kernelpath_dot /usr/src/kernels/%{kernel_version}-%{kernel_release}.%{kernel_arch}
%define kernelpath %( [ -d %{kernelpath_dot} ] && echo %{kernelpath_dot} || echo %{kernelpath_dash} )

# the k32 kernel currently builds e.g. /lib/modules/2.6.32-0.onelab.2010.12.07-i686
# the k27 and before does not have the -i686 part
%define kernel_id_old %{kernel_version}-%{kernel_release}
%define kernel_id_new %{kernel_version}-%{kernel_release}.%{kernel_arch}
%define kernel_id %( [ -d %{kernelpath_dot} ] && echo %{kernel_id_new} || echo %{kernel_id_old} )

Summary: ipfw and dummynet for Linux
Name: %{name}
Version: %{version}
Release: %{release}
License: BSD
Group: System Environment/Kernel
Source0: %{name}-%{version}.tar.bz2
BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-buildroot
Requires: kernel = %{kernel_version}-%{kernel_release}
# in fedora20 cronie does not provide vixie-cron anymore, just cronie
Requires: cronie
Requires: vsys-scripts
Obsoletes: ipfw

Vendor: unipi
Packager: PlanetLab <marta@onelab2.iet.unipi.it>
# XXX ask 
Distribution: PlanetLab %{plrelease}
URL: %{SCMURL}

%description
ipfw is the Linux port of the FreeBSD ipfw and dummynet packages

%prep
%setup

%build
# clean the rpm build directory
rm -rf $RPM_BUILD_ROOT

%__make KERNELPATH=%kernelpath clean
%__make KERNELPATH=%kernelpath IPFW_PLANETLAB=1

%install
install -D -m 755 kipfw-mod/ipfw_mod.ko $RPM_BUILD_ROOT/lib/modules/%{kernel_id}/net/netfilter/ipfw_mod.ko
install -D -m 755 ipfw/ipfw $RPM_BUILD_ROOT/sbin/ipfw
install -D -m 644 planetlab/ipfw.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/ipfw.cron
install -D -m 755 planetlab/ipfw $RPM_BUILD_ROOT/etc/rc.d/init.d/ipfw

%clean
rm -rf $RPM_BUILD_ROOT

%post
### this script is also triggered while the node image is being created at build-time
# some parts of the script do not make sense in this context
# this is why the build exports PL_BOOTCD=1 in such cases
depmod -a %{kernel_id}
/sbin/chkconfig --add ipfw
# start the service if not building
# (if/fi rather than "test && cmd": the scriptlet must not end with a
# non-zero status just because PL_BOOTCD is set)
if [ -z "$PL_BOOTCD" ]; then
    service ipfw start
fi

%postun
# stop the service if not building
# (same construct as above, so a set PL_BOOTCD is not reported as a failure)
if [ -z "$PL_BOOTCD" ]; then
    service ipfw stop
fi

# here there is a list of the final installation directories
%files
%defattr(-,root,root)
%dir /lib/modules/%{kernel_id}
/lib/modules/%{kernel_id}/net/netfilter/ipfw_mod.ko
/sbin/ipfw
%{_sysconfdir}/cron.d/ipfw.cron
/etc/rc.d/init.d/ipfw

%changelog
* Mon Jul 09 2012 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-20120610-2
- cosmetic changes only in specfile

* Fri Jun 15 2012 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-20120610-1
- integrated ipfw3 as of 20120610 from upstream

* Mon Oct 24 2011 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-23
- for building against k32 on f8

* Sun Oct 02 2011 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-22
- rpm version number has the kernel taglevel embedded

* Fri Jun 10 2011 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-21
- build tweaks for gcc-4.6 on f15

* Sun Jan 23 2011 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-20
- tweaks for compiling on k32/64 bits

* Wed Dec 08 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-19
- fix detection of kernel conventions

* Tue Dec 07 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-18
- guess conventions for either <=k27 or >=k32

* Tue Jun 15 2010 Baris Metin <Talip-Baris.Metin@sophia.inria.fr> - ipfw-0.9-17
- testing git only module-tag

* Tue Jun 15 2010 Baris Metin <Talip-Baris.Metin@sophia.inria.fr> - ipfw-0.9-16
- tagging ipfw to test module-tools on (pure) git

* Wed May 12 2010 Talip Baris Metin <Talip-Baris.Metin@sophia.inria.fr> - ipfw-0.9-15
- tagging for obsoletes

* Tue Apr 27 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-13
- Update to the ipfw3 version of the dummynet code.

* Mon Apr 12 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-11
- add ipfw initialization script to chkconfig

* Wed Mar 03 2010 Talip Baris Metin <Talip-Baris.Metin@sophia.inria.fr> - ipfw-0.9-10
- - Load module at installation - Marta

* Mon Jan 11 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-9
- consistent with vsys-scripts-0.95-13

* Mon Jan 11 2010 Marta Carbone <marta.carbone@iet.unipi.it>
- Integrated the ipfw rules cleanup into the backend

* Sat Jan 09 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-8
- builds on 2.6.22 & 2.6.27 - for 32 and 64 bits

* Wed Jan 06 2010 Marta Carbone <marta.carbone@iet.unipi.it>
- move to dummynet2, added support for table lookup
- added the vsys-script dependencies and the ipfw initialization

* Tue Dec 15 2009 Marta Carbone <marta.carbone@iet.unipi.it>
- more work on the radix code, added sysctl read/write support

* Sun Nov 29 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-7
- added missing qsort.c - tag 0.9-6 was broken

* Thu Nov 26 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-6
- root: removed goto into the main ipfw switch, enabled slice_id matching
- slice: completely move netconfig checks into the backend

* Mon Nov 09 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-5
- additional features on matching packets, including uid match

* Mon Sep 07 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-4
- on behalf of Marta Carbone, more options and features

* Thu Jul 23 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-3
- fixed memory usage issue

* Wed Jul 15 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-2
- patch for building on x86_64

* Thu Jun 25 2009 Marta Carbone <marta.carbone@iet.unipi.it>
- post installation removed for deployment, moved manpages to the slice package

* Fri Apr 17 2009 Marta Carbone <marta.carbone@iet.unipi.it>
- Initial release


================================================
FILE: planetlab/ipfwslice.spec
================================================
#
# TODO:
# restart crond
# modprobe ipfw_mod.ko (depmod ?)
#

# Marta Carbone <marta.carbone@iet.unipi.it>
# 2009 - Universita` di Pisa
# License is BSD.

# the 2012 release was pulled from http://info.iet.unipi.it/~marta/dummynet/ipfw3-20120610.tar.gz
# see also           http://sourceforge.net/p/dummynet/code
# in 2013 Marta has moved to sourceforge at
# git clone git://git.code.sf.net/p/dummynet/code your read-only code
%define name ipfwslice
%define version 3
%define taglevel 1

%define release %{taglevel}%{?pldistro:.%{pldistro}}%{?date:.%{date}}

Summary: ipfw and dummynet for Linux
Name: %{name}
Version: %{version}
Release: %{release}
License: BSD
Group: System Environment/Kernel
Source0: %{name}-%{version}.tar.bz2
BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-buildroot

Vendor: unipi
Packager: PlanetLab <marta@onelab2.iet.unipi.it>
Distribution: PlanetLab %{plrelease}
URL: %{SCMURL}

%description
the frontend part of the ipfw planetlab package

%prep
%setup

%build
rm -rf $RPM_BUILD_ROOT

%install
# frontend script: executable
install -D -m 755 planetlab/netconfig $RPM_BUILD_ROOT/sbin/netconfig
# man page: plain data file, so no execute bits
# NOTE(review): nothing in this spec generates planetlab/ipfw.8.gz; confirm
# that the source tarball ships it.
install -D -m 644 planetlab/ipfw.8.gz $RPM_BUILD_ROOT/%{_mandir}/man8/ipfw.8.gz

%clean
rm -rf $RPM_BUILD_ROOT

# here there is a list of the final installation directories
%files
%defattr(-,root,root)
/sbin/netconfig
%{_mandir}/man8/ipfw.8*

%changelog
* Mon Jul 09 2012 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-20120610-2
- cosmetic changes only in specfile

* Fri Jun 15 2012 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-20120610-1
- integrated ipfw3 as of 20120610 from upstream

* Mon Oct 24 2011 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-23
- for building against k32 on f8

* Sun Oct 02 2011 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-22
- rpm version number has the kernel taglevel embedded

* Fri Jun 10 2011 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-21
- build tweaks for gcc-4.6 on f15

* Sun Jan 23 2011 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-20
- tweaks for compiling on k32/64 bits

* Wed Dec 08 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-19
- fix detection of kernel conventions

* Tue Dec 07 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-18
- guess conventions for either <=k27 or >=k32

* Tue Jun 15 2010 Baris Metin <Talip-Baris.Metin@sophia.inria.fr> - ipfw-0.9-17
- testing git only module-tag

* Tue Jun 15 2010 Baris Metin <Talip-Baris.Metin@sophia.inria.fr> - ipfw-0.9-16
- tagging ipfw to test module-tools on (pure) git

* Wed May 12 2010 Talip Baris Metin <Talip-Baris.Metin@sophia.inria.fr> - ipfw-0.9-15
- tagging for obsoletes

* Tue Apr 27 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-13
- Update to the ipfw3 version of the dummynet code.

* Mon Apr 12 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-11
- add ipfw initialization script to chkconfig

* Wed Mar 03 2010 Talip Baris Metin <Talip-Baris.Metin@sophia.inria.fr> - ipfw-0.9-10
- - Load module at installation - Marta

* Mon Jan 11 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-9
- consistent with vsys-scripts-0.95-13

* Sat Jan 09 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-8
- builds on 2.6.22 & 2.6.27 - for 32 and 64 bits

* Tue Dec 15 2009 Marta Carbone <marta.carbone@iet.unipi.it>
- more work on the radix code, added sysctl read/write support

* Sun Nov 29 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-7
- added missing qsort.c - tag 0.9-6 was broken

* Thu Nov 26 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-6
- root: removed goto into the main ipfw switch, enabled slice_id matching
- slice: completely move netconfig checks into the backend

* Mon Nov 09 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-5
- additional features on matching packets, including uid match

* Mon Sep 07 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-4
- on behalf of Marta Carbone, more options and features

* Thu Jul 23 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-3
- fixed memory usage issue

* Wed Jul 15 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-2
- patch for building on x86_64

* Thu Jun 25 2009 Marta Carbone <marta.carbone@iet.unipi.it>
- Initial release


================================================
FILE: planetlab/netconfig
================================================
#!/bin/sh
#
# Marta Carbone, Luigi Rizzo
# Copyright (C) 2009 Universita` di Pisa
# $Id: netconfig 4533 2009-12-16 14:39:23Z luigi $
#
# This script is the frontend to be used with the vsys system.
# It simply passes information to the backend and gets back the reply

PIPE_IN=/vsys/ipfw-be.in
PIPE_OUT=/vsys/ipfw-be.out

# Send the request to the backend.  The caller's arguments are handed to the
# root shell as positional parameters and never become part of the command
# text, so shell metacharacters in them are written to the pipe verbatim
# instead of being executed or expanded as root.
sudo sh -c 'out=$1; shift; printf "%s\n" "$*" >> "$out"' sh "${PIPE_IN}" "$@"
# Read back the reply
sudo sh -c "cat ${PIPE_OUT}"


================================================
FILE: planetlab/planetlab-tags.mk
================================================
# $Id: planetlab-tags.mk 7450 2010-10-18 11:17:43Z marta $
# These are good to build the ipfw modules from svn on kernels 2.6.22
# and are used to fetch files from the onelab2 repository.
linux-2.6-SVNBRANCH	:= 22
linux-2.6-SVNPATH	:= http://svn.planet-lab.org/svn/linux-2.6/tags/linux-2.6-22-39-1
ipfwsrc-SVNPATH		:= svn+ssh://luigi%40onelab2.iet.unipi.it/home/svn/ports-luigi/dummynet-branches/ipfw3


================================================
FILE: planetlab/planetlab.mk
================================================
# $Id: planetlab.mk 4533 2009-12-16 14:39:23Z luigi $
# .mk file to build a module
# Kernel package: built from the linux-2.6 module, from its source rpm.
# On i386 hosts the rpm target is forced to i686.
# (No trailing blanks after the values: with ':=' they would become part of
# the variable.)
kernel-MODULES := linux-2.6
kernel-SPEC := kernel-2.6.spec
kernel-BUILD-FROM-SRPM := yes
ifeq "$(HOSTARCH)" "i386"
kernel-RPMFLAGS:= --target i686
else
kernel-RPMFLAGS:= --target $(HOSTARCH)
endif
ALL += kernel

# Root-context package: needs kernel-devel and is told which kernel it is
# being built against.
ipfwroot-MODULES := ipfwsrc
ipfwroot-SPEC := planetlab/ipfwroot.spec
ipfwroot-DEPEND-DEVEL-RPMS := kernel-devel
ipfwroot-SPECVARS = kernel_version=$(kernel.rpm-version) \
        kernel_release=$(kernel.rpm-release) \
        kernel_arch=$(kernel.rpm-arch)
ALL += ipfwroot

# Slice-context package: receives the same kernel variables.
ipfwslice-MODULES := ipfwsrc
ipfwslice-SPEC := planetlab/ipfwslice.spec
ipfwslice-SPECVARS = kernel_version=$(kernel.rpm-version) \
        kernel_release=$(kernel.rpm-release) \
        kernel_arch=$(kernel.rpm-arch)
ALL += ipfwslice


================================================
FILE: planetlab/sample_hook
================================================
#!/bin/sh

#
# Marta Carbone <marta.carbone@iet.unipi.it>
# 2009 - Universita` di Pisa
#
# This is a sample hook file in charge to collect
# statistical information on netconfig usage. It dumps
# on a log file slicename, port and the configuration string
# used to configure a dummynet experiment.
#
# Each time a user configures a dummynet port, this file
# will be executed.
# The following variables will be passed as argument:
# 
# ${SLICE} ${PORT} ${CONFIG_STRING} 
# ${SLICE} The slicename executing the netconfig command
# ${PORT} The port to be configured
# ${CONFIG_STRING} The configuration string
#
# Note that this script can get additional information
# by executing the ipfw command, e.g.
# ipfw list		# list of installed rules
# ipfw show		# list of rules and statistical information
# ipfw pipe show	# list of pipes
#
# a complete list of ipfw commands is available at:
# http://www.freebsd.org/cgi/man.cgi?query=ipfw&sektion=8

# logfile
LOG_FILE=/tmp/ipfw_hook.log

# Append a timestamp line, then the arguments received (slice, port,
# configuration string) on a line of their own.  date writes straight to
# the log: "echo -e" is not understood by every /bin/sh and would end up
# in the file as a literal "-e".
date >> "${LOG_FILE}"
echo "$*" >> "${LOG_FILE}"


================================================
FILE: sys/net/if.h
================================================
#include <linux/if.h>


================================================
FILE: sys/net/pfil.h
================================================
/*	$FreeBSD: src/sys/net/pfil.h,v 1.16 2007/06/08 12:43:25 gallatin Exp $ */
/*	$NetBSD: pfil.h,v 1.22 2003/06/23 12:57:08 martin Exp $	*/

/*-
 * Copyright (c) 1996 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef _NET_PFIL_H_
#define _NET_PFIL_H_

#include <sys/systm.h>
#include <sys/queue.h>
#include <sys/_lock.h>
#include <sys/_mutex.h>
#include <sys/lock.h>
#include <sys/rmlock.h>

struct mbuf;
struct ifnet;
struct inpcb;

/*
 * The packet filter hooks are designed for anything to call them to
 * possibly intercept the packet.
 */
/* One registered hook: a callback plus an argument, linked into a pfil_list_t. */
struct packet_filter_hook {
        TAILQ_ENTRY(packet_filter_hook) pfil_link;	/* tail-queue linkage */
	/* the hook function itself */
	int	(*pfil_func)(void *, struct mbuf **, struct ifnet *, int,
		    struct inpcb *);
	/* NOTE(review): presumably passed back as pfil_func's first argument -- confirm */
	void	*pfil_arg;
};

/*
 * Flag bits.  PFIL_IN and PFIL_OUT select the ph_in / ph_out hook list
 * (see pfil_hook_get()); PFIL_ALL is both of them.
 * NOTE(review): PFIL_WAITOK is not used in this header -- presumably it
 * allows the caller to sleep; confirm against the implementation.
 */
#define PFIL_IN		0x00000001
#define PFIL_OUT	0x00000002
#define PFIL_WAITOK	0x00000004
#define PFIL_ALL	(PFIL_IN|PFIL_OUT)

/* A tail queue of packet_filter_hook entries. */
typedef	TAILQ_HEAD(pfil_list, packet_filter_hook) pfil_list_t;

/* Values describing how the key stored in a pfil_head is to be read. */
#define	PFIL_TYPE_AF		1	/* key is AF_* type */
#define	PFIL_TYPE_IFNET		2	/* key is ifnet pointer */

/*
 * A filtering point: two hook lists (inbound and outbound), a hook
 * counter, a lock and the key identifying the point.
 */
struct pfil_head {
	pfil_list_t	ph_in;		/* hooks returned for PFIL_IN */
	pfil_list_t	ph_out;		/* hooks returned for PFIL_OUT */
	int		ph_type;	/* presumably one of PFIL_TYPE_* -- confirm */
	int		ph_nhooks;	/* tested by PFIL_HOOKED() */
	/* The lock member differs, in both type and name, by platform. */
#if defined( __linux__ ) || defined( _WIN32 )
	rwlock_t	ph_mtx;
#else
	struct rmlock	ph_lock;
#endif
	/* The key: either an integer value or a pointer. */
	union {
		u_long		phu_val;
		void		*phu_ptr;
	} ph_un;
#define	ph_af		ph_un.phu_val
#define	ph_ifnet	ph_un.phu_ptr
	LIST_ENTRY(pfil_head) ph_list;	/* linkage in a list of pfil_head */
};

/*
 * Hook management entry points.
 * NOTE(review): the implementations are not part of this header; the
 * one-line descriptions below are inferred from names and signatures
 * only -- confirm against the implementation before relying on them.
 */
/* presumably: attach / detach a (function, argument) pair to a pfil_head */
int	pfil_add_hook(int (*func)(void *, struct mbuf **, struct ifnet *,
	    int, struct inpcb *), void *, int, struct pfil_head *);
int	pfil_remove_hook(int (*func)(void *, struct mbuf **, struct ifnet *,
	    int, struct inpcb *), void *, int, struct pfil_head *);
/* presumably: call the hooks of a pfil_head on a packet */
int	pfil_run_hooks(struct pfil_head *, struct mbuf **, struct ifnet *,
	    int, struct inpcb *inp);

/* presumably: add / remove a pfil_head to / from the global list */
int	pfil_head_register(struct pfil_head *);
int	pfil_head_unregister(struct pfil_head *);

/* presumably: find a registered pfil_head by (type, key) */
struct pfil_head *pfil_head_get(int, u_long);

/* True when at least one hook is counted on the pfil_head. */
#define	PFIL_HOOKED(p) ((p)->ph_nhooks > 0)
/*
 * Per-head lock helpers, thin wrappers around the rm_*() lock calls on
 * the ph_lock member.
 * NOTE(review): struct pfil_head only has ph_lock when neither __linux__
 * nor _WIN32 is defined (otherwise the member is ph_mtx), so as written
 * these macros can only be expanded in that configuration.
 */
#define	PFIL_LOCK_INIT(p) \
    rm_init_flags(&(p)->ph_lock, "PFil hook read/write mutex", RM_RECURSE)
#define	PFIL_LOCK_DESTROY(p) rm_destroy(&(p)->ph_lock)
#define PFIL_RLOCK(p, t) rm_rlock(&(p)->ph_lock, (t))
#define PFIL_WLOCK(p) rm_wlock(&(p)->ph_lock)
#define PFIL_RUNLOCK(p, t) rm_runlock(&(p)->ph_lock, (t))
#define PFIL_WUNLOCK(p) rm_wunlock(&(p)->ph_lock)
/* Global lock helpers; pfil_global_lock is not declared in this header. */
#define PFIL_LIST_LOCK() mtx_lock(&pfil_global_lock)
#define PFIL_LIST_UNLOCK() mtx_unlock(&pfil_global_lock)

/*
 * Return the first hook of the list selected by dir: ph_in for PFIL_IN,
 * ph_out for PFIL_OUT.  Any other value of dir yields NULL.
 */
static __inline struct packet_filter_hook *
pfil_hook_get(int dir, struct pfil_head *ph)
{

	switch (dir) {
	case PFIL_IN:
		return (TAILQ_FIRST(&ph->ph_in));
	case PFIL_OUT:
		return (TAILQ_FIRST(&ph->ph_out));
	default:
		return (NULL);
	}
}

#endif /* _NET_PFIL_H_ */


================================================
FILE: sys/net/radix.c
================================================
/*-
 * Copyright (c) 1988, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)radix.c	8.5 (Berkeley) 5/19/95
 * $FreeBSD: head/sys/net/radix.c 200354 2009-12-10 10:34:30Z luigi $
 */

/*
 * Routines to build and maintain radix trees for routing lookups.
 */
#include <sys/param.h>
#ifdef	_KERNEL
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/syslog.h>
#include <net/radix.h>
#include "opt_mpath.h"
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
#else /* !_KERNEL */
#include <stdio.h>
#include <strings.h>
#include <stdlib.h>
#define log(x, arg...)	fprintf(stderr, ## arg)
#define panic(x)	fprintf(stderr, "PANIC: %s", x), exit(1)
#define min(a, b) ((a) < (b) ? (a) : (b) )
#include <net/radix.h>
#endif /* !_KERNEL */

/* Forward declarations of the file-local routines. */
static int	rn_walktree_from(struct radix_node_head *h, void *a, void *m,
		    walktree_f_t *f, void *w);
static int rn_walktree(struct radix_node_head *, walktree_f_t *, void *);
static struct radix_node
	 *rn_insert(void *, struct radix_node_head *, int *,
	     struct radix_node [2]),
	 *rn_newpair(void *, int, struct radix_node[2]),
	 *rn_search(void *, struct radix_node *),
	 *rn_search_m(void *, struct radix_node *, void *);

/* Upper bound applied to mask lengths, and size of the work buffers. */
static int	max_keylen;
/* Free list of radix_mask entries, managed by MKGet() / MKFree(). */
static struct radix_mask *rn_mkfreelist;
/* Head of the tree in which rn_addmask() stores the masks themselves. */
static struct radix_node_head *mask_rnhead;
/*
 * Work area -- the following point to 3 buffers of size max_keylen,
 * allocated in this order in a block of memory malloc'ed by rn_init.
 * rn_zeros, rn_ones are set in rn_init and used in readonly afterwards.
 * addmask_key is used in rn_addmask in rw mode and not thread-safe.
 */
static char *rn_zeros, *rn_ones, *addmask_key;

/*
 * MKGet(m): set m to a radix_mask taken from the free list if it is not
 * empty, otherwise obtained through R_Malloc().  The entry is not
 * cleared here.
 */
#define MKGet(m) {						\
	if (rn_mkfreelist) {					\
		m = rn_mkfreelist;				\
		rn_mkfreelist = (m)->rm_mklist;			\
	} else							\
		R_Malloc(m, struct radix_mask *, sizeof (struct radix_mask)); }
 
/* MKFree(m): push m on the free list (the memory is not released). */
#define MKFree(m) { (m)->rm_mklist = rn_mkfreelist; rn_mkfreelist = (m);}

/* Top node of the mask tree. */
#define rn_masktop (mask_rnhead->rnh_treetop)

/* More forward declarations of file-local routines. */
static int	rn_lexobetter(void *m_arg, void *n_arg);
static struct radix_mask *
		rn_new_radix_mask(struct radix_node *tt,
		    struct radix_mask *next);
static int	rn_satisfies_leaf(char *trial, struct radix_node *leaf,
		    int skip);

/*
 * The data structure for the keys is a radix tree with one way
 * branching removed.  The index rn_bit at an internal node n represents a bit
 * position to be tested.  The tree is arranged so that all descendants
 * of a node n have keys whose bits all agree up to position rn_bit - 1.
 * (We say the index of n is rn_bit.)
 *
 * There is at least one descendant which has a one bit at position rn_bit,
 * and at least one with a zero there.
 *
 * A route is determined by a pair of key and mask.  We require that the
 * bit-wise logical and of the key and mask be the key.
 * We define the index of a route associated with the mask to be
 * the first bit number in the mask where 0 occurs (with bit number 0
 * representing the highest order bit).
 *
 * We say a mask is normal if every bit is 0, past the index of the mask.
 * If a node n has a descendant (k, m) with index(m) == index(n) == rn_bit,
 * and m is a normal mask, then the route applies to every descendant of n.
 * If the index(m) < rn_bit, this implies the trailing last few bits of k
 * before bit b are all 0, (and hence consequently true of every descendant
 * of n), so the route applies to all descendants of the node as well.
 *
 * Similar logic shows that a non-normal mask m such that
 * index(m) <= index(n) could potentially apply to many children of n.
 * Thus, for each non-host route, we attach its mask to a list at an internal
 * node as high in the tree as we can go.
 *
 * The present version of the code makes use of normal routes in short-
 * circuiting an explicit mask and compare operation when testing whether
 * a key satisfies a normal route, and also in remembering the unique leaf
 * that governs a subtree.
 */

/*
 * Most of the functions in this code assume that the key/mask arguments
 * are sockaddr-like structures, where the first byte is an u_char
 * indicating the size of the entire structure.
 *
 * To make the assumption more explicit, we use the LEN() macro to access
 * this field. It is safe to pass an expression with side effects
 * to LEN() as the argument is evaluated only once.
 * We cast the result to int as this is the dominant usage.
 */
#define LEN(x) ( (int) (*(const u_char *)(x)) )

/*
 * XXX THIS NEEDS TO BE FIXED
 * In the code, pointers to keys and masks are passed as either
 * 'void *' (because callers use to pass pointers of various kinds), or
 * 'caddr_t' (which is fine for pointer arithmetics, but not very
 * clean when you dereference it to access data). Furthermore, caddr_t
 * is really 'char *', while the natural type to operate on keys and
 * masks would be 'u_char'. This mismatch require a lot of casts and
 * intermediate variables to adapt types that clutter the code.
 */

/*
 * Descend from head to a leaf, steered by the bits of the key v_arg.
 * At each internal node (rn_bit >= 0) the key byte at rn_offset is tested
 * against rn_bmask: a set bit goes right, a clear bit goes left.
 * Returns the leaf reached; the caller must compare the keys itself.
 */
static struct radix_node *
rn_search(void *v_arg, struct radix_node *head)
{
	caddr_t key = v_arg;
	struct radix_node *node = head;

	while (node->rn_bit >= 0)
		node = (node->rn_bmask & key[node->rn_offset]) ?
		    node->rn_right : node->rn_left;
	return (node);
}

/*
 * Descend from head to a leaf like rn_search(), but under a mask m_arg:
 * the right branch is taken only when the tested bit is set in both the
 * mask and the key v_arg; in every other case the descent goes left.
 * XXX note this function is used only once.
 */
static struct radix_node *
rn_search_m(void *v_arg, struct radix_node *head, void *m_arg)
{
	caddr_t key = v_arg, mask = m_arg;
	struct radix_node *node = head;

	while (node->rn_bit >= 0) {
		if (!(node->rn_bmask & mask[node->rn_offset]) ||
		    !(node->rn_bmask & key[node->rn_offset]))
			node = node->rn_left;
		else
			node = node->rn_right;
	}
	return node;
}

/*
 * Return 1 when mask m_arg is strictly more specific than mask n_arg,
 * i.e. every bit set in n is also set in m and the two masks are not
 * identical.  Return 0 otherwise (n has a bit that m lacks, or the masks
 * are the same).  Both masks are byte strings whose first byte is their
 * length (see LEN()); a mask that is shorter than the other is treated
 * as if padded with zero bytes.
 */
int
rn_refines(m_arg, n_arg)
	void *m_arg, *n_arg;
{
	register caddr_t m = m_arg, n = n_arg;
	/* lim2: end of n; lim: end of the part of n that m also covers */
	register caddr_t lim, lim2 = lim = n + LEN(n);
	/* > 0 when n is longer than m; also steps both past the length byte */
	int longer = LEN(n++) - LEN(m++);
	int masks_are_equal = 1;

	if (longer > 0)
		lim -= longer;
	/* common part: n must not have any bit that m does not have */
	while (n < lim) {
		if (*n & ~(*m))
			return 0;
		if (*n++ != *m++)
			masks_are_equal = 0;
	}
	/* bytes of n beyond the end of m must all be zero */
	while (n < lim2)
		if (*n++)
			return 0;
	/* equal so far but m is longer: any non-zero extra byte refines n */
	if (masks_are_equal && (longer < 0))
		for (lim2 = m - longer; m < lim2; )
			if (*m++)
				return 1;
	return (!masks_are_equal);
}

/*
 * Look up key v_arg in the tree under head.
 * Without a mask (m_arg == NULL) this is just rn_match().
 * With a mask, the mask is first resolved to its unique copy in the mask
 * tree (search only, nothing is created); then the duped-key chain of the
 * leaf found by rn_match() is scanned for the entry carrying exactly that
 * mask.  Returns 0 if the mask is unknown or no such entry exists.
 */
struct radix_node *
rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head)
{
	struct radix_node *leaf;
	caddr_t wanted = NULL;

	if (m_arg != NULL) {
		struct radix_node *masknode;

		masknode = rn_addmask(m_arg, 1, head->rnh_treetop->rn_offset);
		if (masknode == NULL)
			return (0);
		wanted = masknode->rn_key;
	}
	leaf = rn_match(v_arg, head);
	if (wanted == NULL)
		return leaf;
	/* masks are shared, so comparing the pointers is enough */
	while (leaf != NULL && leaf->rn_mask != wanted)
		leaf = leaf->rn_dupedkey;
	return leaf;
}

/*
 * Return 1 if key trial matches the route stored in leaf, 0 otherwise.
 * Bytes from index skip up to the shortest of the lengths involved
 * (trial, the leaf key and, if present, the leaf mask) are compared under
 * the mask; a leaf without a mask is compared under an all-ones mask.
 */
static int
rn_satisfies_leaf(char *trial, struct radix_node *leaf, int skip)
{
	char *key = leaf->rn_key;
	char *mask = leaf->rn_mask;
	int nbytes = min(LEN(trial), LEN(key));
	int i;

	if (mask == NULL)
		mask = rn_ones;
	else
		nbytes = min(nbytes, LEN(mask));
	for (i = skip; i < nbytes; i++) {
		/* a difference only matters where the mask has a bit set */
		if ((trial[i] ^ key[i]) & mask[i])
			return 0;
	}
	return 1;
}

/*
 * Find the leaf that best matches key v_arg in the tree under head.
 * Returns the matching leaf, or 0 when nothing matches.
 *
 * The search has three stages:
 *  1. descend to a leaf and compare the key bytes; on an exact match
 *     return that leaf (or its rn_dupedkey if the leaf is a root node);
 *  2. otherwise try the entries of the leaf's duped-key chain as routes
 *     to a net;
 *  3. otherwise walk up through the parents, trying the masks attached
 *     to each node (rn_mklist), until the top of the tree was examined.
 */
struct radix_node *
rn_match(v_arg, head)
	void *v_arg;
	struct radix_node_head *head;
{
	caddr_t v = v_arg;
	register struct radix_node *t = head->rnh_treetop, *x;
	register caddr_t cp = v, cp2;
	caddr_t cplim;
	struct radix_node *saved_t, *top = t;
	int off = t->rn_offset, vlen = LEN(cp), matched_off;
	register int test, b, rn_bit;

	/*
	 * Open code rn_search(v, top) to avoid overhead of extra
	 * subroutine call.
	 */
	for (; t->rn_bit >= 0; ) {
		if (t->rn_bmask & cp[t->rn_offset])
			t = t->rn_right;
		else
			t = t->rn_left;
	}
	/*
	 * See if we match exactly as a host destination
	 * or at least learn how many bits match, for normal mask finesse.
	 *
	 * It doesn't hurt us to limit how many bytes to check
	 * to the length of the mask, since if it matches we had a genuine
	 * match and the leaf we have is the most specific one anyway;
	 * if it didn't match with a shorter length it would fail
	 * with a long one.  This wins big for class B&C netmasks which
	 * are probably the most common case...
	 */
	if (t->rn_mask)
		vlen = *(u_char *)t->rn_mask;
	cp += off; cp2 = t->rn_key + off; cplim = v + vlen;
	for (; cp < cplim; cp++, cp2++)
		if (*cp != *cp2)
			goto on1;
	/*
	 * This extra grot is in case we are explicitly asked
	 * to look up the default.  Ugh!
	 *
	 * Never return the root node itself, it seems to cause a
	 * lot of confusion.
	 */
	if (t->rn_flags & RNF_ROOT)
		t = t->rn_dupedkey;
	return t;
on1:
	/*
	 * cp / cp2 point at the first differing byte.  Compute b, the bit
	 * number of the first differing bit counted from the start of the
	 * key, and rn_bit = -1 - b, which is directly comparable with the
	 * rn_bit of leaves and the rm_bit of mask annotations.
	 */
	test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */
	for (b = 7; (test >>= 1) > 0;)
		b--;
	matched_off = cp - v;
	b += matched_off << 3;
	rn_bit = -1 - b;
	/*
	 * If there is a host route in a duped-key chain, it will be first.
	 */
	if ((saved_t = t)->rn_mask == 0)
		t = t->rn_dupedkey;
	for (; t; t = t->rn_dupedkey)
		/*
		 * Even if we don't match exactly as a host,
		 * we may match if the leaf we wound up at is
		 * a route to a net.
		 */
		if (t->rn_flags & RNF_NORMAL) {
			/* normal mask: comparing the indexes is sufficient */
			if (rn_bit <= t->rn_bit)
				return t;
		} else if (rn_satisfies_leaf(v, t, matched_off))
				return t;
	t = saved_t;
	/* start searching up the tree */
	do {
		register struct radix_mask *m;
		t = t->rn_parent;
		m = t->rn_mklist;
		/*
		 * If non-contiguous masks ever become important
		 * we can restore the masking and open coding of
		 * the search and satisfaction test and put the
		 * calculation of "off" back before the "do".
		 */
		while (m) {
			if (m->rm_flags & RNF_NORMAL) {
				if (rn_bit <= m->rm_bit)
					return (m->rm_leaf);
			} else {
				/*
				 * Non-normal mask: search again below t under
				 * the mask, pick the entry with this very
				 * mask and verify it byte by byte.
				 */
				off = min(t->rn_offset, matched_off);
				x = rn_search_m(v, t, m->rm_mask);
				while (x && x->rn_mask != m->rm_mask)
					x = x->rn_dupedkey;
				if (x && rn_satisfies_leaf(v, x, off))
					return x;
			}
			m = m->rm_mklist;
		}
	} while (t != top);
	return 0;
}

#ifdef RN_DEBUG
/* Debug-only bookkeeping, compiled in with RN_DEBUG. */
int	rn_nodenum;		/* counter used to number nodes (rn_info) */
struct	radix_node *rn_clist;	/* head of the creation list (via rn_ybro) */
int	rn_saveinfo;		/* NOTE(review): not used in the code shown here */
int	rn_debug =  1;		/* non-zero: rn_insert logs and dumps the tree */
#endif

/*
 * Whenever we add a new leaf to the tree, we also add a parent node,
 * so we allocate them as an array of two elements: the first one must be
 * the leaf (see RNTORT() in route.c), the second one is the parent.
 * This routine initializes the relevant fields of the nodes, so that
 * the leaf is the left child of the parent node, and both nodes have
 * (almost) all fields filled as appropriate.
 * (XXX some fields are left unset, see the '#if 0' section).
 * The function returns a pointer to the parent node.
 */

/*
 * Initialize nodes[0] as a leaf holding key v and nodes[1] as the internal
 * node testing bit b, with the leaf hanging off its left branch.
 * Returns the internal node.
 */
static struct radix_node *
rn_newpair(void *v, int b, struct radix_node nodes[2])
{
	struct radix_node *leaf = &nodes[0];
	struct radix_node *parent = &nodes[1];

	/* internal node: test bit (b & 7) of the key byte at offset b / 8 */
	parent->rn_bit = b;
	parent->rn_offset = b >> 3;
	parent->rn_bmask = 0x80 >> (b & 7);
	parent->rn_left = leaf;
	parent->rn_flags = RNF_ACTIVE;
	parent->rn_mklist = 0;

#if 0  /* XXX perhaps we should fill these fields as well. */
	parent->rn_parent = parent->rn_right = NULL;

	leaf->rn_mask = NULL;
	leaf->rn_dupedkey = NULL;
	leaf->rn_bmask = 0;
#endif
	/* leaf: a negative rn_bit is what tells a leaf from an internal node */
	leaf->rn_bit = -1;
	leaf->rn_key = (caddr_t)v;
	leaf->rn_parent = parent;
	leaf->rn_flags = RNF_ACTIVE;
	leaf->rn_mklist = 0;
#ifdef RN_DEBUG
	leaf->rn_info = rn_nodenum++;
	parent->rn_info = rn_nodenum++;
	leaf->rn_twin = parent;
	leaf->rn_ybro = rn_clist;
	rn_clist = leaf;
#endif
	return parent;
}

/*
 * Insert key v_arg in the tree under head.
 *
 * If a leaf with the same key already exists, *dupentry is set to 1 and
 * that leaf is returned; nodes[] is left untouched.
 * Otherwise *dupentry is set to 0, nodes[] is initialized by rn_newpair()
 * (leaf + internal node testing the first differing bit) and spliced into
 * the tree; the new leaf is returned.
 */
static struct radix_node *
rn_insert(v_arg, head, dupentry, nodes)
	void *v_arg;
	struct radix_node_head *head;
	int *dupentry;
	struct radix_node nodes[2];
{
	caddr_t v = v_arg;
	struct radix_node *top = head->rnh_treetop;
	int head_off = top->rn_offset, vlen = LEN(v);
	/* t: the leaf the key leads to; it shares the longest prefix with v */
	register struct radix_node *t = rn_search(v_arg, top);
	register caddr_t cp = v + head_off;
	register int b;
	struct radix_node *tt;
    	/*
	 * Find first bit at which v and t->rn_key differ
	 */
    {
	register caddr_t cp2 = t->rn_key + head_off;
	register int cmp_res;
	caddr_t cplim = v + vlen;

	while (cp < cplim)
		if (*cp2++ != *cp++)
			goto on1;
	/* all bytes equal: the key is already in the tree */
	*dupentry = 1;
	return t;
on1:
	*dupentry = 0;
	/* cp and cp2 were advanced past the differing byte, hence [-1] */
	cmp_res = (cp[-1] ^ cp2[-1]) & 0xff;
	for (b = (cp - v) << 3; cmp_res; b--)
		cmp_res >>= 1;
    }
    {
	/*
	 * Descend again from the top and stop at the first node x whose
	 * bit index is not below b (leaves stop the loop as well, because
	 * of the unsigned comparison); p is the parent of x.
	 */
	register struct radix_node *p, *x = top;
	cp = v;
	do {
		p = x;
		if (cp[x->rn_offset] & x->rn_bmask)
			x = x->rn_right;
		else
			x = x->rn_left;
	} while (b > (unsigned) x->rn_bit);
				/* x->rn_bit < b && x->rn_bit >= 0 */
#ifdef RN_DEBUG
	if (rn_debug)
		log(LOG_DEBUG, "rn_insert: Going In:\n"), traverse(p);
#endif
	/* new internal node t takes the place of x below p */
	t = rn_newpair(v_arg, b, nodes); 
	tt = t->rn_left;
	if ((cp[p->rn_offset] & p->rn_bmask) == 0)
		p->rn_left = t;
	else
		p->rn_right = t;
	x->rn_parent = t;
	t->rn_parent = p; /* frees x, p as temp vars below */
	/* the new leaf goes on the side selected by bit b of the key */
	if ((cp[t->rn_offset] & t->rn_bmask) == 0) {
		t->rn_right = x;
	} else {
		t->rn_right = tt;
		t->rn_left = x;
	}
#ifdef RN_DEBUG
	if (rn_debug)
		log(LOG_DEBUG, "rn_insert: Coming Out:\n"), traverse(p);
#endif
    }
	return (tt);
}

/*
 * Find, and unless 'search' is non-zero create, the node of the mask tree
 * that holds netmask n_arg.
 *
 *   n_arg   the mask, a byte string whose first byte is its length
 *   search  non-zero: only look the mask up, never create it
 *   skip    number of leading bytes not taken from n_arg; 0 is treated
 *           as 1 (the length byte), bytes 1..skip-1 are set to all ones
 *
 * The mask is normalized in the static buffer addmask_key (length capped
 * at max_keylen, trailing zero bytes trimmed), so this function is not
 * thread-safe.
 *
 * Returns mask_rnhead->rnh_nodes when nothing is left of the mask beyond
 * 'skip', the existing or newly created node otherwise, or 0 when the
 * mask is not in the tree and 'search' is set, or when allocation fails.
 */
struct radix_node *
rn_addmask(n_arg, search, skip)
	int search, skip;
	void *n_arg;
{
	caddr_t netmask = (caddr_t)n_arg;
	register struct radix_node *x;
	register caddr_t cp, cplim;
	register int b = 0, mlen, j;
	int maskduplicated, m0, isnormal;
	struct radix_node *saved_x;
	/* length stored in addmask_key by the previous call */
	static int last_zeroed = 0;

	if ((mlen = LEN(netmask)) > max_keylen)
		mlen = max_keylen;
	if (skip == 0)
		skip = 1;
	if (mlen <= skip)
		return (mask_rnhead->rnh_nodes);
	if (skip > 1)
		bcopy(rn_ones + 1, addmask_key + 1, skip - 1);
	if ((m0 = mlen) > skip)
		bcopy(netmask + skip, addmask_key + skip, mlen - skip);
	/*
	 * Trim trailing zeroes.
	 */
	for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0;)
		cp--;
	mlen = cp - addmask_key;
	if (mlen <= skip) {
		if (m0 >= last_zeroed)
			last_zeroed = mlen;
		return (mask_rnhead->rnh_nodes);
	}
	/* clear the bytes between m0 and last_zeroed, presumably left
	   in the buffer by an earlier, longer mask */
	if (m0 < last_zeroed)
		bzero(addmask_key + m0, last_zeroed - m0);
	/* the first byte of the normalized mask is its trimmed length */
	*addmask_key = last_zeroed = mlen;
	x = rn_search(addmask_key, rn_masktop);
	if (bcmp(addmask_key, x->rn_key, mlen) != 0)
		x = 0;
	if (x || search)
		return (x);
	/* one chunk: two tree nodes followed by room for the mask bytes */
	R_Zalloc(x, struct radix_node *, max_keylen + 2 * sizeof (*x));
	if ((saved_x = x) == 0)
		return (0);
	netmask = cp = (caddr_t)(x + 2);
	bcopy(addmask_key, cp, mlen);
	x = rn_insert(cp, mask_rnhead, &maskduplicated, x);
	if (maskduplicated) {
		log(LOG_ERR, "rn_addmask: mask impossibly already in tree");
		Free(saved_x);
		return (x);
	}
	/*
	 * Calculate index of mask, and check for normalcy.
	 * First find the first byte with a 0 bit, then if there are
	 * more bits left (remember we already trimmed the trailing 0's),
	 * the pattern must be one of those in normal_chars[], or we have
	 * a non-contiguous mask.
	 */
	cplim = netmask + mlen;
	isnormal = 1;
	for (cp = netmask + skip; (cp < cplim) && *(u_char *)cp == 0xff;)
		cp++;
	if (cp != cplim) {
		static char normal_chars[] = {
			0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};

		/* b: number of leading one bits in the first non-0xff byte */
		for (j = 0x80; (j & *cp) != 0; j >>= 1)
			b++;
		if (*cp != normal_chars[b] || cp != (cplim - 1))
			isnormal = 0;
	}
	b += (cp - netmask) << 3;
	/* leaves store the index as a negative number: -1 - index */
	x->rn_bit = -1 - b;
	if (isnormal)
		x->rn_flags |= RNF_NORMAL;
	return (x);
}

static int	/* XXX: arbitrary ordering for non-contiguous masks */
rn_lexobetter(m_arg, n_arg)
	void *m_arg, *n_arg;
{
	register u_char *mp = m_arg, *np = n_arg, *lim;

	if (LEN(mp) > LEN(np))
		return 1;  /* not really, but need to check longer one first */
	if (LEN(mp) == LEN(np))
		for (lim = mp + LEN(mp); mp < lim;)
			if (*mp++ > *np++)
				return 1;
	return 0;
}

/*
 * Create the mask annotation for leaf tt and put it in front of 'next'.
 * The entry copies the leaf's index and flags; for a normal mask it
 * refers to the leaf itself, otherwise to the leaf's mask.  The leaf's
 * rn_mklist is pointed at the new entry.
 * Returns the new entry, or 0 (after logging) when none can be obtained,
 * in which case tt is left unchanged.
 */
static struct radix_mask *
rn_new_radix_mask(struct radix_node *tt, struct radix_mask *next)
{
	struct radix_mask *entry;

	MKGet(entry);
	if (entry == 0) {
		log(LOG_ERR, "Mask for route not entered\n");
		return (0);
	}
	/* entries coming from the free list are not clean */
	bzero(entry, sizeof *entry);
	entry->rm_mklist = next;
	entry->rm_flags = tt->rn_flags;
	entry->rm_bit = tt->rn_bit;
	if (!(tt->rn_flags & RNF_NORMAL))
		entry->rm_mask = tt->rn_mask;
	else
		entry->rm_leaf = tt;
	tt->rn_mklist = entry;
	return entry;
}

/*
 * Add the route (key v_arg, mask n_arg) to the tree under head.
 *
 *   v_arg      the key; it is referenced, not copied
 *   n_arg      the mask, or 0 for a host route
 *   head       the tree to add to
 *   treenodes  storage for the two nodes of the new entry (leaf first,
 *              internal node second), supplied by the caller
 *
 * Returns the new leaf, or 0 when the mask cannot be entered in the mask
 * tree or when an entry with the same key and mask already exists (the
 * latter is permitted with RADIX_MPATH on a multipath-capable head).
 *
 * Steps: obtain the unique copy of the mask; insert the key, or link the
 * new leaf in the duped-key chain of an existing key, most specific mask
 * first; then attach a mask annotation (struct radix_mask) to the highest
 * ancestor the route applies to, so that rn_match() finds it on its way up.
 */
struct radix_node *
rn_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
    struct radix_node treenodes[2])
{
	caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg;
	register struct radix_node *t, *x = 0, *tt;
	struct radix_node *saved_tt, *top = head->rnh_treetop;
	short b = 0, b_leaf = 0;
	int keyduplicated;
	caddr_t mmask;
	struct radix_mask *m, **mp;

	/*
	 * In dealing with non-contiguous masks, there may be
	 * many different routes which have the same mask.
	 * We will find it useful to have a unique pointer to
	 * the mask to speed avoiding duplicate references at
	 * nodes and possibly save time in calculating indices.
	 */
	if (netmask)  {
		if ((x = rn_addmask(netmask, 0, top->rn_offset)) == 0)
			return (0);
		b_leaf = x->rn_bit;
		b = -1 - x->rn_bit;
		netmask = x->rn_key;
	}
	/*
	 * Deal with duplicated keys: attach node to previous instance
	 */
	saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes);
	if (keyduplicated) {
		for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) {
#ifdef RADIX_MPATH
			/* permit multipath, if enabled for the family */
			if (rn_mpath_capable(head) && netmask == tt->rn_mask) {
				/*
				 * go down to the end of multipaths, so that
				 * new entry goes into the end of rn_dupedkey
				 * chain.
				 */
				do {
					t = tt;
					tt = tt->rn_dupedkey;
				} while (tt && t->rn_mask == tt->rn_mask);
				break;
			}
#endif
			/* same key and same mask: the route already exists */
			if (tt->rn_mask == netmask)
				return (0);
			if (netmask == 0 ||
			    (tt->rn_mask &&
			     ((b_leaf < tt->rn_bit) /* index(netmask) > node */
			      || rn_refines(netmask, tt->rn_mask)
			      || rn_lexobetter(netmask, tt->rn_mask))))
				break;
		}
		/*
		 * If the mask is not duplicated, we wouldn't
		 * find it among possible duplicate key entries
		 * anyway, so the above test doesn't hurt.
		 *
		 * We sort the masks for a duplicated key the same way as
		 * in a masklist -- most specific to least specific.
		 * This may require the unfortunate nuisance of relocating
		 * the head of the list.
		 *
		 * We also reverse, or doubly link the list through the
		 * parent pointer.
		 */
		if (tt == saved_tt) {
			struct	radix_node *xx = x;
			/* link in at head of list */
			(tt = treenodes)->rn_dupedkey = t;
			tt->rn_flags = t->rn_flags;
			tt->rn_parent = x = t->rn_parent;
			t->rn_parent = tt;			/* parent */
			if (x->rn_left == t)
				x->rn_left = tt;
			else
				x->rn_right = tt;
			saved_tt = tt; x = xx;
		} else {
			/* link in after t, in the middle or at the end */
			(tt = treenodes)->rn_dupedkey = t->rn_dupedkey;
			t->rn_dupedkey = tt;
			tt->rn_parent = t;			/* parent */
			if (tt->rn_dupedkey)			/* parent */
				tt->rn_dupedkey->rn_parent = tt; /* parent */
		}
#ifdef RN_DEBUG
		t=tt+1; tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++;
		tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt;
#endif
		tt->rn_key = (caddr_t) v;
		tt->rn_bit = -1;
		tt->rn_flags = RNF_ACTIVE;
	}
	/*
	 * Put mask in tree.
	 */
	if (netmask) {
		tt->rn_mask = netmask;
		tt->rn_bit = x->rn_bit;
		tt->rn_flags |= x->rn_flags & RNF_NORMAL;
	}
	t = saved_tt->rn_parent;
	if (keyduplicated)
		goto on2;
	/*
	 * A new internal node t was created: move to it the annotations
	 * found below it that apply to the whole of its subtree.
	 */
	b_leaf = -1 - t->rn_bit;
	if (t->rn_right == saved_tt)
		x = t->rn_left;
	else
		x = t->rn_right;
	/* Promote general routes from below */
	if (x->rn_bit < 0) {
	    for (mp = &t->rn_mklist; x; x = x->rn_dupedkey)
		if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) {
			*mp = m = rn_new_radix_mask(x, 0);
			if (m)
				mp = &m->rm_mklist;
		}
	} else if (x->rn_mklist) {
		/*
		 * Skip over masks whose index is > that of new node
		 */
		for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist)
			if (m->rm_bit >= b_leaf)
				break;
		t->rn_mklist = m; *mp = 0;
	}
on2:
	/* Add new route to highest possible ancestor's list */
	if ((netmask == 0) || (b > t->rn_bit ))
		return tt; /* can't lift at all */
	b_leaf = tt->rn_bit;
	do {
		x = t;
		t = t->rn_parent;
	} while (b <= t->rn_bit && x != top);
	/*
	 * Search through routes associated with node to
	 * insert new route according to index.
	 * Need same criteria as when sorting dupedkeys to avoid
	 * double loop on deletion.
	 */
	for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) {
		if (m->rm_bit < b_leaf)
			continue;
		if (m->rm_bit > b_leaf)
			break;
		if (m->rm_flags & RNF_NORMAL) {
			mmask = m->rm_leaf->rn_mask;
			if (tt->rn_flags & RNF_NORMAL) {
#if !defined(RADIX_MPATH)
			    log(LOG_ERR,
			        "Non-unique normal route, mask not entered\n");
#endif
				return tt;
			}
		} else
			mmask = m->rm_mask;
		if (mmask == netmask) {
			/* share the existing annotation */
			m->rm_refs++;
			tt->rn_mklist = m;
			return tt;
		}
		if (rn_refines(netmask, mmask)
		    || rn_lexobetter(netmask, mmask))
			break;
	}
	/*
	 * Link the new annotation in only if one was actually obtained:
	 * rn_new_radix_mask() returns 0 when allocation fails, and storing
	 * that in *mp would cut off every annotation that follows on the
	 * list.  On failure the route stays in the tree without annotation
	 * (the failure has already been logged).
	 */
	m = rn_new_radix_mask(tt, *mp);
	if (m)
		*mp = m;
	return tt;
}

/*
 * Remove the route (key v_arg, mask netmask_arg) from the tree under head.
 *
 * Returns the leaf that was removed, or 0 when the route is not found,
 * when its mask annotation is inconsistent, or when the leaf is a root
 * node.  The storage of the leaf and of its companion internal node
 * (tt and tt[1]) is not freed here; both are only marked inactive.
 *
 * Steps: locate the leaf (and, in a duped-key chain, the entry with the
 * requested mask); drop its mask annotation from the ancestor it was
 * lifted to; unlink the leaf; finally, if the internal node allocated
 * with this leaf is still in use elsewhere in the tree, copy it into a
 * node that is being released, so that tt[1] is free as well.
 */
struct radix_node *
rn_delete(v_arg, netmask_arg, head)
	void *v_arg, *netmask_arg;
	struct radix_node_head *head;
{
	register struct radix_node *t, *p, *x, *tt;
	struct radix_mask *m, *saved_m, **mp;
	struct radix_node *dupedkey, *saved_tt, *top;
	caddr_t v, netmask;
	int b, head_off, vlen;

	v = v_arg;
	netmask = netmask_arg;
	x = head->rnh_treetop;
	tt = rn_search(v, x);
	head_off = x->rn_offset;
	vlen =  LEN(v);
	saved_tt = tt;
	top = x;
	/* the leaf reached must hold exactly the requested key */
	if (tt == 0 ||
	    bcmp(v + head_off, tt->rn_key + head_off, vlen - head_off))
		return (0);
	/*
	 * Delete our route from mask lists.
	 */
	if (netmask) {
		/* search only: an unknown mask means an unknown route */
		if ((x = rn_addmask(netmask, 1, head_off)) == 0)
			return (0);
		netmask = x->rn_key;
		while (tt->rn_mask != netmask)
			if ((tt = tt->rn_dupedkey) == 0)
				return (0);
	}
	if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0)
		goto on1;
	if (tt->rn_flags & RNF_NORMAL) {
		if (m->rm_leaf != tt || m->rm_refs > 0) {
			log(LOG_ERR, "rn_delete: inconsistent annotation\n");
			return 0;  /* dangling ref could cause disaster */
		}
	} else {
		if (m->rm_mask != tt->rn_mask) {
			log(LOG_ERR, "rn_delete: inconsistent annotation\n");
			goto on1;
		}
		/* annotation still shared by other routes: keep it */
		if (--m->rm_refs >= 0)
			goto on1;
	}
	b = -1 - tt->rn_bit;
	t = saved_tt->rn_parent;
	if (b > t->rn_bit)
		goto on1; /* Wasn't lifted at all */
	/* same climb as in rn_addroute: x is the node holding the annotation */
	do {
		x = t;
		t = t->rn_parent;
	} while (b <= t->rn_bit && x != top);
	for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist)
		if (m == saved_m) {
			*mp = m->rm_mklist;
			MKFree(m);
			break;
		}
	if (m == 0) {
		log(LOG_ERR, "rn_delete: couldn't find our annotation\n");
		if (tt->rn_flags & RNF_NORMAL)
			return (0); /* Dangling ref to us */
	}
on1:
	/*
	 * Eliminate us from tree
	 */
	if (tt->rn_flags & RNF_ROOT)
		return (0);
#ifdef RN_DEBUG
	/* Get us out of the creation list */
	for (t = rn_clist; t && t->rn_ybro != tt; t = t->rn_ybro) {}
	if (t) t->rn_ybro = tt->rn_ybro;
#endif
	t = tt->rn_parent;
	dupedkey = saved_tt->rn_dupedkey;
	if (dupedkey) {
		/*
		 * Here, tt is the deletion target and
		 * saved_tt is the head of the dupekey chain.
		 */
		if (tt == saved_tt) {
			/* remove from head of chain */
			x = dupedkey; x->rn_parent = t;
			if (t->rn_left == tt)
				t->rn_left = x;
			else
				t->rn_right = x;
		} else {
			/* find node in front of tt on the chain */
			for (x = p = saved_tt; p && p->rn_dupedkey != tt;)
				p = p->rn_dupedkey;
			if (p) {
				p->rn_dupedkey = tt->rn_dupedkey;
				if (tt->rn_dupedkey)		/* parent */
					tt->rn_dupedkey->rn_parent = p;
								/* parent */
			} else log(LOG_ERR, "rn_delete: couldn't find us\n");
		}
		/*
		 * If the internal node paired with tt is in use, move its
		 * contents into the node paired with x and fix the links
		 * of its parent and children.
		 */
		t = tt + 1;
		if  (t->rn_flags & RNF_ACTIVE) {
#ifndef RN_DEBUG
			*++x = *t;
			p = t->rn_parent;
#else
			b = t->rn_info;
			*++x = *t;
			t->rn_info = b;
			p = t->rn_parent;
#endif
			if (p->rn_left == t)
				p->rn_left = x;
			else
				p->rn_right = x;
			x->rn_left->rn_parent = x;
			x->rn_right->rn_parent = x;
		}
		goto out;
	}
	/* no duplicates: the sibling x takes the place of the parent t */
	if (t->rn_left == tt)
		x = t->rn_right;
	else
		x = t->rn_left;
	p = t->rn_parent;
	if (p->rn_right == t)
		p->rn_right = x;
	else
		p->rn_left = x;
	x->rn_parent = p;
	/*
	 * Demote routes attached to us.
	 */
	if (t->rn_mklist) {
		if (x->rn_bit >= 0) {
			/* internal sibling: append our list to its list */
			for (mp = &x->rn_mklist; (m = *mp);)
				mp = &m->rm_mklist;
			*mp = t->rn_mklist;
		} else {
			/* If there are any key,mask pairs in a sibling
			   duped-key chain, some subset will appear sorted
			   in the same order attached to our mklist */
			for (m = t->rn_mklist; m && x; x = x->rn_dupedkey)
				if (m == x->rn_mklist) {
					struct radix_mask *mm = m->rm_mklist;
					x->rn_mklist = 0;
					if (--(m->rm_refs) < 0)
						MKFree(m);
					m = mm;
				}
			if (m)
				log(LOG_ERR,
				    "rn_delete: Orphaned Mask %p at %p\n",
				    m, x);
		}
	}
	/*
	 * We may be holding an active internal node in the tree.
	 */
	x = tt + 1;
	if (t != x) {
		/* reuse t, just unlinked, to hold the contents of tt[1] */
#ifndef RN_DEBUG
		*t = *x;
#else
		b = t->rn_info;
		*t = *x;
		t->rn_info = b;
#endif
		t->rn_left->rn_parent = t;
		t->rn_right->rn_parent = t;
		p = x->rn_parent;
		if (p->rn_left == x)
			p->rn_left = t;
		else
			p->rn_right = t;
	}
out:
	tt->rn_flags &= ~RNF_ACTIVE;
	tt[1].rn_flags &= ~RNF_ACTIVE;
	return (tt);
}

/*
 * This is the same as rn_walktree() except for the parameters and the
 * exit.
 *
 * Starting from the subtree of 'h' selected by key 'a' under mask 'm',
 * call f(leaf, w) on every leaf that is not marked RNF_ROOT, following
 * each leaf's rn_dupedkey chain.  Returns the first non-zero value
 * returned by 'f' (the walk stops there), or 0 once the walk is done.
 */
static int
rn_walktree_from(h, a, m, f, w)
	struct radix_node_head *h;
	void *a, *m;
	walktree_f_t *f;
	void *w;
{
	int error;
	struct radix_node *base, *next;
	u_char *xa = (u_char *)a;
	u_char *xm = (u_char *)m;
	register struct radix_node *rn, *last = 0 /* shut up gcc */;
	int stopping = 0;	/* set once the leaves being processed are the last ones */
	int lastb;		/* rn_bit of the node the walk starts from */

	/*
	 * rn_search_m is sort-of-open-coded here. We cannot use the
	 * function because we need to keep track of the last node seen.
	 */
	/* printf("about to search\n"); */
	for (rn = h->rnh_treetop; rn->rn_bit >= 0; ) {
		last = rn;
		/* printf("rn_bit %d, rn_bmask %x, xm[rn_offset] %x\n",
		       rn->rn_bit, rn->rn_bmask, xm[rn->rn_offset]); */
		/* The mask does not cover this node's bit: stop descending. */
		if (!(rn->rn_bmask & xm[rn->rn_offset])) {
			break;
		}
		/* Otherwise follow the branch chosen by the key's bit. */
		if (rn->rn_bmask & xa[rn->rn_offset]) {
			rn = rn->rn_right;
		} else {
			rn = rn->rn_left;
		}
	}
	/* printf("done searching\n"); */

	/*
	 * Two cases: either we stepped off the end of our mask,
	 * in which case last == rn, or we reached a leaf, in which
	 * case we want to start from the last node we looked at.
	 * Either way, last is the node we want to start from.
	 */
	rn = last;
	lastb = rn->rn_bit;

	/* printf("rn %p, lastb %d\n", rn, lastb);*/

	/*
	 * This gets complicated because we may delete the node
	 * while applying the function f to it, so we need to calculate
	 * the successor node in advance.
	 */
	/* Go down to the leftmost leaf below the starting node. */
	while (rn->rn_bit >= 0)
		rn = rn->rn_left;

	while (!stopping) {
		/* printf("node %p (%d)\n", rn, rn->rn_bit); */
		base = rn;
		/* If at right child go back up, otherwise, go right */
		while (rn->rn_parent->rn_right == rn
		       && !(rn->rn_flags & RNF_ROOT)) {
			rn = rn->rn_parent;

			/* if went up beyond last, stop */
			if (rn->rn_bit <= lastb) {
				stopping = 1;
				/* printf("up too far\n"); */
				/*
				 * XXX we should jump to the 'Process leaves'
				 * part, because the values of 'rn' and 'next'
				 * we compute will not be used. Not a big deal
				 * because this loop will terminate, but it is
				 * inefficient and hard to understand!
				 */
			}
		}
		
		/* 
		 * At the top of the tree, no need to traverse the right
		 * half, prevent the traversal of the entire tree in the
		 * case of default route.
		 */
		if (rn->rn_parent->rn_flags & RNF_ROOT)
			stopping = 1;

		/* Find the next *leaf* since next node might vanish, too */
		for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;)
			rn = rn->rn_left;
		next = rn;
		/* Process leaves */
		while ((rn = base) != 0) {
			/* Fetch the chain link first: f may delete rn. */
			base = rn->rn_dupedkey;
			/* printf("leaf %p\n", rn); */
			if (!(rn->rn_flags & RNF_ROOT)
			    && (error = (*f)(rn, w)))
				return (error);
		}
		rn = next;

		if (rn->rn_flags & RNF_ROOT) {
			/* printf("root, stopping"); */
			stopping = 1;
		}

	}
	return 0;
}

/*
 * Apply 'f' to every leaf of the tree headed by 'h' that is not marked
 * RNF_ROOT, passing 'w' through as the callback's second argument.
 * Returns the first non-zero value produced by 'f', or 0 after a full
 * traversal.
 *
 * The callback may delete the leaf it is given, so both the next leaf
 * of the tree and the next entry of the duped-key chain are looked up
 * before the callback runs.
 */
static int
rn_walktree(struct radix_node_head *h, walktree_f_t *f, void *w)
{
	struct radix_node *cur, *leaf, *succ, *chain;
	int rv;

	/* Slide down the left edge to reach the first leaf. */
	for (cur = h->rnh_treetop; cur->rn_bit >= 0; cur = cur->rn_left)
		continue;

	do {
		leaf = cur;

		/* Climb for as long as we are a (non-root) right child... */
		while (cur->rn_parent->rn_right == cur
		       && !(cur->rn_flags & RNF_ROOT))
			cur = cur->rn_parent;

		/* ...then step right and descend left to the successor leaf. */
		succ = cur->rn_parent->rn_right;
		while (succ->rn_bit >= 0)
			succ = succ->rn_left;

		/* Hand every entry sharing this key to the callback. */
		while (leaf != 0) {
			chain = leaf->rn_dupedkey;
			if (!(leaf->rn_flags & RNF_ROOT)) {
				rv = (*f)(leaf, w);
				if (rv != 0)
					return (rv);
			}
			leaf = chain;
		}
		cur = succ;
	} while (!(cur->rn_flags & RNF_ROOT));

	return (0);
}

/*
 * Allocate and initialize an empty tree. This has 3 nodes, which are
 * part of the radix_node_head (in the order <left,root,right>) and are
 * marked RNF_ROOT so they cannot be freed.
 * The leaves have all-zero and all-one keys, with significant
 * bits starting at 'off'.
 * Return 1 on success, 0 on error.
 */
int
rn_inithead(head, off)
	void **head;
	int off;
{
	register struct radix_node_head *rnh;
	register struct radix_node *t, *tt, *ttt;
	if (*head)
		return (1);
	R_Zalloc(rnh, struct radix_node_head *, sizeof (*rnh));
	if (rnh == 0)
		return (0);
#ifdef _KERNEL
	RADIX_NODE_HEAD_LOCK_INIT(rnh);
#endif
	*head = rnh;
	t = rn_newpair(rn_zeros, off, rnh->rnh_nodes);
	ttt = rnh->rnh_nodes + 2;
	t->rn_right = ttt;
	t->rn_parent = t;
	tt = t->rn_left;	/* ... which in turn is rnh->rnh_nodes */
	tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE;
	tt->rn_bit = -1 - off;
	*ttt = *tt;
	ttt->rn_key = rn_ones;
	rnh->rnh_addaddr = rn_addroute;
	rnh->rnh_deladdr = rn_delete;
	rnh->rnh_matchaddr = rn_match;
	rnh->rnh_lookup = rn_lookup;
	rnh->rnh_walktree = rn_walktree;
	rnh->rnh_walktree_from = rn_walktree_from;
	rnh->rnh_treetop = t;
	return (1);
}

/*
 * Release a tree head created by rn_inithead() and clear *head.
 * Only the head itself is freed here.  Always returns 1.
 */
int
rn_detachhead(void **head)
{
	struct radix_node_head *victim;

	KASSERT((head != NULL && *head != NULL),
	    ("%s: head already freed", __func__));

	victim = *head;
	*head = NULL;

	/* The <left,root,right> nodes are embedded in the head. */
	Free(victim);
	return (1);
}

/*
 * One-time setup of the radix code for keys of at most 'maxk' bytes.
 *
 * A single 3 * maxk byte buffer is allocated and carved into three
 * consecutive areas: rn_zeros (all 0x00), rn_ones (all 0xff) and
 * addmask_key (left zeroed).  The mask tree head is then created.
 * Logs an error and returns if maxk is 0; panics if an allocation fails.
 */
void
rn_init(int maxk)
{
	char *ones_start, *ones_end, *fill;

	max_keylen = maxk;
	if (max_keylen == 0) {
		log(LOG_ERR,
		    "rn_init: radix functions require max_keylen be set\n");
		return;
	}

	R_Malloc(rn_zeros, char *, 3 * max_keylen);
	if (rn_zeros == NULL)
		panic("rn_init");
	bzero(rn_zeros, 3 * max_keylen);

	/* Second third of the buffer: the all-ones key. */
	ones_start = rn_zeros + max_keylen;
	ones_end = ones_start + max_keylen;
	rn_ones = ones_start;
	/* Last third of the buffer: addmask_key. */
	addmask_key = ones_end;
	for (fill = ones_start; fill < ones_end; fill++)
		*fill = -1;

	if (rn_inithead((void **)(void *)&mask_rnhead, 0) == 0)
		panic("rn_init 2");
}


================================================
FILE: sys/net/radix.h
================================================
/*-
 * Copyright (c) 1988, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)radix.h	8.2 (Berkeley) 10/31/94
 * $FreeBSD: head/sys/net/radix.h 185747 2008-12-07 21:15:43Z kmacy $
 */

#ifndef _RADIX_H_
#define	_RADIX_H_

#ifdef _KERNEL
#include <sys/_lock.h>
#include <sys/_mutex.h>
#include <sys/_rwlock.h>
#endif

#ifdef MALLOC_DECLARE
MALLOC_DECLARE(M_RTABLE);
#endif

/*
 * Radix search tree node layout.
 *
 * The same structure is used for internal nodes and for leaves; the
 * sign of rn_bit tells them apart (the tree walkers descend while
 * rn_bit >= 0 and treat the first node with rn_bit < 0 as a leaf).
 * The rn_u union holds the leaf-only or the node-only fields.
 */

struct radix_node {
	struct	radix_mask *rn_mklist;	/* list of masks contained in subtree */
	struct	radix_node *rn_parent;	/* parent */
	short	rn_bit;			/* bit offset; -1-index(netmask) */
	char	rn_bmask;		/* node: mask for bit test*/
	u_char	rn_flags;		/* enumerated next */
#define RNF_NORMAL	1		/* leaf contains normal route */
#define RNF_ROOT	2		/* leaf is root leaf for tree */
#define RNF_ACTIVE	4		/* This node is alive (for rtfree) */
	union {
		struct {			/* leaf only data: */
			caddr_t	rn_Key;		/* object of search */
			caddr_t	rn_Mask;	/* netmask, if present */
			struct	radix_node *rn_Dupedkey; /* next leaf with the same key */
		} rn_leaf;
		struct {			/* node only data: */
			int	rn_Off;		/* where to start compare */
			struct	radix_node *rn_L;/* progeny */
			struct	radix_node *rn_R;/* progeny */
		} rn_node;
	}		rn_u;
#ifdef RN_DEBUG
	/* Debug-only fields; NOTE(review): exact meaning not visible here. */
	int rn_info;
	struct radix_node *rn_twin;
	struct radix_node *rn_ybro;
#endif
};

/* Shorthand accessors for the members of the rn_u union. */
#define	rn_dupedkey	rn_u.rn_leaf.rn_Dupedkey
#define	rn_key		rn_u.rn_leaf.rn_Key
#define	rn_mask		rn_u.rn_leaf.rn_Mask
#define	rn_offset	rn_u.rn_node.rn_Off
#define	rn_left		rn_u.rn_node.rn_L
#define	rn_right	rn_u.rn_node.rn_R

/*
 * Annotations to tree concerning potential routes applying to subtrees.
 *
 * Entries are chained through rm_mklist and hang off a radix_node's
 * rn_mklist.  rm_rmu holds either a mask or a leaf pointer; which one
 * is valid is presumably selected by rm_flags (verify in radix.c).
 */

struct radix_mask {
	short	rm_bit;			/* bit offset; -1-index(netmask) */
	char	rm_unused;		/* cf. rn_bmask */
	u_char	rm_flags;		/* cf. rn_flags */
	struct	radix_mask *rm_mklist;	/* more masks to try */
	union	{
		caddr_t	rmu_mask;		/* the mask */
		struct	radix_node *rmu_leaf;	/* for normal routes */
	}	rm_rmu;
	int	rm_refs;		/* # of references to this struct */
};

/* Shorthand accessors for the members of the rm_rmu union. */
#define	rm_mask rm_rmu.rmu_mask
#define	rm_leaf rm_rmu.rmu_leaf		/* extra field would make 32 bytes */

/*
 * Callback type for the tree walkers: called with a leaf and the opaque
 * argument given to the walker; a non-zero return value stops the walk
 * and is returned to the walker's caller.
 */
typedef int walktree_f_t(struct radix_node *, void *);

/*
 * Head of a radix tree: the top node, a table of methods and the three
 * nodes that make up the empty tree.  rn_inithead() fills in
 * rnh_addaddr, rnh_deladdr, rnh_matchaddr, rnh_lookup, rnh_walktree and
 * rnh_walktree_from; the remaining hooks are left zeroed by it.
 */
struct radix_node_head {
	struct	radix_node *rnh_treetop;	/* top node; searches and walks start here */
	u_int	rnh_gen;		/* generation counter */
	int	rnh_multipath;		/* multipath capable ? */
	int	rnh_addrsize;		/* permit, but not require fixed keys */
	int	rnh_pktsize;		/* permit, but not require fixed keys */
	struct	radix_node *(*rnh_addaddr)	/* add based on sockaddr */
		(void *v, void *mask,
		     struct radix_node_head *head, struct radix_node nodes[]);
	struct	radix_node *(*rnh_addpkt)	/* add based on packet hdr */
		(void *v, void *mask,
		     struct radix_node_head *head, struct radix_node nodes[]);
	struct	radix_node *(*rnh_deladdr)	/* remove based on sockaddr */
		(void *v, void *mask, struct radix_node_head *head);
	struct	radix_node *(*rnh_delpkt)	/* remove based on packet hdr */
		(void *v, void *mask, struct radix_node_head *head);
	struct	radix_node *(*rnh_matchaddr)	/* locate based on sockaddr */
		(void *v, struct radix_node_head *head);
	struct	radix_node *(*rnh_lookup)	/* locate based on sockaddr */
		(void *v, void *mask, struct radix_node_head *head);
	struct	radix_node *(*rnh_matchpkt)	/* locate based on packet hdr */
		(void *v, struct radix_node_head *head);
	int	(*rnh_walktree)			/* traverse tree */
		(struct radix_node_head *head, walktree_f_t *f, void *w);
	int	(*rnh_walktree_from)		/* traverse tree below a */
		(struct radix_node_head *head, void *a, void *m,
		     walktree_f_t *f, void *w);
	void	(*rnh_close)	/* do something when the last ref drops */
		(struct radix_node *rn, struct radix_node_head *head);
	struct	radix_node rnh_nodes[3];	/* empty tree for common case */
#ifdef _KERNEL
	/* Linux and Windows kernel builds use a spinlock, others an rwlock. */
#if defined( __linux__ ) || defined( _WIN32 )
        spinlock_t rnh_lock;
#else
	struct	rwlock rnh_lock;		/* locks entire radix tree */
#endif /* __linux__ || _WIN32 */
#endif
};

/*
 * Allocation wrappers used by the radix code.
 *
 *   R_Malloc(p, t, n)  assign to p a block of n bytes, cast to type t
 *   R_Zalloc(p, t, n)  same, with the block zero-filled
 *   Free(p)            release a block obtained with the above
 *
 * Outside the kernel they map to the C library; in the kernel they use
 * malloc(9)-style calls with M_RTABLE and M_NOWAIT, so the R_* macros
 * can leave p NULL and callers must check it.
 *
 * Free() expands to a plain expression (no trailing ';') with its
 * argument parenthesized, so "if (x) Free(p); else ..." parses as
 * expected and any pointer expression can be passed.
 */
#ifndef _KERNEL
#define R_Malloc(p, t, n) (p = (t) malloc((unsigned int)(n)))
#define R_Zalloc(p, t, n) (p = (t) calloc(1,(unsigned int)(n)))
#define Free(p) free((char *)(p))
#else
#define R_Malloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT))
#define R_Zalloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT | M_ZERO))
#define Free(p) free((caddr_t)(p), M_RTABLE)

/* Wrappers around the rwlock embedded in struct radix_node_head. */
#define	RADIX_NODE_HEAD_LOCK_INIT(rnh)	\
    rw_init_flags(&(rnh)->rnh_lock, "radix node head", 0)
#define	RADIX_NODE_HEAD_LOCK(rnh)	rw_wlock(&(rnh)->rnh_lock)
#define	RADIX_NODE_HEAD_UNLOCK(rnh)	rw_wunlock(&(rnh)->rnh_lock)
#define	RADIX_NODE_HEAD_RLOCK(rnh)	rw_rlock(&(rnh)->rnh_lock)
#define	RADIX_NODE_HEAD_RUNLOCK(rnh)	rw_runlock(&(rnh)->rnh_lock)
#define	RADIX_NODE_HEAD_LOCK_TRY_UPGRADE(rnh)	rw_try_upgrade(&(rnh)->rnh_lock)


#define	RADIX_NODE_HEAD_DESTROY(rnh)	rw_destroy(&(rnh)->rnh_lock)
#define	RADIX_NODE_HEAD_LOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_LOCKED)
#define	RADIX_NODE_HEAD_WLOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_WLOCKED)
#endif /* _KERNEL */

/* Module setup and tree head management. */
void	 rn_init(int maxk);
int	 rn_inithead(void **head, int off);
int	 rn_detachhead(void **head);

int	 rn_refines(void *, void *);

/* Operations on masks and tree entries. */
struct radix_node *rn_addmask(void *, int, int);
struct radix_node *rn_addroute(void *, void *, struct radix_node_head *,
	    struct radix_node [2]);
struct radix_node *rn_delete(void *, void *, struct radix_node_head *);
struct radix_node *rn_lookup(void *v_arg, void *m_arg,
	    struct radix_node_head *head);
struct radix_node *rn_match(void *, struct radix_node_head *);

#endif /* _RADIX_H_ */


================================================
FILE: sys/netgraph/ng_ipfw.h
================================================
/*-
 * Copyright 2005, Gleb Smirnoff <glebius@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/netgraph/ng_ipfw.h,v 1.2 2006/02/17 09:42:49 glebius Exp $
 */

#ifndef _NG_IPFW_H
#define _NG_IPFW_H
/*
 * Constants identifying the "ipfw" netgraph node.
 * NOTE(review): presumably the node type name and the control message
 * cookie of ng_ipfw; their users are not visible in this header.
 */
#define NG_IPFW_NODE_TYPE    "ipfw"
#define NGM_IPFW_COOKIE      1105988990
#endif /* _NG_IPFW_H */


================================================
FILE: sys/netinet/in_cksum.c
================================================
/*-
 * Copyright (c) 1988, 1992, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)in_cksum.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: src/sys/netinet/in_cksum.c,v 1.10 2007/10/07 20:44:22 silby Exp $");

#include <sys/param.h>
#include <sys/mbuf.h>

/*
 * Checksum routine for Internet Protocol family headers (Portable Version).
 *
 * This routine is very heavily used in the network
 * code and should be modified for each CPU to be as fast as possible.
 */

/*
 * ADDCARRY(x): if x exceeds 16 bits' worth (65535), subtract 65535 from
 * it in place, i.e. wrap the carry around.  x is evaluated more than
 * once and must be an lvalue.
 *
 * REDUCE: fold the running sum to 16 bits by adding the two u_short
 * halves of l_util and applying ADDCARRY.  It relies on the variables
 * 'sum' and 'l_util' being in scope where it is used.
 */
#define ADDCARRY(x)  (x > 65535 ? x -= 65535 : x)
#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);}

/*
 * Compute the 16-bit one's-complement checksum of the first 'len' bytes
 * of the mbuf chain 'm'.
 *
 * Data is summed 16 bits at a time in memory order.  Odd start
 * addresses and 16-bit words that straddle two mbufs are handled by
 * keeping the pending byte in s_util.c[0] and, for odd addresses,
 * accumulating with the sum shifted by 8 bits ("byte swapped") until
 * alignment is restored.
 *
 * Returns the complemented sum masked to 16 bits.  If the chain holds
 * fewer than 'len' bytes, "cksum: out of data" is printed and the
 * checksum of the bytes that were available is returned.
 */
int
in_cksum(struct mbuf *m, int len)
{
	register u_short *w;
	register int sum = 0;
	/* Bytes left in the current mbuf; -1 means an odd byte is pending. */
	register int mlen = 0;
	/* Non-zero while 'sum' is being accumulated shifted left by 8. */
	int byte_swapped = 0;

	/* Assembles a 16-bit word from two bytes that are read separately. */
	union {
		char	c[2];
		u_short	s;
	} s_util;
	/*
	 * Scratch area for REDUCE.  'l' must be exactly as wide as s[2] so
	 * that s[0] and s[1] cover both halves of the stored sum on every
	 * platform; with a plain 'long' on an LP64 big-endian machine they
	 * would overlay only the upper (zero) half and the sum would be lost.
	 */
	union {
		u_short s[2];
		uint32_t l;
	} l_util;

	for (;m && len; m = m->m_next) {
		if (m->m_len == 0)
			continue;
		w = mtod(m, u_short *);
		if (mlen == -1) {
			/*
			 * The first byte of this mbuf is the continuation
			 * of a word spanning between this mbuf and the
			 * last mbuf.
			 *
			 * s_util.c[0] is already saved when scanning previous
			 * mbuf.
			 */
			s_util.c[1] = *(char *)w;
			sum += s_util.s;
			w = (u_short *)((char *)w + 1);
			mlen = m->m_len - 1;
			len--;
		} else
			mlen = m->m_len;
		/* Never sum more than the caller asked for. */
		if (len < mlen)
			mlen = len;
		len -= mlen;
		/*
		 * Force to even boundary.
		 */
		if ((1 & (uintptr_t) w) && (mlen > 0)) {
			REDUCE;
			sum <<= 8;
			s_util.c[0] = *(u_char *)w;
			w = (u_short *)((char *)w + 1);
			mlen--;
			byte_swapped = 1;
		}
		/*
		 * Unroll the loop to make overhead from
		 * branches &c small.
		 */
		while ((mlen -= 32) >= 0) {
			sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
			sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
			sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
			sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
			w += 16;
		}
		mlen += 32;
		while ((mlen -= 8) >= 0) {
			sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
			w += 4;
		}
		mlen += 8;
		if (mlen == 0 && byte_swapped == 0)
			continue;
		REDUCE;
		/* Remaining whole words; mlen ends at -1 if one byte is left. */
		while ((mlen -= 2) >= 0) {
			sum += *w++;
		}
		if (byte_swapped) {
			/* Undo the shift applied when alignment was forced. */
			REDUCE;
			sum <<= 8;
			byte_swapped = 0;
			if (mlen == -1) {
				/* Trailing byte completes the saved one. */
				s_util.c[1] = *(char *)w;
				sum += s_util.s;
				mlen = 0;
			} else
				/* The saved byte is still unpaired. */
				mlen = -1;
		} else if (mlen == -1)
			/* Save the odd trailing byte for the next mbuf. */
			s_util.c[0] = *(char *)w;
	}
	if (len)
		printf("cksum: out of data\n");
	if (mlen == -1) {
		/* The last mbuf has odd # of bytes. Follow the
		   standard (the odd byte may be shifted left by 8 bits
		   or not as determined by endian-ness of the machine) */
		s_util.c[1] = 0;
		sum += s_util.s;
	}
	REDUCE;
	return (~sum & 0xffff);
}


================================================
FILE: sys/netinet/ip.h
================================================
#ifndef _NETINET_IP_H_
#define _NETINET_IP_H_

/*
 * Define BYTE_ORDER (used below to lay out the bit-fields of struct ip)
 * from the __BIG_ENDIAN / __LITTLE_ENDIAN macros; the build fails if
 * neither is defined.
 * NOTE(review): only defined() is tested and __BIG_ENDIAN is checked
 * first, so this assumes the environment defines exactly one of the
 * two macros - confirm for each supported toolchain.
 */
#define LITTLE_ENDIAN   1234
#define BIG_ENDIAN      4321
#if defined(__BIG_ENDIAN)
#define BYTE_ORDER      BIG_ENDIAN
//#warning we are in bigendian
#elif defined(__LITTLE_ENDIAN)
//#warning we are in littleendian
#define BYTE_ORDER      LITTLE_ENDIAN
#else
#error no platform
#endif

/* XXX endianness doesn't belong here */
// #define LITTLE_ENDIAN   1234
// #define BIG_ENDIAN      4321
// #define BYTE_ORDER      LITTLE_ENDIAN

/*
 * Structure of an internet header, naked of options.
 *
 * The two 4-bit fields sharing the first byte are declared in opposite
 * order depending on BYTE_ORDER, so that ip_v and ip_hl name the same
 * bits on either kind of machine.
 */
struct ip {
#if BYTE_ORDER == LITTLE_ENDIAN
        u_char  ip_hl:4,                /* header length */
                ip_v:4;                 /* version */
#endif
#if BYTE_ORDER == BIG_ENDIAN
        u_char  ip_v:4,                 /* version */
                ip_hl:4;                /* header length */
#endif
        u_char  ip_tos;                 /* type of service */
        u_short ip_len;                 /* total length */
        u_short ip_id;                  /* identification */
        u_short ip_off;                 /* fragment offset field */
        /* Flag bits and offset mask for ip_off. */
#define IP_RF 0x8000                    /* reserved fragment flag */
#define IP_DF 0x4000                    /* dont fragment flag */
#define IP_MF 0x2000                    /* more fragments flag */
#define IP_OFFMASK 0x1fff               /* mask for fragmenting bits */
        u_char  ip_ttl;                 /* time to live */
        u_char  ip_p;                   /* protocol */
        u_short ip_sum;                 /* checksum */
        struct  in_addr ip_src,ip_dst;  /* source and dest address */
} __packed __aligned(4);

/* presumably a value for ip_tos (low delay) - verify against users */
#define	IPTOS_LOWDELAY		0x10

#endif /* _NETINET_IP_H_ */


================================================
FILE: sys/netinet/ip6.h
================================================
#ifndef _NETINET_IP6_H_
#define _NETINET_IP6_H_
/*
 * True when the two struct in6_addr pointed to by a and b hold the same
 * bytes.  Both arguments are pointers.
 */
#define IN6_ARE_ADDR_EQUAL(a, b)                        \
(memcmp(&(a)->s6_addr[0], &(b)->s6_addr[0], sizeof(struct in6_addr)) == 0)

/*
 * IPv6 header: a control union followed by the source and destination
 * addresses.  The first byte of the control part can also be accessed
 * on its own through ip6_un2_vfc.
 */
struct ip6_hdr {
        union {
                struct ip6_hdrctl {
                        u_int32_t ip6_un1_flow; /* 20 bits of flow-ID */  
                        u_int16_t ip6_un1_plen; /* payload length */
                        u_int8_t  ip6_un1_nxt;  /* next header */
                        u_int8_t  ip6_un1_hlim; /* hop limit */
                } ip6_un1;
                u_int8_t ip6_un2_vfc;   /* 4 bits version, top 4 bits class */
        } ip6_ctlun;
        struct in6_addr ip6_src;        /* source address */
        struct in6_addr ip6_dst;        /* destination address */
};
/* Shorthand accessors into ip6_ctlun. */
#define ip6_nxt         ip6_ctlun.ip6_un1.ip6_un1_nxt
#define ip6_flow        ip6_ctlun.ip6_un1.ip6_un1_flow


/*
 * ICMPv6 header: type, code and checksum followed by four bytes of
 * type-specific data, viewable as one 32-bit, two 16-bit or four 8-bit
 * values.
 */
struct icmp6_hdr {
        u_int8_t        icmp6_type;     /* type field */
        u_int8_t        icmp6_code;     /* code field */
        u_int16_t       icmp6_cksum;    /* checksum field */
        union {
                u_int32_t       icmp6_un_data32[1]; /* type-specific field */
                u_int16_t       icmp6_un_data16[2]; /* type-specific field */
                u_int8_t        icmp6_un_data8[4];  /* type-specific field */
        } icmp6_dataun;
};

/* Leading bytes of a hop-by-hop options extension header. */
struct ip6_hbh {
        u_int8_t ip6h_nxt;      /* next header */
        u_int8_t ip6h_len;      /* length in units of 8 octets */
        /* followed by options */
}; 
/* Leading bytes of a routing extension header. */
struct ip6_rthdr {
        u_int8_t  ip6r_nxt;     /* next header */
        u_int8_t  ip6r_len;     /* length in units of 8 octets */
        u_int8_t  ip6r_type;    /* routing type */
        u_int8_t  ip6r_segleft; /* segments left */
        /* followed by routing type specific data */
};
/* Fragment extension header. */
struct ip6_frag {
        u_int8_t  ip6f_nxt;             /* next header */
        u_int8_t  ip6f_reserved;        /* reserved field */
        u_int16_t ip6f_offlg;           /* offset, reserved, and flag */
        u_int32_t ip6f_ident;           /* identification */
};
/*
 * Masks for ip6f_offlg.
 * NOTE(review): confirm which byte order these constants assume.
 */
#define IP6F_OFF_MASK           0xfff8  /* mask out offset from _offlg */
#define IP6F_MORE_FRAG          0x0001  /* more-fragments flag */
/*
 * Two-byte prefix with the same shape as the first two fields of
 * ip6_hbh and ip6_rthdr.
 */
struct  ip6_ext {
        u_int8_t ip6e_nxt;      /* cf. ip6h_nxt: next header */
        u_int8_t ip6e_len;      /* cf. ip6h_len: length */
};
#endif /* _NETINET_IP6_H_ */


================================================
FILE: sys/netinet/ip_dummynet.h
================================================
/*-
 * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa
 * Portions Copyright (c) 2000 Akamba Corp.
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: user/luigi/ipfw3-head/sys/netinet/ip_dummynet.h 203321 2010-01-31 21:39:25Z luigi $
 */

#ifndef _IP_DUMMYNET_H
#define _IP_DUMMYNET_H

/*
 * Definition of the kernel-userland API for dummynet.
 *
 * Setsockopt() and getsockopt() pass a batch of objects, each
 * of them starting with a "struct dn_id" which should fully identify
 * the object and its relation with others in the sequence.
 * The first object in each request should have
 *	 type= DN_CMD_*, id = DN_API_VERSION.
 * For other objects, type and subtype specify the object, len indicates
 * the total length including the header, and 'id' identifies the specific
 * object.
 *
 * Most objects are numbered with an identifier in the range 1..65535.
 * DN_MAX_ID indicates the first value outside the range.
 */

#define	DN_API_VERSION	12500000	/* goes in the id of the first object of a request */
#define	DN_MAX_ID	0x10000		/* first value outside the 1..65535 id range */

/*
 * Header found at the beginning of every object exchanged with the
 * kernel.  'len' is 16 bits wide, so no object can exceed 65535 bytes.
 */
struct dn_id {
	uint16_t	len;	/* total obj len including this header */
	uint8_t		type;	/* object or command type, see the enum below */
	uint8_t		subtype;
	uint32_t	id;	/* generic id */
};

/*
 * These values are in the type field of struct dn_id.
 * To preserve the ABI, never rearrange the list or delete
 * entries with the exception of DN_LAST
 *
 * Object types are numbered from 1 and commands from 0x80; since the
 * values are implicit, inserting an entry renumbers all following ones.
 */
enum {
	DN_NONE = 0,
	DN_LINK = 1,
	DN_FS,			/* 2 */
	DN_SCH,			/* 3 */
	DN_SCH_I,		/* 4 */
	DN_QUEUE,		/* 5 */
	DN_DELAY_LINE,		/* 6 */
	DN_PROFILE,		/* 7 */
	DN_FLOW,		/* 8: struct dn_flow */
	DN_TEXT,		/* 9: opaque text is the object */

	DN_CMD_CONFIG = 0x80,	/* objects follow */
	DN_CMD_DELETE,		/* 0x81: subtype + list of entries */
	DN_CMD_GET,		/* 0x82: subtype + list of entries */
	DN_CMD_FLUSH,		/* 0x83 */
	/* for compatibility with FreeBSD 7.2/8 */
	DN_COMPAT_PIPE,		/* 0x84 */
	DN_COMPAT_QUEUE,	/* 0x85 */
	DN_GET_COMPAT,		/* 0x86 */

	/* special commands for emulation of sysctl variables */
	DN_SYSCTL_GET,		/* 0x87 */
	DN_SYSCTL_SET,		/* 0x88 */

	DN_LAST,		/* 0x89: end marker, not part of the ABI */
};

/* Values for the subtype field of struct dn_id. */
enum { /* subtype for schedulers, flowset and the like */
	DN_SCHED_UNKNOWN = 0,
	DN_SCHED_FIFO = 1,
	DN_SCHED_WF2QP = 2,
	/* others are in individual modules */
};

/*
 * Bit flags; presumably stored in the 'flags' fields of struct dn_fs
 * and struct dn_sch (verify against users).
 */
enum {	/* user flags */
	DN_HAVE_MASK	= 0x0001,	/* fs or sched has a mask */
	DN_NOERROR	= 0x0002,	/* do not report errors */
	DN_QHT_HASH	= 0x0004,	/* qht is a hash table */
	DN_QSIZE_BYTES	= 0x0008,	/* queue size is in bytes */
	DN_HAS_PROFILE	= 0x0010,	/* a link has a profile */
	DN_IS_RED	= 0x0020,	/* presumably: RED queue management enabled */
	DN_IS_GENTLE_RED= 0x0040,	/* presumably: gentle RED variant enabled */
	DN_PIPE_CMD	= 0x1000,	/* pipe config... */
};

/*
 * link template.
 *
 * Describes the bandwidth, delay and burst of a link; the same fields
 * carry different units in userland and in the kernel (see below).
 */
struct dn_link {
	struct dn_id oid;

	/*
	 * Userland sets bw and delay in bits/s and milliseconds.
	 * The kernel converts this back and forth to bits/tick and ticks.
	 * XXX what about burst ?
	 */
	int32_t		link_nr;	/* number of this link */
	int		bandwidth;	/* bit/s or bits/tick.   */
	int		delay;		/* ms and ticks */
	uint64_t	burst;		/* scaled. bits*Hz  XXX */
};

/*
 * A flowset, which is a template for flows. Contains parameters
 * from the command line: id, target scheduler, queue sizes, plr,
 * flow masks, buckets for the flow hash, and possibly scheduler-
 * specific parameters (weight, quantum and so on).
 */
struct dn_fs {
	struct dn_id oid;
	uint32_t fs_nr;		/* the flowset number */
	uint32_t flags;		/* userland flags */
	int qsize;		/* queue size in slots or bytes */
	int32_t plr;		/* PLR, pkt loss rate (2^31-1 means 100%) */
	uint32_t buckets;	/* buckets used for the queue hash table */

	struct ipfw_flow_id flow_mask;
	uint32_t sched_nr;	/* the scheduler we attach to */
	/* generic scheduler parameters. Leave them at -1 if unset.
	 * Now we use 0: weight, 1: lmax, 2: priority
	 */
	int par[4];

	/* RED/GRED parameters.
	 * weight and probabilities are in the range 0..1 represented
	 * in fixed point arithmetic with SCALE_RED decimal bits.
	 */
#define SCALE_RED	16
	/* Conversions to and from the fixed point format, and product. */
#define SCALE(x)	( (x) << SCALE_RED )
#define SCALE_VAL(x)	( (x) >> SCALE_RED )
#define SCALE_MUL(x,y)	( ( (x) * (y) ) >> SCALE_RED )
	int w_q ;		/* queue weight (scaled) */
	int max_th ;		/* maximum threshold for queue (scaled) */
	int min_th ;		/* minimum threshold for queue (scaled) */
	int max_p ;		/* maximum value for p_b (scaled) */

};

/*
 * dn_flow collects flow_id and stats for queues and scheduler
 * instances, and is used to pass these info to userland.
 * oid.type/oid.subtype describe the object, oid.id is number
 * of the parent object.
 */
struct dn_flow {
	struct dn_id	oid;
	struct ipfw_flow_id fid;	/* identifier of the flow */
	uint64_t	tot_pkts; /* statistics counters  */
	uint64_t	tot_bytes;
	uint32_t	length; /* Queue length, in packets */
	uint32_t	len_bytes; /* Queue length, in bytes */
	uint32_t	drops;	/* presumably the number of dropped packets */
};


/*
 * Scheduler template, mostly indicating the name, number,
 * sched_mask and buckets.
 */
struct dn_sch {
	struct dn_id	oid;
	uint32_t	sched_nr; /* N, scheduler number */
	uint32_t	buckets; /* number of buckets for the instances */
	uint32_t	flags;	/* have_mask, ... */

	char name[16];	/* null terminated, so at most 15 characters */
	/* mask to select the appropriate scheduler instance */
	struct ipfw_flow_id sched_mask; /* M */
};


/* A delay profile is attached to a link.
 * Note that a profile, as any other object, cannot be longer than 2^16
 *
 * The structure is a fixed header followed by a variable number of
 * samples, the count being given by samples_no.
 */
#define	ED_MAX_SAMPLES_NO	1024	/* presumably the upper bound for samples_no */
struct dn_profile {
	struct dn_id	oid;
	/* fields to simulate a delay profile */
#define ED_MAX_NAME_LEN		32
	char	name[ED_MAX_NAME_LEN];
	int	link_nr;			/* link the profile refers to */
	int	loss_level;
	int	bandwidth;			// XXX use link bandwidth?
	int	samples_no;			/* actual len of samples[] */
	int	samples[0];			/* may be shorter */
};


/*
 * Overall structure of dummynet

In dummynet, packets are selected with the firewall rules, and passed
to two different objects: PIPE or QUEUE (bad name).

A QUEUE defines a classifier, which groups packets into flows
according to a 'mask', puts them into independent queues (one
per flow) with configurable size and queue management policy,
and passes flows to a scheduler:

                 (flow_mask|sched_mask)  sched_mask
	 +---------+   weight Wx  +-------------+
         |         |->-[flow]-->--|             |-+
    -->--| QUEUE x |   ...        |             | |
         |         |->-[flow]-->--| SCHEDuler N | |
	 +---------+              |             | |
	     ...                  |             +--[LINK N]-->--
	 +---------+   weight Wy  |             | +--[LINK N]-->--
         |         |->-[flow]-->--|             | |
    -->--| QUEUE y |   ...        |             | |
         |         |->-[flow]-->--|             | |
	 +---------+              +-------------+ |
	                            +-------------+

Many QUEUE objects can connect to the same scheduler, each
QUEUE object can have its own set of parameters.

In turn, the SCHEDuler 'forks' multiple instances according
to a 'sched_mask', each instance manages its own set of queues
and transmits on a private instance of a configurable LINK.

A PIPE is a simplified version of the above, where there
is no flow_mask, and each scheduler instance handles a single queue.

The following data structures (visible from userland) describe
the objects used by dummynet:

 + dn_link, contains the main configuration parameters related
   to delay and bandwidth;
 + dn_profile describes a delay profile;
 + dn_flow describes the flow status (flow id, statistics)
   
 + dn_sch describes a scheduler
 + dn_fs describes a flowset (mask, weight, queue parameters)

 *
 */

#endif /* _IP_DUMMYNET_H */


================================================
FILE: sys/netinet/ip_fw.h
================================================
/*-
 * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: user/luigi/ipfw3-head/sys/netinet/ip_fw.h 202072 2010-01-11 10:12:35Z luigi $
 */

#ifndef _IPFW2_H
#define _IPFW2_H

/*
 * The default rule number.  By the design of ip_fw, the default rule
 * is the last one, so its number can also serve as the highest number
 * allowed for a rule.  The ip_fw code relies on both meanings of this
 * constant. 
 */
#define	IPFW_DEFAULT_RULE	65535

/*
 * The number of ipfw tables.  The maximum allowed table number is the
 * (IPFW_TABLES_MAX - 1).
 */
#define	IPFW_TABLES_MAX		128

/*
 * Most commands (queue, pipe, tag, untag, limit...) can have a 16-bit
 * argument between 1 and 65534. The value 0 is unused, the value
 * 65535 (IP_FW_TABLEARG) is used to represent 'tablearg', i.e. the
 * can be 1..65534, or 65535 to indicate the use of a 'tablearg'
 * result of the most recent table() lookup.
 * Note that 16bit is only a historical limit, resulting from
 * the use of a 16-bit fields for that value. In reality, we can have
 * 2^32 pipes, queues, tag values and so on, and use 0 as a tablearg.
 */
#define	IPFW_ARG_MIN		1
#define	IPFW_ARG_MAX		65534
#define IP_FW_TABLEARG		65535	/* XXX should use 0 */

 /*
 * Number of entries in the call stack of the call/return commands.
 * Call stack currently is an uint16_t array with rule numbers.
 */
#define	IPFW_CALLSTACK_SIZE	16

/* IP_FW3 header/opcodes */
/*
 * Fixed-size header: one 16-bit opcode followed by three reserved
 * 16-bit words, 8 bytes in total.  ipfw_xtable (later in this file)
 * embeds it as its first member.
 */
typedef struct _ip_fw3_opheader {
	uint16_t opcode;	/* Operation opcode */
	uint16_t reserved[3];	/* Align to 64-bit boundary */
} ip_fw3_opheader;


/* IPFW extended tables support XXX what namespace ? */
#define	IP_FW_TABLE_XADD	86	/* add entry */
#define	IP_FW_TABLE_XDEL	87	/* delete entry */
#define	IP_FW_TABLE_XGETSIZE	88	/* get table size */
#define	IP_FW_TABLE_XLIST	89	/* list table contents */

/*
 * The kernel representation of ipfw rules is made of a list of
 * 'instructions' (for all practical purposes equivalent to BPF
 * instructions), which specify which fields of the packet
 * (or its metadata) should be analysed.
 *
 * Each instruction is stored in a structure which begins with
 * "ipfw_insn", and can contain extra fields depending on the
 * instruction type (listed below).
 * Note that the code is written so that individual instructions
 * have a size which is a multiple of 32 bits. This means that, if
 * such structures contain pointers or other 64-bit entities,
 * (there is just one instance now) they may end up unaligned on
 * 64-bit architectures, so they must be handled with care.
 *
 * "enum ipfw_opcodes" are the opcodes supported. We can have up
 * to 256 different opcodes. When adding new opcodes, they should
 * be appended to the end of the opcode list before O_LAST_OPCODE,
 * this will prevent the ABI from being broken, otherwise users
 * will have to recompile ipfw(8) when they update the kernel.
 */

enum ipfw_opcodes {		/* arguments (4 byte each)	*/
	O_NOP,

	O_IP_SRC,		/* u32 = IP			*/
	O_IP_SRC_MASK,		/* ip = IP/mask			*/
	O_IP_SRC_ME,		/* none				*/
	O_IP_SRC_SET,		/* u32=base, arg1=len, bitmap	*/

	O_IP_DST,		/* u32 = IP			*/
	O_IP_DST_MASK,		/* ip = IP/mask			*/
	O_IP_DST_ME,		/* none				*/
	O_IP_DST_SET,		/* u32=base, arg1=len, bitmap	*/

	O_IP_SRCPORT,		/* (n)port list:mask 4 byte ea	*/
	O_IP_DSTPORT,		/* (n)port list:mask 4 byte ea	*/
	O_PROTO,		/* arg1=protocol		*/

	O_MACADDR2,		/* 2 mac addr:mask		*/
	O_MAC_TYPE,		/* same as srcport		*/

	O_LAYER2,		/* none				*/
	O_IN,			/* none				*/
	O_FRAG,			/* none				*/

	O_RECV,			/* none				*/
	O_XMIT,			/* none				*/
	O_VIA,			/* none				*/

	O_IPOPT,		/* arg1 = 2*u8 bitmap		*/
	O_IPLEN,		/* arg1 = len			*/
	O_IPID,			/* arg1 = id			*/

	O_IPTOS,		/* arg1 = id			*/
	O_IPPRECEDENCE,		/* arg1 = precedence << 5	*/
	O_IPTTL,		/* arg1 = TTL			*/

	O_IPVER,		/* arg1 = version		*/
	O_UID,			/* u32 = id			*/
	O_GID,			/* u32 = id			*/
	O_ESTAB,		/* none (tcp established)	*/
	O_TCPFLAGS,		/* arg1 = 2*u8 bitmap		*/
	O_TCPWIN,		/* arg1 = desired win		*/
	O_TCPSEQ,		/* u32 = desired seq.		*/
	O_TCPACK,		/* u32 = desired seq.		*/
	O_ICMPTYPE,		/* u32 = icmp bitmap		*/
	O_TCPOPTS,		/* arg1 = 2*u8 bitmap		*/

	O_VERREVPATH,		/* none				*/
	O_VERSRCREACH,		/* none				*/

	O_PROBE_STATE,		/* none				*/
	O_KEEP_STATE,		/* none				*/
	O_LIMIT,		/* ipfw_insn_limit		*/
	O_LIMIT_PARENT,		/* dyn_type, not an opcode.	*/

	/*
	 * These are really 'actions'.
	 */

	O_LOG,			/* ipfw_insn_log		*/
	O_PROB,			/* u32 = match probability	*/

	O_CHECK_STATE,		/* none				*/
	O_ACCEPT,		/* none				*/
	O_DENY,			/* none 			*/
	O_REJECT,		/* arg1=icmp arg (same as deny)	*/
	O_COUNT,		/* none				*/
	O_SKIPTO,		/* arg1=next rule number	*/
	O_PIPE,			/* arg1=pipe number		*/
	O_QUEUE,		/* arg1=queue number		*/
	O_DIVERT,		/* arg1=port number		*/
	O_TEE,			/* arg1=port number		*/
	O_FORWARD_IP,		/* fwd sockaddr			*/
	O_FORWARD_MAC,		/* fwd mac			*/
	O_NAT,                  /* nope                         */
	O_REASS,                /* none                         */
	
	/*
	 * More opcodes.
	 */
	O_IPSEC,		/* has ipsec history 		*/
	O_IP_SRC_LOOKUP,	/* arg1=table number, u32=value	*/
	O_IP_DST_LOOKUP,	/* arg1=table number, u32=value	*/
	O_ANTISPOOF,		/* none				*/
	O_JAIL,			/* u32 = id			*/
	O_ALTQ,			/* u32 = altq classif. qid	*/
	O_DIVERTED,		/* arg1=bitmap (1:loop, 2:out)	*/
	O_TCPDATALEN,		/* arg1 = tcp data len		*/
	O_IP6_SRC,		/* address without mask		*/
	O_IP6_SRC_ME,		/* my addresses			*/
	O_IP6_SRC_MASK,		/* address with the mask	*/
	O_IP6_DST,
	O_IP6_DST_ME,
	O_IP6_DST_MASK,
	O_FLOW6ID,		/* for flow id tag in the ipv6 pkt */
	O_ICMP6TYPE,		/* icmp6 packet type filtering	*/
	O_EXT_HDR,		/* filtering for ipv6 extension header */
	O_IP6,

	/*
	 * actions for ng_ipfw
	 */
	O_NETGRAPH,		/* send to ng_ipfw		*/
	O_NGTEE,		/* copy to ng_ipfw		*/

	O_IP4,

	O_UNREACH6,		/* arg1=icmpv6 code arg (deny)  */

	O_TAG,   		/* arg1=tag number */
	O_TAGGED,		/* arg1=tag number */

	O_SETFIB,		/* arg1=FIB number */
	O_FIB,			/* arg1=FIB desired fib number */

	O_SOCKARG,		/* socket argument */

	O_CALLRETURN,		/* arg1=called rule number */

	O_FORWARD_IP6,		/* fwd sockaddr_in6             */

	O_LAST_OPCODE		/* not an opcode!		*/
};


/*
 * The extension header are filtered only for presence using a bit
 * vector with a flag for each header.
 */
#define EXT_FRAGMENT	0x1
#define EXT_HOPOPTS	0x2
#define EXT_ROUTING	0x4
#define EXT_AH		0x8
#define EXT_ESP		0x10
#define EXT_DSTOPTS	0x20
#define EXT_RTHDR0		0x40
#define EXT_RTHDR2		0x80

/*
 * Template for instructions.
 *
 * ipfw_insn is used for all instructions which require no operands,
 * a single 16-bit value (arg1), or a couple of 8-bit values.
 *
 * For other instructions which require different/larger arguments
 * we have derived structures, ipfw_insn_*.
 *
 * The size of the instruction (in 32-bit words) is in the low
 * 6 bits of "len". The 2 remaining bits are used to implement
 * NOT and OR on individual instructions. Given a type, you can
 * compute the length to be put in "len" using F_INSN_SIZE(t)
 *
 * F_NOT	negates the match result of the instruction.
 *
 * F_OR		is used to build or blocks. By default, instructions
 *		are evaluated as part of a logical AND. An "or" block
 *		{ X or Y or Z } contains F_OR set in all but the last
 *		instruction of the block. A match will cause the code
 *		to skip past the last instruction of the block.
 *
 * NOTA BENE: in a couple of places we assume that
 *	sizeof(ipfw_insn) == sizeof(u_int32_t)
 * this needs to be fixed.
 *
 */
/*
 * Basic 4-byte instruction: opcode, length/flags byte and one 16-bit
 * argument.  Every derived ipfw_insn_* structure starts with it.
 */
typedef struct	_ipfw_insn {	/* template for instructions */
	u_int8_t 	opcode;	/* one of enum ipfw_opcodes */
	u_int8_t	len;	/* number of 32-bit words */
#define	F_NOT		0x80	/* top bit of len: negate the match result */
#define	F_OR		0x40	/* next bit of len: member of an "or" block */
#define	F_LEN_MASK	0x3f	/* low 6 bits of len: size in 32-bit words */
#define	F_LEN(cmd)	((cmd)->len & F_LEN_MASK) /* size with flags stripped */

	u_int16_t	arg1;	/* opcode-specific 16-bit argument */
} ipfw_insn;

/*
 * The F_INSN_SIZE(type) computes the size, in 4-byte words, of
 * a given type.  The division is an integer one, so a type whose size
 * is not a multiple of 4 bytes is rounded down.
 */
#define	F_INSN_SIZE(t)	((sizeof (t))/sizeof(u_int32_t))

/*
 * This is used to store an array of 16-bit entries (ports etc.)
 * The array follows the common ipfw_insn header; two entries are
 * declared here, but an instruction may carry more of them.
 */
typedef struct	_ipfw_insn_u16 {
	ipfw_insn o;		/* common instruction header */
	u_int16_t ports[2];	/* there may be more */
} ipfw_insn_u16;

/*
 * This is used to store an array of 32-bit entries
 * (uid, single IPv4 addresses etc.)
 * One entry is declared; an instruction may carry more of them.
 */
typedef struct	_ipfw_insn_u32 {
	ipfw_insn o;		/* common instruction header */
	u_int32_t d[1];	/* one or more */
} ipfw_insn_u32;

/*
 * This is used to store IP addr-mask pairs (IPv4, struct in_addr).
 */
typedef struct	_ipfw_insn_ip {
	ipfw_insn o;		/* common instruction header */
	struct in_addr	addr;	/* address */
	struct in_addr	mask;	/* mask paired with addr */
} ipfw_insn_ip;

/*
 * This is used to forward to a given address (ip).
 * The destination is carried as a full struct sockaddr_in.
 */
typedef struct  _ipfw_insn_sa {
	ipfw_insn o;		/* common instruction header */
	struct sockaddr_in sa;	/* forwarding destination */
} ipfw_insn_sa;

/*
 * This is used to forward to a given address (ipv6).
 * The destination is carried as a full struct sockaddr_in6.
 */
typedef struct _ipfw_insn_sa6 {
	ipfw_insn o;		/* common instruction header */
	struct sockaddr_in6 sa;	/* forwarding destination */
} ipfw_insn_sa6;

/*
 * This is used for MAC addr-mask pairs.
 * Both arrays hold two 6-byte addresses back to back: destination
 * first, then source.
 */
typedef struct	_ipfw_insn_mac {
	ipfw_insn o;		/* common instruction header */
	u_char addr[12];	/* dst[6] + src[6] */
	u_char mask[12];	/* dst[6] + src[6] */
} ipfw_insn_mac;

/*
 * This is used for interface match rules (recv xx, xmit xx).
 * The union 'p' overlays an IPv4 address with an int flag; which member
 * is valid is decided by the code using the instruction (presumably
 * depending on whether 'name' is set -- TODO confirm against the matcher).
 */
typedef struct	_ipfw_insn_if {
	ipfw_insn o;		/* common instruction header */
	union {
		struct in_addr ip;	/* interface address */
		int glob;		/* NOTE(review): looks like a "name is a glob pattern" flag -- confirm */
	} p;
	char name[IFNAMSIZ];	/* interface name */
} ipfw_insn_if;

/*
 * This is used for storing an altq queue id number
 * (operand of the O_ALTQ opcode, see enum ipfw_opcodes).
 */
typedef struct _ipfw_insn_altq {
	ipfw_insn	o;	/* common instruction header */
	u_int32_t	qid;	/* altq queue id */
} ipfw_insn_altq;

/*
 * This is used for limit rules (operand of the O_LIMIT opcode).
 * The payload after the header is exactly 32 bits wide:
 * 8 bits of padding, an 8-bit mask and a 16-bit limit.
 */
typedef struct	_ipfw_insn_limit {
	ipfw_insn o;		/* common instruction header */
	u_int8_t _pad;		/* unused, keeps the following fields aligned */
	u_int8_t limit_mask;	/* combination of DYN_* below	*/
#define	DYN_SRC_ADDR	0x1
#define	DYN_SRC_PORT	0x2
#define	DYN_DST_ADDR	0x4
#define	DYN_DST_PORT	0x8

	u_int16_t conn_limit;	/* the limit itself (presumably max connections -- confirm) */
} ipfw_insn_limit;

/*
 * This is used for log instructions (operand of the O_LOG opcode).
 * It carries a configured maximum and a running "remaining" counter.
 */
typedef struct  _ipfw_insn_log {
        ipfw_insn o;		/* common instruction header */
	u_int32_t max_log;	/* how many do we log -- 0 = all */
	u_int32_t log_left;	/* how many left to log 	*/
} ipfw_insn_log;

/*
 * Data structures required by both ipfw(8) and ipfw(4) but not part of the
 * management API are protected by IPFW_INTERNAL.
 */
#ifdef IPFW_INTERNAL
/*
 * Server pool support (LSNAT).
 * One address/port pair; entries are chained through _next and hang off
 * the spool_chain list head of struct cfg_redir.
 */
struct cfg_spool {
	LIST_ENTRY(cfg_spool)   _next;          /* chain of spool instances */
	struct in_addr          addr;           /* server address */
	u_short                 port;           /* server port */
};
#endif

/* Redirect modes id. */
#define REDIR_ADDR      0x01
#define REDIR_PORT      0x02
#define REDIR_PROTO     0x04

#ifdef IPFW_INTERNAL
/*
 * Nat redirect configuration.
 * One redirect entry; entries are chained through _next and hang off
 * the redir_chain list head of struct cfg_nat.  'mode' takes the
 * REDIR_* values defined above.
 */
struct cfg_redir {
	LIST_ENTRY(cfg_redir)   _next;          /* chain of redir instances */
	u_int16_t               mode;           /* type of redirect mode */
	struct in_addr	        laddr;          /* local ip address */
	struct in_addr	        paddr;          /* public ip address */
	struct in_addr	        raddr;          /* remote ip address */
	u_short                 lport;          /* local port */
	u_short                 pport;          /* public port */
	u_short                 rport;          /* remote port  */
	u_short                 pport_cnt;      /* number of public ports */
	u_short                 rport_cnt;      /* number of remote ports */
	int                     proto;          /* protocol: tcp/udp */
	struct alias_link       **alink;	/* array of alias_link pointers (owner not visible here) */
	/* number of entries in the spool chain */
	u_int16_t               spool_cnt;      
	/* chain of spool instances */
	LIST_HEAD(spool_chain, cfg_spool) spool_chain;
};
#endif

#define NAT_BUF_LEN     1024

#ifdef IPFW_INTERNAL
/*
 * Nat configuration data struct.
 * One nat instance, holding the list of its redirect entries.
 */
struct cfg_nat {
	/* chain of nat instances */
	LIST_ENTRY(cfg_nat)     _next;
	int                     id;                     /* nat id */
	struct in_addr          ip;                     /* nat ip address */
	char                    if_name[IF_NAMESIZE];   /* interface name */
	int                     mode;                   /* aliasing mode */
	struct libalias	        *lib;                   /* libalias instance */
	/* number of entries in the redir chain */
	int                     redir_cnt;              
	/* chain of redir instances */
	LIST_HEAD(redir_chain, cfg_redir) redir_chain;  
};
#endif

#define SOF_NAT         sizeof(struct cfg_nat)
#define SOF_REDIR       sizeof(struct cfg_redir)
#define SOF_SPOOL       sizeof(struct cfg_spool)

/*
 * Nat command.
 * NOTE(review): this instruction embeds a pointer, so on 64-bit
 * architectures it may end up unaligned inside a rule (see the note
 * on instruction alignment near the top of this file).
 */
typedef struct	_ipfw_insn_nat {
 	ipfw_insn	o;	/* common instruction header */
 	struct cfg_nat *nat;	/* nat instance this instruction refers to */
} ipfw_insn_nat;

/*
 * Apply ipv6 mask on ipv6 addr: AND each of the four 32-bit words of
 * *addr, in place, with the corresponding word of *mask.
 *
 * Both arguments are pointers to a struct in6_addr laid out with the
 * __u6_addr.__u6_addr32[] member, and are evaluated several times.
 * The body is wrapped in do { } while (0) so that the macro expands to
 * a single statement and is safe inside an unbraced if/else; callers
 * must terminate the invocation with a semicolon.
 */
#define APPLY_MASK(addr,mask)	do {					\
    (addr)->__u6_addr.__u6_addr32[0] &= (mask)->__u6_addr.__u6_addr32[0]; \
    (addr)->__u6_addr.__u6_addr32[1] &= (mask)->__u6_addr.__u6_addr32[1]; \
    (addr)->__u6_addr.__u6_addr32[2] &= (mask)->__u6_addr.__u6_addr32[2]; \
    (addr)->__u6_addr.__u6_addr32[3] &= (mask)->__u6_addr.__u6_addr32[3]; \
} while (0)

/*
 * Structure for ipv6: an address/mask pair, the IPv6 counterpart
 * of ipfw_insn_ip.
 */
typedef struct _ipfw_insn_ip6 {
       ipfw_insn o;		/* common instruction header */
       struct in6_addr addr6;	/* address */
       struct in6_addr mask6;	/* mask paired with addr6 */
} ipfw_insn_ip6;

/*
 * Used to support icmp6 types: d[] provides 7 * 32 = 224 bits
 * (presumably one bit per ICMPv6 type -- confirm against the matcher).
 */
typedef struct _ipfw_insn_icmp6 {
       ipfw_insn o;
       uint32_t d[7]; /* XXX This number is related to the netinet/icmp6.h
                       *     define ICMP6_MAXTYPE
                       *     as follows: n = ICMP6_MAXTYPE/32 + 1
                       *     ICMP6_MAXTYPE is currently 203, hence n = 7
                       */
} ipfw_insn_icmp6;

/*
 * Here we have the structure representing an ipfw rule.
 *
 * It starts with a general area (with link fields and counters)
 * followed by an array of one or more instructions, which the code
 * accesses as an array of 32-bit values.
 *
 * Given a rule pointer  r:
 *
 *  r->cmd		is the start of the first instruction.
 *  ACTION_PTR(r)	is the start of the first action (things to do
 *			once a rule matched).
 *
 * When assembling instruction, remember the following:
 *
 *  + if a rule has a "keep-state" (or "limit") option, then the
 *	first instruction (at r->cmd) MUST BE an O_PROBE_STATE
 *  + if a rule has a "log" option, then the first action
 *	(at ACTION_PTR(r)) MUST be O_LOG
 *  + if a rule has an "altq" option, it comes after "log"
 *  + if a rule has an O_TAG option, it comes after "log" and "altq"
 *
 * NOTE: we use a simple linked list of rules because we never need
 * 	to delete a rule without scanning the list. We do not use
 *	queue(3) macros for portability and readability.
 */

/*
 * One ipfw rule: a fixed header (links, sizes, counters) followed by
 * cmd_len 32-bit words of instructions starting at cmd[].
 * When _X64EMU is defined, a 32-bit pad is inserted before each of the
 * two pointers (presumably to mimic a 64-bit layout on a 32-bit build
 * -- TODO confirm).
 */
struct ip_fw {
#ifdef _X64EMU
		int32_t pad1;
#endif
	struct ip_fw	*x_next;	/* linked list of rules		*/
#ifdef _X64EMU
		int32_t pad2;
#endif
	struct ip_fw	*next_rule;	/* ptr to next [skipto] rule	*/
	/* 'next_rule' is used to pass up 'set_disable' status		*/

	uint16_t	act_ofs;	/* offset of action in 32-bit units */
	uint16_t	cmd_len;	/* # of 32-bit words in cmd	*/
	uint16_t	rulenum;	/* rule number			*/
	uint8_t	set;		/* rule set (0..31)		*/
#define	RESVD_SET	31	/* set for default and persistent rules */
	uint8_t		_pad;		/* padding			*/
	uint32_t	id;		/* rule id */

	/* These fields are present in all rules.			*/
	uint64_t	pcnt;		/* Packet counter		*/
	uint64_t	bcnt;		/* Byte counter			*/
	uint32_t	timestamp;	/* tv_sec of last match		*/

	/* Only the first word is declared; see RULESIZE() for the real size. */
	ipfw_insn	cmd[1];		/* storage for commands		*/
};

/*
 * Pointer to the first action instruction of 'rule': the start of
 * rule->cmd advanced by rule->act_ofs 32-bit words.
 * The whole expansion is parenthesized so that the result can be used
 * directly in a larger expression, e.g. ACTION_PTR(r)->opcode.
 * 'rule' is evaluated twice.
 */
#define ACTION_PTR(rule)				\
	((ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) ))

/*
 * Size in bytes of a rule: the fixed struct ip_fw plus cmd_len 32-bit
 * instruction words.  4 is subtracted because struct ip_fw already
 * declares one instruction word (cmd[1]); this relies on
 * sizeof(ipfw_insn) being 4.
 */
#define RULESIZE(rule)  (sizeof(struct ip_fw) + \
	((struct ip_fw *)(rule))->cmd_len * 4 - 4)

#if 1 // should be moved to in.h
/*
 * This structure is used as a flow mask and a flow id for various
 * parts of the code.
 * addr_type is used in userland and kernel to mark the address type.
 * fib is used in the kernel to record the fib in use.
 * _flags is used in the kernel to store tcp flags for dynamic rules.
 * IPv4 and IPv6 addresses live in separate fields; IS_IP6_FLOW_ID()
 * tests addr_type == 6 to tell which ones apply.
 */
struct ipfw_flow_id {
	uint32_t	dst_ip;		/* IPv4 destination address */
	uint32_t	src_ip;		/* IPv4 source address */
	uint16_t	dst_port;
	uint16_t	src_port;
	uint8_t		fib;
	uint8_t		proto;
	uint8_t		_flags;	/* protocol-specific flags */
	uint8_t		addr_type; /* 4=ip4, 6=ip6, 1=ether ? */
	struct in6_addr dst_ip6;	/* IPv6 destination address */
	struct in6_addr src_ip6;	/* IPv6 source address */
	uint32_t	flow_id6;
	uint32_t	extra; /* queue/pipe or frag_id */
};
#endif

#define IS_IP6_FLOW_ID(id)	((id)->addr_type == 6)

/*
 * Dynamic ipfw rule.
 * Entries are singly linked through 'next' and refer back to the
 * static rule they belong to ('rule') and to a parent dynamic rule.
 */
typedef struct _ipfw_dyn_rule ipfw_dyn_rule;

struct _ipfw_dyn_rule {
	ipfw_dyn_rule	*next;		/* linked list of rules.	*/
	struct ip_fw *rule;		/* pointer to rule		*/
	/* 'rule' is used to pass up the rule number (from the parent)	*/

	ipfw_dyn_rule *parent;		/* pointer to parent rule	*/
	u_int64_t	pcnt;		/* packet match counter		*/
	u_int64_t	bcnt;		/* byte match counter		*/
	struct ipfw_flow_id id;		/* (masked) flow id		*/
	u_int32_t	expire;		/* expire time			*/
	u_int32_t	bucket;		/* which bucket in hash table	*/
	u_int32_t	state;		/* state of this rule (typically a
					 * combination of TCP flags)
					 */
	u_int32_t	ack_fwd;	/* most recent ACKs in forward	*/
	u_int32_t	ack_rev;	/* and reverse directions (used	*/
					/* to generate keepalives)	*/
	u_int16_t	dyn_type;	/* rule type			*/
	u_int16_t	count;		/* refcount			*/
};

/*
 * Definitions for IP option names.
 */
#define	IP_FW_IPOPT_LSRR	0x01
#define	IP_FW_IPOPT_SSRR	0x02
#define	IP_FW_IPOPT_RR		0x04
#define	IP_FW_IPOPT_TS		0x08

/*
 * Definitions for TCP option names.
 */
#define	IP_FW_TCPOPT_MSS	0x01
#define	IP_FW_TCPOPT_WINDOW	0x02
#define	IP_FW_TCPOPT_SACK	0x04
#define	IP_FW_TCPOPT_TS		0x08
#define	IP_FW_TCPOPT_CC		0x10

#define	ICMP_REJECT_RST		0x100	/* fake ICMP code (send a TCP RST) */
#define	ICMP6_UNREACH_RST	0x100	/* fake ICMPv6 code (send a TCP RST) */

/*
 * These are used for lookup tables.
 */

#define	IPFW_TABLE_CIDR		1	/* Table for holding IPv4/IPv6 prefixes */
#define	IPFW_TABLE_INTERFACE	2	/* Table for holding interface names */
#define	IPFW_TABLE_MAXTYPE	2	/* Maximum valid number */

/*
 * One entry of a lookup table: an IPv4 prefix (addr/masklen) with its
 * associated 32-bit value.  Used as the element type of ipfw_table.ent[].
 */
typedef struct	_ipfw_table_entry {
	in_addr_t	addr;		/* network address		*/
	u_int32_t	value;		/* value			*/
	u_int16_t	tbl;		/* table number			*/
	u_int8_t	masklen;	/* mask length			*/
} ipfw_table_entry;

/*
 * One entry of an extended lookup table.  The key is either an IPv6
 * address or an interface name, overlaid in the union 'k'.
 * NOTE(review): 'type' presumably takes the IPFW_TABLE_* values
 * defined above and selects the valid union member -- confirm.
 */
typedef struct _ipfw_table_xentry {
	uint16_t	len;		/* Total entry length		*/
	uint8_t		type;		/* entry type			*/
	uint8_t		masklen;	/* mask length			*/
	uint16_t	tbl;		/* table number			*/
	uint32_t	value;		/* value			*/
	union {
		/* Longest field needs to be aligned by 4-byte boundary */
		struct	in6_addr addr6;	/* IPv6 address			*/
		char	iface[IF_NAMESIZE];     /* interface name	*/
	} k;
} ipfw_table_xentry;

/*
 * Header of a table dump: sizes and table number, followed by a
 * variable number of ipfw_table_entry records (zero-length array).
 */
typedef struct	_ipfw_table {
	u_int32_t	size;		/* size of entries in bytes	*/
	u_int32_t	cnt;		/* # of entries			*/
	u_int16_t	tbl;		/* table number			*/
	ipfw_table_entry ent[0];	/* entries			*/
} ipfw_table;

/*
 * Extended-table counterpart of ipfw_table: an IP_FW3 operation header,
 * sizes, table number and type, followed by a variable number of
 * ipfw_table_xentry records (zero-length array).
 */
typedef struct _ipfw_xtable {
	ip_fw3_opheader opheader;	/* eXtended tables are controlled via IP_FW3 */
	uint32_t	size;		/* size of entries in bytes	*/
	uint32_t	cnt;		/* # of entries			*/
	uint16_t	tbl;		/* table number			*/
	uint8_t		type;		/* table type			*/
	ipfw_table_xentry xent[0];	/* entries			*/
} ipfw_xtable;

#endif /* _IPFW2_H */


================================================
FILE: sys/netinet/ip_icmp.h
================================================
/*
 * additional define not present in linux
 * should go in glue.h
 */
#ifndef _NETINET_IP_ICMP_H_
#define _NETINET_IP_ICMP_H_

/* highest ICMP type number assumed by this code */
#define ICMP_MAXTYPE            40      /* defined as 18 in compat.h */
/* ICMP message types */
#define ICMP_ROUTERSOLICIT      10              /* router solicitation */
#define ICMP_TSTAMP             13              /* timestamp request */
#define ICMP_IREQ               15              /* information request */
#define ICMP_MASKREQ            17              /* address mask request */
/* code for the ICMP_UNREACH type defined just below */
#define         ICMP_UNREACH_HOST       1               /* bad host */

#define ICMP_UNREACH            3               /* dest unreachable, codes: */

#endif /* _NETINET_IP_ICMP_H_ */


================================================
FILE: sys/netinet/ipfw/dn_heap.c
================================================
/*-
 * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Binary heap and hash tables, used in dummynet
 *
 * $Id: dn_heap.c 11480 2012-07-31 08:02:00Z luigi $
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#ifdef _KERNEL
__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/dn_heap.c 203279 2010-01-31 12:20:29Z luigi $");
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <netinet/ipfw/dn_heap.h>
#ifndef log
#define log(x, arg...)
#endif

#else /* !_KERNEL */

#include <stdio.h>
#include <dn_test.h>
#include <strings.h>
#include <stdlib.h>

#include  "dn_heap.h"
#define log(x, arg...)	fprintf(stderr, ## arg)
#define panic(x...)	fprintf(stderr, ## x), exit(1)
#define MALLOC_DEFINE(a, b, c)
static void *my_malloc(int s) {	return malloc(s); }
static void my_free(void *p) {	free(p); }
#define malloc(s, t, w)	my_malloc(s)
#define free(p, t)	my_free(p)
#endif /* !_KERNEL */

MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap");

/*
 * Heap management functions.
 *
 * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
 * Some macros help finding parent/children so we can optimize them.
 *
 * heap_resize() is called to expand the heap when needed; the
 * requested size is rounded up to the next power of two, minus 1.
 * Returns 1 on error, 0 on success
 */
/* Index of the parent of node x (x > 0). */
#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
/* Index of the left child of node x; the right child is the next index. */
#define HEAP_LEFT(x) ( (x)+(x) + 1 )
/*
 * Exchange a and b using 'buffer' as scratch storage (all three must be
 * assignable and of the same type).  Wrapped in do { } while (0) so the
 * macro is a single statement and can be followed by 'else'.
 */
#define	HEAP_SWAP(a, b, buffer) do {				\
	(buffer) = (a); (a) = (b); (b) = (buffer);		\
} while (0)
/* Rounding mask, only used by the disabled branch of heap_resize(). */
#define HEAP_INCREMENT	15

/*
 * Make sure the entry array of heap 'h' can hold at least 'new_size'
 * entries, reallocating and copying the existing entries if needed.
 *
 * The array never shrinks.  When it grows, the requested size is
 * rounded up to the next power of two minus 1 (all bits below the most
 * significant set bit are turned on).
 *
 * Returns 0 on success (including "already large enough"), 1 if the
 * allocation failed, in which case the heap is left untouched.
 */
static int
heap_resize(struct dn_heap *h, unsigned int new_size)
{
	struct dn_heap_entry *p;

	if (h->size >= new_size )	/* have enough room */
		return 0;
#if 1  /* round to the next power of 2, minus 1 */
	new_size |= new_size >> 1;
	new_size |= new_size >> 2;
	new_size |= new_size >> 4;
	new_size |= new_size >> 8;
	new_size |= new_size >> 16;
#else
	new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT;
#endif
	p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT);
	if (p == NULL) {
		/* new_size is unsigned, so print it with %u */
		printf("--- %s, resize %u failed\n", __func__, new_size );
		return 1; /* error */
	}
	if (h->size > 0) {
		/* carry over the old contents, then drop the old array */
		bcopy(h->p, p, h->size * sizeof(*p) );
		free(h->p, M_DN_HEAP);
	}
	h->p = p;
	h->size = new_size;
	return 0;
}

/*
 * Prepare heap 'h' for use with room for at least 'size' entries.
 * 'ofs' is recorded as the byte offset, inside each stored object,
 * at which the object's heap index is kept (see SET_OFFSET).
 * Returns 0 on success, 1 if the entry array could not be allocated.
 */
int
heap_init(struct dn_heap *h, int size, int ofs)
{
	/* Give up before touching any field if the array cannot grow. */
	if (heap_resize(h, size) != 0)
		return 1;

	h->ofs = ofs;
	h->elements = 0;
	return 0;
}

/*
 * Insert element in heap. Normally, p != NULL, we insert p in
 * a new position and bubble up. If p == NULL, then the element is
 * already in place, and key is the position where to start the
 * bubble-up.
 * Returns 1 on failure (cannot allocate new heap entry)
 *
 * If ofs > 0 the position (index, int) of the element in the heap is
 * also stored in the element itself at the given offset in bytes.
 */
/*
 * Record index i inside the object stored at h->p[i], as an int32_t
 * located h->ofs bytes into the object.  Does nothing when h->ofs <= 0.
 * Arguments are parenthesized so that any pointer/index expression can
 * be passed; both are evaluated more than once.
 */
#define SET_OFFSET(h, i) do {					\
	if ((h)->ofs > 0)					\
	    *((int32_t *)((char *)((h)->p[(i)].object) + (h)->ofs)) = (i);	\
	} while (0)
/*
 * RESET_OFFSET is used for sanity checks. It sets ofs
 * to an invalid value.
 */
/*
 * Overwrite the index stored inside the object at h->p[i] with the
 * invalid value -16.  Does nothing when h->ofs <= 0.
 * Arguments are parenthesized so that any pointer/index expression can
 * be passed; h is evaluated more than once.
 */
#define RESET_OFFSET(h, i) do {					\
	if ((h)->ofs > 0)					\
	    *((int32_t *)((char *)((h)->p[(i)].object) + (h)->ofs)) = -16;	\
	} while (0)

int
heap_insert(struct dn_heap *h, uint64_t key1, void *p)
{
	int son = h->elements;

	//log("%s key %llu p %p\n", __FUNCTION__, key1, p);
	if (p == NULL) { /* data already there, set starting point */
		son = key1;
	} else { /* insert new element at the end, possibly resize */
		son = h->elements;
		if (son == h->size) /* need resize... */
			// XXX expand by 16 or so
			if (heap_resize(h, h->elements+16) )
				return 1; /* failure... */
		h->p[son].object = p;
		h->p[son].key = key1;
		h->elements++;
	}
	/* make sure that son >= father along the path */
	while (son > 0) {
		int father = HEAP_FATHER(son);
		struct dn_heap_entry tmp;

		if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
			break; /* found right position */
		/* son smaller than father, swap and repeat */
		HEAP_SWAP(h->p[son], h->p[father], tmp);
		SET_OFFSET(h, son);
		son = father;
	}
	SET_OFFSET(h, son);
	return 0;
}

/*
 * remove top element from heap, or obj if obj != NULL
 */
/*
 * Remove one entry from heap 'h'.
 *
 * With obj == NULL the root (index 0) is removed.  Otherwise the entry
 * to remove is located through the index stored inside 'obj' at byte
 * offset h->ofs; this requires h->ofs > 0 and a stored index within
 * 0..h->elements-1, else panic() is called.
 * On an empty heap a message is printed and nothing else happens.
 */
void
heap_extract(struct dn_heap *h, void *obj)
{
	/* max is the index of the last entry before the removal */
	int child, father, max = h->elements - 1;

	if (max < 0) {
		printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h);
		return;
	}
	if (obj == NULL)
		father = 0; /* default: move up smallest child */
	else { /* extract specific element, index is at offset */
		if (h->ofs <= 0)
			panic("%s: extract from middle not set on %p\n",
				__FUNCTION__, h);
		father = *((int *)((char *)obj + h->ofs));
		if (father < 0 || father >= h->elements) {
			panic("%s: father %d out of bound 0..%d\n",
				__FUNCTION__, father, h->elements);
		}
	}
	/*
	 * below, father is the index of the empty element, which
	 * we replace at each step with the smallest child until we
	 * reach the bottom level.
	 */
	// XXX why removing RESET_OFFSET increases runtime by 10% ?
	RESET_OFFSET(h, father);	/* invalidate the index stored in the removed object */
	while ( (child = HEAP_LEFT(father)) <= max ) {
		if (child != max &&
		    DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
			child++; /* take right child, otherwise left */
		h->p[father] = h->p[child];
		SET_OFFSET(h, father);
		father = child;
	}
	h->elements--;
	/* the hole is now at a leaf; nothing more to do if it is the last slot */
	if (father != max) {
		/*
		 * Fill hole with last entry and bubble up,
		 * reusing the insert code
		 */
		h->p[father] = h->p[max];
		heap_insert(h, father, NULL);
	}
}

#if 0
/*
 * change object position and update references
 * XXX this one is never used!
 */
/*
 * Change the key of 'object' to new_key and restore the heap order.
 * The current position of the object is read from the index stored
 * inside it at byte offset h->ofs, so h->ofs must be > 0.
 * The entry is moved up when the new key is smaller than the old one,
 * down otherwise; the stored index of every moved entry is refreshed.
 * (Currently compiled out by the surrounding #if 0.)
 */
static void
heap_move(struct dn_heap *h, uint64_t new_key, void *object)
{
	int temp, i, max = h->elements-1;
	struct dn_heap_entry *p, buf;

	if (h->ofs <= 0)
		panic("cannot move items on this heap");
	p = h->p;	/* shortcut */

	i = *((int *)((char *)object + h->ofs));
	if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */
		p[i].key = new_key;
		for (; i>0 &&
		    DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key);
		    i = temp ) { /* bubble up */
			HEAP_SWAP(p[i], p[temp], buf);
			SET_OFFSET(h, i);
		}
	} else {		/* must move down */
		p[i].key = new_key;
		while ( (temp = HEAP_LEFT(i)) <= max ) {
			/* found left child */
			if (temp != max &&
			    DN_KEY_LT(p[temp+1].key, p[temp].key))
				temp++; /* select child with min key */
			/* a stray '>' before p[temp] used to make this a syntax error */
			if (DN_KEY_LT(p[temp].key, new_key)) {
				/* go down */
				HEAP_SWAP(p[i], p[temp], buf);
				SET_OFFSET(h, i);
			} else
				break;
			i = temp;
		}
	}
	SET_OFFSET(h, i);
}
#endif /* heap_move, unused */

/*
 * heapify() will reorganize data inside an array to maintain the
 * heap property. It is needed when we delete a bunch of entries.
 */
/*
 * Rebuild the heap order over the first h->elements entries by
 * sifting up each entry in turn, from index 0 to the last one.
 */
static void
heapify(struct dn_heap *h)
{
	int idx = 0;

	while (idx < h->elements) {
		/* p == NULL: entry is already stored, just sift it up */
		heap_insert(h, idx, NULL);
		idx++;
	}
}

/*
 * Call fn(object, arg) on the entries of heap 'h', in array order.
 * The callback's return value is a bitmask:
 *   HEAP_SCAN_DEL  drop the entry just visited;
 *   HEAP_SCAN_END  stop scanning after this entry.
 * If at least one entry was dropped the heap order is rebuilt.
 * Returns the number of entries dropped.
 */
int
heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t),
	uintptr_t arg)
{
	int pos = 0, removed = 0;

	while (pos < h->elements) {
		int verdict = fn(h->p[pos].object, arg);

		if (verdict & HEAP_SCAN_DEL) {
			/*
			 * Move the last entry into this slot and stay
			 * here, so that the moved entry is visited too.
			 */
			h->elements--;
			h->p[pos] = h->p[h->elements];
			removed++;
		} else {
			pos++;
		}
		if (verdict & HEAP_SCAN_END)
			break;
	}
	if (removed)
		heapify(h);
	return removed;
}

/*
 * cleanup the heap and free data structure
 */
/*
 * Release the entry array of heap 'h' (if one was allocated) and zero
 * the whole descriptor.  The struct dn_heap itself is not freed, and
 * neither are the objects referenced by the entries.
 */
void
heap_free(struct dn_heap *h)
{
	/* size > 0 is the "array was allocated" test, same as in heap_resize() */
	if (h->size >0 )
		free(h->p, M_DN_HEAP);
	bzero(h, sizeof(*h) );
}

/*
 * hash table support.
 */

/*
 * Hash table descriptor.  Objects are linked through a pointer field
 * located 'ofs' bytes inside each object.  The three callbacks are
 * supplied to dn_ht_init(); their exact contracts are defined by the
 * users of the table.
 */
struct dn_ht {
        int buckets;            /* how many buckets, really buckets - 1*/
        int entries;            /* how many entries */
        int ofs;	        /* offset of link field */
        uint32_t (*hash)(uintptr_t, int, void *arg);	/* hash function */
        int (*match)(void *_el, uintptr_t key, int, void *);	/* element/key comparison */
        void *(*newh)(uintptr_t, int, void *);	/* optional constructor, may be NULL */
        void **ht;              /* bucket heads */
};
/*
 * Initialize, allocating bucket pointers inline.
 * Recycle previous record if possible.
 * If the 'newh' function is not supplied, we assume that the
 * key passed to ht_find is the same object to be stored in.
 *
 * ht      existing table to recycle, or NULL to allocate a new one;
 * buckets requested number of buckets, 1..65536;
 * ofs     byte offset of the link field inside stored objects;
 * h       hash callback (mandatory);
 * match   comparison callback (mandatory);
 * newh    constructor callback (optional).
 *
 * Returns the table, or NULL if a mandatory callback is missing,
 * 'buckets' is out of range, or the allocation failed.  An existing
 * table that is too small is freed and replaced by a new one.
 */
struct dn_ht *
dn_ht_init(struct dn_ht *ht, int buckets, int ofs,
        uint32_t (*h)(uintptr_t, int, void *),
        int (*match)(void *, uintptr_t, int, void *),
	void *(*newh)(uintptr_t, int, void *))
{
	int l;

	/*
	 * Notes about rounding the bucket count to a power of two.
	 * Given the requested count, we compute the nearest lower and
	 * higher powers of two, minus 1 (b_min and b_max respectively),
	 * because the value is used as a mask to AND with the index
	 * returned by the hash function.
	 * To choose between the two, the requested count is compared
	 * with b_min: if it is greater than 4/3 of b_min we round up to
	 * b_max, otherwise down to b_min.
	 * This ratio tries to round to the nearest power of two, while
	 * favouring the larger size when the gap between the two powers
	 * is relatively big.
	 * Using a power of two avoids a modulo operation when computing
	 * the bucket index: ht->buckets stores the bucket count - 1, so
	 * that a simple AND with the hash value is enough.
	 */
	int b_min; /* min buckets */
	int b_max; /* max buckets */
	int b_ori; /* original buckets */

	if (h == NULL || match == NULL) {
		printf("--- missing hash or match function");
		return NULL;
	}
	if (buckets < 1 || buckets > 65536)
		return NULL;

	b_ori = buckets;
	/* calculate next power of 2, - 1*/
	buckets |= buckets >> 1;
	buckets |= buckets >> 2;
	buckets |= buckets >> 4;
	buckets |= buckets >> 8;
	buckets |= buckets >> 16;

	b_max = buckets; /* Next power */
	b_min = buckets >> 1; /* Previous power */

	/* Calculate the 'nearest' bucket size */
	if (b_min * 4000 / 3000 < b_ori)
		buckets = b_max;
	else
		buckets = b_min;

	if (ht) {	/* see if we can reuse */
		if (buckets <= ht->buckets) {
			/* existing table is large enough, just shrink the mask */
			ht->buckets = buckets;
		} else {
			/* free pointers if not allocated inline */
			if (ht->ht != (void *)(ht + 1))
				free(ht->ht, M_DN_HEAP);
			free(ht, M_DN_HEAP);
			ht = NULL;
		}
	}
	if (ht == NULL) {
		/* Allocate buckets + 1 entries because 'buckets' is the
		 * mask ANDed with the index returned by the hash function,
		 * i.e. the real bucket count minus 1.
		 */
		l = sizeof(*ht) + (buckets + 1) * sizeof(void **);
		ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO);
	}
	if (ht) {
		/* bucket heads live right after the descriptor */
		ht->ht = (void **)(ht + 1);
		ht->buckets = buckets;
		ht->ofs = ofs;
		ht->hash = h;
		ht->match = match;
		ht->newh = newh;
	}
	return ht;
}

/*
 * Scan callback used by dn_ht_free(): both arguments are ignored and
 * every visited element is reported as "to be unlinked".
 */
static int
do_del(void *obj, void *arg)
{
	(void)obj;
	(void)arg;
	return DNHT_SCAN_DEL;
}

/*
 * Release a hash table, or only unlink its elements.
 *
 * With DNHT_REMOVE set in 'flags' all elements are unlinked through a
 * scan and the table itself is kept. Without it, the bucket array (when
 * it is not stored inline right after the descriptor) and the
 * descriptor are freed. A NULL table is ignored.
 */
void
dn_ht_free(struct dn_ht *ht, int flags)
{
	if (ht == NULL)
		return;

	if (flags & DNHT_REMOVE) {
		/* only unlink the objects, keep the table */
		(void)dn_ht_scan(ht, do_del, NULL);
		return;
	}

	/* the bucket array is a separate allocation only if not inline */
	if (ht->ht != NULL && ht->ht != (void *)(ht + 1))
		free(ht->ht, M_DN_HEAP);
	free(ht, M_DN_HEAP);
}

int
dn_ht_entries(struct dn_ht *ht)
{
	return ht ? ht->entries : 0;
}

/*
 * Helper function to scan a bucket in the hash table, it
 * can only be called on a non-empty bucket for a valid table.
 *
 * In lookup and scan, consider ht->ht[i] as pointing to the tail
 * of a circular list (the head is NEXT(tail)). The 'empty' value is
 * irrelevant.
 * While searching, start analysing p = head, end when p == tail.
 * Note that 'tail' is a cache of the _original_ ht->ht[i]
 * and is used to check for loop termination. If you remove
 * it, you must also adjust 'p' when deleting the 'tail' element.
 *
 * 'bucket' is an in-out parameter: it is incremented only when the
 * whole bucket has been visited; if the callback returns DNHT_SCAN_END
 * it is left unchanged, which is how callers detect the early exit.
 *
 * Returns the number of elements unlinked (callback returned
 * DNHT_SCAN_DEL).
 */
/* NEXT() is an lvalue for the link field, located 'ofs' bytes into the object */
#define NEXT(_h, _p) *((void **)((char *)(_p) + (_h)->ofs))
static int
dn_ht_scan_body(struct dn_ht *ht, int *bucket,
	int (*fn)(void *, void *), void *arg)
{
	int ret, found = 0, i = *bucket;
	void *tail, *pp, *p, *nextp;

	/* pp trails p by one element; start with pp at the tail so p is the head */
	pp = tail = ht->ht[i];
	do {
		p = NEXT(ht, pp);
		/* read the successor before the callback runs */
		nextp = NEXT(ht, p);
		ret = fn(p, arg);
		if ((ret & DNHT_SCAN_DEL) == 0) {
			pp = p;	 /* prepare for next loop */
		} else {
			found++;
			ht->entries--;
			/* skip current element */
			if (pp != p)
				/* pp == p implies p == tail */
				NEXT(ht, pp) = nextp;
			/* deleting the tail: predecessor becomes tail, or bucket is empty */
			if (p == tail)
				ht->ht[i] = (pp != p) ? pp : NULL;
		}
		if (ret & DNHT_SCAN_END) {
			/* Update ht->ht[i] before returning */
			/* the last element kept (pp) becomes the new tail unless empty */
			ht->ht[i] = (ht->ht[i] == NULL) ? NULL : pp;
			return found;
		}
	} while (p != tail);

	/* bucket fully visited, let the caller move on to the next one */
	(*bucket)++;
	return found;
}

/*
 * lookup and optionally create or delete element.
 * This is an optimized version of the scan so it is coded
 * inline.
 *
 * ht	the table; a NULL table always yields NULL.
 * key	the lookup key, passed to the hash/match/newh callbacks.
 * flags	DNHT_MATCH_PTR compares 'key' with the object pointer
 *	instead of calling match(); DNHT_REMOVE unlinks the element
 *	found; DNHT_INSERT adds an element when none is found.
 *	The flags are also passed unchanged to the callbacks.
 * arg	opaque argument passed to the callbacks.
 *
 * Returns the element found (after unlinking it if DNHT_REMOVE is
 * set), the element inserted, or NULL when nothing is found and
 * either DNHT_INSERT is not set or no new object is available.
 */
void *
dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg)
{
	int i, found;
	void *tail, *pp, *p; /* pp is the prev element, p is current */

	if (ht == NULL)	/* easy on an empty hash */
		return NULL;
	/* ht->buckets is used as an AND mask on the hash value */
	i = (ht->buckets == 1) ? 0 :
		(ht->hash(key, flags, arg) & ht->buckets);

	pp = tail = ht->ht[i];
	if (tail) { /* non empty, try a lookup */
		do {
			p = NEXT(ht, pp);
			found = (flags & DNHT_MATCH_PTR) ? key == (uintptr_t)p :
					ht->match(p, key, flags, arg);
			if (!found)
				continue; /* jumps to the loop condition below */
			if (flags & DNHT_REMOVE) {
				ht->entries--;
				if (p != pp) 	/* skip current element */
					NEXT(ht, pp) = NEXT(ht, p);
				/* removing the tail: predecessor is the new tail, or empty */
				if (p == tail)
					ht->ht[i] = (pp != p) ? pp : NULL;
			}
			return p;
		} while ( (pp = p) != tail);
	}
	/* not found */
	if ((flags & DNHT_INSERT) == 0)
		return NULL;
	/* without a newh callback the key itself is used as the object */
	p = ht->newh ? ht->newh(key, flags, arg) : (void *)key;
	if (p) {
		ht->entries++;
		if (tail == NULL) {
			/* first element: it is the tail and links to itself */
			ht->ht[i] = NEXT(ht, p) = p;
		} else {
			/* link right after the tail, i.e. as the new head */
			NEXT(ht, p) = NEXT(ht, tail);
			NEXT(ht, tail) = p;
		}
	}

	return p;
}

/*
 * Run callback 'fn' on every element of the table, bucket by bucket.
 * The per-bucket work (including the optional removal of elements) is
 * done by dn_ht_scan_body(). The walk stops as soon as the helper
 * reports an early exit by leaving the bucket index unchanged.
 * Returns the total number of elements removed; 0 if 'ht' or 'fn'
 * is NULL.
 */
int
dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg)
{
	int idx, cursor, removed = 0;

	if (ht == NULL || fn == NULL)
		return 0;

	idx = 0;
	while (idx <= ht->buckets) {
		if (ht->ht[idx] != NULL) {
			cursor = idx;
			removed += dn_ht_scan_body(ht, &cursor, fn, arg);
			if (cursor == idx)	/* callback asked to stop */
				break;
		}
		idx++;
	}
	return removed;
}

/*
 * Like dn_ht_scan(), but only the bucket '*bucket' is visited.
 * An out of range bucket number is replaced with 0 before scanning.
 * On return '*bucket' has been advanced by one, unless the callback
 * returned DNHT_SCAN_END: in that case it is left unchanged and
 * ht->ht[i] is moved to the last entry processed by the helper.
 * Returns the number of elements removed; 0 if 'ht' or 'fn' is NULL.
 */
int
dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *),
		 void *arg)
{
	if (ht == NULL || fn == NULL)
		return 0;

	if (*bucket < 0 || *bucket > ht->buckets)
		*bucket = 0;

	if (ht->ht[*bucket] != NULL)
		return dn_ht_scan_body(ht, bucket, fn, arg);

	/* empty bucket: nothing to do, just move to the next one */
	(*bucket)++;
	return 0;
}


================================================
FILE: sys/netinet/ipfw/dn_heap.h
================================================
/*-
 * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Binary heap and hash tables, header file
 *
 * $FreeBSD: head/sys/netinet/ipfw/dn_heap.h 204865 2010-03-08 11:27:08Z luigi $
 */

#ifndef _IP_DN_HEAP_H
#define _IP_DN_HEAP_H

/*
 * Key comparison helpers: the difference (a)-(b) is converted to a
 * signed 64-bit value and tested against zero, instead of comparing
 * the two operands directly.
 */
#define DN_KEY_LT(a,b)     ((int64_t)((a)-(b)) < 0)
#define DN_KEY_LEQ(a,b)    ((int64_t)((a)-(b)) <= 0)

/*
 * This module implements a binary heap supporting random extraction.
 *
 * A heap entry contains an uint64_t key and a pointer to object.
 * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b'
 *
 * The heap is a struct dn_heap plus a dynamically allocated
 * array of dn_heap_entry entries. 'size' represents the size of
 * the array, 'elements' count entries in use. The topmost
 * element has the smallest key.
 * The heap supports ordered insert, and extract from the top.
 * To extract an object from the middle of the heap, the object
 * must reserve an 'int32_t' to store the position of the object
 * in the heap itself, and the location of this field must be
 * passed as an argument to heap_init() -- use -1 if the feature
 * is not used.
 */
/* One slot of the heap array: a sorting key and the object it refers to. */
struct dn_heap_entry {
	uint64_t key;	/* sorting key, smallest comes first */
	void *object;	/* object pointer */
};

/* Heap descriptor: bookkeeping plus the dynamically allocated entry array. */
struct dn_heap {
	int size;	/* the size of the array */
	int elements;	/* elements in use */
	int ofs;	/* offset in the object of heap index */
	struct dn_heap_entry *p;	/* array of "size" entries */
};

/* Bit values a heap_scan() callback can return, alone or combined. */
enum {
	HEAP_SCAN_DEL = 1,	/* remove the current element */
	HEAP_SCAN_END = 2,	/* terminate the scan */
};

/*
 * heap_init() reinitializes the heap setting the size and the offset
 *	of the index for random extraction (use -1 if not used).
 *	The 'elements' counter is set to 0.
 *
 * SET_HEAP_OFS() indicates where, in the object, is stored the index
 *	for random extractions from the heap.
 *
 * heap_free() frees the memory associated to a heap.
 *
 * heap_insert() adds a key-pointer pair to the heap
 *
 * HEAP_TOP() returns a pointer to the top element of the heap,
 *	but makes no checks on its existence (XXX should we change ?)
 *
 * heap_extract() removes the entry at the top, returning the pointer.
 *	(the key should have been read before).
 *
 * heap_scan() invokes a callback on each entry of the heap.
 *	The callback can return a combination of HEAP_SCAN_DEL and
 *	HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must
 *	be removed, and HEAP_SCAN_END means to terminate the scan.
 *	heap_scan() returns the number of elements removed.
 *	Because the order is not guaranteed, we should use heap_scan()
 *	only as a last resort mechanism.
 */
/* Pointer to the first entry of the array; no check that the heap is non-empty */
#define HEAP_TOP(h)	((h)->p)
/* Record the offset, inside the object, of the field holding the heap index */
#define SET_HEAP_OFS(h, n)	do { (h)->ofs = n; } while (0)
int     heap_init(struct dn_heap *h, int size, int ofs);
int     heap_insert(struct dn_heap *h, uint64_t key1, void *p);
/* NOTE(review): declared void and taking an object pointer; confirm the
 * exact semantics of 'obj' against the implementation.
 */
void    heap_extract(struct dn_heap *h, void *obj);
void heap_free(struct dn_heap *h);
/* callback receives the object and the uintptr_t argument given here */
int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t);

/*------------------------------------------------------
 * This module implements a generic hash table with support for
 * running callbacks on the entire table. To avoid allocating
 * memory during hash table operations, objects must reserve
 * space for a link field. XXX if the heap is moderately full,
 * an SLIST suffices, and we can tolerate the cost of a hash
 * computation on each removal.
 *
 * dn_ht_init() initializes the table, setting the number of
 *	buckets, the offset of the link field, the main callbacks.
 *	Callbacks are:
 * 
 *	hash(key, flags, arg) called to return a bucket index.
 *	match(obj, key, flags, arg) called to determine if key
 *		matches the current 'obj' in the heap
 *	newh(key, flags, arg) optional, used to allocate a new
 *		object during insertions.
 *
 * dn_ht_free() frees the heap or unlink elements.
 *	DNHT_REMOVE unlink elements, 0 frees the heap.
 *	You need two calls to do both.
 *
 * dn_ht_find() is the main lookup function, which can also be
 *	used to insert or delete elements in the hash table.
 *	The final 'arg' is passed to all callbacks.
 *
 * dn_ht_scan() is used to invoke a callback on all entries of
 *	the heap, or possibly on just one bucket. The callback
 *	is invoked with a pointer to the object, and may return
 *	0 or any combination of DNHT_SCAN_DEL and DNHT_SCAN_END
 *	to request the removal of the object from the table and
 *	the end of the scan, respectively.
 *
 * dn_ht_scan_bucket() is similar to dn_ht_scan(), except that it scans
 *	only the specific bucket of the table. The bucket is a in-out
 *	parameter and return a valid bucket number if the original
 *	is invalid.
 *
 * A combination of flags can be used to modify the operation
 * of the dn_ht_find(), and of the callbacks:
 *
 * DNHT_KEY_IS_OBJ	means the key is the object pointer.
 *	It is usually of interest for the hash and match functions.
 *
 * DNHT_MATCH_PTR	during a lookup, match pointers instead
 *	of calling match(). Normally used when removing specific
 *	entries. Does not imply KEY_IS_OBJ as the latter _is_ used
 *	by the match function.
 *
 * DNHT_INSERT		insert the element if not found.
 *	Calls newh() to allocate a new object unless
 *	DNHT_KEY_IS_OBJ is set.
 *
 * DNHT_UNIQUE		only insert if object not found.
 *	XXX should it imply DNHT_INSERT ?
 *
 * DNHT_REMOVE		remove objects if we find them.
 */
struct dn_ht;	/* should be opaque */

/*
 * Create or reconfigure a table. 'buckets' is the requested number of
 * buckets, 'ofs' the offset of the link field inside each object;
 * hash, match and newh are the callbacks described above.
 */
struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs, 
        uint32_t (*hash)(uintptr_t, int, void *),
        int (*match)(void *, uintptr_t, int, void *),
        void *(*newh)(uintptr_t, int, void *));
/* flags: DNHT_REMOVE only unlinks the elements, 0 frees the table */
void dn_ht_free(struct dn_ht *, int flags);

/* arguments are: table, key, flags, argument for the callbacks */
void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *);
/* both scan functions return the number of elements removed */
int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *);
int dn_ht_scan_bucket(struct dn_ht *, int * , int (*)(void *, void *), void *);
/* number of elements in the table, 0 for a NULL table */
int dn_ht_entries(struct dn_ht *);

enum {  /* flags values.
	 * first two are returned by the scan callback to indicate
	 * to delete the matching element or to end the scan;
	 * the values are distinct bits so they can be combined.
	 */
        DNHT_SCAN_DEL	= 0x0001,
        DNHT_SCAN_END	= 0x0002,
        DNHT_KEY_IS_OBJ	= 0x0004,	/* key is the obj pointer */
        DNHT_MATCH_PTR	= 0x0008,	/* match by pointer, not match() */
        DNHT_INSERT	= 0x0010,	/* insert if not found */
        DNHT_UNIQUE	= 0x0020,	/* report error if already there */
        DNHT_REMOVE	= 0x0040,	/* remove on find or dn_ht_free */
}; 

#endif /* _IP_DN_HEAP_H */


================================================
FILE: sys/netinet/ipfw/dn_sched.h
================================================
/*
 * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * The API to write a packet scheduling algorithm for dummynet.
 *
 * $FreeBSD: head/sys/netinet/ipfw/dn_sched.h 204591 2010-03-02 17:40:48Z luigi $
 */

#ifndef _DN_SCHED_H
#define _DN_SCHED_H

/* value for dn_alg.flags: the scheduler supports multiple queues */
#define	DN_MULTIQUEUE	0x01
/*
 * Descriptor for a scheduling algorithm.
 * Contains all function pointers for a given scheduler
 * This is typically created when a module is loaded, and stored
 * in a global list of schedulers.
 *
 * NOTE: descriptors are initialized through the _SI() macro, which
 * drops the field names on _WIN32, so the order of the fields below
 * must match the order used in the initializers.
 */
struct dn_alg {
	uint32_t type;           /* the scheduler type */
	const char *name;   /* scheduler name */
	uint32_t flags;	/* DN_MULTIQUEUE if supports multiple queues */

	/*
	 * The following define the size of 3 optional data structures
	 * that may need to be allocated at runtime, and are appended
	 * to each of the base data structures: scheduler, sched.inst,
	 * and queue. We don't have a per-flowset structure.
	 */
	/*    + parameters attached to the template, e.g.
	 *	default queue sizes, weights, quantum size, and so on;
	 */
	size_t schk_datalen;

	/*    + per-instance parameters, such as timestamps,
	 *	containers for queues, etc;
	 */
	size_t si_datalen;

	size_t q_datalen;	/* per-queue parameters (e.g. S,F) */

	/*
	 * Methods implemented by the scheduler:
	 * enqueue	enqueue packet 'm' on scheduler 's', queue 'q'.
	 *	q is NULL for !MULTIQUEUE.
	 *	Return 0 on success, 1 on drop (packet consumed anyways).
	 *	Note that q should be interpreted only as a hint
	 *	on the flow that the mbuf belongs to: while a
	 *	scheduler will normally enqueue m into q, it is ok
	 *	to leave q alone and put the mbuf elsewhere.
	 *	This function is called in two cases:
	 *	 - when a new packet arrives to the scheduler;
	 *	 - when a scheduler is reconfigured. In this case the
	 *	   call is issued by the new_queue callback, with a 
	 *	   non empty queue (q) and m pointing to the first
	 *	   mbuf in the queue. For this reason, the function
	 *	   should internally check for (m != q->mq.head)
	 *	   before calling dn_enqueue().
	 *
	 * dequeue	Called when scheduler instance 's' can
	 *	dequeue a packet. Return NULL if none are available.
	 *	XXX what about non work-conserving ?
	 *
	 * config	called on 'sched X config ...', normally writes
	 *	in the area of size sch_arg
	 *
	 * destroy	called on 'sched delete', frees everything
	 *	in sch_arg (other parts are handled by more specific
	 *	functions)
	 *
	 * new_sched    called when a new instance is created, e.g.
	 *	to create the local queue for !MULTIQUEUE, set V or
	 *	copy parameters for WFQ, and so on.
	 *
	 * free_sched	called when deleting an instance, cleans
	 *	extra data in the per-instance area.
	 *
	 * new_fsk	called when a flowset is linked to a scheduler,
	 *	e.g. to validate parameters such as weights etc.
	 * free_fsk	when a flowset is unlinked from a scheduler.
	 *	(probably unnecessary)
	 *
	 * new_queue	called to set the per-queue parameters,
	 *	e.g. S and F, adjust sum of weights in the parent, etc.
	 *
	 *	The new_queue callback is normally called when
	 *	creating a new queue. In some cases (such as a
	 *	scheduler change or reconfiguration) it can be called
	 *	with a non empty queue. In that case the callback may
	 *	need to call the enqueue function, and it should
	 *	eventually call enqueue() passing as m the first
	 *	element in the queue.
	 *
	 * free_queue	actions related to a queue removal, e.g. undo
	 *	all the above. If the queue has data in it, also remove
	 *	from the scheduler. This can e.g. happen during a reconfigure.
	 *	If safe == 1 remove the queue only if the scheduler no longer
	 *	need it, otherwise delete it even if the scheduler is using
	 *	it. Usually, the flag safe is set when the drain routine is
	 *	running to delete idle queues.
	 */
	int (*enqueue)(struct dn_sch_inst *, struct dn_queue *,
		struct mbuf *);
	struct mbuf * (*dequeue)(struct dn_sch_inst *);

	int (*config)(struct dn_schk *);
	int (*destroy)(struct dn_schk*);
	int (*new_sched)(struct dn_sch_inst *);
	int (*free_sched)(struct dn_sch_inst *);
	int (*new_fsk)(struct dn_fsk *f);
	int (*free_fsk)(struct dn_fsk *f);
	int (*new_queue)(struct dn_queue *q);
	int (*free_queue)(struct dn_queue *q, int safe);

	/* run-time fields */
	int ref_count;      /* XXX number of instances in the system */
	SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */
};

/*
 * MSVC does not support designated initializers so we need this ugly
 * macro: _SI( .field = ) expands to nothing on _WIN32, turning the
 * initializer into a positional one, and to the field designator
 * everywhere else.
 */
#ifdef _WIN32
#define _SI(fld)
#else
#define _SI(fld)	fld
#endif

/*
 * Additionally, dummynet exports some functions and macros
 * to be used by schedulers:
 */

/* takes the first mbuf of a list of packets to release */
void dn_free_pkts(struct mbuf *mnext);
/* the schedulers in this tree treat a non-zero return as a drop */
int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop);
/* bound a variable between min and max */
int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg);

/*
 * Remove and return the first packet of queue 'q', or NULL if the
 * queue has no packets. The counters of the queue and, when the queue
 * is attached to one, of its scheduler instance are updated.
 * This must be the very last thing done on a dequeue as the queue
 * itself may go away.
 */
static __inline struct mbuf*
dn_dequeue(struct dn_queue *q)
{
	struct mbuf *pkt = q->mq.head;
	struct dn_flow *si_stats;

	if (pkt == NULL)
		return NULL;
	q->mq.head = pkt->m_nextpkt;

	/* per-queue accounting */
	q->ni.length--;
	q->ni.len_bytes -= pkt->m_pkthdr.len;
	if (q->ni.length == 0) {
		/* The queue just became idle: count it (for garbage
		 * collection) and record when it happened (used by RED).
		 */
		dn_cfg.idle_queue++;
		q->q_time = dn_cfg.curr_time;
	}

	if (q->_si == NULL)
		return pkt;

	/* per-instance accounting, also tracking idle instances */
	si_stats = &(q->_si->ni);
	si_stats->length--;
	si_stats->len_bytes -= pkt->m_pkthdr.len;
	if (si_stats->length == 0)
		dn_cfg.idle_si++;
	return pkt;
}

/* module event handler used by every scheduler module declared below */
int dn_sched_modevent(module_t mod, int cmd, void *arg);

/*
 * Declare a scheduler module called 'name': defines a static
 * moduledata_t (name##_mod) that uses dn_sched_modevent as handler and
 * 'dnsched' as its argument, registers it with DECLARE_MODULE and
 * records a dependency on the dummynet module (version 3).
 */
#define DECLARE_DNSCHED_MODULE(name, dnsched)			\
	static moduledata_t name##_mod = {			\
		#name, dn_sched_modevent, dnsched		\
	};							\
	DECLARE_MODULE(name, name##_mod, 			\
		SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); 	\
        MODULE_DEPEND(name, dummynet, 3, 3, 3);
#endif /* _DN_SCHED_H */


================================================
FILE: sys/netinet/ipfw/dn_sched_fifo.c
================================================
/*
 * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id: dn_sched_fifo.c 11480 2012-07-31 08:02:00Z luigi $
 */

#ifdef _KERNEL
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <net/if.h>	/* IFNAMSIZ */
#include <netinet/in.h>
#include <netinet/ip_var.h>		/* ipfw_rule_ref */
#include <netinet/ip_fw.h>	/* flow_id */
#include <netinet/ip_dummynet.h>
#include <netinet/ipfw/dn_heap.h>
#include <netinet/ipfw/ip_dn_private.h>
#include <netinet/ipfw/dn_sched.h>
#else
#include <dn_test.h>
#endif

/*
 * This file implements a FIFO scheduler for a single queue.
 * The queue is allocated as part of the scheduler instance,
 * and there is a single flowset in the template which stores
 * queue size and policy.
 * Enqueue and dequeue use the default library functions.
 */
/*
 * Enqueue 'm' on the only queue of the instance, which is stored
 * right after the instance itself. The 'q' hint is not used.
 */
static int
fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m)
{
	struct dn_queue *fifo_q = (struct dn_queue *)(si + 1);

	/* XXX if called with q != NULL and m=NULL, this is a
	 * re-enqueue from an existing scheduler, which we should
	 * handle.
	 */
	return dn_enqueue(fifo_q, m, 0);
}

/* Dequeue from the only queue of the instance, stored right after it. */
static struct mbuf *
fifo_dequeue(struct dn_sch_inst *si)
{
	struct dn_queue *fifo_q = (struct dn_queue *)(si + 1);

	return dn_dequeue(fifo_q);
}

/*
 * Set up the queue embedded in a new scheduler instance: fill its
 * object id and link it to the instance and to the flowset of the
 * scheduler. Always returns 0.
 */
static int
fifo_new_sched(struct dn_sch_inst *si)
{
	/* the queue lives in the extra space after the instance */
	struct dn_queue *fifo_q = (struct dn_queue *)(si + 1);

	set_oid(&fifo_q->ni.oid, DN_QUEUE, sizeof(*fifo_q));
	fifo_q->_si = si;
	fifo_q->fs = si->sched->fs;

	return 0;
}

/*
 * Tear down the queue embedded in the instance: release the packets
 * still queued and clear the queue structure. Always returns 0.
 */
static int
fifo_free_sched(struct dn_sch_inst *si)
{
	struct dn_queue *fifo_q = (struct dn_queue *)(si + 1);

	dn_free_pkts(fifo_q->mq.head);
	bzero(fifo_q, sizeof(*fifo_q));

	return 0;
}

/*
 * FIFO scheduler descriptor
 * contains the type of the scheduler, the name, the size of extra
 * data structures, and function pointers.
 * The entries must stay in the same order as the fields of
 * struct dn_alg, because _SI() drops the field names on _WIN32.
 */
static struct dn_alg fifo_desc = {
	_SI( .type = )  DN_SCHED_FIFO,
	_SI( .name = )  "FIFO",
	_SI( .flags = ) 0,

	/* the only extra space needed is the queue stored after the instance */
	_SI( .schk_datalen = ) 0,
	_SI( .si_datalen = )  sizeof(struct dn_queue),
	_SI( .q_datalen = )  0,

	_SI( .enqueue = )  fifo_enqueue,
	_SI( .dequeue = )  fifo_dequeue,
	_SI( .config = )  NULL,
	_SI( .destroy = )  NULL,
	_SI( .new_sched = )  fifo_new_sched,
	_SI( .free_sched = )  fifo_free_sched,
	_SI( .new_fsk = )  NULL,
	_SI( .free_fsk = )  NULL,
	_SI( .new_queue = )  NULL,
	_SI( .free_queue = )  NULL,
};

/* declare the scheduler module "dn_fifo" using the descriptor above */
DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc);


================================================
FILE: sys/netinet/ipfw/dn_sched_prio.c
================================================
/*
 * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id: dn_sched_prio.c 11480 2012-07-31 08:02:00Z luigi $
 */
#ifdef _KERNEL
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <net/if.h>	/* IFNAMSIZ */
#include <netinet/in.h>
#include <netinet/ip_var.h>		/* ipfw_rule_ref */
#include <netinet/ip_fw.h>	/* flow_id */
#include <netinet/ip_dummynet.h>
#include <netinet/ipfw/dn_heap.h>
#include <netinet/ipfw/ip_dn_private.h>
#include <netinet/ipfw/dn_sched.h>
#else
#include <dn_test.h>
#endif

#define DN_SCHED_PRIO	5 //XXX

/*
 * Non-atomic bit operations on a BITMAP_T (unsigned long) bitmap, for
 * the platforms that do not provide them.
 * The mask is built from an unsigned long constant: a plain int
 * '1 << ix' is undefined for the top bits and does not cover the
 * whole bitmap where long is wider than int.
 * test_bit() evaluates to 0 or 1.
 */
#if !defined(_KERNEL) || !defined(__linux__)
#define test_bit(ix, pData)	(((*(pData)) & (1UL << (ix))) != 0)
#define __set_bit(ix, pData)	((*(pData)) |= (1UL << (ix)))
#define __clear_bit(ix, pData)	((*(pData)) &= ~(1UL << (ix)))
#endif

#ifdef __MIPSEL__
#define __clear_bit(ix, pData)	((*(pData)) &= ~(1UL << (ix)))
#endif

/* Size of the array of queues pointers: one entry per bit of the bitmap. */
#define BITMAP_T	unsigned long
#define MAXPRIO		(sizeof(BITMAP_T) * 8)

/*
 * The scheduler instance contains an array of pointers to queues,
 * one for each priority, and a bitmap listing backlogged queues.
 * Bit 'n' of the bitmap is set when q_array[n] holds a queue.
 */
struct prio_si {
	BITMAP_T bitmap;			/* array bitmap */
	struct dn_queue *q_array[MAXPRIO];	/* Array of queues pointers */
};

/*
 * Enqueue 'm' at the priority of the flowset of 'q'.
 * When a queue is already registered for that priority the packet
 * goes there instead of into 'q'; otherwise 'q' is registered as the
 * queue for that priority.
 * Returns 1 if dn_enqueue() reports a drop, 0 otherwise.
 */
static int
prio_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
{
	struct prio_si *psi = (struct prio_si *)(_si + 1);
	int level = q->fs->fs.par[0];

	if (test_bit(level, &psi->bitmap)) {
		/* use the existing queue */
		q = psi->q_array[level];
	} else {
		/* No queue with this priority, insert */
		__set_bit(level, &psi->bitmap);
		psi->q_array[level] = q;
	}
	return dn_enqueue(q, m, 0) ? 1 : 0;
}

/*
 * Packets are dequeued only from the highest priority queue.
 * The function ffs() return the lowest bit in the bitmap that rapresent
 * the array index (-1) which contains the pointer to the highest priority
 * queue.
 * After the dequeue, if this queue become empty, it is index is removed
 * from the bitmap.
 * Scheduler is idle if the bitmap is empty
 *
 * NOTE: highest priority is 0, lowest is sched->max_prio_q
 */
static struct mbuf *
prio_dequeue(struct dn_sch_inst *_si)
{
	struct prio_si *si = (struct prio_si *)(_si + 1);
	struct mbuf *m;
	struct dn_queue *q;
	int prio;

	if (si->bitmap == 0) /* scheduler idle */
		return NULL;

	prio = ffs(si->bitmap) - 1;

	/* Take the highest priority queue in the scheduler */
	q = si->q_array[prio];
	// assert(q)

	m = dn_dequeue(q);
	if (q->mq.head == NULL) {
		/* Queue is now empty, remove from scheduler
		 * and mark it
		 */
		si->q_array[prio] = NULL;
		__clear_bit(prio, &si->bitmap);
	}
	return m;
}

/*
 * Initialize the private area of a new scheduler instance: no queue
 * registered at any priority. Always returns 0.
 */
static int
prio_new_sched(struct dn_sch_inst *_si)
{
	struct prio_si *psi = (struct prio_si *)(_si + 1);

	psi->bitmap = 0;
	bzero(psi->q_array, sizeof(psi->q_array));
	return 0;
}

/*
 * Called when a flowset is linked to the scheduler: force the
 * priority (first flowset parameter) into the range 0 .. MAXPRIO-1,
 * with 0 passed as the default value. Always returns 0.
 */
static int
prio_new_fsk(struct dn_fsk *fs)
{
	int *prio = &fs->fs.par[0];

	(void)ipdn_bound_var(prio, 0, 0, MAXPRIO - 1, "PRIO priority");
	return 0;
}

/*
 * Attach queue 'q' to the scheduler instance. An empty queue only gets
 * its subtype set. A queue that already holds packets is registered
 * for its priority or, if another queue is registered there, its
 * packets and counters are moved to that queue. This partly
 * duplicates prio_enqueue. Always returns 0.
 */
static int
prio_new_queue(struct dn_queue *q)
{
	struct prio_si *psi = (struct prio_si *)(q->_si + 1);
	int level = q->fs->fs.par[0];
	struct dn_queue *cur;

	q->ni.oid.subtype = DN_SCHED_PRIO;

	if (q->mq.head == NULL)
		return 0;

	if (test_bit(level, &psi->bitmap) == 0) {
		/* No queue with this priority, insert */
		__set_bit(level, &psi->bitmap);
		psi->q_array[level] = q;
		return 0;
	}

	cur = psi->q_array[level];
	if (cur == q)
		return 0;	/* already registered */

	/* append the packets of q to the registered queue ... */
	cur->mq.tail->m_nextpkt = q->mq.head;
	cur->mq.tail = q->mq.tail;
	q->mq.tail = q->mq.head = NULL;

	/* ... and move the counters along with them */
	cur->ni.length += q->ni.length;
	cur->ni.len_bytes += q->ni.len_bytes;
	q->ni.length = 0;
	q->ni.len_bytes = 0;

	return 0;
}

/*
 * Detach queue 'q' from the scheduler instance: if it is the queue
 * registered for its priority, clear the slot and the bitmap bit.
 * The 'safe' argument is not used. Always returns 0.
 */
static int
prio_free_queue(struct dn_queue *q, int safe)
{
	struct prio_si *psi = (struct prio_si *)(q->_si + 1);
	int level = q->fs->fs.par[0];

	if (psi->q_array[level] != q)
		return 0;

	psi->q_array[level] = NULL;
	__clear_bit(level, &psi->bitmap);
	return 0;
}


/*
 * PRIO scheduler descriptor. The entries must stay in the same order
 * as the fields of struct dn_alg, because _SI() drops the field names
 * on _WIN32.
 */
static struct dn_alg prio_desc = {
	_SI( .type = ) DN_SCHED_PRIO,
	_SI( .name = ) "PRIO",
	_SI( .flags = ) DN_MULTIQUEUE,

	/* we need extra space only in the si (struct prio_si) */
	_SI( .schk_datalen = ) 0,
	_SI( .si_datalen = ) sizeof(struct prio_si),
	_SI( .q_datalen = ) 0,

	_SI( .enqueue = ) prio_enqueue,
	_SI( .dequeue = ) prio_dequeue,

	_SI( .config = )  NULL,
	_SI( .destroy = )  NULL,
	_SI( .new_sched = ) prio_new_sched,
	_SI( .free_sched = ) NULL,

	_SI( .new_fsk = ) prio_new_fsk,
	_SI( .free_fsk = )  NULL,

	_SI( .new_queue = ) prio_new_queue,
	_SI( .free_queue = ) prio_free_queue,
};


/* declare the scheduler module "dn_prio" using the descriptor above */
DECLARE_DNSCHED_MODULE(dn_prio, &prio_desc);


================================================
FILE: sys/netinet/ipfw/dn_sched_qfq.c
================================================
/*
 * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id: dn_sched_qfq.c 11656 2012-08-07 08:39:06Z luigi $
 */

#ifdef _KERNEL
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <net/if.h>	/* IFNAMSIZ */
#include <netinet/in.h>
#include <netinet/ip_var.h>		/* ipfw_rule_ref */
#include <netinet/ip_fw.h>	/* flow_id */
#include <netinet/ip_dummynet.h>
#include <netinet/ipfw/dn_heap.h>
#include <netinet/ipfw/ip_dn_private.h>
#include <netinet/ipfw/dn_sched.h>
#else
#include <dn_test.h>
#endif

/*
 * Debug glue: NO(x) keeps its argument only when QFQ_DEBUG is defined
 * and expands to nothing otherwise, so debug-only fields and statements
 * can be written inline without #ifdef clutter.
 */
#ifdef QFQ_DEBUG
struct qfq_sched;
static void dump_sched(struct qfq_sched *q, const char *msg);
#define	NO(x)	x
#else
#define NO(x)
#endif
/* Scheduler type identifier stored in qfq_desc below. */
#define DN_SCHED_QFQ	4 // XXX Where?
/* One machine word used as a set of bits (group sets and slot masks). */
typedef	unsigned long	bitmap;

/*
 * bitmaps ops are critical. Some linux versions have __fls
 * and the bitmap ops. Some machines have ffs
 */
#if defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24))
/*
 * Find Last Set: 1-based position of the most significant bit set in n,
 * or 0 when n has no bit set.
 */
int fls(unsigned int n)
{
	int pos = 0;

	while (n != 0) {
		n >>= 1;
		pos++;
	}
	return pos;
}
#endif

#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24))
/*
 * Zero-based index of the most significant set bit of word, built on
 * top of fls() (which is one-based).
 */
static inline unsigned long __fls(unsigned long word)
{
	int one_based = fls(word);

	return one_based - 1;
}
#endif

#if !defined(_KERNEL) || !defined(__linux__)
/*
 * Single-bit helpers on a 'bitmap' word (unsigned long), used where the
 * platform does not supply test_bit/__set_bit/__clear_bit.
 *
 * The mask is built from 1UL rather than the int constant 1: with an
 * int, bit 31 yields a negative value that is sign-extended when
 * converted to a 64-bit unsigned long, setting (or clearing) all of
 * bits 32..63 together with bit 31.
 */
#ifdef QFQ_DEBUG
/* Debug variant: logs out-of-range indexes, returns non-zero if bit ix is set. */
int test_bit(int ix, bitmap *p)
{
	if (ix < 0 || ix > 31)
		D("bad index %d", ix);
	/* compare against 0 so a high bit is not lost in the int result */
	return (*p & (1UL << ix)) != 0;
}
/* Debug variant: logs out-of-range indexes, then sets bit ix. */
void __set_bit(int ix, bitmap *p)
{
	if (ix < 0 || ix > 31)
		D("bad index %d", ix);
	*p |= (1UL << ix);
}
/* Debug variant: logs out-of-range indexes, then clears bit ix. */
void __clear_bit(int ix, bitmap *p)
{
	if (ix < 0 || ix > 31)
		D("bad index %d", ix);
	*p &= ~(1UL << ix);
}
#else /* !QFQ_DEBUG */
/* XXX do we have fast version, or leave it to the compiler ? */
#define test_bit(ix, pData)	((*(pData)) & (1UL << (ix)))
#define __set_bit(ix, pData)	(*(pData)) |= (1UL << (ix))
#define __clear_bit(ix, pData)	(*(pData)) &= ~(1UL << (ix))
#endif /* !QFQ_DEBUG */
#endif /* !__linux__ */

/*
 * On little-endian MIPS always use the local __clear_bit macro.
 * Keep this definition token-identical to the one above so that builds
 * seeing both do not get a macro redefinition mismatch.
 */
#ifdef __MIPSEL__
#define __clear_bit(ix, pData)	(*(pData)) &= ~(1UL << (ix))
#endif

/*-------------------------------------------*/
/*

Virtual time computations.

S, F and V are all computed in fixed point arithmetic with
FRAC_BITS decimal bits.

   QFQ_MAX_INDEX is the maximum index allowed for a group. We need
  	one bit per index.
   QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
   The layout of the bits is as below:
  
                   [ MTU_SHIFT ][      FRAC_BITS    ]
                   [ MAX_INDEX    ][ MIN_SLOT_SHIFT ]
  				 ^.__grp->index = 0
  				 *.__grp->slot_shift
  
   where MIN_SLOT_SHIFT is derived by difference from the others.

The max group index corresponds to Lmax/w_min, where
Lmax=1<<MTU_SHIFT, w_min = 1 .
From this, and knowing how many groups (MAX_INDEX) we want,
we can derive the shift corresponding to each group.

Because we often need to compute
	F = S + len/w_i  and V = V + len/wsum
instead of storing w_i store the value
	inv_w = (1<<FRAC_BITS)/w_i
so we can do F = S + len * inv_w * wsum.
We use W_TOT in the formulas so we can easily move between
static and adaptive weight sum.

The per-scheduler-instance data contain all the data structures
for the scheduler: bitmaps and bucket lists.

 */
/*
 * Maximum number of consecutive slots occupied by backlogged classes
 * inside a group. This is approx lmax/lmin + 5.
 * XXX check because it poses constraints on MAX_INDEX
 */
#define QFQ_MAX_SLOTS	32
/*
 * Shifts used for class<->group mapping. Class weights are
 * in the range [1, QFQ_MAX_WEIGHT], we need to map each class i to the
 * group with the smallest index that can support the L_i / r_i
 * configured for the class.
 *
 * grp->index is the index of the group; and grp->slot_shift
 * is the shift for the corresponding (scaled) sigma_i.
 *
 * When computing the group index, we do (len<<FP_SHIFT)/weight,
 * then compute an FLS (which is like a log2()), and if the result
 * is below the MAX_INDEX region we use 0 (which is the same as
 * using a larger len).
 */
/* Highest group index; the scheduler keeps QFQ_MAX_INDEX + 1 groups. */
#define QFQ_MAX_INDEX		19
#define QFQ_MAX_WSHIFT		16	/* log2(max_weight) */

/* Largest weight accepted for a single class. */
#define	QFQ_MAX_WEIGHT		(1<<QFQ_MAX_WSHIFT)
/* Upper bound on the sum of weights, enforced in qfq_new_queue(). */
#define QFQ_MAX_WSUM		(2*QFQ_MAX_WEIGHT)
//#define IWSUM	(q->i_wsum)
/*
 * Fixed-point inverse of the (static) weight sum, used to advance V on
 * dequeue. FRAC_BITS is defined just below; this is fine because macros
 * are expanded at the point of use.
 */
#define IWSUM	((1<<FRAC_BITS)/QFQ_MAX_WSUM)

#define FRAC_BITS		30	/* fixed point arithmetic */
/* The value 1.0 in fixed point. */
#define ONE_FP			(1UL << FRAC_BITS)

#define QFQ_MTU_SHIFT		11	/* log2(max_len) */
/* Slot shift of group 0; group i uses this value plus i (see qfq_new_sched). */
#define QFQ_MIN_SLOT_SHIFT	(FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)

/*
 * Possible group states, also indexes for the bitmaps array in
 * struct qfq_sched. We rely on ER, IR, EB, IB being numbered 0..3:
 * bit 0 is set for the "I" states and bit 1 for the "B" states, which
 * is how qfq_calc_state() assembles the value.
 * QFQ_MAX_STATE is the number of states (size of the bitmaps array).
 */
enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };

struct qfq_group;
/*
 * additional queue info. Some of this info should come from
 * the flowset, we copy them here for faster processing.
 * This is an overlay of the struct dn_queue: _q must stay the first
 * member because the scheduler callbacks cast a struct dn_queue *
 * directly to a struct qfq_class *.
 */
struct qfq_class {
	struct dn_queue _q;
	uint64_t S, F;		/* flow timestamps (exact) */
	struct qfq_class *next; /* Link for the slot list. */

	/* group we belong to. In principle we would need the index,
	 * which is log_2(lmax/weight), but we never reference it
	 * directly, only the group.
	 * Assigned once in qfq_new_queue().
	 */
	struct qfq_group *grp;

	/* these are copied from the flowset. */
	uint32_t	inv_w;	/* ONE_FP/weight; 0 once the queue was freed */
	uint32_t 	lmax;	/* Max packet size for this flow. */
};

/* Group descriptor, see the paper for details.
 * Basically this contains the bucket lists: a circular array of
 * QFQ_MAX_SLOTS singly-linked lists of classes, plus a bitmap telling
 * which buckets are non-empty.
 */
struct qfq_group {
	uint64_t S, F;			/* group timestamps (approx). */
	unsigned int slot_shift;	/* Slot shift. */
	unsigned int index;		/* Group index. */
	unsigned int front;		/* Index of the front slot. */
	/* bit k refers to the bucket k positions after 'front' */
	bitmap full_slots;		/* non-empty slots */

	/* Array of lists of active classes. */
	struct qfq_class *slots[QFQ_MAX_SLOTS];
};

/*
 * Scheduler instance descriptor; it lives right after the generic
 * struct dn_sch_inst (callers obtain it as (struct qfq_sched *)(si + 1)).
 *
 * The fields wrapped in NO() exist only when QFQ_DEBUG is defined.
 * The packet counter is named 'queued' because that is the name used
 * by qfq_enqueue(), qfq_dequeue() and dump_sched().
 */
struct qfq_sched {
	uint64_t	V;		/* Precise virtual time. */
	uint32_t	wsum;		/* weight sum */
	NO(uint32_t	i_wsum;)	/* ONE_FP/w_sum */
	NO(uint32_t	queued;)	/* debugging: packets currently queued */
	NO(uint32_t	loops;)		/* debugging: enqueue+dequeue calls */
	bitmap bitmaps[QFQ_MAX_STATE];	/* Group bitmaps. */
	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
};

/*---- support functions ----------------------------*/

/*
 * Wraparound-safe "a is later than b": the unsigned difference is
 * reinterpreted as signed, so the comparison stays correct across a
 * wrap of the 64-bit timestamps.
 */
static inline int qfq_gt(uint64_t a, uint64_t b)
{
	int64_t delta = (int64_t)(a - b);

	return delta > 0;
}

/*
 * Truncate timestamp ts to a multiple of 2^shift, i.e. clear its
 * 'shift' least significant bits.
 */
static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift)
{
	uint64_t low_bits = (1ULL << shift) - 1;

	return ts & ~low_bits;
}

/*
 * Map a group bitmap to the group with the lowest index set in it.
 * ffs() is one-based, hence the -1. The caller must pass a non-empty
 * bitmap.
 */
static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
					unsigned long bitmap)
{
	return &q->groups[ffs(bitmap) - 1];
}

/*
 * Compute the group index for a flow from its inverse weight and its
 * maximum packet length: index = log_2(maxlen/weight), with the fixed
 * point scaling applied. Values falling below the indexed region map
 * to group 0. Called once, when the flow is created.
 */
static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen)
{
	uint64_t slot_size = (uint64_t)maxlen *inv_w;
	unsigned long size_map =
		(unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT);
	int index = 0;

	if (size_map != 0) {
		/* basically a log_2(), rounded up ... */
		index = __fls(size_map) + 1;
		/* ... except when slot_size is an exact power of two */
		if (slot_size == (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)))
			index--;
		if (index < 0)
			index = 0;
	}

	ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index);
	return index;
}
/*---- end support functions ----*/

/*-------- API calls --------------------------------*/
/*
 * new_queue callback: import weight and max packet length from the
 * flowset, derive the inverse weight, bind the class to its group and
 * account the weight in the scheduler instance.
 * Returns 0 on success, EINVAL if the weight sum would exceed
 * QFQ_MAX_WSUM.
 */
static int
qfq_new_queue(struct dn_queue *_q)
{
	struct qfq_class *cl = (struct qfq_class *)_q;
	struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
	uint32_t weight;	/* approximated weight */

	/* the flowset parameters should already have been validated */
	weight = _q->fs->fs.par[0];
	cl->lmax = _q->fs->fs.par[1];
	if (weight == 0 || weight > QFQ_MAX_WEIGHT) {
		weight = 1;
		D("rounding weight to 1");
	}

	/* go through inv_w and back so 'weight' is what inv_w represents */
	cl->inv_w = ONE_FP/weight;
	weight = ONE_FP/cl->inv_w;
	if (q->wsum + weight > QFQ_MAX_WSUM)
		return EINVAL;

	cl->grp = &q->groups[qfq_calc_index(cl->inv_w, cl->lmax)];
	q->wsum += weight;
	// XXX cl->S = q->V; ?
	// XXX compute q->i_wsum
	return 0;
}

/*
 * free_queue callback: give the class weight back to the scheduler
 * instance. inv_w is zeroed afterwards so that a second call on the
 * same queue does not subtract the weight again. 'safe' is unused.
 */
static int
qfq_free_queue(struct dn_queue *_q, int safe)
{
	struct qfq_class *cl = (struct qfq_class *)_q;
	struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);

	if (cl->inv_w == 0)
		return 0;	/* weight already released */

	q->wsum -= ONE_FP/cl->inv_w;
	cl->inv_w = 0;
	return 0;
}

/*
 * Keep only the bits of 'bitmap' at position 'from' and above
 * (mimics what an ffs_from() would look at).
 */
static inline unsigned long
mask_from(unsigned long bitmap, int from)
{
	unsigned long below = (1UL << from) - 1;

	return bitmap & ~below;
}

/*
 * The state computation relies on ER=0, IR=1, EB=2, IB=3
 * First compute eligibility comparing grp->S, q->V,
 * then check if someone is blocking us and possibly add EB
 */
static inline unsigned int
qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp)
{
	/* if S > V we are not eligible */
	unsigned int state = qfq_gt(grp->S, q->V);
	unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
	struct qfq_group *next;

	if (mask) {
		next = qfq_ffs(q, mask);
		if (qfq_gt(grp->F, next->F))
			state |= EB;
	}

	return state;
}

/*
 * Move the groups selected by 'mask' from set 'src' to set 'dst'.
 * The destination is updated before the source is cleared; callers
 * are expected to pass src != dst.
 */
static inline void
qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst)
{
	unsigned long moving = q->bitmaps[src] & mask;

	q->bitmaps[dst] |= moving;
	q->bitmaps[src] &= ~mask;
}

static inline void
qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish)
{
	unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
	struct qfq_group *next;

	if (mask) {
		next = qfq_ffs(q, mask);
		if (!qfq_gt(next->F, old_finish))
			return;
	}

	mask = (1UL << index) - 1;
	qfq_move_groups(q, mask, EB, ER);
	qfq_move_groups(q, mask, IB, IR);
}

/*
 * After V advanced from old_V, promote to eligible (IR -> ER,
 * IB -> EB) every group whose index is not above the highest bit that
 * differs between the old and new slot number of V.
 *
 * A possible alternative formulation:
 *
	old_V ^= q->V;
	old_V >>= QFQ_MIN_SLOT_SHIFT;
	if (old_V) {
		...
	}
 *
 */
static inline void
qfq_make_eligible(struct qfq_sched *q, uint64_t old_V)
{
	unsigned long vslot = q->V >> QFQ_MIN_SLOT_SHIFT;
	unsigned long old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;
	unsigned long mask;

	if (vslot == old_vslot)
		return;		/* V did not cross a slot boundary */

	mask = (2UL << (__fls(vslot ^ old_vslot))) - 1;
	qfq_move_groups(q, mask, IR, ER);
	qfq_move_groups(q, mask, IB, EB);
}

/*
 * Push class cl on the bucket that corresponds to roundedS, which is
 * always cl->S rounded on grp->slot_shift bits.
 * XXX we should make sure that the slot offset stays below 32; this
 * is expected to be guaranteed by the input values.
 */
static inline void
qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS)
{
	/* distance, in slots, from the start of the group */
	uint64_t slot = (roundedS - grp->S) >> grp->slot_shift;
	struct qfq_class **bucket =
		&grp->slots[(grp->front + slot) % QFQ_MAX_SLOTS];

	cl->next = *bucket;
	*bucket = cl;
	__set_bit(slot, &grp->full_slots);
}

/*
 * Unlink the first class of the front bucket; when the bucket becomes
 * empty, clear bit 0 of full_slots (the front bucket's bit).
 */
static inline void
qfq_front_slot_remove(struct qfq_group *grp)
{
	unsigned int front = grp->front;
	struct qfq_class *rest = grp->slots[front]->next;

	grp->slots[front] = rest;
	if (!rest)
		__clear_bit(0, &grp->full_slots);
}

/*
 * Return the first class of the first non-empty bucket of a group, or
 * NULL if the group has no backlogged class. As a side effect the
 * bucket list is realigned so that this bucket becomes the front one
 * (bit 0 of full_slots).
 */
static inline struct qfq_class *
qfq_slot_scan(struct qfq_group *grp)
{
	int skip;

	ND("grp %d full %x", grp->index, grp->full_slots);
	if (grp->full_slots == 0)
		return NULL;

	skip = ffs(grp->full_slots) - 1; /* empty buckets before the first full one */
	if (skip > 0) {
		grp->full_slots >>= skip;
		grp->front = (grp->front + skip) % QFQ_MAX_SLOTS;
	}

	return grp->slots[grp->front];
}

/*
 * Realign the bucket list when the start time of a group decreases to
 * roundedS: the front index moves back (modulo QFQ_MAX_SLOTS) so no
 * object has to be moved, and the occupancy mask is shifted
 * accordingly because ffs() is used to find the first full bucket.
 * This only covers decreases of the start time.
 * The shift amount should be less than 32. 'q' is unused.
 */
static inline void
qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS)
{
	unsigned int back = (grp->S - roundedS) >> grp->slot_shift;

	grp->front = (grp->front - back) % QFQ_MAX_SLOTS;
	grp->full_slots <<= back;
}


/*
 * Re-evaluate eligibility after V moved from old_V. If there are
 * ineligible groups and nobody is in ER, V is first pushed forward to
 * the start time of the lowest-index ineligible group (when that is
 * ahead of V); then groups are promoted via qfq_make_eligible().
 */
static inline void
qfq_update_eligible(struct qfq_sched *q, uint64_t old_V)
{
	bitmap ineligible = q->bitmaps[IR] | q->bitmaps[IB];

	if (!ineligible)
		return;

	if (!q->bitmaps[ER]) {
		struct qfq_group *first = qfq_ffs(q, ineligible);

		if (qfq_gt(first->S, q->V))
			q->V = first->S;
	}
	qfq_make_eligible(q, old_V);
}

/*
 * Update the timestamps of a class after one of its packets has been
 * served, and move it to the proper bucket.
 * Returns 0 when the class stays in the front bucket (group unchanged),
 * 1 when the caller must also update the group.
 */
static inline int
qfq_update_class(struct qfq_sched *q, struct qfq_group *grp,
	    struct qfq_class *cl)
{
	uint64_t roundedS;
	unsigned int len;

	cl->S = cl->F;

	if (cl->_q.mq.head != NULL) {
		/* still backlogged: timestamp the new head packet */
		len = cl->_q.mq.head->m_pkthdr.len;
		cl->F = cl->S + (uint64_t)len * cl->inv_w;
		roundedS = qfq_round_down(cl->S, grp->slot_shift);
		if (roundedS == grp->S)
			return 0;

		qfq_front_slot_remove(grp);
		qfq_slot_insert(grp, cl, roundedS);
		return 1;
	}

	/* queue drained: the class leaves the bucket list */
	qfq_front_slot_remove(grp);
	return 1;
}

/*
 * dequeue callback: serve one packet from the first class in the front
 * bucket of the lowest-index group in ER.
 * Returns the packet, or NULL when no group is in ER (or, unexpectedly,
 * when the selected class has no packet).
 * After extracting the packet it advances V, updates the class and, if
 * needed, the group timestamps and state, then re-evaluates which
 * groups are blocked/eligible.
 */
static struct mbuf *
qfq_dequeue(struct dn_sch_inst *si)
{
	/* private scheduler data follows the generic instance */
	struct qfq_sched *q = (struct qfq_sched *)(si + 1);
	struct qfq_group *grp;
	struct qfq_class *cl;
	struct mbuf *m;
	uint64_t old_V;

	NO(q->loops++;)
	if (!q->bitmaps[ER]) {
		NO(if (q->queued)
			dump_sched(q, "start dequeue");)
		return NULL;
	}

	grp = qfq_ffs(q, q->bitmaps[ER]);

	cl = grp->slots[grp->front];
	/* extract from the first bucket in the bucket list */
	m = dn_dequeue(&cl->_q);

	if (!m) {
		D("BUG/* non-workconserving leaf */");
		return NULL;
	}
	NO(q->queued--;)
	/* remember the old V: qfq_update_eligible() needs both values */
	old_V = q->V;
	q->V += (uint64_t)m->m_pkthdr.len * IWSUM;
	ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V);

	if (qfq_update_class(q, grp, cl)) {
		/* the front bucket changed; old_F is needed for unblocking */
		uint64_t old_F = grp->F;
		cl = qfq_slot_scan(grp);
		if (!cl) { /* group gone, remove from ER */
			__clear_bit(grp->index, &q->bitmaps[ER]);
			// grp->S = grp->F + 1; // XXX debugging only
		} else {
			uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift);
			unsigned int s;

			/* same start slot: group timestamps and state are unchanged */
			if (grp->S == roundedS)
				goto skip_unblock;
			grp->S = roundedS;
			grp->F = roundedS + (2ULL << grp->slot_shift);
			/* remove from ER and put in the new set */
			__clear_bit(grp->index, &q->bitmaps[ER]);
			s = qfq_calc_state(q, grp);
			__set_bit(grp->index, &q->bitmaps[s]);
		}
		/* we need to unblock even if the group has gone away */
		qfq_unblock_groups(q, grp->index, old_F);
	}

skip_unblock:
	qfq_update_eligible(q, old_V);
	NO(if (!q->bitmaps[ER] && q->queued)
		dump_sched(q, "end dequeue");)

	return m;
}

/*
 * Assign a reasonable start time for a new flow k in group i.
 * Admissible values for \hat(F) are multiples of \sigma_i
 * no greater than V+\sigma_i . Larger values mean that
 * we had a wraparound so we consider the timestamp to be stale.
 *
 * If F is not stale and F >= V then we set S = F.
 * Otherwise we should assign S = V, but this may violate
 * the ordering in ER. So, if we have groups in ER, set S to
 * the F_j of the first group j which would be blocking us.
 * We are guaranteed not to move S backward because
 * otherwise our group i would still be blocked.
 *
 * Only cl->S is written; cl->F is read as the finish time left over
 * from the previous activity of the class.
 */
static inline void
qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
{
	unsigned long mask;
	uint64_t limit, roundedF;
	int slot_shift = cl->grp->slot_shift;

	roundedF = qfq_round_down(cl->F, slot_shift);
	/*
	 * slot_shift ranges from QFQ_MIN_SLOT_SHIFT up to
	 * QFQ_MTU_SHIFT + FRAC_BITS, i.e. it can exceed 31: the shift
	 * must be done on a 64-bit operand or it is undefined where
	 * unsigned long is 32 bits wide.
	 */
	limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);

	if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) {
		/* timestamp was stale */
		mask = mask_from(q->bitmaps[ER], cl->grp->index);
		if (mask) {
			struct qfq_group *next = qfq_ffs(q, mask);
			if (qfq_gt(roundedF, next->F)) {
				cl->S = next->F;
				return;
			}
		}
		cl->S = q->V;
	} else { /* timestamp is not stale */
		cl->S = cl->F;
	}
}

/*
 * enqueue callback: append packet m to queue _q and, if the queue was
 * idle, (re)activate its class inside the scheduler.
 * Returns 1 if the packet was dropped by dn_enqueue(), 0 otherwise.
 * If m is already the head of the queue on entry the append is skipped
 * and only the activation is performed.
 */
static int
qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m)
{
	struct qfq_sched *q = (struct qfq_sched *)(si + 1);
	struct qfq_group *grp;
	struct qfq_class *cl = (struct qfq_class *)_q;
	uint64_t roundedS;
	int s;

	NO(q->loops++;)
	DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len,
		_q, cl->inv_w, cl->grp->index);
	/* XXX verify that the packet obeys the parameters */
	if (m != _q->mq.head) {
		if (dn_enqueue(_q, m, 0)) /* packet was dropped */
			return 1;
		NO(q->queued++;)
		/* queue was already backlogged: the class is already scheduled */
		if (m != _q->mq.head)
			return 0;
	}
	/* If reach this point, queue q was idle */
	grp = cl->grp;
	qfq_update_start(q, cl); /* adjust start time */
	/* compute new finish time and rounded start. */
	cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w;
	roundedS = qfq_round_down(cl->S, grp->slot_shift);

	/*
	 * insert cl in the correct bucket.
	 * If cl->S >= grp->S we don't need to adjust the
	 * bucket list and simply go to the insertion phase.
	 * Otherwise grp->S is decreasing, we must make room
	 * in the bucket list, and also recompute the group state.
	 * Finally, if there were no flows in this group and nobody
	 * was in ER make sure to adjust V.
	 */
	if (grp->full_slots) {
		if (!qfq_gt(grp->S, cl->S))
			goto skip_update;
		/* create a slot for this cl->S */
		qfq_slot_rotate(q, grp, roundedS);
		/* group was surely ineligible, remove */
		__clear_bit(grp->index, &q->bitmaps[IR]);
		__clear_bit(grp->index, &q->bitmaps[IB]);
	} else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V))
		q->V = roundedS;

	/* new group timestamps, then file the group under its new state */
	grp->S = roundedS;
	grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i
	s = qfq_calc_state(q, grp);
	__set_bit(grp->index, &q->bitmaps[s]);
	ND("new state %d 0x%x", s, q->bitmaps[s]);
	ND("S %llx F %llx V %llx", cl->S, cl->F, q->V);
skip_update:
	qfq_slot_insert(grp, cl, roundedS);

	return 0;
}


#if 0
/*
 * NOTE: everything up to the matching #endif is compiled out.
 * It sketches the forced removal of a class from the scheduler and is
 * not in sync with the live code (e.g. it reads cl->index, a field
 * that struct qfq_class does not have).
 */

/*
 * Unlink cl from the bucket it sits in; pprev is the address of the
 * pointer that currently references cl in that bucket's list.
 */
static inline void
qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
	struct qfq_class *cl, struct qfq_class **pprev)
{
	unsigned int i, offset;
	uint64_t roundedS;

	roundedS = qfq_round_down(cl->S, grp->slot_shift);
	offset = (roundedS - grp->S) >> grp->slot_shift;
	i = (grp->front + offset) % QFQ_MAX_SLOTS;

#ifdef notyet
	if (!pprev) {
		pprev = &grp->slots[i];
		while (*pprev && *pprev != cl)
			pprev = &(*pprev)->next;
	}
#endif

	*pprev = cl->next;
	if (!grp->slots[i])
		__clear_bit(offset, &grp->full_slots);
}

/*
 * called to forcibly destroy a queue.
 * If the queue is not in the front bucket, or if it has
 * other queues in the front bucket, we can simply remove
 * the queue with no other side effects.
 * Otherwise we must propagate the event up.
 * XXX description to be completed.
 */
static void
qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl,
				 struct qfq_class **pprev)
{
	struct qfq_group *grp = &q->groups[cl->index];
	unsigned long mask;
	uint64_t roundedS;
	int s;

	cl->F = cl->S;	// not needed if the class goes away.
	qfq_slot_remove(q, grp, cl, pprev);

	if (!grp->full_slots) {
		/* nothing left in the group, remove from all sets.
		 * Do ER last because if we were blocking other groups
		 * we must unblock them.
		 */
		__clear_bit(grp->index, &q->bitmaps[IR]);
		__clear_bit(grp->index, &q->bitmaps[EB]);
		__clear_bit(grp->index, &q->bitmaps[IB]);

		if (test_bit(grp->index, &q->bitmaps[ER]) &&
		    !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
			mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
			if (mask)
				mask = ~((1UL << __fls(mask)) - 1);
			else
				mask = ~0UL;
			qfq_move_groups(q, mask, EB, ER);
			qfq_move_groups(q, mask, IB, IR);
		}
		__clear_bit(grp->index, &q->bitmaps[ER]);
	} else if (!grp->slots[grp->front]) {
		/* front bucket emptied: realign and recompute the group state */
		cl = qfq_slot_scan(grp);
		roundedS = qfq_round_down(cl->S, grp->slot_shift);
		if (grp->S != roundedS) {
			__clear_bit(grp->index, &q->bitmaps[ER]);
			__clear_bit(grp->index, &q->bitmaps[IR]);
			__clear_bit(grp->index, &q->bitmaps[EB]);
			__clear_bit(grp->index, &q->bitmaps[IB]);
			grp->S = roundedS;
			grp->F = roundedS + (2ULL << grp->slot_shift);
			s = qfq_calc_state(q, grp);
			__set_bit(grp->index, &q->bitmaps[s]);
		}
	}
	qfq_update_eligible(q, q->V);
}
#endif

/*
 * new_fsk callback: sanitize the two QFQ flowset parameters.
 * par[0] is the weight and par[1] the maximum packet length.
 * NOTE(review): ipdn_bound_var() is defined elsewhere; from the call
 * sites its arguments look like (variable, default, min, max, label) —
 * confirm against its definition.
 * Always returns 0.
 */
static int
qfq_new_fsk(struct dn_fsk *f)
{
	ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight");
	ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen");
	ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]);
	return 0;
}

/*
 * initialize a new scheduler instance
 */
static int
qfq_new_sched(struct dn_sch_inst *si)
{
	struct qfq_sched *q = (struct qfq_sched *)(si + 1);
	struct qfq_group *grp;
	int i;

	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
		grp = &q->groups[i];
		grp->index = i;
		grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS -
					(QFQ_MAX_INDEX - i);
	}
	return 0;
}

/*
 * QFQ scheduler descriptor: type, name, sizes of the private areas and
 * the callbacks implemented above. Callbacks left NULL are not
 * provided by this scheduler.
 * The _SI() wrapper around each field name is defined elsewhere.
 */
static struct dn_alg qfq_desc = {
	_SI( .type = ) DN_SCHED_QFQ,
	_SI( .name = ) "QFQ",
	_SI( .flags = ) DN_MULTIQUEUE,

	/* no per-scheduler private data; per-instance data is struct qfq_sched */
	_SI( .schk_datalen = ) 0,
	_SI( .si_datalen = ) sizeof(struct qfq_sched),
	/* only the part of struct qfq_class that extends struct dn_queue */
	_SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue),

	_SI( .enqueue = ) qfq_enqueue,
	_SI( .dequeue = ) qfq_dequeue,

	_SI( .config = )  NULL,
	_SI( .destroy = )  NULL,
	_SI( .new_sched = ) qfq_new_sched,
	_SI( .free_sched = )  NULL,
	_SI( .new_fsk = ) qfq_new_fsk,
	_SI( .free_fsk = )  NULL,
	_SI( .new_queue = ) qfq_new_queue,
	_SI( .free_queue = ) qfq_free_queue,
};

DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc);

#ifdef QFQ_DEBUG
/*
 * Debug helper: print the non-empty buckets, the occupancy mask and
 * the timestamps of every group whose bit is set in 'mask'.
 *
 * Arguments are matched to their conversion specifiers: full_slots is
 * an unsigned long (%lx), the 64-bit timestamps are cast to unsigned
 * long long for %llx, and pointers are passed as void * for %p.
 */
static void
dump_groups(struct qfq_sched *q, uint32_t mask)
{
	int i, j;

	for (i = 0; i < QFQ_MAX_INDEX + 1; i++) {
		struct qfq_group *g = &q->groups[i];

		if (0 == (mask & (1U << i)))
			continue;
		for (j = 0; j < QFQ_MAX_SLOTS; j++) {
			if (g->slots[j])
				D("    bucket %d %p", j, (void *)g->slots[j]);
		}
		D("full_slots 0x%lx", g->full_slots);
		D("        %2d S 0x%20llx F 0x%llx %c", i,
			(unsigned long long)g->S, (unsigned long long)g->F,
			mask & (1U << i) ? '1' : '0');
	}
}

/*
 * Debug helper: print the four group bitmaps of a scheduler instance,
 * tagged with 'msg', followed by the details of every group.
 * The bitmaps are unsigned long, hence the %lx conversions.
 */
static void
dump_sched(struct qfq_sched *q, const char *msg)
{
	D("--- in %s: ---", msg);
	ND("loops %d queued %d V 0x%llx", q->loops, q->queued, q->V);
	D("    ER 0x%08lx", q->bitmaps[ER]);
	D("    EB 0x%08lx", q->bitmaps[EB]);
	D("    IR 0x%08lx", q->bitmaps[IR]);
	D("    IB 0x%08lx", q->bitmaps[IB]);
	dump_groups(q, 0xffffffff);
}
#endif /* QFQ_DEBUG */


================================================
FILE: sys/netinet/ipfw/dn_sched_rr.c
================================================
/*
 * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id: dn_sched_rr.c 11480 2012-07-31 08:02:00Z luigi $
 */

#ifdef _KERNEL
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <net/if.h>	/* IFNAMSIZ */
#include <netinet/in.h>
#include <netinet/ip_var.h>		/* ipfw_rule_ref */
#include <netinet/ip_fw.h>	/* flow_id */
#include <netinet/ip_dummynet.h>
#include <netinet/ipfw/dn_heap.h>
#include <netinet/ipfw/ip_dn_private.h>
#include <netinet/ipfw/dn_sched.h>
#else
#include <dn_test.h>
#endif

#define DN_SCHED_RR	3 // XXX Where?

/*
 * Per-queue state of the round robin scheduler. The generic queue must
 * stay the first member: callbacks cast struct dn_queue * to
 * struct rr_queue *.
 */
struct rr_queue {
	struct dn_queue q;		/* Standard queue */
	int status;			/* 1: queue is in the list */
	int credit;			/* Number of bytes to transmit */
	int quantum;			/* quantum * C */
	struct rr_queue *qnext;		/* next queue in the circular list */
};

/* struct rr_schk contains global config parameters
 * and is right after dn_schk.
 * It is filled by rr_config() and read by rr_new_fsk() to bound the
 * per-flowset quantum.
 */
struct rr_schk {
	int min_q;		/* Min quantum */
	int max_q;		/* Max quantum */
	int q_bytes;		/* Bytes per quantum */
};

/* per-instance round robin list, right after dn_sch_inst.
 * The list is circular (tail->qnext == head); both pointers are NULL
 * when it is empty. 'head' is the queue currently being served.
 */
struct rr_si {
	struct rr_queue *head, *tail;	/* Pointer to current queue */
};

/*
 * Add queue q at the tail of the circular round robin list, giving it
 * a full quantum of credit and marking it as listed.
 */
static inline void
rr_append(struct rr_queue *q, struct rr_si *si)
{
	q->credit = q->quantum;	/* start with a fresh quantum */
	q->status = 1;		/* now in the rr list */

	if (si->head != NULL)
		si->tail->qnext = q;	/* link after the current tail */
	else
		si->head = q;		/* first element of the list */
	si->tail = q;
	q->qnext = si->head;	/* close the ring */
}

/* Remove the head queue from circular list. */
static inline void
rr_remove_head(struct rr_si *si)
{
	if (si->head == NULL)
		return; /* empty queue */
	si->head->status = 0;

	if (si->head == si->tail) {
		si->head = si->tail = NULL;
		return;
	}

	si->head = si->head->qnext;
	si->tail->qnext = si->head;
}

/*
 * Remove queue q from the circular list of si, wherever it is.
 * Queues not marked as listed (status != 1) are left alone.
 * XXX see if it can be merged with remove_queue()
 *
 * The list is a ring, so the walk cannot rely on reaching a NULL
 * pointer: it stops after one full lap. This way a queue that is
 * flagged as listed but is not actually linked in this instance does
 * not make the loop spin forever.
 */
static inline void
remove_queue_q(struct rr_queue *q, struct rr_si *si)
{
	struct rr_queue *prev;

	if (q->status != 1)
		return;
	if (q == si->head) {
		rr_remove_head(si);
		return;
	}

	for (prev = si->head; prev != NULL; prev = prev->qnext) {
		if (prev->qnext == q) {
			/* unlink q, fixing the tail if q was the last one */
			prev->qnext = q->qnext;
			if (q == si->tail)
				si->tail = prev;
			q->status = 0;
			break;
		}
		if (prev->qnext == si->head)
			break;	/* back at the start: q is not in this list */
	}
}


/*
 * Rotate the ring by one position: the current head becomes the tail
 * and the next queue becomes the head. No-op on an empty list.
 */
static inline void
next_pointer(struct rr_si *si)
{
	if (si->head != NULL) {
		si->head = si->head->qnext;
		si->tail = si->tail->qnext;
	}
}

/*
 * enqueue callback: append m to queue q and, if the queue just became
 * backlogged, insert it in the round robin list.
 * Returns 1 if dn_enqueue() dropped the packet, 0 otherwise.
 */
static int
rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
{
	struct rr_queue *rrq = (struct rr_queue *)q;

	if (m != q->mq.head) {
		if (dn_enqueue(q, m, 0)) /* packet was dropped */
			return 1;
		if (m != q->mq.head)
			return 0;	/* queue was already backlogged */
	}

	/* Here the queue was idle: list it unless it is already there */
	if (rrq->status != 1)
		rr_append(rrq, (struct rr_si *)(_si + 1));

	return 0;
}

/*
 * dequeue callback: serve the head queue while its credit covers the
 * packet at its head; otherwise refill its credit by one quantum and
 * move on to the next queue. Empty queues found at the head are
 * dropped from the list.
 * Returns the extracted packet, or NULL when the list is empty.
 */
static struct mbuf *
rr_dequeue(struct dn_sch_inst *_si)
{
	/* scheduler instance private data */
	struct rr_si *si = (struct rr_si *)(_si + 1);
	struct rr_queue *rrq;

	for (rrq = si->head; rrq != NULL; rrq = si->head) {
		struct mbuf *m = rrq->q.mq.head;
		uint64_t len;

		if (m == NULL) {
			/* nothing to send here, unlist the queue */
			rr_remove_head(si);
			continue;
		}

		len = m->m_pkthdr.len;
		if (len <= rrq->credit) {
			rrq->credit -= len;
			return dn_dequeue(&rrq->q);
		}

		/* packet does not fit: add a quantum, try the next queue */
		rrq->credit += rrq->quantum;
		next_pointer(si);
	}

	/* no packet to dequeue */
	return NULL;
}

/*
 * config callback: initialize the scheduler-wide quantum limits kept
 * in the struct rr_schk that follows the dn_schk.
 * Quantums are kept in 64..2048 bytes, default 1500. Returns 0.
 */
static int
rr_config(struct dn_schk *_schk)
{
	struct rr_schk *schk = (struct rr_schk *)(_schk + 1);

	ND("called");

	schk->q_bytes = 1500;	/* default quantum */
	schk->max_q = 2048;
	schk->min_q = 64;

	return 0;
}

/*
 * new_sched callback: start a scheduler instance with an empty round
 * robin list. Returns 0.
 */
static int
rr_new_sched(struct dn_sch_inst *_si)
{
	struct rr_si *si = (struct rr_si *)(_si + 1);

	ND("called");
	si->tail = NULL;
	si->head = NULL;

	return 0;
}

/*
 * free_sched callback: the instance owns no resources of its own, so
 * there is nothing to release. Returns 0.
 */
static int
rr_free_sched(struct dn_sch_inst *_si)
{
	ND("called");
	/* Nothing to do? */
	return 0;
}

/*
 * new_fsk callback: sanitize the flowset parameters used by RR, using
 * the limits stored in the scheduler's struct rr_schk.
 * NOTE(review): ipdn_bound_var() is defined elsewhere; from the call
 * sites its arguments look like (variable, default, min, max, label) —
 * confirm against its definition.
 * Returns 0.
 */
static int
rr_new_fsk(struct dn_fsk *fs)
{
	/* scheduler-wide limits live right after the dn_schk */
	struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1);
	/* par[0] is the weight, par[1] is the quantum step */
	ipdn_bound_var(&fs->fs.par[0], 1,
		1, 65536, "RR weight");
	ipdn_bound_var(&fs->fs.par[1], schk->q_bytes,
		schk->min_q, schk->max_q, "RR quantum");
	return 0;
}

/*
 * new_queue callback: initialize the RR state of a queue. The quantum
 * is weight (par[0]) times quantum step (par[1]). A queue that already
 * holds packets is put in the round robin list right away.
 * Returns 0.
 */
static int
rr_new_queue(struct dn_queue *_q)
{
	struct rr_queue *q = (struct rr_queue *)_q;

	_q->ni.oid.subtype = DN_SCHED_RR;

	q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1];
	ND("called, q->quantum %d", q->quantum);
	q->credit = q->quantum;
	q->status = 0;

	if (_q->mq.head == NULL)
		return 0;	/* empty queue: listed on first enqueue */

	/* Queue NOT empty, insert in the queue list */
	rr_append(q, (struct rr_si *)(_q->_si + 1));
	return 0;
}

/*
 * free_queue callback. With 'safe' set nothing is modified and the
 * queue status is returned (0 means not listed, so it can be deleted).
 * Otherwise a listed queue is unlinked from the round robin list and 0
 * is returned.
 */
static int
rr_free_queue(struct dn_queue *_q, int safe)
{
	struct rr_queue *q = (struct rr_queue *)_q;

	ND("called");
	if (safe) 	/* Delete only if status == 0 */
		return q->status;

	if (q->status == 1)
		remove_queue_q(q, (struct rr_si *)(_q->_si + 1));
	return 0;
}

/*
 * RR scheduler descriptor
 * contains the type of the scheduler, the name, the size of the
 * structures and function pointers.
 *
 * schk_datalen is the size of struct rr_schk: rr_config() writes and
 * rr_new_fsk() reads that structure in the area right after the
 * dn_schk, so the area must be declared here just like si_datalen
 * declares struct rr_si. With a length of 0 those accesses would fall
 * outside the space requested for the scheduler.
 * NOTE(review): this assumes the dummynet core sizes the area after
 * struct dn_schk from schk_datalen — confirm in the allocator.
 */
static struct dn_alg rr_desc = {
	_SI( .type = ) DN_SCHED_RR,
	_SI( .name = ) "RR",
	_SI( .flags = ) DN_MULTIQUEUE,

	_SI( .schk_datalen = ) sizeof(struct rr_schk),
	_SI( .si_datalen = ) sizeof(struct rr_si),
	/* only the part of struct rr_queue that extends struct dn_queue */
	_SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue),

	_SI( .enqueue = ) rr_enqueue,
	_SI( .dequeue = ) rr_dequeue,

	_SI( .config = ) rr_config,
	_SI( .destroy = ) NULL,
	_SI( .new_sched = ) rr_new_sched,
	_SI( .free_sched = ) rr_free_sched,
	_SI( .new_fsk = ) rr_new_fsk,
	_SI( .free_fsk = ) NULL,
	_SI( .new_queue = ) rr_new_queue,
	_SI( .free_queue = ) rr_free_queue,
};


DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc);


================================================
FILE: sys/netinet/ipfw/dn_sched_wf2q.c
================================================
/*
 * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
 * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id: dn_sched_wf2q.c 11480 2012-07-31 08:02:00Z luigi $
 */

#ifdef _KERNEL
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <net/if.h>	/* IFNAMSIZ */
#include <netinet/in.h>
#include <netinet/ip_var.h>		/* ipfw_rule_ref */
#include <netinet/ip_fw.h>	/* flow_id */
#include <netinet/ip_dummynet.h>
#include <netinet/ipfw/dn_heap.h>
#include <netinet/ipfw/ip_dn_private.h>
#include <netinet/ipfw/dn_sched.h>
#else
#include <dn_test.h>
#endif

/*
 * Wraparound-aware maximum of two 64-bit timestamps: y is chosen when
 * the signed difference y - x is positive, x otherwise.
 * The whole expansion is parenthesised so the macro can be used inside
 * larger expressions (e.g. MAX64(a, b) + 1) without the surrounding
 * operators being absorbed into the conditional.
 * Note that both arguments are evaluated more than once.
 */
#ifndef MAX64
#define MAX64(x,y)  ((( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x))
#endif

/*
 * timestamps are computed on 64 bit using fixed point arithmetic.
 * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len
 * and sum of weights, respectively. FRAC_BITS is the number of
 * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large
 * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w
 * using an unsigned 32-bit division, and to avoid wraparounds we need
 * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64
 * As an example
 * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19
 */
/*
 * Both macros are defined together, and only when FRAC_BITS has not
 * already been defined elsewhere.
 */
#ifndef FRAC_BITS
#define FRAC_BITS    28 /* shift for fixed point arithmetic */
#define	ONE_FP	(1UL << FRAC_BITS)	/* the value 1.0 in fixed point */
#endif

/*
 * Private information for the scheduler instance:
 * sch_heap (key is Finish time) returns the next queue to serve
 * ne_heap (key is Start time) stores not-eligible queues
 * idle_heap (key=start/finish time) stores idle flows. It must
 *	support extract-from-middle.
 * A flow is only in 1 of the three heaps.
 * XXX todo: use a more efficient data structure, e.g. a tree sorted
 * by F with min_subtree(S) in each node
 */
struct wf2qp_si {
	/* backlogged, eligible queues; top is the smallest Finish time */
	struct dn_heap	sch_heap;
	/* backlogged, not eligible queues; top is the smallest Start time */
	struct dn_heap	ne_heap;
	/* idle queues, keyed by Start=Finish time; needs random extract */
	struct dn_heap	idle_heap;
	uint64_t	V;		/* virtual time */
	uint32_t	inv_wsum;	/* inverse of sum of weights */
	uint32_t	wsum;		/* sum of weights */
};

struct wf2qp_queue {
	struct dn_queue	_q;		/* generic queue, must come first */
	uint64_t	S, F;		/* start time, finish time */
	uint32_t	inv_w;		/* ONE_FP / weight */
	int32_t		heap_pos;	/* position (index) of struct in heap */
};

/*
 * This file implements a WF2Q+ scheduler as it has been in dummynet
 * since 2000.
 * The scheduler supports per-flow queues and has O(log N) complexity.
 *
 * WF2Q+ needs to drain entries from the idle heap so that we
 * can keep the sum of weights up to date. We can do it whenever
 * we get a chance, or periodically, or following some other
 * strategy. The function idle_check() drains at most N elements
 * from the idle heap.
 */
/*
 * Remove up to n entries from the top of the idle heap.
 * An entry is removed only while its key is behind the virtual time V,
 * unless 'force' is set, in which case entries are removed regardless
 * of their key. Every removed queue gets its timestamps invalidated
 * (S = F + 1) and its weight subtracted from the sum of weights.
 */
static void
idle_check(struct wf2qp_si *si, int n, int force)
{
	struct dn_heap *idle = &si->idle_heap;

	for (; n > 0; n--) {
		struct dn_queue *q;
		struct wf2qp_queue *wq;

		if (idle->elements <= 0)
			break;
		if (!force && !DN_KEY_LT(HEAP_TOP(idle)->key, si->V))
			break;

		q = HEAP_TOP(idle)->object;
		wq = (struct wf2qp_queue *)q;
		heap_extract(idle, NULL);

		/*
		 * XXX to let the flowset delete the queue we should
		 * mark it as 'unused' by the scheduler.
		 */
		wq->S = wq->F + 1;	/* F < S marks the timestamps invalid */

		/* the queue no longer contributes to the sum of weights */
		si->wsum -= q->fs->fs.par[0];
		if (si->wsum > 0)
			si->inv_wsum = ONE_FP / si->wsum;
	}
}

/*
 * Enqueue packet m on queue q of the scheduler instance _si.
 *
 * The scheduler private data (struct wf2qp_si) is located right after
 * the generic instance descriptor, hence the (_si + 1).
 * If m is already the head of q (this is how wf2qp_new_queue() calls
 * us) the packet is not queued again and only the scheduler state is
 * updated.
 *
 * Returns 1 if dn_enqueue() reports the packet as dropped, 0 otherwise.
 */
static int
wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
{
    struct dn_fsk *fs = q->fs;
    struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
    struct wf2qp_queue *alg_fq;
    uint64_t len = m->m_pkthdr.len;

    if (m != q->mq.head) {
	if (dn_enqueue(q, m, 0)) /* packet was dropped */
	    return 1;
	if (m != q->mq.head)	/* queue was already busy */
	    return 0;
    }

    /* If reach this point, queue q was idle */
    alg_fq = (struct wf2qp_queue *)q;

    if (DN_KEY_LT(alg_fq->F, alg_fq->S)) {
        /* F<S means timestamps are invalid ->brand new queue. */
        alg_fq->S = si->V;		/* init start time */
        si->wsum += fs->fs.par[0];	/* add weight of new queue. */
	si->inv_wsum = ONE_FP/si->wsum;
    } else { /* if it was idle then it was in the idle heap */
        heap_extract(&si->idle_heap, q);
        alg_fq->S = MAX64(alg_fq->F, si->V);	/* compute new S */
    }
    /* finish time: start time plus the packet length scaled by 1/weight */
    alg_fq->F = alg_fq->S + len * alg_fq->inv_w;

    /* if nothing is backlogged, make sure this flow is eligible */
    if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0)
        si->V = MAX64(alg_fq->S, si->V);

    /*
     * Look at eligibility. A flow is not eligible if S>V (when
     * this happens, it means that there is some other flow already
     * scheduled for the same pipe, so the sch_heap cannot be
     * empty). If the flow is not eligible we just store it in the
     * ne_heap. Otherwise, we store in the sch_heap.
     * Note that for all flows in sch_heap (SCH), S_i <= V,
     * and for all flows in ne_heap (NEH), S_i > V.
     * So when we need to compute max(V, min(S_i)) forall i in
     * SCH+NEH, we only need to look into NEH.
     */
    if (DN_KEY_LT(si->V, alg_fq->S)) {
        /* S>V means flow Not eligible. */
        if (si->sch_heap.elements == 0)
            D("++ ouch! not eligible but empty scheduler!");
        heap_insert(&si->ne_heap, alg_fq->S, q);
    } else {
        heap_insert(&si->sch_heap, alg_fq->F, q);
    }
    return 0;
}

/* XXX invariant: sch > 0 || V >= min(S in neh) */
/*
 * Extract the next packet to transmit from the scheduler instance _si.
 *
 * Returns NULL when neither the scheduler heap nor the not-eligible
 * heap holds a queue; in that case the idle heap is drained completely
 * and V and the sum of weights are reset to 0.
 * Otherwise the packet is taken from the queue on top of the scheduler
 * heap (smallest finish time), V is advanced by the packet length
 * scaled by inv_wsum, and the queue is moved to the idle heap (no more
 * packets) or re-inserted in the proper heap with updated timestamps.
 */
static struct mbuf *
wf2qp_dequeue(struct dn_sch_inst *_si)
{
	/* Access scheduler instance private data */
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
	struct mbuf *m;
	struct dn_queue *q;
	struct dn_heap *sch = &si->sch_heap;
	struct dn_heap *neh = &si->ne_heap;
	struct wf2qp_queue *alg_fq;

	if (sch->elements == 0 && neh->elements == 0) {
		/* we have nothing to do. We could kill the idle heap
		 * altogether and reset V
		 */
		idle_check(si, 0x7fffffff, 1);
		si->V = 0;
		si->wsum = 0;	/* should be set already */
		return NULL;	/* quick return if nothing to do */
	}
	idle_check(si, 1, 0);	/* drain something from the idle heap */

	/* make sure at least one element is eligible, bumping V
	 * and moving entries that have become eligible.
	 * We need to repeat the first part twice, before and
	 * after extracting the candidate, or enqueue() will
	 * find the data structure in a wrong state.
	 */
  m = NULL;
  for(;;) {
	/*
	 * Compute V = max(V, min(S_i)). Remember that all elements
	 * in sch have by definition S_i <= V so if sch is not empty,
	 * V is surely the max and we must not update it. Conversely,
	 * if sch is empty we only need to look at neh.
	 * We don't need to move the queues, as it will be done at the
	 * next enqueue
	 */
	if (sch->elements == 0 && neh->elements > 0) {
		si->V = MAX64(si->V, HEAP_TOP(neh)->key);
	}
	/* move to sch every queue whose start time is now <= V */
	while (neh->elements > 0 &&
		    DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) {
		q = HEAP_TOP(neh)->object;
		alg_fq = (struct wf2qp_queue *)q;
		heap_extract(neh, NULL);
		heap_insert(sch, alg_fq->F, q);
	}
	if (m) /* pkt found in previous iteration */
		break;
	/* ok we have at least one eligible pkt */
	q = HEAP_TOP(sch)->object;
	alg_fq = (struct wf2qp_queue *)q;
	m = dn_dequeue(q);
	heap_extract(sch, NULL); /* Remove queue from heap. */
	/* advance the virtual time by the packet length times 1/wsum */
	si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum;
	alg_fq->S = alg_fq->F;  /* Update start time. */
	if (q->mq.head == 0) {	/* not backlogged any more. */
		heap_insert(&si->idle_heap, alg_fq->F, q);
	} else {			/* Still backlogged. */
		/* Update F, store in neh or sch */
		uint64_t len = q->mq.head->m_pkthdr.len;
		alg_fq->F += len * alg_fq->inv_w;
		if (DN_KEY_LEQ(alg_fq->S, si->V)) {
			heap_insert(sch, alg_fq->F, q);
		} else {
			heap_insert(neh, alg_fq->S, q);
		}
	}
    }
	return m;
}

/*
 * Set up the three heaps of a new scheduler instance.
 * Returns 0 on success; if any heap_init() call fails all three heaps
 * are passed to heap_free() and ENOMEM is returned.
 */
static int
wf2qp_new_sched(struct dn_sch_inst *_si)
{
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
	/* where each queue records its own position inside a heap */
	int pos_ofs = offsetof(struct wf2qp_queue, heap_pos);
	int failed;

	/* all heaps support extract from middle */
	failed = heap_init(&si->idle_heap, 16, pos_ofs);
	if (!failed)
		failed = heap_init(&si->sch_heap, 16, pos_ofs);
	if (!failed)
		failed = heap_init(&si->ne_heap, 16, pos_ofs);
	if (!failed)
		return 0;

	heap_free(&si->ne_heap);
	heap_free(&si->sch_heap);
	heap_free(&si->idle_heap);
	return ENOMEM;
}

/*
 * Release the three heaps owned by a scheduler instance.
 * Always returns 0.
 */
static int
wf2qp_free_sched(struct dn_sch_inst *_si)
{
	struct wf2qp_si *priv = (struct wf2qp_si *)(_si + 1);
	struct dn_heap *heaps[3];
	int i;

	heaps[0] = &priv->sch_heap;
	heaps[1] = &priv->ne_heap;
	heaps[2] = &priv->idle_heap;
	for (i = 0; i < 3; i++)
		heap_free(heaps[i]);

	return 0;
}

/*
 * Called for a new flowset: sanitize the weight stored in par[0].
 * NOTE(review): the numeric arguments look like (default, min, max),
 * i.e. weight in 1..100 with default 1 -- confirm against
 * ipdn_bound_var().
 */
static int
wf2qp_new_fsk(struct dn_fsk *fs)
{
	ipdn_bound_var(&fs->fs.par[0], 1, 1, 100, "WF2Q+ weight");

	return 0;
}

/*
 * Initialize the scheduler-specific part of a new queue: tag it as
 * WF2Q+, mark its timestamps as invalid (F < S) and precompute the
 * inverse of the flowset weight. If the queue already holds packets,
 * run the enqueue logic on its head packet. Always returns 0.
 */
static int
wf2qp_new_queue(struct dn_queue *_q)
{
	struct wf2qp_queue *wq = (struct wf2qp_queue *)_q;

	_q->ni.oid.subtype = DN_SCHED_WF2QP;

	wq->F = 0;		/* not strictly necessary */
	wq->S = wq->F + 1;	/* mark timestamp as invalid. */
	wq->inv_w = ONE_FP / _q->fs->fs.par[0];

	if (_q->mq.head == NULL)
		return 0;

	wf2qp_enqueue(_q->_si, _q, _q->mq.head);
	return 0;
}

/*
 * Called when the infrastructure removes a queue (e.g. flowset
 * is reconfigured). Nothing to do if we did not 'own' the queue,
 * otherwise remove it from the right heap and adjust the sum
 * of weights.
 *
 * Returns 0 when the queue is not (or no longer) referenced by the
 * scheduler, 1 when 'safe' is set and the queue is still in a heap,
 * in which case it is left untouched.
 */
static int
wf2qp_free_queue(struct dn_queue *q, int safe)
{
	struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
	struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1);

	/*
	 * F < S marks invalid timestamps, i.e. a queue that is in none
	 * of the heaps. Use the same wraparound-safe comparison as
	 * wf2qp_enqueue(): a plain 'S >= F + 1' gives the wrong answer
	 * when F + 1 wraps around.
	 */
	if (DN_KEY_LT(alg_fq->F, alg_fq->S))
		return 0;	/* nothing to do, not in any heap */

	/* queue is in a scheduler heap */
	if (safe)	/* do not delete in safe mode */
		return 1;

	si->wsum -= q->fs->fs.par[0];
	if (si->wsum > 0)
		si->inv_wsum = ONE_FP/si->wsum;

	/* extract from the heap. XXX TODO we may need to adjust V
	 * to make sure the invariants hold.
	 */
	if (q->mq.head == NULL) {
		/* no packets: the queue is idle */
		heap_extract(&si->idle_heap, q);
	} else if (DN_KEY_LT(si->V, alg_fq->S)) {
		/* backlogged with S > V: not eligible */
		heap_extract(&si->ne_heap, q);
	} else {
		/* backlogged and eligible */
		heap_extract(&si->sch_heap, q);
	}
	return 0;
}

/*
 * WF2Q+ scheduler descriptor
 * contains the type of the scheduler, the name, the size of the
 * structures and function pointers.
 */
static struct dn_alg wf2qp_desc = {
	_SI( .type = ) DN_SCHED_WF2QP,
	_SI( .name = ) "WF2Q+",
	_SI( .flags = ) DN_MULTIQUEUE,

	/* we need extra space in the si and the queue */
	_SI( .schk_datalen = ) 0,
	_SI( .si_datalen = ) sizeof(struct wf2qp_si),
	/* only the part of wf2qp_queue that follows the generic dn_queue */
	_SI( .q_datalen = ) sizeof(struct wf2qp_queue) -
				sizeof(struct dn_queue),

	_SI( .enqueue = ) wf2qp_enqueue,
	_SI( .dequeue = ) wf2qp_dequeue,

	/* no scheduler-wide configuration hooks */
	_SI( .config = )  NULL,
	_SI( .destroy = )  NULL,
	_SI( .new_sched = ) wf2qp_new_sched,
	_SI( .free_sched = ) wf2qp_free_sched,

	_SI( .new_fsk = ) wf2qp_new_fsk,
	_SI( .free_fsk = )  NULL,

	_SI( .new_queue = ) wf2qp_new_queue,
	_SI( .free_queue = ) wf2qp_free_queue,
};


/* Register the descriptor above as the dn_wf2qp scheduler module. */
DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc);


================================================
FILE: sys/netinet/ipfw/ip_dn_glue.c
================================================
/*-
 * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id: ip_dn_glue.c 12500 2013-12-11 23:07:58Z luigi $
 *
 * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8
 */

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/time.h>
#include <sys/taskqueue.h>
#include <net/if.h>	/* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
#include <netinet/in.h>
#include <netinet/ip_var.h>	/* ip_output(), IP_FORWARDING */
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>

#include <netinet/ipfw/ip_fw_private.h>
#include <netinet/ipfw/dn_heap.h>
#include <netinet/ipfw/ip_dn_private.h>
#include <netinet/ipfw/dn_sched.h>

/* FREEBSD7.2 ip_dummynet.h r191715*/

/* Heap entry as laid out in the old (FreeBSD 7.2) binary interface. */
struct dn_heap_entry7 {
	int64_t	key;		/* sorting key, the smallest one is on top */
	void	*object;	/* pointer to the object in the heap */
};

/* Heap descriptor as laid out in the old binary interface. */
struct dn_heap7 {
	int	size;
	int	elements;
	int	offset;	/* XXX if > 0 this is the offset of direct ptr to obj */
	struct dn_heap_entry7	*p;	/* really an array of "size" entries */
};

/*
 * Flowset descriptor of the old binary interface.
 * Common to 7.2 and 8; the field order and types are part of the ABI
 * spoken by old /sbin/ipfw binaries.
 */
struct dn_flow_set {
	SLIST_ENTRY(dn_flow_set) next;	/* linked list in a hash slot */

	u_short fs_nr;			/* flow_set number */
	u_short flags_fs;
#define DNOLD_HAVE_FLOW_MASK	0x0001
#define DNOLD_IS_RED		0x0002
#define DNOLD_IS_GENTLE_RED	0x0004
#define DNOLD_QSIZE_IS_BYTES	0x0008	/* queue size is measured in bytes */
#define DNOLD_NOERROR		0x0010	/* do not report ENOBUFS on drops */
#define DNOLD_HAS_PROFILE	0x0020	/* the pipe has a delay profile. */
#define DNOLD_IS_PIPE		0x4000
#define DNOLD_IS_QUEUE		0x8000

	struct dn_pipe7 *pipe;		/* pointer to parent pipe */
	u_short parent_nr;		/* parent pipe#, 0 if local to a pipe */

	int weight;			/* WFQ queue weight */
	int qsize;			/* queue size in slots or bytes */
	int plr;			/* pkt loss rate (2^31-1 means 100%) */

	struct ipfw_flow_id flow_mask;

	/* hash table of queues onto this flow_set */
	int rq_size;			/* number of slots */
	int rq_elements;		/* active elements */
	struct dn_flow_queue7 **rq;	/* array of rq_size entries */

	u_int32_t last_expired;		/* do not expire too frequently */
	int backlogged;			/* #active queues for this flowset */

	/* RED parameters */
#define SCALE_RED		16
#define SCALE(x)		( (x) << SCALE_RED )
#define SCALE_VAL(x)		( (x) >> SCALE_RED )
#define SCALE_MUL(x,y)		( ( (x) * (y) ) >> SCALE_RED )
	int w_q;		/* queue weight (scaled) */
	int max_th;		/* maximum threshold for queue (scaled) */
	int min_th;		/* minimum threshold for queue (scaled) */
	int max_p;		/* maximum value for p_b (scaled) */
	u_int c_1;		/* max_p/(max_th-min_th) (scaled) */
	u_int c_2;		/* max_p*min_th/(max_th-min_th) (scaled) */
	u_int c_3;		/* for GRED, (1-max_p)/max_th (scaled) */
	u_int c_4;		/* for GRED, 1 - 2*max_p (scaled) */
	u_int *w_q_lookup;	/* lookup table for computing (1-w_q)^t */
	u_int lookup_depth;	/* depth of lookup table */
	int lookup_step;	/* granularity inside the lookup table */
	int lookup_weight;	/* equal to (1-w_q)^t / (1-w_q)^(t+1) */
	int avg_pkt_size;	/* medium packet size */
	int max_pkt_size;	/* max packet size */
};
SLIST_HEAD(dn_flow_set_head, dn_flow_set);

/* markers stored in the 'next' pointer of exported pipes and flowsets */
#define DN_IS_PIPE		0x4000
#define DN_IS_QUEUE		0x8000

/* Per-flow queue descriptor as laid out by FreeBSD 7.2. */
struct dn_flow_queue7 {
	struct dn_flow_queue7 *next;
	struct ipfw_flow_id id;

	struct mbuf *head, *tail;	/* queue of packets */
	u_int len;
	u_int len_bytes;

	u_long numbytes;

	u_int64_t tot_pkts;		/* statistics counters */
	u_int64_t tot_bytes;
	u_int32_t drops;

	int hash_slot;			/* debugging/diagnostic */

	/* RED parameters */
	int avg;			/* average queue length est. (scaled) */
	int count;			/* arrivals since last RED drop */
	int random;			/* random value (scaled) */
	u_int32_t q_time;		/* start of queue idle time */

	/* WF2Q+ support */
	struct dn_flow_set *fs;		/* parent flow set */
	int heap_pos;			/* position (index) of struct in heap */
	int64_t sched_time;	/* current time when queue enters ready_heap */

	int64_t S, F;			/* start time, finish time */
};

/* Pipe descriptor as laid out by FreeBSD 7.2. */
struct dn_pipe7 {
	SLIST_ENTRY(dn_pipe7) next;	/* linked list in a hash slot */

	int pipe_nr;			/* number */
	int bandwidth;			/* really, bytes/tick. */
	int delay;			/* really, ticks */

	struct mbuf *head, *tail;	/* packets in delay line */

	/* WF2Q+ */
	struct dn_heap7 scheduler_heap;	   /* top extract - key Finish time */
	struct dn_heap7 not_eligible_heap; /* top extract - key Start time */
	struct dn_heap7 idle_heap;  /* random extract - key Start=Finish time */

	int64_t V;			/* virtual time */
	int sum;		/* sum of weights of all active sessions */

	int numbytes;

	int64_t sched_time;	/* time pipe was scheduled in ready_heap */

	/*
	 * When the tx clock comes from an interface (if_name[0] != '\0'),
	 * its name is stored below, whereas the ifp is filled when the
	 * rule is configured.
	 */
	char if_name[IFNAMSIZ];
	struct ifnet *ifp;
	int ready;	/* set if ifp != NULL and we got a signal from it */

	struct dn_flow_set fs;		/* used with fixed-rate flows */
};
SLIST_HEAD(dn_pipe_head7, dn_pipe7);


/* FREEBSD8 ip_dummynet.h r196045 */
/* Per-flow queue descriptor as laid out by FreeBSD 8. */
struct dn_flow_queue8 {
	struct dn_flow_queue8 *next;
	struct ipfw_flow_id id;

	struct mbuf *head, *tail;	/* queue of packets */
	u_int len;
	u_int len_bytes;

	uint64_t numbytes;	/* credit for transmission (dynamic queues) */
	int64_t extra_bits;	/* extra bits simulating unavailable channel */

	u_int64_t tot_pkts;		/* statistics counters */
	u_int64_t tot_bytes;
	u_int32_t drops;

	int hash_slot;			/* debugging/diagnostic */

	/* RED parameters */
	int avg;			/* average queue length est. (scaled) */
	int count;			/* arrivals since last RED drop */
	int random;			/* random value (scaled) */
	int64_t idle_time;		/* start of queue idle time */

	/* WF2Q+ support */
	struct dn_flow_set *fs;		/* parent flow set */
	int heap_pos;			/* position (index) of struct in heap */
	int64_t sched_time;	/* current time when queue enters ready_heap */

	int64_t S, F;			/* start time, finish time */
};

/* Pipe descriptor as laid out by FreeBSD 8. */
struct dn_pipe8 {
	SLIST_ENTRY(dn_pipe8) next;	/* linked list in a hash slot */

	int pipe_nr;			/* number */
	int bandwidth;			/* really, bytes/tick. */
	int delay;			/* really, ticks */

	struct mbuf *head, *tail;	/* packets in delay line */

	/* WF2Q+ */
	struct dn_heap7 scheduler_heap;	   /* top extract - key Finish time */
	struct dn_heap7 not_eligible_heap; /* top extract - key Start time */
	struct dn_heap7 idle_heap;  /* random extract - key Start=Finish time */

	int64_t V;			/* virtual time */
	int sum;		/* sum of weights of all active sessions */

	/* Same as in dn_flow_queue, numbytes can become large */
	int64_t numbytes;	/* bits I can transmit (more or less). */
	uint64_t burst;		/* burst size, scaled: bits * hz */

	int64_t sched_time;	/* time pipe was scheduled in ready_heap */
	int64_t idle_time;	/* start of pipe idle time */

	char if_name[IFNAMSIZ];
	struct ifnet *ifp;
	int ready;	/* set if ifp != NULL and we got a signal from it */

	struct dn_flow_set fs;		/* used with fixed-rate flows */

	/* fields to simulate a delay profile */
#define ED_MAX_NAME_LEN		32
	char name[ED_MAX_NAME_LEN];
	int loss_level;
	int samples_no;
	int *samples;
};

#define ED_MAX_SAMPLES_NO	1024
/* A FreeBSD 8 pipe followed by room for the largest delay profile. */
struct dn_pipe_max8 {
	struct dn_pipe8 pipe;
	int samples[ED_MAX_SAMPLES_NO];
};
SLIST_HEAD(dn_pipe_head8, dn_pipe8);

/*
 * Changes from 7.2 to 8:
 * dn_pipe:
 *      numbytes changed from int to int64_t
 *      added burst (uint64_t)
 *      added idle_time (int64_t)
 *      added the delay profile fields (name, loss_level, samples_no,
 *          samples)
 *      added struct dn_pipe_max
 *      added flag DN_HAS_PROFILE
 *
 * dn_flow_queue:
 *      numbytes changed from u_long to uint64_t
 *      added extra_bits (int64_t)
 *      q_time changed from u_int32_t to int64_t and renamed idle_time
 *
 * dn_flow_set is unchanged
 *
 */

/* NOTE:XXX copied from dummynet.c */
/*
 * Address located 'len' bytes after 'p', as a void pointer.
 * Both arguments are parenthesized so that arbitrary expressions
 * can be passed safely.
 */
#define O_NEXT(p, len) ((void *)((char *)(p) + (len)))
/*
 * Initialize an object header with the given length, type and id.
 * The subtype is always reset to 0.
 */
static void
oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
{
	oid->subtype = 0;
	oid->type = type;
	oid->len = len;
	oid->id = id;
}
/*
 * Reserve 'len' bytes at the current position *o of the buffer for an
 * object of the given type: the object header is initialized, *o is
 * moved past the object, and a pointer to the object is returned.
 */
static void *
o_next(struct dn_id **o, int len, int type)
{
	struct dn_id *obj = *o;

	oid_fill(obj, len, type, 0);
	*o = O_NEXT(obj, len);

	return obj;
}


/*
 * Sizes of the pipe structures of the old binary interfaces.
 * ip_dummynet_compat() compares the length of an incoming request
 * against these values to guess which ipfw version is talking to us.
 */
static size_t pipesize7 = sizeof(struct dn_pipe7);
static size_t pipesize8 = sizeof(struct dn_pipe8);
static size_t pipesizemax8 = sizeof(struct dn_pipe_max8);

/* Indicates the version of the 'ipfw' binary we are talking to:
 * 1: from FreeBSD 7.2
 * 0: from FreeBSD 8
 * -1: unknown (currently unused)
 *
 * It is updated by ip_dummynet_compat() whenever the length of a request
 * matches the size of one of the known pipe structures, which is the
 * case for IP_DUMMYNET_DEL and IP_DUMMYNET_CONFIGURE requests.
 * NOTE: if an IP_DUMMYNET_GET arrives while the 'ipfw' version is still
 *       unknown, it is assumed to be the FreeBSD 8 version.
 */
static int is7 = 0;

/*
 * Translate flowset flags from the old (DNOLD_*) encoding to the
 * current (DN_*) one. Old flags without an entry in the table are
 * dropped.
 */
static int
convertflags2new(int src)
{
	/* pairs of { old flag, corresponding new flag } */
	static const int xlat[][2] = {
		{ DNOLD_HAVE_FLOW_MASK,	DN_HAVE_MASK },
		{ DNOLD_QSIZE_IS_BYTES,	DN_QSIZE_BYTES },
		{ DNOLD_NOERROR,	DN_NOERROR },
		{ DNOLD_IS_RED,		DN_IS_RED },
		{ DNOLD_IS_GENTLE_RED,	DN_IS_GENTLE_RED },
		{ DNOLD_HAS_PROFILE,	DN_HAS_PROFILE },
	};
	unsigned int i;
	int dst = 0;

	for (i = 0; i < sizeof(xlat) / sizeof(xlat[0]); i++) {
		if (src & xlat[i][0])
			dst |= xlat[i][1];
	}

	return dst;
}

/*
 * Translate flowset flags from the current (DN_*) encoding to the old
 * (DNOLD_*) one. Flags that are not listed below are dropped.
 */
static int
convertflags2old(int src)
{
	int dst;

	dst  = (src & DN_HAVE_MASK)	? DNOLD_HAVE_FLOW_MASK	: 0;
	dst |= (src & DN_IS_RED)	? DNOLD_IS_RED		: 0;
	dst |= (src & DN_IS_GENTLE_RED)	? DNOLD_IS_GENTLE_RED	: 0;
	dst |= (src & DN_NOERROR)	? DNOLD_NOERROR		: 0;
	dst |= (src & DN_HAS_PROFILE)	? DNOLD_HAS_PROFILE	: 0;
	dst |= (src & DN_QSIZE_BYTES)	? DNOLD_QSIZE_IS_BYTES	: 0;

	return dst;
}

/*
 * Handle an old-style 'pipe N delete' / 'queue N delete' request.
 * v points to a dn_pipe7 or dn_pipe8, depending on is7. Exactly one of
 * the pipe number and the flowset number must be non-zero, otherwise
 * EINVAL is returned. The request is translated into a DN_CMD_DELETE
 * command and passed to do_config(), whose result is returned.
 */
static int
dn_compat_del(void *v)
{
	struct dn_pipe7 *old7 = (struct dn_pipe7 *) v;
	struct dn_pipe8 *old8 = (struct dn_pipe8 *) v;
	struct {
		struct dn_id oid;
		uintptr_t a[1];	/* add more if we want a list */
	} cmd;
	int pipe_nr, fs_nr;

	/* XXX DN_API_VERSION ??? */
	oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION);

	/* the flowset sits at a different offset in the two layouts */
	if (is7) {
		pipe_nr = old7->pipe_nr;
		fs_nr = old7->fs.fs_nr;
	} else {
		pipe_nr = old8->pipe_nr;
		fs_nr = old8->fs.fs_nr;
	}

	/* exactly one of the two numbers must be set */
	if ((pipe_nr != 0) == (fs_nr != 0))
		return EINVAL;

	/* pipe_nr is read through the 7.2 layout from here on */
	if (old7->pipe_nr == 0) {	/* queue x delete */
		cmd.oid.subtype = DN_FS;
		cmd.a[0] = fs_nr;
	} else {			/* pipe x delete */
		cmd.a[0] = old7->pipe_nr;
		cmd.oid.subtype = DN_LINK;
	}

	return do_config(&cmd, cmd.oid.len);
}

/*
 * Fill the new-style flowset descriptor fs from the old-style flowset
 * embedded in the request v (a dn_pipe7 or dn_pipe8, depending on is7).
 * The RED parameters are copied only when one of the RED flags is set.
 * Always returns 0.
 */
static int
dn_compat_config_queue(struct dn_fs *fs, void* v)
{
	struct dn_flow_set *old;

	if (is7)
		old = &((struct dn_pipe7 *)v)->fs;
	else
		old = &((struct dn_pipe8 *)v)->fs;

	fs->fs_nr = old->fs_nr;
	fs->sched_nr = old->parent_nr;
	fs->flow_mask = old->flow_mask;
	fs->buckets = old->rq_size;
	fs->qsize = old->qsize;
	fs->plr = old->plr;
	fs->par[0] = old->weight;
	fs->flags = convertflags2new(old->flags_fs);

	if (fs->flags & (DN_IS_GENTLE_RED | DN_IS_RED)) {
		fs->w_q = old->w_q;
		fs->max_th = old->max_th;
		fs->min_th = old->min_th;
		fs->max_p = old->max_p;
	}

	return 0;
}

/*
 * Translate an old-style 'pipe N config' request (v) into the three
 * new-style objects: scheduler sch, link p and the flowset fs attached
 * to the scheduler. Always returns 0.
 */
static int
dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p, 
		      struct dn_fs *fs, void* v)
{
	struct dn_pipe7 *old7 = (struct dn_pipe7 *)v;
	struct dn_pipe8 *old8 = (struct dn_pipe8 *)v;
	int nr = old7->pipe_nr;

	sch->sched_nr = nr;
	sch->oid.subtype = 0;

	/* link parameters; these fields are common to 7 and 8 */
	p->link_nr = nr;
	p->bandwidth = old7->bandwidth;
	p->delay = old7->delay;
	if (!is7)
		p->burst = old8->burst;	/* FreeBSD 8 has burst */

	/*
	 * fill the fifo flowset; the numbers are set afterwards because
	 * dn_compat_config_queue() overwrites both fs_nr and sched_nr
	 */
	dn_compat_config_queue(fs, v);
	fs->fs_nr = nr + 2*DN_MAX_ID;
	fs->sched_nr = nr + DN_MAX_ID;

	/* Move scheduler related parameter from fs to sch */
	sch->buckets = fs->buckets; /*XXX*/
	fs->buckets = 0;
	if ((fs->flags & DN_HAVE_MASK) == 0)
		return 0;

	sch->flags |= DN_HAVE_MASK;
	fs->flags &= ~DN_HAVE_MASK;
	sch->sched_mask = fs->flow_mask;
	bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id));

	return 0;
}

/*
 * Fill the new-style delay profile pf for link p from the FreeBSD 8
 * request v.
 *
 * v is accessed as a struct dn_pipe_max8: the samples are read from the
 * array that follows the pipe, and p8->samples is overwritten to point
 * there. NOTE(review): this function cannot check the size of the
 * buffer behind v; the caller is assumed to pass a full dn_pipe_max8.
 *
 * Always returns 0.
 */
static int
dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p,
			 void *v)
{
	struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;

	p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]);
	
	pf->link_nr = p->link_nr;
	pf->loss_level = p8->loss_level;
// 	pf->bandwidth = p->bandwidth; //XXX bandwidth redundant?
	pf->samples_no = p8->samples_no;
	strncpy(pf->name, p8->name,sizeof(pf->name));
	/*
	 * strncpy() does not terminate the destination when the source
	 * fills it completely, and the name comes from userland.
	 */
	pf->name[sizeof(pf->name) - 1] = '\0';
	bcopy(p8->samples, pf->samples, sizeof(pf->samples));

	return 0;
}

/*
 * If p->pipe_nr != 0 the command is 'pipe x config', so need to create
 * the three main struct, else only a flowset is created
 */
static int
dn_compat_configure(void *v)
{
	struct dn_id *buf = NULL, *base;
	struct dn_sch *sch = NULL;
	struct dn_link *p = NULL;
	struct dn_fs *fs = NULL;
	struct dn_profile *pf = NULL;
	int lmax;
	int error;

	struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
	struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;

	int i; /* number of object to configure */

	lmax = sizeof(struct dn_id);	/* command header */
	lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) +
		sizeof(struct dn_fs) + sizeof(struct dn_profile);

	base = buf = malloc(lmax, M_DUMMYNET, M_WAITOK|M_ZERO);
	o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG);
	base->id = DN_API_VERSION;

	/* pipe_nr is the same in p7 and p8 */
	i = p7->pipe_nr;
	if (i != 0) { /* pipe config */
		sch = o_next(&buf, sizeof(*sch), DN_SCH);
		p = o_next(&buf, sizeof(*p), DN_LINK);
		fs = o_next(&buf, sizeof(*fs), DN_FS);

		error = dn_compat_config_pipe(sch, p, fs, v);
		if (error) {
			free(buf, M_DUMMYNET);
			return error;
		}
		if (!is7 && p8->samples_no > 0) {
			/* Add profiles*/
			pf = o_next(&buf, sizeof(*pf), DN_PROFILE);
			error = dn_compat_config_profile(pf, p, v);
			if (error) {
				free(buf, M_DUMMYNET);
				return error;
			}
		}
	} else { /* queue config */
		fs = o_next(&buf, sizeof(*fs), DN_FS);
		error = dn_compat_config_queue(fs, v);
		if (error) {
			free(buf, M_DUMMYNET);
			return error;
		}
	}
	error = do_config(base, (char *)buf - (char *)base);

	if (buf)
		free(buf, M_DUMMYNET);
	return error;
}

/*
 * Compute the size of the buffer used to export the configuration in
 * the old format, from the object counters in dn_cfg.
 */
int
dn_compat_calc_size(void)
{
	int total;

	/* XXX use FreeBSD 8 struct size */
	/* NOTE:
	 * - half scheduler: 		schk_count/2
	 * - all flowset:		fsk_count
	 * - all flowset queues:	queue_count
	 * - all pipe queue:		si_count
	 */
	total = dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2;
	total += dn_cfg.fsk_count * sizeof(struct dn_flow_set);
	total += dn_cfg.si_count * sizeof(struct dn_flow_queue8);
	total += dn_cfg.queue_count * sizeof(struct dn_flow_queue8);

	return total;
}

/*
 * Export the flow _ni as an old-style queue record at the current
 * output position *a->start (arg is a struct copy_args), then move the
 * position past the record. The record layout depends on is7.
 * Always returns 0.
 */
int
dn_c_copy_q (void *_ni, void *arg)
{
	struct copy_args *a = arg;
	struct dn_flow *flow = (struct dn_flow *)_ni;
	struct dn_flow_queue7 *q7 = (struct dn_flow_queue7 *)*a->start;
	struct dn_flow_queue8 *q8 = (struct dn_flow_queue8 *)*a->start;

	/* XXX hash slot not set */
	/* No difference between 7.2/8 */
	q7->len = flow->length;
	q7->len_bytes = flow->len_bytes;
	q7->id = flow->fid;

	/* the counters sit at different offsets in the two layouts */
	if (!is7) {
		q8->tot_pkts = flow->tot_pkts;
		q8->tot_bytes = flow->tot_bytes;
		q8->drops = flow->drops;
		*a->start += (int)sizeof(struct dn_flow_queue8);
		return 0;
	}

	q7->tot_pkts = flow->tot_pkts;
	q7->tot_bytes = flow->tot_bytes;
	q7->drops = flow->drops;
	*a->start += (int)sizeof(struct dn_flow_queue7);
	return 0;
}

/*
 * Export scheduler s as an old-style pipe record at the current output
 * position *a->start, then move the position past the record.
 * nq is stored as the number of queues of the embedded flowset.
 * The record layout depends on is7. Always returns 0.
 */
int
dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq)
{
	struct dn_link *link = &s->link;
	struct dn_fsk *fsk = s->fs;
	struct dn_pipe7 *out7 = (struct dn_pipe7 *)*a->start;
	struct dn_pipe8 *out8 = (struct dn_pipe8 *)*a->start;
	struct dn_flow_set *ofs = is7 ? &out7->fs : &out8->fs;
	int size = is7 ? sizeof(struct dn_pipe7) : sizeof(struct dn_pipe8);

	/* These 4 field are the same in pipe7 and pipe8 */
	out7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE;
	out7->bandwidth = link->bandwidth;
	out7->delay = link->delay * 1000 / hz;
	out7->pipe_nr = link->link_nr - DN_MAX_ID;

	if (!is7) {
		struct dn_profile *pf = s->profile;

		if (pf) {
			strncpy(out8->name, pf->name, sizeof(pf->name));
			out8->loss_level = pf->loss_level;
			out8->samples_no = pf->samples_no;
		}
		out8->burst = div64(link->burst , 8 * hz);
	}

	/* scheduler parameters are exported through the flowset */
	ofs->flow_mask = s->sch.sched_mask;
	ofs->rq_size = s->sch.buckets ? s->sch.buckets : 1;
	ofs->rq_elements = nq;
	ofs->parent_nr = link->link_nr - DN_MAX_ID;

	ofs->qsize = fsk->fs.qsize;
	ofs->plr = fsk->fs.plr;
	ofs->w_q = fsk->fs.w_q;
	ofs->max_th = fsk->max_th;
	ofs->min_th = fsk->min_th;
	ofs->max_p = fsk->fs.max_p;
	ofs->flags_fs = convertflags2old(fsk->fs.flags);

	*a->start += size;
	return 0;
}


/*
 * Export scheduler _o, followed by one queue record per scheduler
 * instance, in the old format.
 * Returns 1 without writing anything if the space left in the output
 * buffer is not enough, 0 otherwise.
 */
int
dn_compat_copy_pipe(struct copy_args *a, void *_o)
{
	struct dn_schk *s = (struct dn_schk *)_o;
	int have = a->end - *a->start;
	int pipe_size = sizeof(struct dn_pipe8);
	int queue_size = sizeof(struct dn_flow_queue8);
	int masked = s->sch.flags & DN_HAVE_MASK;
	int n_queue;	/* number of queues */
	int need;

	/* calculate needed space:
	 * - struct dn_pipe
	 * - if there are instances, dn_queue * n_instances
	 */
	if (masked)
		n_queue = dn_ht_entries(s->siht);
	else if (s->siht)
		n_queue = 1;
	else
		n_queue = 0;

	need = pipe_size + queue_size * n_queue;
	if (have < need) {
		D("have %d < need %d", have, need);
		return 1;
	}

	/* copy pipe */
	dn_c_copy_pipe(s, a, n_queue);

	/* copy queues */
	if (masked)
		dn_ht_scan(s->siht, dn_c_copy_q, a);
	else if (s->siht)
		dn_c_copy_q(s->siht, a);

	return 0;
}

/*
 * Export flowset f as an old-style flowset record at the current
 * output position *a->start, then move the position past the record.
 * nq is stored as the number of queues. Always returns 0.
 */
int
dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq)
{
	struct dn_flow_set *out = (struct dn_flow_set *)*a->start;

	out->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;

	/* identity and hierarchy */
	out->fs_nr = f->fs.fs_nr;
	out->parent_nr = f->fs.sched_nr;
	out->weight = f->fs.par[0];
	out->flow_mask = f->fs.flow_mask;
	out->flags_fs = convertflags2old(f->fs.flags);

	/* queue parameters */
	out->qsize = f->fs.qsize;
	out->plr = f->fs.plr;
	out->rq_elements = nq;
	out->rq_size = (f->fs.buckets ? f->fs.buckets : 1);

	/* RED parameters */
	out->w_q = f->fs.w_q;
	out->max_th = f->max_th;
	out->min_th = f->min_th;
	out->max_p = f->fs.max_p;

	*a->start += sizeof(struct dn_flow_set);
	return 0;
}

/*
 * Export flowset _o, followed by one record per queue, in the old
 * format.
 * Returns 1 without writing anything if the space left in the output
 * buffer is not enough, 0 otherwise.
 */
int
dn_compat_copy_queue(struct copy_args *a, void *_o)
{
	struct dn_fsk *fs = (struct dn_fsk *)_o;
	int have = a->end - *a->start;
	int fs_size = sizeof(struct dn_flow_set);
	int queue_size = sizeof(struct dn_flow_queue8);
	int masked = fs->fs.flags & DN_HAVE_MASK;
	int n_queue;	/* number of queues */
	int need;

	if (masked)
		n_queue = dn_ht_entries(fs->qht);
	else if (fs->qht)
		n_queue = 1;
	else
		n_queue = 0;

	need = fs_size + queue_size * n_queue;
	if (have < need) {
		D("have < need");
		return 1;
	}

	/* copy flowset */
	dn_c_copy_fs(fs, a, n_queue);

	/* copy queues */
	if (masked)
		dn_ht_scan(fs->qht, dn_c_copy_q, a);
	else if (fs->qht)
		dn_c_copy_q(fs->qht, a);

	return 0;
}

/*
 * Scan callback exporting one object in the old format.
 * _arg is a struct copy_args whose type selects what _o is: a
 * scheduler for DN_COMPAT_PIPE, a flowset for DN_COMPAT_QUEUE.
 * Objects that do not belong to the old interface are skipped.
 * Returns DNHT_SCAN_END when the copy routine reports a failure,
 * 0 in all other cases.
 */
int
copy_data_helper_compat(void *_o, void *_arg)
{
	struct copy_args *a = _arg;

	if (a->type == DN_COMPAT_PIPE) {
		struct dn_schk *s = _o;

		if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID)
			return 0;	/* not old type */

		/* copy pipe parameters, and if instance exists, copy
		 * other parameters and eventually queues.
		 */
		return dn_compat_copy_pipe(a, _o) ? DNHT_SCAN_END : 0;
	}

	if (a->type == DN_COMPAT_QUEUE) {
		struct dn_fsk *fs = _o;

		if (fs->fs.fs_nr >= DN_MAX_ID)
			return 0;

		return dn_compat_copy_queue(a, _o) ? DNHT_SCAN_END : 0;
	}

	return 0;
}

/* Main function to manage old requests */
int
ip_dummynet_compat(struct sockopt *sopt)
{
	int error=0;
	void *v = NULL;
	struct dn_id oid;

	/* Lenght of data, used to found ipfw version... */
	int len = sopt->sopt_valsize;

	/* len can be 0 if command was dummynet_flush */
	if (len == pipesize7) {
		D("setting compatibility with FreeBSD 7.2");
		is7 = 1;
	}
	else if (len == pipesize8 || len == pipesizemax8) {
		D("setting compatibility with FreeBSD 8");
		is7 = 0;
	}

	switch (sopt->sopt_name) {
	default:
		printf("dummynet: -- unknown option %d", sopt->sopt_name);
		error = EINVAL;
		break;

	case IP_DUMMYNET_FLUSH:
		oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION);
		do_config(&oid, oid.len);
		break;

	case IP_DUMMYNET_DEL:
		v = malloc(len, M_TEMP, M_WAITOK);
		error = sooptcopyin(sopt, v, len, len);
		if (error)
			break;
		error = dn_compat_del(v);
		free(v, M_TEMP);
		break;

	case IP_DUMMYNET_CONFIGURE:
		v = malloc(len, M_TEMP, M_WAITOK);
		error = sooptcopyin(sopt, v, len, len);
		if (error)
			break;
		error = dn_compat_configure(v);
		free(v, M_TEMP);
		break;

	case IP_DUMMYNET_GET: {
		void *buf;
		int ret;
		int original_size = sopt->sopt_valsize;
		int size;

		ret = dummynet_get(sopt, &buf);
		if (ret)
			return 0;//XXX ?
		size = sopt->sopt_valsize;
		sopt->sopt_valsize = original_size;
		D("size=%d, buf=%p", size, buf);
		ret = sooptcopyout(sopt, buf, size);
		if (ret)
			printf("  %s ERROR sooptcopyout\n", __FUNCTION__);
		if (buf)
			free(buf, M_DUMMYNET);
	    }
	}

	return error;
}


================================================
FILE: sys/netinet/ipfw/ip_dn_io.c
================================================
/*-
 * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Dummynet portions related to packet handling.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_dn_io.c 203321 2010-01-31 21:39:25Z luigi $");

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/sysctl.h>

#include <net/if.h>	/* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
#include <net/netisr.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/ip.h>		/* ip_len, ip_off */
#include <netinet/ip_var.h>	/* ip_output(), IP_FORWARDING */
#include <netinet/ip_fw.h>
#include <netinet/ipfw/ip_fw_private.h>
#include <netinet/ipfw/dn_heap.h>
#include <netinet/ip_dummynet.h>
#include <netinet/ipfw/ip_dn_private.h>
#include <netinet/ipfw/dn_sched.h>

#include <netinet/if_ether.h> /* various ether_* routines */

#include <netinet/ip6.h>       /* for ip6_input, ip6_output prototypes */
#include <netinet6/ip6_var.h>

/*
 * We keep a private variable for the simulation time, but we could
 * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
 * instead of dn_cfg.curr_time
 */

/* Global dummynet configuration and run-time state (see struct dn_parms). */
struct dn_parms dn_cfg;
//VNET_DEFINE(struct dn_parms, _base_dn_cfg);

/*
 * Timekeeping statistics, updated by dummynet_task() on every run and
 * (when sysctl support is compiled in) exported read-only.
 */
static long tick_last;		/* Last tick duration (usec). */
static long tick_delta;		/* Last vs standard tick diff (usec). */
static long tick_delta_sum;	/* Accumulated tick difference (usec).*/
static long tick_adjustment;	/* Tick adjustments done. */
static long tick_lost;		/* Lost(coalesced) ticks number. */
/* Adjusted vs non-adjusted curr_time difference (ticks). */
static long tick_diff;

/* Packet counters. */
static unsigned long	io_pkt;		/* packets handed to dummynet_io() */
static unsigned long	io_pkt_fast;	/* packets returned on the fast path */
static unsigned long	io_pkt_drop;	/* packets dropped by dummynet */

/*
 * We use a heap to store entities for which we have pending timer events.
 * The heap is checked at every tick and all entities with expired events
 * are extracted.
 */
  
MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");

/*
 * Bridge hook, defined elsewhere. dummynet_send() calls it for
 * DIR_FWD | PROTO_IFB packets when it is not NULL.
 */
extern	void (*bridge_dn_p)(struct mbuf *, struct ifnet *);

/*
 * sysctl interface: everything below lives under net.inet.ip.dummynet.
 * Entries marked CTLFLAG_RW are tunables, CTLFLAG_RD ones are read-only.
 * The whole section is compiled only when SYSCTL_NODE is available.
 */
#ifdef SYSCTL_NODE

SYSBEGIN(f4)

SYSCTL_DECL(_net_inet);
SYSCTL_DECL(_net_inet_ip);
SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");

/* wrapper to pass dn_cfg fields to SYSCTL_* */
//#define DC(x)	(&(VNET_NAME(_base_dn_cfg).x))
#define DC(x)	(&(dn_cfg.x))
/* parameters */
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
    CTLFLAG_RW, DC(hash_size), 0, "Default hash table size");
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
    CTLFLAG_RW, DC(slot_limit), 0,
    "Upper limit in slots for pipe queue.");
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
    CTLFLAG_RW, DC(byte_limit), 0,
    "Upper limit in bytes for pipe queue.");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
    CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io.");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug,
    CTLFLAG_RW, DC(debug), 0, "Dummynet debug level");

/* RED parameters */
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
    CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
    CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
    CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size");

/* time adjustment (statistics maintained by dummynet_task()) */
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
    CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
    CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
    CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
    CTLFLAG_RD, &tick_diff, 0,
    "Adjusted vs non-adjusted curr_time difference (ticks).");
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
    CTLFLAG_RD, &tick_lost, 0,
    "Number of ticks coalesced by dummynet taskqueue.");

/* Drain parameters (consumed by do_drain()) */
SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire,
    CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes");
SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle,
    CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes");
SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_object,
    CTLFLAG_RW, DC(expire_object), 0, "Min # of objects before start drain routine");
SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, object_idle_tick,
    CTLFLAG_RD, DC(object_idle_tick), 0, "Time (in ticks) to cosiderer an object as idle");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, drain_ratio,
    CTLFLAG_RD, DC(drain_ratio), 0, "% of dummynet_task() to dedicate to drain routine");

/* statistics */
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count,
    CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count,
    CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count,
    CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count,
    CTLFLAG_RD, DC(queue_count), 0, "Number of queues");
SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
    CTLFLAG_RD, &io_pkt, 0,
    "Number of packets passed to dummynet.");
SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
    CTLFLAG_RD, &io_pkt_fast, 0,
    "Number of packets bypassed dummynet scheduler.");
SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
    CTLFLAG_RD, &io_pkt_drop, 0,
    "Number of packets dropped by dummynet.");
#undef DC
SYSEND

#endif

/* Forward declaration: used by dummynet_task() before its definition. */
static void	dummynet_send(struct mbuf *);

/*
 * Packets processed by dummynet have an mbuf tag associated with
 * them that carries their dummynet state. The structure below is the
 * payload stored immediately after the struct m_tag header
 * (see tag_mbuf() and dn_tag_get()).
 * Outside dummynet, only the 'rule' field is relevant, and it must
 * be at the beginning of the structure.
 */
struct dn_pkt_tag {
	struct ipfw_rule_ref rule;	/* matching rule	*/

	/* second part, dummynet specific */
	int dn_dir;		/* action when packet comes out.*/
				/* see ip_fw_private.h		*/
	uint64_t output_time;	/* when the pkt is due for delivery*/
	struct ifnet *ifp;	/* interface, for ip_output	*/
	struct _ip6dn_args ip6opt;	/* XXX ipv6 options	*/
};

/*
 * Locate the dummynet state of packet m.
 *
 * The state lives in the payload of the first mbuf tag on the packet;
 * with INVARIANTS-style checking enabled, KASSERT verifies that the
 * first tag really is a PACKET_TAG_DUMMYNET one.
 * Returns a pointer to the payload that follows the tag header.
 */
static struct dn_pkt_tag *
dn_tag_get(struct mbuf *m)
{
	struct m_tag *first = m_tag_first(m);

	KASSERT(first != NULL &&
	    first->m_tag_cookie == MTAG_ABI_COMPAT &&
	    first->m_tag_id == PACKET_TAG_DUMMYNET,
	    ("packet on dummynet queue w/o dummynet tag!"));
	/* the payload starts right after the struct m_tag header */
	return (struct dn_pkt_tag *)&first[1];
}

/*
 * Append packet m at the end of queue q (linked through m_nextpkt).
 * An empty queue is recognised by q->head == NULL; q->tail is only
 * meaningful when the queue is not empty.
 */
static inline void
mq_append(struct mq *q, struct mbuf *m)
{
	/* slot that must point to the new packet: list head or old tail's link */
	struct mbuf **link = (q->head != NULL) ? &q->tail->m_nextpkt : &q->head;

	*link = m;
	q->tail = m;
	m->m_nextpkt = NULL;
}

/*
 * Release every packet of a list linked through m_nextpkt.
 * Kept as a function so that any extra per-packet cleanup has a
 * single place to live.
 */
void dn_free_pkts(struct mbuf *mnext)
{
	struct mbuf *victim;

	while (mnext != NULL) {
		victim = mnext;
		mnext = victim->m_nextpkt;	/* read the link before freeing */
		FREE_PKT(victim);
	}
}

/*
 * Apply the RED (or Gentle RED) queue management policy to a packet of
 * 'len' bytes arriving on queue q, using the parameters stored in q->fs.
 * As a side effect updates q->avg, q->count and q->random.
 * Returns 1 if the packet must be dropped, 0 if it can be accepted.
 */
static int
red_drops (struct dn_queue *q, int len)
{
	/*
	 * RED algorithm
	 *
	 * RED calculates the average queue size (avg) using a low-pass filter
	 * with an exponential weighted (w_q) moving average:
	 * 	avg  <-  (1-w_q) * avg + w_q * q_size
	 * where q_size is the queue length (measured in bytes or packets).
	 *
	 * If q_size == 0, we compute the idle time for the link, and set
	 *	avg = (1 - w_q)^(idle/s)
	 * where s is the time needed for transmitting a medium-sized packet.
	 *
	 * Now, if avg < min_th the packet is enqueued.
	 * If avg > max_th the packet is dropped. Otherwise, the packet is
	 * dropped with probability P function of avg.
	 */

	struct dn_fsk *fs = q->fs;
	int64_t p_b = 0;	/* drop probability (scaled) */

	/* Queue in bytes or packets? */
	uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ?
	    q->ni.len_bytes : q->ni.length;

	/* Average queue size estimation. */
	if (q_size != 0) {
		/* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
		int diff = SCALE(q_size) - q->avg;
		int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);

		q->avg += (int)v;
	} else {
		/*
		 * Queue is empty, find for how long the queue has been
		 * empty and use a lookup table for computing
		 * (1 - w_q)^(idle_time/s) where s is the time to send a
		 * (small) packet.
		 * XXX check wraps...
		 */
		if (q->avg) {
			u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step);

			/* beyond the table the average is considered decayed to 0 */
			q->avg = (t < fs->lookup_depth) ?
			    SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
		}
	}

	/* Should i drop? */
	if (q->avg < fs->min_th) {
		q->count = -1;
		return (0);	/* accept packet */
	}
	if (q->avg >= fs->max_th) {	/* average queue >=  max threshold */
		if (fs->fs.flags & DN_IS_GENTLE_RED) {
			/*
			 * According to Gentle-RED, if avg is greater than
			 * max_th the packet is dropped with a probability
			 *	 p_b = c_3 * avg - c_4
			 * where c_3 = (1 - max_p) / max_th
			 *       c_4 = 1 - 2 * max_p
			 */
			p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
			    fs->c_4;
		} else {
			/* plain RED: always drop above max_th */
			q->count = -1;
			return (1);
		}
	} else if (q->avg > fs->min_th) {
		/*
		 * We compute p_b using the linear dropping function
		 *	 p_b = c_1 * avg - c_2
		 * where c_1 = max_p / (max_th - min_th)
		 * 	 c_2 = max_p * min_th / (max_th - min_th)
		 */
		p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
	}

	/* in byte mode, weight the probability by the packet size */
	if (fs->fs.flags & DN_QSIZE_BYTES)
		p_b = div64((p_b * len) , fs->max_pkt_size);
	if (++q->count == 0)
		/* first packet after a reset (count was -1): pick a threshold */
		q->random = random() & 0xffff;
	else {
		/*
		 * q->count counts packets arrived since last drop, so a greater
		 * value of q->count means a greater packet drop probability.
		 */
		if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
			q->count = 0;
			/* After a drop we calculate a new random value. */
			q->random = random() & 0xffff;
			return (1);	/* drop */
		}
	}
	/* End of RED algorithm. */

	return (0);	/* accept */

}

/*
 * Try to store packet m in queue q, honouring the space limits and the
 * queue management policy configured in q->fs (random loss, RED, size).
 * A non-zero 'drop' forces the packet to be discarded.
 * Arrival statistics of both the queue and its scheduler instance are
 * updated in any case.
 * Returns 0 if the packet was queued, 1 if it was dropped; either way
 * the mbuf is consumed.
 */
int
dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
{
	struct dn_fs *cfg;
	struct dn_flow *si_stats;	/* counters of the scheduler instance */
	uint64_t pktlen;

	if (q->fs == NULL || q->_si == NULL) {
		printf("%s fs %p si %p, dropping\n",
			__FUNCTION__, q->fs, q->_si);
		FREE_PKT(m);
		return 1;
	}
	cfg = &(q->fs->fs);
	si_stats = &q->_si->ni;
	pktlen = m->m_pkthdr.len;

	/* Arrivals are accounted for even when the packet is then dropped. */
	q->ni.tot_pkts++;
	q->ni.tot_bytes += pktlen;
	si_stats->tot_pkts++;
	si_stats->tot_bytes += pktlen;

	/* Walk the drop reasons in order, stopping at the first that applies. */
	if (!drop) {
		if (cfg->plr && random() < cfg->plr)
			drop = 1;	/* random packet loss */
		else if ((cfg->flags & DN_IS_RED) &&
		    red_drops(q, m->m_pkthdr.len))
			drop = 1;	/* RED says no */
		else if (cfg->flags & DN_QSIZE_BYTES)
			drop = (q->ni.len_bytes > cfg->qsize);
		else
			drop = (q->ni.length >= cfg->qsize);
	}

	if (drop) {
		io_pkt_drop++;
		q->ni.drops++;
		si_stats->drops++;
		FREE_PKT(m);
		return 1;
	}

	mq_append(&q->mq, m);
	if (q->ni.length == 0) {
		/* the queue leaves the idle set ... */
		dn_cfg.idle_queue--;
		/* ... and so does the scheduler instance, if it was idle too */
		if (si_stats->length == 0)
			dn_cfg.idle_si--;
	}
	q->ni.length++;
	q->ni.len_bytes += pktlen;
	si_stats->length++;
	si_stats->len_bytes += pktlen;
	return 0;
}

/*
 * Move to q all packets of delay line 'dline' whose output_time is not
 * later than 'now'. If some packets are left, the delay line is put back
 * in the event heap keyed by the output time of its first packet, and
 * oid.subtype records whether the delay line is in the heap (1) or not (0).
 * Runs under scheduler lock.
 */
static void
transmit_event(struct mq *q, struct delay_line *dline, uint64_t now)
{
	struct mbuf *first;
	struct dn_pkt_tag *tag;

	dline->oid.subtype = 0; /* not in heap */
	for (;;) {
		first = dline->mq.head;
		if (first == NULL)
			return;		/* fully drained, nothing to reschedule */
		tag = dn_tag_get(first);
		if (!DN_KEY_LEQ(tag->output_time, now))
			break;		/* first packet not due yet */
		dline->mq.head = first->m_nextpkt;
		mq_append(q, first);
	}
	/* leftover packets: wake up again when the first one is due */
	dline->oid.subtype = 1; /* in heap */
	heap_insert(&dn_cfg.evheap, tag->output_time, dline);
}

/*
 * Pick a random sample from the scheduler's profile and convert the
 * additional MAC overhead/delay it describes into an equivalent number
 * of bits at the link bandwidth. The samples are in milliseconds, hence
 * the division by 1000.
 * When the chosen sample index is at or beyond the profile's loss_level,
 * the packet is also marked DIR_DROP.
 * Returns 0 when there is no profile or it has no samples.
 */
static uint64_t
extra_bits(struct mbuf *m, struct dn_schk *s)
{
	struct dn_profile *prof = s->profile;
	struct dn_pkt_tag *dt;
	uint64_t extra;
	int slot;

	if (prof == NULL || prof->samples_no == 0)
		return 0;

	slot = random() % prof->samples_no;
	extra = div64((uint64_t)prof->samples[slot] * s->link.bandwidth, 1000);
	if (slot < prof->loss_level)
		return extra;

	/* sample falls in the "lost" part of the profile */
	dt = dn_tag_get(m);
	if (dt)
		dt->dn_dir = DIR_DROP;
	return extra;
}

/*
 * Send traffic from a scheduler instance due by 'now'.
 *
 * Packets are dequeued from the scheduler while the instance has
 * non-negative credit, stamped with their output time and appended to
 * the instance's delay line. If the delay line was idle on entry and at
 * least one packet was dequeued, transmit_event() is run so that packets
 * already due are moved to the output queue.
 *
 * q	output queue where due packets are accumulated; may be NULL, in
 *	which case a local temporary queue is used.
 * si	the scheduler instance to serve.
 * now	current time, in ticks.
 *
 * Returns the head of the output queue (NULL if nothing is due).
 */
static struct mbuf *
serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now)
{
	struct mq def_q;
	struct dn_schk *s = si->sched;
	struct mbuf *m = NULL;
	int delay_line_idle = (si->dline.mq.head == NULL);
	int done, bw;

	if (q == NULL) {
		q = &def_q;
		q->head = NULL;
		q->tail = NULL;
	}

	bw = s->link.bandwidth;
	si->kflags &= ~DN_ACTIVE;

	/* accumulate the credit earned since the last run */
	if (bw > 0)
		si->credit += (now - si->sched_time) * bw;
	else
		si->credit = 0;
	si->sched_time = now;
	done = 0;
	while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) {
		uint64_t len_scaled;

		/*
		 * Some schedulers might want to wake up the scheduler later.
		 * To support this the scheduler returns an mbuf with len < 0;
		 * this results in a new wake up of the scheduler
		 * instance after -m->m_pkthdr.len ticks.
		 */
		if (m->m_pkthdr.len < 0) {
			si->kflags |= DN_ACTIVE;
			heap_insert(&dn_cfg.evheap, now - m->m_pkthdr.len, si);
			if (delay_line_idle && done)
				transmit_event(q, &si->dline, now);
			/*
			 * Hand back whatever transmit_event() made ready.
			 * Returning NULL here would lose those packets when
			 * the caller passed q == NULL and relies on the
			 * return value (the local queue goes out of scope).
			 */
			return q->head;
		}

		/* a regular mbuf received */
		done++;
		len_scaled = (bw == 0) ? 0 : hz *
			(m->m_pkthdr.len * 8 + extra_bits(m, s));
		si->credit -= len_scaled;
		/* Move packet in the delay line */
		dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay;
		mq_append(&si->dline.mq, m);
	}

	/*
	 * If credit >= 0 the instance is idle, mark time.
	 * Otherwise put back in the heap, and adjust the output
	 * time of the last inserted packet, m, which was too early.
	 */
	if (si->credit >= 0) {
		si->idle_time = now;
	} else {
		uint64_t t;
		KASSERT (bw > 0, ("bw=0 and credit<0 ?"));
		/* ticks needed to bring the credit back to zero, rounded up */
		t = div64(bw - 1 - si->credit, bw);
		if (m)
			dn_tag_get(m)->output_time += t;
		si->kflags |= DN_ACTIVE;
		heap_insert(&dn_cfg.evheap, now + t, si);
	}
	if (delay_line_idle && done)
		transmit_event(q, &si->dline, now);
	return q->head;
}

/*
 * Support function to read the TSC (or equivalent). We use this
 * high resolution timer to adapt the amount of work done for
 * expiring the clock.
 * Supports Linux and FreeBSD on both i386 and amd64 platforms.
 * Supports OpenWRT on the mips architecture.
 * Only compiled when HAVE_TSC is defined.
 *
 * On SMP no special work is needed:
 * - In linux 2.6 timers always run on the same cpu that added them. See
 * (http://book.opensourceproject.org.cn/kernel/kernel3rd/opensource/0596005652/understandlk-chp-6-sect-5.html)
 * - FreeBSD8 has a new callout_reset_on() which specifies the cpu on which
 *   the timer must be run
 * - Windows runs dummynet_task() on cpu0.
 *
 * - Linux 2.4 does not guarantee that a timer runs on the same cpu every
 *   time (see the workaround in do_update_cycle()).
 *
 * Returns the current counter value; 0 on platforms matching none of
 * the branches below.
 */
#ifdef HAVE_TSC
uint64_t
readTSC (void)
{
	uint64_t a=0;

#ifdef __linux__
	/* Linux and openwrt have a macro to read the tsc for i386 and
	 * amd64.
	 * Openwrt has patched the kernel to allow use of the tsc on mips
	 * and other platforms.
	 * rdtscll() is a macro defined in include/asm-xxx/msr.h,
	 * where xxx is the architecture (x86, mips).
	 */
	rdtscll(a);
#elif defined(_WIN32)
	/* Microsoft recommends the use of KeQueryPerformanceCounter()
	 * instead of rdtsc().
	 */
	KeQueryPerformanceCounter((PLARGE_INTEGER)&a);  //XXX not tested!
#elif defined(__FreeBSD__)
	/* FreeBSD (i386/amd64) has macro rdtsc() defined in machine/cpufunc.h.
	 * We could use the macro instead of explicit assembly XXX
	 */
	return rdtsc();
#endif
	return a;
}
#endif /* HAVE_TSC */

/*
 * Compute the average task period, measured in TSC units.
 * Records the TSC value at which this run of the periodic task started
 * (cycle_task_new), the distance from the previous run (cycle_task) and
 * a moving average of that distance (cycle_task_avg).
 * Does nothing when HAVE_TSC is not defined.
 * We could do something more complex, possibly.
 */
static void
do_update_cycle(void)
{
#ifdef HAVE_TSC
	uint64_t tmp = readTSC();
#if defined (LINUX_24) && defined(CONFIG_SMP)
	/* on LINUX24 and SMP, we have no guarantees on which cpu runs
	 * the timer callbacks. If the difference between new and
	 * old value is negative, we assume that the values come from
	 * different cpus so we adjust 'new' accordingly.
	 */
	if (tmp <= dn_cfg.cycle_task_new)
		dn_cfg.cycle_task_new = tmp - dn_cfg.cycle_task;
#endif /* !(linux24 && SMP) */
	dn_cfg.cycle_task_old = dn_cfg.cycle_task_new;
	dn_cfg.cycle_task_new = tmp;
	dn_cfg.cycle_task = dn_cfg.cycle_task_new - dn_cfg.cycle_task_old;

	/* Update the average
	 * avg = (2^N * avg + new - avg) / 2^N
	 * N==4 seems to be a good compromise between tracking clock changes
	 *      and filtering 'spurious' cycle_task values
	 */
#define DN_N	4
	dn_cfg.cycle_task_avg = (dn_cfg.cycle_task_avg << DN_N) +
				dn_cfg.cycle_task - dn_cfg.cycle_task_avg;
	dn_cfg.cycle_task_avg = dn_cfg.cycle_task_avg >> DN_N;
#undef DN_N

#endif /* HAVE_TSC */
}

/*
 * Periodically reclaim idle queues and scheduler instances.
 *
 * Called at every run of dummynet_task(); the actual work is done once
 * every dn_cfg.expire calls (never if dn_cfg.expire is 0). A drain pass
 * is run for queues and/or scheduler instances when the number of idle
 * objects exceeds dn_cfg.expire_object and not all of them are already
 * known to be "idle but not expired yet" (the *_wait counters).
 *
 * With HAVE_TSC the passes are repeated until nothing is left to do or
 * the time spent since the start of the task exceeds drain_ratio percent
 * of the average task period; without it a single pass is done.
 */
static void
do_drain(void)
{
#ifdef HAVE_TSC
	uint64_t dt_max;	/* time budget, in TSC units * 100 */
#endif
	if (!dn_cfg.expire || ++dn_cfg.expire_cycle < dn_cfg.expire)
		return;
	/* It's time to check if drain routines should be called */
	dn_cfg.expire_cycle = 0;

	dn_cfg.idle_queue_wait = 0;
	dn_cfg.idle_si_wait = 0;
	/* Do a drain cycle even if there isn't time to do it */
#ifdef HAVE_TSC
	dt_max = dn_cfg.cycle_task_avg * dn_cfg.drain_ratio;
#endif
	for (;;) {
		int done = 0;	/* set when a drain routine ran in this pass */

		if (dn_cfg.idle_queue > dn_cfg.expire_object &&
		    dn_cfg.idle_queue_wait < dn_cfg.idle_queue) {
			dn_drain_queue();
			done = 1;
		}
		if (dn_cfg.idle_si > dn_cfg.expire_object &&
		    dn_cfg.idle_si_wait < dn_cfg.idle_si) {
			dn_drain_scheduler();
			done = 1;
		}
		/* time to end ? */
#ifndef HAVE_TSC
		/* If tsc does not exist, do only one drain cycle and exit */
		break;
#else
		/* Exit when nothing was done or we have consumed all time */
		if ( (done == 0) || 
		     ((readTSC() -  dn_cfg.cycle_task_new) * 100 > dt_max) )
			break;
#endif	/* HAVE_TSC */
	}
}

/*
 * The timer handler for dummynet. Time is computed in ticks, but
 * the code is tolerant to the actual rate at which this is called.
 * Once complete, the function reschedules itself for the next tick.
 *
 * context	cast to a struct vnet * and passed to CURVNET_SET().
 * pending	number of times the task was scheduled since the last
 *		run; (pending - 1) is accounted as lost ticks.
 *
 * Expired events are served while holding the dummynet lock; the
 * resulting packets are accumulated in a local queue and sent out
 * only after the lock has been released.
 */
void
dummynet_task(void *context, int pending)
{
	struct timeval t;
	struct mq q = { NULL, NULL }; /* queue to accumulate results */

	CURVNET_SET((struct vnet *)context);

	do_update_cycle();      /* compute avg. tick duration */

	DN_BH_WLOCK();

	/* Update number of lost(coalesced) ticks. */
	tick_lost += pending - 1;

	getmicrouptime(&t);
	/* Last tick duration (usec). */
	tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 +
	(t.tv_usec - dn_cfg.prev_t.tv_usec);
	/* Last tick vs standard tick difference (usec). */
	tick_delta = (tick_last * hz - 1000000) / hz;
	/* Accumulated tick difference (usec). */
	tick_delta_sum += tick_delta;

	dn_cfg.prev_t = t;

	/*
	 * Adjust curr_time if the accumulated tick difference is
	 * greater than the 'standard' tick. Since curr_time should
	 * be monotonically increasing, we do positive adjustments
	 * as required, and throttle curr_time in case of negative
	 * adjustment.
	 * NOTE(review): 'tick' is not defined in this file; presumably the
	 * kernel's tick length in usec -- confirm.
	 */
	dn_cfg.curr_time++;
	if (tick_delta_sum - tick >= 0) {
		/* we are late by one or more full ticks: jump ahead */
		int diff = tick_delta_sum / tick;

		dn_cfg.curr_time += diff;
		tick_diff += diff;
		tick_delta_sum %= tick;
		tick_adjustment++;
	} else if (tick_delta_sum + tick <= 0) {
		/* we are early by a full tick: undo the increment above */
		dn_cfg.curr_time--;
		tick_diff--;
		tick_delta_sum += tick;
		tick_adjustment++;
	}

	/* serve pending events, accumulate in q */
	for (;;) {
		struct dn_id *p;    /* generic parameter to handler */

		/* stop when the heap is empty or its top is in the future */
		if (dn_cfg.evheap.elements == 0 ||
		    DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key))
			break;
		p = HEAP_TOP(&dn_cfg.evheap)->object;
		heap_extract(&dn_cfg.evheap, NULL);

		if (p->type == DN_SCH_I) {
			serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time);
		} else { /* extracted a delay line */
			transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time);
		}
	}
	do_drain();

	DN_BH_WUNLOCK();
	dn_reschedule();
	/* deliver outside the lock */
	if (q.head != NULL)
		dummynet_send(q.head);
	CURVNET_RESTORE();
}

/*
 * Forward a chain of packets (linked through m_nextpkt) to the proper
 * destination, as recorded in the dn_dir field of each packet's
 * dummynet tag. Before the packet is handed over, the tag is renamed
 * (cookie MTAG_IPFW_RULE, id 0) so that it carries the reinject info.
 * Packets without a tag, marked DIR_DROP, or with an unknown direction
 * are freed.
 * This runs outside the dummynet lock.
 */
static void
dummynet_send(struct mbuf *m)
{
	struct mbuf *n;

	for (; m != NULL; m = n) {
		struct ifnet *ifp = NULL;	/* gcc 3.4.6 complains */
		struct m_tag *tag;
		int dst;

		/* detach the packet from the chain before passing it on */
		n = m->m_nextpkt;
		m->m_nextpkt = NULL;
		tag = m_tag_first(m);
		if (tag == NULL) { /* should not happen */
			dst = DIR_DROP;
		} else {
			struct dn_pkt_tag *pkt = dn_tag_get(m);
			/* extract the dummynet info, rename the tag
			 * to carry reinject info.
			 */
			dst = pkt->dn_dir;
			ifp = pkt->ifp;
			tag->m_tag_cookie = MTAG_IPFW_RULE;
			tag->m_tag_id = 0;
		}

		switch (dst) {
		case DIR_OUT:
			SET_HOST_IPLEN(mtod(m, struct ip *));
			ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
			break ;

		case DIR_IN :
			/* put header in network format for ip_input() */
			//SET_NET_IPLEN(mtod(m, struct ip *));
			netisr_dispatch(NETISR_IP, m);
			break;

#ifdef INET6
		case DIR_IN | PROTO_IPV6:
			netisr_dispatch(NETISR_IPV6, m);
			break;

		case DIR_OUT | PROTO_IPV6:
			/*
			 * No SET_HOST_IPLEN() here: it operates on the
			 * ip_len/ip_off fields of an IPv4 header, which an
			 * IPv6 packet does not have; applying it would
			 * modify unrelated bytes of the IPv6 header.
			 */
			ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
			break;
#endif

		case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */
			if (bridge_dn_p != NULL)
				((*bridge_dn_p)(m, ifp));
			else
				printf("dummynet: if_bridge not loaded\n");

			break;

		case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */
			/*
			 * The Ethernet code assumes the Ethernet header is
			 * contiguous in the first mbuf header.
			 * Ensure this is true.
			 */
			if (m->m_len < ETHER_HDR_LEN &&
			    (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
				printf("dummynet/ether: pullup failed, "
				    "dropping packet\n");
				break;
			}
			ether_demux(m->m_pkthdr.rcvif, m);
			break;

		case DIR_OUT | PROTO_LAYER2: /* DN_TO_ETH_OUT: */
			ether_output_frame(ifp, m);
			break;

		case DIR_DROP:
			/* drop the packet after some time */
			FREE_PKT(m);
			break;

		default:
			printf("dummynet: bad switch %d!\n", dst);
			FREE_PKT(m);
			break;
		}
	}
}

/*
 * Attach a zero-filled dummynet tag at the head of m's tag list and
 * fill it with the matching rule (keeping only the IPFW_ONEPASS bit of
 * rule.info), the direction 'dir', the output interface and the
 * current time as initial output time.
 * Returns 0 on success, 1 if the tag cannot be allocated.
 */
static inline int
tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa)
{
	struct m_tag *hdr;
	struct dn_pkt_tag *state;

	hdr = m_tag_get(PACKET_TAG_DUMMYNET,
		    sizeof(struct dn_pkt_tag), M_NOWAIT | M_ZERO);
	if (hdr == NULL)
		return 1;		/* Cannot allocate packet header. */
	m_tag_prepend(m, hdr);		/* Attach to mbuf chain. */

	/* the dummynet state follows the tag header */
	state = (struct dn_pkt_tag *)&hdr[1];
	state->rule = fwa->rule;
	state->rule.info &= IPFW_ONEPASS;	/* only keep this info */
	state->ifp = fwa->oif;
	state->dn_dir = dir;
	/* output_time is updated as the packet moves through dummynet */
	state->output_time = dn_cfg.curr_time;
	return 0;
}


/*
 * dummynet hook for packets.
 * We use the rule info in fwa to locate the flowset fs and the scheduler
 * associated to it. Then we use the flow id to determine the queue and
 * scheduler instances.
 *
 * m0		pointer to the mbuf with the packet. On return *m0 is
 *		NULL when the packet has been consumed (queued, sent or
 *		dropped), and left untouched when the packet can be sent
 *		immediately by the caller (fast path).
 * dir		where shall we send the packet after dummynet.
 * fwa		ipfw arguments: the matching rule (fwa->rule), the flow
 *		id (fwa->f_id) and the output interface (fwa->oif).
 *
 * Returns 0 on success. When the packet is dropped, returns ENOBUFS,
 * or 0 if the flowset has the DN_NOERROR flag set.
 */
int
dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
{
	struct mbuf *m = *m0;
	struct dn_fsk *fs = NULL;
	struct dn_sch_inst *si;
	struct dn_queue *q = NULL;	/* default */

	/* pipes are stored in the flowset hash at an offset of 2*DN_MAX_ID */
	int fs_id = (fwa->rule.info & IPFW_INFO_MASK) +
		((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0);
	DN_BH_WLOCK();
	io_pkt++;
	/* we could actually tag outside the lock, but who cares... */
	if (tag_mbuf(m, dir, fwa))
		goto dropit;
	if (dn_cfg.busy) {
		/* if the upper half is busy doing something expensive,
		 * lets queue the packet and move forward
		 */
		mq_append(&dn_cfg.pending, m);
		m = *m0 = NULL; /* consumed */
		goto done; /* already active, nothing to do */
	}
	/* XXX locate_flowset could be optimised with a direct ref. */
	fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL);
	if (fs == NULL)
		goto dropit;	/* This queue/pipe does not exist! */
	if (fs->sched == NULL)	/* should not happen */
		goto dropit;
	/*
	 * If the scheduler supports multiple queues, find the right one
	 * (otherwise it will be ignored by enqueue).
	 */
	if (fs->sched->fp->flags & DN_MULTIQUEUE) {
		q = ipdn_q_find(fs, &(fwa->f_id));
		if (q == NULL)
			goto dropit;
		/* The scheduler instance lookup is done only for new queue.
		 * The callback q_new() will create the scheduler instance
		 * if needed.
		 */
		si = q->_si;
	} else
		si = ipdn_si_find(fs->sched, &(fwa->f_id));

	if (si == NULL)
		goto dropit;
	if (fs->sched->fp->enqueue(si, q, m)) {
		/* packet was dropped by enqueue(), which also consumed it */
		m = *m0 = NULL;
		goto dropit;
	}

	if (si->kflags & DN_ACTIVE) {
		m = *m0 = NULL; /* consumed */
		goto done; /* already active, nothing to do */
	}

	/* compute the initial allowance */
	if (si->idle_time < dn_cfg.curr_time) {
	    /* Do this only on the first packet on an idle pipe */
	    struct dn_link *p = &fs->sched->link;

	    si->sched_time = dn_cfg.curr_time;
	    si->credit = dn_cfg.io_fast ? p->bandwidth : 0;
	    if (p->burst) {
		/* credit accumulated while idle, capped at the burst size */
		uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth;
		if (burst > p->burst)
			burst = p->burst;
		si->credit += burst;
	    }
	}
	/* pass through scheduler and delay line */
	m = serve_sched(NULL, si, dn_cfg.curr_time);

	/* optimization -- pass it back to ipfw for immediate send */
	/* XXX Don't call dummynet_send() if the scheduler returns the packet
	 *     just enqueued. This avoids a lock order reversal.
	 */
	if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) {
		/* fast io, rename the tag to carry reinject info. */
		struct m_tag *tag = m_tag_first(m);

		tag->m_tag_cookie = MTAG_IPFW_RULE;
		tag->m_tag_id = 0;
		io_pkt_fast++;
		if (m->m_nextpkt != NULL) {
			printf("dummynet: fast io: pkt chain detected!\n");
			m->m_nextpkt = NULL;
		}
		m = NULL;	/* *m0 still points to it: the caller sends it */
	} else {
		*m0 = NULL;
	}
done:
	DN_BH_WUNLOCK();
	/* anything returned by the scheduler is sent outside the lock */
	if (m)
		dummynet_send(m);
	return 0;

dropit:
	io_pkt_drop++;
	DN_BH_WUNLOCK();
	if (m)
		FREE_PKT(m);
	*m0 = NULL;
	return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS;
}


================================================
FILE: sys/netinet/ipfw/ip_dn_private.h
================================================
/*-
 * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * internal dummynet APIs.
 *
 * $FreeBSD: head/sys/netinet/ipfw/ip_dn_private.h 204591 2010-03-02 17:40:48Z luigi $
 */

#ifndef _IP_DN_PRIVATE_H
#define _IP_DN_PRIVATE_H

/* debugging support
 * use ND() to remove debugging, D() to print a line,
 * DX(level, ...) to print only when dn_cfg.debug is above 'level'.
 * D() prefixes the message with the name of the calling function and
 * appends a newline. D1() expands to nothing, like ND().
 * If you redefine D() you are expected to redefine all.
 */
#ifndef D
#define ND(fmt, ...) do {} while (0)
#define D1(fmt, ...) do {} while (0)
#define D(fmt, ...) printf("%-10s " fmt "\n",      \
        __FUNCTION__, ## __VA_ARGS__)
#define DX(lev, fmt, ...) do {              \
        if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0)
#endif

MALLOC_DECLARE(M_DUMMYNET);

/*
 * 64-bit division helper. Outside Linux it is a plain signed 64-bit
 * division; on Linux div64() is expected to be provided elsewhere.
 */
#ifndef __linux__
#define div64(a, b)  ((int64_t)(a) / (int64_t)(b))
#endif

/*
 * Locking macros. Two mutexes exist in dn_cfg (uh_mtx and bh_mtx) and
 * both are initialized/destroyed here.
 * NOTE: the DN_BH_* macros below currently operate on uh_mtx, not on
 * bh_mtx, and the DN_UH_* macros are compiled out, so none of the
 * macros in this block ever takes bh_mtx.
 */
#define DN_LOCK_INIT() do {				\
	mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF);	\
	mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF);	\
	} while (0)
#define DN_LOCK_DESTROY() do {				\
	mtx_destroy(&dn_cfg.uh_mtx);			\
	mtx_destroy(&dn_cfg.bh_mtx);			\
	} while (0)
#if 0 /* not used yet */
#define DN_UH_RLOCK()		mtx_lock(&dn_cfg.uh_mtx)
#define DN_UH_RUNLOCK()		mtx_unlock(&dn_cfg.uh_mtx)
#define DN_UH_WLOCK()		mtx_lock(&dn_cfg.uh_mtx)
#define DN_UH_WUNLOCK()		mtx_unlock(&dn_cfg.uh_mtx)
#define DN_UH_LOCK_ASSERT()	mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)
#endif

/* read and write variants are identical: a plain mutex is used */
#define DN_BH_RLOCK()		mtx_lock(&dn_cfg.uh_mtx)
#define DN_BH_RUNLOCK()		mtx_unlock(&dn_cfg.uh_mtx)
#define DN_BH_WLOCK()		mtx_lock(&dn_cfg.uh_mtx)
#define DN_BH_WUNLOCK()		mtx_unlock(&dn_cfg.uh_mtx)
#define DN_BH_LOCK_ASSERT()	mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)

/* Singly-linked list head types for the main dummynet objects. */
SLIST_HEAD(dn_schk_head, dn_schk);
SLIST_HEAD(dn_sch_inst_head, dn_sch_inst);
SLIST_HEAD(dn_fsk_head, dn_fsk);
SLIST_HEAD(dn_queue_head, dn_queue);
SLIST_HEAD(dn_alg_head, dn_alg);

/*
 * A basic queue of packets, linked through m_nextpkt.
 * The queue is empty when head is NULL; tail is meaningful only when
 * the queue is not empty.
 */
struct mq {	/* a basic queue of packets*/
        struct mbuf *head, *tail;
};

/*
 * Initialize the object header 'o' with the given type and length;
 * the subtype is reset to 0.
 */
static inline void
set_oid(struct dn_id *o, int type, int len)
{
        o->type = type;
        o->len = len;
        o->subtype = 0;
}

/* Read the high resolution counter; defined only when HAVE_TSC is set. */
uint64_t readTSC (void);
/*
 * see if tsc (or other timer) is supported.
 * - FreeBSD has rdtsc macro for i386 and amd64
 * - Linux has rdtscll and/or rdtsc (also for openWRT patched kernel source)
 * - Windows has KeQueryPerformanceCounter() function that uses tsc or
 *   another timer
 */
#if defined(rdtscll) || defined(rdtsc) || defined(_WIN32)
#define HAVE_TSC
#endif
/*
 * configuration and global data for a dummynet instance
 *
 * When a configuration is modified from userland, 'id' is incremented
 * so we can use the value to check for stale pointers.
 */
struct dn_parms {
	uint32_t	id;		/* configuration version */

	/* defaults (sysctl-accessible) */
	int	red_lookup_depth;
	int	red_avg_pkt_size;
	int	red_max_pkt_size;
	int	hash_size;
	int	max_hash_size;
	long	byte_limit;		/* max queue sizes */
	long	slot_limit;

	int	io_fast;
	int	debug;

	/* timekeeping */
	struct timeval prev_t;		/* last time dummynet_task ran */
	struct dn_heap	evheap;		/* scheduled events */

	/* counters of objects -- used for reporting space */
	int	schk_count;
	int	si_count;
	int	fsk_count;
	int	queue_count;

	/* ticks and other stuff */
	uint64_t	curr_time;	/* in ticks */

	/*
	 * Variables to manage the time spent in the drain routines.
	 * drain_ratio is the max percentage (0..100) of the average task
	 * period that may be used for draining.
	 * We also need some variables to store the average number of
	 * timecounter ticks between calls to the periodic task, etc.
	 */
	int drain_ratio;
	uint64_t cycle_task_new;	/* TSC when dummynet_task() starts */
	uint64_t cycle_task_old;	/* TSC when prev. dummynet_task() starts */
	uint64_t cycle_task;		/* cycle_task_new - cycle_task_old */
	uint64_t cycle_task_avg;	/* Moving average of cycle_task */

	/* flowsets and schedulers are in hash tables, with 'hash_size'
	 * buckets. fshash is looked up at every packet arrival
	 * so better be generous if we expect many entries.
	 */
	struct dn_ht	*fshash;
	struct dn_ht	*schedhash;
	/* list of flowsets without a scheduler -- use sch_chain */
	struct dn_fsk_head	fsu;	/* list of unlinked flowsets */
	struct dn_alg_head	schedlist;	/* list of algorithms */

	/* Counter of idle objects -- used by drain routine
	 * We scan when idle_queue (or idle_si) > expire_object.
	 * The drain routine is called every 'expire' cycles (the counter
	 * used is expire_cycle).
	 * We can disable the expire routine by setting expire to 0.
	 * An object is kept alive for at least object_idle_tick after it
	 * becomes idle. During the scan, we count the number of objects
	 * that are idle but not ready in 'idle_si_wait' and 'idle_queue_wait'
	 */
	int	idle_queue;
	int	idle_queue_wait;		/* idle but not expired yet */
	int	idle_si;
	int	idle_si_wait;			/* idle but not expired yet */
	uint32_t expire_object;			/* threshold for expires */
	uint32_t expire;			/* how often to expire */
	uint32_t expire_cycle;
	uint32_t object_idle_tick; 		/* lifetime of objs */
	uint32_t expire_object_examined;	/* Burst of object examined */

	/* drain_fs and drain_sch point to the next bucket to scan when
	 * draining.
	 */
	uint32_t drain_fs;
	uint32_t drain_sch;

	int init_done;

	/* if the upper half is busy doing something long,
	 * can set the busy flag and we will enqueue packets in
	 * a queue for later processing.
	 */
	int	busy;
	struct	mq	pending;

#ifdef _KERNEL
	/*
	 * This file is normally used in the kernel, unless we do
	 * some userland tests, in which case we do not need a mtx.
	 * uh_mtx arbitrates between system calls and also
	 * protects fshash, schedhash and fsunlinked.
	 * These structures are readonly for the lower half.
	 * bh_mtx protects all other structures which may be
	 * modified upon packet arrivals
	 */
#if defined( __linux__ ) || defined( _WIN32 )
	spinlock_t uh_mtx;
	spinlock_t bh_mtx;
#else
	struct mtx uh_mtx;
	struct mtx bh_mtx;
#endif

#endif /* _KERNEL */
};

/*
 * Delay line, contains all packets on output from a link.
 * Every scheduler instance has one.
 */
struct delay_line {
	struct dn_id oid;	/* object header; si_new() sets its type to DN_DELAY_LINE */
	struct dn_sch_inst *si;	/* back pointer to the owning scheduler instance */
	struct mq mq;		/* packets waiting in the delay line */
};

/*
 * The kernel side of a flowset. It is linked in a hash table
 * of flowsets, and in a list of children of their parent scheduler.
 * qht is either the queue or (if HAVE_MASK) a hash table queues.
 * Note that the mask to use is the (flow_mask|sched_mask), which
 * changes as we attach/detach schedulers. So we store it here.
 *
 * XXX If we want to add scheduler-specific parameters, we need to
 * put them in external storage because the scheduler may not be
 * available when the fsk is created.
 */
struct dn_fsk { /* kernel side of a flowset */
	struct dn_fs fs;		/* flowset configuration (fs_nr, flags, buckets, ...) */
	SLIST_ENTRY(dn_fsk) fsk_next;	/* hash chain for fshash */

	struct ipfw_flow_id fsk_mask;	/* mask applied to a flow id before the qht lookup */

	/* qht is a hash table of queues, or just a single queue
	 * a bit in fs.flags (DN_QHT_HASH) tells us which one
	 */
	struct dn_ht	*qht;
	struct dn_schk *sched;		/* Sched we are linked to */
	SLIST_ENTRY(dn_fsk) sch_chain;	/* list of fsk attached to sched */

	/* bucket index used by drain routine to drain queues for this
	 * flowset
	 */
	int drain_bucket;
	/* Parameters related to RED / GRED */
	/* original values are in dn_fs*/
	int w_q ;		/* queue weight (scaled) */
	int max_th ;		/* maximum threshold for queue (scaled) */
	int min_th ;		/* minimum threshold for queue (scaled) */
	int max_p ;		/* maximum value for p_b (scaled) */

	u_int c_1 ;		/* max_p/(max_th-min_th) (scaled) */
	u_int c_2 ;		/* max_p*min_th/(max_th-min_th) (scaled) */
	u_int c_3 ;		/* for GRED, (1-max_p)/max_th (scaled) */
	u_int c_4 ;		/* for GRED, 1 - 2*max_p (scaled) */
	u_int * w_q_lookup ;	/* lookup table for computing (1-w_q)^t;
				 * freed by fsk_detach() */
	u_int lookup_depth ;	/* depth of lookup table */
	int lookup_step ;	/* granularity inside the lookup table */
	int lookup_weight ;	/* equal to (1-w_q)^t / (1-w_q)^(t+1) */
	int avg_pkt_size ;	/* medium packet size */
	int max_pkt_size ;	/* max packet size */
};

/*
 * A queue is created as a child of a flowset unless it belongs to
 * a !MULTIQUEUE scheduler. It is normally in a hash table in the
 * flowset. fs always points to the parent flowset.
 * si normally points to the sch_inst, unless the flowset has been
 * detached from the scheduler -- in this case si == NULL and we
 * should not enqueue.
 */
struct dn_queue {
	struct dn_flow ni;	/* oid, flow_id, stats */
	struct mq mq;	/* packets queue */
	struct dn_sch_inst *_si;	/* owner scheduler instance; set to NULL
					 * by dn_delete_queue() */
	SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */
	struct dn_fsk *fs;		/* parent flowset. */

	/* RED parameters */
	int avg;		/* average queue length est. (scaled) */
	int count;		/* arrivals since last RED drop */
	int random;		/* random value (scaled) */
	uint64_t q_time;	/* start of queue idle time */

};

/*
 * The kernel side of a scheduler. Contains the userland config,
 * a link, pointer to extra config arguments from command line,
 * kernel flags, and a pointer to the scheduler methods.
 * It is stored in a hash table, and holds a list of all
 * flowsets and scheduler instances.
 * XXX sch must be at the beginning, see schk_hash().
 */
struct dn_schk {
	struct dn_sch sch;	/* userland config; must stay first because
				 * schk_hash()/schk_match() cast the key */
	struct dn_alg *fp;	/* Pointer to scheduler functions; left NULL
				 * by schk_new() to mark a new scheduler */
	struct dn_link link;	/* The link, embedded */
	struct dn_profile *profile; /* delay profile, if any */
	struct dn_id *cfg;	/* extra config arguments */

	SLIST_ENTRY(dn_schk) schk_next;  /* hash chain for schedhash */

	struct dn_fsk_head fsk_list;  /* all fsk linked to me */
	struct dn_fsk *fs;	/* Flowset for !MULTIQUEUE */

	/* bucket index used by the drain routine to drain the scheduler
	 * instance for this flowset.
	 */
	int drain_bucket;

	/* Hash table of all instances (through sch.sched_mask)
	 * or single instance if no mask. Always valid.
	 * Which of the two it is depends on DN_HAVE_MASK in sch.flags.
	 */
	struct dn_ht	*siht;
};


/*
 * Scheduler instance.
 * Contains variables and all queues relative to this instance.
 * This struct is created at runtime.
 */
struct dn_sch_inst {
	struct dn_flow	ni;	/* oid, flowid and stats */
	SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */
	struct delay_line dline;	/* delay line embedded in the instance */
	struct dn_schk *sched;	/* the template */
	int		kflags;	/* DN_ACTIVE */

	int64_t	credit;		/* bits I can transmit (more or less). */
	uint64_t sched_time;	/* time link was scheduled in ready_heap */
	uint64_t idle_time;	/* start of scheduler instance idle time */

	/* q_count is the number of queues that this instance is using.
	 * The counter is incremented or decremented when
	 * a reference from the queue is created or deleted
	 * (see q_new() and dn_delete_queue()).
	 * It is used to make sure that a scheduler instance can be safely
	 * deleted by the drain routine.
	 */
	int q_count;

};


/* kernel-side flags. Linux has DN_DELETE in fcntl.h
 */
enum {
	/* 1 and 2 are reserved for the SCAN flags */
	DN_DESTROY	= 0x0004, /* destroy */
	DN_DELETE_FS	= 0x0008, /* destroy flowset */
	DN_DETACH	= 0x0010, /* remove the flowset from its fsk list */
	DN_ACTIVE	= 0x0020, /* object is in evheap */
	DN_F_DLINE	= 0x0040, /* object is a delay line */
	DN_DEL_SAFE	= 0x0080, /* delete a queue only if no longer needed
				   * by scheduler */
	DN_QHT_IS_Q	= 0x0100, /* in flowset, qht is a single queue */
};

/* Global dummynet state (counters, hash tables, event heap, ...). */
extern struct dn_parms dn_cfg;
//VNET_DECLARE(struct dn_parms, _base_dn_cfg);
//#define dn_cfg	VNET(_base_dn_cfg)

int dummynet_io(struct mbuf **, int , struct ip_fw_args *);
void dummynet_task(void *context, int pending);
/* Re-arm the dummynet callout with a timeout of 1. */
void dn_reschedule(void);

/* Find, creating it if missing, the queue of flowset fs for a flow id. */
struct dn_queue *ipdn_q_find(struct dn_fsk *, struct ipfw_flow_id *);
/* Find, creating it if missing, the instance of a scheduler for a flow id. */
struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *);

/*
 * copy_range is a template for requests for ranges of pipes/queues/scheds.
 * The number of ranges is variable and can be derived by o.len.
 * As a default, we use a small number of entries so that the struct
 * fits easily on the stack and is sufficient for most common requests.
 */
#define DEFAULT_RANGES	5	/* ranges that fit in the default struct */
struct copy_range {
        struct dn_id o;		/* header; o.len gives the total size in bytes */
        uint32_t	r[ 2 * DEFAULT_RANGES ];	/* two words per range
						 * (presumably low/high bounds
						 * -- TODO confirm) */
};

/*
 * Arguments shared by the copy_* helpers that serialize objects
 * into an output buffer.
 */
struct copy_args {
	char **start;	/* current write position, advanced after each copy */
	char *end;	/* first byte past the end of the output buffer */
	int flags;
	int type;	/* object type being requested (e.g. DN_LINK, DN_SCH) */
	struct copy_range *extra;	/* extra filtering */
};

struct sockopt;	/* forward declaration, only used through pointers here */
int ip_dummynet_compat(struct sockopt *sopt);
int dummynet_get(struct sockopt *sopt, void **compat);
int dn_c_copy_q (void *_ni, void *arg);
int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq);
int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq);
int dn_compat_copy_queue(struct copy_args *a, void *_o);
int dn_compat_copy_pipe(struct copy_args *a, void *_o);
int copy_data_helper_compat(void *_o, void *_arg);
int dn_compat_calc_size(void);
int do_config(void *p, int l);

/* functions to drain idle objects (scheduler instances and queues) */
void dn_drain_scheduler(void);
void dn_drain_queue(void);

#endif /* _IP_DN_PRIVATE_H */


================================================
FILE: sys/netinet/ipfw/ip_dummynet.c
================================================
/*-
 * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
 * Portions Copyright (c) 2000 Akamba Corp.
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_dummynet.c 203340 2010-02-01 12:06:37Z luigi $");

/*
 * Configuration and internal object management for dummynet.
 */

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/time.h>
#include <sys/taskqueue.h>
#include <net/if.h>	/* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
#include <netinet/in.h>
#include <netinet/ip_var.h>	/* ip_output(), IP_FORWARDING */
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>

#include <netinet/ipfw/ip_fw_private.h>
#include <netinet/ipfw/dn_heap.h>
#include <netinet/ipfw/ip_dn_private.h>
#include <netinet/ipfw/dn_sched.h>

/* which objects to copy; each value is a distinct bit so they can be OR'ed */
#define DN_C_LINK 	0x01
#define DN_C_SCH	0x02
#define DN_C_FLOW	0x04
#define DN_C_FS		0x08
#define DN_C_QUEUE	0x10

/* we use this argument in case of a schk_new */
struct schk_new_arg {
	struct dn_alg *fp;	/* algorithm descriptor; schk_new() reads schk_datalen from it */
	struct dn_sch *sch;	/* initial configuration, copied into the new scheduler */
};

/*---- callout hooks. ----*/
static struct callout dn_timeout;	/* re-armed by dn_reschedule() */
static struct task	dn_task;	/* task enqueued by dummynet() */
static struct taskqueue	*dn_tq = NULL;	/* taskqueue that runs dn_task */

/* dummynet and ipfw_tick can't be static in windows */
/*
 * Callout handler: defers the actual work by enqueueing dn_task
 * on the dn_tq taskqueue. The argument is ignored.
 */
void
dummynet(void * arg)
{

	(void)arg;	/* UNUSED */
	taskqueue_enqueue(dn_tq, &dn_task);
}

/*
 * Re-arm the dn_timeout callout with a timeout of 1, so that
 * dummynet() is invoked again and enqueues the task.
 */
void
dn_reschedule(void)
{
	callout_reset_on(&dn_timeout, 1, dummynet, NULL, 0);
}
/*----- end of callout hooks -----*/

/* Return a scheduler descriptor given the type or name. */
static struct dn_alg *
find_sched_type(int type, char *name)
{
	struct dn_alg *alg;

	/*
	 * Walk the list of registered algorithms; the first entry that
	 * matches either the numeric type or (case-insensitively) the
	 * name wins. A NULL name disables the name comparison.
	 */
	SLIST_FOREACH(alg, &dn_cfg.schedlist, next) {
		if (alg->type == type)
			return alg;
		if (name != NULL && strcasecmp(alg->name, name) == 0)
			return alg;
	}
	return NULL;	/* no such scheduler */
}

/*
 * Force *v into the range [lo, hi] and return the resulting value.
 * A value below lo is replaced with dflt (itself first forced into
 * [lo, hi]); a value above hi is replaced with hi. When the value is
 * changed and msg is not NULL, a diagnostic line is printed.
 */
int
ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
{
	const int before = *v;
	const char *action;

	/* already within bounds: nothing to adjust or report */
	if (before >= lo && before <= hi)
		return before;

	/* sanitize the default before using it */
	if (dflt < lo)
		dflt = lo;
	if (dflt > hi)
		dflt = hi;

	if (before < lo) {
		*v = dflt;
		action = "Bump";
	} else {
		*v = hi;
		action = "Clamp";
	}
	if (msg != NULL)
		printf("%s %s to %d (was %d)\n", action, msg, *v, before);
	return *v;
}

/*---- flow_id mask, hash and compare functions ---*/
/*
 * The flow_id includes the 5-tuple, the queue/pipe number
 * which we store in the extra area in host order,
 * and for ipv6 also the flow_id6.
 * XXX see if we want the tos byte (can store in 'flags')
 */
/* Apply 'mask' to 'id' in place (bitwise AND) and return 'id'. */
static struct ipfw_flow_id *
flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id)
{
	const int v6 = IS_IP6_FLOW_ID(id);

	/* address-family specific part */
	if (v6) {
		APPLY_MASK(&id->dst_ip6, &mask->dst_ip6);
		APPLY_MASK(&id->src_ip6, &mask->src_ip6);
		id->flow_id6 &= mask->flow_id6;
	} else {
		id->dst_ip &= mask->dst_ip;
		id->src_ip &= mask->src_ip;
	}

	/* fields shared by both address families */
	id->dst_port &= mask->dst_port;
	id->src_port &= mask->src_port;
	id->proto &= mask->proto;
	id->extra &= mask->extra;

	return id;
}

/* computes an OR of two masks, result in dst and also returned */
/*
 * OR the mask 'src' into 'dst' and return 'dst'.
 * The address family is taken from 'dst'.
 */
static struct ipfw_flow_id *
flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst)
{
	const int v6 = IS_IP6_FLOW_ID(dst);
	int w;

	/* fields shared by both address families */
	dst->dst_port |= src->dst_port;
	dst->src_port |= src->src_port;
	dst->proto |= src->proto;
	dst->extra |= src->extra;

	if (!v6) {
		dst->dst_ip |= src->dst_ip;
		dst->src_ip |= src->src_ip;
		return dst;
	}

	/* IPv6: OR the four 32-bit words of each address */
	for (w = 0; w < 4; w++)
		dst->dst_ip6.__u6_addr.__u6_addr32[w] |=
		    src->dst_ip6.__u6_addr.__u6_addr32[w];
	for (w = 0; w < 4; w++)
		dst->src_ip6.__u6_addr.__u6_addr32[w] |=
		    src->src_ip6.__u6_addr.__u6_addr32[w];
	dst->flow_id6 |= src->flow_id6;
	return dst;
}

/* Return 1 if any field of the mask is non-zero, 0 otherwise. */
static int
nonzero_mask(struct ipfw_flow_id *m)
{
	int w;

	if (m->dst_port || m->src_port || m->proto || m->extra)
		return 1;

	if (!IS_IP6_FLOW_ID(m))
		return (m->dst_ip || m->src_ip);

	/* IPv6: look at every 32-bit word of both addresses */
	for (w = 0; w < 4; w++) {
		if (m->dst_ip6.__u6_addr.__u6_addr32[w] != 0)
			return 1;
	}
	for (w = 0; w < 4; w++) {
		if (m->src_ip6.__u6_addr.__u6_addr32[w] != 0)
			return 1;
	}
	return (m->flow_id6 != 0);
}

/* XXX we may want a better hash function */
/*
 * Fold a flow id into a 32-bit hash value by XOR-ing shifted copies
 * of its fields. The IPv6 variant mixes every 32-bit word of both
 * addresses plus the flow label.
 */
static uint32_t
flow_id_hash(struct ipfw_flow_id *id)
{
	uint32_t h = 0;

	if (IS_IP6_FLOW_ID(id)) {
		uint32_t *d = (uint32_t *)&id->dst_ip6;
		uint32_t *s = (uint32_t *)&id->src_ip6;
		int w;

		for (w = 0; w < 4; w++) {
			h ^= d[w];
			h ^= d[w] >> 15;
			h ^= s[w] << 1;
			h ^= s[w] << 16;
		}
		h ^= (id->dst_port << 1);
		h ^= (id->src_port);
		h ^= (id->extra);
		h ^= (id->proto);
		h ^= (id->flow_id6);
	} else {
		h ^= (id->dst_ip);
		h ^= (id->dst_ip >> 15);
		h ^= (id->src_ip << 1);
		h ^= (id->src_ip >> 16);
		h ^= (id->extra);
		h ^= (id->dst_port << 1);
		h ^= (id->src_port);
		h ^= (id->proto);
	}
	return h;
}

/* Like bcmp, returns 0 if ids match, 1 otherwise. */
/*
 * Compare two flow ids; like bcmp() the result is 0 when they are
 * equal and 1 when they differ. The address family of id1 selects
 * which address fields are compared.
 */
static int
flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2)
{
	if (IS_IP6_FLOW_ID(id1)) {
		/* the ipv6 case */
		if (bcmp(&id1->dst_ip6, &id2->dst_ip6,
		    sizeof(id1->dst_ip6)) != 0)
			return 1;
		if (bcmp(&id1->src_ip6, &id2->src_ip6,
		    sizeof(id1->src_ip6)) != 0)
			return 1;
		if (id1->flow_id6 != id2->flow_id6)
			return 1;
	} else {
		if (IS_IP6_FLOW_ID(id2))
			return 1; /* different address families */
		if (id1->dst_ip != id2->dst_ip ||
		    id1->src_ip != id2->src_ip)
			return 1;
	}

	/* fields shared by both address families */
	if (id1->dst_port != id2->dst_port ||
	    id1->src_port != id2->src_port ||
	    id1->proto != id2->proto ||
	    id1->extra != id2->extra)
		return 1;

	return 0;
}
/*--------- end of flow-id mask, hash and compare ---------*/

/*--- support functions for the qht hashtable ----
 * Entries are hashed by flow-id
 */
/*
 * Hash callback for the queue hash table. The key is a queue when
 * DNHT_KEY_IS_OBJ is set in flags, a bare flow id otherwise.
 */
static uint32_t
q_hash(uintptr_t key, int flags, void *arg)
{
	struct ipfw_flow_id *fid;

	if (flags & DNHT_KEY_IS_OBJ)
		fid = &((struct dn_queue *)key)->ni.fid;
	else
		fid = (struct ipfw_flow_id *)key;

	return flow_id_hash(fid);
}

/*
 * Match callback for the queue hash table: returns non-zero when the
 * flow id of queue 'obj' equals the flow id designated by 'key'
 * (a queue if DNHT_KEY_IS_OBJ is set in flags, a flow id otherwise).
 */
static int
q_match(void *obj, uintptr_t key, int flags, void *arg)
{
	struct dn_queue *q = (struct dn_queue *)obj;
	struct ipfw_flow_id *wanted = (flags & DNHT_KEY_IS_OBJ) ?
		&((struct dn_queue *)key)->ni.fid :
		(struct ipfw_flow_id *)key;

	return (flow_id_cmp(&q->ni.fid, wanted) == 0);
}

/*
 * create a new queue instance for the given 'key'.
 */
static void *
q_new(uintptr_t key, int flags, void *arg)
{   
	struct dn_queue *q, *template = arg;
	struct dn_fsk *fs = template->fs;
	int size = sizeof(*q) + fs->sched->fp->q_datalen;

	q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO);
	if (q == NULL) {
		D("no memory for new queue");
		return NULL;
	}

	set_oid(&q->ni.oid, DN_QUEUE, size);
	if (fs->fs.flags & DN_QHT_HASH)
		q->ni.fid = *(struct ipfw_flow_id *)key;
	q->fs = fs;
	q->_si = ipdn_si_find(q->fs->sched, &(template->ni.fid));
	if (q->_si == NULL) {
		D("no memory for new si");
		free (q, M_DUMMYNET);
		return NULL;
	}

	q->_si->q_count++;

	if (fs->sched->fp->new_queue)
		fs->sched->fp->new_queue(q);
	dn_cfg.queue_count++;
	dn_cfg.idle_queue++;
	return q;
}

/*
 * Notify schedulers that a queue is going away.
 * If (flags & DN_DESTROY), also free the packets.
 * The version for callbacks is called q_delete_cb().
 * Returns 1 if the queue is NOT deleted (usually when
 * the drain routine tries to delete a queue that a scheduler
 * instance still needs), 0 otherwise.
 * NOTE: flag DN_DEL_SAFE means that the queue should be
 *       deleted only if the scheduler no longer needs it
 */
/*
 * Detach queue q from its scheduler instance and, when DN_DESTROY is
 * set in flags, release its packets and memory.
 *
 * Returns 1 if the scheduler's free_queue() hook refused the deletion
 * (the queue is left untouched), 0 otherwise.
 *
 * The function may run more than once on the same queue: a pass
 * without DN_DESTROY leaves the queue allocated with _si == NULL, and
 * fsk_detach() clears fs->sched afterwards. Both pointers are therefore
 * checked before being dereferenced.
 */
static int
dn_delete_queue(struct dn_queue *q, int flags)
{
	struct dn_fsk *fs = q->fs;

	// D("fs %p si %p\n", fs, q->_si);
	/* notify the parent scheduler, if still linked, that the queue
	 * is going away
	 */
	if (fs && fs->sched && fs->sched->fp->free_queue)
		if (fs->sched->fp->free_queue(q, flags & DN_DEL_SAFE) == 1)
			return 1; 	/* queue NOT deleted */
	/* drop the reference to the scheduler instance, if we hold one */
	if (q->_si) {
		q->_si->q_count--;
		q->_si = NULL;
	}
	if (flags & DN_DESTROY) {
		if (q->mq.head)
			dn_free_pkts(q->mq.head);
		else
			dn_cfg.idle_queue--;	/* it was counted as idle */
		bzero(q, sizeof(*q));	// safety
		free(q, M_DUMMYNET);
		dn_cfg.queue_count--;
	}
	return 0;
}

/*
 * Hash-table scan callback wrapping dn_delete_queue(). 'arg' carries
 * the flags; the entry is removed from the table (DNHT_SCAN_DEL) only
 * when DN_DESTROY is set.
 */
static int
q_delete_cb(void *q, void *arg)
{
	const int flags = (int)(uintptr_t)arg;

	dn_delete_queue(q, flags);
	if (flags & DN_DESTROY)
		return DNHT_SCAN_DEL;
	return 0;
}

/*
 * calls dn_delete_queue/q_delete_cb on all queues,
 * which notifies the parent scheduler and possibly drains packets.
 * flags & DN_DESTROY: drains queues and destroy qht;
 */
/*
 * Run dn_delete_queue() on every queue of the flowset, whether the
 * flowset keeps a hash table of queues or a single queue.
 * With DN_DESTROY the container is released too and fs->qht is reset.
 */
static void
qht_delete(struct dn_fsk *fs, int flags)
{
	ND("fs %d start flags %d qht %p",
		fs->fs.fs_nr, flags, fs->qht);
	if (fs->qht == NULL)
		return;

	if (!(fs->fs.flags & DN_QHT_HASH)) {
		/* qht is really the one and only queue */
		dn_delete_queue((struct dn_queue *)(fs->qht), flags);
	} else {
		dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags);
		if (flags & DN_DESTROY)
			dn_ht_free(fs->qht, 0);
	}

	if (flags & DN_DESTROY)
		fs->qht = NULL;
}

/*
 * Find and possibly create the queue for a MULTIQUEUE scheduler.
 * We never call it for !MULTIQUEUE (the queue is in the sch_inst).
 */
/*
 * Return the queue of flowset 'fs' that serves flow 'id', creating it
 * (and, for hashed flowsets, the hash table itself) when missing.
 * Returns NULL if an allocation fails.
 *
 * The template handed to q_new() carries the flowset and the flow id
 * of the packet: q_new() uses template.ni.fid to look up the scheduler
 * instance, so it must not be left uninitialized.
 */
struct dn_queue *
ipdn_q_find(struct dn_fsk *fs, struct ipfw_flow_id *id)
{
	struct dn_queue template;

	template.fs = fs;
	/* unmasked flow id, used by q_new() to find the scheduler instance */
	if (id != NULL)
		template.ni.fid = *id;
	else
		bzero(&template.ni.fid, sizeof(template.ni.fid));

	if (fs->fs.flags & DN_QHT_HASH) {
		struct ipfw_flow_id masked_id;
		if (fs->qht == NULL) {
			/* first packet for this flowset: build the table */
			fs->qht = dn_ht_init(NULL, fs->fs.buckets,
				offsetof(struct dn_queue, q_next),
				q_hash, q_match, q_new);
			if (fs->qht == NULL)
				return NULL;
		}
		/* look up using a masked copy, the caller's id is preserved */
		masked_id = *id;
		flow_id_mask(&fs->fsk_mask, &masked_id);
		return dn_ht_find(fs->qht, (uintptr_t)&masked_id,
			DNHT_INSERT, &template);
	} else {
		/* single queue, stored directly in qht */
		if (fs->qht == NULL)
			fs->qht = q_new(0, 0, &template);
		return (struct dn_queue *)fs->qht;
	}
}
/*--- end of queue hash table ---*/

/*--- support functions for the sch_inst hashtable ----
 *
 * These are hashed by flow-id
 */
/*
 * Hash callback for the scheduler-instance table. The key is an
 * instance when DNHT_KEY_IS_OBJ is set in flags, a flow id otherwise.
 */
static uint32_t
si_hash(uintptr_t key, int flags, void *arg)
{
	struct ipfw_flow_id *fid;

	if (flags & DNHT_KEY_IS_OBJ)
		fid = &((struct dn_sch_inst *)key)->ni.fid;
	else
		fid = (struct ipfw_flow_id *)key;

	return flow_id_hash(fid);
}

/*
 * Match callback for the scheduler-instance table: non-zero when the
 * flow id of instance 'obj' equals the one designated by 'key'.
 */
static int
si_match(void *obj, uintptr_t key, int flags, void *arg)
{
	struct dn_sch_inst *si = obj;
	struct ipfw_flow_id *wanted;

	if (flags & DNHT_KEY_IS_OBJ)
		wanted = &((struct dn_sch_inst *)key)->ni.fid;
	else
		wanted = (struct ipfw_flow_id *)key;

	return (flow_id_cmp(&si->ni.fid, wanted) == 0);
}

static int si_reset_credit(void *_si, void *arg); // XXX si_new use this

/*
 * create a new instance for the given 'key'
 * Allocate memory for instance, delay line and scheduler private data.
 */
/*
 * Build a scheduler instance for scheduler 'arg'. The allocation
 * covers the instance plus the algorithm's private data (si_datalen).
 * For masked schedulers 'key' points to the flow id to store.
 * Returns NULL on allocation failure or if new_sched() reports an error.
 */
static void *
si_new(uintptr_t key, int flags, void *arg)
{
	struct dn_schk *s = arg;
	int size = sizeof(struct dn_sch_inst) + s->fp->si_datalen;
	struct dn_sch_inst *si;

	si = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO);
	if (si == NULL)
		return NULL;

	/* Set length only for the part passed up to userland. */
	set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow));
	set_oid(&(si->dline.oid), DN_DELAY_LINE,
		sizeof(struct delay_line));
	/* mark si and dline as outside the event queue */
	si->ni.oid.id = si->dline.oid.id = -1;

	si->sched = s;
	si->dline.si = si;

	if (s->fp->new_sched && s->fp->new_sched(si)) {
		D("new_sched error");
		bzero(si, sizeof(*si)); // safety
		free(si, M_DUMMYNET);
		return NULL;
	}
	if (s->sch.flags & DN_HAVE_MASK)
		si->ni.fid = *(struct ipfw_flow_id *)key;

	si_reset_credit(si, NULL);
	/* a new instance has no traffic yet, so it counts as idle */
	dn_cfg.si_count++;
	dn_cfg.idle_si++;
	return si;
}

/*
 * Callback from siht to delete all scheduler instances. Remove
 * si and delay line from the system heap, destroy all queues.
 * We assume that all flowset have been notified and do not
 * point to us anymore.
 */
static int
si_destroy(void *_si, void *arg)
{
	struct dn_sch_inst *si = _si;
	struct dn_schk *s = si->sched;
	struct delay_line *dl = &si->dline;

	if (dl->oid.subtype) /* remove delay line from event heap */
		heap_extract(&dn_cfg.evheap, dl);
	/* an instance with nothing queued is presumably the one counted
	 * in idle_si (si_new() increments it) -- keep the counter in sync
	 */
	if (si->ni.length == 0)
		dn_cfg.idle_si--;
	dn_free_pkts(dl->mq.head);	/* drain delay line */
	if (si->kflags & DN_ACTIVE) /* remove si from event heap */
		heap_extract(&dn_cfg.evheap, si);
	/* let the algorithm release its private data before we free si */
	if (s->fp->free_sched)
		s->fp->free_sched(si);
	bzero(si, sizeof(*si));	/* safety */
	free(si, M_DUMMYNET);
	dn_cfg.si_count--;
	/* tell the hash-table scan to unlink the entry we just freed */
	return DNHT_SCAN_DEL;
}

/*
 * Find the scheduler instance for this packet. If we need to apply
 * a mask, do on a local copy of the flow_id to preserve the original.
 * Assume siht is always initialized if we have a mask.
 */
struct dn_sch_inst *
ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id)
{

	if (s->sch.flags & DN_HAVE_MASK) {
		struct ipfw_flow_id id_t = *id;
		flow_id_mask(&s->sch.sched_mask, &id_t);
		return dn_ht_find(s->siht, (uintptr_t)&id_t,
			DNHT_INSERT, s);
	}
	if (!s->siht)
		s->siht = si_new(0, 0, s);
	return (struct dn_sch_inst *)s->siht;
}

/* callback to flush credit for the scheduler instance */
/*
 * Reset the credit of a scheduler instance to the link burst size,
 * plus the link bandwidth when dn_cfg.io_fast is set, and restart its
 * idle time from the current time. Always returns 0, so it can be
 * used as a hash-table scan callback.
 */
static int
si_reset_credit(void *_si, void *arg)
{
	struct dn_sch_inst *inst = _si;
	struct dn_link *link = &inst->sched->link;

	inst->credit = link->burst + (dn_cfg.io_fast ?  link->bandwidth : 0);
	inst->idle_time = dn_cfg.curr_time;
	return 0;
}

/* Reset the credit of every instance of scheduler 's'. */
static void
schk_reset_credit(struct dn_schk *s)
{
	if (!(s->sch.flags & DN_HAVE_MASK)) {
		/* at most one instance, stored directly in siht */
		if (s->siht)
			si_reset_credit(s->siht, NULL);
		return;
	}
	dn_ht_scan(s->siht, si_reset_credit, NULL);
}
/*---- end of sch_inst hashtable ---------------------*/

/*-------------------------------------------------------
 * flowset hash (fshash) support. Entries are hashed by fs_nr.
 * New allocations are put in the fsunlinked list, from which
 * they are removed when they point to a specific scheduler.
 */
/*
 * Hash callback for fshash. The key is the flowset number itself, or
 * a struct dn_fsk when DNHT_KEY_IS_OBJ is set in flags.
 */
static uint32_t
fsk_hash(uintptr_t key, int flags, void *arg)
{
	uint32_t nr;

	if (flags & DNHT_KEY_IS_OBJ)
		nr = ((struct dn_fsk *)key)->fs.fs_nr;
	else
		nr = key;

	return ( nr ^ (nr >> 4) ^ (nr >> 8) );
}

/*
 * Match callback for fshash: non-zero when flowset 'obj' has the
 * number designated by 'key' (a number, or a struct dn_fsk when
 * DNHT_KEY_IS_OBJ is set in flags).
 */
static int
fsk_match(void *obj, uintptr_t key, int flags, void *arg)
{
	struct dn_fsk *candidate = obj;
	int nr;

	if (flags & DNHT_KEY_IS_OBJ)
		nr = ((struct dn_fsk *)key)->fs.fs_nr;
	else
		nr = key;

	return (candidate->fs.fs_nr == nr);
}

static void *
fsk_new(uintptr_t key, int flags, void *arg)
{
	struct dn_fsk *fs;

	fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO);
	if (fs) {
		set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs));
		dn_cfg.fsk_count++;
		fs->drain_bucket = 0;
		SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain);
	}
	return fs;
}

/*
 * detach flowset from its current scheduler. Flags as follows:
 * DN_DETACH removes from the fsk_list
 * DN_DESTROY deletes individual queues
 * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked).
 */
static void
fsk_detach(struct dn_fsk *fs, int flags)
{
	/* deleting the flowset implies destroying its queues */
	if (flags & DN_DELETE_FS)
		flags |= DN_DESTROY;
	ND("fs %d from sched %d flags %s %s %s",
		fs->fs.fs_nr, fs->fs.sched_nr,
		(flags & DN_DELETE_FS) ? "DEL_FS":"",
		(flags & DN_DESTROY) ? "DEL":"",
		(flags & DN_DETACH) ? "DET":"");
	if (flags & DN_DETACH) { /* detach from the list */
		struct dn_fsk_head *h;
		/* the flowset sits either on its scheduler's list or,
		 * when unlinked, on the global dn_cfg.fsu list
		 */
		h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu;
		SLIST_REMOVE(h, fs, dn_fsk, sch_chain);
	}
	/* Free the RED parameters, they will be recomputed on
	 * subsequent attach if needed.
	 */
	if (fs->w_q_lookup)
		free(fs->w_q_lookup, M_DUMMYNET);
	fs->w_q_lookup = NULL;
	/* notify the scheduler about the queues (and free them on
	 * DN_DESTROY) while fs->sched is still set
	 */
	qht_delete(fs, flags);
	if (fs->sched && fs->sched->fp->free_fsk)
		fs->sched->fp->free_fsk(fs);
	fs->sched = NULL;
	if (flags & DN_DELETE_FS) {
		bzero(fs, sizeof(*fs));	/* safety */
		free(fs, M_DUMMYNET);
		dn_cfg.fsk_count--;
	} else {
		/* the flowset survives: park it on the unlinked list */
		SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain);
	}
}

/*
 * Detach or destroy all flowsets in a list.
 * flags specifies what to do:
 * DN_DESTROY:	flush all queues
 * DN_DELETE_FS:	DN_DESTROY + destroy flowset
 *	DN_DELETE_FS implies DN_DESTROY
 */
static void
fsk_detach_list(struct dn_fsk_head *h, int flags)
{
	struct dn_fsk *fs;
	int n = 0; /* only for stats */

	ND("head %p flags %x", h, flags);
	while ((fs = SLIST_FIRST(h))) {
		SLIST_REMOVE_HEAD(h, sch_chain);
		n++;
		fsk_detach(fs, flags);
	}
	ND("done %d flowsets", n);
}

/*
 * called on 'queue X delete' -- removes the flowset from fshash,
 * deletes all queues for the flowset, and removes the flowset.
 */
/*
 * Remove flowset number 'i' from fshash and destroy it together with
 * its queues. The hash table itself is released once it becomes empty.
 * When 'locked' is zero the function takes and releases the BH write
 * lock itself. Returns 0 on success, EINVAL if no such flowset exists.
 */
static int
delete_fs(int i, int locked)
{
	struct dn_fsk *fs;
	int err;

	if (!locked)
		DN_BH_WLOCK();

	fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL);
	if (dn_ht_entries(dn_cfg.fshash) == 0) {
		dn_ht_free(dn_cfg.fshash, 0);
		dn_cfg.fshash = NULL;
	}
	ND("fs %d found %p", i, fs);

	if (fs == NULL) {
		err = EINVAL;
	} else {
		fsk_detach(fs, DN_DETACH | DN_DELETE_FS);
		err = 0;
	}

	if (!locked)
		DN_BH_WUNLOCK();
	return err;
}

/*----- end of flowset hashtable support -------------*/

/*------------------------------------------------------------
 * Scheduler hash. When searching by index we pass sched_nr,
 * otherwise we pass a struct dn_sch *, which is the first field in
 * struct dn_schk, so we can cast between the two. We use this trick
 * presumably because in the create phase only the struct dn_sch is
 * available as a key (but it should be fixed).
 */
/*
 * Hash callback for schedhash. The key is the scheduler number, or
 * an object starting with a struct dn_sch when DNHT_KEY_IS_OBJ is set.
 */
static uint32_t
schk_hash(uintptr_t key, int flags, void *_arg)
{
	uint32_t nr;

	if (flags & DNHT_KEY_IS_OBJ)
		nr = ((struct dn_schk *)key)->sch.sched_nr;
	else
		nr = key;

	return ( nr ^ (nr >> 4) ^ (nr >> 8) );
}

/*
 * Match callback for schedhash: non-zero when scheduler 'obj' has the
 * number designated by 'key' (a number, or an object starting with a
 * struct dn_sch when DNHT_KEY_IS_OBJ is set in flags).
 */
static int
schk_match(void *obj, uintptr_t key, int flags, void *_arg)
{
	struct dn_schk *candidate = (struct dn_schk *)obj;
	int nr;

	if (flags & DNHT_KEY_IS_OBJ)
		nr = ((struct dn_schk *)key)->sch.sched_nr;
	else
		nr = key;

	return (candidate->sch.sched_nr == nr);
}

/*
 * Create the entry and initialize it with the sched hash if needed.
 * Leave s->fp unset so we can tell whether a dn_ht_find() returns
 * a new object or a previously existing one.
 */
static void *
schk_new(uintptr_t key, int flags, void *arg)
{
	struct schk_new_arg *a = arg;
	struct dn_schk *s;
	int l = sizeof(*s) +a->fp->schk_datalen;

	s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
	if (s == NULL)
		return NULL;
	set_oid(&s->link.oid, DN_LINK, sizeof(s->link));
	s->sch = *a->sch; // copy initial values
	s->link.link_nr = s->sch.sched_nr;
	SLIST_INIT(&s->fsk_list);
	/* initialize the hash table or create the single instance */
	s->fp = a->fp;	/* si_new needs this */
	s->drain_bucket = 0;
	if (s->sch.flags & DN_HAVE_MASK) {
		s->siht = dn_ht_init(NULL, s->sch.buckets,
			offsetof(struct dn_sch_inst, si_next),
			si_hash, si_match, si_new);
		if (s->siht == NULL) {
			free(s, M_DUMMYNET);
			return NULL;
		}
	}
	s->fp = NULL;	/* mark as a new scheduler */
	dn_cfg.schk_count++;
	return s;
}

/*
 * Callback for sched delete. Notify all attached flowsets to
 * detach from the scheduler, destroy the internal flowset, and
 * all instances. The scheduler goes away too.
 * arg is 0 (only detach flowsets and destroy instances)
 * DN_DESTROY (detach & delete queues, delete schk)
 * or DN_DELETE_FS (delete queues and flowsets, delete schk)
 */
static int
schk_delete_cb(void *obj, void *arg)
{
	struct dn_schk *s = obj;
#if 0
	int a = (int)arg;
	ND("sched %d arg %s%s",
		s->sch.sched_nr,
		a&DN_DESTROY ? "DEL ":"",
		a&DN_DELETE_FS ? "DEL_FS":"");
#endif
	/* any non-zero arg makes the flowsets drop their queues too */
	fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0);
	/* no more flowset pointing to us now */
	if (s->sch.flags & DN_HAVE_MASK) {
		/* siht is a real hash table: destroy each instance,
		 * then the table
		 */
		dn_ht_scan(s->siht, si_destroy, NULL);
		dn_ht_free(s->siht, 0);
	}
	else if (s->siht)
		/* siht is the single instance */
		si_destroy(s->siht, NULL);
	if (s->profile) {
		free(s->profile, M_DUMMYNET);
		s->profile = NULL;
	}
	s->siht = NULL;
	/* let the algorithm release its per-scheduler state */
	if (s->fp->destroy)
		s->fp->destroy(s);
	bzero(s, sizeof(*s));	// safety
	free(obj, M_DUMMYNET);
	dn_cfg.schk_count--;
	/* tell a hash-table scan to unlink the entry we just freed */
	return DNHT_SCAN_DEL;
}

/*
 * called on a 'sched X delete' command. Deletes a single scheduler.
 * This is done by removing from the schedhash, unlinking all
 * flowsets and deleting their traffic.
 */
/*
 * Delete scheduler number 'i': take it out of schedhash (releasing
 * the table when it becomes empty), delete its internal flowset
 * (number i + DN_MAX_ID), then detach the remaining flowsets and
 * destroy the scheduler. Returns EINVAL if the scheduler is unknown.
 */
static int
delete_schk(int i)
{
	struct dn_schk *victim;

	victim = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
	if (dn_ht_entries(dn_cfg.schedhash) == 0) {
		dn_ht_free(dn_cfg.schedhash, 0);
		dn_cfg.schedhash = NULL;
	}
	ND("%d %p", i, victim);
	if (victim == NULL)
		return EINVAL;

	/* first delete internal fs */
	delete_fs(i + DN_MAX_ID, 1);
	/* then detach flowsets, delete traffic */
	schk_delete_cb(victim, (void*)(uintptr_t)DN_DESTROY);
	return 0;
}
/*--- end of schk hashtable support ---*/

/*
 * Copy object '_o' (which starts with a struct dn_id giving its type
 * and length) to *start and advance *start past it. 'end' marks the
 * end of the output buffer. 'msg' and 'i' are only used in the
 * diagnostics. Returns 1 if the object is invalid (zero length or
 * type) or does not fit, 0 on success.
 *
 * Two types get extra treatment in the copy: for a DN_LINK the burst
 * is divided by 8 * hz, and for a DN_SCH the id field is overwritten
 * with the number of scheduler instances.
 */
static int
copy_obj(char **start, char *end, void *_o, const char *msg, int i)
{
	struct dn_id *hdr = _o;
	char *dst = *start;
	int room = end - dst;

	if (hdr->len == 0 || hdr->type == 0 || room < hdr->len) {
		D("(WARN) type %d %s %d have %d need %d",
			hdr->type, msg, i, room, hdr->len);
		return 1;
	}
	ND("type %d %s %d len %d", hdr->type, msg, i, hdr->len);
	bcopy(_o, dst, hdr->len);

	switch (hdr->type) {
	case DN_LINK: {
		/* Adjust burst parameter for link */
		struct dn_link *l = (struct dn_link *)dst;

		l->burst =  div64(l->burst, 8 * hz);
		break;
	}
	case DN_SCH: {
		/* Set id->id to the number of instances */
		struct dn_schk *s = _o;
		struct dn_id *id = (struct dn_id *)dst;

		if (s->sch.flags & DN_HAVE_MASK)
			id->id = dn_ht_entries(s->siht);
		else
			id->id = (s->siht ? 1 : 0);
		break;
	}
	default:
		break;
	}

	*start = dst + hdr->len;
	return 0;
}

/* Specific function to copy a queue.
 * Copies only the user-visible part of a queue (which is in
 * a struct dn_flow), and sets len accordingly.
 */
/*
 * Copy the leading struct dn_flow of a queue to *start, store that
 * size in the copied header's len field and advance *start.
 * Returns 1 if there is not enough room or '_o' is not a valid
 * DN_QUEUE object, 0 on success.
 */
static int
copy_obj_q(char **start, char *end, void *_o, const char *msg, int i)
{
	struct dn_id *hdr = _o;
	char *dst = *start;
	const int need = sizeof(struct dn_flow); /* user-visible part only */
	int room = end - dst;

	if (hdr->type != DN_QUEUE || hdr->len == 0 || room < need) {
		D("ERROR type %d %s %d have %d need %d",
			hdr->type, msg, i, room, need);
		return 1;
	}
	ND("type %d %s %d len %d", hdr->type, msg, i, need);
	bcopy(_o, dst, need);
	((struct dn_id*)dst)->len = need;
	*start = dst + need;
	return 0;
}

/*
 * Scan callback: copy one queue to the output buffer described by
 * 'arg' (a struct copy_args). The copied record is relabelled as a
 * DN_FLOW and its id is set to the hash of its flow id.
 * Returns DNHT_SCAN_END when the copy fails, 0 otherwise.
 */
static int
copy_q_cb(void *obj, void *arg)
{
	struct dn_queue *q = obj;
	struct copy_args *ca = arg;
	/* remember where the record lands before the cursor moves */
	struct dn_flow *out = (struct dn_flow *)(*ca->start);

	if (copy_obj_q(ca->start, ca->end, &q->ni, "queue", -1) != 0)
		return DNHT_SCAN_END;

	out->oid.type = DN_FLOW; /* override the DN_QUEUE */
	out->oid.id = si_hash((uintptr_t)&out->fid, 0, NULL);
	return 0;
}

static int
copy_q(struct copy_args *a, struct dn_fsk *fs, int flags)
{
	if (!fs->qht)
		return 0;
	if (fs->fs.flags & DN_QHT_HASH)
		dn_ht_scan(fs->qht, copy_q_cb, a);
	else
		copy_q_cb(fs->qht, a);
	return 0;
}

/*
 * This routine only copies the initial part of a profile ? XXX
 * XXX marta: I think this routine is called to print a summary
 * of the pipe configuration and does not need to show the 
 * profile samples list.
 */
static int
copy_profile(struct copy_args *a, struct dn_profile *p)
{
	int have = a->end - *a->start;
	/* XXX here we check for max length */
	int profile_len = sizeof(struct dn_profile);

	if (p == NULL)
		return 0;
	if (have < profile_len) {
		D("error have %d need %d", have, profile_len);
		return 1;
	}
	bcopy(p, *a->start, profile_len);
	((struct dn_id *)(*a->start))->len = profile_len;
	*a->start += profile_len;
	return 0;
}

/*
 * Export flowset fs. The id field of the exported record is set to the
 * number of queues: the hash table entry count when DN_QHT_HASH is set,
 * otherwise 1 or 0 depending on whether fs->qht is set.
 * If flags is non-zero the queues are exported as well.
 * Returns DNHT_SCAN_END when the flowset does not fit, 0 otherwise
 * (including fs == NULL).
 */
static int
copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags)
{
	/* destination of the record, taken before *start is advanced */
	struct dn_fs *out = (struct dn_fs *)(*a->start);

	if (fs == NULL)
		return 0;
	ND("flowset %d", fs->fs.fs_nr);
	if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr) != 0)
		return DNHT_SCAN_END;
	if (fs->fs.flags & DN_QHT_HASH)
		out->oid.id = dn_ht_entries(fs->qht);
	else
		out->oid.id = (fs->qht != NULL) ? 1 : 0;
	if (flags != 0)	/* copy queues */
		copy_q(a, fs, 0);
	return 0;
}

/*
 * Scan callback: export one scheduler instance as a DN_FLOW record.
 * Returns DNHT_SCAN_END when the copy fails, 0 otherwise.
 */
static int
copy_si_cb(void *obj, void *arg)
{
	struct copy_args *ca = arg;
	struct dn_sch_inst *inst = obj;
	/* destination of the record, taken before *start is advanced */
	struct dn_flow *out = (struct dn_flow *)(*ca->start);
	int sched_nr = inst->sched->sch.sched_nr;

	if (copy_obj(ca->start, ca->end, &inst->ni, "inst", sched_nr) != 0)
		return DNHT_SCAN_END;
	out->oid.type = DN_FLOW; /* override the DN_SCH_I */
	out->oid.id = si_hash((uintptr_t)inst, DNHT_KEY_IS_OBJ, NULL);
	return 0;
}

/*
 * Export the instances of scheduler s: a hash table scan when the
 * scheduler has a mask, otherwise s->siht is passed to the callback as
 * the only instance (if set). Always returns 0; flags is unused.
 */
static int
copy_si(struct copy_args *a, struct dn_schk *s, int flags)
{
	if (!(s->sch.flags & DN_HAVE_MASK)) {
		if (s->siht != NULL)
			copy_si_cb(s->siht, a);
	} else {
		dn_ht_scan(s->siht, copy_si_cb, a);
	}
	return 0;
}

/*
 * Export the list of flowsets attached to scheduler s as a DN_TEXT
 * object: a struct dn_id header followed by one uint32_t flowset number
 * for every attached flowset whose number is below DN_MAX_ID.
 * Returns DNHT_SCAN_END when the buffer is too small, 0 otherwise.
 */
static int
copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags)
{
	struct dn_fsk *fs;
	struct dn_id *hdr;
	uint32_t *slot;
	int count = 0, space = sizeof(*hdr);

	/* first pass: size the object */
	SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
		if (fs->fs.fs_nr >= DN_MAX_ID)
			continue;
		count++;
	}
	space += count * sizeof(uint32_t);
	DX(3, "sched %d has %d flowsets", s->sch.sched_nr, count);
	if (a->end - *(a->start) < space)
		return DNHT_SCAN_END;

	hdr = (struct dn_id *)(*(a->start));
	hdr->len = space;
	*a->start += hdr->len;
	hdr->type = DN_TEXT;

	/* second pass: emit the flowset numbers right after the header */
	slot = (uint32_t *)(hdr + 1);
	SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
		if (fs->fs.fs_nr >= DN_MAX_ID)
			continue;
		*slot++ = fs->fs.fs_nr;
	}
	return 0;
}

/*
 * Scan callback used to answer "show" requests.
 * _arg is the struct copy_args; a->extra holds the request, i.e. a
 * struct dn_id followed by pairs of uint32_t [low, high] ranges.
 * For DN_LINK/DN_SCH requests _o is a struct dn_schk, for DN_FS it is a
 * struct dn_fsk. The object is exported only if its number falls inside
 * one of the ranges.
 * Returns DNHT_SCAN_END when the output buffer is exhausted, else 0.
 */
static int
copy_data_helper(void *_o, void *_arg)
{
	struct copy_args *a = _arg;
	uint32_t *r = a->extra->r; /* start of first range */
	uint32_t *lim;	/* first invalid pointer */
	int n;

	lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len);

	if (a->type == DN_FS) {
		/* queue show, skip internal flowsets */
		struct dn_fsk *fs = _o;

		n = fs->fs.fs_nr;
		if (n >= DN_MAX_ID)
			return 0;
		/* look for the first range containing the object */
		while (r < lim && (n < r[0] || n > r[1]))
			r += 2;
		if (r >= lim)
			return 0;	/* not requested */
		if (copy_flowset(a, fs, 0))
			return DNHT_SCAN_END;
		copy_q(a, fs, 0);
		return 0;
	}

	if (a->type == DN_LINK || a->type == DN_SCH) {
		/* pipe|sched show, we receive a dn_schk */
		struct dn_schk *s = _o;

		n = s->sch.sched_nr;
		if (a->type == DN_SCH && n >= DN_MAX_ID)
			return 0;	/* not a scheduler */
		if (a->type == DN_LINK && n <= DN_MAX_ID)
			return 0;	/* not a pipe */

		/* look for the first range containing the object */
		while (r < lim && (n < r[0] || n > r[1]))
			r += 2;
		if (r >= lim)
			return 0;	/* not requested */

		/* Found a valid entry, copy and we are done */
		if (a->flags & DN_C_LINK) {
			if (copy_obj(a->start, a->end, &s->link, "link", n))
				return DNHT_SCAN_END;
			if (copy_profile(a, s->profile))
				return DNHT_SCAN_END;
			if (copy_flowset(a, s->fs, 0))
				return DNHT_SCAN_END;
		}
		if (a->flags & DN_C_SCH) {
			if (copy_obj(a->start, a->end, &s->sch, "sched", n))
				return DNHT_SCAN_END;
			/* list all attached flowsets */
			if (copy_fsk_list(a, s, 0))
				return DNHT_SCAN_END;
		}
		if (a->flags & DN_C_FLOW)
			copy_si(a, s, 0);
	}
	return 0;
}

/*
 * Look up scheduler number i in dn_cfg.schedhash without creating it.
 * Returns whatever dn_ht_find() returns for a plain lookup.
 */
static inline struct dn_schk *
locate_scheduler(int i)
{
	struct dn_schk *found;

	found = dn_ht_find(dn_cfg.schedhash, i, 0, NULL);
	return found;
}

/*
 * (Re)compute the RED state of flowset fs from its user parameters, the
 * bandwidth of the link it is attached to, and the dn_cfg.red_* knobs.
 * red parameters are in fixed point arithmetic.
 *
 * Every divisor is validated before it is used. On invalid parameters
 * or allocation failure the RED flags of the flowset are cleared, the
 * lookup table is left unallocated, and an error is returned.
 *
 * Returns 0 on success, EINVAL on invalid parameters, ENOSPC when the
 * lookup table cannot be allocated.
 */
static int
config_red(struct dn_fsk *fs)
{
	int64_t s, idle, weight, w0;
	int t, i;

	fs->w_q = fs->fs.w_q;
	fs->max_p = fs->fs.max_p;
	ND("called");

	/* If the lookup table already exist, free it; recreated below. */
	if (fs->w_q_lookup) {
		free(fs->w_q_lookup, M_DUMMYNET);
		fs->w_q_lookup = NULL;
	}

	/* red_lookup_depth is a divisor and the table size: check it first */
	if (dn_cfg.red_lookup_depth == 0) {
		printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth "
		    "must be > 0\n");
		goto bad;
	}
	/* w_q, (max_th - min_th) and, for gentle RED, max_th are divisors */
	if (fs->w_q == 0 || fs->fs.max_th == fs->fs.min_th ||
	    ((fs->fs.flags & DN_IS_GENTLE_RED) && fs->fs.max_th == 0)) {
		D("invalid red parameters w_q %d min_th %d max_th %d",
			(int)fs->w_q, (int)fs->fs.min_th, (int)fs->fs.max_th);
		goto bad;
	}

	/* apply the defaults before the packet size is used below */
	if (dn_cfg.red_avg_pkt_size < 1)
		dn_cfg.red_avg_pkt_size = 512;
	if (dn_cfg.red_max_pkt_size < 1)
		dn_cfg.red_max_pkt_size = 1500;

	/* Doing stuff that was in userland */
	i = fs->sched->link.bandwidth;
	/*
	 * The product does not fit in 32 bits for common values
	 * (e.g. 1000 * 512 * 8 * 65536), so compute it in 64 bits.
	 */
	s = (i <= 0) ? 0 :
		div64((int64_t)hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1), i);

	idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */
	fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth);
	/* fs->lookup_step not scaled, */
	if (!fs->lookup_step)
		fs->lookup_step = 1;
	w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled

	for (t = fs->lookup_step; t > 1; --t)
		weight = SCALE_MUL(weight, w0);
	fs->lookup_weight = (int)(weight); // scaled

	/* Now doing stuff that was in kernel land */
	fs->min_th = SCALE(fs->fs.min_th);
	fs->max_th = SCALE(fs->fs.max_th);

	fs->c_1 = fs->max_p / (fs->fs.max_th - fs->fs.min_th);
	fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th));

	if (fs->fs.flags & DN_IS_GENTLE_RED) {
		fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th;
		fs->c_4 = SCALE(1) - 2 * fs->max_p;
	}

	fs->lookup_depth = dn_cfg.red_lookup_depth;
	fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int),
	    M_DUMMYNET, M_NOWAIT);
	if (fs->w_q_lookup == NULL) {
		printf("dummynet: sorry, cannot allocate red lookup table\n");
		fs->fs.flags &= ~DN_IS_RED;
		fs->fs.flags &= ~DN_IS_GENTLE_RED;
		return(ENOSPC);
	}

	/* Fill the lookup table with (1 - w_q)^x */
	fs->w_q_lookup[0] = SCALE(1) - fs->w_q;

	for (i = 1; i < fs->lookup_depth; i++)
		fs->w_q_lookup[i] =
		    SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight);

	fs->avg_pkt_size = dn_cfg.red_avg_pkt_size;
	fs->max_pkt_size = dn_cfg.red_max_pkt_size;
	ND("exit");
	return 0;

bad:
	/* parameters unusable: turn RED off for this flowset */
	fs->fs.flags &= ~DN_IS_RED;
	fs->fs.flags &= ~DN_IS_GENTLE_RED;
	return (EINVAL);
}

/*
 * Recompute the RED parameters of every flowset attached to scheduler s
 * that has DN_IS_RED set. The result of config_red() is ignored.
 */
static void
update_red(struct dn_schk *s)
{
	struct dn_fsk *fs;

	SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
		if (fs == NULL || !(fs->fs.flags & DN_IS_RED))
			continue;
		config_red(fs);
	}
}

/*
 * Attach flowset fs to scheduler s: move it from the list of unlinked
 * flowsets (dn_cfg.fsu) to s->fsk_list, notify the scheduler type,
 * recompute the flowset mask and the DN_QHT_HASH flag, and refresh the
 * RED parameters if RED is enabled.
 */
static void
fsk_attach(struct dn_fsk *fs, struct dn_schk *s)
{
	ND("remove fs %d from fsunlinked, link to sched %d",
		fs->fs.fs_nr, s->sch.sched_nr);

	/* relink */
	SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain);
	fs->sched = s;
	SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain);

	/* let the scheduler type know about the new flowset */
	if (s->fp->new_fsk != NULL)
		s->fp->new_fsk(fs);

	/* XXX compute fsk_mask */
	fs->fsk_mask = fs->fs.flow_mask;
	if ((fs->sched->sch.flags & DN_HAVE_MASK) != 0)
		flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask);

	if (fs->qht != NULL) {
		/*
		 * we must drain qht according to the old
		 * type, and reinsert according to the new one.
		 * The requeue is complex -- in general we need to
		 * reclassify every single packet.
		 * For the time being, let's hope qht is never set
		 * when we reach this point.
		 */
		D("XXX TODO requeue from fs %d to sch %d",
			fs->fs.fs_nr, s->sch.sched_nr);
		fs->qht = NULL;
	}

	/* set the new type for qht */
	if (!nonzero_mask(&fs->fsk_mask))
		fs->fs.flags &= ~DN_QHT_HASH;
	else
		fs->fs.flags |= DN_QHT_HASH;

	/* XXX config_red() can fail... */
	if ((fs->fs.flags & DN_IS_RED) != 0)
		config_red(fs);
}

/*
 * Walk the list of unlinked flowsets and attach to s every one that
 * names s as its scheduler; the others are only logged.
 * The _SAFE iterator is needed because fsk_attach() unlinks the entry.
 */
static void
update_fs(struct dn_schk *s)
{
	struct dn_fsk *cur, *nxt;

	SLIST_FOREACH_SAFE(cur, &dn_cfg.fsu, sch_chain, nxt) {
		if (s->sch.sched_nr == cur->fs.sched_nr) {
			fsk_attach(cur, s);
		} else {
			D("fs %d for sch %d not %d still unlinked",
				cur->fs.fs_nr, cur->fs.sched_nr,
				s->sch.sched_nr);
		}
	}
}

/*
 * Configuration -- to preserve backward compatibility we use
 * the following scheme (N is 65536)
 *	NUMBER		SCHED	LINK	FLOWSET
 *	   1 ..  N-1	(1)WFQ	(2)WFQ	(3)queue
 *	 N+1 .. 2N-1	(4)FIFO (5)FIFO	(6)FIFO for sched 1..N-1
 *	2N+1 .. 3N-1	--	--	(7)FIFO for sched N+1..2N-1
 *
 * "pipe i config" configures #1, #2 and #3
 * "sched i config" configures #1 and possibly #6
 * "queue i config" configures #3
 * #1 is configured with 'pipe i config' or 'sched i config'
 * #2 is configured with 'pipe i config', and created if not
 *	existing with 'sched i config'
 * #3 is configured with 'queue i config'
 * #4 is automatically configured after #1, can only be FIFO
 * #5 is automatically configured after #2
 * #6 is automatically created when #1 is !MULTIQUEUE,
 *	and can be updated.
 * #7 is automatically configured after #2
 */

/*
 * Configure link p->link_nr and the link of its FIFO twin
 * (number + DN_MAX_ID). Both schedulers must already exist.
 *
 * The delay and burst fields of *p are rescaled in place before being
 * copied. Any delay profile attached to the schedulers is dropped.
 * Returns 0 on success, EINVAL on a bad length, a bad number, or a
 * missing scheduler. arg is unused.
 */
static int
config_link(struct dn_link *p, struct dn_id *arg)
{
	struct dn_schk *s;
	int nr;

	if (p->oid.len != sizeof(*p)) {
		D("invalid pipe len %d", p->oid.len);
		return EINVAL;
	}
	nr = p->link_nr;
	if (nr <= 0 || nr >= DN_MAX_ID)
		return EINVAL;
	/*
	 * The config program passes parameters as follows:
	 * bw = bits/second (0 means no limits),
	 * delay = ms, must be translated into ticks.
	 * qsize = slots/bytes
	 * burst ???
	 */
	p->delay = (p->delay * hz) / 1000;
	/* Scale burst size: bytes -> bits * hz */
	p->burst *= 8 * hz;

	DN_BH_WLOCK();
	/* do it twice, base link and FIFO link */
	while (nr < 2*DN_MAX_ID) {
		s = locate_scheduler(nr);
		if (s == NULL) {
			DN_BH_WUNLOCK();
			D("sched %d not found", nr);
			return EINVAL;
		}
		/* remove profile if exists */
		if (s->profile) {
			free(s->profile, M_DUMMYNET);
			s->profile = NULL;
		}
		/* copy all parameters */
		s->link.oid = p->oid;
		s->link.link_nr = nr;
		s->link.delay = p->delay;
		if (s->link.bandwidth != p->bandwidth) {
			/* XXX bandwidth changes, need to update red params */
			s->link.bandwidth = p->bandwidth;
			update_red(s);
		}
		s->link.burst = p->burst;
		schk_reset_credit(s);
		nr += DN_MAX_ID;
	}
	dn_cfg.id++;
	DN_BH_WUNLOCK();
	return 0;
}

/*
 * configure a flowset. Can be called from inside with locked=1,
 * in which case DN_BH_WLOCK is neither taken nor released here.
 *
 * nfs is the requested configuration; it is normalized in place
 * (queue size and bucket count bounded, unset fields inherited from the
 * existing flowset). arg is unused in this function.
 *
 * Returns the kernel flowset, or NULL if the length or number is
 * invalid or the lookup/creation in the flowset hash fails.
 */
static struct dn_fsk *
config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked)
{
	int i;
	struct dn_fsk *fs;

	if (nfs->oid.len != sizeof(*nfs)) {
		D("invalid flowset len %d", nfs->oid.len);
		return NULL;
	}
	i = nfs->fs_nr;
	if (i <= 0 || i >= 3*DN_MAX_ID)
		return NULL;
	ND("flowset %d", i);
	/* XXX other sanity checks */
	/* bound the queue size, in bytes or in slots depending on the flag */
        if (nfs->flags & DN_QSIZE_BYTES) {
		ipdn_bound_var(&nfs->qsize, 16384,
		    1500, dn_cfg.byte_limit, NULL); // "queue byte size");
        } else {
		ipdn_bound_var(&nfs->qsize, 50,
		    1, dn_cfg.slot_limit, NULL); // "queue slot size");
        }
	if (nfs->flags & DN_HAVE_MASK) {
		/* make sure we have some buckets */
		ipdn_bound_var((int *)&nfs->buckets, dn_cfg.hash_size,
			1, dn_cfg.max_hash_size, "flowset buckets");
	} else {
		nfs->buckets = 1;	/* we only need 1 */
	}
	if (!locked)
		DN_BH_WLOCK();
	/* the flowset hash table is created on first use */
	if (dn_cfg.fshash == NULL)
		dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size,
					offsetof(struct dn_fsk, fsk_next),
					fsk_hash, fsk_match, fsk_new);
	do { /* exit with break when done */
	    struct dn_schk *s;
	    /* only insert a new entry when a scheduler number is supplied */
	    int flags = nfs->sched_nr ? DNHT_INSERT : 0;
	    int j;
	    /* snapshot used below to tell whether the lookup created an entry */
	    int oldc = dn_cfg.fsk_count;
	    fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL);
	    if (fs == NULL) {
		D("missing sched for flowset %d", i);
	        break;
	    }
	    /* grab some defaults from the existing one */
	    if (nfs->sched_nr == 0) /* reuse */
		nfs->sched_nr = fs->fs.sched_nr;
	    for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) {
		if (nfs->par[j] == -1) /* reuse */
		    nfs->par[j] = fs->fs.par[j];
	    }
	    if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) {
		ND("flowset %d unchanged", i);
		break; /* no change, nothing to do */
	    }
	    if (oldc != dn_cfg.fsk_count)	/* new item */
		dn_cfg.id++;
	    s = locate_scheduler(nfs->sched_nr);
	    /* detach from old scheduler if needed, preserving
	     * queues if we need to reattach. Then update the
	     * configuration, and possibly attach to the new sched.
	     */
	    DX(2, "fs %d changed sched %d@%p to %d@%p",
		fs->fs.fs_nr,
		fs->fs.sched_nr, fs->sched, nfs->sched_nr, s);
	    if (fs->sched) {
		/* NOTE: this 'flags' shadows the lookup flags declared above */
		int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY);
		flags |= DN_DESTROY; /* XXX temporary */
		fsk_detach(fs, flags);
	    }
	    fs->fs = *nfs; /* copy configuration */
	    /* with no scheduler yet the flowset is not attached here */
	    if (s != NULL)
		fsk_attach(fs, s);
	} while (0);
	if (!locked)
		DN_BH_WUNLOCK();
	return fs;
}

/*
 * config/reconfig a scheduler and its FIFO variant.
 * For !MULTIQUEUE schedulers, also set up the flowset.
 *
 * On reconfigurations (detected because s->fp is set),
 * detach existing flowsets preserving traffic, preserve link,
 * and delete the old scheduler creating a new one.
 */
static int
config_sched(struct dn_sch *_nsch, struct dn_id *arg)
{
	struct dn_schk *s;
	struct schk_new_arg a; /* argument for schk_new */
	int i;
	struct dn_link p;	/* copy of oldlink */
	struct dn_profile *pf = NULL;	/* copy of old link profile */
	/* Used to preserv mask parameter */
	struct ipfw_flow_id new_mask;
	int new_buckets = 0;
	int new_flags = 0;
	int pipe_cmd;
	int err = ENOMEM;

	a.sch = _nsch;
	if (a.sch->oid.len != sizeof(*a.sch)) {
		D("bad sched len %d", a.sch->oid.len);
		return EINVAL;
	}
	i = a.sch->sched_nr;
	if (i <= 0 || i >= DN_MAX_ID)
		return EINVAL;
	/* make sure we have some buckets */
	if (a.sch->flags & DN_HAVE_MASK)
		ipdn_bound_var((int *)&a.sch->buckets, dn_cfg.hash_size,
			1, dn_cfg.max_hash_size, "sched buckets");
	/* XXX other sanity checks */
	bzero(&p, sizeof(p));

	pipe_cmd = a.sch->flags & DN_PIPE_CMD;
	a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if is not set?
	if (pipe_cmd) {
		/* Copy mask parameter */
		new_mask = a.sch->sched_mask;
		new_buckets = a.sch->buckets;
		new_flags = a.sch->flags;
	}
	DN_BH_WLOCK();
	if (dn_cfg.schedhash == NULL)
		dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size,
					offsetof(struct dn_schk, schk_next),
					schk_hash, schk_match, schk_new);
again: /* run twice, for wfq and fifo */
	/*
	 * lookup the type. If not supplied, use the previous one
	 * or default to WF2Q+. Otherwise, return an error.
	 */
	dn_cfg.id++;
	a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name);
	if (a.fp != NULL) {
		/* found. Lookup or create entry */
		s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a);
	} else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) {
		/* No type. search existing s* or retry with WF2Q+ */
		s = dn_ht_find(dn_cfg.schedhash, i, 0, &a);
		if (s != NULL) {
			a.fp = s->fp;
			/* Scheduler exists, skip to FIFO scheduler 
			 * if command was pipe config...
			 */
			if (pipe_cmd)
				goto next;
		} else {
			/* New scheduler, create a wf2q+ with no mask
			 * if command was pipe config...
			 */
			if (pipe_cmd) {
				/* clear mask parameter */
				bzero(&a.sch->sched_mask, sizeof(new_mask));
				a.sch->buckets = 0;
				a.sch->flags &= ~DN_HAVE_MASK;
			}
			a.sch->oid.subtype = DN_SCHED_WF2QP;
			goto again;
		}
	} else {
		D("invalid scheduler type %d %s",
			a.sch->oid.subtype, a.sch->name);
		err = EINVAL;
		goto error;
	}
	/* normalize name and subtype */
	a.sch->oid.subtype = a.fp->type;
	bzero(a.sch->name, sizeof(a.sch->name));
	strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name));
	if (s == NULL) {
		D("cannot allocate scheduler %d", i);
		goto error;
	}
	/* restore existing link if any */
	if (p.link_nr) {
		s->link = p;
		if (!pf || pf->link_nr != p.link_nr) { /* no saved value */
			s->profile = NULL; /* XXX maybe not needed */
		} else {
			size_t pf_size = sizeof(struct dn_profile) +
				s->profile->samples_no * sizeof(int);

			s->profile = malloc(pf_size,
					     M_DUMMYNET, M_NOWAIT | M_ZERO);
			if (s->profile == NULL) {
				D("cannot allocate profile");
				goto error; //XXX
			}
			bcopy(pf, s->profile, pf_size);
		}
	}
	p.link_nr = 0;
	if (s->fp == NULL) {
		DX(2, "sched %d new type %s", i, a.fp->name);
	} else if (s->fp != a.fp ||
			bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) {
		/* already existing. */
		DX(2, "sched %d type changed from %s to %s",
			i, s->fp->name, a.fp->name);
		DX(4, "   type/sub %d/%d -> %d/%d",
			s->sch.oid.type, s->sch.oid.subtype, 
			a.sch->oid.type, a.sch->oid.subtype);
		if (s->link.link_nr == 0)
			D("XXX WARNING link 0 for sched %d", i);
		p = s->link;	/* preserve link */
		if (s->profile) {/* preserve profile */
			if (!pf)
				pf = malloc(sizeof(*pf),
				    M_DUMMYNET, M_NOWAIT | M_ZERO);
			if (pf)	/* XXX should issue a warning otherwise */
				bcopy(s->profile, pf, sizeof(*pf));
		}
		/* remove from the hash */
		dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
		/* Detach flowsets, preserve queues. */
		// schk_delete_cb(s, NULL);
		// XXX temporarily, kill queues
		schk_delete_cb(s, (void *)DN_DESTROY);
		goto again;
	} else {
		DX(4, "sched %d unchanged type %s", i, a.fp->name);
	}
	/* complete initialization */
	s->sch = *a.sch;
	s->fp = a.fp;
	s->cfg = arg;
	// XXX schk_reset_credit(s);
	/* create the internal flowset if needed,
	 * trying to reuse existing ones if available
	 */
	if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) {
	        s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL);
		if (!s->fs) {
			struct dn_fs fs;
			bzero(&fs, sizeof(fs));
			set_oid(&fs.oid, DN_FS, sizeof(fs));
			fs.fs_nr = i + DN_MAX_ID;
			fs.sched_nr = i;
			s->fs = config_fs(&fs, NULL, 1 /* locked */);
		}
		if (!s->fs) {
			schk_delete_cb(s, (void *)DN_DESTROY);
			D("error creating internal fs for %d", i);
			goto error;
		}
	}
	/* call init function after the flowset is created */
	if (s->fp->config)
		s->fp->config(s);
	update_fs(s);
next:
	if (i < DN_MAX_ID) { /* now configure the FIFO instance */
		i += DN_MAX_ID;
		if (pipe_cmd) {
			/* Restore mask parameter for FIFO */
			a.sch->sched_mask = new_mask;
			a.sch->buckets = new_buckets;
			a.sch->flags = new_flags;
		} else {
			/* sched config shouldn't modify the FIFO scheduler */
			if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) {
				/* FIFO already exist, don't touch it */
				err = 0; /* and this is not an error */
				goto error;
			}
		}
		a.sch->sched_nr = i;
		a.sch->oid.subtype = DN_SCHED_FIFO;
		bzero(a.sch->name, sizeof(a.sch->name));
		goto again;
	}
	err = 0;
error:
	DN_BH_WUNLOCK();
	if (pf)
		free(pf, M_DUMMYNET);
	return err;
}

/*
 * attach a profile to a link
 *
 * The profile is installed on scheduler pf->link_nr and on its twin at
 * pf->link_nr + DN_MAX_ID. A profile with samples_no == 0 removes the
 * existing one. An existing buffer is reused when it is at least as
 * long as the new profile; its recorded length is then kept.
 * Returns 0 on success, EINVAL on a short object, a bad number or a
 * missing scheduler, ENOMEM on allocation failure. arg is unused.
 */
static int
config_profile(struct dn_profile *pf, struct dn_id *arg)
{
	struct dn_schk *s;
	int nr, keep_len, err = 0;

	if (pf->oid.len < sizeof(*pf)) {
		D("short profile len %d", pf->oid.len);
		return EINVAL;
	}
	nr = pf->link_nr;
	if (nr <= 0 || nr >= DN_MAX_ID)
		return EINVAL;
	/* XXX other sanity checks */
	DN_BH_WLOCK();
	while (nr < 2*DN_MAX_ID) {
		s = locate_scheduler(nr);
		if (s == NULL) {
			err = EINVAL;
			break;
		}
		dn_cfg.id++;
		/*
		 * If we had a profile and the new one does not fit,
		 * or it is deleted, then we need to free memory.
		 */
		if (s->profile != NULL &&
		    (pf->samples_no == 0 ||
		     s->profile->oid.len < pf->oid.len)) {
			free(s->profile, M_DUMMYNET);
			s->profile = NULL;
		}
		if (pf->samples_no != 0) {
			/*
			 * new profile, possibly allocate memory
			 * and copy data.
			 */
			if (s->profile == NULL)
				s->profile = malloc(pf->oid.len,
					    M_DUMMYNET, M_NOWAIT | M_ZERO);
			if (s->profile == NULL) {
				D("no memory for profile %d", nr);
				err = ENOMEM;
				break;
			}
			/* preserve larger length XXX double check */
			keep_len = s->profile->oid.len;
			if (keep_len < pf->oid.len)
				keep_len = pf->oid.len;
			bcopy(pf, s->profile, pf->oid.len);
			s->profile->oid.len = keep_len;
		}
		nr += DN_MAX_ID;
	}
	DN_BH_WUNLOCK();
	return err;
}

/*
 * Delete all objects: schedulers (with their links, queues and
 * flowsets), then the flowsets that were not linked to any scheduler,
 * then both hash tables. The event heap is reinitialized at the end.
 * Takes no lock itself.
 */
static void
dummynet_flush(void)
{
	void *del_fs = (void *)(uintptr_t)DN_DELETE_FS;

	/* delete all schedulers and related links/queues/flowsets */
	dn_ht_scan(dn_cfg.schedhash, schk_delete_cb, del_fs);

	/* delete all remaining (unlinked) flowsets */
	DX(4, "still %d unlinked fs", dn_cfg.fsk_count);
	dn_ht_free(dn_cfg.fshash, DNHT_REMOVE);
	fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS);

	dn_ht_free(dn_cfg.schedhash, DNHT_REMOVE);

	/* Reinitialize system heap... */
	heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id));
}

/*
 * Main handler for configuration. We are guaranteed to be called
 * with an oid which is at least a dn_id.
 * - the first object is the command (config, delete, flush, ...)
 * - config_link must be issued after the corresponding config_sched
 * - parameters (DN_TXT) for an object must precede the object
 *   processed on a config_sched.
 *
 * p points to a buffer of l bytes holding a sequence of objects, each
 * starting with a struct dn_id whose len covers the whole object.
 * Processing stops at the first error.
 * Returns 0 on success or an errno value (EINVAL for a bad API
 * version, a malformed length, or a rejected object).
 */
int
do_config(void *p, int l)
{
	struct dn_id *next, *o;
	int err = 0, err2 = 0;
	struct dn_id *arg = NULL;
	uintptr_t *a;

	o = p;
	if (o->id != DN_API_VERSION) {
		D("invalid api version got %d need %d",
			o->id, DN_API_VERSION);
		return EINVAL;
	}
	for (; l >= sizeof(*o); o = next) {
		/* a pending DN_TEXT argument only applies to the next object */
		struct dn_id *prev = arg;
		if (o->len < sizeof(*o) || l < o->len) {
			D("bad len o->len %d len %d", o->len, l);
			err = EINVAL;
			break;
		}
		l -= o->len;
		next = (struct dn_id *)((char *)o + o->len);
		err = 0;
		switch (o->type) {
		default:
			D("cmd %d not implemented", o->type);
			break;

#ifdef EMULATE_SYSCTL
		/* sysctl emulation.
		 * if we recognize the command, jump to the correct
		 * handler and return
		 */
		case DN_SYSCTL_SET:
			err = kesysctl_emu_set(p, l);
			return err;
#endif

		case DN_CMD_CONFIG: /* simply a header */
			break;

		case DN_CMD_DELETE:
			/* the argument is in the first uintptr_t after o */
			a = (uintptr_t *)(o+1);
			if (o->len < sizeof(*o) + sizeof(*a)) {
				err = EINVAL;
				break;
			}
			switch (o->subtype) {
			case DN_LINK:
				/* delete base and derived schedulers */
				DN_BH_WLOCK();
				err = delete_schk(*a);
				err2 = delete_schk(*a + DN_MAX_ID);
				DN_BH_WUNLOCK();
				if (!err)
					err = err2;
				break;

			default:
				D("invalid delete type %d",
					o->subtype);
				err = EINVAL;
				break;

			case DN_FS:
				err = (*a <1 || *a >= DN_MAX_ID) ?
					EINVAL : delete_fs(*a, 0) ;
				break;
			}
			break;

		case DN_CMD_FLUSH:
			DN_BH_WLOCK();
			dummynet_flush();
			DN_BH_WUNLOCK();
			break;
		case DN_TEXT:	/* store argument the next block */
			prev = NULL;
			arg = o;
			break;
		case DN_LINK:
			err = config_link((struct dn_link *)o, arg);
			break;
		case DN_PROFILE:
			err = config_profile((struct dn_profile *)o, arg);
			break;
		case DN_SCH:
			err = config_sched((struct dn_sch *)o, arg);
			break;
		case DN_FS:
			/*
			 * config_fs() returns a pointer: map failure to a
			 * real errno instead of the boolean value 1.
			 */
			err = (NULL == config_fs((struct dn_fs *)o, arg, 0)) ?
				EINVAL : 0;
			break;
		}
		if (prev)
			arg = NULL;
		if (err != 0)
			break;
	}
	return err;
}

/*
 * Estimate the buffer space needed to answer the "show" request cmd,
 * and store in a->flags the set of object classes (DN_C_*) that the
 * request covers.
 * Returns the number of bytes, or -1 for an unknown cmd->subtype.
 */
static int
compute_space(struct dn_id *cmd, struct copy_args *a)
{
	int need = 0, what = 0;
	int profile_size = sizeof(struct dn_profile);

	/* NOTE about compute space:
	 * NP 	= dn_cfg.schk_count
	 * NSI 	= dn_cfg.si_count
	 * NF 	= dn_cfg.fsk_count
	 * NQ 	= dn_cfg.queue_count
	 * - ipfw pipe show
	 *   (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler
	 *                             link, scheduler template, flowset
	 *                             integrated in scheduler and header
	 *                             for flowset list
	 *   (NSI)*(dn_flow) all scheduler instance (includes
	 *                              the queue instance)
	 * - ipfw sched show
	 *   (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler
	 *                             link, scheduler template, flowset
	 *                             integrated in scheduler and header
	 *                             for flowset list
	 *   (NSI * dn_flow) all scheduler instances
	 *   (NF * sizeof(uint_32)) space for flowset list linked to scheduler
	 *   (NQ * dn_queue) all queue [XXXfor now not listed]
	 * - ipfw queue show
	 *   (NF * dn_fs) all flowset
	 *   (NQ * dn_queue) all queues
	 */
	switch (cmd->subtype) {
	/* XXX where do LINK and SCH differ ? */
	/* 'ipfw sched show' could list all queues associated to
	 * a scheduler. This feature for now is disabled
	 */
	case DN_LINK:	/* pipe show */
	case DN_SCH:	/* sched show */
		what = DN_C_LINK | DN_C_SCH | DN_C_FLOW;
		need += dn_cfg.schk_count *
			(sizeof(struct dn_fs) + profile_size) / 2;
		need += dn_cfg.fsk_count * sizeof(uint32_t);
		break;
	case DN_FS:	/* queue show */
		what = DN_C_FS | DN_C_QUEUE;
		break;
	case DN_GET_COMPAT:	/* compatibility mode */
		need = dn_compat_calc_size();
		break;
	default:
		return -1;
	}
	a->flags = what;

	if (what & DN_C_SCH) {
		need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2;
		/* NOT also, each fs might be attached to a sched */
		need += dn_cfg.schk_count * sizeof(struct dn_id) / 2;
	}
	if (what & DN_C_FS)
		need += dn_cfg.fsk_count * sizeof(struct dn_fs);
	if (what & DN_C_LINK)
		need += dn_cfg.schk_count * sizeof(struct dn_link) / 2;
	/*
	 * When exporting a queue to userland, only pass up the
	 * struct dn_flow, which is the only visible part.
	 */
	if (what & DN_C_QUEUE)
		need += dn_cfg.queue_count * sizeof(struct dn_flow);
	if (what & DN_C_FLOW)
		need += dn_cfg.si_count * (sizeof(struct dn_flow));
	return need;
}

/*
 * Answer a "get"/"show" request.
 * If compat != NULL dummynet_get is called in compatibility mode.
 * *compat will be the pointer to the buffer to pass to ipfw
 *
 * In native mode the request (a struct dn_id optionally followed by
 * ranges) is read from sopt and the reply is copied out to sopt. When
 * the user buffer is too small only the header is copied out, with its
 * id field set to the size that is needed.
 *
 * The output buffer is sized under the lock and allocated outside of
 * it; if the required size keeps growing for all the retries, the
 * request fails with ENOMEM instead of proceeding without the lock.
 *
 * Returns 0 or an errno value.
 */
int
dummynet_get(struct sockopt *sopt, void **compat)
{
	int have, i, need, error;
	char *start = NULL, *buf;
	size_t sopt_valsize;
	struct dn_id *cmd;
	struct copy_args a;
	struct copy_range r;
	int l = sizeof(struct dn_id);

	bzero(&a, sizeof(a));
	bzero(&r, sizeof(r));

	/* save and restore original sopt_valsize around copyin */
	sopt_valsize = sopt->sopt_valsize;

	cmd = &r.o;

	if (!compat) {
		/* copy at least an oid, and possibly a full object */
		error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd));
		sopt->sopt_valsize = sopt_valsize;
		if (error)
			goto done;
		l = cmd->len;
#ifdef EMULATE_SYSCTL
		/* sysctl emulation. */
		if (cmd->type == DN_SYSCTL_GET)
			return kesysctl_emu_get(sopt);
#endif
		if (l > sizeof(r)) {
			/* request larger than default, allocate buffer */
			cmd = malloc(l,  M_DUMMYNET, M_WAITOK);
			error = sooptcopyin(sopt, cmd, l, l);
			sopt->sopt_valsize = sopt_valsize;
			if (error)
				goto done;
		}
	} else { /* compatibility */
		error = 0;
		cmd->type = DN_CMD_GET;
		cmd->len = sizeof(struct dn_id);
		cmd->subtype = DN_GET_COMPAT;
		// cmd->id = sopt_valsize;
		D("compatibility mode");
	}
	a.extra = (struct copy_range *)cmd;
	if (cmd->len == sizeof(*cmd)) { /* no range, create a default */
		uint32_t *rp = (uint32_t *)(cmd + 1);
		cmd->len += 2* sizeof(uint32_t);
		rp[0] = 1;
		rp[1] = DN_MAX_ID - 1;
		if (cmd->subtype == DN_LINK) {
			rp[0] += DN_MAX_ID;
			rp[1] += DN_MAX_ID;
		}
	}
	/* Count space (under lock) and allocate (outside lock).
	 * Exit with lock held if we manage to get enough buffer.
	 * Try a few times then give up.
	 */
	for (have = 0, i = 0; i < 10; i++) {
		DN_BH_WLOCK();
		need = compute_space(cmd, &a);

		/* if there is a range, ignore value from compute_space() */
		if (l > sizeof(*cmd))
			need = sopt_valsize - sizeof(*cmd);

		if (need < 0) {
			DN_BH_WUNLOCK();
			error = EINVAL;
			goto done;
		}
		need += sizeof(*cmd);
		cmd->id = need;
		if (have >= need)
			break;

		DN_BH_WUNLOCK();
		if (start)
			free(start, M_DUMMYNET);
		start = NULL;
		if (need > sopt_valsize)
			break;

		have = need;
		start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO);
	}

	if (i >= 10) {
		/*
		 * Retries exhausted: the loop ended after an unlock, so
		 * the lock is NOT held and the buffer was never validated
		 * against the current size. Give up; 'done' frees start.
		 */
		if (compat)
			*compat = NULL;
		error = ENOMEM;
		goto done;
	}

	if (start == NULL) {
		/* user buffer too small: only report the size needed */
		if (compat) {
			*compat = NULL;
			error =  1; // XXX
		} else {
			error = sooptcopyout(sopt, cmd, sizeof(*cmd));
		}
		goto done;
	}
	/* from here on the lock is held (taken in the last iteration) */
	ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, "
		"%d:%d si %d, %d:%d queues %d",
		dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH,
		dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK,
		dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS,
		dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I,
		dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE);
	sopt->sopt_valsize = sopt_valsize;
	a.type = cmd->subtype;

	if (compat == NULL) {
		/* the reply starts with a copy of the request header */
		bcopy(cmd, start, sizeof(*cmd));
		((struct dn_id*)(start))->len = sizeof(struct dn_id);
		buf = start + sizeof(*cmd);
	} else
		buf = start;
	a.start = &buf;
	a.end = start + have;
	/* start copying other objects */
	if (compat) {
		a.type = DN_COMPAT_PIPE;
		dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a);
		a.type = DN_COMPAT_QUEUE;
		dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a);
	} else if (a.type == DN_FS) {
		dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a);
	} else {
		dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a);
	}
	DN_BH_WUNLOCK();

	if (compat) {
		*compat = start;
		sopt->sopt_valsize = buf - start;
		/* free() is done by ip_dummynet_compat() */
		start = NULL; //XXX hack
	} else {
		error = sooptcopyout(sopt, start, buf - start);
	}
done:
	if (cmd && cmd != &r.o)
		free(cmd, M_DUMMYNET);
	if (start)
		free(start, M_DUMMYNET);
	return error;
}

/*
 * Functions to drain idle objects -- see dummynet_task() for some notes
 */
/*
 * Callback called on scheduler instance to delete it if idle.
 * _arg points to the int counter of objects examined in this pass; the
 * counter is incremented here and the scan is stopped
 * (DNHT_SCAN_END) once it exceeds dn_cfg.expire_object_examined.
 * Returns the value of si_destroy() when the instance is removed,
 * 0 when it is kept.
 */
static int
drain_scheduler_cb(void *_si, void *_arg)
{
	struct dn_sch_inst *si = _si;
	int *arg = _arg;
	int empty;

	/*
	 * Post-increment the counter itself: "*arg++" would advance the
	 * local pointer and leave the counter untouched.
	 */
	if ((*arg)++ > dn_cfg.expire_object_examined)
		return DNHT_SCAN_END;

	if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL)
		return 0;

	/*
	 * if the scheduler is multiqueue, q_count also reflects empty
	 * queues that point to si, so we need to check si->q_count to
	 * tell whether we can remove the instance.
	 */
	if (si->ni.length == 0) {
		/* si was marked as idle:
		 * remove it or increment idle_si_wait counter
		 */
		empty = (si->sched->fp->flags & DN_MULTIQUEUE) ? 
				(si->q_count == 0) : 1;
		if (empty && 
			(si->idle_time < dn_cfg.curr_time - dn_cfg.object_idle_tick))
				return si_destroy(si, NULL);
		else
			dn_cfg.idle_si_wait++;
	}
	return 0;
}

/*
 * Callback called on scheduler to check if it has instances.
 * With a mask, one bucket of the instance hash table is scanned;
 * otherwise s->siht is the single instance and the pointer is cleared
 * when the instance is deleted.
 * _arg points to the int counter of objects examined in this pass,
 * shared with drain_scheduler_cb(); it is incremented here and the scan
 * is stopped once it exceeds dn_cfg.expire_object_examined.
 */
static int
drain_scheduler_sch_cb(void *_s, void *_arg)
{
	struct dn_schk *s = _s;
	int *arg = _arg;

	if (s->sch.flags & DN_HAVE_MASK) {
		dn_ht_scan_bucket(s->siht, &s->drain_bucket,
				drain_scheduler_cb, _arg);
	} else {
		if (s->siht) {
			if (drain_scheduler_cb(s->siht, _arg) == DNHT_SCAN_DEL)
				s->siht = NULL;
		}
	}
	/*
	 * Post-increment the counter itself: "*arg++" would advance the
	 * local pointer and leave the counter untouched.
	 */
	return ((*arg)++ > dn_cfg.expire_object_examined) ? DNHT_SCAN_END : 0;
}

/*
 * Called every tick, try to delete a 'bucket' of scheduler.
 * The bucket index is kept in dn_cfg.drain_sch; a local counter,
 * starting at zero, is handed to the callbacks.
 */
void
dn_drain_scheduler(void)
{
	int examined = 0;
	int *bucket = (int *)&dn_cfg.drain_sch;

	dn_ht_scan_bucket(dn_cfg.schedhash, bucket,
			   drain_scheduler_sch_cb, &examined);
}

/*
 * Callback called on queue to delete if it is idle.
 * _arg points to the int counter of objects examined in this pass; the
 * counter is incremented here and the scan is stopped
 * (DNHT_SCAN_END) once it exceeds dn_cfg.expire_object_examined.
 * Returns DNHT_SCAN_DEL when the queue has been deleted, 0 otherwise.
 */
static int
drain_queue_cb(void *_q, void *_arg)
{
	struct dn_queue *q = _q;
	int *arg = _arg;

	/*
	 * Post-increment the counter itself: "*arg++" would advance the
	 * local pointer and leave the counter untouched.
	 */
	if ((*arg)++ > dn_cfg.expire_object_examined)
		return DNHT_SCAN_END;

	if (q->ni.length == 0) {
		if (q->q_time < dn_cfg.curr_time - dn_cfg.object_idle_tick) {
			if (dn_delete_queue(q, DN_DESTROY | DN_DEL_SAFE) == 0)
				return DNHT_SCAN_DEL; /* queue is deleted */
		} else
			dn_cfg.idle_queue_wait++;
	}

	return 0; /* queue isn't deleted */
}

/*
 * Callback called on flowset used to check if it has queues.
 * _arg points to the int counter of objects examined in this pass,
 * shared with drain_queue_cb(); it is incremented here and the scan is
 * stopped once it exceeds dn_cfg.expire_object_examined.
 */
static int
drain_queue_fs_cb(void *_fs, void *_arg)
{
	struct dn_fsk *fs = _fs;
	int *arg = _arg;

	if (fs->fs.flags & DN_QHT_HASH) {
		/* Flowset has a hash table for queues */
		dn_ht_scan_bucket(fs->qht, &fs->drain_bucket,
				drain_queue_cb, _arg);
	} else {
		/* No hash table for this flowset, null the pointer 
		 * if the queue is deleted
		 */
		if (fs->qht) {
			if (drain_queue_cb(fs->qht, _arg) == DNHT_SCAN_DEL)
				fs->qht = NULL;
		}
	}
	/*
	 * Post-increment the counter itself: "*arg++" would advance the
	 * local pointer and leave the counter untouched.
	 */
	return ((*arg)++ > dn_cfg.expire_object_examined) ? DNHT_SCAN_END : 0;
}

/*
 * Called every tick, try to delete a 'bucket' of queue.
 * The bucket index is kept in dn_cfg.drain_fs; a local counter,
 * starting at zero, is handed to the callbacks.
 */
void
dn_drain_queue(void)
{
	int examined = 0;
	int *bucket = (int *)&dn_cfg.drain_fs;

	/* scan a bucket of flowset */
	dn_ht_scan_bucket(dn_cfg.fshash, bucket,
			   drain_queue_fs_cb, &examined);
}

/*
 * Handler for the various dummynet socket options
 *
 * The caller must hold PRIV_NETINET_DUMMYNET; set operations are
 * additionally refused when securelevel_ge() reports level 3 or above.
 * The old-style options are passed to ip_dummynet_compat();
 * IP_DUMMYNET3 is served by dummynet_get() for SOPT_GET and by
 * do_config() otherwise, after copying in at most 12000 bytes.
 * Returns 0 or an errno value; EINVAL for an unknown option or an
 * argument length outside [sizeof(struct dn_id), 12000].
 */
static int
ip_dn_ctl(struct sockopt *sopt)
{
	void *p = NULL;
	int error, l;

	error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET);
	if (error)
		return (error);

	/* Disallow sets in really-really secure mode. */
	if (sopt->sopt_dir == SOPT_SET) {
		error =  securelevel_ge(sopt->sopt_td->td_ucred, 3);
		if (error)
			return (error);
	}

	switch (sopt->sopt_name) {
	default :
		D("dummynet: unknown option %d", sopt->sopt_name);
		error = EINVAL;
		break;

	case IP_DUMMYNET_FLUSH:
	case IP_DUMMYNET_CONFIGURE:
	case IP_DUMMYNET_DEL:	/* remove a pipe or queue */
	case IP_DUMMYNET_GET:
		D("dummynet: compat option %d", sopt->sopt_name);
		error = ip_dummynet_compat(sopt);
		break;

	case IP_DUMMYNET3 :
		if (sopt->sopt_dir == SOPT_GET) {
			error = dummynet_get(sopt, NULL);
			break;
		}
		/*
		 * Validate the size before narrowing it to int, so that a
		 * huge value cannot wrap to a negative one and slip
		 * through, and report the failure to the caller.
		 */
		if (sopt->sopt_valsize < sizeof(struct dn_id) ||
		    sopt->sopt_valsize > 12000) {
			D("argument len %d invalid", (int)sopt->sopt_valsize);
			error = EINVAL;
			break;
		}
		l = sopt->sopt_valsize;
		p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ?
		error = sooptcopyin(sopt, p, l, l);
		if (error)
			break ;
		error = do_config(p, l);
		break;
	}

	if (p != NULL)
		free(p, M_TEMP);

	return error ;
}


/*
 * One-time initialisation of the dummynet configuration (dn_cfg), the
 * event heap, the lock, the taskqueue and the periodic callout.
 * The function is idempotent: once dn_cfg.init_done is set, further
 * calls return immediately.
 */
static void
ip_dn_init(void)
{
	if (dn_cfg.init_done)
		return;
	printf("DUMMYNET %p with IPv6 initialized (100409)\n", curvnet);
	dn_cfg.init_done = 1;
	/* Set defaults here. MSVC does not accept initializers,
	 * and this is also useful for vimages
	 */
	/* queue limits */
	dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */
	dn_cfg.byte_limit = 1024 * 1024;
	dn_cfg.expire = 1;

	/* RED parameters */
	dn_cfg.red_lookup_depth = 256;	/* default lookup table depth */
	dn_cfg.red_avg_pkt_size = 512;	/* default medium packet size */
	dn_cfg.red_max_pkt_size = 1500;	/* default max packet size */

	/* hash tables */
	dn_cfg.max_hash_size = 1024;	/* max in the hash tables */

	/* keep a value that was already set; only fill in the default */
	if (dn_cfg.hash_size == 0) /* XXX or <= 0 ? */
		dn_cfg.hash_size = 64;		/* default hash size */

	/* hash tables for schedulers and flowsets are created
	 * when the first scheduler/flowset is inserted.
	 * This is done to allow to use the right hash_size value.
	 * When the last object is deleted, the table is destroyed,
	 * so a new hash_size value can be used.
	 * XXX rehash is not supported for now
	 */
	dn_cfg.schedhash = NULL;
	dn_cfg.fshash = NULL;
	/* bucket index to drain object */
	dn_cfg.drain_fs = 0;
	dn_cfg.drain_sch = 0;

	/* drain/expire tunables: defaults are applied only when still zero */
	if (dn_cfg.expire_object == 0)
		dn_cfg.expire_object = 50;
	if (dn_cfg.object_idle_tick == 0)
		dn_cfg.object_idle_tick = 1000;
	if (dn_cfg.expire_object_examined == 0)
		dn_cfg.expire_object_examined = 10;
	if (dn_cfg.drain_ratio == 0)
		dn_cfg.drain_ratio = 1;

	// XXX what if we don't have a tsc ?
#ifdef HAVE_TSC
	dn_cfg.cycle_task_new = dn_cfg.cycle_task_old = readTSC();
#endif
	heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id));
	SLIST_INIT(&dn_cfg.fsu);
	SLIST_INIT(&dn_cfg.schedlist);

	DN_LOCK_INIT();

	/*
	 * dn_task runs dummynet_task() on the dedicated "dummynet" taskqueue.
	 * NOTE(review): the taskqueue is created with M_NOWAIT and the result
	 * is not checked for NULL before taskqueue_start_threads().
	 */
	TASK_INIT(&dn_task, 0, dummynet_task, curvnet);
	dn_tq = taskqueue_create_fast("dummynet", M_NOWAIT,
	    taskqueue_thread_enqueue, &dn_tq);
	taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet");

	/* arm the dn_timeout callout so that dummynet() gets invoked */
	callout_init(&dn_timeout, CALLOUT_MPSAFE);
	callout_reset_on(&dn_timeout, 1, dummynet, NULL, 0);

	/* Initialize curr_time adjustment mechanics. */
	getmicrouptime(&dn_cfg.prev_t);
}

#ifdef KLD_MODULE
/*
 * Tear down the state set up by ip_dn_init(). Only compiled when
 * KLD_MODULE is defined.
 *
 * last	non-zero when the last instance is going away; in that case the
 *	ip_dn_ctl_ptr and ip_dn_io_ptr hooks are cleared as well.
 */
static void
ip_dn_destroy(int last)
{
	/* drain the periodic callout first */
	callout_drain(&dn_timeout);

	DN_BH_WLOCK();
	if (last) {
		ND("removing last instance\n");
		ip_dn_ctl_ptr = NULL;
		ip_dn_io_ptr = NULL;
	}

	/* flush the configuration while holding the write lock */
	dummynet_flush();
	DN_BH_WUNLOCK();
	/* drain dn_task, then release the taskqueue */
	taskqueue_drain(dn_tq, &dn_task);
	taskqueue_free(dn_tq);

	/*
	 * NOTE(review): ip_dn_init() leaves schedhash/fshash NULL until the
	 * first object is inserted; confirm dn_ht_free() tolerates NULL.
	 */
	dn_ht_free(dn_cfg.schedhash, 0);
	dn_ht_free(dn_cfg.fshash, 0);
	heap_free(&dn_cfg.evheap);

	DN_LOCK_DESTROY();
}
#endif /* KLD_MODULE */

/*
 * Module event handler for dummynet.
 *
 * MOD_LOAD	initialise dummynet and install the ctl/io hooks; fails
 *		with EEXIST if the io hook is already set.
 * MOD_UNLOAD	tear everything down when built as a KLD, otherwise
 *		refuse with EINVAL.
 * Any other event returns EOPNOTSUPP.
 */
static int
dummynet_modevent(module_t mod, int type, void *data)
{

	switch (type) {
	case MOD_LOAD:
		if (ip_dn_io_ptr) {
			printf("DUMMYNET already loaded\n");
			return EEXIST;
		}
		ip_dn_init();
		ip_dn_ctl_ptr = ip_dn_ctl;
		ip_dn_io_ptr = dummynet_io;
		return 0;

	case MOD_UNLOAD:
#if defined(KLD_MODULE)
		ip_dn_destroy(1 /* last */);
		return 0;
#else
		printf("dummynet statically compiled, cannot unload\n");
		return EINVAL;
#endif

	default:
		return EOPNOTSUPP;
	}
}

/* modevent helpers for the modules */

/*
 * Register scheduler algorithm 'd' in dn_cfg.schedlist.
 * Returns 0 when the algorithm has been inserted, 1 on error (NULL
 * descriptor, missing enqueue/dequeue, or a scheduler with the same
 * name already present).
 */
static int
load_dn_sched(struct dn_alg *d)
{
	struct dn_alg *cur;
	int dup;

	if (d == NULL)
		return 1; /* error */
	ip_dn_init();	/* just in case, we need the lock */

	/* enqueue and dequeue are mandatory */
	if (d->enqueue == NULL || d->dequeue == NULL) {
		D("missing enqueue or dequeue for %s", d->name);
		return 1;
	}

	/* look for a scheduler with the same name, insert if none */
	DN_BH_WLOCK();
	SLIST_FOREACH(cur, &dn_cfg.schedlist, next) {
		if (strcmp(cur->name, d->name) != 0)
			continue;
		D("%s already loaded", d->name);
		break; /* scheduler already exists */
	}
	dup = (cur != NULL);
	if (!dup)
		SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next);
	DN_BH_WUNLOCK();
	D("dn_sched %s %sloaded", d->name, dup ? "not ":"");
	return dup;
}

static int
unload_dn_sched(struct dn_alg *s)
{
	struct dn_alg *tmp, *r;
	int err = EINVAL;

	ND("called for %s", s->name);

	DN_BH_WLOCK();
	SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) {
		if (strcmp(s->name, r->name) != 0)
			continue;
		ND("ref_count = %d", r->ref_count);
		err = (r->ref_count != 0) ? EBUSY : 0;
		if (err == 0)
			SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next);
		break;
	}
	DN_BH_WUNLOCK();
	D("dn_sched %s %sunloaded", s->name, err ? "not ":"");
	return err;
}

/*
 * Module event handler shared by the scheduler modules: 'arg' is the
 * struct dn_alg descriptor to register (MOD_LOAD) or unregister
 * (MOD_UNLOAD). Other events return EINVAL.
 */
int
dn_sched_modevent(module_t mod, int cmd, void *arg)
{
	struct dn_alg *alg = arg;

	switch (cmd) {
	case MOD_LOAD:
		return load_dn_sched(alg);
	case MOD_UNLOAD:
		return unload_dn_sched(alg);
	default:
		return EINVAL;
	}
}

/* Module descriptor: name, event handler, no extra argument. */
static moduledata_t dummynet_mod = {
	"dummynet", dummynet_modevent, NULL
};

/* Subsystem and order used when registering the module. */
#define	DN_SI_SUB	SI_SUB_PROTO_IFATTACHDOMAIN
#define	DN_MODEV_ORD	(SI_ORDER_ANY - 128) /* after ipfw */
DECLARE_MODULE(dummynet, dummynet_mod, DN_SI_SUB, DN_MODEV_ORD);
/* dummynet depends on the ipfw module and declares itself as version 3 */
MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
MODULE_VERSION(dummynet, 3);

/*
 * Starting up. Done in order after dummynet_modevent() has been called.
 * VNET_SYSINIT is also called for each existing vnet and each new vnet.
 */
//VNET_SYSINIT(vnet_dn_init, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_init, NULL);

/*
 * Closing up shop. Shutdown handlers are run in REVERSE ORDER, but still
 * after dummynet_modevent() has been called. Not called on reboot.
 * VNET_SYSUNINIT is also called for each exiting vnet as it exits,
 * or when the module is unloaded.
 */
//VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL);

/* end of file */


================================================
FILE: sys/netinet/ipfw/ip_fw2.c
================================================
/*-
 * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw2.c 200601 2009-12-16 10:48:40Z luigi $");

/*
 * The FreeBSD IP packet firewall, main file
 */

#include "opt_ipfw.h"
#include "opt_ipdivert.h"
#include "opt_inet.h"
#ifndef INET
#error "IPFIREWALL requires INET"
#endif /* INET */
#include "opt_inet6.h"
#include "opt_ipsec.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/condvar.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/jail.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/ucred.h>
#include <net/ethernet.h> /* for ETHERTYPE_IP */
#include <net/if.h>
#include <net/route.h>
#include <net/pf_mtag.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_fw.h>
#include <netinet/ipfw/ip_fw_private.h>
#include <netinet/ip_carp.h>
#include <netinet/pim.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/sctp.h>

#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#include <netinet6/scope6_var.h>
#include <netinet6/ip6_var.h>
#endif

#include <machine/in_cksum.h>	/* XXX for in_cksum */

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif

/*
 * static variables followed by global ones.
 * All ipfw global variables are here.
 */

/* ipfw_vnet_ready controls when we are open for business */
static VNET_DEFINE(int, ipfw_vnet_ready) = 0;
#define	V_ipfw_vnet_ready	VNET(ipfw_vnet_ready)

/* exported as sysctl "deny_unknown_exthdrs" when INET6 is defined */
static VNET_DEFINE(int, fw_deny_unknown_exthdrs);
#define	V_fw_deny_unknown_exthdrs	VNET(fw_deny_unknown_exthdrs)

/*
 * Compile-time default for the default rule: 1 when
 * IPFIREWALL_DEFAULT_TO_ACCEPT is defined, otherwise left zero-initialized.
 */
#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
static int default_to_accept = 1;
#else
static int default_to_accept;
#endif

/* rule number auto-increment step (sysctl "autoinc_step") */
VNET_DEFINE(int, autoinc_step);

/*
 * Each rule belongs to one of 32 different sets (0..31).
 * The variable set_disable contains one bit per set.
 * If the bit is set, all rules in the corresponding set
 * are disabled. Set RESVD_SET(31) is reserved for the default rule
 * and rules that are not deleted by the flush command,
 * and CANNOT be disabled.
 * Rules in set RESVD_SET can only be deleted individually.
 */
VNET_DEFINE(u_int32_t, set_disable);
#define	V_set_disable			VNET(set_disable)

/* logging knobs (sysctl "verbose" and "verbose_limit") */
VNET_DEFINE(int, fw_verbose);
/* counter for ipfw_log(NULL...) */
VNET_DEFINE(u_int64_t, norule_counter);
VNET_DEFINE(int, verbose_limit);

/* layer3_chain contains the list of rules for layer 3 */
VNET_DEFINE(struct ip_fw_chain, layer3_chain);

/*
 * Function-pointer hooks for NAT support.
 * NOTE(review): presumably filled in by a separate NAT module; only
 * ipfw_nat_ptr is explicitly initialized here.
 */
ipfw_nat_t *ipfw_nat_ptr = NULL;
struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
ipfw_nat_cfg_t *ipfw_nat_del_ptr;
ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;

/*
 * sysctl declarations for ipfw, compiled only when SYSCTL_NODE is defined.
 * dummy_def and dummy_tables_max hold the compile-time constants
 * IPFW_DEFAULT_RULE and IPFW_TABLES_MAX so they can be exported read-only.
 */
#ifdef SYSCTL_NODE
uint32_t dummy_def = IPFW_DEFAULT_RULE;
uint32_t dummy_tables_max = IPFW_TABLES_MAX;

SYSBEGIN(f3)

SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
    CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
    "Only do a single pass through ipfw when using dummynet(4)");
SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
    CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
    "Rule number auto-increment step");
SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose,
    CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0,
    "Log matches to ipfw rules");
SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
    CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
    "Set upper limit of matches of ipfw rules logged");
SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
    &dummy_def, 0,
    "The default/max possible rule number.");
SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD,
    &dummy_tables_max, 0,
    "The maximum number of tables.");
/* read-only sysctl; the value can also be set through the tunable below */
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
    &default_to_accept, 0,
    "Make the default rule accept all packets.");
TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept);
SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count,
    CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0,
    "Number of static rules");

#ifdef INET6
SYSCTL_DECL(_net_inet6_ip6);
SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs,
    CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0,
    "Deny packets with unknown IPv6 Extension Headers");
#endif /* INET6 */

SYSEND

#endif /* SYSCTL_NODE */


/*
 * Some macros used in the various matching options.
 * L3HDR takes a pointer to an IPv4 header and yields a pointer of type T
 * to whatever follows it: the header pointer is advanced by ip_hl
 * 32-bit words.
 * Other macros just cast void * into the appropriate type
 */
#define	L3HDR(T, ip)	((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
#define	TCP(p)		((struct tcphdr *)(p))
#define	SCTP(p)		((struct sctphdr *)(p))
#define	UDP(p)		((struct udphdr *)(p))
#define	ICMP(p)		((struct icmphdr *)(p))
#define	ICMP6(p)	((struct icmp6_hdr *)(p))

/*
 * Return 1 if the ICMP type of 'icmp' is selected in the bitmask stored
 * in cmd->d[0] (bit N set means type N matches), 0 otherwise.
 *
 * d[0] is 32 bits wide, so only types 0..31 can be represented. Types
 * at or above 32 never match; shifting by such a count would be
 * undefined behaviour and could alias a low bit of the mask.
 */
static __inline int
icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd)
{
	int type = icmp->icmp_type;

	if (type > ICMP_MAXTYPE || type >= 32)
		return 0;
	return ((cmd->d[0] & (1U << type)) != 0);
}

/* Bitmask of the ICMP types that are considered queries. */
#define TT	( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \
    (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) )

/*
 * Return 1 if 'icmp' carries one of the query types listed in TT,
 * 0 otherwise.
 *
 * Types at or above 32 cannot be in the mask and never match; shifting
 * by such a count would be undefined behaviour.
 */
static int
is_icmp_query(struct icmphdr *icmp)
{
	int type = icmp->icmp_type;

	if (type > ICMP_MAXTYPE || type >= 32)
		return 0;
	return ((TT & (1U << type)) != 0);
}
#undef TT

/*
 * The following checks use two arrays of 8 or 16 bits to store the
 * bits that we want set or clear, respectively. They are in the
 * low and high half of cmd->arg1 or cmd->d[0].
 *
 * We scan options and store the bits we find set. We succeed if
 *
 *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
 *
 * The code is sometimes optimized not to store additional variables.
 */

/*
 * Check 'bits' (the flags found in the packet) against the two masks
 * packed in cmd->arg1: the low byte lists the bits that must be set,
 * the high byte the bits that must be clear.
 * Returns 1 when both conditions hold, 0 otherwise.
 */
static int
flags_match(ipfw_insn *cmd, u_int8_t bits)
{
	const u_int8_t absent = (u_int8_t)~bits; /* bits missing in packet */
	const u_char must_set = cmd->arg1 & 0xff;
	const u_char must_clear = (cmd->arg1 >> 8) & 0xff;

	/* every bit we want set must be present ... */
	if ((must_set & absent) != 0)
		return 0;
	/* ... and every bit we want clear must be absent */
	return ((must_clear & absent) == must_clear);
}

/*
 * Walk the IPv4 options of 'ip', collect the IP_FW_IPOPT_* bits for the
 * options found (LSRR, SSRR, RR, TS) and check them with flags_match().
 * Returns 0 right away when an option length is invalid or truncated.
 */
static int
ipopts_match(struct ip *ip, ipfw_insn *cmd)
{
	u_char *optp = (u_char *)(ip + 1);
	int left = (ip->ip_hl << 2) - sizeof (struct ip);
	int found = 0;
	int optlen;

	while (left > 0) {
		int opt = optp[IPOPT_OPTVAL];

		if (opt == IPOPT_EOL)
			break;
		if (opt == IPOPT_NOP) {
			optlen = 1;
		} else {
			optlen = optp[IPOPT_OLEN];
			if (optlen <= 0 || optlen > left)
				return 0; /* invalid or truncated */
		}

		if (opt == IPOPT_LSRR)
			found |= IP_FW_IPOPT_LSRR;
		else if (opt == IPOPT_SSRR)
			found |= IP_FW_IPOPT_SSRR;
		else if (opt == IPOPT_RR)
			found |= IP_FW_IPOPT_RR;
		else if (opt == IPOPT_TS)
			found |= IP_FW_IPOPT_TS;

		left -= optlen;
		optp += optlen;
	}
	return (flags_match(cmd, found));
}

/*
 * Walk the TCP options of 'tcp', collect the IP_FW_TCPOPT_* bits for the
 * options found (MSS, window scale, SACK, timestamp) and check them
 * with flags_match(). The scan stops at EOL or at a zero option length.
 */
static int
tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd)
{
	u_char *optp = (u_char *)(tcp + 1);
	int left = (tcp->th_off << 2) - sizeof(struct tcphdr);
	int found = 0;
	int optlen;

	while (left > 0) {
		int opt = optp[0];

		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP) {
			optlen = 1;
		} else {
			optlen = optp[1];
			if (optlen <= 0)
				break;
		}

		if (opt == TCPOPT_MAXSEG)
			found |= IP_FW_TCPOPT_MSS;
		else if (opt == TCPOPT_WINDOW)
			found |= IP_FW_TCPOPT_WINDOW;
		else if (opt == TCPOPT_SACK_PERMITTED || opt == TCPOPT_SACK)
			found |= IP_FW_TCPOPT_SACK;
		else if (opt == TCPOPT_TIMESTAMP)
			found |= IP_FW_TCPOPT_TS;

		left -= optlen;
		optp += optlen;
	}
	return (flags_match(cmd, found));
}

/*
 * Match interface 'ifp' against the instruction 'cmd'.
 * When cmd->name is not empty the match is by name (glob pattern if
 * cmd->p.glob is set, plain comparison otherwise); with an empty name
 * the match is by IPv4 address, which is only compiled on FreeBSD.
 * Returns 1 on match, 0 otherwise (including ifp == NULL).
 */
static int
iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
{
	int matched = 0;

	if (ifp == NULL)	/* no iface with this packet, match fails */
		return 0;

	if (cmd->name[0] != '\0') {
		/* match by name */
		if (cmd->p.glob)
			return (fnmatch(cmd->name, ifp->if_xname, 0) == 0);
		return (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0);
	}

#ifdef __FreeBSD__	/* and OSX too ? */
	{
		struct ifaddr *ifa;

		/* match by address: scan the AF_INET addresses of ifp */
		if_addr_rlock(ifp);
		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			if (ifa->ifa_addr->sa_family != AF_INET)
				continue;
			if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
			    (ifa->ifa_addr))->sin_addr.s_addr) {
				matched = 1;
				break;
			}
		}
		if_addr_runlock(ifp);
	}
#endif /* __FreeBSD__ */
	return matched;
}

/*
 * The verify_path function checks if a route to the src exists and
 * if it is reachable via ifp (when provided).
 * 
 * The 'verrevpath' option checks that the interface that an IP packet
 * arrives on is the same interface that traffic destined for the
 * packet's source address would be routed out of.
 * The 'versrcreach' option just checks that the source address is
 * reachable via any route (except default) in the routing table.
 * These two are a measure to block forged packets. This is also
 * commonly known as "anti-spoofing" or Unicast Reverse Path
 * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs
 * is purposely reminiscent of the Cisco IOS command,
 *
 *   ip verify unicast reverse-path
 *   ip verify unicast source reachable-via any
 *
 * which implements the same functionality. But note that the syntax
 * is misleading, and the check may be performed on all IP packets
 * whether unicast, multicast, or broadcast.
 */
/*
 * Look up a route to 'src' in routing table 'fib' and decide whether
 * the source is acceptable. Returns 1 if a valid route is found, 0
 * otherwise. On systems other than FreeBSD the check always fails.
 */
static int
verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
{
#ifdef __FreeBSD__
	struct route ro;
	struct sockaddr_in *sin;
	int valid;

	bzero(&ro, sizeof(ro));

	sin = (struct sockaddr_in *)&(ro.ro_dst);
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = src;
	in_rtalloc_ign(&ro, 0, fib);

	if (ro.ro_rt == NULL)
		return 0;

	if (ifp != NULL) {
		/*
		 * An interface was given: it must be the one the route
		 * points to. rt->rt_ifa->ifa_ifp is used instead of
		 * rt->rt_ifp so that packets injected back by
		 * if_simloop() pass: with useloopback == 1 the route for
		 * our own address may go via lo0.
		 */
		valid = (ro.ro_rt->rt_ifa->ifa_ifp == ifp);
	} else if (satosin(rt_key(ro.ro_rt))->sin_addr.s_addr ==
	    INADDR_ANY) {
		/* no interface given: the default route does not count */
		valid = 0;
	} else if (ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
		/* ... and neither does a blackhole/reject route */
		valid = 0;
	} else {
		valid = 1;	/* found valid route */
	}

	RTFREE(ro.ro_rt);
	return valid;
#else
	return 0;
#endif /* __FreeBSD__ */
}

#ifdef INET6
/*
 * ipv6 specific rules here...
 */
/*
 * Return 1 if ICMPv6 'type' is selected in the bitmap stored in cmd->d[]
 * (32 types per word), 0 otherwise. Types above ICMP6_MAXTYPE never match.
 */
static __inline int
icmp6type_match (int type, ipfw_insn_u32 *cmd)
{
	if (type > ICMP6_MAXTYPE)
		return 0;
	return ((cmd->d[type/32] & (1<<(type%32))) != 0);
}

/*
 * Return 1 if 'curr_flow' equals one of the flow ids stored in cmd->d[],
 * 0 otherwise.
 * NOTE(review): the bound is inclusive, so arg1 + 1 entries of d[] are
 * examined -- confirm this matches the way arg1 is filled in.
 */
static int
flow6id_match( int curr_flow, ipfw_insn_u32 *cmd )
{
	int idx = 0;

	while (idx <= cmd->o.arg1) {
		if (curr_flow == cmd->d[idx])
			return 1;
		++idx;
	}
	return 0;
}

/* support for IP6_*_ME opcodes */
static int
search_ip6_addr_net (struct in6_addr * ip6_addr)
{
	struct ifnet *mdc;
	struct ifaddr *mdc2;
	struct in6_ifaddr *fdm;
	struct in6_addr copia;

	TAILQ_FOREACH(mdc, &V_ifnet, if_link) {
		if_addr_rlock(mdc);
		TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) {
			if (mdc2->ifa_addr->sa_family == AF_INET6) {
				fdm = (struct in6_ifaddr *)mdc2;
				copia = fdm->ia_addr.sin6_addr;
				/* need for leaving scope_id in the sock_addr */
				in6_clearscope(&copia);
				if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) {
					if_addr_runlock(mdc);
					return 1;
				}
			}
		}
		if_addr_runlock(mdc);
	}
	return 0;
}

/*
 * IPv6 counterpart of verify_path(): look up a route to 'src' and
 * decide whether the source is acceptable. Returns 1 if a valid route
 * is found, 0 otherwise.
 */
static int
verify_path6(struct in6_addr *src, struct ifnet *ifp)
{
	struct route_in6 ro;
	struct sockaddr_in6 *sin6;
	int valid;

	bzero(&ro, sizeof(ro));

	sin6 = (struct sockaddr_in6 * )&(ro.ro_dst);
	sin6->sin6_family = AF_INET6;
	sin6->sin6_len = sizeof(*sin6);
	sin6->sin6_addr = *src;
	/* XXX MRT 0 for ipv6 at this time */
	rtalloc_ign((struct route *)&ro, 0);

	if (ro.ro_rt == NULL)
		return 0;

	if (ifp != NULL) {
		/*
		 * An interface was given: it must be the one the route
		 * points to. rt->rt_ifa->ifa_ifp is used instead of
		 * rt->rt_ifp to support packets sent to an address of
		 * our own (the former is the first argument of
		 * if_simloop(), the latter is lo0).
		 */
		valid = (ro.ro_rt->rt_ifa->ifa_ifp == ifp);
	} else if (IN6_IS_ADDR_UNSPECIFIED(
	    &satosin6(rt_key(ro.ro_rt))->sin6_addr)) {
		/* no interface given: the default route does not count */
		valid = 0;
	} else if (ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
		/* ... and neither does a blackhole/reject route */
		valid = 0;
	} else {
		valid = 1;	/* found valid route */
	}

	RTFREE(ro.ro_rt);
	return valid;
}

/*
 * Return 1 if 'icmp6_type' is one of the ICMPv6 query types (echo
 * request, membership query, WRU request, FQDN query, NI query),
 * 0 otherwise.
 */
static int
is_icmp6_query(int icmp6_type)
{
	if (icmp6_type > ICMP6_MAXTYPE)
		return (0);

	return (icmp6_type == ICMP6_ECHO_REQUEST ||
	    icmp6_type == ICMP6_MEMBERSHIP_QUERY ||
	    icmp6_type == ICMP6_WRUREQUEST ||
	    icmp6_type == ICMP6_FQDN_QUERY ||
	    icmp6_type == ICMP6_NI_QUERY);
}

/*
 * Send a reject for an IPv6 packet. args->m is always NULL on return:
 * the mbuf is either handed to icmp6_error() or released with FREE_PKT().
 *
 * code != ICMP6_UNREACH_RST	send an ICMPv6 destination unreachable.
 * code == ICMP6_UNREACH_RST	for TCP segments without RST set, reply
 *				with a RST; the packet is then freed.
 */
static void
send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6)
{
	struct mbuf *m = args->m;

	if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */
#if 0
		/*
		 * Unlike above, the mbufs need to line up with the ip6 hdr,
		 * as the contents are read. We need to m_adj() the
		 * needed amount.
		 * The mbuf will however be thrown away so we can adjust it.
		 * Remember we did an m_pullup on it already so we
		 * can make some assumptions about contiguousness.
		 */
		if (args->L3offset)
			m_adj(m, args->L3offset);
#endif
		icmp6_error(m, ICMP6_DST_UNREACH, code, 0);
	} else {
		if (args->f_id.proto == IPPROTO_TCP) {
			struct tcphdr *tcp;

			tcp = (struct tcphdr *)((char *)ip6 + hlen);
			/* never answer a RST with another RST */
			if ((tcp->th_flags & TH_RST) == 0) {
				struct mbuf *reply;

				reply = ipfw_send_pkt(args->m, &(args->f_id),
				    ntohl(tcp->th_seq), ntohl(tcp->th_ack),
				    tcp->th_flags | TH_RST);
				if (reply != NULL)
					ip6_output(reply, NULL, NULL, 0, NULL,
					    NULL, NULL);
			}
		}
		FREE_PKT(m);
	}

	args->m = NULL;
}

#endif /* INET6 */


/*
 * Send a reject message for an IPv4 packet, consuming the mbuf passed
 * in args->m (args->m is NULL on return).
 *
 * code != ICMP_REJECT_RST	send an ICMP unreachable with that code.
 * code == ICMP_REJECT_RST	for TCP segments without RST set, reply
 *				with a RST; the packet is then freed.
 */
static void
send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip)
{

#if 0
	/*
	 * XXX When ip is not guaranteed to be at mtod() we will
	 * need to account for this.
	 * The mbuf will however be thrown away so we can adjust it.
	 * Remember we did an m_pullup on it already so we
	 * can make some assumptions about contiguousness.
	 */
	if (args->L3offset)
		m_adj(m, args->L3offset);
#endif
	if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
		/* We need the IP header in host order for icmp_error(). */
		SET_HOST_IPLEN(ip);
		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
	} else {
		if (args->f_id.proto == IPPROTO_TCP) {
			struct tcphdr *const tcp =
			    L3HDR(struct tcphdr, mtod(args->m, struct ip *));

			/* never answer a RST with another RST */
			if ((tcp->th_flags & TH_RST) == 0) {
				struct mbuf *reply;

				reply = ipfw_send_pkt(args->m, &(args->f_id),
				    ntohl(tcp->th_seq), ntohl(tcp->th_ack),
				    tcp->th_flags | TH_RST);
				if (reply != NULL)
					ip_output(reply, NULL, NULL, 0, NULL,
					    NULL);
			}
		}
		FREE_PKT(args->m);
	}
	args->m = NULL;
}

/*
 * Support for uid/gid/jail lookup. These tests are expensive
 * (because we may need to look into the list of active sockets)
 * so we cache the results. ugid_lookupp is 0 if we have not
 * yet done a lookup, 1 if we succeeded, and -1 if we tried
 * and failed. The function always returns the match value.
 * We could actually spare the variable and use *uc, setting
 * it to '(void *)check_uidgid if we have no info, NULL if
 * we tried and failed, or any other value if successful.
 *
 * insn		the O_UID / O_GID / O_JAIL instruction; the id to compare
 *		against is in insn->d[0].
 * proto	only IPPROTO_TCP and IPPROTO_UDP are looked up; anything
 *		else is a no-match.
 * oif		selects the argument order of the pcb lookup (see below).
 * uc		cached credential; set (with a reference taken through
 *		crhold()) the first time a lookup succeeds.
 * inp		optional pcb supplied by the caller.
 *
 * On systems other than FreeBSD everything is delegated to cred_check().
 */
static int
check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
    struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
    u_int16_t src_port, int *ugid_lookupp,
    struct ucred **uc, struct inpcb *inp)
{
#ifndef __FreeBSD__
	return cred_check(insn, proto, oif,
	    dst_ip, dst_port, src_ip, src_port,
	    (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb);
#else  /* FreeBSD */
	struct inpcbinfo *pi;
	int wildcard;
	struct inpcb *pcb;
	int match;

	/*
	 * Check to see if the UDP or TCP stack supplied us with
	 * the PCB. If so, rather then holding a lock and looking
	 * up the PCB, we can use the one that was supplied.
	 */
	if (inp && *ugid_lookupp == 0) {
		INP_LOCK_ASSERT(inp);
		if (inp->inp_socket != NULL) {
			*uc = crhold(inp->inp_cred);
			*ugid_lookupp = 1;
		} else
			*ugid_lookupp = -1;
	}
	/*
	 * If we have already been here and the packet has no
	 * PCB entry associated with it, then we can safely
	 * assume that this is a no match.
	 */
	if (*ugid_lookupp == -1)
		return (0);
	/* pick the pcb table; UDP lookups also accept wildcard matches */
	if (proto == IPPROTO_TCP) {
		wildcard = 0;
		pi = &V_tcbinfo;
	} else if (proto == IPPROTO_UDP) {
		wildcard = INPLOOKUP_WILDCARD;
		pi = &V_udbinfo;
	} else
		return 0;
	match = 0;
	if (*ugid_lookupp == 0) {
		/*
		 * No cached result yet: look the pcb up. With an output
		 * interface the destination pair is passed first, otherwise
		 * the source pair is.
		 */
		INP_INFO_RLOCK(pi);
		pcb =  (oif) ?
			in_pcblookup_hash(pi,
				dst_ip, htons(dst_port),
				src_ip, htons(src_port),
				wildcard, oif) :
			in_pcblookup_hash(pi,
				src_ip, htons(src_port),
				dst_ip, htons(dst_port),
				wildcard, NULL);
		if (pcb != NULL) {
			*uc = crhold(pcb->inp_cred);
			*ugid_lookupp = 1;
		}
		INP_INFO_RUNLOCK(pi);
		if (*ugid_lookupp == 0) {
			/*
			 * We tried and failed, set the variable to -1
			 * so we will not try again on this packet.
			 */
			*ugid_lookupp = -1;
			return (0);
		}
	} 
	/* compare the cached credential with the id in the instruction */
	if (insn->o.opcode == O_UID)
		match = ((*uc)->cr_uid == (uid_t)insn->d[0]);
	else if (insn->o.opcode == O_GID)
		match = groupmember((gid_t)insn->d[0], *uc);
	else if (insn->o.opcode == O_JAIL)
		match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]);
	return match;
#endif /* __FreeBSD__ */
}

/*
 * Record in args->rule where processing should resume, i.e. at the rule
 * following the matching one found at position 'slot' of chain->map.
 * The slot is exact; rule_id is a guess (matching id + 1) that relies
 * on ids being assigned sequentially.
 */
static inline void
set_match(struct ip_fw_args *args, int slot,
	struct ip_fw_chain *chain)
{
	/* values copied or derived from the matching rule */
	args->rule.rulenum = chain->map[slot]->rulenum;
	args->rule.rule_id = chain->map[slot]->id + 1;

	/* position in the chain; slot 0 is reserved as a marker */
	args->rule.slot = 1 + slot;
	args->rule.chain_id = chain->id;
}

/*
 * The main check routine for the firewall.
 *
 * All arguments are in args so we can modify them and return them
 * back to the caller.
 *
 * Parameters:
 *
 *	args->m	(in/out) The packet; we set to NULL when/if we nuke it.
 *		Starts with the IP header.
 *	args->eh (in)	Mac header if present, NULL for layer3 packet.
 *	args->L3offset	Number of bytes bypassed if we came from L2.
 *			e.g. often sizeof(eh)  ** NOTYET **
 *	args->oif	Outgoing interface, NULL if packet is incoming.
 *		The incoming interface is in the mbuf. (in)
 *	args->divert_rule (in/out)
 *		Skip up to the first rule past this rule number;
 *		upon return, non-zero port number for divert or tee.
 *
 *	args->rule	Pointer to the last matching rule (in/out)
 *	args->next_hop	Socket we are forwarding to (out).
 *	args->f_id	Addresses grabbed from the packet (out)
 * 	args->rule.info	a cookie depending on rule action
 *
 * Return value:
 *
 *	IP_FW_PASS	the packet must be accepted
 *	IP_FW_DENY	the packet must be dropped
 *	IP_FW_DIVERT	divert packet, port in m_tag
 *	IP_FW_TEE	tee packet, port in m_tag
 *	IP_FW_DUMMYNET	to dummynet, pipe in args->cookie
 *	IP_FW_NETGRAPH	into netgraph, cookie args->cookie
 *		args->rule contains the matching rule,
 *		args->rule.info has additional information.
 *
 */
int
ipfw_chk(struct ip_fw_args *args)
{

	/*
	 * Local variables holding state while processing a packet:
	 *
	 * IMPORTANT NOTE: to speed up the processing of rules, there
	 * are some assumption on the values of the variables, which
	 * are documented here. Should you change them, please check
	 * the implementation of the various instructions to make sure
	 * that they still work.
	 *
	 * args->eh	The MAC header. It is non-null for a layer2
	 *	packet, it is NULL for a layer-3 packet.
	 * **notyet**
	 * args->L3offset Offset in the packet to the L3 (IP or equiv.) header.
	 *
	 * m | args->m	Pointer to the mbuf, as received from the caller.
	 *	It may change if ipfw_chk() does an m_pullup, or if it
	 *	consumes the packet because it calls send_reject().
	 *	XXX This has to change, so that ipfw_chk() never modifies
	 *	or consumes the buffer.
	 * ip	is the beginning of the ip(4 or 6) header.
	 *	Calculated by adding the L3offset to the start of data.
	 *	(Until we start using L3offset, the packet is
	 *	supposed to start with the ip header).
	 */
	struct mbuf *m = args->m;
	struct ip *ip = mtod(m, struct ip *);

	/*
	 * For rules which contain uid/gid or jail constraints, cache
	 * a copy of the users credentials after the pcb lookup has been
	 * executed. This will speed up the processing of rules with
	 * these types of constraints, as well as decrease contention
	 * on pcb related locks.
	 */
#ifndef __FreeBSD__
	struct bsd_ucred ucred_cache;
#else
	struct ucred *ucred_cache = NULL;
#endif
	int ucred_lookup = 0;

	/*
	 * oif | args->oif	If NULL, ipfw_chk has been called on the
	 *	inbound path (ether_input, ip_input).
	 *	If non-NULL, ipfw_chk has been called on the outbound path
	 *	(ether_output, ip_output).
	 */
	struct ifnet *oif = args->oif;

	int f_pos = 0;		/* index of current rule in the array */
	int retval = 0;

	/*
	 * hlen	The length of the IP header.
	 */
	u_int hlen = 0;		/* hlen >0 means we have an IP pkt */

	/*
	 * offset	The offset of a fragment. offset != 0 means that
	 *	we have a fragment at this offset of an IPv4 packet.
	 *	offset == 0 means that (if this is an IPv4 packet)
	 *	this is the first or only fragment.
	 *	For IPv6 offset == 0 means there is no Fragment Header. 
	 *	If offset != 0 for IPv6 always use correct mask to
	 *	get the correct offset because we add IP6F_MORE_FRAG
	 *	to be able to dectect the first fragment which would
	 *	otherwise have offset = 0.
	 */
	u_short offset = 0;

	/*
	 * Local copies of addresses. They are only valid if we have
	 * an IP packet.
	 *
	 * proto	The protocol. Set to 0 for non-ip packets,
	 *	or to the protocol read from the packet otherwise.
	 *	proto != 0 means that we have an IPv4 packet.
	 *
	 * src_port, dst_port	port numbers, in HOST format. Only
	 *	valid for TCP and UDP packets.
	 *
	 * src_ip, dst_ip	ip addresses, in NETWORK format.
	 *	Only valid for IPv4 packets.
	 */
	uint8_t proto;
	uint16_t src_port = 0, dst_port = 0;	/* NOTE: host format	*/
	struct in_addr src_ip, dst_ip;		/* NOTE: network format	*/
	uint16_t iplen=0;
	int pktlen;
	uint16_t	etype = 0;	/* Host order stored ether type */

	/*
	 * dyn_dir = MATCH_UNKNOWN when rules unchecked,
	 * 	MATCH_NONE when checked and not matched (q = NULL),
	 *	MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL)
	 */
	int dyn_dir = MATCH_UNKNOWN;
	ipfw_dyn_rule *q = NULL;
	struct ip_fw_chain *chain = &V_layer3_chain;

	/*
	 * We store in ulp a pointer to the upper layer protocol header.
	 * In the ipv4 case this is easy to determine from the header,
	 * but for ipv6 we might have some additional headers in the middle.
	 * ulp is NULL if not found.
	 */
	void *ulp = NULL;		/* upper layer protocol pointer. */

	/* XXX ipv6 variables */
	int is_ipv6 = 0;
	uint8_t	icmp6_type = 0;
	uint16_t ext_hd = 0;	/* bits vector for extension header filtering */
	/* end of ipv6 variables */

	int is_ipv4 = 0;

	int done = 0;		/* flag to exit the outer loop */

	if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready))
		return (IP_FW_PASS);	/* accept */

	dst_ip.s_addr = 0;		/* make sure it is initialized */
	src_ip.s_addr = 0;		/* make sure it is initialized */
	pktlen = m->m_pkthdr.len;
	args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */
	proto = args->f_id.proto = 0;	/* mark f_id invalid */
		/* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */

/*
 * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
 * then it sets p to point at the offset "len" in the mbuf. WARNING: the
 * pointer might become stale after other pullups (but we never use it
 * this way).
 */
#define PULLUP_TO(_len, p, T)					\
do {								\
	int x = (_len) + sizeof(T);				\
	if ((m)->m_len < x) {					\
		args->m = m = m_pullup(m, x);			\
		if (m == NULL)					\
			goto pullup_failed;			\
	}							\
	p = (mtod(m, char *) + (_len));				\
} while (0)

	/*
	 * if we have an ether header,
	 */
	if (args->eh)
		etype = ntohs(args->eh->ether_type);

	/* Identify IP packets and fill up variables. */
	if (pktlen >= sizeof(struct ip6_hdr) &&
	    (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) {
		struct ip6_hdr *ip6 = (struct ip6_hdr *)ip;
		is_ipv6 = 1;
		args->f_id.addr_type = 6;
		hlen = sizeof(struct ip6_hdr);
		proto = ip6->ip6_nxt;

		/* Search extension headers to find upper layer protocols */
		while (ulp == NULL) {
			switch (proto) {
			case IPPROTO_ICMPV6:
				PULLUP_TO(hlen, ulp, struct icmp6_hdr);
				icmp6_type = ICMP6(ulp)->icmp6_type;
				break;

			case IPPROTO_TCP:
				PULLUP_TO(hlen, ulp, struct tcphdr);
				dst_port = TCP(ulp)->th_dport;
				src_port = TCP(ulp)->th_sport;
				/* save flags for dynamic rules */
				args->f_id._flags = TCP(ulp)->th_flags;
				break;

			case IPPROTO_SCTP:
				PULLUP_TO(hlen, ulp, struct sctphdr);
				src_port = SCTP(ulp)->src_port;
				dst_port = SCTP(ulp)->dest_port;
				break;

			case IPPROTO_UDP:
				PULLUP_TO(hlen, ulp, struct udphdr);
				dst_port = UDP(ulp)->uh_dport;
				src_port = UDP(ulp)->uh_sport;
				break;

			case IPPROTO_HOPOPTS:	/* RFC 2460 */
				PULLUP_TO(hlen, ulp, struct ip6_hbh);
				ext_hd |= EXT_HOPOPTS;
				hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
				proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
				ulp = NULL;
				break;

			case IPPROTO_ROUTING:	/* RFC 2460 */
				PULLUP_TO(hlen, ulp, struct ip6_rthdr);
				switch (((struct ip6_rthdr *)ulp)->ip6r_type) {
				case 0:
					ext_hd |= EXT_RTHDR0;
					break;
				case 2:
					ext_hd |= EXT_RTHDR2;
					break;
				default:
					printf("IPFW2: IPV6 - Unknown Routing "
					    "Header type(%d)\n",
					    ((struct ip6_rthdr *)ulp)->ip6r_type);
					if (V_fw_deny_unknown_exthdrs)
					    return (IP_FW_DENY);
					break;
				}
				ext_hd |= EXT_ROUTING;
				hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
				proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
				ulp = NULL;
				break;

			case IPPROTO_FRAGMENT:	/* RFC 2460 */
				PULLUP_TO(hlen, ulp, struct ip6_frag);
				ext_hd |= EXT_FRAGMENT;
				hlen += sizeof (struct ip6_frag);
				proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
				offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
					IP6F_OFF_MASK;
				/* Add IP6F_MORE_FRAG for offset of first
				 * fragment to be != 0. */
				offset |= ((struct ip6_frag *)ulp)->ip6f_offlg &
					IP6F_MORE_FRAG;
				if (offset == 0) {
					printf("IPFW2: IPV6 - Invalid Fragment "
					    "Header\n");
					if (V_fw_deny_unknown_exthdrs)
					    return (IP_FW_DENY);
					break;
				}
				args->f_id.extra =
				    ntohl(((struct ip6_frag *)ulp)->ip6f_ident);
				ulp = NULL;
				break;

			case IPPROTO_DSTOPTS:	/* RFC 2460 */
				PULLUP_TO(hlen, ulp, struct ip6_hbh);
				ext_hd |= EXT_DSTOPTS;
				hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
				proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
				ulp = NULL;
				break;

			case IPPROTO_AH:	/* RFC 2402 */
				PULLUP_TO(hlen, ulp, struct ip6_ext);
				ext_hd |= EXT_AH;
				hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
				proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
				ulp = NULL;
				break;

			case IPPROTO_ESP:	/* RFC 2406 */
				PULLUP_TO(hlen, ulp, uint32_t);	/* SPI, Seq# */
				/* Anything past Seq# is variable length and
				 * data past this ext. header is encrypted. */
				ext_hd |= EXT_ESP;
				break;

			case IPPROTO_NONE:	/* RFC 2460 */
				/*
				 * Packet ends here, and IPv6 header has
				 * already been pulled up. If ip6e_len!=0
				 * then octets must be ignored.
				 */
				ulp = ip; /* non-NULL to get out of loop. */
				break;

			case IPPROTO_OSPFIGP:
				/* XXX OSPF header check? */
				PULLUP_TO(hlen, ulp, struct ip6_ext);
				break;

			case IPPROTO_PIM:
				/* XXX PIM header check? */
				PULLUP_TO(hlen, ulp, struct pim);
				break;

			case IPPROTO_CARP:
				PULLUP_TO(hlen, ulp, struct carp_header);
				if (((struct carp_header *)ulp)->carp_version !=
				    CARP_VERSION) 
					return (IP_FW_DENY);
				if (((struct carp_header *)ulp)->carp_type !=
				    CARP_ADVERTISEMENT) 
					return (IP_FW_DENY);
				break;

			case IPPROTO_IPV6:	/* RFC 2893 */
				PULLUP_TO(hlen, ulp, struct ip6_hdr);
				break;

			case IPPROTO_IPV4:	/* RFC 2893 */
				PULLUP_TO(hlen, ulp, struct ip);
				break;

			default:
				printf("IPFW2: IPV6 - Unknown Extension "
				    "Header(%d), ext_hd=%x\n", proto, ext_hd);
				if (V_fw_deny_unknown_exthdrs)
				    return (IP_FW_DENY);
				PULLUP_TO(hlen, ulp, struct ip6_ext);
				break;
			} /*switch */
		}
		ip = mtod(m, struct ip *);
		ip6 = (struct ip6_hdr *)ip;
		args->f_id.src_ip6 = ip6->ip6_src;
		args->f_id.dst_ip6 = ip6->ip6_dst;
		args->f_id.src_ip = 0;
		args->f_id.dst_ip = 0;
		args->f_id.flow_id6 = ntohl(ip6->ip6_flow);
	} else if (pktlen >= sizeof(struct ip) &&
	    (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) {
	    	is_ipv4 = 1;
		hlen = ip->ip_hl << 2;
		args->f_id.addr_type = 4;

		/*
		 * Collect parameters into local variables for faster matching.
		 */
		proto = ip->ip_p;
		src_ip = ip->ip_src;
		dst_ip = ip->ip_dst;
		offset = ntohs(ip->ip_off) & IP_OFFMASK;
		iplen = ntohs(ip->ip_len);
		pktlen = iplen < pktlen ? iplen : pktlen;

		if (offset == 0) {
			switch (proto) {
			case IPPROTO_TCP:
				PULLUP_TO(hlen, ulp, struct tcphdr);
				dst_port = TCP(ulp)->th_dport;
				src_port = TCP(ulp)->th_sport;
				/* save flags for dynamic rules */
				args->f_id._flags = TCP(ulp)->th_flags;
				break;

			case IPPROTO_UDP:
				PULLUP_TO(hlen, ulp, struct udphdr);
				dst_port = UDP(ulp)->uh_dport;
				src_port = UDP(ulp)->uh_sport;
				break;

			case IPPROTO_ICMP:
				PULLUP_TO(hlen, ulp, struct icmphdr);
				//args->f_id.flags = ICMP(ulp)->icmp_type;
				break;

			default:
				break;
			}
		}

		ip = mtod(m, struct ip *);
		args->f_id.src_ip = ntohl(src_ip.s_addr);
		args->f_id.dst_ip = ntohl(dst_ip.s_addr);
	}
#undef PULLUP_TO
	if (proto) { /* we may have port numbers, store them */
		args->f_id.proto = proto;
		args->f_id.src_port = src_port = ntohs(src_port);
		args->f_id.dst_port = dst_port = ntohs(dst_port);
	}

	IPFW_RLOCK(chain);
	if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */
		IPFW_RUNLOCK(chain);
		return (IP_FW_PASS);	/* accept */
	}
	if (args->rule.slot) {
		/*
		 * Packet has already been tagged as a result of a previous
		 * match on rule args->rule aka args->rule_id (PIPE, QUEUE,
		 * REASS, NETGRAPH, DIVERT/TEE...)
		 * Validate the slot and continue from the next one
		 * if still present, otherwise do a lookup.
		 */
		f_pos = (args->rule.chain_id == chain->id) ?
		    args->rule.slot :
		    ipfw_find_rule(chain, args->rule.rulenum,
			args->rule.rule_id);
	} else {
		f_pos = 0;
	}

	/*
	 * Now scan the rules, and parse microinstructions for each rule.
	 * We have two nested loops and an inner switch. Sometimes we
	 * need to break out of one or both loops, or re-enter one of
	 * the loops with updated variables. Loop variables are:
	 *
	 *	f_pos (outer loop) points to the current rule.
	 *		On output it points to the matching rule.
	 *	done (outer loop) is used as a flag to break the loop.
	 *	l (inner loop)	residual length of current rule.
	 *		cmd points to the current microinstruction.
	 *
	 * We break the inner loop by setting l=0 and possibly
	 * cmdlen=0 if we don't want to advance cmd.
	 * We break the outer loop by setting done=1
	 * We can restart the inner loop by setting l>0 and f_pos, f, cmd
	 * as needed.
	 */
	for (; f_pos < chain->n_rules; f_pos++) {
		ipfw_insn *cmd;
		uint32_t tablearg = 0;
		int l, cmdlen, skip_or; /* skip rest of OR block */
		struct ip_fw *f;

		f = chain->map[f_pos];
		if (V_set_disable & (1 << f->set) )
			continue;

		skip_or = 0;
		for (l = f->cmd_len, cmd = f->cmd ; l > 0 ;
		    l -= cmdlen, cmd += cmdlen) {
			int match;

			/*
			 * check_body is a jump target used when we find a
			 * CHECK_STATE, and need to jump to the body of
			 * the target rule.
			 */

/* check_body: */
			cmdlen = F_LEN(cmd);
			/*
			 * An OR block (insn_1 || .. || insn_n) has the
			 * F_OR bit set in all but the last instruction.
			 * The first match will set "skip_or", and cause
			 * the following instructions to be skipped until
			 * past the one with the F_OR bit clear.
			 */
			if (skip_or) {		/* skip this instruction */
				if ((cmd->len & F_OR) == 0)
					skip_or = 0;	/* next one is good */
				continue;
			}
			match = 0; /* set to 1 if we succeed */

			switch (cmd->opcode) {
			/*
			 * The first set of opcodes compares the packet's
			 * fields with some pattern, setting 'match' if a
			 * match is found. At the end of the loop there is
			 * logic to deal with F_NOT and F_OR flags associated
			 * with the opcode.
			 */
			case O_NOP:
				match = 1;
				break;

			case O_FORWARD_MAC:
				printf("ipfw: opcode %d unimplemented\n",
				    cmd->opcode);
				break;

			case O_GID:
			case O_UID:
			case O_JAIL:
				/*
				 * We only check offset == 0 && proto != 0,
				 * as this ensures that we have a
				 * packet with the ports info.
				 */
				if (offset!=0)
					break;
				if (is_ipv6) /* XXX to be fixed later */
					break;
				if (proto == IPPROTO_TCP ||
				    proto == IPPROTO_UDP)
					match = check_uidgid(
						    (ipfw_insn_u32 *)cmd,
						    proto, oif,
						    dst_ip, dst_port,
						    src_ip, src_port, &ucred_lookup,
#ifdef __FreeBSD__
						    &ucred_cache, args->inp);
#else
						    (void *)&ucred_cache,
						    (struct inpcb *)args->m);
#endif
				break;

			case O_RECV:
				match = iface_match(m->m_pkthdr.rcvif,
				    (ipfw_insn_if *)cmd);
				break;

			case O_XMIT:
				match = iface_match(oif, (ipfw_insn_if *)cmd);
				break;

			case O_VIA:
				match = iface_match(oif ? oif :
				    m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
				break;

			case O_MACADDR2:
				if (args->eh != NULL) {	/* have MAC header */
					u_int32_t *want = (u_int32_t *)
						((ipfw_insn_mac *)cmd)->addr;
					u_int32_t *mask = (u_int32_t *)
						((ipfw_insn_mac *)cmd)->mask;
					u_int32_t *hdr = (u_int32_t *)args->eh;

					match =
					    ( want[0] == (hdr[0] & mask[0]) &&
					      want[1] == (hdr[1] & mask[1]) &&
					      want[2] == (hdr[2] & mask[2]) );
				}
				break;

			case O_MAC_TYPE:
				if (args->eh != NULL) {
					u_int16_t *p =
					    ((ipfw_insn_u16 *)cmd)->ports;
					int i;

					for (i = cmdlen - 1; !match && i>0;
					    i--, p += 2)
						match = (etype >= p[0] &&
						    etype <= p[1]);
				}
				break;

			case O_FRAG:
				match = (offset != 0);
				break;

			case O_IN:	/* "out" is "not in" */
				match = (oif == NULL);
				break;

			case O_LAYER2:
				match = (args->eh != NULL);
				break;

			case O_DIVERTED:
			    {
				/* For diverted packets, args->rule.info
				 * contains the divert port (in host format)
				 * reason and direction.
				 */
				uint32_t i = args->rule.info;
				match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT &&
				    cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2);
			    }
				break;

			case O_PROTO:
				/*
				 * We do not allow an arg of 0 so the
				 * check of "proto" only suffices.
				 */
				match = (proto == cmd->arg1);
				break;

			case O_IP_SRC:
				match = is_ipv4 &&
				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
				    src_ip.s_addr);
				break;

			case O_IP_SRC_LOOKUP:
			case O_IP_DST_LOOKUP:
				if (is_ipv4) {
				    uint32_t key =
					(cmd->opcode == O_IP_DST_LOOKUP) ?
					    dst_ip.s_addr : src_ip.s_addr;
				    uint32_t v = 0;

				    if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) {
					/* generic lookup. The key must be
					 * in 32bit big-endian format.
					 */
					v = ((ipfw_insn_u32 *)cmd)->d[1];
					if (v == 0)
					    key = dst_ip.s_addr;
					else if (v == 1)
					    key = src_ip.s_addr;
					else if (v == 6) /* dscp */
					    key = (ip->ip_tos >> 2) & 0x3f;
					else if (offset != 0)
					    break;
					else if (proto != IPPROTO_TCP &&
						proto != IPPROTO_UDP)
					    break;
					else if (v == 2)
					    key = htonl(dst_port);
					else if (v == 3)
					    key = htonl(src_port);
					else if (v == 4 || v == 5) {
					    check_uidgid(
						(ipfw_insn_u32 *)cmd,
						proto, oif,
						dst_ip, dst_port,
						src_ip, src_port, &ucred_lookup,
#ifdef __FreeBSD__
						&ucred_cache, args->inp);
					    if (v == 4 /* O_UID */)
						key = ucred_cache->cr_uid;
					    else if (v == 5 /* O_JAIL */)
						key = ucred_cache->cr_prison->pr_id;
#else /* !__FreeBSD__ */
						(void *)&ucred_cache,
						(struct inpcb *)args->m);
					    if (v ==4 /* O_UID */)
						key = ucred_cache.uid;
					    else if (v == 5 /* O_JAIL */)
						key = ucred_cache.xid;
#endif /* !__FreeBSD__ */
					    key = htonl(key);
					} else
					    break;
				    }
				    match = ipfw_lookup_table(chain,
					cmd->arg1, key, &v);
				    if (!match)
					break;
				    if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
					match =
					    ((ipfw_insn_u32 *)cmd)->d[0] == v;
				    else
					tablearg = v;
				}
				break;

			case O_IP_SRC_MASK:
			case O_IP_DST_MASK:
				if (is_ipv4) {
				    uint32_t a =
					(cmd->opcode == O_IP_DST_MASK) ?
					    dst_ip.s_addr : src_ip.s_addr;
				    uint32_t *p = ((ipfw_insn_u32 *)cmd)->d;
				    int i = cmdlen-1;

				    for (; !match && i>0; i-= 2, p+= 2)
					match = (p[0] == (a & p[1]));
				}
				break;

			case O_IP_SRC_ME:
				if (is_ipv4) {
					struct ifnet *tif;

					INADDR_TO_IFP(src_ip, tif);
					match = (tif != NULL);
					break;
				}
#ifdef INET6
				/* FALLTHROUGH */
			case O_IP6_SRC_ME:
				match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6);
#endif
				break;

			case O_IP_DST_SET:
			case O_IP_SRC_SET:
				if (is_ipv4) {
					u_int32_t *d = (u_int32_t *)(cmd+1);
					u_int32_t addr =
					    cmd->opcode == O_IP_DST_SET ?
						args->f_id.dst_ip :
						args->f_id.src_ip;

					    if (addr < d[0])
						    break;
					    addr -= d[0]; /* subtract base */
					    match = (addr < cmd->arg1) &&
						( d[ 1 + (addr>>5)] &
						  (1<<(addr & 0x1f)) );
				}
				break;

			case O_IP_DST:
				match = is_ipv4 &&
				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
				    dst_ip.s_addr);
				break;

			case O_IP_DST_ME:
				if (is_ipv4) {
					struct ifnet *tif;

					INADDR_TO_IFP(dst_ip, tif);
					match = (tif != NULL);
					break;
				}
#ifdef INET6
				/* FALLTHROUGH */
			case O_IP6_DST_ME:
				match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6);
#endif
				break;


			case O_IP_SRCPORT:
			case O_IP_DSTPORT:
				/*
				 * offset == 0 && proto != 0 is enough
				 * to guarantee that we have a
				 * packet with port info.
				 */
				if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP)
				    && offset == 0) {
					u_int16_t x =
					    (cmd->opcode == O_IP_SRCPORT) ?
						src_port : dst_port ;
					u_int16_t *p =
					    ((ipfw_insn_u16 *)cmd)->ports;
					int i;

					for (i = cmdlen - 1; !match && i>0;
					    i--, p += 2)
						match = (x>=p[0] && x<=p[1]);
				}
				break;

			case O_ICMPTYPE:
				match = (offset == 0 && proto==IPPROTO_ICMP &&
				    icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) );
				break;

#ifdef INET6
			case O_ICMP6TYPE:
				match = is_ipv6 && offset == 0 &&
				    proto==IPPROTO_ICMPV6 &&
				    icmp6type_match(
					ICMP6(ulp)->icmp6_type,
					(ipfw_insn_u32 *)cmd);
				break;
#endif /* INET6 */

			case O_IPOPT:
				match = (is_ipv4 &&
				    ipopts_match(ip, cmd) );
				break;

			case O_IPVER:
				match = (is_ipv4 &&
				    cmd->arg1 == ip->ip_v);
				break;

			case O_IPID:
			case O_IPLEN:
			case O_IPTTL:
				if (is_ipv4) {	/* only for IP packets */
				    uint16_t x;
				    uint16_t *p;
				    int i;

				    if (cmd->opcode == O_IPLEN)
					x = iplen;
				    else if (cmd->opcode == O_IPTTL)
					x = ip->ip_ttl;
				    else /* must be IPID */
					x = ntohs(ip->ip_id);
				    if (cmdlen == 1) {
					match = (cmd->arg1 == x);
					break;
				    }
				    /* otherwise we have ranges */
				    p = ((ipfw_insn_u16 *)cmd)->ports;
				    i = cmdlen - 1;
				    for (; !match && i>0; i--, p += 2)
					match = (x >= p[0] && x <= p[1]);
				}
				break;

			case O_IPPRECEDENCE:
				match = (is_ipv4 &&
				    (cmd->arg1 == (ip->ip_tos & 0xe0)) );
				break;

			case O_IPTOS:
				match = (is_ipv4 &&
				    flags_match(cmd, ip->ip_tos));
				break;

			case O_TCPDATALEN:
				if (proto == IPPROTO_TCP && offset == 0) {
				    struct tcphdr *tcp;
				    uint16_t x;
				    uint16_t *p;
				    int i;

				    tcp = TCP(ulp);
				    x = iplen -
					((ip->ip_hl + tcp->th_off) << 2);
				    if (cmdlen == 1) {
					match = (cmd->arg1 == x);
					break;
				    }
				    /* otherwise we have ranges */
				    p = ((ipfw_insn_u16 *)cmd)->ports;
				    i = cmdlen - 1;
				    for (; !match && i>0; i--, p += 2)
					match = (x >= p[0] && x <= p[1]);
				}
				break;

			case O_TCPFLAGS:
				match = (proto == IPPROTO_TCP && offset == 0 &&
				    flags_match(cmd, TCP(ulp)->th_flags));
				break;

			case O_TCPOPTS:
				match = (proto == IPPROTO_TCP && offset == 0 &&
				    tcpopts_match(TCP(ulp), cmd));
				break;

			case O_TCPSEQ:
				match = (proto == IPPROTO_TCP && offset == 0 &&
				    ((ipfw_insn_u32 *)cmd)->d[0] ==
					TCP(ulp)->th_seq);
				break;

			case O_TCPACK:
				match = (proto == IPPROTO_TCP && offset == 0 &&
				    ((ipfw_insn_u32 *)cmd)->d[0] ==
					TCP(ulp)->th_ack);
				break;

			case O_TCPWIN:
				match = (proto == IPPROTO_TCP && offset == 0 &&
				    cmd->arg1 == TCP(ulp)->th_win);
				break;

			case O_ESTAB:
				/* reject packets which have SYN only */
				/* XXX should i also check for TH_ACK ? */
				match = (proto == IPPROTO_TCP && offset == 0 &&
				    (TCP(ulp)->th_flags &
				     (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
				break;

			case O_ALTQ: {
				struct pf_mtag *at;
				ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;

				match = 1;
				at = pf_find_mtag(m);
				if (at != NULL && at->qid != 0)
					break;
				at = pf_get_mtag(m);
				if (at == NULL) {
					/*
					 * Let the packet fall back to the
					 * default ALTQ.
					 */
					break;
				}
				at->qid = altq->qid;
				if (is_ipv4)
					at->af = AF_INET;
				else
					at->af = AF_LINK;
				at->hdr = ip;
				break;
			}

			case O_LOG:
				ipfw_log(f, hlen, args, m,
					    oif, offset, tablearg, ip);
				match = 1;
				break;

			case O_PROB:
				match = (random()<((ipfw_insn_u32 *)cmd)->d[0]);
				break;

			case O_VERREVPATH:
				/* Outgoing packets automatically pass/match */
				match = ((oif != NULL) ||
				    (m->m_pkthdr.rcvif == NULL) ||
				    (
#ifdef INET6
				    is_ipv6 ?
					verify_path6(&(args->f_id.src_ip6),
					    m->m_pkthdr.rcvif) :
#endif
				    verify_path(src_ip, m->m_pkthdr.rcvif,
				        args->f_id.fib)));
				break;

			case O_VERSRCREACH:
				/* Outgoing packets automatically pass/match */
				match = (hlen > 0 && ((oif != NULL) ||
#ifdef INET6
				    is_ipv6 ?
				        verify_path6(&(args->f_id.src_ip6),
				            NULL) :
#endif
				    verify_path(src_ip, NULL, args->f_id.fib)));
				break;

			case O_ANTISPOOF:
				/* Outgoing packets automatically pass/match */
				if (oif == NULL && hlen > 0 &&
				    (  (is_ipv4 && in_localaddr(src_ip))
#ifdef INET6
				    || (is_ipv6 &&
				        in6_localaddr(&(args->f_id.src_ip6)))
#endif
				    ))
					match =
#ifdef INET6
					    is_ipv6 ? verify_path6(
					        &(args->f_id.src_ip6),
					        m->m_pkthdr.rcvif) :
#endif
					    verify_path(src_ip,
					    	m->m_pkthdr.rcvif,
					        args->f_id.fib);
				else
					match = 1;
				break;

			case O_IPSEC:
#ifdef IPSEC
				match = (m_tag_find(m,
				    PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL);
#endif
				/* otherwise no match */
				break;

#ifdef INET6
			case O_IP6_SRC:
				match = is_ipv6 &&
				    IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6,
				    &((ipfw_insn_ip6 *)cmd)->addr6);
				break;

			case O_IP6_DST:
				match = is_ipv6 &&
				IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6,
				    &((ipfw_insn_ip6 *)cmd)->addr6);
				break;
			case O_IP6_SRC_MASK:
			case O_IP6_DST_MASK:
				if (is_ipv6) {
					int i = cmdlen - 1;
					struct in6_addr p;
					struct in6_addr *d =
					    &((ipfw_insn_ip6 *)cmd)->addr6;

					for (; !match && i > 0; d += 2,
					    i -= F_INSN_SIZE(struct in6_addr)
					    * 2) {
						p = (cmd->opcode ==
						    O_IP6_SRC_MASK) ?
						    args->f_id.src_ip6:
						    args->f_id.dst_ip6;
						APPLY_MASK(&p, &d[1]);
						match =
						    IN6_ARE_ADDR_EQUAL(&d[0],
						    &p);
					}
				}
				break;

			case O_FLOW6ID:
				match = is_ipv6 &&
				    flow6id_match(args->f_id.flow_id6,
				    (ipfw_insn_u32 *) cmd);
				break;

			case O_EXT_HDR:
				match = is_ipv6 &&
				    (ext_hd & ((ipfw_insn *) cmd)->arg1);
				break;

			case O_IP6:
				match = is_ipv6;
				break;
#endif

			case O_IP4:
				match = is_ipv4;
				break;

			case O_TAG: {
				struct m_tag *mtag;
				uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
				    tablearg : cmd->arg1;

				/* Packet is already tagged with this tag? */
				mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL);

				/* We have `untag' action when F_NOT flag is
				 * present. And we must remove this mtag from
				 * mbuf and reset `match' to zero (`match' will
				 * be inversed later).
				 * Otherwise we should allocate new mtag and
				 * push it into mbuf.
				 */
				if (cmd->len & F_NOT) { /* `untag' action */
					if (mtag != NULL)
						m_tag_delete(m, mtag);
					match = 0;
				} else if (mtag == NULL) {
					if ((mtag = m_tag_alloc(MTAG_IPFW,
					    tag, 0, M_NOWAIT)) != NULL)
						m_tag_prepend(m, mtag);
					match = 1;
				}
				break;
			}

			case O_FIB: /* try match the specified fib */
				if (args->f_id.fib == cmd->arg1)
					match = 1;
				break;

			case O_TAGGED: {
				struct m_tag *mtag;
				uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
				    tablearg : cmd->arg1;

				if (cmdlen == 1) {
					match = m_tag_locate(m, MTAG_IPFW,
					    tag, NULL) != NULL;
					break;
				}

				/* we have ranges */
				for (mtag = m_tag_first(m);
				    mtag != NULL && !match;
				    mtag = m_tag_next(m, mtag)) {
					uint16_t *p;
					int i;

					if (mtag->m_tag_cookie != MTAG_IPFW)
						continue;

					p = ((ipfw_insn_u16 *)cmd)->ports;
					i = cmdlen - 1;
					for(; !match && i > 0; i--, p += 2)
						match =
						    mtag->m_tag_id >= p[0] &&
						    mtag->m_tag_id <= p[1];
				}
				break;
			}
				
			/*
			 * The second set of opcodes represents 'actions',
			 * i.e. the terminal part of a rule once the packet
			 * matches all previous patterns.
			 * Typically there is only one action for each rule,
			 * and the opcode is stored at the end of the rule
			 * (but there are exceptions -- see below).
			 *
			 * In general, here we set retval and terminate the
			 * outer loop (would be a 'break 3' in some language,
			 * but we need to set l=0, done=1)
			 *
			 * Exceptions:
			 * O_COUNT and O_SKIPTO actions:
			 *   instead of terminating, we jump to the next rule
			 *   (setting l=0), or to the SKIPTO target (setting
			 *   f/f_len, cmd and l as needed), respectively.
			 *
			 * O_TAG, O_LOG and O_ALTQ action parameters:
			 *   perform some action and set match = 1;
			 *
			 * O_LIMIT and O_KEEP_STATE: these opcodes are
			 *   not real 'actions', and are stored right
			 *   before the 'action' part of the rule.
			 *   These opcodes try to install an entry in the
			 *   state tables; if successful, we continue with
			 *   the next opcode (match=1; break;), otherwise
			 *   the packet must be dropped (set retval,
			 *   break loops with l=0, done=1)
			 *
			 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
			 *   cause a lookup of the state table, and a jump
			 *   to the 'action' part of the parent rule
			 *   if an entry is found, or
			 *   (CHECK_STATE only) a jump to the next rule if
			 *   the entry is not found.
			 *   The result of the lookup is cached so that
			 *   further instances of these opcodes become NOPs.
			 *   The jump to the next rule is done by setting
			 *   l=0, cmdlen=0.
			 */
			case O_LIMIT:
			case O_KEEP_STATE:
				if (ipfw_install_state(f,
				    (ipfw_insn_limit *)cmd, args, tablearg)) {
					/* error or limit violation */
					retval = IP_FW_DENY;
					l = 0;	/* exit inner loop */
					done = 1; /* exit outer loop */
				}
				match = 1;
				break;

			case O_PROBE_STATE:
			case O_CHECK_STATE:
				/*
				 * dynamic rules are checked at the first
				 * keep-state or check-state occurrence,
				 * with the result being stored in dyn_dir.
				 * The compiler introduces a PROBE_STATE
				 * instruction for us when we have a
				 * KEEP_STATE (because PROBE_STATE needs
				 * to be run first).
				 */
				if (dyn_dir == MATCH_UNKNOWN &&
				    (q = ipfw_lookup_dyn_rule(&args->f_id,
				     &dyn_dir, proto == IPPROTO_TCP ?
					TCP(ulp) : NULL))
					!= NULL) {
					/*
					 * Found dynamic entry, update stats
					 * and jump to the 'action' part of
					 * the parent rule by setting
					 * f, cmd, l and clearing cmdlen.
					 */
					q->pcnt++;
					q->bcnt += pktlen;
					/* XXX we would like to have f_pos
					 * readily accessible in the dynamic
				         * rule, instead of having to
					 * lookup q->rule.
					 */
					f = q->rule;
					f_pos = ipfw_find_rule(chain,
						f->rulenum, f->id);
					cmd = ACTION_PTR(f);
					l = f->cmd_len - f->act_ofs;
					ipfw_dyn_unlock();
					cmdlen = 0;
					match = 1;
					break;
				}
				/*
				 * Dynamic entry not found. If CHECK_STATE,
				 * skip to next rule, if PROBE_STATE just
				 * ignore and continue with next opcode.
				 */
				if (cmd->opcode == O_CHECK_STATE)
					l = 0;	/* exit inner loop */
				match = 1;
				break;

			case O_ACCEPT:
				retval = 0;	/* accept */
				l = 0;		/* exit inner loop */
				done = 1;	/* exit outer loop */
				break;

			case O_PIPE:
			case O_QUEUE:
				set_match(args, f_pos, chain);
				args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
					tablearg : cmd->arg1;
				if (cmd->opcode == O_PIPE)
					args->rule.info |= IPFW_IS_PIPE;
				if (V_fw_one_pass)
					args->rule.info |= IPFW_ONEPASS;
				retval = IP_FW_DUMMYNET;
				l = 0;          /* exit inner loop */
				done = 1;       /* exit outer loop */
				break;

			case O_DIVERT:
			case O_TEE:
				if (args->eh) /* not on layer 2 */
				    break;
				/* otherwise this is terminal */
				l = 0;		/* exit inner loop */
				done = 1;	/* exit outer loop */
				retval = (cmd->opcode == O_DIVERT) ?
					IP_FW_DIVERT : IP_FW_TEE;
				set_match(args, f_pos, chain);
				args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
				    tablearg : cmd->arg1;
				break;

			case O_COUNT:
				f->pcnt++;	/* update stats */
				f->bcnt += pktlen;
				f->timestamp = time_uptime;
				l = 0;		/* exit inner loop */
				break;

			case O_SKIPTO:
			    f->pcnt++;	/* update stats */
			    f->bcnt += pktlen;
			    f->timestamp = time_uptime;
			    /* If possible use the cached f_pos (stored in
			     * f->next_rule), which is valid only while the
			     * chain version saved in f->x_next matches chain->id
			     * (horrible hacks to avoid changing the ABI).
			     */
			    if (cmd->arg1 != IP_FW_TABLEARG &&
				    (uintptr_t)f->x_next == chain->id) {
				f_pos = (uintptr_t)f->next_rule;
			    } else {
				int i = (cmd->arg1 == IP_FW_TABLEARG) ?
					tablearg : cmd->arg1;
				/* make sure we do not jump backward */
				if (i <= f->rulenum)
				    i = f->rulenum + 1;
				f_pos = ipfw_find_rule(chain, i, 0);
				/* update the cache */
				if (cmd->arg1 != IP_FW_TABLEARG) {
				    f->next_rule =
					(void *)(uintptr_t)f_pos;
				    f->x_next =
					(void *)(uintptr_t)chain->id;
				}
			    }
			    /*
			     * Skip disabled rules, and re-enter
			     * the inner loop with the correct
			     * f_pos, f, l and cmd.
			     * Also clear cmdlen and skip_or
			     */
			    for (; f_pos < chain->n_rules - 1 &&
				    (V_set_disable &
				     (1 << chain->map[f_pos]->set));
				    f_pos++)
				;
			    /* Re-enter the inner loop at the skipto rule. */
			    f = chain->map[f_pos];
			    l = f->cmd_len;
			    cmd = f->cmd;
			    match = 1;
			    cmdlen = 0;
			    skip_or = 0;
			    continue;
			    break;	/* not reached */

			case O_REJECT:
				/*
				 * Drop the packet and send a reject notice
				 * if the packet is not ICMP (or is an ICMP
				 * query), and it is not multicast/broadcast.
				 */
				if (hlen > 0 && is_ipv4 && offset == 0 &&
				    (proto != IPPROTO_ICMP ||
				     is_icmp_query(ICMP(ulp))) &&
				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
				    !IN_MULTICAST(ntohl(dst_ip.s_addr))) {
					send_reject(args, cmd->arg1, iplen, ip);
					m = args->m;
				}
				/* FALLTHROUGH */
#ifdef INET6
			case O_UNREACH6:
				if (hlen > 0 && is_ipv6 &&
				    ((offset & IP6F_OFF_MASK) == 0) &&
				    (proto != IPPROTO_ICMPV6 ||
				     (is_icmp6_query(icmp6_type) == 1)) &&
				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
				    !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) {
					send_reject6(
					    args, cmd->arg1, hlen,
					    (struct ip6_hdr *)ip);
					m = args->m;
				}
				/* FALLTHROUGH */
#endif
			case O_DENY:
				retval = IP_FW_DENY;
				l = 0;		/* exit inner loop */
				done = 1;	/* exit outer loop */
				break;

			case O_FORWARD_IP:
				if (args->eh)	/* not valid on layer2 pkts */
					break;
				if (!q || dyn_dir == MATCH_FORWARD) {
				    struct sockaddr_in *sa;
				    sa = &(((ipfw_insn_sa *)cmd)->sa);
				    if (sa->sin_addr.s_addr == INADDR_ANY) {
					bcopy(sa, &args->hopstore,
							sizeof(*sa));
					args->hopstore.sin_addr.s_addr =
						    htonl(tablearg);
					args->next_hop = &args->hopstore;
				    } else {
					args->next_hop = sa;
				    }
				}
				retval = IP_FW_PASS;
				l = 0;          /* exit inner loop */
				done = 1;       /* exit outer loop */
				break;

			case O_NETGRAPH:
			case O_NGTEE:
				set_match(args, f_pos, chain);
				args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
					tablearg : cmd->arg1;
				if (V_fw_one_pass)
					args->rule.info |= IPFW_ONEPASS;
				retval = (cmd->opcode == O_NETGRAPH) ?
				    IP_FW_NETGRAPH : IP_FW_NGTEE;
				l = 0;          /* exit inner loop */
				done = 1;       /* exit outer loop */
				break;

			case O_SETFIB:
				f->pcnt++;	/* update stats */
				f->bcnt += pktlen;
				f->timestamp = time_uptime;
				M_SETFIB(m, cmd->arg1);
				args->f_id.fib = cmd->arg1;
				l = 0;		/* exit inner loop */
				break;

			case O_NAT:
 				if (!IPFW_NAT_LOADED) {
				    retval = IP_FW_DENY;
				} else {
				    struct cfg_nat *t;
				    int nat_id;

				    set_match(args, f_pos, chain);
				    t = ((ipfw_insn_nat *)cmd)->nat;
				    if (t == NULL) {
					nat_id = (cmd->arg1 == IP_FW_TABLEARG) ?
						tablearg : cmd->arg1;
					t = (*lookup_nat_ptr)(&chain->nat, nat_id);

					if (t == NULL) {
					    retval = IP_FW_DENY;
					    l = 0;	/* exit inner loop */
					    done = 1;	/* exit outer loop */
					    break;
					}
					if (cmd->arg1 != IP_FW_TABLEARG)
					    ((ipfw_insn_nat *)cmd)->nat = t;
				    }
				    retval = ipfw_nat_ptr(args, t, m);
				}
				l = 0;          /* exit inner loop */
				done = 1;       /* exit outer loop */
				break;

			case O_REASS: {
				int ip_off;

				f->pcnt++;
				f->bcnt += pktlen;
				l = 0;	/* in any case exit inner loop */
				ip_off = ntohs(ip->ip_off);

				/* if not fragmented, go to next rule */
				if ((ip_off & (IP_MF | IP_OFFMASK)) == 0)
				    break;
				/* 
				 * ip_reass() expects len & off in host
				 * byte order.
				 */
				SET_HOST_IPLEN(ip);

				args->m = m = ip_reass(m);

				/*
				 * do IP header checksum fixup.
				 */
				if (m == NULL) { /* fragment got swallowed */
				    retval = IP_FW_DENY;
				} else { /* good, packet complete */
				    int hlen;

				    ip = mtod(m, struct ip *);
				    hlen = ip->ip_hl << 2;
				    SET_NET_IPLEN(ip);
				    ip->ip_sum = 0;
				    if (hlen == sizeof(struct ip))
					ip->ip_sum = in_cksum_hdr(ip);
				    else
					ip->ip_sum = in_cksum(m, hlen);
				    retval = IP_FW_REASS;
				    set_match(args, f_pos, chain);
				}
				done = 1;	/* exit outer loop */
				break;
			}

			default:
				panic("-- unknown opcode %d\n", cmd->opcode);
			} /* end of switch() on opcodes */
			/*
			 * if we get here with l=0, then match is irrelevant.
			 */

			if (cmd->len & F_NOT)
				match = !match;

			if (match) {
				if (cmd->len & F_OR)
					skip_or = 1;
			} else {
				if (!(cmd->len & F_OR)) /* not an OR block, */
					break;		/* try next rule    */
			}

		}	/* end of inner loop, scan opcodes */

		if (done)
			break;

/* next_rule:; */	/* try next rule		*/

	}		/* end of outer for, scan rules */

	if (done) {
		struct ip_fw *rule = chain->map[f_pos];
		/* Update statistics */
		rule->pcnt++;
		rule->bcnt += pktlen;
		rule->timestamp = time_uptime;
	} else {
		retval = IP_FW_DENY;
		printf("ipfw: ouch!, skip past end of rules, denying packet\n");
	}
	IPFW_RUNLOCK(chain);
#ifdef __FreeBSD__
	if (ucred_cache != NULL)
		crfree(ucred_cache);
#endif
	return (retval);

pullup_failed:
	if (V_fw_verbose)
		printf("ipfw: pullup failed\n");
	return (IP_FW_DENY);
}

/*
 * Module and VNET glue
 */

/*
 * Stuff that must be initialised only on boot or module load.
 *
 * Attaches the dynamic-rule subsystem (ipfw_dyn_attach()), prints a
 * one-line banner describing the compile-time configuration and the
 * logging mode, and initialises the bpf logging hook.
 *
 * Always returns 0: 'error' is never modified after its initialisation.
 */
static int
ipfw_init(void)
{
	int error = 0;

	ipfw_dyn_attach();
	/*
 	 * Only print out this stuff the first time around,
	 * when called from the sysinit code.
	 *
	 * The format string is assembled by the preprocessor; its three
	 * %s conversions are filled, in order, by the divert state, the
	 * nat state and the default policy ("accept" or "deny").
	 */
	printf("ipfw2 "
#ifdef INET6
		"(+ipv6) "
#endif
		"initialized, divert %s, nat %s, "
		"rule-based forwarding "
#ifdef IPFIREWALL_FORWARD
		"enabled, "
#else
		"disabled, "
#endif
		"default to %s, logging ",
#ifdef IPDIVERT
		"enabled",
#else
		"loadable",
#endif
#ifdef IPFIREWALL_NAT
		"enabled",
#else
		"loadable",
#endif
		default_to_accept ? "accept" : "deny");

	/*
	 * Note: V_xxx variables can be accessed here but the vnet specific
	 * initializer may not have been called yet for the VIMAGE case.
	 * Tuneables will have been processed. We will print out values for
	 * the default vnet. 
	 * XXX This should all be rationalized AFTER 8.0
	 */
	/* Complete the banner line started above with the logging mode. */
	if (V_fw_verbose == 0)
		printf("disabled\n");
	else if (V_verbose_limit == 0)
		printf("unlimited\n");
	else
		printf("limited to %d packets/entry by default\n",
		    V_verbose_limit);

	ipfw_log_bpf(1); /* init */
	return (error);
}

/*
 * Called for the removal of the last instance only on module unload.
 *
 * Undoes ipfw_init() in reverse order: the bpf logging hook is shut
 * down first, then the dynamic-rule subsystem is detached.
 */
static void
ipfw_destroy(void)
{

	ipfw_log_bpf(0); /* uninit */
	ipfw_dyn_detach();
	printf("IP firewall unloaded\n");
}

/*
 * Stuff that must be initialized for every instance
 * (including the first of course).
 *
 * Sets the per-instance defaults, allocates the rule map together with
 * the default rule, initialises the lookup tables, the chain lock and
 * the dynamic-rule state, and finally publishes the control/check
 * entry points and attaches the packet filter hooks.
 *
 * Returns 0 on success, ENOSPC if the default rule or the map cannot be
 * allocated, or the value returned by ipfw_attach_hooks().
 */
static int
vnet_ipfw_init(const void *unused)
{
	int error;
	struct ip_fw *rule = NULL;
	struct ip_fw_chain *chain;

	chain = &V_layer3_chain;

	/* First set up some values that are compile time options */
	V_autoinc_step = 100;	/* bounded to 1..1000 in add_rule() */
	V_fw_deny_unknown_exthdrs = 1;
#ifdef IPFIREWALL_VERBOSE
	V_fw_verbose = 1;
#endif
#ifdef IPFIREWALL_VERBOSE_LIMIT
	V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
#endif
#ifdef IPFIREWALL_NAT
	LIST_INIT(&chain->nat);
#endif

	/* insert the default rule and create the initial map */
	chain->n_rules = 1;
	chain->static_len = sizeof(struct ip_fw);
	chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_NOWAIT | M_ZERO);
	if (chain->map)
		rule = malloc(chain->static_len, M_IPFW, M_NOWAIT | M_ZERO);
	if (rule == NULL) {
		/*
		 * Leave the chain in a consistent "empty" state: no
		 * dangling map pointer and no rules advertised, so code
		 * that later walks map[0..n_rules) does not touch freed
		 * memory.
		 */
		if (chain->map) {
			free(chain->map, M_IPFW);
			chain->map = NULL;
		}
		chain->n_rules = 0;
		chain->static_len = 0;
		printf("ipfw2: ENOSPC initializing default rule "
			"(support disabled)\n");
		return (ENOSPC);
	}
	error = ipfw_init_tables(chain);
	if (error) {
		panic("init_tables"); /* XXX Marko fix this ! */
	}

	/* fill and insert the default rule */
	rule->act_ofs = 0;
	rule->rulenum = IPFW_DEFAULT_RULE;
	rule->cmd_len = 1;
	rule->set = RESVD_SET;
	rule->cmd[0].len = 1;
	rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
	chain->rules = chain->default_rule = chain->map[0] = rule;
	chain->id = rule->id = 1;

	IPFW_LOCK_INIT(chain);
	ipfw_dyn_init();

	/* The instance is fully set up: let callers in. */
	V_ipfw_vnet_ready = 1;		/* Open for business */

	/*
	 * Hook the sockopt handler, and the layer2 (V_ip_fw_chk_ptr)
	 * and pfil hooks for ipv4 and ipv6. Even if the latter two fail
	 * we still keep the module alive because the sockopt and
	 * layer2 paths are still useful.
	 * ipfw[6]_hook return 0 on success, ENOENT on failure,
	 * so we can ignore the exact return value and just set a flag.
	 *
	 * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so
	 * changes in the underlying (per-vnet) variables trigger
	 * immediate hook()/unhook() calls.
	 * In layer2 we have the same behaviour, except that V_ether_ipfw
	 * is checked on each packet because there are no pfil hooks.
	 */
	V_ip_fw_ctl_ptr = ipfw_ctl;
	V_ip_fw_chk_ptr = ipfw_chk;
	error = ipfw_attach_hooks(1);
	return (error);
}

/*
 * Called for the removal of each instance.
 *
 * Marks the instance as not ready, detaches the hooks and the
 * control/check pointers, waits for current users to drain by cycling
 * both chain locks, then tears down dynamic state, tables and the
 * static rules. Always returns 0.
 */
static int
vnet_ipfw_uninit(const void *unused)
{
	struct ip_fw *reap, *rule;
	struct ip_fw_chain *chain = &V_layer3_chain;
	int i;

	V_ipfw_vnet_ready = 0; /* tell new callers to go away */
	/*
	 * disconnect from ipv4, ipv6, layer2 and sockopt.
	 * Then grab, release and grab again the WLOCK so we make
	 * sure the update is propagated and nobody will be in.
	 */
	(void)ipfw_attach_hooks(0 /* detach */);
	V_ip_fw_chk_ptr = NULL;
	V_ip_fw_ctl_ptr = NULL;
	IPFW_UH_WLOCK(chain);
	IPFW_UH_WUNLOCK(chain);
	IPFW_UH_WLOCK(chain);

	IPFW_WLOCK(chain);
	IPFW_WUNLOCK(chain);
	IPFW_WLOCK(chain);

	ipfw_dyn_uninit(0);	/* run the callout_drain */
	ipfw_destroy_tables(chain);

	/*
	 * Chain all static rules through x_next so they can be released
	 * after the locks are dropped. The map is only walked when it
	 * exists; previously it was dereferenced before the NULL check.
	 */
	reap = NULL;
	if (chain->map != NULL) {
		for (i = 0; i < chain->n_rules; i++) {
			rule = chain->map[i];
			rule->x_next = reap;
			reap = rule;
		}
		free(chain->map, M_IPFW);
		chain->map = NULL;	/* do not leave a dangling pointer */
	}
	chain->n_rules = 0;
	IPFW_WUNLOCK(chain);
	IPFW_UH_WUNLOCK(chain);
	if (reap != NULL)
		ipfw_reap_rules(reap);
	IPFW_LOCK_DESTROY(chain);
	ipfw_dyn_uninit(1);	/* free the remaining parts */
	return 0;
}

/*
 * Module event handler.
 *
 * All the real work is done by the (VNET_)SYS(UN)INIT handlers, which
 * express the flow of control during module and vnet operations better
 * than this callback can, so this is only a skeleton. There is no
 * SYSINIT equivalent of the module SHUTDOWN event, but nothing needs to
 * be done in that case anyhow.
 *
 * Returns 0 for the events it knows about (load, quiesce, unload and
 * shutdown) and EOPNOTSUPP for anything else.
 */
static int
ipfw_modevent(module_t mod, int type, void *unused)
{
	int known;

	known = (type == MOD_LOAD ||		/* module load or boot */
	    type == MOD_QUIESCE ||		/* before unload, may veto */
	    type == MOD_UNLOAD ||		/* during unload */
	    type == MOD_SHUTDOWN);		/* during system shutdown */

	return (known ? 0 : EOPNOTSUPP);
}

/* Module descriptor: name, event handler, and an unused extra argument. */
static moduledata_t ipfwmod = {
	"ipfw",
	ipfw_modevent,
	0
};

/*
 * Define startup order.
 * All three handlers below are registered in the same subsystem and are
 * ordered as: module event handler, then ipfw_init(), then
 * vnet_ipfw_init().
 */
#define	IPFW_SI_SUB_FIREWALL	SI_SUB_PROTO_IFATTACHDOMAIN
#define	IPFW_MODEVENT_ORDER	(SI_ORDER_ANY - 255) /* On boot slot in here. */
#define	IPFW_MODULE_ORDER	(IPFW_MODEVENT_ORDER + 1) /* A little later. */
#define	IPFW_VNET_ORDER		(IPFW_MODEVENT_ORDER + 2) /* Later still. */

DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER);
MODULE_VERSION(ipfw, 2);
/* should declare some dependencies here */

/*
 * Starting up. Done in order after the module event handler registered
 * through ipfwmod has been called.
 * VNET_SYSINIT is also called for each existing vnet and each new vnet.
 */
SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
	    ipfw_init, NULL);
VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
	    vnet_ipfw_init, NULL);
 
/*
 * Closing up shop. These are done in REVERSE ORDER, but still
 * after the module event handler has been called. Not called on reboot.
 * VNET_SYSUNINIT is also called for each vnet as it exits,
 * or when the module is unloaded.
 */
SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
	    ipfw_destroy, NULL);
VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
	    vnet_ipfw_uninit, NULL);
/* end of file */


================================================
FILE: sys/netinet/ipfw/ip_fw_dynamic.c
================================================
/*-
 * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_dynamic.c 200601 2009-12-16 10:48:40Z luigi $");

#define        DEB(x)
#define        DDB(x) x

/*
 * Dynamic rule support for ipfw
 */

#if !defined(KLD_MODULE)
#include "opt_ipfw.h"
#include "opt_ipdivert.h"
#include "opt_ipdn.h"
#include "opt_inet.h"
#ifndef INET
#error IPFIREWALL requires INET.
#endif /* INET */
#endif
#include "opt_inet6.h"
#include "opt_ipsec.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <net/ethernet.h> /* for ETHERTYPE_IP */
#include <net/if.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>	/* ip_defttl */
#include <netinet/ip_fw.h>
#include <netinet/ipfw/ip_fw_private.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>

#include <netinet/ip6.h>	/* IN6_ARE_ADDR_EQUAL */
#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#endif

#include <machine/in_cksum.h>	/* XXX for in_cksum */

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif

/*
 * Description of dynamic rules.
 *
 * Dynamic rules are stored in lists accessed through a hash table
 * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
 * be modified through the sysctl variable dyn_buckets which is
 * updated when the table becomes empty.
 *
 * XXX currently there is only one list, ipfw_dyn.
 *
 * When a packet is received, its address fields are first masked
 * with the mask defined for the rule, then hashed, then matched
 * against the entries in the corresponding list.
 * Dynamic rules can be used for different purposes:
 *  + stateful rules;
 *  + enforcing limits on the number of sessions;
 *  + in-kernel NAT (not implemented yet)
 *
 * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
 * measured in seconds and depending on the flags.
 *
 * The total number of dynamic rules is stored in dyn_count.
 * The max number of dynamic rules is dyn_max. When we reach
 * the maximum number of rules we do not create anymore. This is
 * done to avoid consuming too much memory, but also too much
 * time when searching on each packet (ideally, we should try instead
 * to put a limit on the length of the list on each bucket...).
 *
 * Each dynamic rule holds a pointer to the parent ipfw rule so
 * we know what action to perform. Dynamic rules are removed when
 * the parent rule is deleted. XXX we should make them survive.
 *
 * There are some limitations with dynamic rules -- we do not
 * obey the 'randomized match', and we do not do multiple
 * passes through the firewall. XXX check the latter!!!
 */

/*
 * Static variables followed by global ones
 *
 * ipfw_dyn_v is the hash table of dynamic rules (an array of list
 * heads); curr_dyn_buckets is its current size, while dyn_buckets is
 * the size requested by the administrator, applied by
 * realloc_dynamic_table().
 */
static VNET_DEFINE(ipfw_dyn_rule **, ipfw_dyn_v);
static VNET_DEFINE(u_int32_t, dyn_buckets);
static VNET_DEFINE(u_int32_t, curr_dyn_buckets);
static VNET_DEFINE(struct callout, ipfw_timeout);
#define	V_ipfw_dyn_v			VNET(ipfw_dyn_v)
#define	V_dyn_buckets			VNET(dyn_buckets)
#define	V_curr_dyn_buckets		VNET(curr_dyn_buckets)
#define V_ipfw_timeout                  VNET(ipfw_timeout)

/* Allocation zone for ipfw_dyn_rule entries (see uma_zalloc/uma_zfree). */
static uma_zone_t ipfw_dyn_rule_zone;
/*
 * Lock protecting the dynamic rule table. On non-FreeBSD builds it is
 * declared with DEFINE_SPINLOCK; the IPFW_DYN_* macros below are
 * written in terms of the mtx_* API in both cases.
 */
#ifndef __FreeBSD__
DEFINE_SPINLOCK(ipfw_dyn_mtx);
#else
static struct mtx ipfw_dyn_mtx;		/* mutex guarding dynamic rules */
#endif

#define	IPFW_DYN_LOCK_INIT() \
	mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF)
#define	IPFW_DYN_LOCK_DESTROY()	mtx_destroy(&ipfw_dyn_mtx)
#define	IPFW_DYN_LOCK()		mtx_lock(&ipfw_dyn_mtx)
#define	IPFW_DYN_UNLOCK()	mtx_unlock(&ipfw_dyn_mtx)
#define	IPFW_DYN_LOCK_ASSERT()	mtx_assert(&ipfw_dyn_mtx, MA_OWNED)

/*
 * Release the dynamic-rule lock. Exported so that code outside this
 * file can drop the lock that ipfw_lookup_dyn_rule() leaves held when
 * it returns a match.
 */
void
ipfw_dyn_unlock(void)
{
	IPFW_DYN_UNLOCK();
}

/*
 * Timeouts for various events in handling dynamic rules.
 */
/*
 * Lifetimes are in seconds and are added to time_uptime to compute the
 * expiry of an entry:
 *  - dyn_syn_lifetime:   new entries and TCP flows with only one SYN seen;
 *  - dyn_ack_lifetime:   TCP flows with SYN seen in both directions;
 *  - dyn_fin_lifetime:   TCP flows with FIN seen in both directions;
 *  - dyn_rst_lifetime:   any other TCP flag combination (e.g. after RST);
 *  - dyn_udp_lifetime:   UDP flows;
 *  - dyn_short_lifetime: other protocols and O_LIMIT_PARENT entries.
 */
static VNET_DEFINE(u_int32_t, dyn_ack_lifetime);
static VNET_DEFINE(u_int32_t, dyn_syn_lifetime);
static VNET_DEFINE(u_int32_t, dyn_fin_lifetime);
static VNET_DEFINE(u_int32_t, dyn_rst_lifetime);
static VNET_DEFINE(u_int32_t, dyn_udp_lifetime);
static VNET_DEFINE(u_int32_t, dyn_short_lifetime);

#define	V_dyn_ack_lifetime		VNET(dyn_ack_lifetime)
#define	V_dyn_syn_lifetime		VNET(dyn_syn_lifetime)
#define	V_dyn_fin_lifetime		VNET(dyn_fin_lifetime)
#define	V_dyn_rst_lifetime		VNET(dyn_rst_lifetime)
#define	V_dyn_udp_lifetime		VNET(dyn_udp_lifetime)
#define	V_dyn_short_lifetime		VNET(dyn_short_lifetime)

/*
 * Keepalives are sent if dyn_keepalive is set. They are sent every
 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
 * seconds of lifetime of a rule.
 * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
 * than dyn_keepalive_period.
 */

static VNET_DEFINE(u_int32_t, dyn_keepalive_interval);
static VNET_DEFINE(u_int32_t, dyn_keepalive_period);
static VNET_DEFINE(u_int32_t, dyn_keepalive);

#define	V_dyn_keepalive_interval	VNET(dyn_keepalive_interval)
#define	V_dyn_keepalive_period		VNET(dyn_keepalive_period)
#define	V_dyn_keepalive			VNET(dyn_keepalive)

static VNET_DEFINE(u_int32_t, dyn_count);	/* # of dynamic rules */
static VNET_DEFINE(u_int32_t, dyn_max);		/* max # of dynamic rules */

#define	V_dyn_count			VNET(dyn_count)
#define	V_dyn_max			VNET(dyn_max)

/*
 * Export the dynamic-rule tunables under net.inet.ip.fw.
 * curr_dyn_buckets and dyn_count are read-only (CTLFLAG_RD); the other
 * entries are read-write. dyn_keepalive_interval and
 * dyn_keepalive_period are not exported here.
 * NOTE(review): SYSBEGIN/SYSEND are not defined in this file; presumably
 * they are portability wrappers around the sysctl table -- confirm.
 */
#ifdef SYSCTL_NODE

SYSBEGIN(f2)

SYSCTL_DECL(_net_inet_ip_fw);
SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
    CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0,
    "Number of dyn. buckets");
SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
    CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0,
    "Current Number of dyn. buckets");
SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_count,
    CTLFLAG_RD, &VNET_NAME(dyn_count), 0,
    "Number of dyn. rules");
SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_max,
    CTLFLAG_RW, &VNET_NAME(dyn_max), 0,
    "Max number of dyn. rules");
SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
    CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0,
    "Lifetime of dyn. rules for acks");
SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
    CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0,
    "Lifetime of dyn. rules for syn");
SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
    CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0,
    "Lifetime of dyn. rules for fin");
SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
    CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0,
    "Lifetime of dyn. rules for rst");
SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
    CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0,
    "Lifetime of dyn. rules for UDP");
SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
    CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0,
    "Lifetime of dyn. rules for other situations");
SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
    CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
    "Enable keepalives for dyn. rules");

SYSEND

#endif /* SYSCTL_NODE */


/*
 * Hash an IPv6 flow id: XOR of the two low-order 32-bit words of the
 * destination and source addresses with both ports. The result is not
 * yet reduced to the table size; hash_packet() does that.
 */
static __inline int
hash_packet6(struct ipfw_flow_id *id)
{
	u_int32_t hash;

	hash = id->dst_ip6.__u6_addr.__u6_addr32[2];
	hash ^= id->dst_ip6.__u6_addr.__u6_addr32[3];
	hash ^= id->src_ip6.__u6_addr.__u6_addr32[2];
	hash ^= id->src_ip6.__u6_addr.__u6_addr32[3];
	hash ^= id->dst_port;
	hash ^= id->src_port;
	return hash;
}

/*
 * Map a flow id to a bucket index of the dynamic rule table.
 *
 * IMPORTANT: the hash must stay commutative in source and destination
 * (ip,port): rules are bidirectional, so both directions of a flow have
 * to land in the same bucket.
 *
 * The raw hash is masked with (V_curr_dyn_buckets - 1), which relies on
 * the table size being a power of two.
 */
static __inline int
hash_packet(struct ipfw_flow_id *id)
{
	u_int32_t hash;

#ifdef INET6
	if (IS_IP6_FLOW_ID(id))
		hash = hash_packet6(id);
	else
#endif /* INET6 */
		hash = id->dst_ip ^ id->src_ip ^ id->dst_port ^ id->src_port;

	return (hash & (V_curr_dyn_buckets - 1));
}

/*
 * Debugging aid: print the endpoints of the flow whose entry is about
 * to be unlinked, together with the number of entries that will be
 * left (V_dyn_count - 1).
 */
static __inline void
unlink_dyn_rule_print(struct ipfw_flow_id *id)
{
#ifdef INET6
	char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
#else
	char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
#endif
	struct in_addr addr;

#ifdef INET6
	if (IS_IP6_FLOW_ID(id)) {
		ip6_sprintf(dst, &id->dst_ip6);
		ip6_sprintf(src, &id->src_ip6);
	} else
#endif
	{
		/* flow ids keep IPv4 addresses in host byte order */
		addr.s_addr = htonl(id->dst_ip);
		inet_ntoa_r(addr, dst);
		addr.s_addr = htonl(id->src_ip);
		inet_ntoa_r(addr, src);
	}
	printf("ipfw: unlink entry %s %d -> %s %d, %d left\n",
	    src, id->src_port, dst, id->dst_port, V_dyn_count - 1);
}

/**
 * unlink a dynamic rule from a chain. prev is a pointer to
 * the previous one, q is a pointer to the rule to delete,
 * head is a pointer to the head of the queue.
 * Modifies q and potentially also head.
 *
 * On exit q points to the entry that followed the removed one, prev is
 * unchanged, V_dyn_count has been decremented and the removed entry has
 * been returned to ipfw_dyn_rule_zone. For O_LIMIT entries the parent's
 * session count is decremented as well.
 *
 * NOTE: the macro expands to a bare { } block and evaluates its
 * arguments more than once, so pass plain lvalues without side effects
 * and do not use it as the unbraced body of an if/else.
 */
#define UNLINK_DYN_RULE(prev, head, q) {				\
	ipfw_dyn_rule *old_q = q;					\
									\
	/* remove a refcount to the parent */				\
	if (q->dyn_type == O_LIMIT)					\
		q->parent->count--;					\
	DEB(unlink_dyn_rule_print(&q->id);)				\
	if (prev != NULL)						\
		prev->next = q = q->next;				\
	else								\
		head = q = q->next;					\
	V_dyn_count--;							\
	uma_zfree(ipfw_dyn_rule_zone, old_q); }

/*
 * True when time 'a' is not after time 'b'. The difference is evaluated
 * as a signed int so that the comparison keeps working when the
 * unsigned counters wrap around.
 */
#define TIME_LEQ(a,b)       ((int)((a)-(b)) <= 0)

/**
 * Remove dynamic rules pointing to "rule", or all of them if rule == NULL.
 *
 * keep_me selects the mode:
 *  - keep_me == NULL ("force"): matching entries are removed even if
 *    they have not expired yet;
 *  - keep_me != NULL: only expired entries are removed, and at most one
 *    such scan is done per second.
 * In both modes the entry equal to keep_me is never touched, which lets
 * a caller protect an O_LIMIT_PARENT entry it still references. The
 * pointer is only compared, never dereferenced, so any non-null value
 * selects the "expired only" mode.
 *
 * O_LIMIT_PARENT entries are handled in a second scan, after their
 * children had a chance to go away, and are removed only when their
 * session count has dropped to zero.
 *
 * Must be called with the dynamic-rule lock held.
 */
static void
remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me)
{
	static u_int32_t last_remove = 0;

#define FORCE (keep_me == NULL)

	ipfw_dyn_rule *prev, *q;
	int bucket, pass, max_pass = 0;

	IPFW_DYN_LOCK_ASSERT();

	if (V_ipfw_dyn_v == NULL || V_dyn_count == 0)
		return;
	/* do not expire more than once per second, it is useless */
	if (!FORCE && last_remove == time_uptime)
		return;
	last_remove = time_uptime;

	/*
	 * max_pass is raised to 1 as soon as a candidate parent is met,
	 * which requests the second scan.
	 */
	for (pass = 0; pass <= max_pass; pass++) {
		for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
			prev = NULL;
			q = V_ipfw_dyn_v[bucket];
			while (q != NULL) {
				int doomed = 0;

				if (q != keep_me &&
				    (rule == NULL || rule == q->rule)) {
					if (q->dyn_type != O_LIMIT_PARENT) {
						/* child or plain state */
						doomed = FORCE ||
						    TIME_LEQ(q->expire,
							time_uptime);
					} else {
						max_pass = 1;
						if (pass != 0) {
							if (FORCE &&
							    q->count != 0) {
								/* XXX should not happen! */
								printf("ipfw: OUCH! cannot remove rule,"
								     " count %d\n", q->count);
							}
							doomed = (q->count == 0);
						}
					}
				}

				if (doomed) {
					/* advances q, leaves prev alone */
					UNLINK_DYN_RULE(prev,
					    V_ipfw_dyn_v[bucket], q);
				} else {
					prev = q;
					q = q->next;
				}
			}
		}
	}
}

/*
 * Remove every dynamic rule created by 'rule', expired or not.
 * Takes and releases the dynamic-rule lock around the removal.
 */
void
ipfw_remove_dyn_children(struct ip_fw *rule)
{
	IPFW_DYN_LOCK();
	remove_dyn_rule(rule, NULL /* force removal */);
	IPFW_DYN_UNLOCK();
}

/**
 * lookup a dynamic rule, locked version
 *
 * Searches the bucket selected by hash_packet(pkt) for an entry whose
 * protocol, addresses and ports match pkt in either direction.
 * Expired entries met during the scan are unlinked on the fly.
 *
 * pkt             flow id of the packet being examined.
 * match_direction if not NULL, receives MATCH_FORWARD, MATCH_REVERSE or
 *                 MATCH_NONE.
 * tcp             optional TCP header; when present its ack number is
 *                 used to ignore out-of-sequence segments while the
 *                 connection is established.
 *
 * On a match the entry is moved to the front of its bucket and its
 * state and expiry time are refreshed. Returns the entry, or NULL when
 * nothing matches. Must be called with the dynamic-rule lock held.
 */
static ipfw_dyn_rule *
lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction,
    struct tcphdr *tcp)
{
	/*
	 * stateful ipfw extensions.
	 * Lookup into dynamic session queue
	 */
#define MATCH_REVERSE	0
#define MATCH_FORWARD	1
#define MATCH_NONE	2
#define MATCH_UNKNOWN	3
	int i, dir = MATCH_NONE;
	ipfw_dyn_rule *prev, *q=NULL;

	IPFW_DYN_LOCK_ASSERT();

	if (V_ipfw_dyn_v == NULL)
		goto done;	/* not found */
	i = hash_packet( pkt );
	for (prev=NULL, q = V_ipfw_dyn_v[i] ; q != NULL ; ) {
		/* parents with active children are neither expired nor matched */
		if (q->dyn_type == O_LIMIT_PARENT && q->count)
			goto next;
		if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */
			/* the macro advances q and leaves prev untouched */
			UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
			continue;
		}
		if (pkt->proto == q->id.proto &&
		    q->dyn_type != O_LIMIT_PARENT) {
			if (IS_IP6_FLOW_ID(pkt)) {
			    if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
				&(q->id.src_ip6)) &&
			    IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
				&(q->id.dst_ip6)) &&
			    pkt->src_port == q->id.src_port &&
			    pkt->dst_port == q->id.dst_port ) {
				dir = MATCH_FORWARD;
				break;
			    }
			    if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
				    &(q->id.dst_ip6)) &&
				IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
				    &(q->id.src_ip6)) &&
				pkt->src_port == q->id.dst_port &&
				pkt->dst_port == q->id.src_port ) {
				    dir = MATCH_REVERSE;
				    break;
			    }
			} else {
			    if (pkt->src_ip == q->id.src_ip &&
				pkt->dst_ip == q->id.dst_ip &&
				pkt->src_port == q->id.src_port &&
				pkt->dst_port == q->id.dst_port ) {
				    dir = MATCH_FORWARD;
				    break;
			    }
			    if (pkt->src_ip == q->id.dst_ip &&
				pkt->dst_ip == q->id.src_ip &&
				pkt->src_port == q->id.dst_port &&
				pkt->dst_port == q->id.src_port ) {
				    dir = MATCH_REVERSE;
				    break;
			    }
			}
		}
next:
		prev = q;
		q = q->next;
	}
	if (q == NULL)
		goto done; /* q = NULL, not found */

	/* move the matching entry to the head of its bucket */
	if ( prev != NULL) { /* found and not in front */
		prev->next = q->next;
		q->next = V_ipfw_dyn_v[i];
		V_ipfw_dyn_v[i] = q;
	}
	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
		u_char flags = pkt->_flags & (TH_FIN|TH_SYN|TH_RST);

		/*
		 * q->state accumulates the FIN/SYN/RST flags seen so far:
		 * forward direction in the low byte, reverse direction
		 * shifted left by 8 bits.
		 */
#define BOTH_SYN	(TH_SYN | (TH_SYN << 8))
#define BOTH_FIN	(TH_FIN | (TH_FIN << 8))
		q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8);
		switch (q->state) {
		case TH_SYN:				/* opening */
			q->expire = time_uptime + V_dyn_syn_lifetime;
			break;

		case BOTH_SYN:			/* move to established */
		case BOTH_SYN | TH_FIN :	/* one side tries to close */
		case BOTH_SYN | (TH_FIN << 8) :
 			if (tcp) {
				/*
				 * Track the highest ack seen per direction;
				 * a segment carrying an older ack does not
				 * refresh the expiry time.
				 */
#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0)
			    u_int32_t ack = ntohl(tcp->th_ack);
			    if (dir == MATCH_FORWARD) {
				if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd))
				    q->ack_fwd = ack;
				else { /* ignore out-of-sequence */
				    break;
				}
			    } else {
				if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev))
				    q->ack_rev = ack;
				else { /* ignore out-of-sequence */
				    break;
				}
			    }
			}
			q->expire = time_uptime + V_dyn_ack_lifetime;
			break;

		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
			/*
			 * Keep the fin lifetime strictly below the keepalive
			 * period; note this rewrites the tunable itself.
			 * NOTE(review): both values are unsigned, so a
			 * keepalive period of 0 would wrap around here --
			 * confirm it is always > 0.
			 */
			if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)
				V_dyn_fin_lifetime = V_dyn_keepalive_period - 1;
			q->expire = time_uptime + V_dyn_fin_lifetime;
			break;

		default:
#if 0
			/*
			 * reset or some invalid combination, but can also
			 * occur if we use keep-state the wrong way.
			 */
			if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
				printf("invalid state: 0x%x\n", q->state);
#endif
			/* same clamping as above, for the rst lifetime */
			if (V_dyn_rst_lifetime >= V_dyn_keepalive_period)
				V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
			q->expire = time_uptime + V_dyn_rst_lifetime;
			break;
		}
	} else if (pkt->proto == IPPROTO_UDP) {
		q->expire = time_uptime + V_dyn_udp_lifetime;
	} else {
		/* other protocols */
		q->expire = time_uptime + V_dyn_short_lifetime;
	}
done:
	if (match_direction)
		*match_direction = dir;
	return q;
}

/*
 * Public wrapper around lookup_dyn_rule_locked(): takes the
 * dynamic-rule lock, then performs the lookup.
 *
 * NB: when an entry is found it is returned with the lock STILL HELD,
 * and the caller is responsible for releasing it (ipfw_dyn_unlock()).
 * When nothing is found the lock is dropped here and NULL is returned.
 */
ipfw_dyn_rule *
ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction,
    struct tcphdr *tcp)
{
	ipfw_dyn_rule *found;

	IPFW_DYN_LOCK();
	found = lookup_dyn_rule_locked(pkt, match_direction, tcp);
	if (found != NULL)
		return found;	/* table left locked on purpose */

	IPFW_DYN_UNLOCK();
	return NULL;
}

/*
 * (Re)allocate the dynamic rule hash table using the size requested in
 * V_dyn_buckets.
 *
 * The request is accepted only if it is a non-zero power of 2 not
 * larger than 64k (an oversized request falls back to 1024); otherwise
 * V_dyn_buckets is reset to the current size and the existing table is
 * left untouched. Zero is rejected explicitly: it passes the
 * "x & (x - 1)" test, but would produce an empty table and a hash mask
 * of 0 - 1, i.e. out-of-bounds bucket indexes in hash_packet().
 *
 * The old table is freed before the new one is allocated. The callers
 * visible in this file only invoke this function when there is no
 * table yet or when V_dyn_count is 0. If memory is short the size is
 * halved until the allocation succeeds or the size drops to 2 or less;
 * on total failure V_ipfw_dyn_v is left NULL and the caller must check
 * for that.
 *
 * Must be called with the dynamic-rule lock held.
 */
static void
realloc_dynamic_table(void)
{
	IPFW_DYN_LOCK_ASSERT();

	/*
	 * Try reallocation, make sure we have a power of 2 and do
	 * not allow more than 64k entries. In case of overflow,
	 * default to 1024.
	 */

	if (V_dyn_buckets > 65536)
		V_dyn_buckets = 1024;
	if (V_dyn_buckets == 0 ||
	    (V_dyn_buckets & (V_dyn_buckets - 1)) != 0) {
		/* zero or not a power of 2 */
		V_dyn_buckets = V_curr_dyn_buckets; /* reset */
		return;
	}
	V_curr_dyn_buckets = V_dyn_buckets;
	if (V_ipfw_dyn_v != NULL)
		free(V_ipfw_dyn_v, M_IPFW);
	for (;;) {
		V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *),
		       M_IPFW, M_NOWAIT | M_ZERO);
		if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2)
			break;
		V_curr_dyn_buckets /= 2;
	}
}

/**
 * Install state of type 'dyn_type' for a dynamic session.
 *
 * The hash table holds three kinds of entries:
 * - regular ones (O_KEEP_STATE);
 * - entries for sessions subject to a per-user limit (O_LIMIT). When
 *   one is created the parent's count is increased by 1 (and decreased
 *   again on delete). In this case the 'rule' argument is really a
 *   pointer to the parent dynamic entry, not to a static rule;
 * - "parent" entries for the above (O_LIMIT_PARENT).
 *
 * The table is (re)allocated first when it does not exist yet, or when
 * it is empty and a different size has been requested.
 *
 * Returns the new entry, linked at the head of its bucket, or NULL if
 * the table or the entry cannot be allocated. Must be called with the
 * dynamic-rule lock held.
 */
static ipfw_dyn_rule *
add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule)
{
	ipfw_dyn_rule *entry;
	int bucket;

	IPFW_DYN_LOCK_ASSERT();

	if (V_ipfw_dyn_v == NULL ||
	    (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) {
		realloc_dynamic_table();
		if (V_ipfw_dyn_v == NULL)
			return NULL; /* failed ! */
	}
	/* compute the bucket only now: the table size may have changed */
	bucket = hash_packet(id);

	entry = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO);
	if (entry == NULL) {
		printf ("ipfw: sorry cannot allocate state\n");
		return NULL;
	}

	if (dyn_type == O_LIMIT) {
		/* 'rule' is the parent entry: take a reference on it */
		ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule;

		if (parent->dyn_type != O_LIMIT_PARENT)
			panic("invalid parent");
		parent->count++;
		entry->parent = parent;
		rule = parent->rule;	/* the real static rule */
	}

	/* fill in the entry */
	entry->dyn_type = dyn_type;
	entry->rule = rule;
	entry->id = *id;
	entry->expire = time_uptime + V_dyn_syn_lifetime;
	entry->count = 0;
	entry->pcnt = entry->bcnt = 0;

	/* link it at the head of its bucket */
	entry->bucket = bucket;
	entry->next = V_ipfw_dyn_v[bucket];
	V_ipfw_dyn_v[bucket] = entry;
	V_dyn_count++;
	DEB({
		struct in_addr da;
#ifdef INET6
		char src[INET6_ADDRSTRLEN];
		char dst[INET6_ADDRSTRLEN];
#else
		char src[INET_ADDRSTRLEN];
		char dst[INET_ADDRSTRLEN];
#endif

#ifdef INET6
		if (IS_IP6_FLOW_ID(&(entry->id))) {
			ip6_sprintf(src, &entry->id.src_ip6);
			ip6_sprintf(dst, &entry->id.dst_ip6);
		} else
#endif
		{
			da.s_addr = htonl(entry->id.src_ip);
			inet_ntoa_r(da, src);
			da.s_addr = htonl(entry->id.dst_ip);
			inet_ntoa_r(da, dst);
		}
		printf("ipfw: add dyn entry ty %d %s %d -> %s %d, total %d\n",
		    dyn_type, src, entry->id.src_port, dst, entry->id.dst_port,
		    V_dyn_count);
	})
	return entry;
}

/**
 * Find the O_LIMIT_PARENT entry that belongs to 'rule' and matches the
 * (already masked) flow id 'pkt' exactly; if none exists, install one.
 *
 * A parent that is found gets its expiry time refreshed. Returns the
 * parent entry, or NULL if a new one had to be created and that failed.
 * Must be called with the dynamic-rule lock held.
 */
static ipfw_dyn_rule *
lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule)
{
	ipfw_dyn_rule *cand;

	IPFW_DYN_LOCK_ASSERT();

	if (V_ipfw_dyn_v != NULL) {
		int is_v6 = IS_IP6_FLOW_ID(pkt);

		for (cand = V_ipfw_dyn_v[hash_packet(pkt)]; cand != NULL;
		    cand = cand->next) {
			/* must be a parent created by this very rule */
			if (cand->dyn_type != O_LIMIT_PARENT ||
			    cand->rule != rule)
				continue;
			if (pkt->proto != cand->id.proto ||
			    pkt->src_port != cand->id.src_port ||
			    pkt->dst_port != cand->id.dst_port)
				continue;
			/* addresses are compared per address family */
			if (is_v6) {
				if (!IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
					&(cand->id.src_ip6)) ||
				    !IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
					&(cand->id.dst_ip6)))
					continue;
			} else {
				if (pkt->src_ip != cand->id.src_ip ||
				    pkt->dst_ip != cand->id.dst_ip)
					continue;
			}

			cand->expire = time_uptime + V_dyn_short_lifetime;
			DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",cand);)
			return cand;
		}
	}
	return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
}

/**
 * Install dynamic state for rule type cmd->o.opcode
 *
 * rule      static rule that matched the packet.
 * cmd       the O_KEEP_STATE or O_LIMIT instruction; for O_LIMIT it
 *           carries the mask and the connection limit.
 * args      packet description; args->f_id is the flow being tracked.
 * tablearg  value used as connection limit when cmd->conn_limit is
 *           IP_FW_TABLEARG.
 *
 * Returns 1 (failure) if state is not installed because the table is
 * full, the parent entry cannot be created, the session limit has been
 * reached or the opcode is unknown. Returns 0 otherwise, including when
 * an entry for the flow already exists. Note that the result of the
 * final add_dyn_rule() call is not checked.
 *
 * Takes and releases the dynamic-rule lock internally.
 */
int
ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
    struct ip_fw_args *args, uint32_t tablearg)
{
	static int last_log;	/* rate-limits messages to one per second */
	ipfw_dyn_rule *q;
	struct in_addr da;
#ifdef INET6
	char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
#else
	char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
#endif

	src[0] = '\0';
	dst[0] = '\0';

	IPFW_DYN_LOCK();

	DEB(
#ifdef INET6
	if (IS_IP6_FLOW_ID(&(args->f_id))) {
		ip6_sprintf(src, &args->f_id.src_ip6);
		ip6_sprintf(dst, &args->f_id.dst_ip6);
	} else
#endif
	{
		da.s_addr = htonl(args->f_id.src_ip);
		inet_ntoa_r(da, src);
		da.s_addr = htonl(args->f_id.dst_ip);
		inet_ntoa_r(da, dst);
	}
	printf("ipfw: %s: type %d %s %u -> %s %u\n",
	    __func__, cmd->o.opcode, src, args->f_id.src_port,
	    dst, args->f_id.dst_port);
	src[0] = '\0';
	dst[0] = '\0';
	)

	q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL);

	if (q != NULL) {	/* should never occur */
		if (last_log != time_uptime) {
			last_log = time_uptime;
			printf("ipfw: %s: entry already present, done\n",
			    __func__);
		}
		IPFW_DYN_UNLOCK();
		return (0);
	}

	if (V_dyn_count >= V_dyn_max)
		/*
		 * Run out of slots, try to remove any expired rule.
		 * The non-NULL dummy pointer selects "expired only" mode.
		 */
		remove_dyn_rule(NULL, (ipfw_dyn_rule *)1);

	if (V_dyn_count >= V_dyn_max) {
		if (last_log != time_uptime) {
			last_log = time_uptime;
			printf("ipfw: %s: Too many dynamic rules\n", __func__);
		}
		IPFW_DYN_UNLOCK();
		return (1);	/* cannot install, notify caller */
	}

	switch (cmd->o.opcode) {
	case O_KEEP_STATE:	/* bidir rule */
		add_dyn_rule(&args->f_id, O_KEEP_STATE, rule);
		break;

	case O_LIMIT: {		/* limit number of sessions */
		struct ipfw_flow_id id;
		ipfw_dyn_rule *parent;
		uint32_t conn_limit;
		uint16_t limit_mask = cmd->limit_mask;

		conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ?
		    tablearg : cmd->conn_limit;
		  
		DEB(
		if (cmd->conn_limit == IP_FW_TABLEARG)
			printf("ipfw: %s: O_LIMIT rule, conn_limit: %u "
			    "(tablearg)\n", __func__, conn_limit);
		else
			printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n",
			    __func__, conn_limit);
		)

		/*
		 * Build the key of the parent entry: start from an
		 * all-zero flow id and copy only the fields selected by
		 * limit_mask. Clearing the whole structure matters: the
		 * key is hashed and compared field by field, so IPv6
		 * addresses (or any other field) not selected by the mask
		 * must not contain stack garbage.
		 */
		bzero(&id, sizeof(id));
		id.proto = args->f_id.proto;
		id.addr_type = args->f_id.addr_type;
		id.fib = M_GETFIB(args->m);

		if (IS_IP6_FLOW_ID (&(args->f_id))) {
			if (limit_mask & DYN_SRC_ADDR)
				id.src_ip6 = args->f_id.src_ip6;
			if (limit_mask & DYN_DST_ADDR)
				id.dst_ip6 = args->f_id.dst_ip6;
		} else {
			if (limit_mask & DYN_SRC_ADDR)
				id.src_ip = args->f_id.src_ip;
			if (limit_mask & DYN_DST_ADDR)
				id.dst_ip = args->f_id.dst_ip;
		}
		if (limit_mask & DYN_SRC_PORT)
			id.src_port = args->f_id.src_port;
		if (limit_mask & DYN_DST_PORT)
			id.dst_port = args->f_id.dst_port;
		if ((parent = lookup_dyn_parent(&id, rule)) == NULL) {
			printf("ipfw: %s: add parent failed\n", __func__);
			IPFW_DYN_UNLOCK();
			return (1);
		}

		if (parent->count >= conn_limit) {
			/*
			 * See if we can remove some expired rule.
			 * 'parent' is passed as keep_me so that it is not
			 * freed while we are using it.
			 */
			remove_dyn_rule(rule, parent);
			if (parent->count >= conn_limit) {
				if (V_fw_verbose && last_log != time_uptime) {
					last_log = time_uptime;
#ifdef INET6
					/*
					 * XXX IPv6 flows are not
					 * supported yet.
					 */
					if (IS_IP6_FLOW_ID(&(args->f_id))) {
						char ip6buf[INET6_ADDRSTRLEN];
						snprintf(src, sizeof(src),
						    "[%s]", ip6_sprintf(ip6buf,
							&args->f_id.src_ip6));
						snprintf(dst, sizeof(dst),
						    "[%s]", ip6_sprintf(ip6buf,
							&args->f_id.dst_ip6));
					} else
#endif
					{
						da.s_addr =
						    htonl(args->f_id.src_ip);
						inet_ntoa_r(da, src);
						da.s_addr =
						    htonl(args->f_id.dst_ip);
						inet_ntoa_r(da, dst);
					}
					log(LOG_SECURITY | LOG_DEBUG,
					    "ipfw: %d %s %s:%u -> %s:%u, %s\n",
					    parent->rule->rulenum,
					    "drop session",
					    src, (args->f_id.src_port),
					    dst, (args->f_id.dst_port),
					    "too many entries");
				}
				IPFW_DYN_UNLOCK();
				return (1);
			}
		}
		/* for O_LIMIT the third argument is the parent entry */
		add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent);
		break;
	}
	default:
		printf("ipfw: %s: unknown dynamic rule type %u\n",
		    __func__, cmd->o.opcode);
		IPFW_DYN_UNLOCK();
		return (1);
	}

	/* XXX just set lifetime */
	lookup_dyn_rule_locked(&args->f_id, NULL, NULL);

	IPFW_DYN_UNLOCK();
	return (0);
}

/*
 * Generate a TCP packet, containing either a RST or a keepalive.
 * When flags & TH_RST, we are sending a RST packet, because a
 * "reset" action matched the packet.
 * Otherwise we are sending a keepalive, and flags & TH_SYN selects the
 * direction: when set the packet goes from the src to the dst of 'id',
 * otherwise addresses and ports are swapped.
 * The 'replyto' mbuf is the mbuf being replied to, if any, and is required
 * so that MAC can label the reply appropriately.
 */
struct mbuf *
ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
    u_int32_t ack, int flags)
{
	/*
	 * Build a bare TCP segment (no payload, no options) for flow 'id'.
	 *
	 * replyto: packet being answered, only used for MAC labelling.
	 * id:      flow the packet belongs to; addr_type selects IPv4/IPv6.
	 * seq/ack: sequence numbers supplied by the caller.
	 * flags:   TH_RST requests a reset, otherwise a keepalive is built;
	 *          TH_SYN/TH_ACK refine the behaviour as described below.
	 *
	 * Returns the new mbuf, or NULL if allocation fails or the
	 * address family is not handled.
	 */
	struct mbuf *m = NULL;		/* stupid compiler */
	int len, dir;
	struct ip *h = NULL;		/* stupid compiler */
#ifdef INET6
	struct ip6_hdr *h6 = NULL;
#endif
	struct tcphdr *th = NULL;

	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);

	M_SETFIB(m, id->fib);
#ifdef MAC
	if (replyto != NULL)
		mac_netinet_firewall_reply(replyto, m);
	else
		mac_netinet_firewall_send(m);
#else
	(void)replyto;		/* don't warn about unused arg */
#endif

	/* total length is the network header plus a plain TCP header */
	switch (id->addr_type) {
	case 4:
		len = sizeof(struct ip) + sizeof(struct tcphdr);
		break;
#ifdef INET6
	case 6:
		len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		break;
#endif
	default:
		/* XXX: log me?!? */
		FREE_PKT(m);
		return (NULL);
	}
	/*
	 * dir != 0 means "same direction as the flow id" (src -> dst).
	 * It is only set for TH_SYN without TH_RST; every RST and every
	 * non-SYN keepalive is sent with addresses and ports swapped.
	 */
	dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN);

	/* leave room for a link-level header in front of the packet */
	m->m_data += max_linkhdr;
	/* mark the packet so that it is not run through the firewall again */
	m->m_flags |= M_SKIP_FIREWALL;
	m->m_pkthdr.len = m->m_len = len;
	m->m_pkthdr.rcvif = NULL;
	bzero(m->m_data, len);

	/*
	 * First pass over the network header: fill in only the fields
	 * needed for the TCP checksum (protocol, TCP length, addresses),
	 * the rest is still zero from the bzero() above.
	 */
	switch (id->addr_type) {
	case 4:
		h = mtod(m, struct ip *);

		/* prepare for checksum */
		h->ip_p = IPPROTO_TCP;
		h->ip_len = htons(sizeof(struct tcphdr));
		if (dir) {
			h->ip_src.s_addr = htonl(id->src_ip);
			h->ip_dst.s_addr = htonl(id->dst_ip);
		} else {
			h->ip_src.s_addr = htonl(id->dst_ip);
			h->ip_dst.s_addr = htonl(id->src_ip);
		}

		th = (struct tcphdr *)(h + 1);
		break;
#ifdef INET6
	case 6:
		h6 = mtod(m, struct ip6_hdr *);

		/* prepare for checksum */
		h6->ip6_nxt = IPPROTO_TCP;
		h6->ip6_plen = htons(sizeof(struct tcphdr));
		if (dir) {
			h6->ip6_src = id->src_ip6;
			h6->ip6_dst = id->dst_ip6;
		} else {
			h6->ip6_src = id->dst_ip6;
			h6->ip6_dst = id->src_ip6;
		}

		th = (struct tcphdr *)(h6 + 1);
		break;
#endif
	}

	/* ports follow the same direction as the addresses */
	if (dir) {
		th->th_sport = htons(id->src_port);
		th->th_dport = htons(id->dst_port);
	} else {
		th->th_sport = htons(id->dst_port);
		th->th_dport = htons(id->src_port);
	}
	/* header length in 32-bit words, no options */
	th->th_off = sizeof(struct tcphdr) >> 2;

	if (flags & TH_RST) {
		if (flags & TH_ACK) {
			/* offending segment carried an ACK: RST with seq = ack */
			th->th_seq = htonl(ack);
			th->th_flags = TH_RST;
		} else {
			/* no ACK: RST+ACK acknowledging seq (+1 for a SYN) */
			if (flags & TH_SYN)
				seq++;
			th->th_ack = htonl(seq);
			th->th_flags = TH_RST | TH_ACK;
		}
	} else {
		/*
		 * Keepalive - use caller provided sequence numbers
		 */
		th->th_seq = htonl(seq);
		th->th_ack = htonl(ack);
		th->th_flags = TH_ACK;
	}

	/*
	 * Second pass: compute the TCP checksum while the network header
	 * still holds only the checksum-relevant fields, then complete it.
	 */
	switch (id->addr_type) {
	case 4:
		th->th_sum = in_cksum(m, len);

		/* finish the ip header */
		h->ip_v = 4;
		h->ip_hl = sizeof(*h) >> 2;
		h->ip_tos = IPTOS_LOWDELAY;
		h->ip_off = 0;
		/* ip_len must be in host format for ip_output */
		h->ip_len = len;
		h->ip_ttl = V_ip_defttl;
		h->ip_sum = 0;
		break;
#ifdef INET6
	case 6:
		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6),
		    sizeof(struct tcphdr));

		/* finish the ip6 header */
		h6->ip6_vfc |= IPV6_VERSION;
		h6->ip6_hlim = IPV6_DEFHLIM;
		break;
#endif
	}

	return (m);
}

/*
 * This procedure is only used to handle keepalives. It is invoked
 * every dyn_keepalive_period
 */
 /* dummynet() and ipfw_tick() can't be static in windows */
void
ipfw_tick(void * vnetx) 
{
	struct mbuf *m0, *m, *mnext, **mtailp;
#ifdef INET6
	struct mbuf *m6, **m6_tailp;
#endif
	int i;
	ipfw_dyn_rule *q;
#ifdef VIMAGE
	struct vnet *vp = vnetx;
#endif

	CURVNET_SET(vp);
	if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0)
		goto done;

	/*
	 * We make a chain of packets to go out here -- not deferring
	 * until after we drop the IPFW dynamic rule lock would result
	 * in a lock order reversal with the normal packet input -> ipfw
	 * call stack.
	 */
	m0 = NULL;
	mtailp = &m0;
#ifdef INET6
	m6 = NULL;
	m6_tailp = &m6;
#endif
	IPFW_DYN_LOCK();
	for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
		for (q = V_ipfw_dyn_v[i] ; q ; q = q->next ) {
			if (q->dyn_type == O_LIMIT_PARENT)
				continue;
			if (q->id.proto != IPPROTO_TCP)
				continue;
			if ( (q->state & BOTH_SYN) != BOTH_SYN)
				continue;
			if (TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
			    q->expire))
				continue;	/* too early */
			if (TIME_LEQ(q->expire, time_uptime))
				continue;	/* too late, rule expired */

			m = ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1,
				q->ack_fwd, TH_SYN);
			mnext = ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1,
				q->ack_rev, 0);

			switch (q->id.addr_type) {
			case 4:
				if (m != NULL) {
					*mtailp = m;
					mtailp = &(*mtailp)->m_nextpkt;
				}
				if (mnext != NULL) {
					*mtailp = mnext;
					mtailp = &(*mtailp)->m_nextpkt;
				}
				break;
#ifdef INET6
			case 6:
				if (m != NULL) {
					*m6_tailp = m;
					m6_tailp = &(*m6_tailp)->m_nextpkt;
				}
				if (mnext != NULL) {
					*m6_tailp = mnext;
					m6_tailp = &(*m6_tailp)->m_nextpkt;
				}
				break;
#endif
			}

			m = mnext = NULL;
		}
	}
	IPFW_DYN_UNLOCK();
	for (m = mnext = m0; m != NULL; m = mnext) {
		mnext = m->m_nextpkt;
		m->m_nextpkt = NULL;
		ip_output(m, NULL, NULL, 0, NULL, NULL);
	}
#ifdef INET6
	for (m = mnext = m6; m != NULL; m = mnext) {
		mnext = m->m_nextpkt;
		m->m_nextpkt = NULL;
		ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
	}
#endif
done:
	callout_reset_on(&V_ipfw_timeout, V_dyn_keepalive_period * hz,
		      ipfw_tick, vnetx, 0);
	CURVNET_RESTORE();
}

/*
 * Global setup for the dynamic rule code: create the allocation zone
 * for ipfw_dyn_rule objects and initialize the dynamic rule lock.
 */
void
ipfw_dyn_attach(void)
{
	ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule",
	    sizeof(ipfw_dyn_rule),
	    NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);

	IPFW_DYN_LOCK_INIT();
}

/*
 * Undo ipfw_dyn_attach(): destroy the dynamic rule zone, then the lock.
 */
void
ipfw_dyn_detach(void)
{
	uma_zdestroy(ipfw_dyn_rule_zone);

	IPFW_DYN_LOCK_DESTROY();
}

/*
 * Set the defaults for the dynamic rule state and start the
 * keepalive callout (first run after one second).
 */
void
ipfw_dyn_init(void)
{
	/* hash table: allocated later, sizes must be power of 2 */
	V_ipfw_dyn_v = NULL;
	V_curr_dyn_buckets = 256;
	V_dyn_buckets = 256;
	V_dyn_max = 4096;	/* max # of dynamic rules */

	/* lifetimes for the various states of a dynamic rule */
	V_dyn_syn_lifetime = 20;
	V_dyn_ack_lifetime = 300;
	V_dyn_fin_lifetime = 1;
	V_dyn_rst_lifetime = 1;
	V_dyn_udp_lifetime = 10;
	V_dyn_short_lifetime = 5;

	/* keepalive handling */
	V_dyn_keepalive = 1;	/* do send keepalives */
	V_dyn_keepalive_period = 5;
	V_dyn_keepalive_interval = 20;

	callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE);
	callout_reset_on(&V_ipfw_timeout, hz, ipfw_tick, curvnet, 0);
}

/*
 * Two-pass teardown: pass 0 stops the keepalive callout,
 * any other pass releases the dynamic rule hash table.
 */
void
ipfw_dyn_uninit(int pass)
{
	if (pass == 0) {
		callout_drain(&V_ipfw_timeout);
		return;
	}

	if (V_ipfw_dyn_v == NULL)
		return;
	free(V_ipfw_dyn_v, M_IPFW);
}

/*
 * Return the number of bytes needed to export all dynamic rules,
 * or 0 if the hash table has not been allocated.
 */
int
ipfw_dyn_len(void)
{
	if (V_ipfw_dyn_v == NULL)
		return (0);
	return (V_dyn_count * sizeof(ipfw_dyn_rule));
}

/*
 * Copy the dynamic rules into the buffer that starts at *pbp and ends
 * at ep, advancing *pbp past the last record written. Records that do
 * not fit are silently skipped.
 */
void
ipfw_get_dynamic(char **pbp, const char *ep)
{
	ipfw_dyn_rule *p, *dst, *last = NULL;
	char *bp;
	int i;

	if (V_ipfw_dyn_v == NULL)
		return;
	bp = *pbp;

	IPFW_DYN_LOCK();
	for (i = 0; i < V_curr_dyn_buckets; i++) {
		for (p = V_ipfw_dyn_v[i]; p != NULL; p = p->next) {
			if (bp + sizeof(*p) > ep)
				continue;	/* no room for this one */

			dst = (ipfw_dyn_rule *)bp;
			bcopy(p, dst, sizeof(*p));
			/*
			 * The kernel pointer in dst->rule is meaningless
			 * to userland: overwrite it with the rule number...
			 */
			bcopy(&(p->rule->rulenum), &(dst->rule),
			    sizeof(p->rule->rulenum));
			/*
			 * ...and store the set number right after it, in
			 * the high word of the dst->rule pointer.
			 */
			bcopy(&(p->rule->set),
			    (char *)&dst->rule + sizeof(p->rule->rulenum),
			    sizeof(p->rule->set));
			/*
			 * store a non-null value in "next".
			 * The userland code will interpret a
			 * NULL here as a marker
			 * for the last dynamic rule.
			 */
			bcopy(&dst, &dst->next, sizeof(dst));
			last = dst;
			/* export the remaining lifetime, not the deadline */
			if (TIME_LEQ(dst->expire, time_uptime))
				dst->expire = 0;
			else
				dst->expire = dst->expire - time_uptime;
			bp += sizeof(ipfw_dyn_rule);
		}
	}
	IPFW_DYN_UNLOCK();

	/* mark last dynamic rule */
	if (last != NULL)
		bzero(&last->next, sizeof(last));
	*pbp = bp;
}
/* end of file */


================================================
FILE: sys/netinet/ipfw/ip_fw_log.c
================================================
/*-
 * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_log.c 209845 2010-07-09 11:27:33Z glebius $");

/*
 * Logging support for ipfw
 */

#if !defined(KLD_MODULE)
#include "opt_ipfw.h"
#include "opt_ipdivert.h"
#include "opt_ipdn.h"
#include "opt_inet.h"
#ifndef INET
#error IPFIREWALL requires INET.
#endif /* INET */
#endif
#include "opt_inet6.h"
#include "opt_ipsec.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <net/ethernet.h> /* for ETHERTYPE_IP */
#include <net/if.h>
#include <net/vnet.h>
#include <net/if_types.h>	/* for IFT_ETHER */
#include <net/bpf.h>		/* for BPF */

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_var.h>
#include <netinet/ip_fw.h>
#include <netinet/ipfw/ip_fw_private.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>

#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#ifdef INET6
#include <netinet6/in6_var.h>	/* ip6_sprintf() */
#endif

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif

/*
 * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T
 * Other macros just cast void * into the appropriate type
 */
#define	L3HDR(T, ip)	((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
#define	TCP(p)		((struct tcphdr *)(p))
#define	SCTP(p)		((struct sctphdr *)(p))
#define	UDP(p)		((struct udphdr *)(p))
#define	ICMP(p)		((struct icmphdr *)(p))
#define	ICMP6(p)	((struct icmp6_hdr *)(p))

/*
 * Helpers to build the (buffer, size) argument pair of snprintf().
 * SNPARGS appends at offset 'len' of the array 'buf' and passes a
 * size of 0 once 'len' has reached or passed the end of the array.
 * SNP writes from the start of the array.
 * Both use sizeof(buf), so 'buf' must be a real array, not a pointer.
 */
#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
#define SNP(buf) buf, sizeof(buf)

#ifdef WITHOUT_BPF
/* Stub used when built WITHOUT_BPF: there is no log interface to toggle. */
void
ipfw_log_bpf(int onoff)
{
}
#else /* !WITHOUT_BPF */
static struct ifnet *log_if;	/* hook to attach to bpf */

/*
 * Placeholder for the ifnet callbacks of the log interface
 * (installed as if_init and if_ioctl): it ignores its
 * arguments and always fails with EINVAL.
 */
static int
log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr)
{

	return (EINVAL);
}

/*
 * if_output handler of the log interface: nothing can be transmitted
 * through it, so release the packet (if any) and report EINVAL.
 */
static int
ipfw_log_output(struct ifnet *ifp, struct mbuf *m,
    struct sockaddr *dst, struct route *ro)
{

	if (m != NULL) {
		m_freem(m);
	}
	return (EINVAL);
}

/*
 * if_start handler of the log interface. It is installed only to fill
 * the slot: reaching it is treated as a fatal error.
 */
static void
ipfw_log_start(struct ifnet* ifp)
{
	panic("ipfw_log_start() must not be called");
}

static const u_char ipfwbroadcastaddr[6] =
	{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };

/*
 * Create (onoff != 0) or destroy (onoff == 0) the "ipfw0" pseudo
 * interface that logged packets are tapped to through bpf.
 * Creating it twice, or destroying it when absent, is a no-op.
 */
void
ipfw_log_bpf(int onoff)
{
	struct ifnet *ifp;

	if (!onoff) {
		/* tear down the interface, if present */
		if (log_if != NULL) {
			ether_ifdetach(log_if);
			if_free(log_if);
		}
		log_if = NULL;
		return;
	}

	if (log_if != NULL)
		return;		/* already attached */
	ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL)
		return;

	/* dress it up as an always-up ethernet device */
	if_initname(ifp, "ipfw", 0);
	ifp->if_mtu = 65536;
	ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_init = (void *)log_dummy;
	ifp->if_ioctl = log_dummy;
	ifp->if_start = ipfw_log_start;
	ifp->if_output = ipfw_log_output;
	ifp->if_addrlen = 6;
	ifp->if_hdrlen = 14;
	if_attach(ifp);
	ifp->if_broadcastaddr = ipfwbroadcastaddr;
	ifp->if_baudrate = IF_Mbps(10);
	bpfattach(ifp, DLT_EN10MB, 14);

	/* publish the interface only once it is fully set up */
	log_if = ifp;
}
#endif /* !WITHOUT_BPF */

/*
 * We enter here when we have a rule with O_LOG.
 * XXX this function alone takes about 2Kbytes of code!
 *
 * f:        matching rule, or NULL for a packet refused without a rule.
 * hlen:     length of the network header, 0 for a non-IP (layer 2) packet.
 * args:     firewall arguments; the flow id and the ethernet header
 *           pointer are read from here.
 * m:        the packet itself.
 * oif:      outgoing interface, NULL on the input path.
 * offset:   fragment offset; transport headers are decoded only when 0.
 * tablearg: table argument, used as forward address for O_FORWARD_IP
 *           rules that carry INADDR_ANY.
 * ip:       pointer to the network header (IPv4 or IPv6).
 *
 * When V_fw_verbose is 0 the packet is only tapped to the bpf log
 * interface; otherwise a text line is sent to syslog, subject to the
 * per-rule (or global, for f == NULL) logging limit.
 */
void
ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
    struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
    struct ip *ip)
{
	char *action;
	int limit_reached = 0;
	char action2[40], proto[128], fragment[32];

	if (V_fw_verbose == 0) {
#ifndef WITHOUT_BPF

		if (log_if == NULL || log_if->if_bpf == NULL)
			return;

		if (args->eh) /* layer2, use orig hdr */
			BPF_MTAP2(log_if, args->eh, ETHER_HDR_LEN, m);
		else
			/* Add fake header. Later we will store
			 * more info in the header.
			 */
			BPF_MTAP2(log_if, "DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m);
#endif /* !WITHOUT_BPF */
		return;
	}
	/* the old 'log' function */
	fragment[0] = '\0';
	proto[0] = '\0';

	if (f == NULL) {	/* bogus pkt */
		if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit)
			return;
		V_norule_counter++;
		if (V_norule_counter == V_verbose_limit)
			limit_reached = V_verbose_limit;
		action = "Refuse";
	} else {	/* O_LOG is the first action, find the real one */
		ipfw_insn *cmd = ACTION_PTR(f);
		ipfw_insn_log *l = (ipfw_insn_log *)cmd;

		/* per-rule limit: max_log == 0 means unlimited */
		if (l->max_log != 0 && l->log_left == 0)
			return;
		l->log_left--;
		if (l->log_left == 0)
			limit_reached = l->max_log;
		cmd += F_LEN(cmd);	/* point to first action */
		/* skip the modifiers that may precede the real action */
		if (cmd->opcode == O_ALTQ) {
			ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;

			snprintf(SNPARGS(action2, 0), "Altq %d",
				altq->qid);
			cmd += F_LEN(cmd);
		}
		if (cmd->opcode == O_PROB)
			cmd += F_LEN(cmd);

		if (cmd->opcode == O_TAG)
			cmd += F_LEN(cmd);

		/*
		 * Actions with an argument are formatted into action2,
		 * the others just point 'action' to a constant string.
		 */
		action = action2;
		switch (cmd->opcode) {
		case O_DENY:
			action = "Deny";
			break;

		case O_REJECT:
			if (cmd->arg1==ICMP_REJECT_RST)
				action = "Reset";
			else if (cmd->arg1==ICMP_UNREACH_HOST)
				action = "Reject";
			else
				snprintf(SNPARGS(action2, 0), "Unreach %d",
					cmd->arg1);
			break;

		case O_UNREACH6:
			if (cmd->arg1==ICMP6_UNREACH_RST)
				action = "Reset";
			else
				snprintf(SNPARGS(action2, 0), "Unreach %d",
					cmd->arg1);
			break;

		case O_ACCEPT:
			action = "Accept";
			break;
		case O_COUNT:
			action = "Count";
			break;
		case O_DIVERT:
			snprintf(SNPARGS(action2, 0), "Divert %d",
				cmd->arg1);
			break;
		case O_TEE:
			snprintf(SNPARGS(action2, 0), "Tee %d",
				cmd->arg1);
			break;
		case O_SETFIB:
			snprintf(SNPARGS(action2, 0), "SetFib %d",
				cmd->arg1);
			break;
		case O_SKIPTO:
			snprintf(SNPARGS(action2, 0), "SkipTo %d",
				cmd->arg1);
			break;
		case O_PIPE:
			snprintf(SNPARGS(action2, 0), "Pipe %d",
				cmd->arg1);
			break;
		case O_QUEUE:
			snprintf(SNPARGS(action2, 0), "Queue %d",
				cmd->arg1);
			break;
		case O_FORWARD_IP: {
			ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
			int len;
			struct in_addr dummyaddr;
			/*
			 * Private buffer for the dotted-quad text:
			 * inet_ntoa() formats into a single shared static
			 * buffer, which concurrent callers would corrupt.
			 */
			char addrbuf[INET_ADDRSTRLEN];

			if (sa->sa.sin_addr.s_addr == INADDR_ANY)
				dummyaddr.s_addr = htonl(tablearg);
			else
				dummyaddr.s_addr = sa->sa.sin_addr.s_addr;

			inet_ntoa_r(dummyaddr, addrbuf);
			len = snprintf(SNPARGS(action2, 0), "Forward to %s",
				addrbuf);

			if (sa->sa.sin_port)
				snprintf(SNPARGS(action2, len), ":%d",
				    sa->sa.sin_port);
			}
			break;
		case O_NETGRAPH:
			snprintf(SNPARGS(action2, 0), "Netgraph %d",
				cmd->arg1);
			break;
		case O_NGTEE:
			snprintf(SNPARGS(action2, 0), "Ngtee %d",
				cmd->arg1);
			break;
		case O_NAT:
			action = "Nat";
 			break;
		case O_REASS:
			action = "Reass";
			break;
		default:
			action = "UNKNOWN";
			break;
		}
	}

	if (hlen == 0) {	/* non-ip */
		snprintf(SNPARGS(proto, 0), "MAC");

	} else {
		int len;
#ifdef INET6
		char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
#else
		char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
#endif
		struct icmphdr *icmp;
		struct tcphdr *tcp;
		struct udphdr *udp;
#ifdef INET6
		struct ip6_hdr *ip6 = NULL;
		struct icmp6_hdr *icmp6;
#endif
		src[0] = '\0';
		dst[0] = '\0';
		/* format the addresses and locate the transport header */
#ifdef INET6
		if (IS_IP6_FLOW_ID(&(args->f_id))) {
			char ip6buf[INET6_ADDRSTRLEN];
			snprintf(src, sizeof(src), "[%s]",
			    ip6_sprintf(ip6buf, &args->f_id.src_ip6));
			snprintf(dst, sizeof(dst), "[%s]",
			    ip6_sprintf(ip6buf, &args->f_id.dst_ip6));

			ip6 = (struct ip6_hdr *)ip;
			tcp = (struct tcphdr *)(((char *)ip) + hlen);
			udp = (struct udphdr *)(((char *)ip) + hlen);
		} else
#endif
		{
			tcp = L3HDR(struct tcphdr, ip);
			udp = L3HDR(struct udphdr, ip);

			inet_ntoa_r(ip->ip_src, src);
			inet_ntoa_r(ip->ip_dst, dst);
		}

		/*
		 * Ports and ICMP type/code live in the first fragment
		 * only, so they are printed just when offset == 0.
		 */
		switch (args->f_id.proto) {
		case IPPROTO_TCP:
			len = snprintf(SNPARGS(proto, 0), "TCP %s", src);
			if (offset == 0)
				snprintf(SNPARGS(proto, len), ":%d %s:%d",
				    ntohs(tcp->th_sport),
				    dst,
				    ntohs(tcp->th_dport));
			else
				snprintf(SNPARGS(proto, len), " %s", dst);
			break;

		case IPPROTO_UDP:
			len = snprintf(SNPARGS(proto, 0), "UDP %s", src);
			if (offset == 0)
				snprintf(SNPARGS(proto, len), ":%d %s:%d",
				    ntohs(udp->uh_sport),
				    dst,
				    ntohs(udp->uh_dport));
			else
				snprintf(SNPARGS(proto, len), " %s", dst);
			break;

		case IPPROTO_ICMP:
			icmp = L3HDR(struct icmphdr, ip);
			if (offset == 0)
				len = snprintf(SNPARGS(proto, 0),
				    "ICMP:%u.%u ",
				    icmp->icmp_type, icmp->icmp_code);
			else
				len = snprintf(SNPARGS(proto, 0), "ICMP ");
			len += snprintf(SNPARGS(proto, len), "%s", src);
			snprintf(SNPARGS(proto, len), " %s", dst);
			break;
#ifdef INET6
		case IPPROTO_ICMPV6:
			icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen);
			if (offset == 0)
				len = snprintf(SNPARGS(proto, 0),
				    "ICMPv6:%u.%u ",
				    icmp6->icmp6_type, icmp6->icmp6_code);
			else
				len = snprintf(SNPARGS(proto, 0), "ICMPv6 ");
			len += snprintf(SNPARGS(proto, len), "%s", src);
			snprintf(SNPARGS(proto, len), " %s", dst);
			break;
#endif
		default:
			len = snprintf(SNPARGS(proto, 0), "P:%d %s",
			    args->f_id.proto, src);
			snprintf(SNPARGS(proto, len), " %s", dst);
			break;
		}

		/* describe the fragment, if the packet is one */
#ifdef INET6
		if (IS_IP6_FLOW_ID(&(args->f_id))) {
			if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG))
				snprintf(SNPARGS(fragment, 0),
				    " (frag %08x:%d@%d%s)",
				    args->f_id.extra,
				    ntohs(ip6->ip6_plen) - hlen,
				    ntohs(offset & IP6F_OFF_MASK) << 3,
				    (offset & IP6F_MORE_FRAG) ? "+" : "");
		} else
#endif
		{
			int ipoff, iplen;
			ipoff = ntohs(ip->ip_off);
			iplen = ntohs(ip->ip_len);
			if (ipoff & (IP_MF | IP_OFFMASK))
				snprintf(SNPARGS(fragment, 0),
				    " (frag %d:%d@%d%s)",
				    ntohs(ip->ip_id), iplen - (ip->ip_hl << 2),
				    offset << 3,
				    (ipoff & IP_MF) ? "+" : "");
		}
	}
#ifdef __FreeBSD__
	if (oif || m->m_pkthdr.rcvif)
		log(LOG_SECURITY | LOG_INFO,
		    "ipfw: %d %s %s %s via %s%s\n",
		    f ? f->rulenum : -1,
		    action, proto, oif ? "out" : "in",
		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
		    fragment);
	else
#endif
		log(LOG_SECURITY | LOG_INFO,
		    "ipfw: %d %s %s [no if info]%s\n",
		    f ? f->rulenum : -1,
		    action, proto, fragment);
	if (limit_reached)
		log(LOG_SECURITY | LOG_NOTICE,
		    "ipfw: limit %d reached on entry %d\n",
		    limit_reached, f ? f->rulenum : -1);
}
/* end of file */


================================================
FILE: sys/netinet/ipfw/ip_fw_lookup.c
================================================
/*-
 * Copyright (c) 2009 Luigi Rizzo Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_fw_table.c 200601 2009-12-16 10:48:40Z luigi $");

/*
 * Rule and pipe lookup support for ipfw.
 *

ipfw and dummynet need to quickly find objects (rules, pipes)
that may be dynamically created or destroyed.
To address the problem, we label each new object with a unique
32-bit identifier whose low K bits are the index in a lookup
table. All existing objects are referred by the lookup table,
and identifiers are chosen so that for each slot there is
at most one active object (whose identifier points to the slot).
This is almost a hash table, except that we can pick the
identifiers after looking at the table's occupation so
we have a trivial hash function and are collision free.

With this structure, operations are very fast and simple:
- the table has N entries s[i] with two fields, 'id' and 'ptr',
  with N <= M = 2^k (M is an upper bound to the size of the table);
- initially, all slots have s[i].id = i, and the pointers
  are used to build a freelist (tailq).
- a slot is considered empty if ptr == NULL or s[0] <= ptr < s[N].
  This is easy to detect and we can use ptr to build the freelist.
- when a new object is created, we put it in the empty slot i at the
  head of the freelist, and set the id to s[i].id;
- when an object is destroyed, we append its slot i to the end
  of the freelist, and set s[i].id += M (note M, not N).
- on a lookup for id = X, we look at slot i = X & (M-1),
  and consider the lookup successful only if the slot is not
  empty and s[i].id == X;
- wraps occur at most every F * 2^32/M operations, where F is
  the number of free slots. Because F is usually a reasonable
  fraction of M, we should not worry too much.
- if the table fills up, we can extend it by increasing N
- shrinking the table is more difficult as we might create
  collisions during the rehashing.
 *
 */

#include <sys/cdefs.h>
/*
 * Allocation and logging wrappers, so that the code below can be built
 * both in the kernel and as a userland test program:
 *   Malloc(n)  allocate n bytes
 *   Calloc(n)  allocate n zeroed bytes
 *   Free(p)    release memory obtained from the two above
 *   log(...)   diagnostic output, compiled out in the kernel
 */
#ifdef _KERNEL
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
MALLOC_DEFINE(M_IPFW_LUT, "ipfw_lookup", "IpFw lookup");
#define Malloc(n)	malloc(n, M_IPFW_LUT, M_WAITOK)
/*
 * NOTE(review): this relies on a three-argument calloc() being available
 * in the kernel environment; stock malloc(9) provides zeroing through
 * malloc(..., M_ZERO) -- confirm that this actually builds.
 */
#define Calloc(n)	calloc(n, M_IPFW_LUT, M_WAITOK | M_ZERO)
#define Free(p)		free(p, M_IPFW_LUT)

/* no diagnostics in the kernel */
#define log(x, arg...)

#else /* !_KERNEL */
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define Malloc(n)	malloc(n)
#define Calloc(n)	calloc(1, n)
#define Free(p)		free(p)
/* print to stderr, prefixed with the name of the calling function */
#define log(x, arg...)	fprintf(stderr, "%s: " x "\n", __FUNCTION__, ##arg)
#endif /* !_KERNEL */

/*
 * One slot of the lookup table.
 */
struct entry {
	uint32_t	id;	/* identifier currently assigned to the slot */
	struct entry	*ptr;	/* user object when the slot is in use;
				 * NULL or next free slot when it is empty */
};

/*
 * Table descriptor: the slot array plus the list of free slots.
 */
struct lookup_table {
	int _size;	/* N, number of slots in s[] */
	int used;	/* number of slots holding an object */
	int mask; /* 2^k -1, used for hashing */
	struct entry *f_head, *f_tail; /* freelist */
	struct entry *	s;	/* slots, array of N entries */
};

/*
 * Tell whether 'p', taken from the ptr field of a slot, marks the slot
 * as empty: either it is NULL or it points into the slot array itself
 * (i.e. it is a freelist link).
 */
static __inline int empty(struct lookup_table *head, const void *p)
{
	const struct entry *link = p;
	const struct entry *first = head->s;
	const struct entry *end = &head->s[head->_size];

	if (link == NULL)
		return 1;
	return (link >= first && link < end);
}

/*
 * init or reinit a table
 *
 * head:     table to grow, or NULL to create a new one.
 * new_size: requested number of slots (N).
 * mask:     only used on creation, as a hint for M: the actual M is the
 *           larger of mask and new_size, rounded up to a power of 2.
 *
 * Returns the table, or NULL on error (allocation failure, or new_size
 * larger than M for an existing table), in which case an existing
 * table is left untouched. A request that does not grow an existing
 * table is a no-op.
 */
struct lookup_table *
ipfw_lut_init(struct lookup_table *head, int new_size, int mask)
{
	int i;
	int created = 0;	/* set when head is allocated here */
	struct entry *s;	/* the new slots */
	struct entry *fh, *ft;	/* the freelist */

	if (head != NULL) {
		mask = head->mask;
		if (new_size <= head->_size)
			return head;
		/* N may grow up to M = mask + 1, but not beyond */
		if (new_size > mask + 1) {
			log("size larger than mask");
			return NULL;
		}
	} else {
		log("old is null, initialize");
		head = Calloc(sizeof(*head));
		if (head == NULL)
			return NULL;
		created = 1;
		if (new_size >= mask)
			mask = new_size;
		if (mask & (mask -1)) {
			for (i = 1; i < mask; i += i)
			    ;
			log("mask %d not 2^k, round up to %d", mask, i);
			mask = i;
		}
		mask = head->mask = mask - 1;
	}

	s = Calloc(new_size * sizeof(*s));
	if (s == NULL) {
		/* do not leak a descriptor that nobody else knows about */
		if (created)
			Free(head);
		return NULL;
	}
	if (!head->s) {
		/*
		 * New table: temporarily point at the new array with a
		 * size of 1, so that the loop below finds no old entry
		 * to carry over and puts every slot on the freelist.
		 */
		head->s = s;
		head->_size = 1;
	}
	fh = ft = NULL;
	/* remap the entries, adjust the freelist */
	for (i = 0; i < new_size; i++) {
		/* old slots keep their id, new ones start with id == index */
		s[i].id = (i >= head->_size) ? i : head->s[i].id;
		if (i < head->_size && !empty(head, head->s[i].ptr)) {
			s[i].ptr = head->s[i].ptr;
			continue;
		}
		/* empty slot, append it to the new freelist */
		if (fh == NULL)
			fh = &s[i];
		else
			ft->ptr = &s[i];
		ft = &s[i];
	}
	head->f_head = fh;
	head->f_tail = ft;

	/* write lock on the structure, to protect the readers */
	fh = head->s;
	head->s = s;
	head->_size = new_size;
	/* release write lock */
	if (fh != s)
		Free(fh);
	log("done");
	return head;
}

/*
 * Store object 'd' in the first slot of the freelist.
 * Returns the id assigned to the object, or -1 if no slot is free.
 */
int
ipfw_lut_insert(struct lookup_table *head, void *d)
{
	struct entry *slot = head->f_head;

	if (slot == NULL)
		return -1;

	/* unlink the slot from the freelist before reusing its ptr */
	head->f_head = slot->ptr;
	slot->ptr = d;
	head->used++;

	return slot->id;
}

/*
 * delete, returns the original entry
 *
 * Returns NULL, leaving the table untouched, when 'id' does not name
 * an object currently stored in the table: index beyond the table,
 * stale id, or slot that is free at the moment.
 */
void *
ipfw_lut_delete(struct lookup_table *head, int id)
{
	int i = id & head->mask;
	void *result;
	struct entry *e;

	if (i >= head->_size)
		return NULL;
	e = &head->s[i];
	if (e->id != id)
		return NULL;
	/*
	 * A free slot carries the id of its next user. Releasing it again
	 * would link it twice into the freelist and corrupt 'used'.
	 */
	if (empty(head, e->ptr))
		return NULL;
	result = e->ptr;
	/* write lock to invalidate the entry to readers */
	e->id += head->mask + 1; /* prepare for next insert */
	e->ptr = NULL;
	/* release write lock */
	/* append the slot to the tail of the freelist */
	if (head->f_head == NULL)
		head->f_head = e;
	else
		head->f_tail->ptr = e;
	head->f_tail = e;
	head->used--;
	return result;
}

/*
 * Return the object stored under 'id', or NULL if there is none.
 * As described at the top of the file, a lookup succeeds only if the
 * slot is not empty and its id matches.
 */
void *
ipfw_lut_lookup(struct lookup_table *head, int id)
{
	int i = id & head->mask;
	struct entry *e;

	if (i >= head->_size)
		return NULL;
	e = &head->s[i];
	if (e->id != id)
		return NULL;
	/*
	 * A free slot already carries the id of its next user, and its
	 * ptr is a freelist link: never hand that out as an object.
	 */
	if (empty(head, e->ptr))
		return NULL;
	return e->ptr;
}

/*
 * Debugging aid: log the table descriptor and then one line per slot
 * with index, id, an 'E' flag for empty slots and the slot content.
 * For empty slots linked in the freelist the content is shown as the
 * index of the next free slot rather than as an address.
 */
void
ipfw_lut_dump(struct lookup_table *head)
{
	int i;

	/* pointer differences are ptrdiff_t: convert for the %d format */
	log("head %p size %d used %d freelist %d",
	    head, head->_size, head->used, head->f_head ?
		    (int)(head->f_head - head->s) : -1);
	for (i = 0; i < head->_size; i++) {
		struct entry *e = &head->s[i];
		char ee = empty(head, e->ptr) ? 'E' : ' ';
		log("%5d  %5d %c %p", i, e->id, ee,
		    ee == 'E' && e->ptr != NULL ?
		    (void *)((struct entry *)e->ptr - head->s) : e->ptr);
	}
}

#ifndef _KERNEL
/*
 * Test helper: for each of the first p->_size entries of 'map', look up
 * the id stored there and log it together with the value found, which
 * is printed as a character.
 * NOTE(review): 'map' must hold at least p->_size initialized ids;
 * this is up to the caller, nothing here checks it.
 */
void dump_p(struct lookup_table *p, int *map)
{
	int i;
	for (i = 0; i < p->_size; i++) {
	    /* the test stores small integers in place of pointers */
	    int id = (int)ipfw_lut_lookup(p, map[i]);
	    log("%3d: %3d: %c", map[i] % 64, i, id);
	}
}
/*
 * Userland self test: fill a table with the characters of a string,
 * interleaving each insertion with a few insert/delete pairs so that
 * ids get recycled, then check that the entries survive a reinit.
 * Returns 0 on success, 1 if a table cannot be (re)initialized.
 */
int main(int argc, char *argv[])
{
	int i, j, l;
#define S 1000
	int map[S];
	struct lookup_table *p;
	struct lookup_table *p1;
	const char *m = "nel mezzo del cammin di nostra vita mi ritrovai"
		" in una selva oscura e la diritta via era smarrita!";

	fprintf(stderr, "testing lookup\n");

	l = strlen(m);

	/*
	 * dump_p() walks one map entry per table slot, and the table has
	 * more slots than the string has characters: give the unused
	 * entries a defined value instead of stack garbage.
	 */
	for (i = 0; i < S; i++)
		map[i] = -1;

	p = ipfw_lut_init(NULL, 120, 33);
	if (!p)
		return 1;

	ipfw_lut_dump(p);
	for (i = 0; i < l; i++) {
	    int x = m[i];
	    int id = ipfw_lut_insert(p, (void *)x);
	    //ipfw_lut_dump(p);
	    map[i] = id;
	    for (j=0; j < 10; j++) {
		    id = ipfw_lut_insert(p, (void *)'a');
		    // ipfw_lut_dump(p);
		    ipfw_lut_delete(p, id);
	    	    // ipfw_lut_dump(p);
	    }
	//    ipfw_lut_dump(p);
	} 
	dump_p(p, map);
	/* smaller than the current size: the table must come back as is */
	p1 = ipfw_lut_init(p, 23, 0);
	if (!p1)
		return 1;
	dump_p(p1, map);
	p1 = ipfw_lut_init(p1, 120, 0);
	if (!p1)
		return 1;
	dump_p(p1, map);
	return 0;
}
#endif
/* end of file */


================================================
FILE: sys/netinet/ipfw/ip_fw_nat.c
================================================
/*-
 * Copyright (c) 2008 Paolo Pisati
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_fw_nat.c 200975 2009-12-25 01:15:39Z luigi $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/rwlock.h>

#define        IPFW_INTERNAL   /* Access to protected data structures in ip_fw.h. */

#include <netinet/libalias/alias.h>
#include <netinet/libalias/alias_local.h>

#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_fw.h>
#include <netinet/ipfw/ip_fw_private.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>

#include <machine/in_cksum.h>	/* XXX for in_cksum */

static VNET_DEFINE(eventhandler_tag, ifaddr_event_tag);
#define	V_ifaddr_event_tag	VNET(ifaddr_event_tag)

/*
 * Interface address change handler: refresh the alias address of every
 * nat instance that is bound to the interface 'ifp' by name.
 */
static void
ifaddr_change(void *arg, struct ifnet *ifp)
{
	struct ip_fw_chain *chain;
	struct cfg_nat *nat;
	struct ifaddr *ifa;

	(void)arg;
	chain = &V_layer3_chain;

	IPFW_WLOCK(chain);
	LIST_FOREACH(nat, &chain->nat, _next) {
		/* only instances using 'ifp->if_xname' for their address */
		if (strncmp(nat->if_name, ifp->if_xname, IF_NAMESIZE) == 0) {
			if_addr_rlock(ifp);
			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				/* only IPv4 addresses are of interest */
				if (ifa->ifa_addr == NULL ||
				    ifa->ifa_addr->sa_family != AF_INET)
					continue;
				nat->ip = ((struct sockaddr_in *)
				    (ifa->ifa_addr))->sin_addr;
				LibAliasSetAddress(nat->lib, nat->ip);
			}
			if_addr_runlock(ifp);
		}
	}
	IPFW_WUNLOCK(chain);
}

/*
 * Clear the nat pointer cached in the O_NAT action of the rules:
 * only those referring to instance 'ix', or all of them if ix < 0.
 * The chain must be write-locked by the caller.
 */
static void
flush_nat_ptrs(struct ip_fw_chain *chain, const int ix)
{
	ipfw_insn_nat *cmd;
	int i;

	IPFW_WLOCK_ASSERT(chain);
	for (i = 0; i < chain->n_rules; i++) {
		cmd = (ipfw_insn_nat *)ACTION_PTR(chain->map[i]);
		/* XXX skip log and the like ? */
		if (cmd->o.opcode != O_NAT || cmd->nat == NULL)
			continue;
		if (ix < 0 || cmd->nat->id == ix)
			cmd->nat = NULL;
	}
}

/*
 * Tear down all redirect entries on 'head' that belong to nat instance 'n':
 * remove the libalias links, free the attached spool entries and finally
 * the redirect entry itself.  Entries with an unknown mode are reported
 * and left on the list.
 */
static void
del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head)
{
	struct cfg_redir *redir, *next_redir;
	struct cfg_spool *spool, *next_spool;
	int idx, nlinks;

	LIST_FOREACH_SAFE(redir, head, _next, next_redir) {
		if (redir->mode != REDIR_PORT && redir->mode != REDIR_ADDR &&
		    redir->mode != REDIR_PROTO) {
			printf("unknown redirect mode: %u\n", redir->mode);
			/* XXX - panic?!?!? */
			continue;
		}

		/* Port redirects own one alias_link per port, others one. */
		if (redir->mode == REDIR_PORT)
			nlinks = redir->pport_cnt;
		else
			nlinks = 1;

		/* Delete all libalias redirect entries. */
		for (idx = 0; idx < nlinks; idx++)
			LibAliasRedirectDelete(n->lib, redir->alink[idx]);

		/* Drop the spool configuration, if any. */
		LIST_FOREACH_SAFE(spool, &redir->spool_chain, _next,
		    next_spool) {
			LIST_REMOVE(spool, _next);
			free(spool, M_IPFW);
		}

		free(redir->alink, M_IPFW);
		LIST_REMOVE(redir, _next);
		free(redir, M_IPFW);
	}
}

static int
add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
{
	struct cfg_redir *r, *ser_r;
	struct cfg_spool *s, *ser_s;
	int cnt, off, i;

	for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) {
		ser_r = (struct cfg_redir *)&buf[off];
		r = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO);
		memcpy(r, ser_r, SOF_REDIR);
		LIST_INIT(&r->spool_chain);
		off += SOF_REDIR;
		r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt,
		    M_IPFW, M_WAITOK | M_ZERO);
		switch (r->mode) {
		case REDIR_ADDR:
			r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr,
			    r->paddr);
			break;
		case REDIR_PORT:
			for (i = 0 ; i < r->pport_cnt; i++) {
				/* If remotePort is all ports, set it to 0. */
				u_short remotePortCopy = r->rport + i;
				if (r->rport_cnt == 1 && r->rport == 0)
					remotePortCopy = 0;
				r->alink[i] = LibAliasRedirectPort(ptr->lib,
				    r->laddr, htons(r->lport + i), r->raddr,
				    htons(remotePortCopy), r->paddr,
				    htons(r->pport + i), r->proto);
				if (r->alink[i] == NULL) {
					r->alink[0] = NULL;
					break;
				}
			}
			break;
		case REDIR_PROTO:
			r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr,
			    r->raddr, r->paddr, r->proto);
			break;
		default:
			printf("unknown redirect mode: %u\n", r->mode);
			break;
		}
		/* XXX perhaps return an error instead of panic ? */
		if (r->alink[0] == NULL)
			panic("LibAliasRedirect* returned NULL");
		/* LSNAT handling. */
		for (i = 0; i < r->spool_cnt; i++) {
			ser_s = (struct cfg_spool *)&buf[off];
			s = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO);
			memcpy(s, ser_s, SOF_SPOOL);
			LibAliasAddServer(ptr->lib, r->alink[0],
			    s->addr, htons(s->port));
			off += SOF_SPOOL;
			/* Hook spool entry. */
			LIST_INSERT_HEAD(&r->spool_chain, s, _next);
		}
		/* And finally hook this redir entry. */
		LIST_INSERT_HEAD(&ptr->redir_chain, r, _next);
	}
	return (1);
}

/*
 * Run packet 'm' through the libalias instance of nat entry 't'.
 *
 * The packet is first flattened with m_megapullup(); from then on only
 * the returned mbuf 'mcl' is used.  args->oif selects the direction:
 * NULL means inbound (LibAliasIn), otherwise outbound (LibAliasOut).
 *
 * Returns IP_FW_NAT with args->m set to the translated packet, or
 * IP_FW_DENY with args->m set to NULL when the packet was dropped.
 */
static int
ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m)
{
	struct mbuf *mcl;
	struct ip *ip;
	/* XXX - libalias duct tape */
	int ldt, retval;
	char *c;

	ldt = 0;
	retval = 0;
	mcl = m_megapullup(m, m->m_pkthdr.len);
	if (mcl == NULL) {
		args->m = NULL;
		return (IP_FW_DENY);
	}
	ip = mtod(mcl, struct ip *);

	/*
	 * XXX - Libalias checksum offload 'duct tape':
	 *
	 * locally generated packets have only pseudo-header checksum
	 * calculated and libalias will break it[1], so mark them for
	 * later fix.  Moreover there are cases when libalias modifies
	 * tcp packet data[2], mark them for later fix too.
	 *
	 * [1] libalias was never meant to run in kernel, so it does
	 * not have any knowledge about checksum offloading, and
	 * expects a packet with a full internet checksum.
	 * Unfortunately, packets generated locally will have just the
	 * pseudo header calculated, and when libalias tries to adjust
	 * the checksum it will actually compute a wrong value.
	 *
	 * [2] when libalias modifies tcp's data content, full TCP
	 * checksum has to be recomputed: the problem is that
	 * libalias does not have any idea about checksum offloading.
	 * To work around this, we do not do checksumming in LibAlias,
	 * but only mark the packets in th_x2 field. If we receive a
	 * marked packet, we calculate correct checksum for it
	 * aware of offloading.  Why such a terrible hack instead of
	 * recalculating checksum for each packet?
	 * Because the previous checksum was not checked!
	 * Recalculating checksums for EVERY packet will hide ALL
	 * transmission errors. Yes, marked packets still suffer from
	 * this problem. But, sigh, natd(8) has this problem, too.
	 *
	 * TODO: -make libalias mbuf aware (so
	 * it can handle delayed checksum and tso)
	 */

	if (mcl->m_pkthdr.rcvif == NULL &&
	    mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
		ldt = 1;

	c = mtod(mcl, char *);
	if (args->oif == NULL)
		retval = LibAliasIn(t->lib, c,
			mcl->m_len + M_TRAILINGSPACE(mcl));
	else
		retval = LibAliasOut(t->lib, c,
			mcl->m_len + M_TRAILINGSPACE(mcl));
	if (retval == PKT_ALIAS_RESPOND) {
		/*
		 * Flag the packet we actually hand back (mcl); 'm' must
		 * not be touched after m_megapullup() since it may have
		 * been replaced by mcl.
		 */
		mcl->m_flags |= M_SKIP_FIREWALL;
		retval = PKT_ALIAS_OK;
	}
	if (retval != PKT_ALIAS_OK &&
	    retval != PKT_ALIAS_FOUND_HEADER_FRAGMENT) {
		/* XXX - should i add some logging? */
		m_free(mcl);
		args->m = NULL;
		return (IP_FW_DENY);
	}
	/* libalias may have changed the packet size: resync mbuf lengths. */
	mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len);

	/*
	 * XXX - libalias checksum offload
	 * 'duct tape' (see above)
	 */

	if ((ip->ip_off & htons(IP_OFFMASK)) == 0 &&
	    ip->ip_p == IPPROTO_TCP) {
		struct tcphdr 	*th;

		th = (struct tcphdr *)(ip + 1);
		if (th->th_x2)
			ldt = 1;
	}

	if (ldt) {
		struct tcphdr 	*th;
		struct udphdr 	*uh;
		u_short cksum;

		/* Work with ip_len in host order while fixing the checksum. */
		ip->ip_len = ntohs(ip->ip_len);
		cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons(ip->ip_p + ip->ip_len - (ip->ip_hl << 2)));

		switch (ip->ip_p) {
		case IPPROTO_TCP:
			th = (struct tcphdr *)(ip + 1);
			/*
			 * Maybe it was set in
			 * libalias...
			 */
			th->th_x2 = 0;
			th->th_sum = cksum;
			mcl->m_pkthdr.csum_data =
			    offsetof(struct tcphdr, th_sum);
			break;
		case IPPROTO_UDP:
			uh = (struct udphdr *)(ip + 1);
			uh->uh_sum = cksum;
			mcl->m_pkthdr.csum_data =
			    offsetof(struct udphdr, uh_sum);
			break;
		}
		/* No hw checksum offloading: do it ourselves */
		if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) {
			in_delayed_cksum(mcl);
			mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
		}
		ip->ip_len = htons(ip->ip_len);
	}
	args->m = mcl;
	return (IP_FW_NAT);
}

static struct cfg_nat *
lookup_nat(struct nat_list *l, int nat_id)
{
	struct cfg_nat *res;

	LIST_FOREACH(res, l, _next) {
		if (res->id == nat_id)
			break;
	}
	return res;
}

static int
ipfw_nat_cfg(struct sockopt *sopt)
{
	struct cfg_nat *ptr, *ser_n;
	char *buf;
	struct ip_fw_chain *chain = &V_layer3_chain;

	buf = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
	sooptcopyin(sopt, buf, NAT_BUF_LEN, sizeof(struct cfg_nat));
	ser_n = (struct cfg_nat *)buf;

	/* check valid parameter ser_n->id > 0 ? */
	/*
	 * Find/create nat rule.
	 */
	IPFW_WLOCK(chain);
	ptr = lookup_nat(&chain->nat, ser_n->id);
	if (ptr == NULL) {
		/* New rule: allocate and init new instance. */
		ptr = malloc(sizeof(struct cfg_nat),
		    M_IPFW, M_NOWAIT | M_ZERO);
		if (ptr == NULL) {
			IPFW_WUNLOCK(chain);
			free(buf, M_IPFW);
			return (ENOSPC);
		}
		ptr->lib = LibAliasInit(NULL);
		if (ptr->lib == NULL) {
			IPFW_WUNLOCK(chain);
			free(ptr, M_IPFW);
			free(buf, M_IPFW);
			return (EINVAL);
		}
		LIST_INIT(&ptr->redir_chain);
	} else {
		/* Entry already present: temporarly unhook it. */
		LIST_REMOVE(ptr, _next);
		flush_nat_ptrs(chain, ser_n->id);
	}
	IPFW_WUNLOCK(chain);

	/*
	 * Basic nat configuration.
	 */
	ptr->id = ser_n->id;
	/*
	 * XXX - what if this rule doesn't nat any ip and just
	 * redirect?
	 * do we set aliasaddress to 0.0.0.0?
	 */
	ptr->ip = ser_n->ip;
	ptr->redir_cnt = ser_n->redir_cnt;
	ptr->mode = ser_n->mode;
	LibAliasSetMode(ptr->lib, ser_n->mode, ser_n->mode);
	LibAliasSetAddress(ptr->lib, ptr->ip);
	memcpy(ptr->if_name, ser_n->if_name, IF_NAMESIZE);

	/*
	 * Redir and LSNAT configuration.
	 */
	/* Delete old cfgs. */
	del_redir_spool_cfg(ptr, &ptr->redir_chain);
	/* Add new entries. */
	add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr);
	free(buf, M_IPFW);
	IPFW_WLOCK(chain);
	LIST_INSERT_HEAD(&chain->nat, ptr, _next);
	IPFW_WUNLOCK(chain);
	return (0);
}

static int
ipfw_nat_del(struct sockopt *sopt)
{
	struct cfg_nat *ptr;
	struct ip_fw_chain *chain = &V_layer3_chain;
	int i;

	sooptcopyin(sopt, &i, sizeof i, sizeof i);
	/* XXX validate i */
	IPFW_WLOCK(chain);
	ptr = lookup_nat(&chain->nat, i);
	if (ptr == NULL) {
		IPFW_WUNLOCK(chain);
		return (EINVAL);
	}
	LIST_REMOVE(ptr, _next);
	flush_nat_ptrs(chain, i);
	IPFW_WUNLOCK(chain);
	del_redir_spool_cfg(ptr, &ptr->redir_chain);
	LibAliasUninit(ptr->lib);
	free(ptr, M_IPFW);
	return (0);
}

/*
 * Sockopt handler: export the configuration of all nat instances.
 *
 * Output layout: an int with the number of instances, then for each
 * instance its cfg_nat record followed by its cfg_redir records, each of
 * which is followed by its cfg_spool records.
 *
 * Returns 0 on success, ENOSPC if the data does not fit in NAT_BUF_LEN,
 * or the error reported by sooptcopyout().
 */
static int
ipfw_nat_get_cfg(struct sockopt *sopt)
{
	uint8_t *data;
	struct cfg_nat *n;
	struct cfg_redir *r;
	struct cfg_spool *s;
	int nat_cnt, off;
	struct ip_fw_chain *chain;
	int err = ENOSPC;

	chain = &V_layer3_chain;
	nat_cnt = 0;
	/* Leave room at the start of the buffer for the instance count. */
	off = sizeof(nat_cnt);

	data = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
	IPFW_RLOCK(chain);
	/* Serialize all the data. */
	LIST_FOREACH(n, &chain->nat, _next) {
		nat_cnt++;
		if (off + SOF_NAT >= NAT_BUF_LEN)
			goto nospace;
		bcopy(n, &data[off], SOF_NAT);
		off += SOF_NAT;
		LIST_FOREACH(r, &n->redir_chain, _next) {
			if (off + SOF_REDIR >= NAT_BUF_LEN)
				goto nospace;
			bcopy(r, &data[off], SOF_REDIR);
			off += SOF_REDIR;
			LIST_FOREACH(s, &r->spool_chain, _next) {
				if (off + SOF_SPOOL >= NAT_BUF_LEN)
					goto nospace;
				bcopy(s, &data[off], SOF_SPOOL);
				off += SOF_SPOOL;
			}
		}
	}
	err = 0; /* all good */
nospace:
	IPFW_RUNLOCK(chain);
	if (err == 0) {
		bcopy(&nat_cnt, data, sizeof(nat_cnt));
		/* Report copy-out failures instead of claiming success. */
		err = sooptcopyout(sopt, data, NAT_BUF_LEN);
	} else {
		printf("serialized data buffer not big enough:"
		    "please increase NAT_BUF_LEN\n");
	}
	free(data, M_IPFW);
	return (err);
}

static int
ipfw_nat_get_log(struct sockopt *sopt)
{
	uint8_t *data;
	struct cfg_nat *ptr;
	int i, size;
	struct ip_fw_chain *chain;

	chain = &V_layer3_chain;

	IPFW_RLOCK(chain);
	/* one pass to count, one to copy the data */
	i = 0;
	LIST_FOREACH(ptr, &chain->nat, _next) {
		if (ptr->lib->logDesc == NULL)
			continue;
		i++;
	}
	size = i * (LIBALIAS_BUF_SIZE + sizeof(int));
	data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO);
	if (data == NULL) {
		IPFW_RUNLOCK(chain);
		return (ENOSPC);
	}
	i = 0;
	LIST_FOREACH(ptr, &chain->nat, _next) {
		if (ptr->lib->logDesc == NULL)
			continue;
		bcopy(&ptr->id, &data[i], sizeof(int));
		i += sizeof(int);
		bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE);
		i += LIBALIAS_BUF_SIZE;
	}
	IPFW_RUNLOCK(chain);
	sooptcopyout(sopt, data, size);
	free(data, M_IPFW);
	return(0);
}

static void
ipfw_nat_init(void)
{

	IPFW_WLOCK(&V_layer3_chain);
	/* init ipfw hooks */
	ipfw_nat_ptr = ipfw_nat;
	lookup_nat_ptr = lookup_nat;
	ipfw_nat_cfg_ptr = ipfw_nat_cfg;
	ipfw_nat_del_ptr = ipfw_nat_del;
	ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg;
	ipfw_nat_get_log_ptr = ipfw_nat_get_log;
	IPFW_WUNLOCK(&V_layer3_chain);
	V_ifaddr_event_tag = EVENTHANDLER_REGISTER(
	    ifaddr_event, ifaddr_change,
	    NULL, EVENTHANDLER_PRI_ANY);
}

static void
ipfw_nat_destroy(void)
{
	struct cfg_nat *ptr, *ptr_temp;
	struct ip_fw_chain *chain;

	chain = &V_layer3_chain;
	IPFW_WLOCK(chain);
	LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) {
		LIST_REMOVE(ptr, _next);
		del_redir_spool_cfg(ptr, &ptr->redir_chain);
		LibAliasUninit(ptr->lib);
		free(ptr, M_IPFW);
	}
	EVENTHANDLER_DEREGISTER(ifaddr_event, V_ifaddr_event_tag);
	flush_nat_ptrs(chain, -1 /* flush all */);
	/* deregister ipfw_nat */
	ipfw_nat_ptr = NULL;
	lookup_nat_ptr = NULL;
	ipfw_nat_cfg_ptr = NULL;
	ipfw_nat_del_ptr = NULL;
	ipfw_nat_get_cfg_ptr = NULL;
	ipfw_nat_get_log_ptr = NULL;
	IPFW_WUNLOCK(chain);
}

/*
 * Module event handler: set up on MOD_LOAD, tear down on MOD_UNLOAD,
 * reject every other event with EOPNOTSUPP.
 */
static int
ipfw_nat_modevent(module_t mod, int type, void *unused)
{
	if (type == MOD_LOAD) {
		ipfw_nat_init();
		return 0;
	}
	if (type == MOD_UNLOAD) {
		ipfw_nat_destroy();
		return 0;
	}
	return EOPNOTSUPP;
}

/* Module descriptor: name, event handler, and (unused) extra argument. */
static moduledata_t ipfw_nat_mod = {
	"ipfw_nat",
	ipfw_nat_modevent,
	0
};

/*
 * Register the module and declare that it depends on both libalias
 * (used for the actual translation) and ipfw (which owns the rule chain
 * and the hook pointers filled in by ipfw_nat_init()).
 */
DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1);
MODULE_DEPEND(ipfw_nat, ipfw, 2, 2, 2);
MODULE_VERSION(ipfw_nat, 1);
/* end of file */
/* end of file */


================================================
FILE: sys/netinet/ipfw/ip_fw_pfil.c
================================================
/*-
 * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_pfil.c 200601 2009-12-16 10:48:40Z luigi $");

#if !defined(KLD_MODULE)
#include "opt_ipfw.h"
#include "opt_ipdn.h"
#include "opt_inet.h"
#ifndef INET
#error IPFIREWALL requires INET.
#endif /* INET */
#endif /* KLD_MODULE */
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/socket.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/route.h>
#include <net/pfil.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_fw.h>
#include <netinet/ipfw/ip_fw_private.h>
#include <netgraph/ng_ipfw.h>

#include <machine/in_cksum.h>

/*
 * Enable flag for IPv4 filtering, on by default.  It is exported through
 * the "enable" sysctl below and changed via ipfw_chg_hook().
 */
static VNET_DEFINE(int, fw_enable) = 1;
#define V_fw_enable	VNET(fw_enable)

#ifdef INET6
/* Same as above, for IPv6. */
static VNET_DEFINE(int, fw6_enable) = 1;
#define V_fw6_enable	VNET(fw6_enable)
#endif

/* Sysctl handler that attaches/detaches the pfil hooks on enable changes. */
int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);

/* Forward declarations. */
static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int);

#ifdef SYSCTL_NODE

/*
 * SYSBEGIN/SYSEND bracket the sysctl declarations of this file; see
 * ip_fw_private.h for their purpose on platforms without SYSCTL support.
 */
SYSBEGIN(f1)

SYSCTL_DECL(_net_inet_ip_fw);
SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0,
    ipfw_chg_hook, "I", "Enable ipfw");
#ifdef INET6
SYSCTL_DECL(_net_inet6_ip6_fw);
SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0,
    ipfw_chg_hook, "I", "Enable ipfw+6");
#endif /* INET6 */

SYSEND

#endif /* SYSCTL_NODE */

/*
 * The pfilter hook to pass packets to ipfw_chk and then to
 * dummynet, divert, netgraph or other modules.
 * The packet may be consumed.
 */
int
ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
    struct inpcb *inp)
{
	struct ip_fw_args args;
	struct m_tag *tag;
	int ipfw;
	int ret;

	/* all the processing now uses ip_len in net format */
	if (mtod(*m0, struct ip *)->ip_v == 4)
		SET_NET_IPLEN(mtod(*m0, struct ip *));

	/* convert dir to IPFW values */
	dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT;
	bzero(&args, sizeof(args));

again:
	/*
	 * extract and remove the tag if present. If we are left
	 * with onepass, optimize the outgoing path.
	 */
	tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL);
	if (tag != NULL) {
		args.rule = *((struct ipfw_rule_ref *)(tag+1));
		m_tag_delete(*m0, tag);
		if (args.rule.info & IPFW_ONEPASS) {
			SET_HOST_IPLEN(mtod(*m0, struct ip *));
			return 0;
		}
	}

	args.m = *m0;
	args.oif = dir == DIR_OUT ? ifp : NULL;
	args.inp = inp;

	ipfw = ipfw_chk(&args);
	*m0 = args.m;

	KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL",
	    __func__));

	/* breaking out of the switch means drop */
	ret = 0;	/* default return value for pass */
	switch (ipfw) {
	case IP_FW_PASS:
		/* next_hop may be set by ipfw_chk */
		if (args.next_hop == NULL)
			break; /* pass */
#ifndef IPFIREWALL_FORWARD
		ret = EACCES;
#else
	    {
		struct m_tag *fwd_tag;

		/* Incoming packets should not be tagged so we do not
		 * m_tag_find. Outgoing packets may be tagged, so we
		 * reuse the tag if present.
		 */
		fwd_tag = (dir == DIR_IN) ? NULL :
			m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL);
		if (fwd_tag != NULL) {
			m_tag_unlink(*m0, fwd_tag);
		} else {
			fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD,
				sizeof(struct sockaddr_in), M_NOWAIT);
			if (fwd_tag == NULL) {
				ret = EACCES;
				break; /* i.e. drop */
			}
		}
		bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in));
		m_tag_prepend(*m0, fwd_tag);

		if (in_localip(args.next_hop->sin_addr))
			(*m0)->m_flags |= M_FASTFWD_OURS;
	    }
#endif
		break;

	case IP_FW_DENY:
		ret = EACCES;
		break; /* i.e. drop */

	case IP_FW_DUMMYNET:
		ret = EACCES;
		if (ip_dn_io_ptr == NULL)
			break; /* i.e. drop */
		if (mtod(*m0, struct ip *)->ip_v == 4)
			ret = ip_dn_io_ptr(m0, dir, &args);
		else if (mtod(*m0, struct ip *)->ip_v == 6)
			ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args);
		else
			break; /* drop it */
		/*
		 * XXX should read the return value.
		 * dummynet normally eats the packet and sets *m0=NULL
		 * unless the packet can be sent immediately. In this
		 * case args is updated and we should re-run the
		 * check without clearing args.
		 */
		if (*m0 != NULL)
			goto again;
		break;

	case IP_FW_TEE:
	case IP_FW_DIVERT:
		if (ip_divert_ptr == NULL) {
			ret = EACCES;
			break; /* i.e. drop */
		}
		ret = ipfw_divert(m0, dir, &args.rule,
			(ipfw == IP_FW_TEE) ? 1 : 0);
		/* continue processing for the original packet (tee). */
		if (*m0)
			goto again;
		break;

	case IP_FW_NGTEE:
	case IP_FW_NETGRAPH:
		if (ng_ipfw_input_p == NULL) {
			ret = EACCES;
			break; /* i.e. drop */
		}
		ret = ng_ipfw_input_p(m0, dir, &args,
			(ipfw == IP_FW_NGTEE) ? 1 : 0);
		if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */
			goto again;	/* continue with packet */
		break;

	case IP_FW_NAT:
		/* honor one-pass in case of successful nat */
		if (V_fw_one_pass)
			break; /* ret is already 0 */
		goto again;

	case IP_FW_REASS:
		goto again;		/* continue with packet */
	
	default:
		KASSERT(0, ("%s: unknown retval", __func__));
	}

	if (ret != 0) {
		if (*m0)
			FREE_PKT(*m0);
		*m0 = NULL;
	}
	if (*m0 && mtod(*m0, struct ip *)->ip_v == 4)
		SET_HOST_IPLEN(mtod(*m0, struct ip *));
	return ret;
}

/*
 * Hand a packet to the divert socket; returns 1 on error, 0 on success.
 *
 * ipfw_chk() has already tagged the packet with the divert tag.
 * With 'tee' set a duplicate is diverted and *m0 is left untouched;
 * otherwise the original packet is consumed and *m0 is set to NULL.
 */
static int
ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule,
	int tee)
{
	struct mbuf *pkt;
	struct m_tag *mtag;
	struct ip *iph;

	if (tee) {
		/*
		 * If the packet cannot be duplicated we give up on the
		 * divert and let the caller go on with the original.
		 */
		pkt = m_dup(*m0, M_DONTWAIT);
		if (pkt == NULL)
			return 1;
	} else {
		/* Not a tee: take over the original mbuf. */
		pkt = *m0;
		*m0 = NULL;
	}

	/*
	 * Divert listeners normally expect whole packets, but reassembly
	 * is only possible in the non-tee case, so listeners on a tee rule
	 * may see fragments (the 'reass' ipfw option can be used before a
	 * 'tee' if that matters).
	 */
	iph = mtod(pkt, struct ip *);
	if (!tee && (ntohs(iph->ip_off) & (IP_MF | IP_OFFMASK)) != 0) {
		struct mbuf *whole;
		int hdrlen;

		SET_HOST_IPLEN(iph); /* ip_reass wants host order */
		whole = ip_reass(pkt);
		/* NULL means the fragment was consumed: not an error. */
		if (whole == NULL)
			return 0;
		/*
		 * Put the header back in network byte order and
		 * recompute the IP header checksum.
		 */
		iph = mtod(whole, struct ip *);
		hdrlen = iph->ip_hl << 2;
		SET_NET_IPLEN(iph);
		iph->ip_sum = 0;
		if (hdrlen != sizeof(struct ip))
			iph->ip_sum = in_cksum(whole, hdrlen);
		else
			iph->ip_sum = in_cksum_hdr(iph);
		pkt = whole;
	}

	/* Record the matching rule so the packet can be reinjected. */
	mtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
		    sizeof(struct ipfw_rule_ref), M_NOWAIT);
	if (mtag == NULL) {
		FREE_PKT(pkt);
		return 1;
	}
	*((struct ipfw_rule_ref *)(mtag+1)) = *rule;
	m_tag_prepend(pkt, mtag);

	/* Pass it on to the divert code. */
	ip_divert_ptr(pkt, incoming);
	return 0;
}

/*
 * Attach (onoff != 0) or detach (onoff == 0) ipfw_check_hook for the
 * protocol family 'pf', in both directions.
 * Returns ENOENT if the family has no pfil head, 0 otherwise; the result
 * of the add/remove operation itself is ignored.
 */
static int
ipfw_hook(int onoff, int pf)
{
	struct pfil_head *head;

	head = pfil_head_get(PFIL_TYPE_AF, pf);
	if (head == NULL)
		return ENOENT;

	if (onoff)
		(void) pfil_add_hook(ipfw_check_hook, NULL,
		    PFIL_IN | PFIL_OUT | PFIL_WAITOK, head);
	else
		(void) pfil_remove_hook(ipfw_check_hook, NULL,
		    PFIL_IN | PFIL_OUT | PFIL_WAITOK, head);

	return 0;
}

/*
 * Attach (arg != 0) or detach (arg == 0) the ipfw pfil hooks.
 * On attach a family is hooked only if its enable flag is set.
 * Returns 0, or ENOENT if hooking any family failed.
 */
int
ipfw_attach_hooks(int arg)
{
	int error = 0;

	if (arg == 0) {
		/* detach: always unhook every family */
		ipfw_hook(0, AF_INET);
#ifdef INET6
		ipfw_hook(0, AF_INET6);
#endif
		return error;
	}

	if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) {
		error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */
		printf("ipfw_hook() error\n");
	}
#ifdef INET6
	if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) {
		error = ENOENT;
		printf("ipfw6_hook() error\n");
	}
#endif
	return error;
}

int
ipfw_chg_hook(SYSCTL_HANDLER_ARGS)
{
	int enable;
	int oldenable;
	int error;
	int af;

	if (arg1 == &VNET_NAME(fw_enable)) {
		enable = V_fw_enable;
		af = AF_INET;
	}
#ifdef INET6
	else if (arg1 == &VNET_NAME(fw6_enable)) {
		enable = V_fw6_enable;
		af = AF_INET6;
	}
#endif
	else 
		return (EINVAL);

	oldenable = enable;

	error = sysctl_handle_int(oidp, &enable, 0, req);

	if (error)
		return (error);

	enable = (enable) ? 1 : 0;

	if (enable == oldenable)
		return (0);

	error = ipfw_hook(enable, af);
	if (error)
		return (error);
	if (af == AF_INET)
		V_fw_enable = enable;
#ifdef INET6
	else if (af == AF_INET6)
		V_fw6_enable = enable;
#endif

	return (0);
}
/* end of file */


================================================
FILE: sys/netinet/ipfw/ip_fw_private.h
================================================
/*-
 * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/netinet/ipfw/ip_fw_private.h 200601 2009-12-16 10:48:40Z luigi $
 */

#ifndef _IPFW2_PRIVATE_H
#define _IPFW2_PRIVATE_H

/*
 * Internal constants and data structures used by ipfw components
 * and not meant to be exported outside the kernel.
 */

#ifdef _KERNEL

/*
 * For platforms that do not have SYSCTL support, we wrap the
 * SYSCTL_* into a function (one per file) to collect the values
 * into an array at module initialization. The wrapping macros,
 * SYSBEGIN() and SYSEND, are empty in the default case.
 */
#ifndef SYSBEGIN
#define SYSBEGIN(x)
#endif
#ifndef SYSEND
#define SYSEND
#endif

/*
 * Return values from ipfw_chk().
 * The notes below describe how ipfw_check_hook() reacts to each value.
 */
enum {
	IP_FW_PASS = 0,		/* accept (forwarded if next_hop is set) */
	IP_FW_DENY,		/* drop the packet */
	IP_FW_DIVERT,		/* hand the packet to the divert code */
	IP_FW_TEE,		/* divert a copy, keep processing the original */
	IP_FW_DUMMYNET,		/* hand the packet to dummynet */
	IP_FW_NETGRAPH,		/* hand the packet to netgraph */
	IP_FW_NGTEE,		/* netgraph tee, keep processing the packet */
	IP_FW_NAT,		/* nat done; re-run the check unless one_pass */
	IP_FW_REASS,		/* re-run the check on the packet */
};

/*
 * Structure for collecting parameters to dummynet for ip6_output forwarding
 *
 * NOTE(review): the "_or" suffix presumably stands for "original", i.e.
 * the values saved from the ip6_output() call -- confirm against the
 * dummynet IPv6 code.
 */
struct _ip6dn_args {
       struct ip6_pktopts *opt_or;
       struct route_in6 ro_or;
       int flags_or;
       struct ip6_moptions *im6o_or;
       struct ifnet *origifp_or;
       struct ifnet *ifp_or;
       struct sockaddr_in6 dst_or;
       u_long mtu_or;
       struct route_in6 ro_pmtu_or;
};


/*
 * Arguments for calling ipfw_chk() and dummynet_io(). We put them
 * all into a structure because this way it is easier and more
 * efficient to pass variables around and extend the interface.
 *
 * ipfw_chk() may replace or consume the packet: callers must reload
 * 'm' after the call (it is NULL when the packet is gone).
 */
struct ip_fw_args {
	struct mbuf	*m;		/* the mbuf chain		*/
	struct ifnet	*oif;		/* output interface		*/
	struct sockaddr_in *next_hop;	/* forward address		*/

	/*
	 * On return, it points to the matching rule.
	 * On entry, rule.slot > 0 means the info is valid and
	 * contains the starting rule for an ipfw search.
	 * If chain_id == chain->id && slot >0 then jump to that slot.
	 * Otherwise, we locate the first rule >= rulenum:rule_id
	 */
	struct ipfw_rule_ref rule;	/* match/restart info		*/

	struct ether_header *eh;	/* for bridged packets		*/

	struct ipfw_flow_id f_id;	/* grabbed from IP header	*/
	//uint32_t	cookie;		/* a cookie depending on rule action */
	struct inpcb	*inp;

	struct _ip6dn_args	dummypar; /* dummynet->ip6_output */
	struct sockaddr_in hopstore;	/* store here if cannot use a pointer */
};

MALLOC_DECLARE(M_IPFW);

/*
 * Hooks sometimes need to know the direction of the packet
 * (divert, dummynet, netgraph, ...)
 * We use a generic definition here, with bits 0-1 indicating the
 * direction, bit 2 indicating layer 2 (set) or layer 3, and bits 3-4
 * indicating the specific protocol (if necessary).
 */
enum {
	DIR_MASK =	0x3,
	DIR_OUT =	0,
	DIR_IN =	1,
	DIR_FWD =	2,
	DIR_DROP =	3,
	PROTO_LAYER2 =	0x4, /* set for layer 2 */
	/* PROTO_DEFAULT = 0, */
	PROTO_IPV4 =	0x08,
	PROTO_IPV6 =	0x10,
	PROTO_IFB =	0x0c, /* layer2 + ifbridge */
   /*	PROTO_OLDBDG =	0x14, unused, old bridge */
};

/*
 * Wrapper for freeing a packet, in case we need to do more work.
 * A definition supplied earlier (e.g. by a platform glue header) wins;
 * otherwise Linux/Windows builds route the packet to netisr_dispatch()
 * with -1 as first argument and all other builds use m_freem().
 */
#ifndef FREE_PKT
#if defined(__linux__) || defined(_WIN32)
#define FREE_PKT(m)	netisr_dispatch(-1, m)
#else
#define FREE_PKT(m)	m_freem(m)
#endif
#endif /* !FREE_PKT */

/*
 * Function definitions.
 */

/* attach (arg = 1) or detach (arg = 0) hooks */
int ipfw_attach_hooks(int);
#ifdef NOTYET
void ipfw_nat_destroy(void);
#endif

/* In ip_fw_log.c */
struct ip;
void ipfw_log_bpf(int);
void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
	struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
	struct ip *ip);
VNET_DECLARE(u_int64_t, norule_counter);
#define	V_norule_counter	VNET(norule_counter)
VNET_DECLARE(int, verbose_limit);
#define	V_verbose_limit		VNET(verbose_limit)

/* In ip_fw_dynamic.c */

/*
 * NOTE(review): presumably these are the values stored through the
 * match_direction argument of ipfw_lookup_dyn_rule() -- confirm in
 * ip_fw_dynamic.c.
 */
enum { /* result for matching dynamic rules */
	MATCH_REVERSE = 0,
	MATCH_FORWARD,
	MATCH_NONE,
	MATCH_UNKNOWN,
};

/*
 * The lock for dynamic rules is only used once outside the file,
 * and only to release the result of lookup_dyn_rule().
 * Eventually we may implement it with a callback on the function.
 */
void ipfw_dyn_unlock(void);

struct tcphdr;
struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *,
    u_int32_t, u_int32_t, int);
int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
    struct ip_fw_args *args, uint32_t tablearg);
ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt,
	int *match_direction, struct tcphdr *tcp);
void ipfw_remove_dyn_children(struct ip_fw *rule);
void ipfw_get_dynamic(char **bp, const char *ep);

void ipfw_dyn_attach(void);	/* uma_zcreate .... */
void ipfw_dyn_detach(void);	/* uma_zdestroy ... */
void ipfw_dyn_init(void);	/* per-vnet initialization */
void ipfw_dyn_uninit(int);	/* per-vnet deinitialization */
int ipfw_dyn_len(void);

/* common variables */
VNET_DECLARE(int, fw_one_pass);
#define	V_fw_one_pass		VNET(fw_one_pass)

VNET_DECLARE(int, fw_verbose);
#define	V_fw_verbose		VNET(fw_verbose)

VNET_DECLARE(struct ip_fw_chain, layer3_chain);
#define	V_layer3_chain		VNET(layer3_chain)

VNET_DECLARE(u_int32_t, set_disable);
#define	V_set_disable		VNET(set_disable)

VNET_DECLARE(int, autoinc_step);
#define V_autoinc_step		VNET(autoinc_step)

/*
 * A complete ruleset: the static rules, the lookup tables, the nat
 * instances and the locks protecting them.
 * 'rwmtx' is the lock taken by the IPFW_RLOCK/IPFW_WLOCK macros and
 * 'uh_lock' the one taken by the IPFW_UH_* macros.
 */
struct ip_fw_chain {
	struct ip_fw	*rules;		/* list of rules */
	struct ip_fw	*reap;		/* list of rules to reap */
	struct ip_fw	*default_rule;
	int		n_rules;	/* number of static rules */
	int		static_len;	/* total len of static rules */
	struct ip_fw	**map;		/* array of rule ptrs to ease lookup */
	LIST_HEAD(nat_list, cfg_nat) nat;       /* list of nat entries */
	struct radix_node_head *tables[IPFW_TABLES_MAX];
#if defined( __linux__ ) || defined( _WIN32 )
	/* On these platforms both locks are plain spinlocks. */
	spinlock_t rwmtx;
	spinlock_t uh_lock;
#else
	struct rwlock	rwmtx;
	struct rwlock	uh_lock;	/* lock for upper half */
#endif
	uint32_t	id;		/* ruleset id */
};

struct sockopt;	/* used by tcp_var.h */

/*
 * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c
 * so the variable and the macros must be here.
 *
 * Two locks live in each chain: 'rwmtx' (IPFW_RLOCK/IPFW_WLOCK and
 * friends) and 'uh_lock' (IPFW_UH_*), the lock for the upper half.
 */

/* Initialize both locks of a chain. */
#define	IPFW_LOCK_INIT(_chain) do {			\
	rw_init(&(_chain)->rwmtx, "IPFW static rules");	\
	rw_init(&(_chain)->uh_lock, "IPFW UH lock");	\
	} while (0)

/* Destroy both locks of a chain. */
#define	IPFW_LOCK_DESTROY(_chain) do {			\
	rw_destroy(&(_chain)->rwmtx);			\
	rw_destroy(&(_chain)->uh_lock);			\
	} while (0)

/* Assert that the caller holds 'rwmtx' in write mode. */
#define	IPFW_WLOCK_ASSERT(_chain)	rw_assert(&(_chain)->rwmtx, RA_WLOCKED)

/* Read/write acquire and release of 'rwmtx'. */
#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx)
#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx)
#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx)
#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx)

/* Read/write acquire and release of 'uh_lock'. */
#define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock)
#define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock)
#define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock)
#define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock)

/* In ip_fw_sockopt.c */
int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id);
int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule);
int ipfw_ctl(struct sockopt *sopt);
int ipfw_chk(struct ip_fw_args *args);
void ipfw_reap_rules(struct ip_fw *head);

/* In ip_fw_pfil */
int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
     struct inpcb *inp);

/* In ip_fw_table.c */
struct radix_node;
int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
    uint32_t *val);
int ipfw_init_tables(struct ip_fw_chain *ch);
void ipfw_destroy_tables(struct ip_fw_chain *ch);
int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl);
int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
    uint8_t mlen, uint32_t value);
int ipfw_dump_table_entry(struct radix_node *rn, void *arg);
int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
    uint8_t mlen);
int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt);
int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl);

/* In ip_fw_nat.c -- XXX to be moved to ip_var.h */

extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);

typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *);
typedef int ipfw_nat_cfg_t(struct sockopt *);

extern ipfw_nat_t *ipfw_nat_ptr;
#define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL)

extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
extern ipfw_nat_cfg_t *ipfw_nat_del_ptr;
extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;

#endif /* _KERNEL */
#endif /* _IPFW2_PRIVATE_H */


================================================
FILE: sys/netinet/ipfw/ip_fw_sockopt.c
================================================
/*-
 * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
 *
 * Supported by: Valeria Paoli
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_sockopt.c 206339 2010-04-07 08:23:58Z luigi $");

/*
 * Sockopt support for ipfw. The routines here implement
 * the upper half of the ipfw code.
 */

#if !defined(KLD_MODULE)
#include "opt_ipfw.h"
#include "opt_ipdivert.h"
#include "opt_ipdn.h"
#include "opt_inet.h"
#ifndef INET
#error IPFIREWALL requires INET.
#endif /* INET */
#endif
#include "opt_inet6.h"
#include "opt_ipsec.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>	/* struct m_tag used by nested headers */
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/ip_var.h> /* hooks */
#include <netinet/ip_fw.h>
#include <netinet/ipfw/ip_fw_private.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif

MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");

/*
 * static variables followed by global ones (none in this file)
 */

/*
 * Binary search over chain->map for the first entry whose
 * (rulenum, id) pair is not smaller than (key, id).
 *
 * Returns the index of that entry. When every entry compares smaller
 * the search converges on the last slot (n_rules - 1), which is also
 * the initial upper bound.
 */
int
ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id)
{
	int first = 0;
	int last = chain->n_rules - 1;

	while (first < last) {
		int mid = (first + last) / 2;
		struct ip_fw *cur = chain->map[mid];
		/* entry sorts strictly before the requested (key, id) */
		int below = cur->rulenum < key ||
		    (cur->rulenum == key && cur->id < id);

		if (below)
			first = mid + 1;	/* answer lies to the right */
		else
			last = mid;		/* mid is still a candidate */
	}
	return last;
}

/*
 * Allocate a rule-pointer array sized for (chain->n_rules + extra)
 * entries; extra is negative when rules are about to be removed.
 *
 * locked != 0: the caller already holds IPFW_UH_WLOCK, so the
 *	allocation uses M_NOWAIT and the lock is left alone.
 * locked == 0: the allocation uses M_WAITOK and IPFW_UH_WLOCK is taken
 *	afterwards; if the size computed before locking turns out to be
 *	too small the lock is dropped and the allocation is retried.
 *
 * Returns the new array with IPFW_UH_WLOCK held, or NULL when the
 * allocation fails (in which case no lock is acquired here).
 */
static struct ip_fw **
get_map(struct ip_fw_chain *chain, int extra, int locked)
{
	const int mflags = locked ? M_NOWAIT : M_WAITOK;
	struct ip_fw **new_map;
	int slots;

	for (;;) {
		slots = chain->n_rules + extra;
		new_map = malloc(slots * sizeof(struct ip_fw *), M_IPFW,
			mflags);
		if (new_map == NULL) {
			printf("%s: cannot allocate map\n", __FUNCTION__);
			return NULL;
		}
		if (!locked)
			IPFW_UH_WLOCK(chain);
		if (slots >= chain->n_rules + extra)
			break;		/* still large enough, keep it */
		/* the ruleset grew while we were allocating: start over */
		if (!locked)
			IPFW_UH_WUNLOCK(chain);
		free(new_map, M_IPFW);
	}
	return new_map;
}

/*
 * Install new_map (holding new_len entries) as the active rule map and
 * bump the ruleset id, all under IPFW_WLOCK.
 * The caller is expected to hold IPFW_UH_WLOCK.
 * Returns the previous map so the caller can release it.
 */
static struct ip_fw **
swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
{
	struct ip_fw **prev;

	IPFW_WLOCK(chain);
	prev = chain->map;
	chain->map = new_map;
	chain->n_rules = new_len;
	chain->id++;
	IPFW_WUNLOCK(chain);

	return prev;
}

/*
 * Insert a copy of input_rule into the chain.
 *
 * The rule is duplicated into M_IPFW memory, the fields userland may
 * not set are cleared, and a new map containing the rule is swapped in.
 * A rule number of 0 asks for automatic numbering: the number written
 * back to input_rule->rulenum is derived from the rule preceding the
 * insertion point plus V_autoinc_step (clamped here to 1..1000).
 *
 * Returns 0 on success, EINVAL for an uninitialised chain or an
 * out-of-range rule number, ENOSPC when memory cannot be obtained.
 *
 * XXX DO NOT USE FOR THE DEFAULT RULE.
 * Must be called without IPFW_UH held.
 */
int
ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
{
	struct ip_fw **new_map, **old_map;
	struct ip_fw *rule;
	int pos, size, search_key;

	if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1)
		return (EINVAL);

	size = RULESIZE(input_rule);
	rule = malloc(size, M_IPFW, M_WAITOK | M_ZERO);
	if (rule == NULL)
		return (ENOSPC);

	/* on success get_map() hands back the chain with IPFW_UH_WLOCK held */
	new_map = get_map(chain, 1, 0 /* not locked */);
	if (new_map == NULL) {
		free(rule, M_IPFW);
		return ENOSPC;
	}

	bcopy(input_rule, rule, size);
	/* kernel-owned fields: never trust what userland supplied */
	rule->x_next = NULL;
	rule->next_rule = NULL;
	rule->pcnt = 0;
	rule->bcnt = 0;
	rule->timestamp = 0;

	/* keep the auto-increment step within 1..1000 */
	if (V_autoinc_step < 1)
		V_autoinc_step = 1;
	else if (V_autoinc_step > 1000)
		V_autoinc_step = 1000;

	/*
	 * The new rule goes in front of the first rule with a larger
	 * number; an unnumbered rule goes in front of the default rule.
	 */
	if (rule->rulenum != 0)
		search_key = rule->rulenum + 1;
	else
		search_key = IPFW_DEFAULT_RULE;
	pos = ipfw_find_rule(chain, search_key, 0);

	/* head of the old map, the new rule, then the tail */
	if (pos > 0)
		bcopy(chain->map, new_map, pos * sizeof(struct ip_fw *));
	new_map[pos] = rule;
	/* the tail is never empty: the default rule is always there */
	bcopy(chain->map + pos, new_map + pos + 1,
		sizeof(struct ip_fw *) *(chain->n_rules - pos));

	if (rule->rulenum == 0) {
		/* automatic numbering, reported back to the caller */
		if (pos > 0)
			rule->rulenum = new_map[pos - 1]->rulenum;
		else
			rule->rulenum = 0;
		if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step)
			rule->rulenum += V_autoinc_step;
		input_rule->rulenum = rule->rulenum;
	}

	rule->id = chain->id + 1;
	old_map = swap_map(chain, new_map, chain->n_rules + 1);
	chain->static_len += size;
	IPFW_UH_WUNLOCK(chain);

	if (old_map)
		free(old_map, M_IPFW);
	return (0);
}

/*
 * Free every rule on a list linked through x_next, such as the
 * chain->reap list built by del_entry().
 * Passing NULL is fine and does nothing.
 */
void
ipfw_reap_rules(struct ip_fw *head)
{
	struct ip_fw *next;

	for (; head != NULL; head = next) {
		/* fetch the link before the node is released */
		next = head->x_next;
		free(head, M_IPFW);
	}
}

/*
 * Used by del_entry() to check if a rule should be kept.
 * Returns 1 if the rule must be kept, 0 otherwise.
 *
 * Called with cmd = {0,1,5}.
 * cmd == 0 matches on rule numbers, excludes rules in RESVD_SET if n == 0 ;
 * cmd == 1 matches on set numbers only, rule numbers are ignored;
 * cmd == 5 matches on rule and set numbers.
 *
 * n == 0 is a wildcard for rule numbers, there is no wildcard for sets.
 *
 * Rules to keep are
 *	(default || reserved || !match_set || !match_number)
 * where
 *   default ::= (rule->rulenum == IPFW_DEFAULT_RULE)
 *	// the default rule is always protected
 *
 *   reserved ::= (cmd == 0 && n == 0 && rule->set == RESVD_SET)
 *	// RESVD_SET is protected only if cmd == 0 and n == 0 ("ipfw flush")
 *
 *   match_set ::= (cmd == 0 || rule->set == set)
 *	// set number is ignored for cmd == 0
 *
 *   match_number ::= (cmd == 1 || n == 0 || n == rule->rulenum)
 *	// number is ignored for cmd == 1 or n == 0
 *
 */
static int
keep_rule(struct ip_fw *rule, uint8_t cmd, uint8_t set, uint32_t n)
{
	int set_matches, number_matches;

	/* the default rule can never be deleted */
	if (rule->rulenum == IPFW_DEFAULT_RULE)
		return 1;
	/* a plain flush (cmd 0, no number) spares RESVD_SET */
	if (cmd == 0 && n == 0 && rule->set == RESVD_SET)
		return 1;

	set_matches = (cmd == 0 || rule->set == set);
	number_matches = (cmd == 1 || n == 0 || n == rule->rulenum);

	/* only a rule selected on both criteria may go */
	return !(set_matches && number_matches);
}

/**
 * Remove all rules with given number, or do set manipulation.
 * Assumes chain != NULL && *chain != NULL.
 *
 * The argument is an uint32_t. The low 16 bit are the rule or set number;
 * the next 8 bits are the new set; the top 8 bits indicate the command:
 *
 *	0	delete rules numbered "rulenum"
 *	1	delete rules in set "rulenum"
 *	2	move rules "rulenum" to set "new_set"
 *	3	move rules from set "rulenum" to set "new_set"
 *	4	swap sets "rulenum" and "new_set"
 *	5	delete rules "rulenum" and set "new_set"
 *
 * Takes and releases IPFW_UH_WLOCK itself. Deleted rules are collected
 * on chain->reap and freed after the lock has been dropped.
 *
 * Returns 0 on success. Returns EINVAL for a malformed argument, when a
 * non-flush delete matches nothing, or when the new map cannot be
 * allocated.
 */
static int
del_entry(struct ip_fw_chain *chain, uint32_t arg)
{
	struct ip_fw *rule;
	uint32_t num;	/* rule number or old_set */
	uint8_t cmd, new_set;
	int start, end, i, ofs, n;
	struct ip_fw **map = NULL;
	int error = 0;

	/* unpack the three fields encoded in arg */
	num = arg & 0xffff;
	cmd = (arg >> 24) & 0xff;
	new_set = (arg >> 16) & 0xff;

	if (cmd > 5 || new_set > RESVD_SET)
		return EINVAL;
	if (cmd == 0 || cmd == 2 || cmd == 5) {
		/* num is a rule number: the default rule is off limits */
		if (num >= IPFW_DEFAULT_RULE)
			return EINVAL;
	} else {
		if (num > RESVD_SET)	/* old_set */
			return EINVAL;
	}

	IPFW_UH_WLOCK(chain);	/* arbitrate writers */
	chain->reap = NULL;	/* prepare for deletions */

	switch (cmd) {
	case 0:	/* delete rules "num" (num == 0 matches all) */
	case 1:	/* delete all rules in set N */
	case 5: /* delete rules with number N and set "new_set". */

		/*
		 * Locate first rule to delete (start), the rule after
		 * the last one to delete (end), and count how many
		 * rules to delete (n). Always use keep_rule() to
		 * determine which rules to keep.
		 */
		n = 0;
		if (cmd == 1) {
			/* look for a specific set including RESVD_SET.
			 * Must scan the entire range, ignore num.
			 */
			new_set = num;
			for (start = -1, end = i = 0; i < chain->n_rules; i++) {
				if (keep_rule(chain->map[i], cmd, new_set, 0))
					continue;
				if (start < 0)
					start = i;
				end = i;
				n++;
			}
			end++;	/* first non-matching */
		} else {
			/* Optimized search on rule numbers */
			start = ipfw_find_rule(chain, num, 0);
			for (end = start; end < chain->n_rules; end++) {
				rule = chain->map[end];
				/* num == 0 is a wildcard: scan to the end */
				if (num > 0 && rule->rulenum != num)
					break;
				if (!keep_rule(rule, cmd, new_set, num))
					n++;
			}
		}

		if (n == 0) {
			/* A flush request (arg == 0) on empty ruleset
			 * returns with no error. On the contrary,
			 * if there is no match on a specific request,
			 * we return EINVAL.
			 */
			error = (arg == 0) ? 0 : EINVAL;
			break;
		}

		/* We have something to delete. Allocate the new map */
		map = get_map(chain, -n, 1 /* locked */);
		if (map == NULL) {
			error = EINVAL;
			break;
		}

		/* 1. bcopy the initial part of the map */
		if (start > 0)
			bcopy(chain->map, map, start * sizeof(struct ip_fw *));
		/* 2. copy active rules between start and end */
		for (i = ofs = start; i < end; i++) {
			rule = chain->map[i];
			if (keep_rule(rule, cmd, new_set, num))
				map[ofs++] = rule;
		}
		/* 3. copy the final part of the map */
		bcopy(chain->map + end, map + ofs,
			(chain->n_rules - end) * sizeof(struct ip_fw *));
		/* 4. swap the maps (swap_map() takes IPFW_WLOCK itself);
		 * from here on "map" points to the OLD map.
		 */
		map = swap_map(chain, map, chain->n_rules - n);
		/* 5. now remove the rules deleted from the old map */
		for (i = start; i < end; i++) {
			int l;
			rule = map[i];
			if (keep_rule(rule, cmd, new_set, num))
				continue;
			l = RULESIZE(rule);
			chain->static_len -= l;
			ipfw_remove_dyn_children(rule);
			/* push onto the reap list, freed after unlocking */
			rule->x_next = chain->reap;
			chain->reap = rule;
		}
		break;

	/*
	 * In the next 3 cases the loop stops at (n_rules - 1)
	 * because the default rule is never eligible..
	 */

	case 2:	/* move rules with given RULE number to new set */
		for (i = 0; i < chain->n_rules - 1; i++) {
			rule = chain->map[i];
			if (rule->rulenum == num)
				rule->set = new_set;
		}
		break;

	case 3: /* move rules with given SET number to new set */
		for (i = 0; i < chain->n_rules - 1; i++) {
			rule = chain->map[i];
			if (rule->set == num)
				rule->set = new_set;
		}
		break;

	case 4: /* swap two sets */
		for (i = 0; i < chain->n_rules - 1; i++) {
			rule = chain->map[i];
			if (rule->set == num)
				rule->set = new_set;
			else if (rule->set == new_set)
				rule->set = num;
		}
		break;
	}

	/* detach the reap list while locked, free it once unlocked */
	rule = chain->reap;
	chain->reap = NULL;
	IPFW_UH_WUNLOCK(chain);
	ipfw_reap_rules(rule);
	/* after a successful delete this is the old map returned by swap_map() */
	if (map)
		free(map, M_IPFW);
	return error;
}

/*
 * Reset the accounting of one rule.
 *
 * When log_only is 0 the packet counter, byte counter and timestamp
 * are zeroed. In every case, if the instruction found at ACTION_PTR()
 * is an O_LOG, its remaining-log budget is restored to max_log.
 *
 * Normally run under IPFW_UH_RLOCK; the operations are idempotent, so
 * the only requirement is that the rule does not disappear meanwhile.
 */
static void
clear_counters(struct ip_fw *rule, int log_only)
{
	ipfw_insn_log *logcmd;

	if (!log_only) {
		rule->pcnt = 0;
		rule->bcnt = 0;
		rule->timestamp = 0;
	}

	logcmd = (ipfw_insn_log *)ACTION_PTR(rule);
	if (logcmd->o.opcode == O_LOG)
		logcmd->log_left = logcmd->max_log;
}

/**
 * Reset counters (or only log budgets) on one rule number or on all rules.
 *
 * `arg' packs three fields: bits 0-15 the rule number (0 means every
 * rule), bits 16-23 a set number, bits 24-31 the command:
 *	0	rules are selected regardless of their set;
 *	1	only rules belonging to the given set are touched.
 * log_only is 1 to reset just the logging counts, 0 to clear everything.
 *
 * Returns 0 on success, EINVAL for a bad command/set or when no rule
 * carries the requested number.
 */
static int
zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
{
	const uint16_t rulenum = arg & 0xffff;
	const uint8_t set = (arg >> 16) & 0xff;
	const uint8_t cmd = (arg >> 24) & 0xff;
	struct ip_fw *r;
	char *msg;
	int idx, found;

	if (cmd > 1)
		return (EINVAL);
	if (cmd == 1 && set > RESVD_SET)
		return (EINVAL);

	IPFW_UH_RLOCK(chain);
	if (rulenum != 0) {
		/* one rule number; the map is sorted so we can stop early */
		found = 0;
		for (idx = 0; idx < chain->n_rules; idx++) {
			r = chain->map[idx];
			if (r->rulenum > rulenum)
				break;
			if (r->rulenum != rulenum)
				continue;
			if (cmd == 0 || r->set == set)
				clear_counters(r, log_only);
			/* the number exists, even if the set did not match */
			found = 1;
		}
		if (!found) {
			IPFW_UH_RUNLOCK(chain);
			return (EINVAL);
		}
		msg = log_only ? "logging count reset" : "cleared";
	} else {
		/* every rule, optionally restricted to one set */
		V_norule_counter = 0;
		for (idx = 0; idx < chain->n_rules; idx++) {
			r = chain->map[idx];
			if (cmd != 1 || r->set == set)
				clear_counters(r, log_only);
		}
		msg = log_only ? "All logging counts reset" :
		    "Accounting cleared";
	}
	IPFW_UH_RUNLOCK(chain);

	if (V_fw_verbose) {
		int lev = LOG_SECURITY | LOG_NOTICE;

		if (rulenum == 0)
			log(lev, "ipfw: %s.\n", msg);
		else
			log(lev, "ipfw: Entry %d %s.\n", rulenum, msg);
	}
	return (0);
}

/*
 * Check validity of the structure before insert.
 * Rules are simple, so this mostly need to check rule sizes.
 *
 * rule is the candidate rule, size the number of bytes the caller says
 * it occupies. The function verifies that size matches RULESIZE(rule),
 * that act_ofs lies inside the instruction area, and then walks every
 * instruction checking its length (and, for a few opcodes, its
 * argument). Exactly one action is required and it must be the last
 * instruction.
 *
 * Side effect: each O_LOG instruction gets log_left reset to max_log.
 *
 * Returns 0 if the rule is acceptable, EINVAL otherwise;
 * EPROTONOSUPPORT is returned for IPv6 opcodes when the code is built
 * without INET6.
 */
static int
check_ipfw_struct(struct ip_fw *rule, int size)
{
	int l, cmdlen = 0;
	int have_action=0;
	ipfw_insn *cmd;

	if (size < sizeof(*rule)) {
		printf("ipfw: rule too short\n");
		return (EINVAL);
	}
	/* first, check for valid size */
	l = RULESIZE(rule);
	if (l != size) {
		printf("ipfw: size mismatch (have %d want %d)\n", size, l);
		return (EINVAL);
	}
	if (rule->act_ofs >= rule->cmd_len) {
		printf("ipfw: bogus action offset (%u > %u)\n",
		    rule->act_ofs, rule->cmd_len - 1);
		return (EINVAL);
	}
	/*
	 * Now go for the individual checks. Very simple ones, basically only
	 * instruction sizes.
	 * l counts the 32-bit words still to be examined.
	 */
	for (l = rule->cmd_len, cmd = rule->cmd ;
			l > 0 ; l -= cmdlen, cmd += cmdlen) {
		cmdlen = F_LEN(cmd);
		if (cmdlen > l) {
			printf("ipfw: opcode %d size truncated\n",
			    cmd->opcode);
			return EINVAL;
		}
		switch (cmd->opcode) {
		/* opcodes that occupy exactly one plain ipfw_insn */
		case O_PROBE_STATE:
		case O_KEEP_STATE:
		case O_PROTO:
		case O_IP_SRC_ME:
		case O_IP_DST_ME:
		case O_LAYER2:
		case O_IN:
		case O_FRAG:
		case O_DIVERTED:
		case O_IPOPT:
		case O_IPTOS:
		case O_IPPRECEDENCE:
		case O_IPVER:
		case O_TCPWIN:
		case O_TCPFLAGS:
		case O_TCPOPTS:
		case O_ESTAB:
		case O_VERREVPATH:
		case O_VERSRCREACH:
		case O_ANTISPOOF:
		case O_IPSEC:
#ifdef INET6
		case O_IP6_SRC_ME:
		case O_IP6_DST_ME:
		case O_EXT_HDR:
		case O_IP6:
#endif
		case O_IP4:
		case O_TAG:
			if (cmdlen != F_INSN_SIZE(ipfw_insn))
				goto bad_size;
			break;

		case O_FIB:
			if (cmdlen != F_INSN_SIZE(ipfw_insn))
				goto bad_size;
			if (cmd->arg1 >= rt_numfibs) {
				printf("ipfw: invalid fib number %d\n",
					cmd->arg1);
				return EINVAL;
			}
			break;

		case O_SETFIB:
			if (cmdlen != F_INSN_SIZE(ipfw_insn))
				goto bad_size;
			if (cmd->arg1 >= rt_numfibs) {
				printf("ipfw: invalid fib number %d\n",
					cmd->arg1);
				return EINVAL;
			}
			goto check_action;

		/* opcodes that occupy exactly one ipfw_insn_u32 */
		case O_UID:
		case O_GID:
		case O_JAIL:
		case O_IP_SRC:
		case O_IP_DST:
		case O_TCPSEQ:
		case O_TCPACK:
		case O_PROB:
		case O_ICMPTYPE:
			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
				goto bad_size;
			break;

		case O_LIMIT:
			if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
				goto bad_size;
			break;

		case O_LOG:
			if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
				goto bad_size;

			/* start the rule with a full logging budget */
			((ipfw_insn_log *)cmd)->log_left =
			    ((ipfw_insn_log *)cmd)->max_log;

			break;

		case O_IP_SRC_MASK:
		case O_IP_DST_MASK:
			/* only odd command lengths */
			if ( !(cmdlen & 1) || cmdlen > 31)
				goto bad_size;
			break;

		case O_IP_SRC_SET:
		case O_IP_DST_SET:
			if (cmd->arg1 == 0 || cmd->arg1 > 256) {
				printf("ipfw: invalid set size %d\n",
					cmd->arg1);
				return EINVAL;
			}
			/* one u32 instruction plus one word per 32 set members */
			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
			    (cmd->arg1+31)/32 )
				goto bad_size;
			break;

		case O_IP_SRC_LOOKUP:
		case O_IP_DST_LOOKUP:
			if (cmd->arg1 >= IPFW_TABLES_MAX) {
				printf("ipfw: invalid table number %d\n",
				    cmd->arg1);
				return (EINVAL);
			}
			/* three different encodings are accepted */
			if (cmdlen != F_INSN_SIZE(ipfw_insn) &&
			    cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 &&
			    cmdlen != F_INSN_SIZE(ipfw_insn_u32))
				goto bad_size;
			break;

		case O_MACADDR2:
			if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
				goto bad_size;
			break;

		/* variable-length opcodes, 1..31 words */
		case O_NOP:
		case O_IPID:
		case O_IPTTL:
		case O_IPLEN:
		case O_TCPDATALEN:
		case O_TAGGED:
			if (cmdlen < 1 || cmdlen > 31)
				goto bad_size;
			break;

		case O_MAC_TYPE:
		case O_IP_SRCPORT:
		case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
			if (cmdlen < 2 || cmdlen > 31)
				goto bad_size;
			break;

		case O_RECV:
		case O_XMIT:
		case O_VIA:
			if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
				goto bad_size;
			break;

		case O_ALTQ:
			if (cmdlen != F_INSN_SIZE(ipfw_insn_altq))
				goto bad_size;
			break;

		/* from here on most opcodes are actions, see check_action */
		case O_PIPE:
		case O_QUEUE:
			if (cmdlen != F_INSN_SIZE(ipfw_insn))
				goto bad_size;
			goto check_action;

		case O_FORWARD_IP:
#ifdef	IPFIREWALL_FORWARD
			if (cmdlen != F_INSN_SIZE(ipfw_insn_sa))
				goto bad_size;
			goto check_action;
#else
			return EINVAL;
#endif

		case O_DIVERT:
		case O_TEE:
			/* rejected unless the divert hook is installed */
			if (ip_divert_ptr == NULL)
				return EINVAL;
			else
				goto check_size;
		case O_NETGRAPH:
		case O_NGTEE:
			/* rejected unless the netgraph hook is installed */
			if (ng_ipfw_input_p == NULL)
				return EINVAL;
			else
				goto check_size;
		case O_NAT:
			if (!IPFW_NAT_LOADED)
				return EINVAL;
			if (cmdlen != F_INSN_SIZE(ipfw_insn_nat))
 				goto bad_size;		
 			goto check_action;
		case O_FORWARD_MAC: /* XXX not implemented yet */
		case O_CHECK_STATE:
		case O_COUNT:
		case O_ACCEPT:
		case O_DENY:
		case O_REJECT:
#ifdef INET6
		case O_UNREACH6:
#endif
		case O_SKIPTO:
		case O_REASS:
check_size:
			if (cmdlen != F_INSN_SIZE(ipfw_insn))
				goto bad_size;
check_action:
			/* shared tail for all actions: only one, and it must be last */
			if (have_action) {
				printf("ipfw: opcode %d, multiple actions"
					" not allowed\n",
					cmd->opcode);
				return EINVAL;
			}
			have_action = 1;
			if (l != cmdlen) {
				printf("ipfw: opcode %d, action must be"
					" last opcode\n",
					cmd->opcode);
				return EINVAL;
			}
			break;
#ifdef INET6
		case O_IP6_SRC:
		case O_IP6_DST:
			if (cmdlen != F_INSN_SIZE(struct in6_addr) +
			    F_INSN_SIZE(ipfw_insn))
				goto bad_size;
			break;

		case O_FLOW6ID:
			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
			    ((ipfw_insn_u32 *)cmd)->o.arg1)
				goto bad_size;
			break;

		case O_IP6_SRC_MASK:
		case O_IP6_DST_MASK:
			if ( !(cmdlen & 1) || cmdlen > 127)
				goto bad_size;
			break;
		case O_ICMP6TYPE:
			if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) )
				goto bad_size;
			break;
#endif

		default:
			/*
			 * Without INET6 the IPv6 opcodes end up here and get
			 * a dedicated error; anything else is unknown.
			 */
			switch (cmd->opcode) {
#ifndef INET6
			case O_IP6_SRC_ME:
			case O_IP6_DST_ME:
			case O_EXT_HDR:
			case O_IP6:
			case O_UNREACH6:
			case O_IP6_SRC:
			case O_IP6_DST:
			case O_FLOW6ID:
			case O_IP6_SRC_MASK:
			case O_IP6_DST_MASK:
			case O_ICMP6TYPE:
				printf("ipfw: no IPv6 support in kernel\n");
				return EPROTONOSUPPORT;
#endif
			default:
				printf("ipfw: opcode %d, unknown opcode\n",
					cmd->opcode);
				return EINVAL;
			}
		}
	}
	if (have_action == 0) {
		printf("ipfw: missing action\n");
		return EINVAL;
	}
	return 0;

bad_size:
	/* common exit for any instruction whose length is not acceptable */
	printf("ipfw: opcode %d size %d wrong\n",
		cmd->opcode, cmdlen);
	return EINVAL;
}


/*
 * Translation of requests for compatibility with FreeBSD 7.2/8.
 * A static variable tells us if we have an old client from userland,
 * and if necessary we translate requests and responses between the
 * two formats.
 *
 * is7 is updated by the IP_FW_ADD handler in ipfw_ctl() (1 when the
 * request size equals RULESIZE7 of the supplied rule, 0 otherwise) and
 * is consulted by ipfw_getrules() when dumping the ruleset.
 */
static int is7 = 0;

/*
 * Rule header as laid out by the FreeBSD 7.2 userland. Compared with
 * the current struct ip_fw it has no 'id' field (see the commented-out
 * line below).
 */
struct ip_fw7 {
	struct ip_fw7	*next;		/* linked list of rules     */
	struct ip_fw7	*next_rule;	/* ptr to next [skipto] rule    */
	/* 'next_rule' is used to pass up 'set_disable' status      */

	uint16_t	act_ofs;	/* offset of action in 32-bit units */
	uint16_t	cmd_len;	/* # of 32-bit words in cmd */
	uint16_t	rulenum;	/* rule number          */
	uint8_t		set;		/* rule set (0..31)     */
	// #define RESVD_SET   31  /* set for default and persistent rules */
	uint8_t		_pad;		/* padding          */
	// uint32_t        id;             /* rule id, only in v.8 */
	/* These fields are present in all rules.           */
	uint64_t	pcnt;		/* Packet counter       */
	uint64_t	bcnt;		/* Byte counter         */
	uint32_t	timestamp;	/* tv_sec of last match     */

	ipfw_insn	cmd[1];		/* storage for commands     */
};

/* In-place converters between the two layouts; both return 0 on success. */
	int convert_rule_to_7(struct ip_fw *rule);
int convert_rule_to_8(struct ip_fw *rule);

/*
 * Size in bytes of a rule in 7.2 layout: the header plus cmd_len
 * 32-bit words, minus the one word already counted in cmd[1].
 */
#ifndef RULESIZE7
#define RULESIZE7(rule)  (sizeof(struct ip_fw7) + \
	((struct ip_fw7 *)(rule))->cmd_len * 4 - 4)
#endif


/*
 * Copy the static and dynamic rules to the supplied buffer
 * and return the amount of space actually used.
 * Must be run under IPFW_UH_RLOCK
 *
 * chain is the ruleset to dump, buf/space describe the destination.
 * When is7 is set every static rule is converted to the FreeBSD 7.2
 * layout; a conversion failure aborts the dump and returns 0.
 * In both layouts the set-disable mask is stored in the next_rule
 * field of each rule and non-zero timestamps are made absolute by
 * adding the boot time.
 */
static size_t
ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
{
	char *bp = buf;
	char *ep = bp + space;
	struct ip_fw *rule, *dst;
	int l, i;
	time_t	boot_seconds;

	boot_seconds = boottime.tv_sec;
	for (i = 0; i < chain->n_rules; i++) {
		rule = chain->map[i];

		if (is7) {
			struct ip_fw7 *dst7;
			int l8;

			/* Convert rule to FreeBSD 7.2 format */
			l = RULESIZE7(rule);
			/*
			 * convert_rule_to_7() works in place, so the
			 * complete current-format rule must be present
			 * in the buffer first. Copying only
			 * RULESIZE7 + 4 bytes drops the tail of the
			 * instructions whenever the two headers differ
			 * by more than 4 bytes.
			 */
			l8 = RULESIZE(rule);
			if (bp + l8 > ep)
				continue; /* no room, go to next rule */
			bcopy(rule, bp, l8);
			if (convert_rule_to_7((struct ip_fw *)bp))
				return 0; /*XXX correct? */
			dst7 = (struct ip_fw7 *)bp;
			/*
			 * XXX HACK. Store the disable mask in the "next"
			 * pointer in a wild attempt to keep the ABI the same.
			 * Why do we do this on EVERY rule?
			 */
			bcopy(&V_set_disable, &dst7->next_rule,
				sizeof(V_set_disable));
			if (dst7->timestamp)
				dst7->timestamp += boot_seconds;
			/* only the (shorter) converted rule is kept */
			bp += l;
			continue; /* go to next rule */
		}

		/* normal mode, don't touch rules */
		l = RULESIZE(rule);
		if (bp + l > ep) { /* should not happen */
			printf("overflow dumping static rules\n");
			break;
		}
		dst = (struct ip_fw *)bp;
		bcopy(rule, dst, l);
		/*
		 * XXX HACK. Store the disable mask in the "next"
		 * pointer in a wild attempt to keep the ABI the same.
		 * Why do we do this on EVERY rule?
		 */
		bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable));
		if (dst->timestamp)
			dst->timestamp += boot_seconds;
		bp += l;
	}
	ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */
	return (bp - (char *)buf);
}


/**
 * {set|get}sockopt parser.
 *
 * Dispatches on sopt->sopt_name to list, add, delete, flush or zero
 * rules, to manipulate lookup tables and to forward NAT requests to
 * the ipfw_nat hooks. The caller must pass priv_check() for
 * PRIV_NETINET_IPFW; IP_FW_ADD and every SOPT_SET request other than
 * IP_FW_RESETLOG are additionally subject to securelevel_ge(..., 3).
 *
 * Returns 0 on success or an errno value.
 */
int
ipfw_ctl(struct sockopt *sopt)
{
#define	RULE_MAXSIZE	(256*sizeof(u_int32_t))
	int error;
	size_t size;
	struct ip_fw *buf, *rule;
	struct ip_fw_chain *chain;
	u_int32_t rulenum[2];

	error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW);
	if (error)
		return (error);

	/*
	 * Disallow modifications in really-really secure mode, but still allow
	 * the logging counters to be reset.
	 */
	if (sopt->sopt_name == IP_FW_ADD ||
	    (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) {
		error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
		if (error)
			return (error);
	}

	chain = &V_layer3_chain;
	error = 0;

	switch (sopt->sopt_name) {
	case IP_FW_GET:
		/*
		 * pass up a copy of the current rules. Static rules
		 * come first (the last of which has number IPFW_DEFAULT_RULE),
		 * followed by a possibly empty list of dynamic rule.
		 * The last dynamic rule has NULL in the "next" field.
		 *
		 * Note that the calculated size is used to bound the
		 * amount of data returned to the user.  The rule set may
		 * change between calculating the size and returning the
		 * data in which case we'll just return what fits.
		 */
		for (;;) {
			int len = 0, want;

			size = chain->static_len;
			size += ipfw_dyn_len();
			if (size >= sopt->sopt_valsize)
				break;
			buf = malloc(size, M_TEMP, M_WAITOK);
			if (buf == NULL)
				break;
			IPFW_UH_RLOCK(chain);
			/* check again how much space we need */
			want = chain->static_len + ipfw_dyn_len();
			if (size >= want)
				len = ipfw_getrules(chain, buf, size);
			IPFW_UH_RUNLOCK(chain);
			if (size >= want)
				error = sooptcopyout(sopt, buf, len);
			free(buf, M_TEMP);
			/* otherwise the ruleset grew meanwhile: retry */
			if (size >= want)
				break;
		}
		break;

	case IP_FW_FLUSH:
		/* locking is done within del_entry() */
		error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */
		break;

	case IP_FW_ADD:
		rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK);
		error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
			sizeof(struct ip_fw7) );
		if (error) {
			/*
			 * The buffer content is undefined: do not look
			 * at it (not even to guess the client version).
			 */
			free(rule, M_TEMP);
			break;
		}

		/*
		 * If the size of commands equals RULESIZE7 then we assume
		 * a FreeBSD7.2 binary is talking to us (set is7=1).
		 * is7 is persistent so the next 'ipfw list' command
		 * will use this format.
		 * NOTE: If wrong version is guessed (this can happen if
		 *       the first ipfw command is 'ipfw [pipe] list')
		 *       the ipfw binary may crash or loop infinitely...
		 */
		if (sopt->sopt_valsize == RULESIZE7(rule)) {
			is7 = 1;
			/*
			 * The converted rule is larger than the 7.2 one
			 * by the difference between the two headers and
			 * is built in place: refuse rules that would not
			 * fit in the RULE_MAXSIZE buffer.
			 */
			if (sopt->sopt_valsize + sizeof(struct ip_fw) -
			    sizeof(struct ip_fw7) > RULE_MAXSIZE)
				error = EINVAL;
			else
				error = convert_rule_to_8(rule);
			if (error == 0)
				error = check_ipfw_struct(rule, RULESIZE(rule));
		} else {
			is7 = 0;
			error = check_ipfw_struct(rule, sopt->sopt_valsize);
		}
		if (error == 0) {
			/* locking is done within ipfw_add_rule() */
			error = ipfw_add_rule(chain, rule);
			size = RULESIZE(rule);
			if (!error && sopt->sopt_dir == SOPT_GET) {
				if (is7) {
					/* answer in the client's format */
					error = convert_rule_to_7(rule);
					size = RULESIZE7(rule);
				}
				if (error == 0)
					error = sooptcopyout(sopt, rule, size);
			}
		}
		/* single exit point, so the temporary copy is never leaked */
		free(rule, M_TEMP);
		break;

	case IP_FW_DEL:
		/*
		 * IP_FW_DEL is used for deleting single rules or sets,
		 * and (ab)used to atomically manipulate sets. Argument size
		 * is used to distinguish between the two:
		 *    sizeof(u_int32_t)
		 *	delete single rule or set of rules,
		 *	or reassign rules (or sets) to a different set.
		 *    2*sizeof(u_int32_t)
		 *	atomic disable/enable sets.
		 *	first u_int32_t contains sets to be disabled,
		 *	second u_int32_t contains sets to be enabled.
		 */
		error = sooptcopyin(sopt, rulenum,
			2*sizeof(u_int32_t), sizeof(u_int32_t));
		if (error)
			break;
		size = sopt->sopt_valsize;
		if (size == sizeof(u_int32_t) && rulenum[0] != 0) {
			/* delete or reassign, locking done in del_entry() */
			error = del_entry(chain, rulenum[0]);
		} else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */
			IPFW_UH_WLOCK(chain);
			V_set_disable =
			    (V_set_disable | rulenum[0]) & ~rulenum[1] &
			    ~(1<<RESVD_SET); /* set RESVD_SET always enabled */
			IPFW_UH_WUNLOCK(chain);
		} else
			error = EINVAL;
		break;

	case IP_FW_ZERO:
	case IP_FW_RESETLOG: /* argument is an u_int_32, the rule number */
		rulenum[0] = 0;
		if (sopt->sopt_val != 0) {
			error = sooptcopyin(sopt, rulenum,
			    sizeof(u_int32_t), sizeof(u_int32_t));
			if (error)
				break;
		}
		error = zero_entry(chain, rulenum[0],
			sopt->sopt_name == IP_FW_RESETLOG);
		break;

	/*--- TABLE manipulations are protected by the IPFW_LOCK ---*/
	case IP_FW_TABLE_ADD:
		{
			ipfw_table_entry ent;

			error = sooptcopyin(sopt, &ent,
			    sizeof(ent), sizeof(ent));
			if (error)
				break;
			error = ipfw_add_table_entry(chain, ent.tbl,
			    ent.addr, ent.masklen, ent.value);
		}
		break;

	case IP_FW_TABLE_DEL:
		{
			ipfw_table_entry ent;

			error = sooptcopyin(sopt, &ent,
			    sizeof(ent), sizeof(ent));
			if (error)
				break;
			error = ipfw_del_table_entry(chain, ent.tbl,
			    ent.addr, ent.masklen);
		}
		break;

	case IP_FW_TABLE_FLUSH:
		{
			u_int16_t tbl;

			error = sooptcopyin(sopt, &tbl,
			    sizeof(tbl), sizeof(tbl));
			if (error)
				break;
			IPFW_WLOCK(chain);
			error = ipfw_flush_table(chain, tbl);
			IPFW_WUNLOCK(chain);
		}
		break;

	case IP_FW_TABLE_GETSIZE:
		{
			u_int32_t tbl, cnt;

			if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl),
			    sizeof(tbl))))
				break;
			IPFW_RLOCK(chain);
			error = ipfw_count_table(chain, tbl, &cnt);
			IPFW_RUNLOCK(chain);
			if (error)
				break;
			error = sooptcopyout(sopt, &cnt, sizeof(cnt));
		}
		break;

	case IP_FW_TABLE_LIST:
		{
			ipfw_table *tbl;

			if (sopt->sopt_valsize < sizeof(*tbl)) {
				error = EINVAL;
				break;
			}
			size = sopt->sopt_valsize;
			tbl = malloc(size, M_TEMP, M_WAITOK);
			error = sooptcopyin(sopt, tbl, size, sizeof(*tbl));
			if (error) {
				free(tbl, M_TEMP);
				break;
			}
			/* number of entries that fit after the header */
			tbl->size = (size - sizeof(*tbl)) /
			    sizeof(ipfw_table_entry);
			IPFW_RLOCK(chain);
			error = ipfw_dump_table(chain, tbl);
			IPFW_RUNLOCK(chain);
			if (error) {
				free(tbl, M_TEMP);
				break;
			}
			error = sooptcopyout(sopt, tbl, size);
			free(tbl, M_TEMP);
		}
		break;

	/*--- NAT operations are protected by the IPFW_LOCK ---*/
	case IP_FW_NAT_CFG:
		if (IPFW_NAT_LOADED)
			error = ipfw_nat_cfg_ptr(sopt);
		else {
			printf("IP_FW_NAT_CFG: %s\n",
			    "ipfw_nat not present, please load it");
			error = EINVAL;
		}
		break;

	case IP_FW_NAT_DEL:
		if (IPFW_NAT_LOADED)
			error = ipfw_nat_del_ptr(sopt);
		else {
			printf("IP_FW_NAT_DEL: %s\n",
			    "ipfw_nat not present, please load it");
			error = EINVAL;
		}
		break;

	case IP_FW_NAT_GET_CONFIG:
		if (IPFW_NAT_LOADED)
			error = ipfw_nat_get_cfg_ptr(sopt);
		else {
			printf("IP_FW_NAT_GET_CFG: %s\n",
			    "ipfw_nat not present, please load it");
			error = EINVAL;
		}
		break;

	case IP_FW_NAT_GET_LOG:
		if (IPFW_NAT_LOADED)
			error = ipfw_nat_get_log_ptr(sopt);
		else {
			printf("IP_FW_NAT_GET_LOG: %s\n",
			    "ipfw_nat not present, please load it");
			error = EINVAL;
		}
		break;

	default:
		printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
		error = EINVAL;
	}

	return (error);
#undef RULE_MAXSIZE
}


#define	RULE_MAXSIZE	(256*sizeof(u_int32_t))

/* Functions to convert rules 7.2 <==> 8.0 */
int
convert_rule_to_7(struct ip_fw *rule)
{
	/* Used to modify original rule */
	struct ip_fw7 *rule7 = (struct ip_fw7 *)rule;
	/* copy of original rule, version 8 */
	struct ip_fw *tmp;

	/* Used to copy commands */
	ipfw_insn *ccmd, *dst;
	int ll = 0, ccmdlen = 0;

	tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
	if (tmp == NULL) {
		return 1; //XXX error
	}
	bcopy(rule, tmp, RULE_MAXSIZE);

	/* Copy fields */
	rule7->_pad = tmp->_pad;
	rule7->set = tmp->set;
	rule7->rulenum = tmp->rulenum;
	rule7->cmd_len = tmp->cmd_len;
	rule7->act_ofs = tmp->act_ofs;
	rule7->next_rule = (struct ip_fw7 *)tmp->next_rule;
	rule7->next = (struct ip_fw7 *)tmp->x_next;
	rule7->cmd_len = tmp->cmd_len;
	rule7->pcnt = tmp->pcnt;
	rule7->bcnt = tmp->bcnt;
	rule7->timestamp = tmp->timestamp;

	/* Copy commands */
	for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ;
			ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
		ccmdlen = F_LEN(ccmd);

		bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t));

		if (dst->opcode > O_NAT)
			/* O_REASS doesn't exists in 7.2 version, so
			 * decrement opcode if it is after O_REASS
			 */
			dst->opcode--;

		if (ccmdlen > ll) {
			printf("ipfw: opcode %d size truncated\n",
				ccmd->opcode);
			return EINVAL;
		}
	}
	free(tmp, M_TEMP);

	return 0;
}

int
convert_rule_to_8(struct ip_fw *rule)
{
	/* Used to modify original rule */
	struct ip_fw7 *rule7 = (struct ip_fw7 *) rule;

	/* Used to copy commands */
	ipfw_insn *ccmd, *dst;
	int ll = 0, ccmdlen = 0;

	/* Copy of original rule */
	struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
	if (tmp == NULL) {
		return 1; //XXX error
	}

	bcopy(rule7, tmp, RULE_MAXSIZE);

	for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ;
			ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
		ccmdlen = F_LEN(ccmd);
		
		bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t));

		if (dst->opcode > O_NAT)
			/* O_REASS doesn't exists in 7.2 version, so
			 * increment opcode if it is after O_REASS
			 */
			dst->opcode++;

		if (ccmdlen > ll) {
			printf("ipfw: opcode %d size truncated\n",
			    ccmd->opcode);
			return EINVAL;
		}
	}

	rule->_pad = tmp->_pad;
	rule->set = tmp->set;
	rule->rulenum = tmp->rulenum;
	rule->cmd_len = tmp->cmd_len;
	rule->act_ofs = tmp->act_ofs;
	rule->next_rule = (struct ip_fw *)tmp->next_rule;
	rule->x_next = (struct ip_fw *)tmp->next;
	rule->cmd_len = tmp->cmd_len;
	rule->id = 0; /* XXX see if is ok = 0 */
	rule->pcnt = tmp->pcnt;
	rule->bcnt = tmp->bcnt;
	rule->timestamp = tmp->timestamp;

	free (tmp, M_TEMP);
	return 0;
}

/* end of file */


================================================
FILE: sys/netinet/ipfw/ip_fw_table.c
================================================
/*-
 * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_table.c 200601 2009-12-16 10:48:40Z luigi $");

/*
 * Lookup table support for ipfw
 *
 * Lookup tables are implemented (at the moment) using the radix
 * tree used for routing tables. Tables store key-value entries, where
 * keys are network prefixes (addr/masklen), and values are integers.
 * As a degenerate case we can interpret keys as 32-bit integers
 * (with a /32 mask).
 *
 * The table is protected by the IPFW lock even for manipulation coming
 * from userland, because operations are typically fast.
 */

#if !defined(KLD_MODULE)
#include "opt_ipfw.h"
#include "opt_ipdivert.h"
#include "opt_ipdn.h"
#include "opt_inet.h"
#ifndef INET
#error IPFIREWALL requires INET.
#endif /* INET */
#endif
#include "opt_inet6.h"
#include "opt_ipsec.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/socket.h>
#include <net/if.h>	/* ip_fw.h requires IFNAMSIZ */
#include <net/radix.h>
#include <net/route.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/ip_var.h>	/* struct ipfw_rule_ref */
#include <netinet/ip_fw.h>
#include <sys/queue.h> /* LIST_HEAD */
#include <netinet/ipfw/ip_fw_private.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif

MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables");

/*
 * One entry of an ipfw lookup table, as linked into the radix tree.
 * dump_table_entry() casts the struct radix_node pointer it receives
 * from the tree walker directly to a struct table_entry pointer, so
 * rn is expected to remain the first member.
 */
struct table_entry {
	struct radix_node	rn[2];
	/* key and netmask; this file only fills in the length byte and sin_addr */
	struct sockaddr_in	addr, mask;
	/* value reported by ipfw_lookup_table() and dump_table_entry() */
	u_int32_t		value;
};

/*
 * The radix code expects addr and mask to be array of bytes,
 * with the first byte being the length of the array. rn_inithead
 * is called with the offset in bits of the lookup key within the
 * array. If we use a sockaddr_in as the underlying type,
 * sin_len is conveniently located at offset 0, sin_addr is at
 * offset 4 and normally aligned.
 * But for portability, let's avoid assumptions and make the code explicit.
 */
#define KEY_LEN(v)	*((uint8_t *)&(v))
#define KEY_OFS		(8*offsetof(struct sockaddr_in, sin_addr))

/*
 * Add the prefix addr/mlen with the given value to table tbl.
 *
 * addr is masked with the netmask derived from mlen before insertion.
 * The chain write lock is taken around the radix tree update.
 *
 * Returns 0 on success, EINVAL if tbl is out of range or mlen is larger
 * than 32, ENOMEM if the entry cannot be allocated, EEXIST if the radix
 * code refuses the insertion.
 */
int
ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
    uint8_t mlen, uint32_t value)
{
	struct radix_node_head *rnh;
	struct table_entry *ent;
	struct radix_node *rn;

	if (tbl >= IPFW_TABLES_MAX)
		return (EINVAL);
	/* a prefix longer than 32 bits would make the shift below undefined */
	if (mlen > 32)
		return (EINVAL);
	rnh = ch->tables[tbl];
	ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO);
	if (ent == NULL)
		return (ENOMEM);
	ent->value = value;
	KEY_LEN(ent->addr) = KEY_LEN(ent->mask) = 8;
	/* unsigned constant: for mlen == 1 the shift reaches bit 31 */
	ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1U << (32 - mlen)) - 1) : 0);
	ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr;
	IPFW_WLOCK(ch);
	rn = rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent);
	if (rn == NULL) {
		IPFW_WUNLOCK(ch);
		free(ent, M_IPFW_TBL);
		return (EEXIST);
	}
	IPFW_WUNLOCK(ch);
	return (0);
}

/*
 * Remove the prefix addr/mlen from table tbl.
 *
 * addr is masked with the netmask derived from mlen before the lookup.
 * The chain write lock is taken around the radix tree update and the
 * entry is freed after the lock has been dropped.
 *
 * Returns 0 on success, EINVAL if tbl is out of range or mlen is larger
 * than 32, ESRCH if the radix code does not return an entry.
 */
int
ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
    uint8_t mlen)
{
	struct radix_node_head *rnh;
	struct table_entry *ent;
	struct sockaddr_in sa, mask;

	if (tbl >= IPFW_TABLES_MAX)
		return (EINVAL);
	/* a prefix longer than 32 bits would make the shift below undefined */
	if (mlen > 32)
		return (EINVAL);
	rnh = ch->tables[tbl];
	KEY_LEN(sa) = KEY_LEN(mask) = 8;
	/* unsigned constant: for mlen == 1 the shift reaches bit 31 */
	mask.sin_addr.s_addr = htonl(mlen ? ~((1U << (32 - mlen)) - 1) : 0);
	sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
	IPFW_WLOCK(ch);
	ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh);
	if (ent == NULL) {
		IPFW_WUNLOCK(ch);
		return (ESRCH);
	}
	IPFW_WUNLOCK(ch);
	free(ent, M_IPFW_TBL);
	return (0);
}

/*
 * Tree-walk callback used by ipfw_flush_table(): unlink the node from
 * the radix head passed in arg and free the entry the radix code
 * returns. Always returns 0.
 */
static int
flush_table_entry(struct radix_node *rn, void *arg)
{
	struct radix_node_head * const head = arg;
	struct table_entry *victim;

	victim = (struct table_entry *)
	    head->rnh_deladdr(rn->rn_key, rn->rn_mask, head);
	if (victim == NULL)
		return (0);
	free(victim, M_IPFW_TBL);
	return (0);
}

/*
 * Remove and free every entry of table tbl.
 * The chain write lock must be held (asserted below).
 * Returns EINVAL for an out-of-range table index, 0 otherwise.
 */
int
ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl)
{
	struct radix_node_head *head;

	IPFW_WLOCK_ASSERT(ch);

	if (!(tbl < IPFW_TABLES_MAX))
		return (EINVAL);

	head = ch->tables[tbl];
	KASSERT(head != NULL, ("NULL IPFW table"));
	/* the head doubles as the callback argument */
	head->rnh_walktree(head, flush_table_entry, head);
	return (0);
}

/*
 * Flush every table of the chain and detach its radix head.
 * The chain write lock must be held (asserted below).
 */
void
ipfw_destroy_tables(struct ip_fw_chain *ch)
{
	uint16_t tbl;

	IPFW_WLOCK_ASSERT(ch);

	for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++) {
		ipfw_flush_table(ch, tbl);
		/*
		 * Hand rn_detachhead() the slot itself, not a local copy,
		 * so that any update it makes to the pointer lands in the
		 * chain and no stale head pointer is left behind.
		 * NOTE(review): FreeBSD's rn_detachhead() frees the head
		 * and clears *head; confirm against the local radix.c.
		 */
		rn_detachhead((void **)&ch->tables[tbl]);
	}
}

/*
 * Create the radix head of every table of the chain.
 *
 * Returns 0 on success. If a head cannot be created, the heads set up
 * so far are flushed and detached again and ENOMEM is returned.
 */
int
ipfw_init_tables(struct ip_fw_chain *ch)
{
	int i;
	uint16_t j;

	for (i = 0; i < IPFW_TABLES_MAX; i++) {
		if (!rn_inithead((void **)&ch->tables[i], KEY_OFS)) {
			/*
			 * Roll back: flushing alone leaves the heads
			 * attached, so detach them as well to give the
			 * memory back.
			 */
			for (j = 0; j < i; j++) {
				(void) ipfw_flush_table(ch, j);
				rn_detachhead((void **)&ch->tables[j]);
			}
			return (ENOMEM);
		}
	}
	return (0);
}

/*
 * Look addr up in table tbl.
 * Returns 1 and stores the entry's value in *val when the radix lookup
 * yields an entry; returns 0 otherwise (including an out-of-range tbl),
 * leaving *val untouched.
 */
int
ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
    uint32_t *val)
{
	struct sockaddr_in key;
	struct table_entry *hit;
	struct radix_node_head *head;

	if (tbl >= IPFW_TABLES_MAX)
		return (0);

	head = ch->tables[tbl];
	/* build the lookup key: length byte plus the address */
	KEY_LEN(key) = 8;
	key.sin_addr.s_addr = addr;

	hit = (struct table_entry *)(head->rnh_lookup(&key, NULL, head));
	if (hit == NULL)
		return (0);

	*val = hit->value;
	return (1);
}

/*
 * Tree-walk callback used by ipfw_count_table(): bump the counter
 * that arg points to. Always returns 0.
 */
static int
count_table_entry(struct radix_node *rn, void *arg)
{
	u_int32_t * const counter = arg;

	*counter += 1;
	return (0);
}

/*
 * Store in *cnt the number of nodes visited when walking table tbl.
 * Returns EINVAL for an out-of-range table index (leaving *cnt
 * untouched), 0 otherwise.
 */
int
ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
{
	struct radix_node_head *head;

	if (!(tbl < IPFW_TABLES_MAX))
		return (EINVAL);

	head = ch->tables[tbl];
	/* start from zero; the callback adds one per node */
	*cnt = 0;
	head->rnh_walktree(head, count_table_entry, cnt);
	return (0);
}

/*
 * Tree-walk callback used by ipfw_dump_table(): append the node to the
 * ipfw_table that arg points to.
 * Returns 1 when the output array is already full (cnt == size),
 * 0 after storing an entry.
 */
static int
dump_table_entry(struct radix_node *rn, void *arg)
{
	ipfw_table * const out = arg;
	struct table_entry * const node = (struct table_entry *)rn;
	ipfw_table_entry *slot;

	/* no room left in the caller's buffer */
	if (out->cnt == out->size)
		return (1);

	slot = &out->ent[out->cnt];
	slot->tbl = out->tbl;
	/* turn the netmask back into a prefix length */
	slot->masklen = in_nullhost(node->mask.sin_addr) ?
	    0 : 33 - ffs(ntohl(node->mask.sin_addr.s_addr));
	slot->addr = node->addr.sin_addr.s_addr;
	slot->value = node->value;
	out->cnt++;
	return (0);
}

/*
 * Fill tbl->ent[] with the entries of table number tbl->tbl and set
 * tbl->cnt to how many were stored. The capacity is taken from
 * tbl->size by the walk callback.
 * Returns EINVAL for an out-of-range table index, 0 otherwise.
 */
int
ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl)
{
	struct radix_node_head *head;

	if (!(tbl->tbl < IPFW_TABLES_MAX))
		return (EINVAL);

	head = ch->tables[tbl->tbl];
	tbl->cnt = 0;
	head->rnh_walktree(head, dump_table_entry, tbl);
	return (0);
}
/* end of file */


================================================
FILE: sys/netinet/tcp.h
================================================
/*-
 * Copyright (c) 1982, 1986, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp.h	8.1 (Berkeley) 6/10/93
 * $FreeBSD: src/sys/netinet/tcp.h,v 1.40.2.2 2008/07/31 06:10:25 kmacy Exp $
 */

#ifndef _NETINET_TCP_H_
#define _NETINET_TCP_H_

#include <sys/cdefs.h>

#define __BSD_VISIBLE 1

#if __BSD_VISIBLE

typedef	u_int32_t tcp_seq;

#define tcp6_seq	tcp_seq	/* for KAME src sync over BSD*'s */
#define tcp6hdr		tcphdr	/* for KAME src sync over BSD*'s */

/*
 * TCP header.
 * Per RFC 793, September, 1981.
 */
struct tcphdr {
	u_short	th_sport;		/* source port */
	u_short	th_dport;		/* destination port */
	tcp_seq	th_seq;			/* sequence number */
	tcp_seq	th_ack;			/* acknowledgement number */
#if BYTE_ORDER == LITTLE_ENDIAN
	u_char	th_x2:4,		/* (unused) */
		th_off:4;		/* data offset */
#endif
#if BYTE_ORDER == BIG_ENDIAN
	u_char	th_off:4,		/* data offset */
		th_x2:4;		/* (unused) */
#endif
	u_char	th_flags;
#define	TH_FIN	0x01
#define	TH_SYN	0x02
#define	TH_RST	0x04
#define	TH_PUSH	0x08
#define	TH_ACK	0x10
#define	TH_URG	0x20
#define	TH_ECE	0x40
#define	TH_CWR	0x80
#define	TH_FLAGS	(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|TH_ECE|TH_CWR)
#define	PRINT_TH_FLAGS	"\20\1FIN\2SYN\3RST\4PUSH\5ACK\6URG\7ECE\10CWR"

	u_short	th_win;			/* window */
	u_short	th_sum;			/* checksum */
	u_short	th_urp;			/* urgent pointer */
};

#define	TCPOPT_EOL		0
#define	   TCPOLEN_EOL			1
#define	TCPOPT_PAD		0		/* padding after EOL */
#define	   TCPOLEN_PAD			1
#define	TCPOPT_NOP		1
#define	   TCPOLEN_NOP			1
#define	TCPOPT_MAXSEG		2
#define    TCPOLEN_MAXSEG		4
#define TCPOPT_WINDOW		3
#define    TCPOLEN_WINDOW		3
#define TCPOPT_SACK_PERMITTED	4
#define    TCPOLEN_SACK_PERMITTED	2
#define TCPOPT_SACK		5
#define	   TCPOLEN_SACKHDR		2
#define    TCPOLEN_SACK			8	/* 2*sizeof(tcp_seq) */
#define TCPOPT_TIMESTAMP	8
#define    TCPOLEN_TIMESTAMP		10
#define    TCPOLEN_TSTAMP_APPA		(TCPOLEN_TIMESTAMP+2) /* appendix A */
#define	TCPOPT_SIGNATURE	19		/* Keyed MD5: RFC 2385 */
#define	   TCPOLEN_SIGNATURE		18

/* Miscellaneous constants */
#define	MAX_SACK_BLKS	6	/* Max # SACK blocks stored at receiver side */
#define	TCP_MAX_SACK	4	/* MAX # SACKs sent in any segment */


/*
 * Default maximum segment size for TCP.
 * With an IP MTU of 576, this is 536,
 * but 512 is probably more convenient.
 * This should be defined as MIN(512, IP_MSS - sizeof (struct tcpiphdr)).
 */
#define	TCP_MSS	512
/*
 * TCP_MINMSS is defined to be 216 which is fine for the smallest
 * link MTU (256 bytes, AX.25 packet radio) in the Internet.
 * However it is very unlikely to come across such low MTU interfaces
 * these days (anno dato 2003).
 * See tcp_subr.c tcp_minmss SYSCTL declaration for more comments.
 * Setting this to "0" disables the minmss check.
 */
#define	TCP_MINMSS 216

/*
 * Default maximum segment size for TCP6.
 * With an IP6 MSS of 1280, this is 1220,
 * but 1024 is probably more convenient. (xxx kazu in doubt)
 * This should be defined as MIN(1024, IP6_MSS - sizeof (struct tcpip6hdr))
 */
#define	TCP6_MSS	1024

#define	TCP_MAXWIN	65535	/* largest value for (unscaled) window */
#define	TTCP_CLIENT_SND_WND	4096	/* dflt send window for T/TCP client */

#define TCP_MAX_WINSHIFT	14	/* maximum window shift */

#define TCP_MAXBURST		4	/* maximum segments in a burst */

#define TCP_MAXHLEN	(0xf<<2)	/* max length of header in bytes */
#define TCP_MAXOLEN	(TCP_MAXHLEN - sizeof(struct tcphdr))
					/* max space left for options */
#endif /* __BSD_VISIBLE */

/*
 * User-settable options (used with setsockopt).
 */
#define	TCP_NODELAY	0x01	/* don't delay send to coalesce packets */
#if __BSD_VISIBLE
#define	TCP_MAXSEG	0x02	/* set maximum segment size */
#define TCP_NOPUSH	0x04	/* don't push last block of write */
#define TCP_NOOPT	0x08	/* don't use TCP options */
#define TCP_MD5SIG	0x10	/* use MD5 digests (RFC2385) */
#define	TCP_INFO	0x20	/* retrieve tcp_info structure */
#define	TCP_CONGESTION	0x40	/* get/set congestion control algorithm */

#define	TCP_CA_NAME_MAX	16	/* max congestion control name length */

#define	TCPI_OPT_TIMESTAMPS	0x01
#define	TCPI_OPT_SACK		0x02
#define	TCPI_OPT_WSCALE		0x04
#define	TCPI_OPT_ECN		0x08
#define	TCPI_OPT_TOE		0x10

/*
 * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits
 * the caller to query certain information about the state of a TCP
 * connection.  We provide an overlapping set of fields with the Linux
 * implementation, but since this is a fixed size structure, room has been
 * left for growth.  In order to maximize potential future compatibility with
 * the Linux API, the same variable names and order have been adopted, and
 * padding left to make room for omitted fields in case they are added later.
 *
 * XXX: This is currently an unstable ABI/API, in that it is expected to
 * change.
 */
struct tcp_info {
	u_int8_t	tcpi_state;		/* TCP FSM state. */
	u_int8_t	__tcpi_ca_state;
	u_int8_t	__tcpi_retransmits;
	u_int8_t	__tcpi_probes;
	u_int8_t	__tcpi_backoff;
	u_int8_t	tcpi_options;		/* Options enabled on conn. */
	u_int8_t	tcpi_snd_wscale:4,	/* RFC1323 send shift value. */
			tcpi_rcv_wscale:4;	/* RFC1323 recv shift value. */

	u_int32_t	__tcpi_rto;
	u_int32_t	__tcpi_ato;
	u_int32_t	__tcpi_snd_mss;
	u_int32_t	__tcpi_rcv_mss;

	u_int32_t	__tcpi_unacked;
	u_int32_t	__tcpi_sacked;
	u_int32_t	__tcpi_lost;
	u_int32_t	__tcpi_retrans;
	u_int32_t	__tcpi_fackets;

	/* Times; measurements in usecs. */
	u_int32_t	__tcpi_last_data_sent;
	u_int32_t	__tcpi_last_ack_sent;	/* Also unimpl. on Linux? */
	u_int32_t	__tcpi_last_data_recv;
	u_int32_t	__tcpi_last_ack_recv;

	/* Metrics; variable units. */
	u_int32_t	__tcpi_pmtu;
	u_int32_t	__tcpi_rcv_ssthresh;
	u_int32_t	tcpi_rtt;		/* Smoothed RTT in usecs. */
	u_int32_t	tcpi_rttvar;		/* RTT variance in usecs. */
	u_int32_t	tcpi_snd_ssthresh;	/* Slow start threshold. */
	u_int32_t	tcpi_snd_cwnd;		/* Send congestion window. */
	u_int32_t	__tcpi_advmss;
	u_int32_t	__tcpi_reordering;

	u_int32_t	__tcpi_rcv_rtt;
	u_int32_t	tcpi_rcv_space;		/* Advertised recv window. */

	/* FreeBSD extensions to tcp_info. */
	u_int32_t	tcpi_snd_wnd;		/* Advertised send window. */
	u_int32_t	tcpi_snd_bwnd;		/* Bandwidth send window. */
	u_int32_t	tcpi_snd_nxt;		/* Next egress seqno */
	u_int32_t	tcpi_rcv_nxt;		/* Next ingress seqno */
	u_int32_t	tcpi_toe_tid;		/* HWTID for TOE endpoints */
	
	/* Padding to grow without breaking ABI. */
	u_int32_t	__tcpi_pad[29];		/* Padding. */
};
#endif

#endif /* !_NETINET_TCP_H_ */


================================================
FILE: sys/netinet/tcp_var.h
================================================
#ifndef _NETINET_TCP_VAR_H_
#define _NETINET_TCP_VAR_H_
#include <netinet/tcp.h>
#endif /* !_NETINET_TCP_VAR_H_ */


================================================
FILE: sys/netinet/udp.h
================================================
/*-
 * Copyright (c) 1982, 1986, 1993
 *	The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)udp.h	8.1 (Berkeley) 6/10/93
 * $FreeBSD: src/sys/netinet/udp.h,v 1.10 2007/02/20 10:13:11 rwatson Exp $
 */

#ifndef _NETINET_UDP_H_
#define	_NETINET_UDP_H_

/*
 * UDP protocol header.
 * Per RFC 768, September, 1981.
 */
struct udphdr {
	u_short	uh_sport;		/* source port */
	u_short	uh_dport;		/* destination port */
	u_short	uh_ulen;		/* udp length */
	u_short	uh_sum;			/* udp checksum */
};

/* 
 * User-settable options (used with setsockopt).
 */
#define	UDP_ENCAP			0x01


/*
 * UDP Encapsulation of IPsec Packets options.
 */
/* Encapsulation types. */
#define	UDP_ENCAP_ESPINUDP_NON_IKE 	1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
#define	UDP_ENCAP_ESPINUDP		2 /* draft-ietf-ipsec-udp-encaps-02+ */

/* Default ESP in UDP encapsulation port. */
#define	UDP_ENCAP_ESPINUDP_PORT		500

/* Maximum UDP fragment size for ESP over UDP. */
#define	UDP_ENCAP_ESPINUDP_MAXFRAGLEN	552

#endif


================================================
FILE: sys/sys/cdefs.h
================================================
#ifndef _CDEFS_H_
#define _CDEFS_H_

/*
 * various compiler macros and common functions
 */

#ifndef __packed
#define __packed       __attribute__ ((__packed__))
#endif

#ifndef __aligned
#define __aligned(x) __attribute__((__aligned__(x)))
#endif

/* defined as assert */
void panic(const char *fmt, ...);

#define KASSERT(exp,msg) do {                                           \
        if (__predict_false(!(exp)))                                    \
                panic msg;                                              \
} while (0)

/* don't bother to optimize */
#ifndef __predict_false
#define __predict_false(x)   (x)	/* __builtin_expect((exp), 0) */
#endif

#endif /* !_CDEFS_H_ */


================================================
FILE: sys/sys/kernel.h
================================================
/*
 * from freebsd's kernel.h
 */
#ifndef _SYS_KERNEL_H_
#define _SYS_KERNEL_H_

/*
 * Minimal stand-ins for the FreeBSD SYSINIT family.
 * Each macro only declares a global void pointer, named sysinit_<d>
 * or sysuninit_<d>, initialized with d. The arguments a, b, c and e
 * are ignored, and nothing in these macros calls d.
 */
#define SYSINIT(a, b, c, d, e)  \
        void *sysinit_ ## d = d
#define VNET_SYSINIT(a, b, c, d, e)  \
        void *sysinit_ ## d = d
#define SYSUNINIT(a, b, c, d, e)  \
        void *sysuninit_ ## d = d
#define VNET_SYSUNINIT(a, b, c, d, e)  \
        void *sysuninit_ ## d = d

/*
 * Some enumerated orders; "ANY" sorts last.
 */
enum sysinit_elem_order {
        SI_ORDER_FIRST          = 0x0000000,    /* first*/
        SI_ORDER_SECOND         = 0x0000001,    /* second*/
        SI_ORDER_THIRD          = 0x0000002,    /* third*/
        SI_ORDER_MIDDLE         = 0x1000000,    /* somewhere in the middle */
        SI_ORDER_ANY            = 0xfffffff     /* last*/
};
#endif


================================================
FILE: sys/sys/malloc.h
================================================
#ifndef _SYS_MALLOC_H_
#define _SYS_MALLOC_H_

/*
 * No matter what, try to get clear memory and be non-blocking.
 * XXX check if 2.4 has a native way to zero memory,
 * XXX obey the flags (M_NOWAIT <-> GFP_ATOMIC, M_WAITOK <-> GFP_KERNEL)
 */
#ifndef _WIN32 /* this is the linux version */

/*
 * XXX On zeroshell (2.6.25.17) we get a load error
 *	__you_cannot_kmalloc_that_much
 * which is triggered when kmalloc() is called with a large
 * compile-time constant argument (include/linux/slab_def.h)
 *
 * I think it may be a compiler (or source) bug because there is no
 * evidence that such a large request is made.
 * Making the _size argument to kmalloc volatile prevents the compiler
 * from making the mistake, though it is clearly not ideal.
 */

#if !defined (LINUX_24) && LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22)
#define malloc(_size, type, flags)			\
	({ volatile int _v = _size; kmalloc(_v, GFP_ATOMIC | __GFP_ZERO); })
#else /* LINUX <= 2.6.22 and LINUX_24 */
/* linux 2.6.22 does not zero allocated memory */
#define malloc(_size, type, flags)			\
	({ int _s = _size;				\
	void *_ret = kmalloc(_s, GFP_ATOMIC);		\
	if (_ret) memset(_ret, 0, _s);			\
        (_ret);						\
        })
#endif /* LINUX <= 2.6.22 */

/* arguments are parenthesized so that calloc(a + b, c) multiplies the whole of a + b */
#define calloc(_n, _s) malloc(((_n) * (_s)), NULL, GFP_ATOMIC | __GFP_ZERO)
#define free(_var, type) kfree(_var)

#else /* _WIN32, the windows version */

/*
 * ntddk.h uses win_malloc() and MmFreeContiguousMemory().
 * wipfw uses
 * ExAllocatePoolWithTag(, pool, len, tag)
 * ExFreePoolWithTag(ptr, tag)
 */
#define malloc(_size, _type, _flags) my_alloc(_size)
#define calloc(_size, _type, _flags) my_alloc(_size)

void *my_alloc(int _size);
/* the 'tag' version does not work without -Gz in the linker */
#define free(_var, type) ExFreePool(_var)
//#define free(_var, type) ExFreePoolWithTag(_var, 'wfpi')

#endif /* _WIN32 */

#define M_NOWAIT        0x0001          /* do not block */
#define M_ZERO          0x0100          /* bzero the allocation */
#endif /* _SYS_MALLOC_H_ */


================================================
FILE: sys/sys/mbuf.h
================================================
/*
 * Copyright (C) 2009 Luigi Rizzo, Universita` di Pisa
 *
 * BSD copyright.
 *
 * A simple compatibility interface to map mbufs onto sk_buff
 */

#ifndef _SYS_MBUF_H_
#define	_SYS_MBUF_H_

#include <sys/malloc.h>		/* we use free() */
/* hopefully queue.h is already included by someone else */
#include <sys/queue.h>
#ifdef _KERNEL

/* bzero not present on linux, but this should go in glue.h */
// #define bzero(s, n) memset(s, 0, n)

/*
 * We implement a very simplified UMA allocator where the backend
 * is simply malloc, and uma_zone only stores the length of the components.
 */
typedef int uma_zone_t;		/* the zone size */

#define uma_zcreate(name, len, _3, _4, _5, _6, _7, _8)	(len)


#define uma_zfree(zone, item)	free(item, M_IPFW)
#define uma_zalloc(zone, flags) malloc(zone, M_IPFW, flags)
#define uma_zdestroy(zone)	do {} while (0)

/*-
 * Macros for type conversion:
 * mtod(m, t)	-- Convert mbuf pointer to data pointer of correct type.
 */
#define	mtod(m, t)	((t)((m)->m_data))

#endif /* _KERNEL */

/*
 * Packet tag structure (see below for details).
 */
struct m_tag {
	SLIST_ENTRY(m_tag)	m_tag_link;	/* List of packet tags */
	u_int16_t		m_tag_id;	/* Tag ID */
	u_int16_t		m_tag_len;	/* Length of data */
	u_int32_t		m_tag_cookie;	/* ABI/Module ID */
	void			(*m_tag_free)(struct m_tag *);
};

#if defined(__linux__) || defined( _WIN32 )

/*
 * Auxiliary structure to store values from the sk_buf.
 * Note that we should not alter the sk_buff, and if we do
 * so make sure to keep the values in sync between the mbuf
 * and the sk_buff (especially m_len and m_pkthdr.len).
 */

struct mbuf {
	struct mbuf *m_next;
	struct mbuf *m_nextpkt;
	char *m_data; // XXX was void *
	int m_len;	/* length in this mbuf */
	int m_flags;
#ifdef __linux__
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
	struct nf_info *queue_entry;
#else
	struct nf_queue_entry *queue_entry;
#endif
#else /* _WIN32 */
	int		direction;	/* could go in rcvif */
	NDIS_HANDLE	context;	/* replaces queue_entry or skb ?*/
	PNDIS_PACKET	pkt;
#endif
	struct sk_buff *m_skb;
	struct {
#ifdef __linux__
		struct net_device *rcvif;
#else
		struct ifnet *rcvif;
#endif
		int len;	/* total packet len */
		SLIST_HEAD (packet_tags, m_tag) tags;
	} m_pkthdr;
};

#define M_SKIP_FIREWALL	0x01		/* skip firewall processing */
#define M_BCAST         0x02 /* send/received as link-level broadcast */
#define M_MCAST         0x04 /* send/received as link-level multicast */

#define M_DONTWAIT      M_NOWAIT	/* should not be here... */


/*
 * m_dup() is used in the TEE case, currently unsupported so we
 * just return.
 */
/* Stub: duplication is not implemented, so this always returns NULL. */
static __inline struct mbuf *
m_dup(struct mbuf *m, int n)
{
	/* both arguments are intentionally unused */
	(void)n;
	(void)m;

	return NULL;
}

#define	MTAG_ABI_COMPAT		0		/* compatibility ABI */
/* Stub: no tag list is searched here, the result is always NULL. */
static __inline struct m_tag *
m_tag_find(struct mbuf *m, int type, struct m_tag *start)
{
	/* all arguments are intentionally unused */
	(void)start;
	(void)type;
	(void)m;

	return NULL;
}


/* Insert tag t at the head of the tag list of mbuf m. */
static __inline void
m_tag_prepend(struct mbuf *m, struct m_tag *t)
{
	SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link);
}

/*
 * Return the next tag in the list of tags associated with an mbuf.
 */
/* Only t is used: the result is the list successor of t; m is ignored. */
static __inline struct m_tag *
m_tag_next(struct mbuf *m, struct m_tag *t)
{
 
        return (SLIST_NEXT(t, m_tag_link));
}

/*
 * Create an mtag of the given type
 */
/*
 * Allocate a tag header followed by length bytes of payload, zero the
 * whole area and record cookie, type and length in the header.
 * The wait argument is not used; the allocation always passes M_NOWAIT.
 * Returns NULL if the allocation fails.
 */
static __inline struct m_tag *
m_tag_alloc(uint32_t cookie, int type, int length, int wait)
{
	int total = length + sizeof(struct m_tag);
	struct m_tag *tag;

	tag = malloc(total, 0, M_NOWAIT);
	if (!tag)
		return tag;

	memset(tag, 0, total);
	tag->m_tag_cookie = cookie;
	tag->m_tag_id = type;
	tag->m_tag_len = length;
	return tag;
}

/* Shorthand for m_tag_alloc() with the MTAG_ABI_COMPAT cookie. */
static __inline struct m_tag *
m_tag_get(int type, int length, int wait)
{
	return m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait);
}

/* Return the first tag on the tag list of mbuf m (SLIST_FIRST of the list). */
static __inline struct m_tag *
m_tag_first(struct mbuf *m)
{
	return SLIST_FIRST(&m->m_pkthdr.tags);
};

/*
 * Empty stub: the tag is neither unlinked nor freed here.
 * Tags still on the list are released by m_freem().
 * NOTE(review): callers may rely on the tag staying valid after this
 * call - verify before turning this into a real delete.
 */
static __inline void
m_tag_delete(struct mbuf *m, struct m_tag *t)
{
};

/*
 * Find a tag of mbuf m whose cookie is n and whose id is x.
 *
 * The search starts at the first tag when t is NULL, otherwise at the
 * tag following t, and walks the whole list rather than looking at the
 * head only.
 * Returns the first matching tag, or NULL if there is none.
 */
static __inline struct m_tag *
m_tag_locate(struct mbuf *m, u_int32_t n, int x, struct m_tag *t)
{
	struct m_tag *tag;

	/* resume after t when the caller supplies a starting point */
	tag = (t == NULL) ? m_tag_first(m) : SLIST_NEXT(t, m_tag_link);

	for (; tag != NULL; tag = SLIST_NEXT(tag, m_tag_link)) {
		if (tag->m_tag_cookie == n && tag->m_tag_id == x)
			return tag;
	}
	return NULL;
}

#define M_SETFIB(_m, _fib)	/* nothing on linux */

static __inline void
m_freem(struct mbuf *m)
{
	struct m_tag *t;

	/* free the m_tag chain */
	while ( (t = SLIST_FIRST(&m->m_pkthdr.tags) ) ) {
		SLIST_REMOVE_HEAD(&m->m_pkthdr.tags, m_tag_link);
		free(t, 0);
	}

	/* free the mbuf */
	free(m, M_IPFW);
};

/* m_pullup is not supported, there is a macro in missing.h */

#define M_GETFIB(_m)	0

/* macro used to create a new mbuf */
#define MT_DATA         1       /* dynamic (data) allocation */
#define MSIZE           256     /* size of an mbuf */
#define MGETHDR(_m, _how, _type)   ((_m) = m_gethdr((_how), (_type)))

/* allocate and init a new mbuf using the same structure of FreeBSD */
/*
 * Allocate an MSIZE-byte mbuf and point m_data just past the
 * struct mbuf header, so MSIZE - sizeof(struct mbuf) bytes follow it.
 * The how and type arguments are not used.
 * Returns NULL if the allocation fails.
 */
static __inline struct mbuf *
m_gethdr(int how, short type)
{
	struct mbuf *mb = malloc(MSIZE, M_IPFW, M_NOWAIT);

	if (mb != NULL) {
		/* data area starts right after the header */
		mb->m_data = (char *)(mb + 1);
	}
	return mb;
}

#endif /* __linux__ || _WIN32 */

/*
 * Persistent tags stay with an mbuf until the mbuf is reclaimed.  Otherwise
 * tags are expected to ``vanish'' when they pass through a network
 * interface.  For most interfaces this happens normally as the tags are
 * reclaimed when the mbuf is free'd.  However in some special cases
 * reclaiming must be done manually.  An example is packets that pass through
 * the loopback interface.  Also, one must be careful to do this when
 * ``turning around'' packets (e.g., icmp_reflect).
 *
 * To mark a tag persistent bit-or this flag in when defining the tag id.
 * The tag will then be treated as described above.
 */
#define	MTAG_PERSISTENT				0x800

#define	PACKET_TAG_NONE				0  /* Nadda */

/*
 * Packet tags for use with PACKET_ABI_COMPAT.
 * The ids are not contiguous: 16, 20, 22-24 and 26 are not defined here.
 */
#define	PACKET_TAG_IPSEC_IN_DONE		1  /* IPsec applied, in */
#define	PACKET_TAG_IPSEC_OUT_DONE		2  /* IPsec applied, out */
#define	PACKET_TAG_IPSEC_IN_CRYPTO_DONE		3  /* NIC IPsec crypto done */
#define	PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED	4  /* NIC IPsec crypto req'ed */
#define	PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO	5  /* NIC notifies IPsec */
#define	PACKET_TAG_IPSEC_PENDING_TDB		6  /* Reminder to do IPsec */
#define	PACKET_TAG_BRIDGE			7  /* Bridge processing done */
#define	PACKET_TAG_GIF				8  /* GIF processing done */
#define	PACKET_TAG_GRE				9  /* GRE processing done */
#define	PACKET_TAG_IN_PACKET_CHECKSUM		10 /* NIC checksumming done */
#define	PACKET_TAG_ENCAP			11 /* Encap.  processing */
#define	PACKET_TAG_IPSEC_SOCKET			12 /* IPSEC socket ref */
#define	PACKET_TAG_IPSEC_HISTORY		13 /* IPSEC history */
#define	PACKET_TAG_IPV6_INPUT			14 /* IPV6 input processing */
#define	PACKET_TAG_DUMMYNET			15 /* dummynet info */
#define	PACKET_TAG_DIVERT			17 /* divert info */
#define	PACKET_TAG_IPFORWARD			18 /* ipforward info */
#define	PACKET_TAG_MACLABEL	(19 | MTAG_PERSISTENT) /* MAC label */
#define	PACKET_TAG_PF				21 /* PF + ALTQ information */
#define	PACKET_TAG_RTSOCKFAM			25 /* rtsock sa family */
#define	PACKET_TAG_IPOPTIONS			27 /* Saved IP options */
#define	PACKET_TAG_CARP                         28 /* CARP info */

#endif /* !_SYS_MBUF_H_ */


================================================
FILE: sys/sys/module.h
================================================
/*
 * trivial module support
 */
#ifndef _SYS_MODULE_H_
#define _SYS_MODULE_H_
/* Opaque module handle; struct module is not defined in this header. */
typedef struct module *module_t;
/* Module event handler: receives the module, an event code and a pointer. */
typedef int (*modeventhand_t)(module_t, int /* modeventtype_t */, void *);
 
/* Event codes that can be passed as the second argument of the handler. */
typedef enum modeventtype {
        MOD_LOAD,
        MOD_UNLOAD,
        MOD_SHUTDOWN,
        MOD_QUIESCE
} modeventtype_t;
 
/* Descriptor of a module, referenced by DECLARE_MODULE(). */
typedef struct moduledata {
        const char      *name;          /* module name */
        modeventhand_t  evhand;         /* event handler */
        void            *priv;          /* extra data */
} moduledata_t;

/*
 * Hook the module descriptor, md, into our list of things to do.
 * We should in principle respect the order of loading.
 *
 * The macro expands to the definition of a global pointer named
 * moddesc_<a> initialized with the address of md; the arguments c and d
 * are ignored.  The expansion already ends with a semicolon.
 *
 * XXX use the gcc .init functions
 */
#define DECLARE_MODULE(a, md, c,d)				\
    moduledata_t *moddesc_##a = &md;

/*
 * XXX MODULE_VERSION is defined in linux too
 */
#define MODULE_DEPEND(a,b,c,d,e)
#if defined( __linux__ ) || defined( _WIN32 )
#undef MODULE_VERSION
#define MODULE_VERSION(a,b)
#endif

#endif	/* _SYS_MODULE_H_ */


================================================
FILE: sys/sys/param.h
================================================
#ifndef _SYS_PARAM_H_
#define _SYS_PARAM_H_

/*
 * number of additional groups
 */
#ifndef LINUX_24
#define NGROUPS		16
#endif

#endif /* _SYS_PARAM_H_ */


================================================
FILE: sys/sys/queue.h
================================================
/*-
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)queue.h	8.5 (Berkeley) 8/20/94
 * $FreeBSD: src/sys/sys/queue.h,v 1.68 2006/10/24 11:20:29 ru Exp $
 */

#ifndef _SYS_QUEUE_H_
#define	_SYS_QUEUE_H_

//#include <sys/cdefs.h>

/*
 * This file defines four types of data structures: singly-linked lists,
 * singly-linked tail queues, lists and tail queues.
 *
 * A singly-linked list is headed by a single forward pointer. The elements
 * are singly linked for minimum space and pointer manipulation overhead at
 * the expense of O(n) removal for arbitrary elements. New elements can be
 * added to the list after an existing element or at the head of the list.
 * Elements being removed from the head of the list should use the explicit
 * macro for this purpose for optimum efficiency. A singly-linked list may
 * only be traversed in the forward direction.  Singly-linked lists are ideal
 * for applications with large datasets and few or no removals or for
 * implementing a LIFO queue.
 *
 * A singly-linked tail queue is headed by a pair of pointers, one to the
 * head of the list and the other to the tail of the list. The elements are
 * singly linked for minimum space and pointer manipulation overhead at the
 * expense of O(n) removal for arbitrary elements. New elements can be added
 * to the list after an existing element, at the head of the list, or at the
 * end of the list. Elements being removed from the head of the tail queue
 * should use the explicit macro for this purpose for optimum efficiency.
 * A singly-linked tail queue may only be traversed in the forward direction.
 * Singly-linked tail queues are ideal for applications with large datasets
 * and few or no removals or for implementing a FIFO queue.
 *
 * A list is headed by a single forward pointer (or an array of forward
 * pointers for a hash table header). The elements are doubly linked
 * so that an arbitrary element can be removed without a need to
 * traverse the list. New elements can be added to the list before
 * or after an existing element or at the head of the list. A list
 * may only be traversed in the forward direction.
 *
 * A tail queue is headed by a pair of pointers, one to the head of the
 * list and the other to the tail of the list. The elements are doubly
 * linked so that an arbitrary element can be removed without a need to
 * traverse the list. New elements can be added to the list before or
 * after an existing element, at the head of the list, or at the end of
 * the list. A tail queue may be traversed in either direction.
 *
 * For details on the use of these macros, see the queue(3) manual page.
 *
 *
 *				SLIST	LIST	STAILQ	TAILQ
 * _HEAD			+	+	+	+
 * _HEAD_INITIALIZER		+	+	+	+
 * _ENTRY			+	+	+	+
 * _INIT			+	+	+	+
 * _EMPTY			+	+	+	+
 * _FIRST			+	+	+	+
 * _NEXT			+	+	+	+
 * _PREV			-	-	-	+
 * _LAST			-	-	+	+
 * _FOREACH			+	+	+	+
 * _FOREACH_SAFE		+	+	+	+
 * _FOREACH_REVERSE		-	-	-	+
 * _FOREACH_REVERSE_SAFE	-	-	-	+
 * _INSERT_HEAD			+	+	+	+
 * _INSERT_BEFORE		-	+	-	+
 * _INSERT_AFTER		+	+	+	+
 * _INSERT_TAIL			-	-	+	+
 * _CONCAT			-	-	+	+
 * _REMOVE_HEAD			+	-	+	-
 * _REMOVE			+	+	+	+
 *
 */
#ifdef QUEUE_MACRO_DEBUG
/* Store the last 2 places the queue element or head was altered */
struct qm_trace {
	char * lastfile;
	int lastline;
	char * prevfile;
	int prevline;
};

#define	TRACEBUF	struct qm_trace trace;
#define	TRASHIT(x)	do {(x) = (void *)-1;} while (0)

#define	QMD_TRACE_HEAD(head) do {					\
	(head)->trace.prevline = (head)->trace.lastline;		\
	(head)->trace.prevfile = (head)->trace.lastfile;		\
	(head)->trace.lastline = __LINE__;				\
	(head)->trace.lastfile = __FILE__;				\
} while (0)

#define	QMD_TRACE_ELEM(elem) do {					\
	(elem)->trace.prevline = (elem)->trace.lastline;		\
	(elem)->trace.prevfile = (elem)->trace.lastfile;		\
	(elem)->trace.lastline = __LINE__;				\
	(elem)->trace.lastfile = __FILE__;				\
} while (0)

#else
#define	QMD_TRACE_ELEM(elem)
#define	QMD_TRACE_HEAD(head)
#define	TRACEBUF
#define	TRASHIT(x)
#endif	/* QUEUE_MACRO_DEBUG */

/*
 * Singly-linked List declarations.
 */
#define	SLIST_HEAD(name, type)						\
struct name {								\
	struct type *slh_first;	/* first element */			\
}

#define	SLIST_HEAD_INITIALIZER(head)					\
	{ NULL }

#if defined( _WIN32 ) && defined(SLIST_ENTRY)
#undef SLIST_ENTRY
#endif
#define	SLIST_ENTRY(type)						\
struct {								\
	struct type *sle_next;	/* next element */			\
}

/*
 * Singly-linked List functions.
 */
#define	SLIST_EMPTY(head)	((head)->slh_first == NULL)

#define	SLIST_FIRST(head)	((head)->slh_first)

#define	SLIST_FOREACH(var, head, field)					\
	for ((var) = SLIST_FIRST((head));				\
	    (var);							\
	    (var) = SLIST_NEXT((var), field))

#define	SLIST_FOREACH_SAFE(var, head, field, tvar)			\
	for ((var) = SLIST_FIRST((head));				\
	    (var) && ((tvar) = SLIST_NEXT((var), field), 1);		\
	    (var) = (tvar))

#define	SLIST_FOREACH_PREVPTR(var, varp, head, field)			\
	for ((varp) = &SLIST_FIRST((head));				\
	    ((var) = *(varp)) != NULL;					\
	    (varp) = &SLIST_NEXT((var), field))

#define	SLIST_INIT(head) do {						\
	SLIST_FIRST((head)) = NULL;					\
} while (0)

#define	SLIST_INSERT_AFTER(slistelm, elm, field) do {			\
	SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field);	\
	SLIST_NEXT((slistelm), field) = (elm);				\
} while (0)

#define	SLIST_INSERT_HEAD(head, elm, field) do {			\
	SLIST_NEXT((elm), field) = SLIST_FIRST((head));			\
	SLIST_FIRST((head)) = (elm);					\
} while (0)

#define	SLIST_NEXT(elm, field)	((elm)->field.sle_next)

/*
 * Unlink elm from the list.  If elm is the first element the head is
 * simply advanced; otherwise the list is walked from the head until the
 * predecessor of elm is found.  The walk has no NULL check, so elm must
 * be on the list.
 */
#define	SLIST_REMOVE(head, elm, type, field) do {			\
	if (SLIST_FIRST((head)) == (elm)) {				\
		SLIST_REMOVE_HEAD((head), field);			\
	}								\
	else {								\
		struct type *curelm = SLIST_FIRST((head));		\
		while (SLIST_NEXT(curelm, field) != (elm))		\
			curelm = SLIST_NEXT(curelm, field);		\
		SLIST_NEXT(curelm, field) =				\
		    SLIST_NEXT(SLIST_NEXT(curelm, field), field);	\
	}								\
	TRASHIT((elm)->field.sle_next);					\
} while (0)

#define	SLIST_REMOVE_HEAD(head, field) do {				\
	SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field);	\
} while (0)

/*
 * Singly-linked Tail queue declarations.
 */
#define	STAILQ_HEAD(name, type)						\
struct name {								\
	struct type *stqh_first;/* first element */			\
	struct type **stqh_last;/* addr of last next element */		\
}

#define	STAILQ_HEAD_INITIALIZER(head)					\
	{ NULL, &(head).stqh_first }

#define	STAILQ_ENTRY(type)						\
struct {								\
	struct type *stqe_next;	/* next element */			\
}

/*
 * Singly-linked Tail queue functions.
 */
#define	STAILQ_CONCAT(head1, head2) do {				\
	if (!STAILQ_EMPTY((head2))) {					\
		*(head1)->stqh_last = (head2)->stqh_first;		\
		(head1)->stqh_last = (head2)->stqh_last;		\
		STAILQ_INIT((head2));					\
	}								\
} while (0)

#define	STAILQ_EMPTY(head)	((head)->stqh_first == NULL)

#define	STAILQ_FIRST(head)	((head)->stqh_first)

#define	STAILQ_FOREACH(var, head, field)				\
	for((var) = STAILQ_FIRST((head));				\
	   (var);							\
	   (var) = STAILQ_NEXT((var), field))


#define	STAILQ_FOREACH_SAFE(var, head, field, tvar)			\
	for ((var) = STAILQ_FIRST((head));				\
	    (var) && ((tvar) = STAILQ_NEXT((var), field), 1);		\
	    (var) = (tvar))

#define	STAILQ_INIT(head) do {						\
	STAILQ_FIRST((head)) = NULL;					\
	(head)->stqh_last = &STAILQ_FIRST((head));			\
} while (0)

#define	STAILQ_INSERT_AFTER(head, tqelm, elm, field) do {		\
	if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\
		(head)->stqh_last = &STAILQ_NEXT((elm), field);		\
	STAILQ_NEXT((tqelm), field) = (elm);				\
} while (0)

#define	STAILQ_INSERT_HEAD(head, elm, field) do {			\
	if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL)	\
		(head)->stqh_last = &STAILQ_NEXT((elm), field);		\
	STAILQ_FIRST((head)) = (elm);					\
} while (0)

#define	STAILQ_INSERT_TAIL(head, elm, field) do {			\
	STAILQ_NEXT((elm), field) = NULL;				\
	*(head)->stqh_last = (elm);					\
	(head)->stqh_last = &STAILQ_NEXT((elm), field);			\
} while (0)

/*
 * Return the last element of the tail queue, or NULL if it is empty.
 * stqh_last holds the address of the stqe_next member of the last
 * element, so the element itself is found by subtracting the offset
 * of field within struct type.
 * NOTE(review): this relies on __offsetof, which is not defined in this
 * header (the <sys/cdefs.h> include is commented out) -- confirm that
 * users of this macro get it from somewhere else.
 */
#define	STAILQ_LAST(head, type, field)					\
	(STAILQ_EMPTY((head)) ?						\
		NULL :							\
	        ((struct type *)(void *)				\
		((char *)((head)->stqh_last) - __offsetof(struct type, field))))

#define	STAILQ_NEXT(elm, field)	((elm)->field.stqe_next)

#define	STAILQ_REMOVE(head, elm, type, field) do {			\
	if (STAILQ_FIRST((head)) == (elm)) {				\
		STAILQ_REMOVE_HEAD((head), field);			\
	}								\
	else {								\
		struct type *curelm = STAILQ_FIRST((head));		\
		while (STAILQ_NEXT(curelm, field) != (elm))		\
			curelm = STAILQ_NEXT(curelm, field);		\
		if ((STAILQ_NEXT(curelm, field) =			\
		     STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\
			(head)->stqh_last = &STAILQ_NEXT((curelm), field);\
	}								\
	TRASHIT((elm)->field.stqe_next);				\
} while (0)

#define	STAILQ_REMOVE_HEAD(head, field) do {				\
	if ((STAILQ_FIRST((head)) =					\
	     STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL)		\
		(head)->stqh_last = &STAILQ_FIRST((head));		\
} while (0)

#ifndef LIST_HEAD
/*
 * List declarations.
 */
#define	LIST_HEAD(name, type)						\
struct name {								\
	struct type *lh_first;	/* first element */			\
}

#define	LIST_HEAD_INITIALIZER(head)					\
	{ NULL }

#define	LIST_ENTRY(type)						\
struct {								\
	struct type *le_next;	/* next element */			\
	struct type **le_prev;	/* address of previous next element */	\
}

/*
 * List functions.
 */

#if (defined(_KERNEL) && defined(INVARIANTS))
#define	QMD_LIST_CHECK_HEAD(head, field) do {				\
	if (LIST_FIRST((head)) != NULL &&				\
	    LIST_FIRST((head))->field.le_prev !=			\
	     &LIST_FIRST((head)))					\
		panic("Bad list head %p first->prev != head", (head));	\
} while (0)

#define	QMD_LIST_CHECK_NEXT(elm, field) do {				\
	if (LIST_NEXT((elm), field) != NULL &&				\
	    LIST_NEXT((elm), field)->field.le_prev !=			\
	     &((elm)->field.le_next))					\
	     	panic("Bad link elm %p next->prev != elm", (elm));	\
} while (0)

#define	QMD_LIST_CHECK_PREV(elm, field) do {				\
	if (*(elm)->field.le_prev != (elm))				\
		panic("Bad link elm %p prev->next != elm", (elm));	\
} while (0)
#else
#define	QMD_LIST_CHECK_HEAD(head, field)
#define	QMD_LIST_CHECK_NEXT(elm, field)
#define	QMD_LIST_CHECK_PREV(elm, field)
#endif /* (_KERNEL && INVARIANTS) */

#define	LIST_EMPTY(head)	((head)->lh_first == NULL)

#define	LIST_FIRST(head)	((head)->lh_first)

#define	LIST_FOREACH(var, head, field)					\
	for ((var) = LIST_FIRST((head));				\
	    (var);							\
	    (var) = LIST_NEXT((var), field))

#define	LIST_FOREACH_SAFE(var, head, field, tvar)			\
	for ((var) = LIST_FIRST((head));				\
	    (var) && ((tvar) = LIST_NEXT((var), field), 1);		\
	    (var) = (tvar))

#define	LIST_INIT(head) do {						\
	LIST_FIRST((head)) = NULL;					\
} while (0)

#define	LIST_INSERT_AFTER(listelm, elm, field) do {			\
	QMD_LIST_CHECK_NEXT(listelm, field);				\
	if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\
		LIST_NEXT((listelm), field)->field.le_prev =		\
		    &LIST_NEXT((elm), field);				\
	LIST_NEXT((listelm), field) = (elm);				\
	(elm)->field.le_prev = &LIST_NEXT((listelm), field);		\
} while (0)

#define	LIST_INSERT_BEFORE(listelm, elm, field) do {			\
	QMD_LIST_CHECK_PREV(listelm, field);				\
	(elm)->field.le_prev = (listelm)->field.le_prev;		\
	LIST_NEXT((elm), field) = (listelm);				\
	*(listelm)->field.le_prev = (elm);				\
	(listelm)->field.le_prev = &LIST_NEXT((elm), field);		\
} while (0)

#define	LIST_INSERT_HEAD(head, elm, field) do {				\
	QMD_LIST_CHECK_HEAD((head), field);				\
	if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL)	\
		LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\
	LIST_FIRST((head)) = (elm);					\
	(elm)->field.le_prev = &LIST_FIRST((head));			\
} while (0)

#define	LIST_NEXT(elm, field)	((elm)->field.le_next)

#define	LIST_REMOVE(elm, field) do {					\
	QMD_LIST_CHECK_NEXT(elm, field);				\
	QMD_LIST_CHECK_PREV(elm, field);				\
	if (LIST_NEXT((elm), field) != NULL)				\
		LIST_NEXT((elm), field)->field.le_prev = 		\
		    (elm)->field.le_prev;				\
	*(elm)->field.le_prev = LIST_NEXT((elm), field);		\
	TRASHIT((elm)->field.le_next);					\
	TRASHIT((elm)->field.le_prev);					\
} while (0)
#endif /* LIST_HEAD */

/*
 * Tail queue declarations.
 */
#define	TAILQ_HEAD(name, type)						\
struct name {								\
	struct type *tqh_first;	/* first element */			\
	struct type **tqh_last;	/* addr of last next element */		\
	TRACEBUF							\
}

#define	TAILQ_HEAD_INITIALIZER(head)					\
	{ NULL, &(head).tqh_first }

#define	TAILQ_ENTRY(type)						\
struct {								\
	struct type *tqe_next;	/* next element */			\
	struct type **tqe_prev;	/* address of previous next element */	\
	TRACEBUF							\
}

/*
 * Tail queue functions.
 */
#if (defined(_KERNEL) && defined(INVARIANTS))
#define	QMD_TAILQ_CHECK_HEAD(head, field) do {				\
	if (!TAILQ_EMPTY(head) &&					\
	    TAILQ_FIRST((head))->field.tqe_prev !=			\
	     &TAILQ_FIRST((head)))					\
		panic("Bad tailq head %p first->prev != head", (head));	\
} while (0)

#define	QMD_TAILQ_CHECK_TAIL(head, field) do {				\
	if (*(head)->tqh_last != NULL)					\
	    	panic("Bad tailq NEXT(%p->tqh_last) != NULL", (head)); 	\
} while (0)

#define	QMD_TAILQ_CHECK_NEXT(elm, field) do {				\
	if (TAILQ_NEXT((elm), field) != NULL &&				\
	    TAILQ_NEXT((elm), field)->field.tqe_prev !=			\
	     &((elm)->field.tqe_next))					\
		panic("Bad link elm %p next->prev != elm", (elm));	\
} while (0)

#define	QMD_TAILQ_CHECK_PREV(elm, field) do {				\
	if (*(elm)->field.tqe_prev != (elm))				\
		panic("Bad link elm %p prev->next != elm", (elm));	\
} while (0)
#else
#define	QMD_TAILQ_CHECK_HEAD(head, field)
#define	QMD_TAILQ_CHECK_TAIL(head, headname)
#define	QMD_TAILQ_CHECK_NEXT(elm, field)
#define	QMD_TAILQ_CHECK_PREV(elm, field)
#endif /* (_KERNEL && INVARIANTS) */

#define	TAILQ_CONCAT(head1, head2, field) do {				\
	if (!TAILQ_EMPTY(head2)) {					\
		*(head1)->tqh_last = (head2)->tqh_first;		\
		(head2)->tqh_first->field.tqe_prev = (head1)->tqh_last;	\
		(head1)->tqh_last = (head2)->tqh_last;			\
		TAILQ_INIT((head2));					\
		QMD_TRACE_HEAD(head1);					\
		QMD_TRACE_HEAD(head2);					\
	}								\
} while (0)

#define	TAILQ_EMPTY(head)	((head)->tqh_first == NULL)

#define	TAILQ_FIRST(head)	((head)->tqh_first)

#define	TAILQ_FOREACH(var, head, field)					\
	for ((var) = TAILQ_FIRST((head));				\
	    (var);							\
	    (var) = TAILQ_NEXT((var), field))

#define	TAILQ_FOREACH_SAFE(var, head, field, tvar)			\
	for ((var) = TAILQ_FIRST((head));				\
	    (var) && ((tvar) = TAILQ_NEXT((var), field), 1);		\
	    (var) = (tvar))

#define	TAILQ_FOREACH_REVERSE(var, head, headname, field)		\
	for ((var) = TAILQ_LAST((head), headname);			\
	    (var);							\
	    (var) = TAILQ_PREV((var), headname, field))

#define	TAILQ_FOREACH_REVERSE_SAFE(var, head, headname, field, tvar)	\
	for ((var) = TAILQ_LAST((head), headname);			\
	    (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1);	\
	    (var) = (tvar))

#define	TAILQ_INIT(head) do {						\
	TAILQ_FIRST((head)) = NULL;					\
	(head)->tqh_last = &TAILQ_FIRST((head));			\
	QMD_TRACE_HEAD(head);						\
} while (0)

/*
 * Insert elm right after listelm in the tail queue head.
 * When listelm was the last element, the tail pointer of head is
 * moved to elm.  All macro arguments are parenthesized before use,
 * including the ones that are only used when QUEUE_MACRO_DEBUG is on.
 */
#define	TAILQ_INSERT_AFTER(head, listelm, elm, field) do {		\
	QMD_TAILQ_CHECK_NEXT(listelm, field);				\
	if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\
		TAILQ_NEXT((elm), field)->field.tqe_prev = 		\
		    &TAILQ_NEXT((elm), field);				\
	else {								\
		(head)->tqh_last = &TAILQ_NEXT((elm), field);		\
		QMD_TRACE_HEAD(head);					\
	}								\
	TAILQ_NEXT((listelm), field) = (elm);				\
	(elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field);		\
	QMD_TRACE_ELEM(&(elm)->field);					\
	QMD_TRACE_ELEM(&(listelm)->field);				\
} while (0)

/*
 * Insert elm right before listelm.  The head is not needed because
 * tqe_prev of listelm already points at the pointer that must be
 * redirected to elm.  All macro arguments are parenthesized before use,
 * including the ones that are only used when QUEUE_MACRO_DEBUG is on.
 */
#define	TAILQ_INSERT_BEFORE(listelm, elm, field) do {			\
	QMD_TAILQ_CHECK_PREV(listelm, field);				\
	(elm)->field.tqe_prev = (listelm)->field.tqe_prev;		\
	TAILQ_NEXT((elm), field) = (listelm);				\
	*(listelm)->field.tqe_prev = (elm);				\
	(listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field);		\
	QMD_TRACE_ELEM(&(elm)->field);					\
	QMD_TRACE_ELEM(&(listelm)->field);				\
} while (0)

#define	TAILQ_INSERT_HEAD(head, elm, field) do {			\
	QMD_TAILQ_CHECK_HEAD(head, field);				\
	if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL)	\
		TAILQ_FIRST((head))->field.tqe_prev =			\
		    &TAILQ_NEXT((elm), field);				\
	else								\
		(head)->tqh_last = &TAILQ_NEXT((elm), field);		\
	TAILQ_FIRST((head)) = (elm);					\
	(elm)->field.tqe_prev = &TAILQ_FIRST((head));			\
	QMD_TRACE_HEAD(head);						\
	QMD_TRACE_ELEM(&(elm)->field);					\
} while (0)

#define	TAILQ_INSERT_TAIL(head, elm, field) do {			\
	QMD_TAILQ_CHECK_TAIL(head, field);				\
	TAILQ_NEXT((elm), field) = NULL;				\
	(elm)->field.tqe_prev = (head)->tqh_last;			\
	*(head)->tqh_last = (elm);					\
	(head)->tqh_last = &TAILQ_NEXT((elm), field);			\
	QMD_TRACE_HEAD(head);						\
	QMD_TRACE_ELEM(&(elm)->field);					\
} while (0)

/*
 * Return the last element of the tail queue (NULL if it is empty).
 * tqh_last holds the address of the last tqe_next pointer (or of
 * tqh_first for an empty queue).  That address is read as if it were a
 * struct headname, which starts with the same two pointers as a
 * TAILQ_ENTRY, so its tqh_last member is the previous-link pointer,
 * and dereferencing it yields the last element.
 */
#define	TAILQ_LAST(head, headname)					\
	(*(((struct headname *)((head)->tqh_last))->tqh_last))

#define	TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)

/*
 * Return the element that precedes elm (NULL if elm is the first one).
 * tqe_prev holds the address of the pointer that points to elm; that
 * address is read as if it were a struct headname, which starts with the
 * same two pointers as a TAILQ_ENTRY, so following its tqh_last member
 * and dereferencing gives the previous element.
 */
#define	TAILQ_PREV(elm, headname, field)				\
	(*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))

#define	TAILQ_REMOVE(head, elm, field) do {				\
	QMD_TAILQ_CHECK_NEXT(elm, field);				\
	QMD_TAILQ_CHECK_PREV(elm, field);				\
	if ((TAILQ_NEXT((elm), field)) != NULL)				\
		TAILQ_NEXT((elm), field)->field.tqe_prev = 		\
		    (elm)->field.tqe_prev;				\
	else {								\
		(head)->tqh_last = (elm)->field.tqe_prev;		\
		QMD_TRACE_HEAD(head);					\
	}								\
	*(elm)->field.tqe_prev = TAILQ_NEXT((elm), field);		\
	TRASHIT((elm)->field.tqe_next);					\
	TRASHIT((elm)->field.tqe_prev);					\
	QMD_TRACE_ELEM(&(elm)->field);					\
} while (0)


#ifdef _KERNEL

/*
 * XXX insque() and remque() are an old way of handling certain queues.
 * They bogusly assume that all queue heads look alike.
 */

struct quehead {
	struct quehead *qh_link;
	struct quehead *qh_rlink;
};

#ifdef __CC_SUPPORTS___INLINE

/*
 * Link the entry a into a queue right after the entry b.
 * Both pointers are treated as struct quehead; the forward link of b
 * is dereferenced without a NULL check.
 */
static __inline void
insque(void *a, void *b)
{
	struct quehead *elem = a;
	struct quehead *prev = b;

	elem->qh_rlink = prev;
	elem->qh_link = prev->qh_link;
	prev->qh_link = elem;
	/* the old successor of b now points back to the new entry */
	elem->qh_link->qh_rlink = elem;
}

/*
 * Unlink the entry a from its queue: its two neighbours are made to
 * point to each other and the backward link of a is cleared.  The
 * forward link of a is left untouched.
 */
static __inline void
remque(void *a)
{
	struct quehead *elem = a;
	struct quehead *next = elem->qh_link;
	struct quehead *prev = elem->qh_rlink;

	next->qh_rlink = prev;
	prev->qh_link = next;
	elem->qh_rlink = 0;
}

#else /* !__CC_SUPPORTS___INLINE */

void	insque(void *a, void *b);
void	remque(void *a);

#endif /* __CC_SUPPORTS___INLINE */

#endif /* _KERNEL */

#endif /* !_SYS_QUEUE_H_ */


================================================
FILE: sys/sys/syslog.h
================================================
#ifndef _SYS_SYSLOG_H_
#define _SYS_SYSLOG_H_
/* XXX find linux equivalent */
#define LOG_SECURITY 0
#define LOG_NOTICE 0
#define LOG_DEBUG 0
#endif /* _SYS_SYSLOG_H_ */


================================================
FILE: sys/sys/systm.h
================================================
#ifndef _SYS_SYSTM_H_
#define _SYS_SYSTM_H_

#define CALLOUT_ACTIVE          0x0002 /* callout is currently active */
#define CALLOUT_MPSAFE          0x0008 /* callout handler is mp safe */

#ifndef _WIN32	/* this is the linux version */
/* callout support, in <sys/callout.h> on FreeBSD */
/*
 * callout support on linux module is done using timers
 */
#include <linux/timer.h>
#ifdef LINUX_24
#include <linux/sched.h>        /* jiffies definition is here in 2.4 */
#endif
#define callout timer_list
/*
 * (Re)arm the timer co so that fn(arg) is run 'ticks' jiffies from now.
 *
 * mod_timer() is used instead of add_timer() so that the call is also
 * valid when the timer is still pending: in that case the timer is
 * simply moved to the new expiration, which is what callers of
 * callout_reset() expect.  add_timer() must not be used on a pending timer.
 *
 * The cpu argument is ignored.  Always returns 0.
 */
static __inline int
callout_reset_on(struct callout *co, int ticks, void (*fn)(void *), void *arg, int cpu)
{
        co->function = (void (*)(unsigned long))fn;
        co->data = (unsigned long)arg;
	/*
	 * Linux 2.6.31 and above has add_timer_on(co, cpu), but it is
	 * not used here: no CPU affinity is requested for the timer.
	 */
        mod_timer(co, jiffies + ticks);
        return 0;
}

/*
 * callout_init():  prepare the timer; the 'safe' argument is ignored.
 * callout_drain(): remove the timer and wait until a handler that is
 *                  running on another CPU has returned, so that the
 *                  caller can safely release what the handler uses.
 *                  Must not be called from interrupt context.
 * callout_stop():  remove the timer without waiting for the handler.
 */
#define callout_init(co, safe)  init_timer(co)
#define callout_drain(co)       del_timer_sync(co)
#define callout_stop(co)        del_timer(co)

#else /* _WIN32 */
#include <ndis.h>

/* This is the windows part for callout support */
struct callout {
	KTIMER thetimer;	/* kernel timer, armed with KeSetTimer() */
	KDPC timerdpc;		/* DPC passed to KeSetTimer() with the timer */
	int dpcinitialized;	/* set to 1 once the dummynet DPC is set up */
	LARGE_INTEGER duetime;	/* due time used for the last KeSetTimer() */
};

void dummynet (void*);
VOID dummynet_dpc(
    __in struct _KDPC  *Dpc,
    __in_opt PVOID  DeferredContext,
    __in_opt PVOID  SystemArgument1,
    __in_opt PVOID  SystemArgument2
    );

VOID ipfw_dpc(
    __in struct _KDPC  *Dpc,
    __in_opt PVOID  DeferredContext,
    __in_opt PVOID  SystemArgument1,
    __in_opt PVOID  SystemArgument2
    );

/* callout_reset must handle two problems:
 * - dummynet() scheduler must be run always on the same processor
 * because do_gettimeofday() is based on cpu performance counter, and
 * _occasionally_ can leap backward in time if we query another cpu.
 * typically this won't happen that much, and the cpu will almost always
 * be the same even without the affinity restriction, but better to be sure.
 * - ipfw_tick() does not have the granularity requirements of dummynet()
 * but we need to pass a pointer as argument.
 *
 * for these reasons, if we are called for dummynet() timer,
 * KeInitializeDpc is called only once as it should be, and the thread
 * is forced on cpu0 (which is always present), while if we're called
 * for ipfw_tick(), we re-initialize the DPC each time, using
 * parameter DeferredContext to pass the needed pointer. since this
 * timer is called only once a sec, this won't hurt that much.
 */
static __inline int
callout_reset_on(struct callout *co, int ticks, void (*fn)(void *), void *arg, int cpu) 
{
	if(fn == &dummynet)
	{
		if(co->dpcinitialized == 0)
		{
			KeInitializeDpc(&co->timerdpc, dummynet_dpc, NULL);
			KeSetTargetProcessorDpc(&co->timerdpc, cpu);
			co->dpcinitialized = 1;
		}
	}
	else
	{
		KeInitializeDpc(&co->timerdpc, ipfw_dpc, arg);
	}
	co->duetime.QuadPart = (-ticks)*10000;
	KeSetTimer(&co->thetimer, co->duetime, &co->timerdpc);
	return 0;
}

/*
 * Initialize the timer embedded in co.  The 'safe' argument is ignored.
 * dpcinitialized is cleared here so that the first callout_reset_on()
 * for dummynet() sets up the DPC even when co does not live in
 * zero-filled memory.
 */
static __inline void
callout_init(struct callout* co, int safe)
{
	printf("%s: initializing timer at %p\n",__FUNCTION__,co);
	KeInitializeTimer(&co->thetimer);
	co->dpcinitialized = 0;
}

/*
 * Cancel the timer of co, calling KeCancelTimer() again and again
 * until it reports TRUE.  Always returns 0.
 * NOTE(review): KeCancelTimer() reports FALSE when the timer is not
 * queued, so this loop presumably busy-waits until the callout is armed
 * again and would never end for a callout that is not re-armed --
 * confirm that it is only used on self-rearming callouts.
 */
static __inline int
callout_drain(struct callout* co)
{
	BOOLEAN canceled = KeCancelTimer(&co->thetimer);
	/* retry until the timer was actually found in the queue */
	while (canceled != TRUE)
	{
		canceled = KeCancelTimer(&co->thetimer);
	}
	printf("%s: stopping timer at %p\n",__FUNCTION__,co);
	return 0;
}

/*
 * Stopping a callout is implemented as a full drain; the value
 * returned by callout_drain() is passed back unchanged.
 */
static __inline int
callout_stop(struct callout* co)
{
	int rc;

	rc = callout_drain(co);
	return rc;
}

#endif /* _WIN32 */

#endif /* _SYS_SYSTM_H_ */


================================================
FILE: sys/sys/taskqueue.h
================================================
#ifndef _SYS_TASKQUEUE_H_
#define _SYS_TASKQUEUE_H_

/*
 * Remap taskqueue to direct calls.
 *
 * A task only records its handler.  TASK_INIT() stores the handler (the
 * arguments b and d are ignored) and taskqueue_enqueue() runs it at once
 * as func(NULL, 1); the queue argument is ignored.
 *
 * The same two-argument handler type is used on every platform, so the
 * handler is always called with defined arguments instead of being called
 * through a pointer to a function with no arguments.
 */
struct task {
	void (*func)(void *, int);
};
#define taskqueue_enqueue(tq, ta)	(ta)->func(NULL, 1)
#define TASK_INIT(a,b,c,d) do { 				\
	(a)->func = (void (*)(void *, int))(c); } while (0)
#define taskqueue_create_fast(_a, _b, _c, _d)	NULL
#define taskqueue_start_threads(_a, _b, _c, _d)

#define	taskqueue_drain(_a, _b)	/* XXX to be completed */
#define	taskqueue_free(_a)	/* XXX to be completed */

#define PRI_MIN                 (0)             /* Highest priority. */
#define PRI_MIN_ITHD            (PRI_MIN)
#define PI_NET                  (PRI_MIN_ITHD + 16)

#endif /* !_SYS_TASKQUEUE_H_ */


================================================
FILE: tcc_glue.h
================================================
/*
 * Copyright (c) 2010 Luigi Rizzo, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * headers to build userland ipfw under tcc.
 */
 
#ifndef _TCC_GLUE_H
#define	_TCC_GLUE_H

//#define	__restrict
#define	NULL	((void *)0)
typedef int size_t;
typedef unsigned char	u_char;
typedef unsigned char	uint8_t;
typedef unsigned char	u_int8_t;
typedef unsigned short	u_short;
typedef unsigned short	uint16_t;
typedef unsigned short	u_int16_t;
typedef int		__int32_t;
typedef int		int32_t;
typedef int		socklen_t;
typedef int		pid_t;
typedef unsigned int	time_t;
typedef unsigned int	uint;
typedef unsigned int	u_int;
typedef unsigned int	uint32_t;
typedef unsigned int	u_int32_t;
typedef unsigned int	gid_t;
typedef unsigned int	uid_t;
typedef unsigned long	u_long;
typedef unsigned long	uintptr_t;
typedef long long int	int64_t;
typedef unsigned long long	int uint64_t;
typedef unsigned long long	int u_int64_t;

typedef uint32_t	in_addr_t;
struct in_addr {
	uint32_t	s_addr;
};
struct sockaddr_in {
	uint8_t _sin_len;
        uint8_t	sin_family;
        uint16_t	sin_port;
        struct  in_addr sin_addr;
        char    sin_zero[8];
};
#define IFNAMSIZ	16
#define INET6_ADDRSTRLEN	64

struct in6_addr {
        union {
                uint8_t         __s6_addr8[16];
                uint16_t        __s6_addr16[8];
                uint32_t        __s6_addr32[4];
        } __u6; // _addr;                    /* 128-bit IP6 address */
};


#define LITTLE_ENDIAN 1234
#define BYTE_ORDER LITTLE_ENDIAN

/* to be revised */
#define	EX_OK		0
#define	EX_DATAERR	1
#define	EX_OSERR	2
#define	EX_UNAVAILABLE	3
#define	EX_USAGE	4
#define	EX_NOHOST	5

#define	EEXIST		1
#define	EINVAL		2
#define	ERANGE		3
#define	ESRCH		4

#define	IPPROTO_IP		1
#define	IPPROTO_IPV6		2
#define	IPPROTO_RAW		100

#define	IPTOS_LOWDELAY		100
#define	IPTOS_MINCOST		101
#define	IPTOS_RELIABILITY	102
#define	IPTOS_THROUGHPUT	103
#define	SOCK_RAW		12
#define	AF_INET			2
#define	AF_INET6		28

#define	INADDR_ANY		0


#define bcmp(src, dst, len)	memcmp(src, dst, len)
#define bcopy(src, dst, len)	memcpy(dst, src, len)
#define bzero(p, len)	memset(p, 0, len)
#define index(s, c)	strchr(s, c)

char *strsep(char **stringp, const char *delim);

void    warn(const char *, ...);
//void    warnx(const char *, ...);
#define warnx warn
void    err(int, const char *, ...);
#define	errx err

/*
 * Byte order and address conversion routines, all declared with the
 * stdcall calling convention.  Each declaration ends with exactly one
 * semicolon (a second one is an empty declaration at file scope).
 */
uint16_t	htons(uint16_t)__attribute__ ((stdcall));
uint16_t	ntohs(uint16_t)__attribute__ ((stdcall));
uint32_t	htonl(uint32_t)__attribute__ ((stdcall));
uint32_t	ntohl(uint32_t)__attribute__ ((stdcall));
int inet_aton(const char *cp, struct in_addr *pin)__attribute__ ((stdcall));
char * inet_ntoa(struct in_addr)__attribute__ ((stdcall));
const char * inet_ntop(int af, const void * src, char * dst,
         socklen_t size)__attribute__ ((stdcall));
int inet_pton(int af, const char * src, void * dst)__attribute__ ((stdcall));

struct group {
	gid_t	gr_gid;
	char	gr_name[16];
};
struct passwd {
	uid_t	pw_uid;
	char	pw_name[16];
};

#define getpwnam(s)	(NULL)
#define getpwuid(s)	(NULL)

#define getgrnam(x) (NULL)
#define getgrgid(x) (NULL)

int getopt(int argc, char * const argv[], const char *optstring);

int getsockopt(int s, int level, int optname, void * optval,
         socklen_t * optlen);

int setsockopt(int s, int level, int optname, const void *optval,
         socklen_t optlen);

struct  protoent {
        char    *p_name;           /* official protocol name */
        char    **p_aliases;  /* alias list */
        short   p_proto;                /* protocol # */
};

struct  servent {
        char    *s_name;           /* official service name */
        char    **s_aliases;  /* alias list */
        short   s_port;                 /* port # */
        char    *s_proto;          /* protocol to use */
};

struct  hostent {
        char    *h_name;           /* official name of host */
        char    **h_aliases;  /* alias list */
        short   h_addrtype;             /* host address type */
        short   h_length;               /* length of address */
        char    **h_addr_list; /* list of addresses */
#define h_addr  h_addr_list[0]          /* address, for backward compat */
};

struct hostent* gethostbyaddr(const char* addr, int len, int type)__attribute__ ((stdcall));
struct hostent* gethostbyname(const char *name)__attribute__ ((stdcall));

struct protoent* getprotobynumber(int number)__attribute__ ((stdcall));
struct protoent* getprotobyname(const char* name)__attribute__ ((stdcall));

struct servent* getservbyport(int port, const char* proto)__attribute__ ((stdcall));
struct servent* getservbyname(const char* name, const char* proto) __attribute__ ((stdcall));

extern int optind;
extern char *optarg;

#include <windef.h>

#define WSADESCRIPTION_LEN      256
#define WSASYS_STATUS_LEN       128

typedef struct WSAData {
        WORD                    wVersion;
        WORD                    wHighVersion;
        char                    szDescription[WSADESCRIPTION_LEN+1];
        char                    szSystemStatus[WSASYS_STATUS_LEN+1];
        unsigned short          iMaxSockets;
        unsigned short          iMaxUdpDg;
        char FAR *              lpVendorInfo;
} WSADATA, * LPWSADATA;

int WSAStartup(
    WORD wVersionRequested,
    LPWSADATA lpWSAData
    );

int
WSACleanup(void);

int WSAGetLastError();

/*
 * return error on process handling
 *
 * Every process-related call is replaced by a macro that evaluates to -1,
 * so callers take their normal error path. The arguments are discarded
 * and never evaluated.
 */
#define	pipe(f)		(-1)
#define	kill(p, s)	(-1)
#define	waitpid(w,s,o)	(-1)
#define fork(x)		(-1)
#define execvp(f, a)	(-1)

/*
 * Decoding of a wait status word: the low 7 bits (0177) hold the
 * termination state, the bits from 8 upwards hold the exit code.
 */
#define _W_INT(i)       (i)
#define _WSTATUS(x)     (_W_INT(x) & 0177)
#define WIFEXITED(x)    (_WSTATUS(x) == 0)
#define WEXITSTATUS(x)  (_W_INT(x) >> 8)
#define _WSTOPPED       0177            /* _WSTATUS if process is stopped */
#define WIFSIGNALED(x)  (_WSTATUS(x) != _WSTOPPED && _WSTATUS(x) != 0)
#define WTERMSIG(x)     (_WSTATUS(x))

#endif /* _TCC_GLUE_H */


================================================
FILE: test/Makefile
================================================
#
# $Id: Makefile 5626 2010-03-04 21:55:22Z luigi $
#
# Makefile for building userland tests
# this is written in a form compatible with gmake

# Sources of the scheduler test program: the test glue, one file per
# scheduling algorithm, the heap they rely on and the test driver.
SCHED_SRCS = test_dn_sched.c \
	dn_sched_fifo.c \
	dn_sched_wf2q.c \
	dn_sched_qfq.c \
	dn_sched_rr.c \
	dn_heap.c \
	main.c
SCHED_OBJS = $(SCHED_SRCS:.c=.o)

# Sources of the standalone heap test.
HEAP_SRCS = dn_heap.c
HEAP_SRCS += test_dn_heap.c
HEAP_OBJS = $(HEAP_SRCS:.c=.o)

# Sources that are not in this directory are looked up in ../dummynet2
VPATH=	.:../dummynet2

#CFLAGS = -I../dummynet2/include -I. -Wall -Werror -O3 -DIPFW
CFLAGS = -I. -I../dummynet2/include/netinet/ipfw -DIPFW
CFLAGS +=  -Wall -Werror
CFLAGS += -g -O3
# test_heap is not built by default.
# (The comment is kept off the assignment line so that no trailing
# blank ends up in the value.)
TARGETS= test_sched

# 'all' names no file: declare it phony so that a stray file called
# 'all' cannot disable the build.
.PHONY: all
all: $(TARGETS)

# Link the heap test from its objects (not part of the default targets).
test_heap : $(HEAP_OBJS)
	$(CC) -o $@ $(HEAP_OBJS)

# Link the scheduler test from its objects.
test_sched : $(SCHED_OBJS)
	$(CC) -o $@ $(SCHED_OBJS)

# Header dependencies: every scheduler object is rebuilt when dn_test.h
# changes, and main.o also when mylist.h changes.
$(SCHED_OBJS): dn_test.h
main.o: mylist.h

# Remove objects, test binaries and core dumps.
# test_heap is listed explicitly because it is not part of $(TARGETS).
# 'rm -f' stays silent about files that do not exist; the leading '-'
# keeps make going on any other error.
.PHONY: clean
clean:
	-rm -f *.o $(TARGETS) test_heap *.core

# Files that go into the source tarball.
ALLSRCS = $(SCHED_SRCS) dn_test.h mylist.h \
	dn_sched.h dn_heap.h ip_dn_private.h Makefile
TMPBASE = /tmp/testXYZ
# Deliberately not named TMPDIR: when TMPDIR is present in the caller's
# environment, a makefile assignment replaces it for every recipe, so the
# compiler and other tools would be pointed at a staging directory that
# usually does not exist.
TGZDIR = $(TMPBASE)/test

# Build /tmp/test.tgz from a scratch copy of the sources.
# The files are spread over this directory and the parent one, so both
# copies are attempted and individual failures are ignored ('-').
# The parent directory is copied first so that files of the same name
# (e.g. Makefile) end up being the ones from this directory.
.PHONY: tgz
tgz:
	-rm -rf $(TGZDIR)
	mkdir -p $(TGZDIR)
	-(cd .. && cp -p $(ALLSRCS) $(TGZDIR))
	-cp -p $(ALLSRCS) $(TGZDIR)
	ls -la  $(TGZDIR)
	(cd $(TMPBASE) && tar cvzf /tmp/test.tgz test)


================================================
FILE: test/basic_ipfw.sh
================================================
#!/bin/sh

IPFW=./ipfw/ipfw
PING=/bin/ping
RH=127.0.0.1		# remote host
R=10			# test rule number
P=1			# test pipe number

# Report a failed check on stdout.
# Note: despite the name this only prints its arguments; the script
# keeps running afterwards.
abort() {
	echo $*
}

#insmod dummynet2/ipfw_mod.ko
#$IPFW show > /dev/null
#$IPFW pipe show 
echo "Flushing rules, do you agree ?"
$IPFW flush

# Remove the test rule and the test pipe.
# Error output is discarded because either of them may not exist.
clean() {
	$IPFW delete $R 2> /dev/null
	$IPFW pipe $P delete 2> /dev/null
}

# simple counter/allow test: 100 pings to the remote host are expected
# to show up as 400 in the rule counter
echo -n "counter/allow test..."
clean
$IPFW add $R allow icmp from any to $RH > /dev/null
$PING -f -c100 $RH > /dev/null
counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f3`
# $counter is quoted and the test is written in positive form, so that an
# empty or non-numeric value (e.g. rule not found) is reported as a
# failure instead of making '[' error out and the check silently pass.
[ "$counter" -eq 400 ] 2> /dev/null || abort "Wrong counter $counter 400"
echo "...OK"

# simple drop test: 10 pings against a deny rule are expected to
# show up as 10 in the rule counter
echo -n "deny test..."
clean
$IPFW add $R deny icmp from any to $RH > /dev/null
$PING -f -c10 -W 1 $RH > /dev/null
counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4`
# quoted, positive-form test: an empty or non-numeric counter is a failure
[ "$counter" -eq 10 ] 2> /dev/null || abort "Wrong counter $counter 10"
echo "...OK"

# pipe delay test: with a 2000ms pipe the counter is sampled right after
# the pings (expected 10) and again 2 seconds later (expected 20)
echo -n "pipe delay test..."
clean
$IPFW pipe $P config delay 2000ms >/dev/null
$IPFW add $R pipe $P icmp from any to $RH >/dev/null
$PING -f -c10 -W 1 $RH > /dev/null
counter1=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4`
sleep 2
counter2=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4`
# Each message reports the value that was actually checked (counter1 /
# counter2, not the stale $counter of a previous test). Quoted,
# positive-form tests make an empty or non-numeric counter a failure.
[ "$counter1" -eq 10 ] 2> /dev/null || abort "Wrong counter $counter1 10"
[ "$counter2" -eq 20 ] 2> /dev/null || abort "Wrong counter $counter2 20"
echo "...OK"

# pipe bw test: with a 2Kbit/s pipe the counter must not exceed 30,
# neither right after the pings nor one second later
echo -n "pipe bw test..."
clean
$IPFW pipe $P config bw 2Kbit/s >/dev/null
$IPFW add $R pipe $P icmp from any to $RH >/dev/null
$PING -i 0.1 -c10 -W 1 $RH > /dev/null
counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4`
# quoted, positive-form test: an empty or non-numeric counter is a failure
[ "$counter" -le 30 ] 2> /dev/null || abort "Wrong counter $counter should be < 30"
sleep 1
counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4`
[ "$counter" -le 30 ] 2> /dev/null || abort "Wrong counter $counter should be < 30"
echo "...OK"

# Final clean
clean


================================================
FILE: test/dn_test.h
================================================
/*
 * $Id: dn_test.h 5626 2010-03-04 21:55:22Z luigi $
 *
 * userspace compatibility code for dummynet schedulers
 */

#ifndef _DN_TEST_H
#define _DN_TEST_H
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>	/* bzero, ffs, ... */
#include <string.h>	/* strcmp */
#include <errno.h>
#include <sys/queue.h>
#include <sys/time.h>

/*
 * Debugging helpers.
 *   debug  verbosity level, defined by the test program.
 *   ND()   compiled out: the arguments are discarded.
 *   D1()   compiled out as well.
 *   D()    always prints to stderr: the name of the calling function
 *          (left-aligned in 8 columns), the formatted message, a newline.
 *   DX()   like D(), but only when 'debug' is strictly greater than 'lev'.
 */
extern int debug;
#define ND(fmt, args...) do {} while (0)
#define D1(fmt, args...) do {} while (0)
#define D(fmt, args...) fprintf(stderr, "%-8s " fmt "\n",      \
        __FUNCTION__, ## args)
#define DX(lev, fmt, args...) do {              \
        if (debug > lev) D(fmt, ## args); } while (0)


/*
 * Fallback offsetof(), used only when no earlier header provides one
 * (redefining the standard macro unconditionally clashes with <stddef.h>).
 * The address is converted through size_t before being narrowed to int:
 * a direct pointer-to-int cast is a size-mismatch warning on 64-bit
 * targets, which is fatal when building with -Werror.
 */
#ifndef offsetof
#define offsetof(t,m) ((int)(size_t)(&((t *)0L)->m))
#endif

#include <mylist.h>

/* prevent include of other system headers */
#define	_NETINET_IP_VAR_H_	/* ip_fw_args */
#define _IPFW2_H
#define _SYS_MBUF_H_

enum	{
	DN_QUEUE,
};

enum	{
	DN_SCHED_FIFO,
	DN_SCHED_WF2QP,
};

struct dn_id {
	int type, subtype, len, id;
};
/*
 * Userspace stand-in for a flowset descriptor, extended with the
 * bookkeeping used by the test traffic generator.
 */
struct dn_fs {
	/* flowset parameters; test/main.c stores the weight in par[0]
	 * and the packet length in par[1] */
	int par[4];	/* flowset parameters */

	/* simulation entries.
	 * 'index' is not strictly necessary
	 * y is used for the inverse mapping ,
	 */
	int index;	/* position of this flowset in the flowset array */
	int y;	/* inverse mapping */
	int base_y;	/* inverse mapping */
	int next_y;	/* inverse mapping */
	int n_flows;	/* number of flows attached to this flowset */
	int first_flow;	/* index of the first flow of this flowset */
	int next_flow;	/* first_flow + n_flows */
	/*
	 * when generating, let 'cur' go from 0 to n_flows-1,
	 * then point to flow first_flow + cur
	 * NOTE(review): test/main.c initialises 'cur' to first_flow and
	 * stores absolute flow indexes in it -- confirm which convention
	 * is the intended one.
	 */
	int	cur;
};
struct dn_sch {
};
struct dn_flow {
	struct dn_id oid;
	int length;
	int len_bytes;
	int drops;
	uint64_t tot_bytes;
	uint32_t flow_id;
	struct list_head h;	/* used by the generator */
};
struct dn_link {
};

struct ip_fw_args {
};

struct mbuf {
        struct {
                int len;
        } m_pkthdr;
        struct mbuf *m_nextpkt;
	int flow_id;	/* for testing, index of a flow */
	//int flowset_id;	/* for testing, index of a flowset */
	void *cfg;	/* config args */
};

#define MALLOC_DECLARE(x)
/*
 * Userspace KASSERT: when the condition 'x' is false, print the
 * parenthesised printf argument list 'y' and terminate with a failure
 * status. Both statements are inside the 'if' block, so a successful
 * assertion has no effect (exit() used to run unconditionally, and with
 * status 0).
 */
#define KASSERT(x, y)	do { if (!(x)) { printf y ; exit(1); } } while (0)
struct ipfw_flow_id {
};

typedef void * module_t;
struct _md_t {
	const char *name;
	int (*f)(module_t, int, void *);
	void *p;
};
typedef struct _md_t moduledata_t;
#define DECLARE_MODULE(name, b, c, d)	\
	moduledata_t *_g_##name = & b
#define MODULE_DEPEND(a, b, c, d, e)

#ifdef IPFW
#include <dn_heap.h>
#include <ip_dn_private.h>
#include <dn_sched.h>
#else
struct dn_queue {
        struct dn_fsk *fs;             /* parent flowset. */
        struct dn_sch_inst *_si;	/* parent sched instance. */
};
struct dn_schk {
};
struct dn_fsk {
	struct dn_fs fs;
	struct dn_schk *sched;
};
struct dn_sch_inst {
	struct dn_schk *sched;
};
struct dn_alg {
	int type;
	const char *name;
	void *enqueue, *dequeue;
	int q_datalen, si_datalen, schk_datalen;
	int (*config)(struct dn_schk *);
	int (*new_sched)(struct dn_sch_inst *);
	int (*new_fsk)(struct dn_fsk *);
        int (*new_queue)(struct dn_queue *q);
};

#endif

#ifndef __FreeBSD__
int fls(int);
#endif

/*
 * Append mbuf 'm' at the tail of queue 'q' and make it the last
 * element of the chain.
 */
static inline void
mq_append(struct mq *q, struct mbuf *m)
{
        if (q->head != NULL)
                q->tail->m_nextpkt = m;	/* link after the current tail */
        else
                q->head = m;		/* queue was empty */
        q->tail = m;
        m->m_nextpkt = NULL;
}

#endif /* _DN_TEST_H */


================================================
FILE: test/dynrules.sh
================================================
#!/bin/sh
#
# 20100507 marta, quick test for dyn rules
# ./ipfw/ipfw -d show |grep \ 80

# paths of the kernel module and of the userland tool, relative to the
# directory the script is run from
IPFW_MOD=dummynet2/ipfw_mod.ko
IPFW=ipfw/ipfw

# main
# remove any previous loaded module
/sbin/rmmod ipfw_mod 
/sbin/insmod ${IPFW_MOD}
# write 25 to the dyn_ack_lifetime module parameter
echo "25" >  /sys/module/ipfw_mod/parameters/dyn_ack_lifetime
# rule set: check-state first, then two keep-state allow rules
${IPFW} add 1 check-state
${IPFW} add 9 allow all from any to any keep-state
${IPFW} add 10 allow all from any to onelab1.iet.unipi.it keep-state

# open a TCP connection to port 80; presumably meant to create a dynamic
# rule that can then be inspected with the command in the header comment
telnet 72.14.234.104 80 


================================================
FILE: test/interpolation.c
================================================
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* gcc interpolation.c -o interpolation */

/*
 * Local versions of the BSD err(3) / errx(3) helpers.
 *
 * Both print the formatted message on stderr and terminate the program
 * with exit status 'eval'; err() also appends the description of the
 * current errno. They must not return: the callers in this file carry on
 * with unusable state (e.g. a NULL FILE pointer) if they do.
 */
void
err(int eval, const char *fmt, ...)
{
	int saved_errno = errno;	/* the prints below may change errno */
	va_list ap;

	if (fmt != NULL) {
		va_start(ap, fmt);
		vfprintf(stderr, fmt, ap);
		va_end(ap);
		fprintf(stderr, ": ");
	}
	fprintf(stderr, "%s\n", strerror(saved_errno));
	exit(eval);
}

void
errx(int eval, const char *fmt, ...)
{
	va_list ap;

	if (fmt != NULL) {
		va_start(ap, fmt);
		vfprintf(stderr, fmt, ap);
		va_end(ap);
	}
	fprintf(stderr, "\n");
	exit(eval);
}
        

#define ED_MAX_SAMPLES_NO 1000
#define ED_MAX_LINE_LEN 128
#define EX_DATAERR 1
#define EX_UNAVAILABLE  3
#define ED_TOK_DELAY    "delay"
#define ED_TOK_PROB     "prob"
#define ED_SEPARATORS   " \t\n"
#define ED_TOK_PROFILE_NO "profile_no"


struct point {
	double prob;		/* y */
	double delay;		/* x */
};

struct profile {
        char    filename[128];                   /* profile filename */
        int     samples[ED_MAX_SAMPLES_NO+1];    /* may be shorter */
        int     samples_no;                     /* actual len of samples[] */
};

/*
 * Intended to return 1 if s is a non-negative decimal number (digits
 * with at most one '.') and 0 otherwise.
 * The check is currently compiled out, so every string -- including a
 * NULL pointer, which is never dereferenced -- is reported as valid.
 */
static int
is_valid_number(const char *s)
{
#if 0
        int i, dots_found = 0;
        int len = strlen(s);

        for (i = 0; i<len; ++i)
                if (!isdigit(s[i]) && (s[i] !='.' || ++dots_found > 1))
                        return 0;
#endif
        return 1;
}

/*
 * qsort()-style comparison of two points: order by probability first
 * and by delay when the probabilities are equal.
 * Returns -1, 0 or 1.
 */
static int
compare_points(const void *vp1, const void *vp2)
{
	const struct point *a = vp1;
	const struct point *b = vp2;
	double diff = a->prob - b->prob;

	if (diff == 0)	/* same probability: fall back to the delay */
		diff = a->delay - b->delay;
	return (diff > 0) - (diff < 0);
}

#define ED_EFMT(s) 1,"error in %s at line %d: "#s,filename,lineno

/*
 * The points defined by the user are stored in the ponts structure.
 * The number of user defined points is stored in points_no.
 *       We assume that The last point for the '1' value of the
 *       probability should be defined. (XXX add checks for this)
 * The user defined sampling value is stored in samples_no.
 * The resulting samples are in the "samples" pointer.
 */
static void
interpolate_samples(struct point *p, int points_no, 
		int *samples, int samples_no, const char *filename)
{
	double dy;		/* delta on the y axis */
	double y;		/* current value of y */
	double x;		/* current value of x */
	double m;		/* the y slope */
	int i;			/* samples index */
	int curr;		/* points current index */

	dy = 1.0/samples_no;
	y = 0;

	for (i=0, curr = 0; i < samples_no; i++, y+=dy) {
		/* This statment move the curr pointer to the next point
		 * skipping the points with the same x value. We are
		 * guaranteed to exit from the loop because the
		 * last possible value of y is stricly less than 1
		 * and the last possible value of the y points is 1 */
		while ( y >= p[curr+1].prob ) curr++;

		/* compute the slope of the curve */
		m = (p[curr+1].delay - p[curr].delay) / (p[curr+1].prob - p[curr].prob);
		/* compute the x value starting from the current point */
		x = p[curr].delay + (y - p[curr].prob) * m;
		samples[i] = x;
	}

	/* add the last sample */
	samples[i] = p[curr+1].delay;
}

#if 0
static void
interpolate_samples_old(struct point *points, int points_no, 
		int *samples, int samples_no, const char *filename)
{
	int i;		/* pointer to the sampled array */
	int j = 0;	/* pointer to user defined samples */
	double dy;	/* delta y */
	double y;	/* current value of y */
	int x;		/* computed value of x */
	double m;	/* slope of the line */
	double y1, x1, y2, x2;	/* two points of the current line */

	/* make sure that there are enough points. */
	/* XXX Duplicated shoule be removed */
	if (points_no < 3)
	    errx(EX_DATAERR, "%s too few samples, need at least %d",
		filename, 3);

	qsort(points, points_no, sizeof(struct point), compare_points);

	samples_no--;
	dy = 1.0/samples_no;
	printf("\nsamples no is %d dy is %f ", samples_no, dy);

	/* start with the first two points */
	y1 = points[j].prob * samples_no;
	x1 = points[j].delay;
	j++;
	y2 = points[j].prob * samples_no;
	x2 = points[j].delay;

	m = (y2-y1)/(x2-x1);
	printf("\nStart");
	printf("\n\tCurrent points x1 y1 %f %f next point x2y2 %f %f m %f\n",
		 x1, y1, x2, y2, m);

	y = 0;
	x = x1;

	for(i=0; i < samples_no+1; i++, y+=dy) {
		printf("\ni:%d j:%d y:%f real y:%f", i, j, y, y*samples_no);
		if ( (y*samples_no) >= y2 ) { /* move to the next point */
			j++;
			if ( j >= points_no ) {
				printf("\n\tNo more points, exit with j: %d i: %d and y:%f %f\n",
					 j, i, y, (y*samples_no));
				break;	/* no more user defined points */
			}
			/* load a new point */
			y1 = y2;
			x1 = x2;
			y2 = points[j].prob * samples_no;
			x2 = points[j].delay;
			m = (y2-y1)/(x2-x1);
			if (x1==x2) { /* m = infinito */
				m = -1;
				x = x2;
			}
			/* very small m problem */
			printf ("\ndelta %f\n", (y1 - y2));
			if (abs(y1 - y2) < 0.00001) { /* m = 0 XXX Should this magic number depend on samples_no ? */
				m = 0;
				x = x2;
			}
			printf("\n\tCurrent points x1 y1 %f %f next point x2y2 %f %f (%f/%f)=m \n",
				 x1, y1, x2, y2, (y2-y1), (x2-x1), m);
		}
		printf("\n\tcompute step y %f x[%d]=%d ",
			y, i, x);
		if ((m != -1) && ( m != 0 )) {
			x = x + (dy * samples_no)/m;
		}
		samples[i] = x;
		printf(" dy %f x new %d\n", dy*samples_no, x);
		printf(" m %f (dy * samples_no)/m %f \n", m, (dy * samples_no)/m);
	}

	x = samples[i-1];
	printf("Finish i is %d samples_no is %d\n", i, samples_no);
	/* The last point has a probability less than 1 */
	for (; i <= samples_no; i++)
		samples[i] = x;
}
#endif

static void
load_profile(struct profile *p)
{
	FILE    *f;			/* file handler */
	char    line[ED_MAX_LINE_LEN];
	int     lineno = 0;
	int     do_points = 0;
	int     delay_first = -1;
	int i;

	struct	point points[1000]; /* MAX_POINTS_NO */
	int     points_no = 0;

	char *filename = p->filename;
	f = fopen(filename, "r");
	if (f == NULL) {
	    err(EX_UNAVAILABLE, "fopen: %s", filename);
	}


	while (fgets(line, ED_MAX_LINE_LEN, f)) {         /* read commands */
		char *s, *cur = line, *name = NULL, *arg = NULL;

		++lineno;

		/* parse the line */
		while (cur) {
			s = strsep(&cur, ED_SEPARATORS);
			if (s == NULL || *s == '#')
				break;
			if (*s == '\0')
				continue;
			if (arg)
				errx(ED_EFMT("too many arguments"));
			if (name == NULL)
				name = s;
			else
				arg = s;
		}

		if (name == NULL)
			continue;

		if (!strcasecmp(name, ED_TOK_DELAY)) {
		    if (do_points)
			errx(ED_EFMT("duplicated token: %s"), name);
		    delay_first = 1;
		    do_points = 1;
		    continue;
		} else if (!strcasecmp(name, ED_TOK_PROB)) {
		    if (do_points)
			errx(ED_EFMT("duplicated token: %s"), name);
		    delay_first = 0;
		    do_points = 1;
		    continue;
		}
		if (!strcasecmp(name, ED_TOK_PROFILE_NO)) {
			int p_no = atof(arg);
			if (p_no <= 0) {
				p_no = 100;
				printf("invalid interpolation samples, using %d\n",
					 p_no);
			}
			if (p_no > ED_MAX_SAMPLES_NO) {
				p_no = ED_MAX_SAMPLES_NO;
				printf("invalid interpolation samples, using %d\n",
					 p_no);
			}

			p->samples_no = p_no;
		    continue;

		} else if (do_points) {
		    if (!is_valid_number(name) || !is_valid_number(arg))
			errx(ED_EFMT("invalid point found"));
		    if (delay_first) {
			points[points_no].delay = atof(name);
			points[points_no].prob = atof(arg);
		    } else {
			points[points_no].delay = atof(arg);
			points[points_no].prob = atof(name);
		    }
		    if (points[points_no].prob > 1.0)
			errx(ED_EFMT("probability greater than 1.0"));
		    ++points_no;
	/* XXX no more that 1000 */
		    continue;
		} else {
		    errx(ED_EFMT("unrecognised command '%s'"), name);
		}
	}

	for(i=0; i < p->samples_no; i++) {
		p->samples[i] = 666;
	}

	/* This code assume the user define a value of X for the sampling value,
	 * and that:
	 * - the value stored in the emulator structure is X;
	 * - the allocated structure for the samples is X+1;
	 */
	interpolate_samples(points, points_no, p->samples, p->samples_no, filename);

	// User defined samples
	printf("\nLoaded %d points:\n", points_no);
	for(i=0; i < points_no; i++) {
		printf("%f %f\n", points[i].prob, points[i].delay);
	}
	printf("\n");
	printf("The sample value is %d \n", p->samples_no);

}

/*
 * Load the profile named on the command line and print the resulting
 * samples, one "index value" pair per line.
 * Returns -1 on a usage error, 0 otherwise.
 */
int main(int argc, char **argv)
{
	struct profile p;
	const char *filename;
	int i;

	if (argc < 2) {
		printf("Usage: ./interpolation <filename>\n");
		return -1;
	}
	filename = argv[1];

	/* refuse names that do not fit rather than silently truncating
	 * them (strncpy would also leave the copy unterminated) */
	if (strlen(filename) >= sizeof(p.filename)) {
		printf("filename too long (max %d characters)\n",
			(int)sizeof(p.filename) - 1);
		return -1;
	}

	/* start from a fully defined profile */
	memset(&p, 0, sizeof(p));
	strcpy(p.filename, filename);

	load_profile(&p);
	printf("-----------\n");
	for (i=0; i<=p.samples_no; i++)
		printf("%d %d\n", i, p.samples[i]);
	printf("-----------\n");
	return 0;
}


================================================
FILE: test/main.c
================================================
/*
 * $Id: main.c 5626 2010-03-04 21:55:22Z luigi $
 *
 * Testing program for schedulers
 *
 * The framework include a simple controller which, at each
 * iteration, decides whether we can enqueue and/or dequeue.
 * Then the mainloop runs the required number of tests,
 * keeping track of statistics.
 */

#include "dn_test.h"

struct q_list {
	struct list_head h;
};

struct cfg_s {
	int ac;
	char * const *av;

	const char *name;
	int loops;
	struct timeval time;

	/* running counters */
	uint32_t	_enqueue;
	uint32_t	drop;
	uint32_t	pending;
	uint32_t	dequeue;

	/* generator parameters */
	int th_min, th_max;
	int maxburst;
	int lmin, lmax;	/* packet len */
	int flows;	/* number of flows */
	int flowsets;	/* number of flowsets */
	int wsum;	/* sum of weights of all flows */
	int max_y;	/* max random number in the generation */
	int cur_y, cur_fs;	/* used in generation, between 0 and max_y - 1 */
	const char *fs_config; /* flowset config */
	int can_dequeue;
	int burst;	/* count of packets sent in a burst */
	struct mbuf *tosend;	/* packet to send -- also flag to enqueue */

	struct mbuf *freelist;

	struct mbuf *head, *tail;	/* a simple tailq */

	/* scheduler hooks */
	int (*enq)(struct dn_sch_inst *, struct dn_queue *,
		struct mbuf *);
	struct mbuf * (*deq)(struct dn_sch_inst *);
	/* size of the three fields including sched-specific areas */
	int schk_len;
	int q_len; /* size of a queue including sched-fields */
	int si_len; /* size of a sch_inst including sched-fields */
	char *q;	/* array of flow queues */
		/* use a char* because size is variable */
	struct dn_fsk *fs;	/* array of flowsets */
	struct dn_sch_inst *si;
	struct dn_schk *sched;

	/* generator state */
	int state;		/* 0 = going up, 1: going down */

	/*
	 * We keep lists for each backlog level, and always serve
	 * the one with shortest backlog. llmask contains a bitmap
	 * of lists, and ll are the heads of the lists. The last
	 * entry (BACKLOG) contains all entries considered 'full'
	 * XXX to optimize things, entry i could contain queues with
	 * 2^{i-1}+1 .. 2^i entries.
	 */
#define BACKLOG	30
	uint32_t	llmask;
	struct list_head ll[BACKLOG + 10];
};

/* FI2Q and Q2FI converts from flow_id to dn_queue and back.
 * We cannot easily use pointer arithmetic because it is variable size.
 *
 * The queues live back to back in the byte array c->q, each one
 * c->q_len bytes long:
 *   FI2Q(c, i) is the address of the i-th queue;
 *   Q2FI(c, q) is the index of the queue at address q.
  */
#define FI2Q(c, i)	((struct dn_queue *)((c)->q + (c)->q_len * (i)))
#define Q2FI(c, q)	(((char *)(q) - (c)->q)/(c)->q_len)

int debug = 0;

struct dn_parms dn_cfg;

static void controller(struct cfg_s *c);

/*
 * Release a packet: the mbuf goes back to the freelist and, while the
 * backlog of its flow is below BACKLOG, the flow's queue is moved to the
 * list matching that backlog. Counts the packet as dropped.
 * Always returns 0.
 */
int
drop(struct cfg_s *c, struct mbuf *m)
{
	struct dn_queue *fq = FI2Q(c, m->flow_id);
	int backlog = fq->ni.length; // XXX or ffs...

	c->drop++;

	ND("q %p id %d current length %d", fq, m->flow_id, backlog);
	if (backlog < BACKLOG) {
		struct list_head *entry = &fq->ni.h;

		/* relink the queue on the list for its current backlog */
		list_del(entry);
		list_add_tail(entry, &c->ll[backlog]);
		/* clear the bit of the next level up, set the one of this level */
		c->llmask = (c->llmask & ~(1<<(backlog+1))) | (1<<(backlog));
	}
	/* push the mbuf on the freelist */
	m->m_nextpkt = c->freelist;
	c->freelist = m;
	return 0;
}

/*
 * Hand a packet to the scheduler under test, or, when none is
 * configured, append it to the built-in tail queue.
 * A non-zero return value tells the caller to drop the packet.
 */
static int
enqueue(struct cfg_s *c, void *_m)
{
	struct mbuf *pkt = _m;

	if (c->enq != NULL)
		return c->enq(c->si, FI2Q(c, pkt->flow_id), pkt);

	/* no scheduler: plain FIFO */
	if (c->head != NULL)
		c->tail->m_nextpkt = pkt;
	else
		c->head = pkt;
	c->tail = pkt;
	return 0; /* default - success */
}

/*
 * Ask the scheduler under test for the next packet or, when none is
 * configured, take it from the head of the built-in tail queue.
 * Returns NULL when no packet is available.
 */
static void *
dequeue(struct cfg_s *c)
{
	struct mbuf *first;

	if (c->deq != NULL)
		return c->deq(c->si);

	first = c->head;
	if (first == NULL)
		return NULL;
	/* unlink the head of the FIFO */
	c->head = first->m_nextpkt;
	first->m_nextpkt = NULL;
	return first;
}

/*
 * Run c->loops iterations of the test. In each one the controller
 * decides whether a packet has to be enqueued (c->tosend) and whether a
 * dequeue may be attempted (c->can_dequeue); the counters in 'c' are
 * updated accordingly. Always returns 0.
 */
static int
mainloop(struct cfg_s *c)
{
	int iter;
	struct mbuf *pkt;

	for (iter = 0; iter < c->loops; iter++) {
		/* implement histeresis */
		controller(c);
		DX(3, "loop %d enq %d send %p rx %d",
			iter, c->_enqueue, c->tosend, c->can_dequeue);

		pkt = c->tosend;
		if (pkt != NULL) {
			c->_enqueue++;
			if (enqueue(c, pkt) == 0) {
				ND("enqueue ok");
				c->pending++;
			} else {
				drop(c, pkt);
				ND("loop %d enqueue fail", iter );
			}
		}

		if (!c->can_dequeue)
			continue;
		c->dequeue++;
		pkt = dequeue(c);
		if (pkt != NULL) {
			c->pending--;
			/* drop() recycles the mbuf but also counts a drop */
			drop(c, pkt);
			c->drop--;	/* compensate */
		}
	}
	DX(1, "mainloop ends %d", iter);
	return 0;
}

/*
 * Print (at debug level above 1) the total bytes accounted to each
 * flow queue, followed by the number of loops. Always returns 0.
 */
int
dump(struct cfg_s *c)
{
	int i;
	struct dn_queue *q;

	for (i=0; i < c->flows; i++) {
		q = FI2Q(c, i);
		/* tot_bytes is declared uint64_t in dn_test.h, which is not
		 * 'long long' on every ABI: cast it so that the argument
		 * matches the %lld conversion */
		DX(1, "queue %4d tot %10lld", i, (long long)q->ni.tot_bytes);
	}
	DX(1, "done %d loops\n", c->loops);
	return 0;
}

/*
 * Interpret a number in human form.
 *
 * s     string to parse (any base understood by strtol); NULL or empty
 *       strings and negative numbers are invalid and yield 0.
 * next  if not NULL, receives a pointer to the text following the
 *       number and its suffix, or NULL when nothing follows.
 * key   name of the value, only used in debug messages.
 *
 * Suffixes: K and M multiply by 10^3 and 10^6, k and m by 2^10 and 2^20,
 * n negates the value ("multiply by n"), w is accepted and ignored.
 * Any other character is reported and left for the caller in *next.
 */
static long
getnum(const char *s, char **next, const char *key)
{
	char *tail = NULL;
	long val = -1;

	if (next)	/* default */
		*next = NULL;
	if (s && *s) {
		DX(3, "token is <%s> %s", s, key ? key : "-");
		val = strtol(s, &tail, 0);
	} else {
		DX(3, "empty string");
	}
	if (val < 0) {
		DX(2, "invalid %s for %s", s ? s : "NULL", (key ? key : "") );
		return 0;	// invalid 
	}
	if (tail == NULL || *tail == '\0')
		return val;

	switch (*tail) {
	case 'n':
		val = -val;	/* multiply by n */
		break;
	case 'K':
		val = val*1000;
		break;
	case 'M':
		val = val*1000000;
		break;
	case 'k':
		val = val*1024;
		break;
	case 'm':
		val = val*1024*1024;
		break;
	case 'w':
		break;
	default:	/* not recognized */
		D("suffix %s for %s, next %p", tail, key, next);
		tail--;	/* cancels the increment below */
		break;
	}
	tail++;
	DX(3, "suffix now %s for %s, next %p", tail, key, next);
	if (next && *tail) {
		DX(3, "setting next to %s for %s", tail, key);
		*next = tail;
	}
	return val;
}

/*
 * flowsets are a comma-separated list of
 *     weight:maxlen:flows
 * indicating how many flows are hooked to that fs.
 * Both weight and range can be min-max-steps.
 * In a first pass we just count the number of flowsets and flows,
 * in a second pass we complete the setup.
 *
 * c     test configuration; pass 0 records 'fs' in c->fs_config, and
 *       both passes update c->flows, c->flowsets, c->wsum and c->max_y.
 * fs    the configuration string (only stored during pass 0; later
 *       passes re-read c->fs_config).
 * pass  0 to count only; any other value also fills c->fs[] and links
 *       every flow queue to its flowset, so c->fs and c->q must have
 *       been allocated.
 *
 * The string is parsed on a private copy, released on every return path.
 */
static void
parse_flowsets(struct cfg_s *c, const char *fs, int pass)
{
	char *s, *cur, *next;
	int n_flows = 0, n_fs = 0, wsum = 0;
	int i, j;
	struct dn_fs *prev = NULL;

	DX(3, "--- pass %d flows %d flowsets %d", pass, c->flows, c->flowsets);
	if (pass == 0)
		c->fs_config = fs;
	s = c->fs_config ? strdup(c->fs_config) : NULL;
	if (s == NULL) {
		if (pass == 0)
			D("no fsconfig");
		return;
	}
	for (next = s; (cur = strsep(&next, ","));) {
		char *p = NULL;
		int w, w_h, w_steps, wi;
		int len, len_h, l_steps, li;
		int flows;

		w = getnum(strsep(&cur, ":"), &p, "weight");
		if (w <= 0)
			w = 1;
		w_h = p ? getnum(p+1, &p, "weight_max") : w;
		w_steps = p ? getnum(p+1, &p, "w_steps") : (w_h == w ?1:2);
		len = getnum(strsep(&cur, ":"), &p, "len");
		if (len <= 0)
			len = 1000;
		len_h = p ? getnum(p+1, &p, "len_max") : len;
		l_steps = p ? getnum(p+1, &p, "l_steps") : (len_h == len ? 1 : 2);
		flows = getnum(strsep(&cur, ":"), NULL, "flows");
		if (flows == 0)
			flows = 1;
		DX(4, "weight %d..%d (%d) len %d..%d (%d) flows %d",
			w, w_h, w_steps, len, len_h, l_steps, flows);
		if (w == 0 || w_h < w || len == 0 || len_h < len ||
				flows == 0) {
			DX(4,"wrong parameters %s", c->fs_config);
			free(s);
			return;
		}
		n_flows += flows * w_steps * l_steps;
		for (i = 0; i < w_steps; i++) {
			wi = w + ((w_h - w)* i)/(w_steps == 1 ? 1 : (w_steps-1));
			for (j = 0; j < l_steps; j++, n_fs++) {
				struct dn_fs *dfs;
				int x;

				li = len + ((len_h - len)* j)/(l_steps == 1 ? 1 : (l_steps-1));
				x = (wi*2048)/li;
				DX(3, "----- fs %4d weight %4d lmax %4d X %4d flows %d",
					n_fs, wi, li, x, flows);
				if (pass == 0)
					continue;
				if (c->fs == NULL || c->flowsets <= n_fs) {
					D("error in number of flowsets");
					free(s);
					return;
				}
				/* only index c->fs[] once it is known to be
				 * allocated and large enough */
				dfs = &c->fs[n_fs].fs;
				wsum += wi * flows;
				dfs->par[0] = wi;
				dfs->par[1] = li;
				dfs->index = n_fs;
				dfs->n_flows = flows;
				dfs->cur = dfs->first_flow = prev==NULL ? 0 : prev->next_flow;
				dfs->next_flow = dfs->first_flow + dfs->n_flows;
				dfs->y = x * flows;
				dfs->base_y = (prev == NULL) ? 0 : prev->next_y;
				dfs->next_y = dfs->base_y + dfs->y;
				prev = dfs;
			}
		}
	}
	free(s);	/* nothing points into the copy any more */
	c->max_y = prev ? prev->base_y + prev->y : 0;
	c->flows = n_flows;
	c->flowsets = n_fs;
	c->wsum = wsum;
	if (pass == 0)
		return;

	/* now link all flows to their parent flowsets */
	DX(1,"%d flows on %d flowsets max_y %d", c->flows, c->flowsets, c->max_y);
	for (i=0; i < c->flowsets; i++) {
		struct dn_fs *dfs = &c->fs[i].fs;
		DX(1, "fs %3d w %5d l %4d flow %5d .. %5d y %6d .. %6d",
			i, dfs->par[0], dfs->par[1],
			dfs->first_flow, dfs->next_flow,
			dfs->base_y, dfs->next_y);
		for (j = dfs->first_flow; j < dfs->next_flow; j++) {
			struct dn_queue *q = FI2Q(c, j);
			q->fs = &c->fs[i];
		}
	}
}

static int
init(struct cfg_s *c)
{
	int i;
	int ac = c->ac;
	char * const *av = c->av;

	c->si_len = sizeof(struct dn_sch_inst);
	c->q_len = sizeof(struct dn_queue);
	moduledata_t *mod = NULL;
	struct dn_alg *p = NULL;

	c->th_min = 0;
	c->th_max = -20;/* 20 packets per flow */
	c->lmin = c->lmax = 1280;	/* packet len */
	c->flows = 1;
	c->flowsets = 1;
	c->name = "null";
	ac--; av++;
	while (ac > 1) {
		if (!strcmp(*av, "-n")) {
			c->loops = getnum(av[1], NULL, av[0]);
		} else if (!strcmp(*av, "-d")) {
			debug = atoi(av[1]);
		} else if (!strcmp(*av, "-alg")) {
			extern moduledata_t *_g_dn_fifo;
			extern moduledata_t *_g_dn_wf2qp;
			extern moduledata_t *_g_dn_rr;
			extern moduledata_t *_g_dn_qfq;
#ifdef WITH_KPS
			extern moduledata_t *_g_dn_kps;
#endif
			if (!strcmp(av[1], "rr"))
				mod = _g_dn_rr;
			else if (!strcmp(av[1], "wf2qp"))
				mod = _g_dn_wf2qp;
			else if (!strcmp(av[1], "fifo"))
				mod = _g_dn_fifo;
			else if (!strcmp(av[1], "qfq"))
				mod = _g_dn_qfq;
#ifdef WITH_KPS
			else if (!strcmp(av[1], "kps"))
				mod = _g_dn_kps;
#endif
			else
				mod = NULL;
			c->name = mod ? mod->name : "NULL";
			DX(3, "using scheduler %s", c->name);
		} else if (!strcmp(*av, "-len")) {
			c->lmin = getnum(av[1], NULL, av[0]);
			c->lmax = c->lmin;
			DX(3, "setting max to %d", c->th_max);
		} else if (!strcmp(*av, "-burst")) {
			c->maxburst = getnum(av[1], NULL, av[0]);
			DX(3, "setting max to %d", c->th_max);
		} else if (!strcmp(*av, "-qmax")) {
			c->th_max = getnum(av[1], NULL, av[0]);
			DX(3, "setting max to %d", c->th_max);
		} else if (!strcmp(*av, "-qmin")) {
			c->th_min = getnum(av[1], NULL, av[0]);
			DX(3, "setting min to %d", c->th_min);
		} else if (!strcmp(*av, "-flows")) {
			c->flows = getnum(av[1], NULL, av[0]);
			DX(3, "setting flows to %d", c->flows);
		} else if (!strcmp(*av, "-flowsets")) {
			parse_flowsets(c, av[1], 0);
			DX(3, "setting flowsets to %d", c->flowsets);
		} else {
			D("option %s not recognised, ignore", *av);
		}
		ac -= 2; av += 2;
	}
	if (c->maxburst <= 0)
		c->maxburst = 1;
	if (c->loops <= 0)
		c->loops = 1;
	if (c->flows <= 0)
		c->flows = 1;
	if (c->flowsets <= 0)
		c->flowsets = 1;
	if (c->lmin <= 0)
		c->lmin = 1;
	if (c->lmax <= 0)
		c->lmax = 1;
	/* multiply by N */
	if (c->th_min < 0)
		c->th_min = c->flows * -c->th_min;
	if (c->th_max < 0)
		c->th_max = c->flows * -c->th_max;
	if (c->th_max <= c->th_min)
		c->th_max = c->th_min + 1;
	if (mod) {
		p = mod->p;
		DX(3, "using module %s f %p p %p", mod->name, mod->f, mod->p);
		DX(3, "modname %s ty %d", p->name, p->type);
		c->enq = p->enqueue;
		c->deq = p->dequeue;
		c->si_len += p->si_datalen;
		c->q_len += p->q_datalen;
		c->schk_len += p->schk_datalen;
	}
	/* allocate queues, flowsets and one scheduler */
	c->q = calloc(c->flows, c->q_len);
	c->fs = calloc(c->flowsets, sizeof(struct dn_fsk));
	c->si = calloc(1, c->si_len);
	c->sched = calloc(c->flows, c->schk_len);
	if (c->q == NULL || c->fs == NULL) {
		D("error allocating memory for flows");
		exit(1);
	}
	c->si->sched = c->sched;
	if (p) {
		if (p->config)
			p->config(c->sched);
		if (p->new_sched)
			p->new_sched(c->si);
	}
	/* parse_flowsets links queues to their flowsets */
	parse_flowsets(c, av[1], 1);
	/* complete the work calling new_fsk */
	for (i = 0; i < c->flowsets; i++) {
		if (c->fs[i].fs.par[1] == 0)
			c->fs[i].fs.par[1] = 1000;	/* default pkt len */
		c->fs[i].sched = c->sched;
		if (p && p->new_fsk)
			p->new_fsk(&c->fs[i]);
	}

	/* initialize the lists for the generator, and put
	 * all flows in the list for backlog = 0
	 */
	for (i=0; i <= BACKLOG+5; i++)
		INIT_LIST_HEAD(&c->ll[i]);

	for (i = 0; i < c->flows; i++) {
		struct dn_queue *q = FI2Q(c, i);
		if (q->fs == NULL)
			q->fs = &c->fs[0]; /* XXX */
		q->_si = c->si;
		if (p && p->new_queue)
			p->new_queue(q);
		INIT_LIST_HEAD(&q->ni.h);
		list_add_tail(&q->ni.h, &c->ll[0]);
	}
	c->llmask = 1;
	return 0;
}


/*
 * Test driver: configure the run from the command line, time the main
 * loop and print a one-line summary (scheduler name, packets enqueued,
 * loops, elapsed time, nanoseconds per enqueued packet, queue
 * thresholds, flow configuration, drops). Always returns 0.
 */
int
main(int ac, char *av[])
{
	struct cfg_s c;
	struct timeval t_end;
	double ns_per_pkt;
	char flows_msg[40];	/* default flow description */
	int n;

	memset(&c, 0, sizeof(c));
	c.ac = ac;
	c.av = av;
	init(&c);

	gettimeofday(&c.time, NULL);
	mainloop(&c);
	gettimeofday(&t_end, NULL);

	/* elapsed time, borrowing one second when the usec part underflows */
	t_end.tv_sec -= c.time.tv_sec;
	t_end.tv_usec -= c.time.tv_usec;
	if (t_end.tv_usec < 0) {
		t_end.tv_usec += 1000000;
		t_end.tv_sec--;
	}
	c.time = t_end;

	ns_per_pkt = t_end.tv_sec*1000000 + t_end.tv_usec;	/* microseconds */
	ns_per_pkt *= 1000;	/* convert to nanoseconds */
	ns_per_pkt /= c._enqueue;

	sprintf(flows_msg, "1::%d", c.flows);
	D("%-8s n %d %d time %d.%06d %8.3f qlen %d %d flows %s drops %d",
		c.name, c._enqueue, c.loops,
		(int)c.time.tv_sec, (int)c.time.tv_usec, ns_per_pkt,
		c.th_min, c.th_max,
		c.fs_config ? c.fs_config : flows_msg, c.drop);
	dump(&c);
	DX(1, "done ac %d av %p", ac, av);
	for (n = 0; n < ac; n++)
		DX(1, "arg %d %s", n, av[n]);
	return 0;
}

/*
 * The controller decides whether in this iteration we should send
 * (the packet is in c->tosend) and/or receive (flag c->can_dequeue).
 *
 * On return c->tosend is NULL when nothing has to be sent, otherwise
 * it points to a packet (recycled from c->freelist or freshly
 * allocated) whose cfg, flow_id and length fields have been filled in.
 */
static void
controller(struct cfg_s *c)
{
	struct mbuf *m;
	struct dn_fs *fs;
	int flow_id;

	/* hysteresis between max and min: switch to state 1 when the
	 * pending count reaches th_max, and go back to state 0 only
	 * once it has dropped to th_min
	 */
	if (c->state == 0 && c->pending >= c->th_max)
		c->state = 1;
	else if (c->state == 1 && c->pending <= c->th_min)
		c->state = 0;
	ND(1, "state %d pending %2d", c->state, c->pending);
	c->can_dequeue = c->state;
	c->tosend = NULL;
	/* in state 1 no packet is generated in this iteration */
	if (c->state)
		return;

    /* "if (1)" selects the list-based generator; the else branch
     * is compiled but never executed
     */
    if (1) {
	int i;
	struct dn_queue *q;
	struct list_head *h;

	/* index of the lowest bit set in llmask, -1 if llmask is 0 */
	i = ffs(c->llmask) - 1;
	if (i < 0) {
		DX(2, "no candidate");
		c->can_dequeue = 1;
		return;
	}
	/* take the first flow off list i ... */
	h = &c->ll[i];
	ND(1, "backlog %d p %p prev %p next %p", i, h, h->prev, h->next);
	q = list_first_entry(h, struct dn_queue, ni.h);
	list_del(&q->ni.h);
	flow_id = Q2FI(c, q);
	DX(2, "extracted flow %p %d backlog %d", q, flow_id, i);
	/* ... and clear the list's bit if that was its last entry */
	if (list_empty(h)) {
		ND(2, "backlog %d empty", i);
		c->llmask &= ~(1<<i);
	}
	/* move the flow to the tail of the next list (i+1) */
	ND(1, "before %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next);
	list_add_tail(&q->ni.h, h+1);
	ND(1, " after %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next);
	/* the bit for list i+1 is set only while i < BACKLOG, so this
	 * function never marks list BACKLOG+1 as a candidate
	 */
	if (i < BACKLOG) {
		ND(2, "backlog %d full", i+1);
		c->llmask |= 1<<(1+i);
	}
	fs = &q->fs->fs;
	/* index of the flow's flowset within the c->fs array */
	c->cur_fs = q->fs - c->fs;
	fs->cur = flow_id;
    } else {
	/* XXX this does not work ? */
	/* now decide whom to send the packet, and the length */
	/* lookup in the flow table */
	if (c->cur_y >= c->max_y) {	/* handle wraparound */
		c->cur_y = 0;
		c->cur_fs = 0;
	}
	fs = &c->fs[c->cur_fs].fs;
	flow_id = fs->cur++;
	if (fs->cur >= fs->next_flow)
		fs->cur = fs->first_flow;
	c->cur_y++;
	if (c->cur_y >= fs->next_y)
		c->cur_fs++;
    }

	/* construct a packet: reuse the head of the freelist (linked
	 * through m_nextpkt) if there is one, otherwise allocate
	 */
	if (c->freelist) {
		m = c->tosend = c->freelist;
		c->freelist = c->freelist->m_nextpkt;
	} else {
		m = c->tosend = calloc(1, sizeof(struct mbuf));
	}
	/* allocation failed: c->tosend is NULL, nothing is sent.
	 * NOTE(review): the flow has already been moved to the next
	 * list above -- confirm this is harmless for the accounting.
	 */
	if (m == NULL)
		return;

	m->cfg = c;
	m->m_nextpkt = NULL;
	m->m_pkthdr.len = fs->par[1]; // XXX maxlen
	m->flow_id = flow_id;

	ND(2,"y %6d flow %5d fs %3d weight %4d len %4d",
		c->cur_y, m->flow_id, c->cur_fs,
		fs->par[0], m->m_pkthdr.len);

}

/*
Packet allocation:
to achieve a distribution that matches weights, for each X=w/lmax class
we should generate a number of packets proportional to Y = X times the number
of flows in the class.
So we construct an array with the cumulative distribution of Y's,
and use it to identify the flow via inverse mapping (if the Y's are
not too many we can use an array for the lookup). In practice,
each flow will have X entries [virtually] pointing to it.

*/


================================================
FILE: test/memory_leak.sh
================================================
#!/bin/sh
# This script executes N times the commands CMD1 and CMD2 (load and
# unload the kernel module), saving a snapshot of the memory usage
# to a file after 10, 100 and 1000 iterations.
# The value of the Dirty memory should not increase
# between tests.
#
# Snapshots are written to /tmp/${BASE_NAME}_<iteration>.

BASE_NAME=ipfw_r5808_
N=10000
CMD1="/sbin/insmod ../dummynet2/ipfw_mod.ko"
CMD2="/sbin/rmmod ipfw_mod"

# main
# remove any previously loaded module
/sbin/rmmod ipfw_mod

# pre

for n in $(seq "$N"); do
	# CMD1/CMD2 are left unquoted on purpose: they must be split
	# into command and arguments.
	$CMD1
	$CMD2
	# Use a case statement rather than a chain of "[ ... ] && cmd"
	# lines: with the latter the last test fails on the final
	# iteration and the whole script exits with a non-zero status.
	case "$n" in
	10|100|1000)
		cat /proc/meminfo > "/tmp/${BASE_NAME}_${n}"
		;;
	esac
done

# post

================================================
FILE: test/mylist.h
================================================
/*
 * $Id: mylist.h 5626 2010-03-04 21:55:22Z luigi $
 *
 * linux-like bidirectional lists
 */

#ifndef _MYLIST_H
#define _MYLIST_H

/*
 * list_first_entry() needs offsetof(); pull it in unless the
 * includer has already provided one.
 */
#ifndef offsetof
#include <stddef.h>
#endif

/*
 * list_del() emits a trace through ND(); default to a no-op when
 * the includer has not defined it.
 */
#ifndef ND
#define ND(...) do {} while (0)
#endif

/*
 * Link embedded in every object that can be put on a list.
 * An empty list is a head whose prev and next point to itself.
 */
struct list_head {
        struct list_head *prev, *next;
};

/* make l an empty list */
#define INIT_LIST_HEAD(l) do {  (l)->prev = (l)->next = (l); } while (0)
/* true if the list headed by l has no elements */
#define list_empty(l)   ( (l)->next == (l) )

/* insert o between the two adjacent elements prev and next */
static inline void
__list_add(struct list_head *o, struct list_head *prev,
        struct list_head *next)
{
        next->prev = o;
        o->next = next;
        o->prev = prev;
        prev->next = o;
}

/* append o at the end of the list, i.e. right before head */
static inline void
list_add_tail(struct list_head *o, struct list_head *head)
{
        __list_add(o, head->prev, head);
}

/*
 * Pointer to the object of type ty that embeds, in its field member,
 * the first element of the list headed by pL.
 * The whole expansion is parenthesized so that the result can be
 * used directly, e.g. list_first_entry(h, struct foo, link)->field.
 * Not meaningful on an empty list (it would point into the head).
 */
#define list_first_entry(pL, ty, member)        \
        ((ty *)((char *)((pL)->next) - offsetof(ty, member)))

/* make prev and next adjacent, dropping whatever was between them */
static inline void
__list_del(struct list_head *prev, struct list_head *next)
{
        next->prev = prev;
        prev->next = next;
}

/* unlink entry from its list and clear its pointers */
static inline void
list_del(struct list_head *entry)
{
        ND("called on %p", entry);
        __list_del(entry->prev, entry->next);
        entry->next = entry->prev = NULL;
}

#endif /* _MYLIST_H */


================================================
FILE: test/profile_bench1
================================================
profile_no 100
delay prob
207 0.000264
255 0.034117
270 0.072280
279 0.106749
288 0.148604
298 0.184304
302 0.202194
353 0.384541
423 0.588842
510 0.782126
516 0.800970
545 0.845706
553 0.861411
573 0.889430
586 0.912117
620 0.920003
661 0.938308
695 0.944191
740 0.949112
765 0.952598
848 0.957109
1379 0.983768
1555 0.983778
1649 1


================================================
FILE: test/profile_bench2
================================================
samples 10
delay prob
0 0
250 0
250 0.5
500 0.5
500 1


================================================
FILE: test/profile_bench3
================================================
profile_no 100
delay prob
0 0
50 0.5
100 1


================================================
FILE: test/test_dn_heap.c
================================================
/*-
 * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Userland code for testing binary heaps and hash tables
 *
 * $Id: test_dn_heap.c 6131 2010-04-22 15:37:36Z svn_panicucci $
 */

#include <sys/cdefs.h>
#include <sys/param.h>

#include <stdio.h>
#include <strings.h>
#include <stdlib.h>
#include "dn_test.h"
#include  "dn_heap.h"
/* log() drops its first argument and prints the remaining ones to stderr */
#define log(x, arg...)	fprintf(stderr, ## arg)
/* panic() prints its arguments to stderr, then terminates with exit(1) */
#define panic(x...)	fprintf(stderr, ## x), exit(1)

#include <string.h>

/*
 * Object stored in the test hash tables: a link pointer followed by
 * a NUL-terminated string, which is what hf() and matchf() look at.
 * Objects are allocated with room for the string right after the
 * header (see newfn()).
 * NOTE(review): ht_link is presumably the chaining pointer used by
 * dn_ht (dn_ht_init() is called with 0 as third argument) -- confirm
 * against dn_heap.h.
 */
struct x {
	struct x *ht_link;
	char buf[];	/* C99 flexible array member, replaces GNU buf[0] */
};

/*
 * Hash function for the test tables: the hash is the first character
 * of the key string. With DNHT_KEY_IS_OBJ set, key points to a
 * struct x and the string is its buf; otherwise key points to the
 * string itself. arg is unused.
 */
uint32_t hf(uintptr_t key, int flags, void *arg)
{
	const char *name;

	if (flags & DNHT_KEY_IS_OBJ)
		name = ((struct x *)key)->buf;
	else
		name = (const char *)key;
	return name[0];
}

int matchf(void *obj, uintptr_t key, int flags, void *arg)
{
	char *s = (flags & DNHT_KEY_IS_OBJ) ?
		((struct x *)key)->buf : (char *)key;
	return (strcmp(((struct x *)obj)->buf, s) == 0);
}

/*
 * Allocator for the test tables: key points to a NUL-terminated
 * string; returns a malloc'ed struct x holding a copy of it, or NULL
 * if the allocation fails. ht_link is left uninitialized; flags and
 * arg are unused.
 */
void *newfn(uintptr_t key, int flags, void *arg)
{
	char *src = (char *)key;
	size_t room = 1 + strlen(src);	/* string plus terminating NUL */
	struct x *obj = malloc(sizeof(*obj) + room);

	if (obj == NULL)
		return obj;
	strcpy(obj->buf, src);
	return obj;
}

/*
 * NULL-terminated list of keys fed to the hash tables by test_hash().
 * "uno" .. "sei" appear twice, so the list contains duplicate keys.
 */
char *strings[] = {
	"undici", "unico", "doppio", "devoto",
	"uno", "due", "tre", "quattro", "cinque", "sei",
	"uno", "due", "tre", "quattro", "cinque", "sei",
	NULL,
};

/*
 * Scan callback passed to dn_ht_scan(): prints the string stored in
 * the element _x (a struct x) and returns arg converted to int.
 * NOTE(review): the meaning of the return value is decided by
 * dn_ht_scan() -- see dn_heap.h.
 */
int doprint(void *_x, void *arg)
{
	struct x *x = _x;
	printf("found element <%s>\n", x->buf);
	/* go through intptr_t: a direct pointer-to-int cast changes
	 * size where pointers are wider than int
	 */
	return (int)(intptr_t)arg;
}

/*
 * Exercise the hash table code in two steps:
 * 1. on a table created with an allocator (newfn), look up every
 *    string with DNHT_INSERT, then scan the table;
 * 2. on a table created without allocator, build the objects here and
 *    insert them with DNHT_INSERT | DNHT_KEY_IS_OBJ, scan the table,
 *    then issue two removals by object (x) and two removals by
 *    string key (x1), printing the result of each, and scan again.
 */
static void
test_hash(void)
{
	char **p;
	struct dn_ht *h;
	uintptr_t x = 0;	/* first object put in the second table */
	uintptr_t x1 = 0;	/* second string of strings[], used as key */

	/* first, find and allocate */
	h = dn_ht_init(NULL, 10, 0, hf, matchf, newfn);

	for (p = strings; *p; p++) {
		dn_ht_find(h, (uintptr_t)*p, DNHT_INSERT, NULL);
	}
	dn_ht_scan(h, doprint, 0);
	printf("/* second -- find without allocate */\n");
	h = dn_ht_init(NULL, 10, 0, hf, matchf, NULL);
	for (p = strings; *p; p++) {
		struct x *y = newfn((uintptr_t)*p, 0, NULL);

		/* hf() and matchf() dereference the object */
		if (y == NULL) {
			fprintf(stderr, "out of memory\n");
			exit(1);
		}
		if (x == 0)
			x = (uintptr_t)y;
		else if (x1 == 0)
			x1 = (uintptr_t)*p;
		dn_ht_find(h, (uintptr_t)y, DNHT_INSERT | DNHT_KEY_IS_OBJ, NULL);
	}
	dn_ht_scan(h, doprint, 0);
	/* each key is removed twice to show the result of removing
	 * something that is no longer in the table
	 */
	printf("remove %p gives %p\n", (void *)x,
		dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL));
	printf("remove %p gives %p\n", (void *)x,
		dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL));
	/* report the key actually being removed (x1, not x) */
	printf("remove %p gives %p\n", (void *)x1,
		dn_ht_find(h, x1, DNHT_REMOVE, NULL));
	printf("remove %p gives %p\n", (void *)x1,
		dn_ht_find(h, x1, DNHT_REMOVE, NULL));
	dn_ht_scan(h, doprint, 0);
}

/*
 * Run the hash table test, then return.
 *
 * The binary heap test that follows is currently unreachable because
 * of the early return. When enabled it takes up to three arguments:
 *   argv[1]  n,  number of elements (default 100, max 1000000)
 *   argv[2]  n2, number of insert/extract cycles (default 1000000)
 *   argv[3]  n3, if non-zero use decreasing keys instead of random()
 * and checks that keys are extracted in non-decreasing order.
 */
int
main(int argc, char *argv[])
{
	struct dn_heap h;
	int i, n, n2, n3;

	test_hash();
	/* XXX early return: the heap test below is disabled */
	return 0;

	/* n = elements, n2 = cycles */
	n = (argc > 1) ? atoi(argv[1]) : 0;
	if (n <= 0 || n > 1000000)
		n = 100;
	n2 = (argc > 2) ? atoi(argv[2]) : 0;
	if (n2 <= 0)
		n2 = 1000000;	/* default cycles (used to clobber n) */
	n3 = (argc > 3) ? atoi(argv[3]) : 0;
	bzero(&h, sizeof(h));
	heap_init(&h, n, -1);
	while (n2-- > 0) {
		uint64_t prevk = 0;
		/* the object is just a tag, built from the index */
		for (i=0; i < n; i++)
			heap_insert(&h, n3 ? n-i: random(),
				(void *)(uintptr_t)(100+i));

		for (i=0; h.elements > 0; i++) {
			uint64_t k = h.p[0].key;
			if (k < prevk)
				panic("wrong sequence\n");
			prevk = k;
			if (0)	/* debugging aid, normally off */
			printf("%d key %llu, val %p\n",
				i, (unsigned long long)h.p[0].key,
				h.p[0].object);
			heap_extract(&h, NULL);
		}
	}
	return 0;
}


================================================
FILE: test/test_dn_sched.c
================================================
/*
 * $Id: test_dn_sched.c 5626 2010-03-04 21:55:22Z luigi $
 *
 * library functions for userland testing of dummynet schedulers
 */

#include "dn_test.h"

/*
 * Userland stand-in for m_freem(): it only logs the address of the
 * mbuf on stdout, the memory is not released here.
 */
void
m_freem(struct mbuf *m)
{
	/* %p requires a pointer to void */
	printf("free %p\n", (void *)m);
}

/*
 * Stub for userland testing: ignores all its arguments and
 * always returns 0.
 */
int
dn_sched_modevent(module_t mod, int cmd, void *arg)
{
	return 0;
}

/*
 * Walk the chain of packets linked through m_nextpkt starting at m
 * and hand each of them to m_freem(). A NULL m is a no-op.
 */
void
dn_free_pkts(struct mbuf *m)
{
	struct mbuf *next;

	for (; m != NULL; m = next) {
		/* fetch the link before the packet is given away */
		next = m->m_nextpkt;
		m_freem(m);
	}
}
		
/*
 * Destroy the queue _q (a struct dn_queue): release the packets
 * still linked to it, if any, then free the queue itself.
 * do_free is not used. Always returns 0.
 */
int
dn_delete_queue(void *_q, void *do_free)
{
	struct dn_queue *queue = _q;
	struct mbuf *pending = queue->mq.head;

	if (pending != NULL)
		dn_free_pkts(pending);
	free(queue);
	return 0;
}

/*
 * Simplified enqueue for testing purposes.
 * The packet m is dropped when the caller asks for it (drop != 0)
 * or when the queue already holds 200 packets; in that case only
 * the drop counter of the queue is incremented and m is not touched.
 * Otherwise m is appended to q and the length and byte counters of
 * the queue are updated.
 * Return 0 on success, 1 on drop.
 */
int
dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
{
	if (drop || q->ni.length >= 200) {
		q->ni.drops++;
		return 1;
	}

	mq_append(&q->mq, m);
	q->ni.length++;
	q->ni.tot_bytes += m->m_pkthdr.len;
	return 0;
}

/*
 * Force *v into range: a value below lo is replaced by dflt, a value
 * above hi is replaced by hi, anything else is left alone.
 * dflt itself is stored as given, without being checked against
 * lo and hi. msg is not used in this version.
 * Returns the resulting *v.
 */
int
ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
{
	const int cur = *v;

	if (cur < lo)
		*v = dflt;
	else if (cur > hi)
		*v = hi;
	return *v;
}

#ifndef __FreeBSD__
/*
 * Find last set: returns the 1-based position of the most
 * significant bit set in mask, or 0 when mask is 0.
 * The value is examined as unsigned, so a negative mask yields the
 * position of the top bit of an int.
 */
int
fls(int mask)
{
	unsigned int rest = (unsigned int)mask;
	int pos = 0;

	while (rest != 0) {
		rest >>= 1;
		pos++;
	}
	return (pos);
}
#endif