Repository: luigirizzo/dummynet Branch: master Commit: e717cdd4bef7 Files: 120 Total size: 1.1 MB Directory structure: gitextract_r3ojrber/ ├── 020-mips-hz1000.patch ├── Makefile ├── Makefile.inc ├── Makefile.openwrt ├── NOTES ├── README ├── binary/ │ ├── README.txt │ ├── ipfw.sys │ ├── netipfw.inf │ ├── netipfw_m.inf │ └── testme.bat ├── binary64/ │ └── ipfw.sys ├── configuration/ │ ├── README │ ├── change_rules.sh │ ├── change_rules_linux.sh │ ├── ipfw.conf │ ├── ipfw.rules │ └── rc.firewall ├── glue.h ├── ipfw/ │ ├── Makefile │ ├── add_rules │ ├── dummynet.c │ ├── expand_number.c │ ├── glue.c │ ├── humanize_number.c │ ├── include/ │ │ ├── alias.h │ │ ├── net/ │ │ │ ├── if_dl.h │ │ │ └── pfvar.h │ │ └── timeconv.h │ ├── ipfw.8 │ ├── ipfw2.c │ ├── ipfw2.h │ ├── ipv6.c │ ├── main.c │ ├── qsort.c │ ├── qsort_r.c │ ├── rule_test.sh │ └── ws2_32.def ├── kipfw/ │ ├── Makefile │ ├── bsd_compat.c │ ├── debug.c │ ├── ipfw2_mod.c │ ├── md_win.c │ ├── missing.h │ ├── mysetenv.sh │ ├── netipfw.inf │ ├── netipfw_m.inf │ ├── sources │ ├── win-passthru.diff │ └── winmissing.h ├── kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk ├── planetlab/ │ ├── Makefile.planetlab │ ├── check_planetlab_sync │ ├── ipfw │ ├── ipfw.cron │ ├── ipfwroot.spec │ ├── ipfwslice.spec │ ├── netconfig │ ├── planetlab-tags.mk │ ├── planetlab.mk │ └── sample_hook ├── sys/ │ ├── net/ │ │ ├── if.h │ │ ├── pfil.h │ │ ├── radix.c │ │ └── radix.h │ ├── netgraph/ │ │ └── ng_ipfw.h │ ├── netinet/ │ │ ├── in_cksum.c │ │ ├── ip.h │ │ ├── ip6.h │ │ ├── ip_dummynet.h │ │ ├── ip_fw.h │ │ ├── ip_icmp.h │ │ ├── ipfw/ │ │ │ ├── dn_heap.c │ │ │ ├── dn_heap.h │ │ │ ├── dn_sched.h │ │ │ ├── dn_sched_fifo.c │ │ │ ├── dn_sched_prio.c │ │ │ ├── dn_sched_qfq.c │ │ │ ├── dn_sched_rr.c │ │ │ ├── dn_sched_wf2q.c │ │ │ ├── ip_dn_glue.c │ │ │ ├── ip_dn_io.c │ │ │ ├── ip_dn_private.h │ │ │ ├── ip_dummynet.c │ │ │ ├── ip_fw2.c │ │ │ ├── ip_fw_dynamic.c │ │ │ ├── ip_fw_log.c │ │ │ ├── ip_fw_lookup.c │ │ │ ├── ip_fw_nat.c │ │ │ ├── ip_fw_pfil.c │ │ │ ├── ip_fw_private.h │ │ │ ├── ip_fw_sockopt.c │ │ │ └── ip_fw_table.c │ │ ├── tcp.h │ │ ├── tcp_var.h │ │ └── udp.h │ └── sys/ │ ├── cdefs.h │ ├── kernel.h │ ├── malloc.h │ ├── mbuf.h │ ├── module.h │ ├── param.h │ ├── queue.h │ ├── syslog.h │ ├── systm.h │ └── taskqueue.h ├── tcc_glue.h └── test/ ├── Makefile ├── basic_ipfw.sh ├── dn_test.h ├── dynrules.sh ├── interpolation.c ├── main.c ├── memory_leak.sh ├── mylist.h ├── profile_bench1 ├── profile_bench2 ├── profile_bench3 ├── test_dn_heap.c └── test_dn_sched.c ================================================ FILE CONTENTS ================================================ ================================================ FILE: 020-mips-hz1000.patch ================================================ --- include/asm-mips/param_orig.h 2010-02-23 12:45:58.000000000 +0100 +++ include/asm-mips/param.h 2010-02-23 12:00:31.000000000 +0100 @@ -41,7 +41,7 @@ counter is increasing. This value is independent from the external value and can be changed in order to suit the hardware and application requirements. */ -# define HZ 100 +# define HZ 1000 # define hz_to_std(a) (a) #endif /* Not a DECstation */ ================================================ FILE: Makefile ================================================ # $Id: Makefile 11689 2012-08-12 21:07:34Z luigi $ # # Top level makefile for building ipfw/dummynet (kernel and userspace). # You can run it manually or also under the Planetlab build. # Planetlab wants also the 'install' target. # # To build on system with non standard Kernel sources or userland files, # you should run this with # # make KERNELPATH=/path/to/linux-2.x.y.z USRDIR=/path/to/usr # # We assume that $(USRDIR) contains include/ and lib/ used to build userland. # include Makefile.inc DATE ?= $(shell date +%Y%m%d) SNAPSHOT_NAME=$(DATE)-ipfw3.tgz BINDIST=$(DATE)-dummynet-linux.tgz WINDIST=$(DATE)-dummynet-windows.zip DISTFILES= Makefile Makefile.inc README binary* ipfw kipfw *.h sys .PHONY: ipfw kipfw ########################################### # windows x86 and x64 specific variables # ########################################### # DRIVE must be the hard drive letter where DDK is installed # DDKDIR must be the path to the DDK root directory, without drive letter # TARGETOS (x64 only) must be one of the following: # wnet -> windows server 2003 # wlh -> windows vista and windows server 2008 # win7 -> windows 7 # future version must be added here DRIVE ?= C: DDKDIR ?= /WinDDK/7600.16385.1 DDK = $(DRIVE)$(DDKDIR) TARGETOS=win7 export WIN64 export DDK export DRIVE export DDKDIR _all: all clean distclean: -@(cd ipfw && $(MAKE) $(@) ) -@rm -rf kipfw-mod binary64/[A-hj-z]* all: kipfw ipfw @# -- windows only ifeq ($(OSARCH),Windows) # copy files ifeq ($(WIN64),) -@ cp ipfw/ipfw.exe kipfw-mod/$(OBJDIR)/ipfw.sys binary/ -@ cp kipfw/*.inf binary/ else -@ cp binary/* kipfw/*.inf binary64/ -@ cp ipfw/ipfw.exe kipfw-mod/objchk_win7_amd64/amd64/ipfw.sys binary64/ endif # WIN64 endif # Windows win64: $(MAKE) WIN64=1 # kipfw-src prepares the sources for the kernel part. # The windows files (passthru etc.) are modified version of the # examples found in the $(DDK)/src/network/ndis/passthru/driver/ # They can be re-created using the 'ndis-glue' target # # We need a sed trick to remove newlines from the patchfile. ndis-glue: -@mkdir -p kipfw-mod cp $(DDK)/src/network/ndis/passthru/driver/*.[ch] kipfw-mod (cd kipfw-mod; for i in `find . -type f`; do sed -i.tmp "s/$$(printf '\r')//g" $$i; done ) cat kipfw/win-passthru.diff | sed "s/$$(printf '\r')//g" | (cd kipfw-mod; patch ) kipfw-src: -@rm -rf kipfw-mod -@mkdir -p kipfw-mod -@cp -Rp kipfw/* kipfw-mod -@cp `find sys -name \*.c` kipfw-mod -@(cd kipfw-mod && $(MAKE) include_e) ifeq ($(OSARCH),Windows) make ndis-glue endif snapshot: $(MAKE) distclean (tar cvzhf /tmp/$(SNAPSHOT_NAME) -s':^:ipfw3-2012/:' $(DISTFILES) ) bindist: $(MAKE) clean $(MAKE) all tar cvzf /tmp/$(BINDIST) ipfw/ipfw ipfw/ipfw.8 kipfw-mod/ipfw_mod.ko windist: $(MAKE) clean -$(MAKE) all -rm /tmp/$(WINDIST) zip -r /tmp/$(WINDIST) binary -x \*.svn\* ipfw: @(cd ipfw && $(MAKE) $(@) ) kipfw: kipfw-src ifeq ($(WIN64),) # linux or windows 32 bit @(cd kipfw-mod && $(MAKE) $(@) ) else #--- windows 64 bit, we use build.exe and nmake rm -f kipfw-mod/Makefile mkdir kipfw-mod/tmpbuild # check mysetenv.sh bash kipfw/mysetenv.sh $(DRIVE) $(DDKDIR) $(TARGETOS) endif openwrt_release: # create a temporary directory $(eval TMPDIR := $(shell mktemp -d -p /tmp/ ipfw3_openwrt_XXXXX)) # create the source destination directory $(eval IPFWDIR := ipfw3-$(DATE)) $(eval DSTDIR := $(TMPDIR)/$(IPFWDIR)) mkdir $(DSTDIR) # copy the package, clean objects and svn info cp -r ./ipfw ./kipfw-mod glue.h Makefile ./configuration README $(DSTDIR) (cd $(DSTDIR); make -s distclean; find . -name .svn | xargs rm -rf) (cd $(TMPDIR); tar czf $(IPFWDIR).tar.gz $(IPFWDIR)) # create the port files in /tmp/ipfw3-port $(eval PORTDIR := $(TMPDIR)/ipfw3) mkdir -p $(PORTDIR)/patches # generate the Makefile, PKG_VERSION and PKG_MD5SUM md5sum $(DSTDIR).tar.gz | cut -d ' ' -f 1 > $(TMPDIR)/md5sum cat ./OPENWRT/Makefile | \ sed s/PKG_VERSION:=/PKG_VERSION:=$(DATE)/ | \ sed s/PKG_MD5SUM:=/PKG_MD5SUM:=`cat $(TMPDIR)/md5sum`/ \ > $(PORTDIR)/Makefile @echo "" @echo "The openwrt port is in $(TMPDIR)/ipfw3-port" @echo "The source file should be copied to the public server:" @echo "scp $(DSTDIR).tar.gz marta@info.iet.unipi.it:~marta/public_html/dummynet" @echo "after this the temporary directory $(TMPDIR) can be removed." install: diff: -@(diff -upr $(BSD_HEAD)/sbin/ipfw ipfw) -@(diff -upr $(BSD_HEAD)/sys sys) ================================================ FILE: Makefile.inc ================================================ # $Id$ # GNU makefile header for ipfw/kipfw building BSD_HEAD ?= ~/FreeBSD/head OSARCH := $(shell uname) OSARCH := $(findstring $(OSARCH),FreeBSD Linux Darwin) ifeq ($(OSARCH),) OSARCH := Windows endif OBJDIR=mia KSRC ?= /lib/modules/$(shell uname -r)/build ifneq ($V,1) # no echo MSG=@echo HIDE=@ else MSG=@\# HIDE= endif .c.o: $(MSG) " CC $<" $(HIDE) $(CC) $(CFLAGS) -c $< -o $@ ================================================ FILE: Makefile.openwrt ================================================ # Makefile to build the package in openwrt. # goes into package/network/utils/ipfw3/Makefile # # Edit IPFW_DIR to point to the directory with the sources for ipfw IPFW_DIR := $(TOPDIR)/../qemu-misc/ipfw3 include $(TOPDIR)/rules.mk include $(INCLUDE_DIR)/kernel.mk PKG_NAME:=ipfw3 PKG_RELEASE:=1 # MV is undefined, we use it in the internal Makefiles MV ?= mv include $(INCLUDE_DIR)/package.mk #Stuff depending on kernel version $(warning --- openwrt kernel version $(KERNEL) linux dir $(LINUX_DIR) -------) ifeq ($(KERNEL),2.4) VERS:=openwrt CFLAGS_WRT:=-DSYSCTL_NODE -DEMULATE_SYSCTL IPFW_MOD:=ipfw_mod.o IPFW_SRC_DIR:=SUBDIRS else #VERS:=2.6 IPFW_MOD:=ipfw_mod.ko IPFW_SRC_DIR:=M endif define Package/ipfw3 SECTION:=utils CATEGORY:=Utilities TITLE := /sbin/ipfw DEPENDS := +libc +libgcc FILES := $(PKG_BUILD_DIR)/ipfw/ipfw $(warning --- build dir is $(PKG_BUILD_DIR) ---) endef define Package/ipfw3/description Control program for ipfw and dummynet endef # XXX not entirely clear why the install entry for userland works, # given that /sbin/ipfw is in KernelPackage/ipfw3 define Package/ipfw3/install $(INSTALL_DIR) $(1) /sbin endef # Description for the package. # The names KernelPackage/ipfw3 must match the arguments to the # call $(eval $(call KernelPackage,ipfw3)) used to build it define KernelPackage/ipfw3 SUBMENU:=Other modules TITLE:= IPFW and dummynet # FILES is what makes up the module, both kernel and userland # It must be in the KernelPackage section XXX FILES := $(PKG_BUILD_DIR)/kipfw-mod/$(IPFW_MOD) # AUTOLOAD:=$(call AutoLoad,80,ipfw_mod) endef define KernelPackage/kmod-ipfw3/description ipfw and dummynet kernel module endef # Standard entries for the openwrt builds: Build/Prepare and Build/Compile # Remember that commands must start with a tab # 'prepare' instructions for both kernel and userland # We copy the entire subtree, then build include_e/ which # contains empty headers used by the kernel sources. define Build/Prepare # $(warning --- Preparing ipfw sources ---) mkdir -p $(PKG_BUILD_DIR) $(CP) -Rp $(IPFW_DIR)/* $(PKG_BUILD_DIR)/ # The kernel sources are spread in multiple places, # so we put everything in kipfw-mod mkdir -p $(PKG_BUILD_DIR)/kipfw-mod cp -Rp $(IPFW_DIR)/kipfw/* $(PKG_BUILD_DIR)/kipfw-mod cp `find $(IPFW_DIR)/sys -name \*.c` $(PKG_BUILD_DIR)/kipfw-mod # we do not need cross parameters (cd $(PKG_BUILD_DIR)/ipfw && $(MAKE) include_e ) (cd $(PKG_BUILD_DIR)/kipfw-mod && $(MAKE) include_e ) endef define Build/Compile # XXX check whether we need all linux_dir etc. $(warning --- compile the user part for ipfw/openwrt ---) $(MAKE) -C $(PKG_BUILD_DIR)/ipfw \ LINUX_DIR=$(LINUX_DIR) \ $(TARGET_CONFIGURE_OPTS) \ CFLAGS="$(TARGET_CFLAGS) $(CFLAGS_WRT) -I./include_e -I./include -include ../glue.h -DNO_ALTQ -D__BSD_VISIBLE" \ _VER=$(VERS) all $(warning --- compile the kernel part for ipfw/openwrt ---) $(MAKE) -C "$(LINUX_DIR)" \ CROSS_COMPILE="$(TARGET_CROSS)" \ LINUX_DIR=$(LINUX_DIR) \ KERNELPATH=$(LINUX_DIR) \ ARCH="$(LINUX_KARCH)" \ $(IPFW_SRC_DIR)="$(PKG_BUILD_DIR)/kipfw-mod" \ IPFW3_ROOT="$(PKG_BUILD_DIR)" \ _VER=$(VERS) modules $(warning +++ done compile the kernel part for ipfw/openwrt ---) endef $(eval $(call BuildPackage,ipfw3)) $(eval $(call KernelPackage,ipfw3)) ================================================ FILE: NOTES ================================================ # # $Id: NOTES 6552 2010-06-15 11:24:59Z svn_panicucci $ # --------------------------------------------------------------------- --- DEVELOPER NOTES ------------------------------------------------ Both the client and the kernel code use almost unmodified sources from FreeBSD (just a very small number of sections #ifdef'ed out for features not relevant or not implemented). In both cases we provide two set of headers: - one set is made of empty files, automatically generated, to replace FreeBSD headers not available or conflicting on the ported platforms. - one set is made of custom files, sometimes copied verbatim from FreeBSD, sometimes containing only the minimal set of macros/ struct/ prototypes required by the port. Additionally, we have a small set of .c files providing functions not available in the port platforms, and hooks for the sockopt/packet data. TODO 20100205: + use an appropriate identifier instead of LINUX24 + find the discharging module hook, in order to force a queue flush + better matching on interface names (case insensitive etc ?) + match by interface address + verify path + send keepalives (20100301 marta: implemented) + pullup of data in external buffers + O_TAG + O_DIVERT + O_TEE + O_SETFIB + kmem_cache_alloc TODO (OpenWRT) 20090622 + add a module compilation for 2.6 TODO (FreeBSD, general) + New features related to the forthcoming IPv6 are missing, as the IPv6 support for lookup tables that currently support IPv4 addresses only. One of the goal of this project is to add the tables feature to the IPv6 protocol. + The current code implements rules listing requests as a single request returning both static and dynamic rules as a whole block. This operation requires a lock to be held for the time needed to get the full list of rules, regardless of the requested rules. I propose to break up the rule request in two parts, for static and dynamic rules, in order to avoid to lock the whole struct for a subset of rules required. + At last, due to improvement and contribution to the code, the tool significantly grown over the time with new functionalities and features, leaving the general view aside. An example of this will be the use of dispatching table instead some very long switch case, making the resulting code more readable and hopefully a faster execution. + XXX can't find the ipfw_* indirection... DETAILED PORTING INFO --- ipfw (userland) on linux --- The port is relatively trivial. Communication with the kernel occurs through a raw socket using [gs]etsockopt(), and all is needed is the availability of ip_fw.h and ip_dummynet.h headers to describe the relevant data structures. --- kernel ipfw on linux --- Sources are mostly unmodified, except for commenting out unsupported features (tables, in-kernel nat...). The port requires a rather large number of empty headers. Other porting issues are in ipfw2_mod.c --- build as an Openwrt package ------ WINDOWS PORT ------ We started from the wipfw port available at [WIPFW] , but most of the port is done from scratch using the most recent version of ipfw+dummynet from HEAD/RELENG_7 as of March 2009 # WIPFW: wipfw.sourceforge.net #binary: http://downloads.sourceforge.net/wipfw/wipfw-0.3.2b.zip?use_mirror=mesh http://downloads.sourceforge.net/wipfw/wipfw-0.2.8-source.zip --- DEVELOPMENT TOOLS: At least initially, to build the code you need a pc with windows installed and the [WINDDK] from the microsoft site. Other tools like the new WDK should work as well. The 'standard' way used by WDK/WINDDK is to run a 'build' script which in turn calls nmake and then the microsoft compiler [CL] and linker [LINK]. See the documentation for command line switches for these tools, they are similar but not the same as the equivalent gcc switches. In particular, a / is often used to replace - though both forms are accepted. The steps to do in order to launch the build environment follows: + download winddk from microsoft.com + install + run the Free Build Enviroment from: Start -> All Program -> WINDDK -> [NT|XP|2000] -> Free Build Environment + change dir to .src and type `build' in command line For our purposes, however, it is much more convenient to use cygwin [CYGWIN] and invoke CL and LINK using gmake A debugging tools is: http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx it simply display the kernel-mode debug output. Use the DbgPrint() function, that is something similar to printk(). Can be lauched with dbgview.exe. After a succesfully compilation and link, you can launch the program in user space simply executing the binary file, while for the kernel space you need to do the following steps: cp ipfw.sys /cygdrive/c/WINDOWS/system32/drivers/ ipfw install_drv System32\DRIVERS\ip_fw.sys net start ip_fw ======= --- ARCHITECTURE --- The main part of the userland program mostly work as the unix equivalent, the only issue is to provide empty header files to replace those not available in Windows, and include the winsock2 headers to access some network related functions and headers. Communication with the kernel module does not use a raw IP socket as in the unix version. Instead, we inherit the same method used in ipfw -- a replacement for socket() creates a handle to access the control structure, and setsockopt/getsockopt replacements are also used to communicate with the kernel side. This is implemented in win32.c In order to load the module and activate it, we also use the same technique suggested in wipfw -- the main() is extended (with a wrapper) so that it can handle additional commands to install/control/deinstall the service and call the appropriate actions. See svcmain.c for details. --- PORTING ISSUES: Most of the unix hierarchy of headers is not available so we have to replicate them. gcc attributes are also not present. C99 types are not present, remapped in Also, we don't have C99 initializers which sometimes gives trouble. --- USEFUL LINKS: [WIPFW] http://wipfw.sourceforge.net/ [WINDDK] http://www.microsoft.com/whdc/devtools/ddk/default.mspx [CL] http://msdn.microsoft.com/en-us/library/610ecb4h.aspx command line syntax [CYGWIN] http://www.cygwin.com/setup.exe Windows Driver Kit http://www.microsoft.com/whdc/DevTools/WDK/WDKpkg.mspx Debug Symbols for WinXP SP3 http://www.microsoft.com/whdc/devtools/debugging/symbolpkg.mspx#d DbgView http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx Cygwin http://www.cygwin.com/ (installazione pacchetti di default + categoria devel) Winrar (il WDK e' distribuito in un file .iso) http://www.rarlab.com/download.htm puttycyg (terminale per cygwin) http://code.google.com/p/puttycyg/ Tortoise SVN http://tortoisesvn.net/downloads EditPlus http://www.editplus.com/ --------------------------------------------------------------------- --- OPEN ISSUES/TODO ------------------------------------------------ - Fix the build on OpenWRT for linux 2.6 [Forum: https://forum.openwrt.org/viewtopic.php?id=24990] - Compilation on 2.6 OpenWRT (target is MIPS Artheros 71xx) gives compilation errors; [Send updates to: https://forum.openwrt.org/viewtopic.php?id=24990] - Windows stack corruption [a tricky bug in dummynet] - Windows ipv6 port [RE: Windows port of ipv6 in ipfw+dummynet] NOTE: - To allow compilation on OpenWRT with kernel 2.6 only the Makefile.opewrt is modified to guess the kernel version (2.4/2.6) - ipfw3 Makefile is not modified. - Also compile on bigendian, but not tested yet... - Little changes in source code. ================================================ FILE: README ================================================ # # $Id: README 11691 2012-08-12 21:32:37Z luigi $ # This directory contains a port of ipfw and dummynet to Linux and Windows. This version of ipfw and dummynet is called "ipfw3" as it is the third major rewrite of the code. The source code here comes straight from FreeBSD (roughly the version in HEAD as of February 2010), plus some glue code and headers written from scratch. Unless specified otherwise, all the code here is under a BSD license. Specific build instructions are below, and in general produce a kernel module, ipfw_mod.ko (ipfw.sys on windows) a userland program, /sbin/ipfw (ipfw.exe on windows) which you need to install on your system. CREDITS: Luigi Rizzo (main design and development) Marta Carbone (Linux and Planetlab ports) Riccardo Panicucci (modular scheduler support) Francesco Magno (Windows port) Fabio Checconi (the QFQ scheduler) Funding from Universita` di Pisa (NETOS project), European Commission (ONELAB2 project) ACM SIGCOMM (Sigcomm Community Projects Award, April 2012) ------ INSTALL/REMOVE INSTRUCTIONS ------ Linux INSTALL: # Do the following as root insmod ./dummynet2/ipfw_mod.ko cp ipfw/ipfw /usr/local/sbin REMOVE: rmmod ipfw_mod.ko OpenWRT INSTALL: # use the correct name for your system opkg install kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk #install ls -l ls -l /lib/modules/2.4.35.4/ipfw* # check insmod /lib/modules/2.4.35.4/ipfw_mod.o # load the module /lib/modules/2.4.35.4/ipfw show # launch the userspace tool REMOVE: rmmod ipfw_mod.o # remove the module Windows: A pre-built version is in binary/ and binary64/ directories. INSTALL THE NDIS DRIVER - open the configuration panel for the network card in use (right click on the icon on the SYSTRAY, or go to Control Panel -> Network and select one card) - click on Properties->Install->Service->Add - click on 'Driver Disk' and select 'netipfw.inf' in this folder - select 'ipfw+dummynet' which is the only service you should see - click accept on the warnings for the installation of an unsigned driver (roughly twice per existing network card) Now you are ready to use the emulator. To configure it, open a 'cmd' window (REMEMBER to run it as Administrator) and you can use the ipfw command from the command line. Otherwise click on the 'TESTME.bat' which is a batch program that runs various tests. REMEMBER: you need to run ipfw as administrator. REMOVE: - select a network card as above. - click on Properties - select 'ipfw+dummynet' - click on 'Remove' ------ BUILD INSTRUCTIONS ------ + Windows 32 bit and 64 bit (XP, Windows7) To build your own version of the package you need: - cygwin, http://www.cygwin.com/ with base packages, make, c compiler, possibly an editor and subversion. This is used to build the userspace control program, ipfw.exe - Microsoft Windows Driver Kit Version 7.1.0, available from http://www.microsoft.com/en-us/download/details.aspx?id=11800 (ISO image, GRMWDK_EN_7600_1.ISO) This is used to build the kernel module. - optionally, DbgView if you want to see diagnostics coming from the kernel module. You can find it at http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx Check the Makefile in the root directory to make sure that the WDK is installed in the place indicated by DRIVE and DDKDIR variables (otherwise pass the correct values to the Makefile). Open a shell from cygwin, move to this directory, and run "make" for the 32-bit version, "make win64" for the 64 bit version. This will produce in the binary/ or binary64/ directory the following files: ipfw.exe (you also need cygwin1.dll) ipfw.sys (an NDIS intermediate filter driver) netipfw.inf and netipfw_m.inf (installer files) Cross compilation of the userland side under FreeBSD is possible with gmake TCC=`pwd`/tcc-0.9.25-bsd/win32 CC=`pwd`/tcc-0.9.25-bsd/win32/bin/wintcc (wintcc is a custom version of tcc which produces Windows code) NOTE: the 64-bit version is compiled as a 32-bit executable for userspace, with appropriate changes to produce 64-bit pointers. The kernel module is built using the MSC 'build' utility instead of 'make'. THE MODULE IS NOT SIGNED. IMPORTANT: Windows 64-bit will not load unsigned kernel modules unless you boot with 'F8' and disable checks for signed modules. ***** Linux 2.6 and above ****** make [KSRC=/path/to/linux USRDIR=/path/to/usr] where the two variables are optional an point to the linux kernel sources and the /usr directory. Defaults are USRDIR=/usr and KSRC=/lib/modules/`uname -r`/build --- XXX check ? NOTE: make sure CONFIG_NETFILTER is enabled in the kernel configuration file. You need the ncurses devel library, that can be installed according your distro with: apt-get install ncurses-dev # for debian based distro yum -y install ncurses-dev # for fedora based distro You can enable CONFIG_NETFILTER by doing: "(cd ${KSRC}; make menuconfig)" and enabling the option listed below: Networking ---> Networking options ---> [*] Network packet filtering framework (Netfilter) If you have not yet compiled your kernel source, you need to prepare the build environment: (cd $(KSRC); make oldconfig; make prepare; make scripts) ***** Linux 2.4.x ***** Almost as above, with an additional VER=2.4 make VER=2.4 KSRC=... For 2.4, if KSRC is not specified then we use KSRC ?= /usr/src/`uname -r`/build You need to follow the same instruction for the 2.6 kernel, enabling netfilter in the kernel options: Networking options ---> [*] Network packet filtering (replaces ipchains) ***** Openwrt package ***** (Tested with kamikaze_8.09.1 and Linux 2.4) + Download and extract the OpenWrt package, e.g. wget http://downloads.openwrt.org/kamikaze/8.09.1/kamikaze_8.09.1_source.tar.bz2 tar xvjf kamikaze_8.09.1_source.tar.bz2 + move to the directory with the OpenWrt sources (the one that contains Config.in, rules.mk ...) cd kamikaze_8.09.1 + Optional: Add support for 1ms resolution. By default OpenWRT kernel is compiled with HZ=100; this implies that all timeouts are rounded to 10ms, too coarse for dummynet. The file 020-mips-hz1000.patch contains a kernel patch to build a kernel with HZ=1000 (i.e. 1ms resolution) as in Linux/FreeBSD. To apply this patch, go in the kernel source directory and patch the kernel cd build_dir/linux-brcm-2.4/linux-2.4.35.4 cat $IPFW3_SOURCES/020-mips-hz1000.patch | patch -p0 where IPFW3_SOURCES contains the ipfw3 source code. Now, the next kernel recompilation will use the right HZ value + Optional: to be sure that the tools are working, make a first build as follows: - run "make menuconfig" and set the correct target device, drivers, and so on; - run "make" to do the build + Add ipfw3 to the openwrt package, as follows: - copy the code from this directory to the place used for the build: cp -Rp /path_to_ipfw3 ../ipfw3; If you want, you can fetch a newer version from the web (cd ..; rm -rf ipfw3; \ wget http://info.iet.unipi.it/~luigi/dummynet/ipfw3-latest.tgz;\ tar xvzf ipfw3-latest.tgz) - run the following commands: (mkdir package/ipfw3; \ cp ../ipfw3/Makefile.openwrt package/ipfw3/Makefile) to create the package/ipfw3 directory in the OpenWrt source directory, and copy Makefile.openwrt to package/ipfw3/Makefile ; - if necessary, edit package/ipfw3/Makefile and set IPFW_DIR to point to the directory ipfw3, which contains the sources; - run "make menuconfig" and select kmod-ipfw3 as a module in Kernel Modules -> Other modules -> kmod-ipfw3 - run "make" to build the package, "make V=99" for verbose build. - to modify the code, assuming you are in directory "kamikaze_8.09.1" (cd ../ipfw3 && vi ...the files you are interested in ) rm -rf build_dir/linux-brcm-2.4/kmod-ipfw3 make package/ipfw3/compile V=99 The resulting package is located in bin/packages/mipsel/kmod-ipfw3*, upload the file and install on the target system, as follows: opkg install kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk #install ls -l ls -l /lib/modules/2.4.35.4/ipfw* # check insmod /lib/modules/2.4.35.4/ipfw_mod.o # load the module /lib/modules/2.4.35.4/ipfw show # launch the userspace tool rmmod ipfw_mod.o # remove the module ***** PLANETLAB BUILD (within a slice) ***** These instruction can be used by PlanetLab developers to compile the dummynet module on a node. To install the module on the node users need root access in root context. PlanetLab users that want to use the dummynet package should ask to PlanetLab support for nodes with dummynet emulation capabilities. Follow the instructions below. You can just cut&paste # install the various tools if not available sudo yum -y install subversion rpm-build rpm-devel m4 redhat-rpm-config make gcc # new build installation requires the gnupg package sudo yum -y install gnupg # the linux kernel and the ipfw source can be fetched by git sudo yum -y install git # create and move to a work directory mkdir -p test # extract a planetlab distribution to directory XYZ (cd test; git clone git://git.onelab.eu/build ./XYZ) # download the specfiles and do some patching. # Results are into SPEC/ (takes 5 minutes) (cd test/XYZ; make stage1=true PLDISTRO=onelab) # Building the slice code is fast, the root code takes longer # as it needs to rebuild the whole kernel (cd test/XYZ; sudo make ipfwslice PLDISTRO=onelab) (cd test/XYZ; sudo make ipfwroot PLDISTRO=onelab) The kernel dependency phase is a bit time consuming, but does not need to be redone if we are changing the ipfw sources only. To clean up the code do (cd test/XYZ; sudo make ipfwroot-clean ipfwslice-clean) then after you have updated the repository again (cd test/XYZ; sudo make ipfwslice ipfwroot) --- References [1] https://svn.planet-lab.org/wiki/VserverCentos [2] http://wiki.linux-vserver.org/Installation_on_CentOS [3] http://mirror.centos.org/centos/5/isos/ [4] More information are in /build/README* files ================================================ FILE: binary/README.txt ================================================ This directory contains the binaries to install and use IPFW and DUMMYNET on a Windows Machine. The kernel part is an NDIS module, whereas the user interface is a command line program. 1. INSTALL THE NDIS DRIVER - open the configuration panel for the network card in use (either right click on the icon on the SYSTRAY, or go to Control Panel -> Network and select one card) - click on Properties->Install->Service->Add - click on 'Driver Disk' and select 'netipfw.inf' in this folder - select 'ipfw+dummynet' which is the only service you should see - click accept on the warnings for the installation of an unknown driver (roughly twice per existing network card) Now you are ready to use the emulator. To configure it, open a 'cmd' window and you can use the ipfw command from the command line. Otherwise click on the 'TESTME.bat' which is a batch program that runs various tests. 2. UNINSTALL THE DRIVER - select a network card as above. - click on Properties - select 'ipfw+dummynet' - click on 'Remove' ================================================ FILE: binary/netipfw.inf ================================================ ; version section [Version] Signature = "$Windows NT$" Class = NetService ClassGUID = {4D36E974-E325-11CE-BFC1-08002BE10318} Provider = %Unipi% DriverVer = 26/02/2010,3.0.0.1 ; manufacturer section [Manufacturer] %Unipi% = UNIPI,NTx86,NTamd64 ; control flags section ; optional, unused in netipfw.inf inf, used in netipfw_m.inf [ControlFlags] ; models section [UNIPI] ; Win2k %Desc% = Ipfw.ndi, unipi_ipfw [UNIPI.NTx86] ;For WinXP and later %Desc% = Ipfw.ndi, unipi_ipfw [UNIPI.NTamd64] ;For x64 %Desc% = Ipfw.ndi, unipi_ipfw ; ddinstall section [Ipfw.ndi] AddReg = Ipfw.ndi.AddReg, Ipfw.AddReg Characteristics = 0x4410 ; NCF_FILTER | NCF_NDIS_PROTOCOL !--Filter Specific--!! CopyFiles = Ipfw.Files.Sys CopyInf = netipfw_m.inf ; remove section [Ipfw.ndi.Remove] DelFiles = Ipfw.Files.Sys ;ddinstall.services section [Ipfw.ndi.Services] AddService = Ipfw,,Ipfw.AddService [Ipfw.AddService] DisplayName = %ServiceDesc% ServiceType = 1 ;SERVICE_KERNEL_DRIVER StartType = 3 ;SERVICE_DEMAND_START ErrorControl = 1 ;SERVICE_ERROR_NORMAL ServiceBinary = %12%\ipfw.sys AddReg = Ipfw.AddService.AddReg [Ipfw.AddService.AddReg] ;file copy related sections [SourceDisksNames] 1=%DiskDescription%,"",, [SourceDisksFiles] ipfw.sys=1 [DestinationDirs] DefaultDestDir = 12 Ipfw.Files.Sys = 12 ; %windir%\System32\drivers ; ddinstall->copyfiles points here [Ipfw.Files.Sys] ipfw.sys,,,2 ; ddinstall->addreg points here [Ipfw.ndi.AddReg] HKR, Ndi, HelpText, , %HELP% ; this is displayed at the bottom of the General page of the Connection Properties dialog box HKR, Ndi, FilterClass, , failover HKR, Ndi, FilterDeviceInfId, , unipi_ipfwmp HKR, Ndi, Service, , Ipfw HKR, Ndi\Interfaces, UpperRange, , noupper HKR, Ndi\Interfaces, LowerRange, , nolower HKR, Ndi\Interfaces, FilterMediaTypes, , "ethernet, tokenring, fddi, wan" ;strings section [Strings] Unipi = "Unipi" DiskDescription = "Ipfw Driver Disk" Desc = "ipfw+dummynet" HELP = "This is ipfw and dummynet network emulator, developed by unipi.it" ServiceDesc = "ipfw service" ================================================ FILE: binary/netipfw_m.inf ================================================ ; version section [Version] Signature = "$Windows NT$" Class = Net ClassGUID = {4D36E972-E325-11CE-BFC1-08002BE10318} Provider = %Unipi% DriverVer = 26/02/2010,3.0.0.1 ; control flags section ; optional, unused in netipfw.inf inf, used in netipfw_m.inf [ControlFlags] ExcludeFromSelect = unipi_ipfwmp ; destinationdirs section, optional [DestinationDirs] DefaultDestDir=12 ; No files to copy ; manufacturer section [Manufacturer] %Unipi% = UNIPI,NTx86,NTamd64 ; models section [UNIPI] ; Win2k %Desc% = IpfwMP.ndi, unipi_ipfwmp [UNIPI.NTx86] ;For WinXP and later %Desc% = IpfwMP.ndi, unipi_ipfwmp [UNIPI.NTamd64] ;For x64 %Desc% = IpfwMP.ndi, unipi_ipfwmp ; ddinstall section [IpfwMP.ndi] AddReg = IpfwMP.ndi.AddReg Characteristics = 0x29 ;NCF_NOT_USER_REMOVABLE | NCF_VIRTUAL | NCF_HIDDEN ; ddinstall->addreg points here [IpfwMP.ndi.AddReg] HKR, Ndi, Service, 0, IpfwMP ;ddinstall.services section [IpfwMP.ndi.Services] AddService = IpfwMP,0x2, IpfwMP.AddService [IpfwMP.AddService] ServiceType = 1 ;SERVICE_KERNEL_DRIVER StartType = 3 ;SERVICE_DEMAND_START ErrorControl = 1 ;SERVICE_ERROR_NORMAL ServiceBinary = %12%\ipfw.sys AddReg = IpfwMP.AddService.AddReg [IpfwMP.AddService.AddReg] ; None [Strings] Unipi = "Unipi" Desc = "Ipfw Miniport" ================================================ FILE: binary/testme.bat ================================================ @echo on @set CYGWIN=nodosfilewarning @ipfw -q flush @ipfw -q pipe flush @echo ###################################################################### @echo ## Setting delay to 100ms for both incoming and outgoing ip packets ## @echo ## and sending 4 echo request to Google ## @echo ###################################################################### ipfw pipe 3 config delay 100ms ipfw add pipe 3 ip from any to any ipfw pipe show ping -n 4 www.google.it @echo ############################################## @echo ## Raising delay to 300ms and pinging again ## @echo ############################################## ipfw pipe 3 config delay 300ms ipfw pipe show ping -n 4 www.google.com @echo ################################## @echo ## Shaping bandwidth to 500kbps ## @echo ################################## ipfw pipe 3 config bw 500Kbit/s ipfw pipe show wget http://info.iet.unipi.it/~luigi/1m @del 1m @echo ################################### @echo ## Lowering bandwidth to 250kbps ## @echo ################################### ipfw pipe 3 config bw 250Kbit/s ipfw pipe show wget http://info.iet.unipi.it/~luigi/1m @del 1m @echo ################################################################### @echo ## Simulating 50 percent packet loss and sending 15 echo request ## @echo ################################################################### @ipfw -q flush @ipfw -q pipe flush ipfw add prob 0.5 deny proto icmp in ping -n 15 -w 300 www.google.it @ipfw -q flush @echo ############################## @echo ## Showing SYSCTL variables ## @echo ############################## ipfw sysctl -a @echo ############################################# @echo ## Inserting rules to test command parsing ## @echo ############################################# @echo -- dropping all packets of a specific protocol -- ipfw add deny proto icmp @echo -- dropping packets of all protocols except a specific one -- ipfw add deny not proto tcp @echo -- dropping all packets from IP x to IP y -- ipfw add deny src-ip 1.2.3.4 dst-ip 5.6.7.8 @echo -- dropping all ssh outgoing connections -- ipfw add deny out dst-port 22 @echo -- allowing already opened browser connections -- @echo -- but preventing new ones from being opened -- ipfw add deny out proto tcp dst-port 80 tcpflags syn @echo -- another way to do the same thing -- ipfw add allow out proto tcp dst-port 80 established ipfw add deny out proto tcp dst-port 80 setup @echo -- checking what rules have been inserted -- ipfw -c show @ipfw -q flush @echo ################# @echo ## Cleaning up ## @echo ################# ipfw -q flush ipfw -q pipe flush pause ================================================ FILE: configuration/README ================================================ This directorty contains some ipfw configurations and a scripts to safely change the firewall rules. The firewall configuration comes from the FreeBSD initial script. The change_rules_linux.sh allows to change the ipfw rules and in case os a misconfiguration which prevents to reach the remote host, to restore the old ruleset. To configure the firewall behavior, edit the ipfw.conf file and execute the ./change_rules_linux.sh script. The ipfw program executable should be located in /sbin (XXX) XXX seems we use something which is not compatible with dash ================================================ FILE: configuration/change_rules.sh ================================================ #!/bin/sh # # Copyright (c) 2000 Alexandre Peixoto # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # $FreeBSD: src/share/examples/ipfw/change_rules.sh,v 1.6 2003/09/07 07:52:56 jmg Exp $ # Change ipfw(8) rules with safety guarantees for remote operation # # Invoke this script to edit ${firewall_script}. It will call ${EDITOR}, # or vi(1) if the environment variable is not set, for you to edit # ${firewall_script}, ask for confirmation, and then run # ${firewall_script}. You can then examine the output of ipfw list and # confirm whether you want the new version or not. # # If no answer is received in 30 seconds, the previous # ${firewall_script} is run, restoring the old rules (this assumes ipfw # flush is present in it). # # If the new rules are confirmed, they'll replace ${firewall_script} and # the previous ones will be copied to ${firewall_script}.{date}. Mail # will also be sent to root with a unified diff of the rule change. # # Unapproved rules are kept in ${firewall_script}.new, and you are # offered the option of changing them instead of the present rules when # you call this script. # # This script could be improved by using version control # software. # XXX on linux /etc/rc.conf defines: # firewall_type and firewall_script if [ -r /etc/defaults/rc.conf ]; then . /etc/defaults/rc.conf source_rc_confs elif [ -r /etc/rc.conf ]; then . /etc/rc.conf fi EDITOR=${EDITOR:-/usr/bin/vi} PAGER=${PAGER:-/usr/bin/more} # on linux the default mktemp invocation behavior # is different, we should change the temporary file creation tempfoo=`basename $0` #TMPFILE=`mktemp -t ${tempfoo}` || exit 1 TMPFILE=`mktemp -t ${tempfoo}.XXXXX` || exit 1 get_yes_no() { while true do echo -n "$1 (Y/N) ? " read -t 30 a if [ $? != 0 ]; then a="No"; return; fi case $a in [Yy]) a="Yes"; return;; [Nn]) a="No"; return;; *);; esac done } restore_rules() { nohup sh ${firewall_script} /dev/null 2>&1 rm ${TMPFILE} exit 1 } case "${firewall_type}" in [Cc][Ll][Ii][Ee][Nn][Tt]|\ [Cc][Ll][Oo][Ss][Ee][Dd]|\ [Oo][Pp][Ee][Nn]|\ [Ss][Ii][Mm][Pp][Ll][Ee]|\ [Uu][Nn][Kk][Nn][Oo][Ww][Nn]) edit_file="${firewall_script}" rules_edit=no ;; *) if [ -r "${firewall_type}" ]; then edit_file="${firewall_type}" rules_edit=yes fi ;; esac if [ -f ${edit_file}.new ]; then get_yes_no "A new rules file already exists, do you want to use it" [ $a = 'No' ] && cp ${edit_file} ${edit_file}.new else cp ${edit_file} ${edit_file}.new fi trap restore_rules SIGHUP ${EDITOR} ${edit_file}.new get_yes_no "Do you want to install the new rules" [ $a = 'No' ] && exit 1 cat < ${TMPFILE} 2>&1 else nohup sh ${firewall_script}.new \ < /dev/null > ${TMPFILE} 2>&1 fi sleep 2; get_yes_no "Would you like to see the resulting new rules" [ $a = 'Yes' ] && ${PAGER} ${TMPFILE} get_yes_no "Type y to keep the new rules" [ $a != 'Yes' ] && restore_rules DATE=`date "+%Y%m%d%H%M"` cp ${edit_file} ${edit_file}.$DATE mv ${edit_file}.new ${edit_file} cat </dev/null fi ${fwcmd} add deny $log ip from any to any ;; [Cc][Ll][Oo][Ss][Ee][Dd]) ${fwcmd} add 65000 deny ip from any to any ;; [Uu][Nn][Kk][Nn][Oo][Ww][Nn]) ;; *) if [ -r "${firewall_type}" ]; then ${fwcmd} ${firewall_flags} ${firewall_type} fi ;; esac ================================================ FILE: glue.h ================================================ /* * Copyright (c) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id: glue.h 12501 2014-01-10 01:09:14Z luigi $ * * glue code to adapt the FreeBSD version to linux and windows, * userland and kernel. * This is included before any other headers, so we do not have * a chance to override any #define that should appear in other * headers. * First handle headers for userland and kernel. Then common code * (including headers that require a specific order of inclusion), * then the user- and kernel- specific parts. */ #if defined __FreeBSD__ #define _GLUE_H #endif /* __FreeBSD__ */ #ifndef _GLUE_H #define _GLUE_H /* * common definitions to allow portability */ #ifndef __FBSDID #define __FBSDID(x) #endif /* FBSDID */ #ifndef KERNEL_MODULE /* Userland headers */ #if defined(__CYGWIN32__) || defined(__CYGWIN__) #if !defined(_WIN32) #define _WIN32 #endif #endif #if defined(TCC) && defined(_WIN32) #include #endif /* TCC */ #include /* linux needs it in addition to sys/types.h */ #include /* for size_t */ #include #include #include #ifdef __linux__ #include /* linux only 20111031 */ #endif #else /* KERNEL_MODULE, kernel headers */ #define INET # want inet support #ifdef __linux__ #include #define ifnet net_device /* remap */ #define _KERNEL # make kernel structure visible #define KLD_MODULE # add the module glue #include /* linux kernel */ #include /* linux kernel */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) // or 2.4.x #include /* linux/msg.h require this */ #include /* just MAX_ADDR_LEN 8 on 2.4 32 on 2.6, also brings in byteorder */ #endif /* on 2.6.22, msg.h requires spinlock_types.h */ /* XXX spinlock_type.h was introduced in 2.6.14 */ #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,13) && \ LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) #include #endif /* XXX m_type define conflict with include/sys/mbuf.h, * so early include msg.h (to be solved) */ #include #include #include /* struct in_addr */ #include /* struct in6_addr */ #include /* * LIST_HEAD in queue.h conflict with linux/list.h * some previous linux include need list.h definition */ #undef LIST_HEAD #define IF_NAMESIZE (16) typedef uint32_t in_addr_t; #define printf(fmt, arg...) printk(KERN_ERR fmt, ##arg) #endif /* __linux__ */ #endif /* KERNEL_MODULE end of kernel headers */ /* * Part 2: common userland and kernel definitions */ #ifndef ETHER_ADDR_LEN #define ETHER_ADDR_LEN (6+0) /* length of an Ethernet address */ #endif #define ICMP6_DST_UNREACH_NOROUTE 0 /* no route to destination */ #define ICMP6_DST_UNREACH_ADMIN 1 /* administratively prohibited */ #define ICMP6_DST_UNREACH_ADDR 3 /* address unreachable */ #define ICMP6_DST_UNREACH_NOPORT 4 /* port unreachable */ /* * linux: sysctl are mapped into /sys/module/ipfw_mod parameters * windows: they are emulated via get/setsockopt */ #define CTLFLAG_RD 1 #define CTLFLAG_RDTUN 1 #define CTLFLAG_RW 2 #define CTLFLAG_SECURE3 0 // unsupported #define CTLFLAG_VNET 0 /* unsupported */ /* if needed, queue.h must be included here after list.h */ /* * struct thread is used in linux and windows kernel. * In windows, we need to emulate the sockopt interface * so also the userland needs to have the struct sockopt defined. * In order to achieve 64 bit compatibility, padding has been inserted. */ struct thread { void *sopt_td; void *td_ucred; }; enum sopt_dir { SOPT_GET, SOPT_SET }; struct sockopt { enum sopt_dir sopt_dir; /* is this a get or a set? */ int sopt_level; /* second arg of [gs]etsockopt */ int sopt_name; /* third arg of [gs]etsockopt */ #ifdef _X64EMU void* pad1; void* pad2; #endif void *sopt_val; /* fourth arg of [gs]etsockopt */ size_t sopt_valsize; /* (almost) fifth arg of [gs]etsockopt */ #ifdef _X64EMU void* pad3; void* pad4; #endif struct thread *sopt_td; /* calling thread or null if kernel */ }; #define INET_ADDRSTRLEN (16) /* missing in netinet/in.h */ /* * List of values used for set/getsockopt options. * The base value on FreeBSD is defined as a macro, * if not available we will use our own enum. * The TABLE_BASE value is used in the kernel. */ #ifndef IP_FW_TABLE_ADD #define _IPFW_SOCKOPT_BASE 100 /* 40 on freebsd */ enum ipfw_msg_type { IP_FW_TABLE_ADD = _IPFW_SOCKOPT_BASE, IP_FW_TABLE_DEL, IP_FW_TABLE_FLUSH, IP_FW_TABLE_GETSIZE, IP_FW_TABLE_LIST, IP_FW_DYN_GET, /* new addition */ /* IP_FW3 and IP_DUMMYNET3 are the new API */ IP_FW3 = _IPFW_SOCKOPT_BASE + 8, IP_DUMMYNET3, IP_FW_ADD = _IPFW_SOCKOPT_BASE + 10, IP_FW_DEL, IP_FW_FLUSH, IP_FW_ZERO, IP_FW_GET, IP_FW_RESETLOG, IP_FW_NAT_CFG, IP_FW_NAT_DEL, IP_FW_NAT_GET_CONFIG, IP_FW_NAT_GET_LOG, IP_DUMMYNET_CONFIGURE, IP_DUMMYNET_DEL , IP_DUMMYNET_FLUSH, /* 63 is missing */ IP_DUMMYNET_GET = _IPFW_SOCKOPT_BASE + 24, _IPFW_SOCKOPT_END }; #endif /* IP_FW_TABLE_ADD */ /* * Part 3: userland stuff */ #ifndef KERNEL_MODULE /* * internal names in struct in6_addr (netinet/in6.h) differ, * so we remap the FreeBSD names to the platform-specific ones. */ #ifndef _WIN32 #define __u6_addr in6_u #define __u6_addr32 u6_addr32 #define in6_u __in6_u /* missing type for ipv6 (linux 2.6.28) */ #else /* _WIN32 uses different naming */ #define __u6_addr __u6 #define __u6_addr32 __s6_addr32 #endif /* _WIN32 */ /* missing in linux netinet/ip.h */ #define IPTOS_ECN_ECT0 0x02 /* ECN-capable transport (0) */ #define IPTOS_ECN_CE 0x03 /* congestion experienced */ /* defined in freebsd netinet/icmp6.h */ #define ICMP6_MAXTYPE 201 /* on freebsd sys/socket.h pf specific */ #define NET_RT_IFLIST 3 /* survey interface list */ #if defined(__linux__) || defined(__CYGWIN32__) || defined(__CYGWIN__) /* on freebsd net/if.h XXX used */ struct if_data { /* ... */ u_long ifi_mtu; /* maximum transmission unit */ }; /* * Message format for use in obtaining information about interfaces * from getkerninfo and the routing socket. * This is used in nat.c */ struct if_msghdr { u_short ifm_msglen; /* to skip over unknown messages */ u_char ifm_version; /* future binary compatibility */ u_char ifm_type; /* message type */ int ifm_addrs; /* like rtm_addrs */ int ifm_flags; /* value of if_flags */ u_short ifm_index; /* index for associated ifp */ struct if_data ifm_data;/* stats and other ifdata */ }; /* * Message format for use in obtaining information about interface * addresses from getkerninfo and the routing socket */ struct ifa_msghdr { u_short ifam_msglen; /* to skip over unknown messages */ u_char ifam_version; /* future binary compatibility */ u_char ifam_type; /* message type */ int ifam_addrs; /* like rtm_addrs */ int ifam_flags; /* value of ifa_flags */ u_short ifam_index; /* index for associated ifp */ int ifam_metric; /* value of ifa_metric */ }; #ifndef NO_RTM /* conflicting with netlink */ /* missing in net/route.h */ #define RTM_VERSION 5 /* Up the ante and ignore older versions */ #define RTM_IFINFO 0xe /* iface going up/down etc. */ #define RTM_NEWADDR 0xc /* address being added to iface */ #define RTA_IFA 0x20 /* interface addr sockaddr present */ #endif /* NO_RTM */ /* SA_SIZE is used in the userland nat.c modified */ #define SA_SIZE(sa) \ ( (!(sa) ) ? \ sizeof(long) : \ 1 + ( (sizeof(struct sockaddr) - 1) | (sizeof(long) - 1) ) ) /* sys/time.h */ /* * Getkerninfo clock information structure */ struct clockinfo { int hz; /* clock frequency */ int tick; /* micro-seconds per hz tick */ int spare; int stathz; /* statistics clock frequency */ int profhz; /* profiling clock frequency */ }; /* no sin_len in sockaddr, we only remap in userland */ #define sin_len sin_zero[0] #endif /* Linux/Win */ /* * linux does not have a reentrant version of qsort, * so we the FreeBSD stdlib version. */ void qsort_r(void *a, size_t n, size_t es, void *thunk, int cmp_t(void *, const void *, const void *)); /* prototypes from libutil */ /* humanize_number(3) */ #define HN_DECIMAL 0x01 #define HN_NOSPACE 0x02 #define HN_B 0x04 #define HN_DIVISOR_1000 0x08 #define HN_GETSCALE 0x10 #define HN_AUTOSCALE 0x20 int humanize_number(char *_buf, size_t _len, int64_t _number, const char *_suffix, int _scale, int _flags); int expand_number(const char *_buf, int64_t *_num); #define setprogname(x) /* not present in linux */ extern int optreset; /* not present in linux */ size_t strlcpy(char * dst, const char * src, size_t siz); long long int strtonum(const char *nptr, long long minval, long long maxval, const char **errstr); int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp, size_t newlen); #else /* KERNEL_MODULE */ /* * Part 4: kernel stuff */ /* linux and windows kernel do not have bcopy ? */ #define bcopy(_s, _d, _l) memcpy(_d, _s, _l) /* definitions useful for the kernel side */ struct route_in6 { int dummy; }; #ifdef __linux__ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) // or 2.4.x #include #endif /* skb_dst() and skb_dst_set() was introduced from linux 2.6.31 */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31) void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst); struct dst_entry *skb_dst(const struct sk_buff *skb); #endif /* The struct flowi changed */ #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,38) // check boundaries #define flow_daddr fl.u.ip4 #else #define flow_daddr fl.nl_u.ip4_u #endif #endif /* __linux__ */ /* * Do not load prio_heap.h header because of conflicting names * with our heap functions defined in include/netinet/ipfw/dn_heap.h * However do define struct ptr_heap used in linux 3.12.7 etc. */ #define _LINUX_PRIO_HEAP_H struct ptr_heap; /* * The following define prevent the ipv6.h header to be loaded. * Starting from the 2.6.38 kernel the ipv6.h file, which is included * by include/net/inetpeer.h in turn included by net/route.h * include the system tcp.h file while we want to include * our include/net/tcp.h instead. */ #ifndef _NET_IPV6_H #define _NET_IPV6_H static inline void ipv6_addr_copy(struct in6_addr *a1, const struct in6_addr *a2) { memcpy(a1, a2, sizeof(struct in6_addr)); } #endif /* _NET_IPV6_H */ #endif /* KERNEL_MODULE */ /* * Part 5: windows specific stuff */ #ifdef _WIN32 #ifndef KERNEL_MODULE #define CTL_CODE( DeviceType, Function, Method, Access ) ( \ ((DeviceType) << 16) | ((Access) << 14) | ((Function) << 2) | (Method) \ ) #define METHOD_BUFFERED 0 #define METHOD_IN_DIRECT 1 #define METHOD_OUT_DIRECT 2 #define METHOD_NEITHER 3 #define FILE_ANY_ACCESS 0 #define FILE_READ_DATA ( 0x0001 ) // file & pipe #define FILE_WRITE_DATA ( 0x0002 ) // file & pipe #endif /* !KERNEL_MODULE */ #define FILE_DEVICE_IPFW 0x00654324 #define IP_FW_BASE_CTL 0x840 #define IP_FW_SETSOCKOPT \ CTL_CODE(FILE_DEVICE_IPFW, IP_FW_BASE_CTL + 1, METHOD_BUFFERED, FILE_WRITE_DATA) #define IP_FW_GETSOCKOPT \ CTL_CODE(FILE_DEVICE_IPFW, IP_FW_BASE_CTL + 2, METHOD_BUFFERED, FILE_ANY_ACCESS) /********************************* * missing declarations in altq.c * **********************************/ #define _IOWR(x,y,t) _IOW(x,y,t) /********************************** * missing declarations in ipfw2.c * ***********************************/ #define ICMP_UNREACH_NET 0 /* bad net */ #define ICMP_UNREACH_HOST 1 /* bad host */ #define ICMP_UNREACH_PROTOCOL 2 /* bad protocol */ #define ICMP_UNREACH_PORT 3 /* bad port */ #define ICMP_UNREACH_NEEDFRAG 4 /* IP_DF caused drop */ #define ICMP_UNREACH_SRCFAIL 5 /* src route failed */ #define ICMP_UNREACH_NET_UNKNOWN 6 /* unknown net */ #define ICMP_UNREACH_HOST_UNKNOWN 7 /* unknown host */ #define ICMP_UNREACH_ISOLATED 8 /* src host isolated */ #define ICMP_UNREACH_NET_PROHIB 9 /* prohibited access */ #define ICMP_UNREACH_HOST_PROHIB 10 /* ditto */ #define ICMP_UNREACH_TOSNET 11 /* bad tos for net */ #define ICMP_UNREACH_TOSHOST 12 /* bad tos for host */ #define ICMP_UNREACH_FILTER_PROHIB 13 /* admin prohib */ #define ICMP_UNREACH_HOST_PRECEDENCE 14 /* host prec vio. */ #define ICMP_UNREACH_PRECEDENCE_CUTOFF 15 /* prec cutoff */ struct ether_addr; struct ether_addr * ether_aton(const char *a); /********************************* * missing declarations in ipv6.c * **********************************/ struct hostent* gethostbyname2(const char *name, int af); #define strcasecmp strcmp // windows XXX ip_dummynet.c /******************** * windows wrappings * *********************/ int my_socket(int domain, int ty, int proto); #define socket(_a, _b, _c) my_socket(_a, _b, _c) #endif /* _WIN32 */ /******************* * SYSCTL emulation * ********************/ #if defined (_WIN32) || defined (EMULATE_SYSCTL) #define STRINGIFY(x) #x /* flag is set with the last 2 bits for access, as defined in glue.h * and the rest for type */ enum { SYSCTLTYPE_INT = 0, SYSCTLTYPE_UINT, SYSCTLTYPE_SHORT, SYSCTLTYPE_USHORT, SYSCTLTYPE_LONG, SYSCTLTYPE_ULONG, SYSCTLTYPE_STRING, }; struct sysctlhead { uint32_t blocklen; //total size of the entry uint32_t namelen; //strlen(name) + '\0' uint32_t flags; //type and access uint32_t datalen; }; #ifdef _KERNEL #ifdef SYSCTL_NODE #undef SYSCTL_NODE #endif #define SYSCTL_NODE(a,b,c,d,e,f) #define SYSCTL_DECL(a) #define SYSCTL_VNET_PROC(a,b,c,d,e,f,g,h,i) #define GST_HARD_LIMIT 100 /* In the module, GST is implemented as an array of * sysctlentry, but while passing data to the userland * pointers are useless, the buffer is actually made of: * - sysctlhead (fixed size, containing lengths) * - data (typically 32 bit) * - name (zero-terminated and padded to mod4) */ struct sysctlentry { struct sysctlhead head; char* name; void* data; }; struct sysctltable { int count; //number of valid tables int totalsize; //total size of valid entries of al the valid tables void* namebuffer; //a buffer for all chained names struct sysctlentry entry[GST_HARD_LIMIT]; }; #ifdef SYSBEGIN #undef SYSBEGIN #endif #define SYSBEGIN(x) void sysctl_addgroup_##x() { #ifdef SYSEND #undef SYSEND #endif #define SYSEND } /* XXX remove duplication */ #define SYSCTL_INT(a,b,c,d,e,f,g) \ sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ (d) | (SYSCTLTYPE_INT << 2), sizeof(*e), e) #define SYSCTL_VNET_INT(a,b,c,d,e,f,g) \ sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ (d) | (SYSCTLTYPE_INT << 2), sizeof(*e), e) #define SYSCTL_UINT(a,b,c,d,e,f,g) \ sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ (d) | (SYSCTLTYPE_UINT << 2), sizeof(*e), e) #define SYSCTL_VNET_UINT(a,b,c,d,e,f,g) \ sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ (d) | (SYSCTLTYPE_UINT << 2), sizeof(*e), e) #define SYSCTL_LONG(a,b,c,d,e,f,g) \ sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ (d) | (SYSCTLTYPE_LONG << 2), sizeof(*e), e) #define SYSCTL_ULONG(a,b,c,d,e,f,g) \ sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ (d) | (SYSCTLTYPE_ULONG << 2), sizeof(*e), e) #define TUNABLE_INT(a,b) void keinit_GST(void); void keexit_GST(void); int kesysctl_emu_set(void* p, int l); int kesysctl_emu_get(struct sockopt* sopt); void sysctl_pushback(char* name, int flags, int datalen, void* data); #endif /* _KERNEL */ int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp, size_t newlen); #endif /* _WIN32" || EMULATE_SYSCTL */ #ifdef _WIN32 int do_cmd(int optname, void *optval, uintptr_t optlen); #endif /* _WIN32 */ #define __PAST_END(v, idx) v[idx] #endif /* !_GLUE_H */ ================================================ FILE: ipfw/Makefile ================================================ # # $Id: Makefile 11688 2012-08-12 20:58:26Z luigi $ # # GNUMakefile to build the userland part of ipfw on Linux and Windows # # Do not set with = or := so we can inherit from the caller include ../Makefile.inc TARGET := ipfw all: $(TARGET) #TCC=c:/path/to/tcc # common flags EXTRA_CFLAGS += -O1 EXTRA_CFLAGS += -Wall EXTRA_CFLAGS += -include ../glue.h EXTRA_CFLAGS += -I ./include_e -I ./include ifneq ($(VER),openwrt) ifeq ($(OSARCH),Linux) EXTRA_CFLAGS += -D__BSD_VISIBLE EXTRA_CFLAGS += -Werror # Required by GCC 4.6 EXTRA_CFLAGS += -Wno-unused-but-set-variable endif ifeq ($(OSARCH),FreeBSD) EXTRA_CFLAGS += -D__BSD_VISIBLE EXTRA_CFLAGS += -Werror endif ifeq ($(OSARCH),Darwin) EXTRA_CFLAGS += -D__BSD_VISIBLE EXTRA_CFLAGS += -Werror endif ifeq ($(OSARCH),Windows) # we only support Cygwin and tcc as compilers. ifeq ($(WIN64),1) EXTRA_CFLAGS += -D_X64EMU endif ifeq ($(TCC),) # cygwin EXTRA_CFLAGS += -I/cygdrive/c/$(DDKDIR)/inc/ddk EXTRA_CFLAGS += -I . EXTRA_CFLAGS += -pipe -Wall else #-- build with tcc # TCC points to the root of tcc tree CC=$(TCC)/tcc.exe EXTRA_CFLAGS += -DTCC -I.. EXTRA_CFLAGS += -I$(TCC)/include/winapi -I$(TCC)/include EXTRA_CFLAGS += -nostdinc EFILES_. += err.h grp.h netdb.h pwd.h sysexits.h EFILES_arpa += inet.h EFILES_net += if.h EFILES_netinet += in.h in_systm.h ip.h ip_icmp.h EFILES_sys += cdefs.h wait.h ioctl.h socket.h endif # EXTRA_CFLAGS += -D_WIN32 # see who defines it EXTRA_CFLAGS += -Dsetsockopt=wnd_setsockopt EXTRA_CFLAGS += -Dgetsockopt=wnd_getsockopt EXTRA_CFLAGS += -DEMULATE_SYSCTL EFILES_net += ethernet.h route.h EFILES_netinet += ether.h icmp6.h EFILES_sys += sysctl.h TARGET = ipfw.exe ipfw: $(TARGET) endif # windows endif # !openwrt CFLAGS += $(EXTRA_CFLAGS) # Location of OS headers and libraries. After our stuff. USRDIR?= /usr ifeq ($(TCC),) CFLAGS += -I$(USRDIR)/include LDFLAGS += -L$(USRDIR)/lib else LDFLAGS += -L. -L$(TCC)/lib -lws2_32 endif OBJS = ipfw2.o dummynet.o main.o ipv6.o qsort_r.o OBJS += expand_number.o humanize_number.o glue.o # we don't use ALTQ CFLAGS += -DNO_ALTQ #OBJS += altq.o $(TARGET): $(OBJS) $(MSG) " LD $@" $(HIDE)$(CC) $(LDFLAGS) -o $@ $^ $(OBJS) : ipfw2.h ../glue.h include_e # support to create empty dirs and files in include_e/ # EDIRS is the list of directories, EFILES is the list of files. EFILES_sys += sockio.h EFILES_. += libutil.h EFILES_netinet += __emtpy.h M ?= $(shell pwd) # first make a list of directories from variable names EDIRS= $(subst EFILES_,,$(filter EFILES_%,$(.VARIABLES))) # then prepend the directory name to individual files. # $(empty) serves to interpret the following space literally, # and the ": = " substitution packs spaces into one. EFILES = $(foreach i,$(EDIRS),$(subst $(empty) , $(i)/, $(EFILES_$(i): = ))) include_e: $(MSG) "building include_e in $M" -@rm -rf $(M)/include_e opt_* -@mkdir -p $(M)/include_e -@(cd $(M)/include_e; mkdir -p $(EDIRS); touch $(EFILES) ) -@(cd $(M)/include_e/netinet; \ for i in ip_fw.h ip_dummynet.h tcp.h; do \ cp ../../../sys/netinet/$$i .; done; ) clean distclean: -@rm -rf $(OBJS) $(TARGET) include_e diff: -@(diff -upr $(BSD_HEAD)/sbin/ipfw .) ================================================ FILE: ipfw/add_rules ================================================ #!/bin/bash # # A test script to add rules PRG=./ipfw myfun() { $PRG add 10 count icmp from any to 131.114.9.128 $PRG add 20 count icmp from 131.114.9.128 to any $PRG add 20 count icmp from any to 131.114.9.130 $PRG add 30 count icmp from 131.114.9.130 to any $PRG add 40 count icmp from any to 131.114.9.129 $PRG add 50 count icmp from 131.114.9.129 to any $PRG add 60 count icmp from 131.114.9.236 to any sleep 1 $PRG del 10 $PRG del 20 $PRG del 20 $PRG del 30 $PRG del 40 $PRG del 50 $PRG del 60 } for ((i=0;i<100;i++)) ; do myfun done ================================================ FILE: ipfw/dummynet.c ================================================ /* * Copyright (c) 2002-2003,2010 Luigi Rizzo * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. * * $FreeBSD: head/sbin/ipfw/dummynet.c 206843 2010-04-19 15:11:45Z luigi $ * * dummynet support */ #include #include /* XXX there are several sysctl leftover here */ #include #include "ipfw2.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* inet_ntoa */ static struct _s_x dummynet_params[] = { { "plr", TOK_PLR }, { "noerror", TOK_NOERROR }, { "buckets", TOK_BUCKETS }, { "dst-ip", TOK_DSTIP }, { "src-ip", TOK_SRCIP }, { "dst-port", TOK_DSTPORT }, { "src-port", TOK_SRCPORT }, { "proto", TOK_PROTO }, { "weight", TOK_WEIGHT }, { "lmax", TOK_LMAX }, { "maxlen", TOK_LMAX }, { "all", TOK_ALL }, { "mask", TOK_MASK }, /* alias for both */ { "sched_mask", TOK_SCHED_MASK }, { "flow_mask", TOK_FLOW_MASK }, { "droptail", TOK_DROPTAIL }, { "red", TOK_RED }, { "gred", TOK_GRED }, { "bw", TOK_BW }, { "bandwidth", TOK_BW }, { "delay", TOK_DELAY }, { "link", TOK_LINK }, { "pipe", TOK_PIPE }, { "queue", TOK_QUEUE }, { "flowset", TOK_FLOWSET }, { "sched", TOK_SCHED }, { "pri", TOK_PRI }, { "priority", TOK_PRI }, { "type", TOK_TYPE }, { "flow-id", TOK_FLOWID}, { "dst-ipv6", TOK_DSTIP6}, { "dst-ip6", TOK_DSTIP6}, { "src-ipv6", TOK_SRCIP6}, { "src-ip6", TOK_SRCIP6}, { "profile", TOK_PROFILE}, { "burst", TOK_BURST}, { "dummynet-params", TOK_NULL }, { NULL, 0 } /* terminator */ }; #define O_NEXT(p, len) ((void *)((char *)p + len)) static void oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) { oid->len = len; oid->type = type; oid->subtype = 0; oid->id = id; } /* make room in the buffer and move the pointer forward */ static void * o_next(struct dn_id **o, int len, int type) { struct dn_id *ret = *o; oid_fill(ret, len, type, 0); *o = O_NEXT(*o, len); return ret; } /* handle variable length structures moving back the pointer and fixing length */ static void * o_compact(struct dn_id **o, int len, int real_length, int type) { struct dn_id *ret = *o; ret = O_NEXT(*o, -len); oid_fill(ret, real_length, type, 0); *o = O_NEXT(ret, real_length); return ret; } #if 0 static int sort_q(void *arg, const void *pa, const void *pb) { int rev = (co.do_sort < 0); int field = rev ? -co.do_sort : co.do_sort; long long res = 0; const struct dn_flow_queue *a = pa; const struct dn_flow_queue *b = pb; switch (field) { case 1: /* pkts */ res = a->len - b->len; break; case 2: /* bytes */ res = a->len_bytes - b->len_bytes; break; case 3: /* tot pkts */ res = a->tot_pkts - b->tot_pkts; break; case 4: /* tot bytes */ res = a->tot_bytes - b->tot_bytes; break; } if (res < 0) res = -1; if (res > 0) res = 1; return (int)(rev ? res : -res); } #endif /* print a mask and header for the subsequent list of flows */ static void print_mask(struct ipfw_flow_id *id) { if (!IS_IP6_FLOW_ID(id)) { printf(" " "mask: %s 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n", id->extra ? "queue," : "", id->proto, id->src_ip, id->src_port, id->dst_ip, id->dst_port); } else { char buf[255]; printf("\n mask: %sproto: 0x%02x, flow_id: 0x%08x, ", id->extra ? "queue," : "", id->proto, id->flow_id6); inet_ntop(AF_INET6, &(id->src_ip6), buf, sizeof(buf)); printf("%s/0x%04x -> ", buf, id->src_port); inet_ntop(AF_INET6, &(id->dst_ip6), buf, sizeof(buf)); printf("%s/0x%04x\n", buf, id->dst_port); } } static void print_header(struct ipfw_flow_id *id) { if (!IS_IP6_FLOW_ID(id)) printf("BKT Prot ___Source IP/port____ " "____Dest. IP/port____ " "Tot_pkt/bytes Pkt/Byte Drp\n"); else printf("BKT ___Prot___ _flow-id_ " "______________Source IPv6/port_______________ " "_______________Dest. IPv6/port_______________ " "Tot_pkt/bytes Pkt/Byte Drp\n"); } static void list_flow(struct dn_flow *ni, int *print) { char buff[255]; struct protoent *pe = NULL; struct in_addr ina; struct ipfw_flow_id *id = &ni->fid; if (*print) { print_header(&ni->fid); *print = 0; } pe = getprotobynumber(id->proto); /* XXX: Should check for IPv4 flows */ printf("%3u%c", (ni->oid.id) & 0xff, id->extra ? '*' : ' '); if (!IS_IP6_FLOW_ID(id)) { if (pe) printf("%-4s ", pe->p_name); else printf("%4u ", id->proto); ina.s_addr = htonl(id->src_ip); printf("%15s/%-5d ", inet_ntoa(ina), id->src_port); ina.s_addr = htonl(id->dst_ip); printf("%15s/%-5d ", inet_ntoa(ina), id->dst_port); } else { /* Print IPv6 flows */ if (pe != NULL) printf("%9s ", pe->p_name); else printf("%9u ", id->proto); printf("%7d %39s/%-5d ", id->flow_id6, inet_ntop(AF_INET6, &(id->src_ip6), buff, sizeof(buff)), id->src_port); printf(" %39s/%-5d ", inet_ntop(AF_INET6, &(id->dst_ip6), buff, sizeof(buff)), id->dst_port); } pr_u64(&ni->tot_pkts, 4); pr_u64(&ni->tot_bytes, 8); printf("%2u %4u %3u\n", ni->length, ni->len_bytes, ni->drops); } static void print_flowset_parms(struct dn_fs *fs, char *prefix) { int l; char qs[30]; char plr[30]; char red[90]; /* Display RED parameters */ l = fs->qsize; if (fs->flags & DN_QSIZE_BYTES) { if (l >= 8192) sprintf(qs, "%d KB", l / 1024); else sprintf(qs, "%d B", l); } else sprintf(qs, "%3d sl.", l); if (fs->plr) sprintf(plr, "plr %f", 1.0 * fs->plr / (double)(0x7fffffff)); else plr[0] = '\0'; if (fs->flags & DN_IS_RED) /* RED parameters */ sprintf(red, "\n\t %cRED w_q %f min_th %d max_th %d max_p %f", (fs->flags & DN_IS_GENTLE_RED) ? 'G' : ' ', 1.0 * fs->w_q / (double)(1 << SCALE_RED), fs->min_th, fs->max_th, 1.0 * fs->max_p / (double)(1 << SCALE_RED)); else sprintf(red, "droptail"); if (prefix[0]) { printf("%s %s%s %d queues (%d buckets) %s\n", prefix, qs, plr, fs->oid.id, fs->buckets, red); prefix[0] = '\0'; } else { printf("q%05d %s%s %d flows (%d buckets) sched %d " "weight %d lmax %d pri %d %s\n", fs->fs_nr, qs, plr, fs->oid.id, fs->buckets, fs->sched_nr, fs->par[0], fs->par[1], fs->par[2], red); if (fs->flags & DN_HAVE_MASK) print_mask(&fs->flow_mask); } } static void print_extra_delay_parms(struct dn_profile *p) { double loss; if (p->samples_no <= 0) return; loss = p->loss_level; loss /= p->samples_no; printf("\t profile: name \"%s\" loss %f samples %d\n", p->name, loss, p->samples_no); } static void flush_buf(char *buf) { if (buf[0]) printf("%s\n", buf); buf[0] = '\0'; } /* * generic list routine. We expect objects in a specific order, i.e. * PIPES AND SCHEDULERS: * link; scheduler; internal flowset if any; instances * we can tell a pipe from the number. * * FLOWSETS: * flowset; queues; * link i (int queue); scheduler i; si(i) { flowsets() : queues } */ static void list_pipes(struct dn_id *oid, struct dn_id *end) { char buf[160]; /* pending buffer */ int toPrint = 1; /* print header */ buf[0] = '\0'; for (; oid != end; oid = O_NEXT(oid, oid->len)) { if (oid->len < sizeof(*oid)) errx(1, "invalid oid len %d\n", oid->len); switch (oid->type) { default: flush_buf(buf); printf("unrecognized object %d size %d\n", oid->type, oid->len); break; case DN_TEXT: /* list of attached flowsets */ { int i, l; struct { struct dn_id id; uint32_t p[0]; } *d = (void *)oid; l = (oid->len - sizeof(*oid))/sizeof(d->p[0]); if (l == 0) break; printf(" Children flowsets: "); for (i = 0; i < l; i++) printf("%u ", d->p[i]); printf("\n"); break; } case DN_CMD_GET: if (co.verbose) printf("answer for cmd %d, len %d\n", oid->type, oid->id); break; case DN_SCH: { struct dn_sch *s = (struct dn_sch *)oid; flush_buf(buf); printf(" sched %d type %s flags 0x%x %d buckets %d active\n", s->sched_nr, s->name, s->flags, s->buckets, s->oid.id); if (s->flags & DN_HAVE_MASK) print_mask(&s->sched_mask); } break; case DN_FLOW: list_flow((struct dn_flow *)oid, &toPrint); break; case DN_LINK: { struct dn_link *p = (struct dn_link *)oid; double b = p->bandwidth; char bwbuf[30]; char burst[5 + 7]; /* This starts a new object so flush buffer */ flush_buf(buf); /* data rate */ if (b == 0) sprintf(bwbuf, "unlimited "); else if (b >= 1000000) sprintf(bwbuf, "%7.3f Mbit/s", b/1000000); else if (b >= 1000) sprintf(bwbuf, "%7.3f Kbit/s", b/1000); else sprintf(bwbuf, "%7.3f bit/s ", b); if (humanize_number(burst, sizeof(burst), p->burst, "", HN_AUTOSCALE, 0) < 0 || co.verbose) sprintf(burst, "%d", (int)p->burst); sprintf(buf, "%05d: %s %4d ms burst %s", p->link_nr % DN_MAX_ID, bwbuf, p->delay, burst); } break; case DN_FS: print_flowset_parms((struct dn_fs *)oid, buf); break; case DN_PROFILE: flush_buf(buf); print_extra_delay_parms((struct dn_profile *)oid); } flush_buf(buf); // XXX does it really go here ? } } /* * Delete pipe, queue or scheduler i */ int ipfw_delete_pipe(int do_pipe, int i) { struct { struct dn_id oid; uintptr_t a[1]; /* add more if we want a list */ } cmd; oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); cmd.oid.subtype = (do_pipe == 1) ? DN_LINK : ( (do_pipe == 2) ? DN_FS : DN_SCH); cmd.a[0] = i; i = do_cmd(IP_DUMMYNET3, &cmd, cmd.oid.len); if (i) { i = 1; warn("rule %u: setsockopt(IP_DUMMYNET_DEL)", i); } return i; } /* * Code to parse delay profiles. * * Some link types introduce extra delays in the transmission * of a packet, e.g. because of MAC level framing, contention on * the use of the channel, MAC level retransmissions and so on. * From our point of view, the channel is effectively unavailable * for this extra time, which is constant or variable depending * on the link type. Additionally, packets may be dropped after this * time (e.g. on a wireless link after too many retransmissions). * We can model the additional delay with an empirical curve * that represents its distribution. * * cumulative probability * 1.0 ^ * | * L +-- loss-level x * | ****** * | * * | ***** * | * * | ** * | * * +-------*-------------------> * delay * * The empirical curve may have both vertical and horizontal lines. * Vertical lines represent constant delay for a range of * probabilities; horizontal lines correspond to a discontinuty * in the delay distribution: the link will use the largest delay * for a given probability. * * To pass the curve to dummynet, we must store the parameters * in a file as described below, and issue the command * * ipfw pipe config ... bw XXX profile ... * * The file format is the following, with whitespace acting as * a separator and '#' indicating the beginning a comment: * * samples N * the number of samples used in the internal * representation (2..1024; default 100); * * loss-level L * The probability above which packets are lost. * (0.0 <= L <= 1.0, default 1.0 i.e. no loss); * * name identifier * Optional a name (listed by "ipfw pipe show") * to identify the distribution; * * "delay prob" | "prob delay" * One of these two lines is mandatory and defines * the format of the following lines with data points. * * XXX YYY * 2 or more lines representing points in the curve, * with either delay or probability first, according * to the chosen format. * The unit for delay is milliseconds. * * Data points does not need to be ordered or equal to the number * specified in the "samples" line. ipfw will sort and interpolate * the curve as needed. * * Example of a profile file: name bla_bla_bla samples 100 loss-level 0.86 prob delay 0 200 # minimum overhead is 200ms 0.5 200 0.5 300 0.8 1000 0.9 1300 1 1300 * Internally, we will convert the curve to a fixed number of * samples, and when it is time to transmit a packet we will * model the extra delay as extra bits in the packet. * */ #define ED_MAX_LINE_LEN 256+ED_MAX_NAME_LEN #define ED_TOK_SAMPLES "samples" #define ED_TOK_LOSS "loss-level" #define ED_TOK_NAME "name" #define ED_TOK_DELAY "delay" #define ED_TOK_PROB "prob" #define ED_TOK_BW "bw" #define ED_SEPARATORS " \t\n" #define ED_MIN_SAMPLES_NO 2 /* * returns 1 if s is a non-negative number, with at least one '.' */ static int is_valid_number(const char *s) { int i, dots_found = 0; int len = strlen(s); for (i = 0; i 1)) return 0; return 1; } /* * Take as input a string describing a bandwidth value * and return the numeric bandwidth value. * set clocking interface or bandwidth value */ static void read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen) { if (*bandwidth != -1) warnx("duplicate token, override bandwidth value!"); if (arg[0] >= 'a' && arg[0] <= 'z') { if (!if_name) { errx(1, "no if support"); } if (namelen >= IFNAMSIZ) warn("interface name truncated"); namelen--; /* interface name */ strncpy(if_name, arg, namelen); if_name[namelen] = '\0'; *bandwidth = 0; } else { /* read bandwidth value */ int bw; char *end = NULL; bw = strtoul(arg, &end, 0); if (*end == 'K' || *end == 'k') { end++; bw *= 1000; } else if (*end == 'M' || *end == 'm') { end++; bw *= 1000000; } if ((*end == 'B' && _substrcmp2(end, "Bi", "Bit/s") != 0) || _substrcmp2(end, "by", "bytes") == 0) bw *= 8; if (bw < 0) errx(EX_DATAERR, "bandwidth too large"); *bandwidth = bw; if (if_name) if_name[0] = '\0'; } } struct point { double prob; double delay; }; static int compare_points(const void *vp1, const void *vp2) { const struct point *p1 = vp1; const struct point *p2 = vp2; double res = 0; res = p1->prob - p2->prob; if (res == 0) res = p1->delay - p2->delay; if (res < 0) return -1; else if (res > 0) return 1; else return 0; } #define ED_EFMT(s) EX_DATAERR,"error in %s at line %d: "#s,filename,lineno /* * Interpolate a set of proability-value tuples. * * This function takes as input a tuple of values * and samples the interpolated curve described from the tuples. * * The user defined points are stored in the ponts structure. * The number of points is stored in points_no. * The user defined sampling value is stored in samples_no. * The resulting samples are in the "samples" pointer. * * We assume that The last point for the '1' value of the * probability should be defined. (XXX add checks for this) * * The input data are points and points_no. * The output data are s (the array of s_no samples) * and s_no (the number of samples) * */ static void interpolate_samples(struct point *p, int points_no, int *samples, int samples_no, const char *filename) { double dy; /* delta on the y axis */ double y; /* current value of y */ double x; /* current value of x */ double m; /* the y slope */ int i; /* samples index */ int curr; /* points current index */ /* make sure that there are enough points. */ /* XXX Duplicated should be removed */ if (points_no < 3) errx(EX_DATAERR, "%s too few samples, need at least %d", filename, 3); qsort(p, points_no, sizeof(struct point), compare_points); dy = 1.0/samples_no; y = 0; for (i=0, curr = 0; i < samples_no; i++, y+=dy) { /* This statment move the curr pointer to the next point * skipping the points with the same x value. We are * guaranteed to exit from the loop because the * last possible value of y is stricly less than 1 * and the last possible value of the y points is 1 */ while ( y >= p[curr+1].prob ) curr++; /* compute the slope of the curve */ m = (p[curr+1].delay - p[curr].delay) / (p[curr+1].prob - p[curr].prob); /* compute the x value starting from the current point */ x = p[curr].delay + (y - p[curr].prob) * m; samples[i] = x; } /* add the last sample */ samples[i] = p[curr+1].delay; } /* * p is the link (old pipe) * pf is the profile */ static void load_extra_delays(const char *filename, struct dn_profile *p, struct dn_link *link) { char line[ED_MAX_LINE_LEN]; FILE *f; int lineno = 0; int samples = -1; double loss = -1.0; char profile_name[ED_MAX_NAME_LEN]; int delay_first = -1; int do_points = 0; struct point points[ED_MAX_SAMPLES_NO]; int points_no = 0; /* XXX link never NULL? */ p->link_nr = link->link_nr; profile_name[0] = '\0'; f = fopen(filename, "r"); if (f == NULL) err(EX_UNAVAILABLE, "fopen: %s", filename); while (fgets(line, ED_MAX_LINE_LEN, f)) { /* read commands */ char *s, *cur = line, *name = NULL, *arg = NULL; ++lineno; /* parse the line */ while (cur) { s = strsep(&cur, ED_SEPARATORS); if (s == NULL || *s == '#') break; if (*s == '\0') continue; if (arg) errx(ED_EFMT("too many arguments")); if (name == NULL) name = s; else arg = s; } if ((name == NULL) || (*name == '#')) /* empty line */ continue; if (arg == NULL) errx(ED_EFMT("missing arg for %s"), name); if (!strcasecmp(name, ED_TOK_SAMPLES)) { if (samples > 0) errx(ED_EFMT("duplicate ``samples'' line")); if (atoi(arg) <=0) errx(ED_EFMT("invalid number of samples")); samples = atoi(arg); if (samples>=ED_MAX_SAMPLES_NO-1) errx(ED_EFMT("too many samples, maximum is %d"), ED_MAX_SAMPLES_NO-1); do_points = 0; } else if (!strcasecmp(name, ED_TOK_BW)) { char buf[IFNAMSIZ]; read_bandwidth(arg, &link->bandwidth, buf, sizeof(buf)); p->bandwidth = link->bandwidth; } else if (!strcasecmp(name, ED_TOK_LOSS)) { if (loss != -1.0) errx(ED_EFMT("duplicated token: %s"), name); if (!is_valid_number(arg)) errx(ED_EFMT("invalid %s"), arg); loss = atof(arg); if (loss > 1) errx(ED_EFMT("%s greater than 1.0"), name); do_points = 0; } else if (!strcasecmp(name, ED_TOK_NAME)) { if (profile_name[0] != '\0') errx(ED_EFMT("duplicated token: %s"), name); strncpy(profile_name, arg, sizeof(profile_name) - 1); profile_name[sizeof(profile_name)-1] = '\0'; do_points = 0; } else if (!strcasecmp(name, ED_TOK_DELAY)) { if (do_points) errx(ED_EFMT("duplicated token: %s"), name); delay_first = 1; do_points = 1; } else if (!strcasecmp(name, ED_TOK_PROB)) { if (do_points) errx(ED_EFMT("duplicated token: %s"), name); delay_first = 0; do_points = 1; } else if (do_points) { if (!is_valid_number(name) || !is_valid_number(arg)) errx(ED_EFMT("invalid point found")); if (delay_first) { points[points_no].delay = atof(name); points[points_no].prob = atof(arg); } else { points[points_no].delay = atof(arg); points[points_no].prob = atof(name); } if (points[points_no].prob > 1.0) errx(ED_EFMT("probability greater than 1.0")); ++points_no; } else { errx(ED_EFMT("unrecognised command '%s'"), name); } } fclose (f); if (samples == -1) { warnx("'%s' not found, assuming 100", ED_TOK_SAMPLES); samples = 100; } if (loss == -1.0) { warnx("'%s' not found, assuming no loss", ED_TOK_LOSS); loss = 1; } interpolate_samples(points, points_no, p->samples, samples, filename); p->samples_no = samples++; p->loss_level = loss * samples; strncpy(p->name, profile_name, sizeof(p->name)); } /* * configuration of pipes, schedulers, flowsets. * When we configure a new scheduler, an empty pipe is created, so: * * do_pipe = 1 -> "pipe N config ..." only for backward compatibility * sched N+Delta type fifo sched_mask ... * pipe N+Delta * flowset N+Delta pipe N+Delta (no parameters) * sched N type wf2q+ sched_mask ... * pipe N * * do_pipe = 2 -> flowset N config * flowset N parameters * * do_pipe = 3 -> sched N config * sched N parameters (default no pipe) * optional Pipe N config ... * pipe ==> */ void ipfw_config_pipe(int ac, char **av) { int i; u_int j; char *end; void *par = NULL; struct dn_id *buf, *base; struct dn_sch *sch = NULL; struct dn_link *p = NULL; struct dn_fs *fs = NULL; struct dn_profile *pf = NULL; struct ipfw_flow_id *mask = NULL; int lmax; uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo; size_t max_pf_size = sizeof(struct dn_profile) + ED_MAX_SAMPLES_NO * sizeof(int); /* * allocate space for 1 header, * 1 scheduler, 1 link, 1 flowset, 1 profile */ lmax = sizeof(struct dn_id); /* command header */ lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + sizeof(struct dn_fs); lmax += max_pf_size; av++; ac--; /* Pipe number */ if (ac && isdigit(**av)) { i = atoi(*av); av++; ac--; } else i = -1; if (i <= 0) errx(EX_USAGE, "need a pipe/flowset/sched number"); base = buf = safe_calloc(1, lmax); /* all commands start with a 'CONFIGURE' and a version */ o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); base->id = DN_API_VERSION; switch (co.do_pipe) { case 1: /* "pipe N config ..." */ /* Allocate space for the WF2Q+ scheduler, its link * and the FIFO flowset. Set the number, but leave * the scheduler subtype and other parameters to 0 * so the kernel will use appropriate defaults. * XXX todo: add a flag to record if a parameter * is actually configured. * If we do a 'pipe config' mask -> sched_mask. * The FIFO scheduler and link are derived from the * WF2Q+ one in the kernel. */ sch = o_next(&buf, sizeof(*sch), DN_SCH); p = o_next(&buf, sizeof(*p), DN_LINK); fs = o_next(&buf, sizeof(*fs), DN_FS); sch->sched_nr = i; sch->oid.subtype = 0; /* defaults to WF2Q+ */ mask = &sch->sched_mask; flags = &sch->flags; buckets = &sch->buckets; *flags |= DN_PIPE_CMD; p->link_nr = i; /* This flowset is only for the FIFO scheduler */ fs->fs_nr = i + 2*DN_MAX_ID; fs->sched_nr = i + DN_MAX_ID; break; case 2: /* "queue N config ... " */ fs = o_next(&buf, sizeof(*fs), DN_FS); fs->fs_nr = i; mask = &fs->flow_mask; flags = &fs->flags; buckets = &fs->buckets; break; case 3: /* "sched N config ..." */ sch = o_next(&buf, sizeof(*sch), DN_SCH); fs = o_next(&buf, sizeof(*fs), DN_FS); sch->sched_nr = i; mask = &sch->sched_mask; flags = &sch->flags; buckets = &sch->buckets; /* fs is used only with !MULTIQUEUE schedulers */ fs->fs_nr = i + DN_MAX_ID; fs->sched_nr = i; break; } /* set to -1 those fields for which we want to reuse existing * values from the kernel. * Also, *_nr and subtype = 0 mean reuse the value from the kernel. * XXX todo: support reuse of the mask. */ if (p) p->bandwidth = -1; for (j = 0; j < sizeof(fs->par)/sizeof(fs->par[0]); j++) fs->par[j] = -1; while (ac > 0) { double d; int tok = match_token(dummynet_params, *av); ac--; av++; switch(tok) { case TOK_NOERROR: NEED(fs, "noerror is only for pipes"); fs->flags |= DN_NOERROR; break; case TOK_PLR: NEED(fs, "plr is only for pipes"); NEED1("plr needs argument 0..1\n"); d = strtod(av[0], NULL); if (d > 1) d = 1; else if (d < 0) d = 0; fs->plr = (int)(d*0x7fffffff); ac--; av++; break; case TOK_QUEUE: NEED(fs, "queue is only for pipes or flowsets"); NEED1("queue needs queue size\n"); end = NULL; fs->qsize = strtoul(av[0], &end, 0); if (*end == 'K' || *end == 'k') { fs->flags |= DN_QSIZE_BYTES; fs->qsize *= 1024; } else if (*end == 'B' || _substrcmp2(end, "by", "bytes") == 0) { fs->flags |= DN_QSIZE_BYTES; } ac--; av++; break; case TOK_BUCKETS: NEED(fs, "buckets is only for pipes or flowsets"); NEED1("buckets needs argument\n"); *buckets = strtoul(av[0], NULL, 0); ac--; av++; break; case TOK_FLOW_MASK: case TOK_SCHED_MASK: case TOK_MASK: NEED(mask, "tok_mask"); NEED1("mask needs mask specifier\n"); /* * per-flow queue, mask is dst_ip, dst_port, * src_ip, src_port, proto measured in bits */ par = NULL; bzero(mask, sizeof(*mask)); end = NULL; while (ac >= 1) { uint32_t *p32 = NULL; uint16_t *p16 = NULL; uint32_t *p20 = NULL; struct in6_addr *pa6 = NULL; uint32_t a; tok = match_token(dummynet_params, *av); ac--; av++; switch(tok) { case TOK_ALL: /* * special case, all bits significant * except 'extra' (the queue number) */ mask->dst_ip = ~0; mask->src_ip = ~0; mask->dst_port = ~0; mask->src_port = ~0; mask->proto = ~0; n2mask(&mask->dst_ip6, 128); n2mask(&mask->src_ip6, 128); mask->flow_id6 = ~0; *flags |= DN_HAVE_MASK; goto end_mask; case TOK_QUEUE: mask->extra = ~0; *flags |= DN_HAVE_MASK; goto end_mask; case TOK_DSTIP: mask->addr_type = 4; p32 = &mask->dst_ip; break; case TOK_SRCIP: mask->addr_type = 4; p32 = &mask->src_ip; break; case TOK_DSTIP6: mask->addr_type = 6; pa6 = &mask->dst_ip6; break; case TOK_SRCIP6: mask->addr_type = 6; pa6 = &mask->src_ip6; break; case TOK_FLOWID: mask->addr_type = 6; p20 = &mask->flow_id6; break; case TOK_DSTPORT: p16 = &mask->dst_port; break; case TOK_SRCPORT: p16 = &mask->src_port; break; case TOK_PROTO: break; default: ac++; av--; /* backtrack */ goto end_mask; } if (ac < 1) errx(EX_USAGE, "mask: value missing"); if (*av[0] == '/') { a = strtoul(av[0]+1, &end, 0); if (pa6 == NULL) a = (a == 32) ? ~0 : (1 << a) - 1; } else a = strtoul(av[0], &end, 0); if (p32 != NULL) *p32 = a; else if (p16 != NULL) { if (a > 0xFFFF) errx(EX_DATAERR, "port mask must be 16 bit"); *p16 = (uint16_t)a; } else if (p20 != NULL) { if (a > 0xfffff) errx(EX_DATAERR, "flow_id mask must be 20 bit"); *p20 = (uint32_t)a; } else if (pa6 != NULL) { if (a > 128) errx(EX_DATAERR, "in6addr invalid mask len"); else n2mask(pa6, a); } else { if (a > 0xFF) errx(EX_DATAERR, "proto mask must be 8 bit"); mask->proto = (uint8_t)a; } if (a != 0) *flags |= DN_HAVE_MASK; ac--; av++; } /* end while, config masks */ end_mask: break; case TOK_RED: case TOK_GRED: NEED1("red/gred needs w_q/min_th/max_th/max_p\n"); fs->flags |= DN_IS_RED; if (tok == TOK_GRED) fs->flags |= DN_IS_GENTLE_RED; /* * the format for parameters is w_q/min_th/max_th/max_p */ if ((end = strsep(&av[0], "/"))) { double w_q = strtod(end, NULL); if (w_q > 1 || w_q <= 0) errx(EX_DATAERR, "0 < w_q <= 1"); fs->w_q = (int) (w_q * (1 << SCALE_RED)); } if ((end = strsep(&av[0], "/"))) { fs->min_th = strtoul(end, &end, 0); if (*end == 'K' || *end == 'k') fs->min_th *= 1024; } if ((end = strsep(&av[0], "/"))) { fs->max_th = strtoul(end, &end, 0); if (*end == 'K' || *end == 'k') fs->max_th *= 1024; } if ((end = strsep(&av[0], "/"))) { double max_p = strtod(end, NULL); if (max_p > 1 || max_p <= 0) errx(EX_DATAERR, "0 < max_p <= 1"); fs->max_p = (int)(max_p * (1 << SCALE_RED)); } ac--; av++; break; case TOK_DROPTAIL: NEED(fs, "droptail is only for flowsets"); fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED); break; case TOK_BW: NEED(p, "bw is only for links"); NEED1("bw needs bandwidth or interface\n"); read_bandwidth(av[0], &p->bandwidth, NULL, 0); ac--; av++; break; case TOK_DELAY: NEED(p, "delay is only for links"); NEED1("delay needs argument 0..10000ms\n"); p->delay = strtoul(av[0], NULL, 0); ac--; av++; break; case TOK_TYPE: { int l; NEED(sch, "type is only for schedulers"); NEED1("type needs a string"); l = strlen(av[0]); if (l == 0 || l > 15) errx(1, "type %s too long\n", av[0]); strcpy(sch->name, av[0]); sch->oid.subtype = 0; /* use string */ ac--; av++; break; } case TOK_WEIGHT: NEED(fs, "weight is only for flowsets"); NEED1("weight needs argument\n"); fs->par[0] = strtol(av[0], &end, 0); ac--; av++; break; case TOK_LMAX: NEED(fs, "lmax is only for flowsets"); NEED1("lmax needs argument\n"); fs->par[1] = strtol(av[0], &end, 0); ac--; av++; break; case TOK_PRI: NEED(fs, "priority is only for flowsets"); NEED1("priority needs argument\n"); fs->par[2] = strtol(av[0], &end, 0); ac--; av++; break; case TOK_SCHED: case TOK_PIPE: NEED(fs, "pipe/sched"); NEED1("pipe/link/sched needs number\n"); fs->sched_nr = strtoul(av[0], &end, 0); ac--; av++; break; case TOK_PROFILE: { size_t real_length; NEED((!pf), "profile already set"); NEED(p, "profile"); NEED1("extra delay needs the file name\n"); /* load the profile structure using the DN_API */ pf = o_next(&buf, max_pf_size, DN_PROFILE); load_extra_delays(av[0], pf, p); //XXX can't fail? /* compact the dn_id structure */ real_length = sizeof(struct dn_profile) + pf->samples_no * sizeof(int); o_compact(&buf, max_pf_size, real_length, DN_PROFILE); --ac; ++av; } break; case TOK_BURST: NEED(p, "burst"); NEED1("burst needs argument\n"); errno = 0; if (expand_number(av[0], (int64_t *)&p->burst) < 0) if (errno != ERANGE) errx(EX_DATAERR, "burst: invalid argument"); if (errno || p->burst > (1ULL << 48) - 1) errx(EX_DATAERR, "burst: out of range (0..2^48-1)"); ac--; av++; break; default: errx(EX_DATAERR, "unrecognised option ``%s''", av[-1]); } } /* check validity of parameters */ if (p) { if (p->delay > 10000) errx(EX_DATAERR, "delay must be < 10000"); if (p->bandwidth == -1) p->bandwidth = 0; } if (fs) { /* XXX accept a 0 scheduler to keep the default */ if (fs->flags & DN_QSIZE_BYTES) { size_t len; long limit; len = sizeof(limit); if (sysctlbyname("net.inet.ip.dummynet.pipe_byte_limit", &limit, &len, NULL, 0) == -1) limit = 1024*1024; if (fs->qsize > limit) errx(EX_DATAERR, "queue size must be < %ldB", limit); } else { size_t len; long limit; len = sizeof(limit); if (sysctlbyname("net.inet.ip.dummynet.pipe_slot_limit", &limit, &len, NULL, 0) == -1) limit = 100; if (fs->qsize > limit) errx(EX_DATAERR, "2 <= queue size <= %ld", limit); } if (fs->flags & DN_IS_RED) { size_t len; int lookup_depth, avg_pkt_size; double w_q; if (fs->min_th >= fs->max_th) errx(EX_DATAERR, "min_th %d must be < than max_th %d", fs->min_th, fs->max_th); if (fs->max_th == 0) errx(EX_DATAERR, "max_th must be > 0"); len = sizeof(int); if (sysctlbyname("net.inet.ip.dummynet.red_lookup_depth", &lookup_depth, &len, NULL, 0) == -1) lookup_depth = 256; if (lookup_depth == 0) errx(EX_DATAERR, "net.inet.ip.dummynet.red_lookup_depth" " must be greater than zero"); len = sizeof(int); if (sysctlbyname("net.inet.ip.dummynet.red_avg_pkt_size", &avg_pkt_size, &len, NULL, 0) == -1) avg_pkt_size = 512; if (avg_pkt_size == 0) errx(EX_DATAERR, "net.inet.ip.dummynet.red_avg_pkt_size must" " be greater than zero"); /* * Ticks needed for sending a medium-sized packet. * Unfortunately, when we are configuring a WF2Q+ queue, we * do not have bandwidth information, because that is stored * in the parent pipe, and also we have multiple queues * competing for it. So we set s=0, which is not very * correct. But on the other hand, why do we want RED with * WF2Q+ ? */ #if 0 if (p.bandwidth==0) /* this is a WF2Q+ queue */ s = 0; else s = (double)ck.hz * avg_pkt_size * 8 / p.bandwidth; #endif /* * max idle time (in ticks) before avg queue size becomes 0. * NOTA: (3/w_q) is approx the value x so that * (1-w_q)^x < 10^-3. */ w_q = ((double)fs->w_q) / (1 << SCALE_RED); #if 0 // go in kernel idle = s * 3. / w_q; fs->lookup_step = (int)idle / lookup_depth; if (!fs->lookup_step) fs->lookup_step = 1; weight = 1 - w_q; for (t = fs->lookup_step; t > 1; --t) weight *= 1 - w_q; fs->lookup_weight = (int)(weight * (1 << SCALE_RED)); #endif /* code moved in the kernel */ } } i = do_cmd(IP_DUMMYNET3, base, (char *)buf - (char *)base); if (i) err(1, "setsockopt(%s)", "IP_DUMMYNET_CONFIGURE"); } void dummynet_flush(void) { struct dn_id oid; oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); do_cmd(IP_DUMMYNET3, &oid, oid.len); } /* Parse input for 'ipfw [pipe|sched|queue] show [range list]' * Returns the number of ranges, and possibly stores them * in the array v of size len. */ static int parse_range(int ac, char *av[], uint32_t *v, int len) { int n = 0; char *endptr, *s; uint32_t base[2]; if (v == NULL || len < 2) { v = base; len = 2; } for (s = *av; s != NULL; av++, ac--) { v[0] = strtoul(s, &endptr, 10); v[1] = (*endptr != '-') ? v[0] : strtoul(endptr+1, &endptr, 10); if (*endptr == '\0') { /* prepare for next round */ s = (ac > 0) ? *(av+1) : NULL; } else { if (*endptr != ',') { warn("invalid number: %s", s); s = ++endptr; continue; } /* continue processing from here */ s = ++endptr; ac++; av--; } if (v[1] < v[0] || v[1] >= DN_MAX_ID-1 || v[1] >= DN_MAX_ID-1) { continue; /* invalid entry */ } n++; /* translate if 'pipe list' */ if (co.do_pipe == 1) { v[0] += DN_MAX_ID; v[1] += DN_MAX_ID; } v = (n*2 < len) ? v + 2 : base; } return n; } /* main entry point for dummynet list functions. co.do_pipe indicates * which function we want to support. * av may contain filtering arguments, either individual entries * or ranges, or lists (space or commas are valid separators). * Format for a range can be n1-n2 or n3 n4 n5 ... * In a range n1 must be <= n2, otherwise the range is ignored. * A number 'n4' is translate in a range 'n4-n4' * All number must be > 0 and < DN_MAX_ID-1 */ void dummynet_list(int ac, char *av[], int show_counters) { struct dn_id *oid, *x = NULL; int ret, i; int n; /* # of ranges */ u_int buflen, l; u_int max_size; /* largest obj passed up */ (void)show_counters; // XXX unused, but we should use it. ac--; av++; /* skip 'list' | 'show' word */ n = parse_range(ac, av, NULL, 0); /* Count # of ranges. */ /* Allocate space to store ranges */ l = sizeof(*oid) + sizeof(uint32_t) * n * 2; oid = safe_calloc(1, l); oid_fill(oid, l, DN_CMD_GET, DN_API_VERSION); if (n > 0) /* store ranges in idx */ parse_range(ac, av, (uint32_t *)(oid + 1), n*2); /* * Compute the size of the largest object returned. If the * response leaves at least this much spare space in the * buffer, then surely the response is complete; otherwise * there might be a risk of truncation and we will need to * retry with a larger buffer. * XXX don't bother with smaller structs. */ max_size = sizeof(struct dn_fs); if (max_size < sizeof(struct dn_sch)) max_size = sizeof(struct dn_sch); if (max_size < sizeof(struct dn_flow)) max_size = sizeof(struct dn_flow); switch (co.do_pipe) { case 1: oid->subtype = DN_LINK; /* list pipe */ break; case 2: oid->subtype = DN_FS; /* list queue */ break; case 3: oid->subtype = DN_SCH; /* list sched */ break; } /* * Ask the kernel an estimate of the required space (result * in oid.id), unless we are requesting a subset of objects, * in which case the kernel does not give an exact answer. * In any case, space might grow in the meantime due to the * creation of new queues, so we must be prepared to retry. */ if (n > 0) { buflen = 4*1024; } else { ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); if (ret != 0 || oid->id <= sizeof(*oid)) goto done; buflen = oid->id + max_size; oid->len = sizeof(*oid); /* restore */ } /* Try a few times, until the buffer fits */ for (i = 0; i < 20; i++) { l = buflen; x = safe_realloc(x, l); bcopy(oid, x, oid->len); ret = do_cmd(-IP_DUMMYNET3, x, (uintptr_t)&l); if (ret != 0 || x->id <= sizeof(*oid)) goto done; /* no response */ if (l + max_size <= buflen) break; /* ok */ buflen *= 2; /* double for next attempt */ } list_pipes(x, O_NEXT(x, l)); done: if (x) free(x); free(oid); } ================================================ FILE: ipfw/expand_number.c ================================================ /*- * Copyright (c) 2007 Eric Anderson * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ // #include __FBSDID("$FreeBSD: src/lib/libutil/expand_number.c,v 1.2.4.2 2009/06/10 14:52:34 des Exp $"); #include #include #include #include //#include #include /* * Convert an expression of the following forms to a int64_t. * 1) A positive decimal number. * 2) A positive decimal number followed by a 'b' or 'B' (mult by 1). * 3) A positive decimal number followed by a 'k' or 'K' (mult by 1 << 10). * 4) A positive decimal number followed by a 'm' or 'M' (mult by 1 << 20). * 5) A positive decimal number followed by a 'g' or 'G' (mult by 1 << 30). * 6) A positive decimal number followed by a 't' or 'T' (mult by 1 << 40). * 7) A positive decimal number followed by a 'p' or 'P' (mult by 1 << 50). * 8) A positive decimal number followed by a 'e' or 'E' (mult by 1 << 60). */ int expand_number(const char *buf, int64_t *num) { static const char unit[] = "bkmgtpe"; char *endptr, s; int64_t number; int i; number = strtoimax(buf, &endptr, 0); if (endptr == buf) { /* No valid digits. */ errno = EINVAL; return (-1); } if (*endptr == '\0') { /* No unit. */ *num = number; return (0); } s = tolower(*endptr); switch (s) { case 'b': case 'k': case 'm': case 'g': case 't': case 'p': case 'e': break; default: /* Unrecognized unit. */ errno = EINVAL; return (-1); } for (i = 0; unit[i] != '\0'; i++) { if (s == unit[i]) break; if ((number < 0 && (number << 10) > number) || (number >= 0 && (number << 10) < number)) { errno = ERANGE; return (-1); } number <<= 10; } *num = number; return (0); } ================================================ FILE: ipfw/glue.c ================================================ /* * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id: glue.c 12264 2013-04-27 20:21:06Z luigi $ * * Userland functions missing in linux/Windows */ #include #include #include #ifdef _WIN32 #include #include #endif /* _WIN32 */ #ifndef HAVE_NAT /* dummy nat functions */ void ipfw_show_nat(int ac, char **av) { fprintf(stderr, "%s unsupported\n", __FUNCTION__); } void ipfw_config_nat(int ac, char **av) { fprintf(stderr, "%s unsupported\n", __FUNCTION__); } #endif #ifdef __linux__ int optreset; /* missing in linux */ #endif /* * not implemented in linux. * taken from /usr/src/lib/libc/string/strlcpy.c */ size_t strlcpy(char *dst, const char *src, size_t siz) { char *d = dst; const char *s = src; size_t n = siz; /* Copy as many bytes as will fit */ if (n != 0 && --n != 0) { do { if ((*d++ = *s++) == 0) break; } while (--n != 0); } /* Not enough room in dst, add NUL and traverse rest of src */ if (n == 0) { if (siz != 0) *d = '\0'; /* NUL-terminate dst */ while (*s++) ; } return(s - src - 1); /* count does not include NUL */ } /* missing in linux and windows */ long long int strtonum(const char *nptr, long long minval, long long maxval, const char **errstr) { long long ret; int errno_c = errno; /* save actual errno */ errno = 0; #ifdef TCC ret = strtol(nptr, (char **)errstr, 0); #else ret = strtoll(nptr, (char **)errstr, 0); #endif /* We accept only a string that represent exactly a number (ie. start * and end with a digit). * FreeBSD version wants errstr==NULL if no error occurs, otherwise * errstr should point to an error string. * For our purspose, we implement only the invalid error, ranges * error aren't checked */ if (errno != 0 || nptr == *errstr || **errstr != '\0') *errstr = "invalid"; else { *errstr = NULL; errno = errno_c; } return ret; } #if defined (_WIN32) || defined (EMULATE_SYSCTL) //XXX missing prerequisites #include //openwrt #include //openwrt #include #include #endif /* * set or get system information * XXX lock acquisition/serialize calls * * we export this as sys/module/ipfw_mod/parameters/___ * This function get or/and set the value of the sysctl passed by * the name parameter. If the old value is not desired, * oldp and oldlenp should be set to NULL. * * XXX * I do not know how this works in FreeBSD in the case * where there are no write permission on the sysctl var. * We read the value and set return variables in any way * but returns -1 on write failures, regardless the * read success. * * Since there is no information on types, in the following * code we assume a length of 4 is a int. * * Returns 0 on success, -1 on errors. */ int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { #if defined (_WIN32) || defined (EMULATE_SYSCTL) /* * we embed the sysctl request in the usual sockopt mechanics. * the sockopt buffer il filled with a dn_id with IP_DUMMYNET3 * command, and the special DN_SYSCTL_GET and DN_SYSCTL_SET * subcommands. * the syntax of this function is fully compatible with * POSIX sysctlby name: * if newp and newlen are != 0 => this is a set * else if oldp and oldlen are != 0 => this is a get * to avoid too much overhead in the module, the whole * sysctltable is returned, and the parsing is done in userland, * a probe request is done to retrieve the size needed to * transfer the table, before the real request * if both old and new params = 0 => this is a print * this is a special request, done only by main() * to implement the extension './ipfw sysctl', * a command that bypasses the normal getopt, and that * is available on those platforms that use this * sysctl emulation. * in this case, a negative oldlen signals that *oldp * is actually a FILE* to print somewhere else than stdout */ int l; int ret; struct dn_id* oid; struct sysctlhead* entry; char* pstring; char* pdata; FILE* fp; if((oldlenp != NULL) && (*oldlenp < 0)) fp = (FILE*)oldp; else fp = stdout; if(newp != NULL && newlen != 0) { //this is a set l = sizeof(struct dn_id) + sizeof(struct sysctlhead) + strlen(name)+1 + newlen; oid = malloc(l); if (oid == NULL) return -1; oid->len = l; oid->type = DN_SYSCTL_SET; oid->id = DN_API_VERSION; entry = (struct sysctlhead*)(oid+1); pdata = (char*)(entry+1); pstring = pdata + newlen; entry->blocklen = ((sizeof(struct sysctlhead) + strlen(name)+1 + newlen) + 3) & ~3; entry->namelen = strlen(name)+1; entry->flags = 0; entry->datalen = newlen; bcopy(newp, pdata, newlen); bcopy(name, pstring, strlen(name)+1); ret = do_cmd(IP_DUMMYNET3, oid, (uintptr_t)l); if (ret != 0) return -1; } else { //this is a get or a print l = sizeof(struct dn_id); oid = malloc(l); if (oid == NULL) return -1; oid->len = l; oid->type = DN_SYSCTL_GET; oid->id = DN_API_VERSION; ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); if (ret != 0) return -1; l=oid->id; free(oid); oid = malloc(l); if (oid == NULL) return -1; oid->len = l; oid->type = DN_SYSCTL_GET; oid->id = DN_API_VERSION; ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); if (ret != 0) return -1; entry = (struct sysctlhead*)(oid+1); while(entry->blocklen != 0) { pdata = (char*)(entry+1); pstring = pdata+entry->datalen; //time to check if this is a get or a print if(name != NULL && oldp != NULL && *oldlenp > 0) { //this is a get if(strcmp(name,pstring) == 0) { //match found, sanity chech on len if(*oldlenp < entry->datalen) { printf("%s error: buffer too small\n",__FUNCTION__); return -1; } *oldlenp = entry->datalen; bcopy(pdata, oldp, *oldlenp); return 0; } } else { //this is a print if( name == NULL ) goto print; if ( (strncmp(pstring,name,strlen(name)) == 0) && ( pstring[strlen(name)]=='\0' || pstring[strlen(name)]=='.' ) ) goto print; else goto skip; print: fprintf(fp, "%s: ",pstring); switch( entry->flags >> 2 ) { case SYSCTLTYPE_LONG: fprintf(fp, "%li ", *(long*)(pdata)); break; case SYSCTLTYPE_UINT: fprintf(fp, "%u ", *(unsigned int*)(pdata)); break; case SYSCTLTYPE_ULONG: fprintf(fp, "%lu ", *(unsigned long*)(pdata)); break; case SYSCTLTYPE_INT: default: fprintf(fp, "%i ", *(int*)(pdata)); } if( (entry->flags & 0x00000003) == CTLFLAG_RD ) fprintf(fp, "\t(read only)\n"); else fprintf(fp, "\n"); skip: ; } entry = (struct sysctlhead*)((unsigned char*)entry + entry->blocklen); } free(oid); return 0; } //fallback for invalid options return -1; #else /* __linux__ */ FILE *fp; char *basename = "/sys/module/ipfw_mod/parameters/"; char filename[256]; /* full filename */ char *varp; int ret = 0; /* return value */ long d; if (name == NULL) /* XXX set errno */ return -1; /* locate the filename */ varp = strrchr(name, '.'); if (varp == NULL) /* XXX set errno */ return -1; snprintf(filename, sizeof(filename), "%s%s", basename, varp+1); /* * XXX we could open the file here, in rw mode * but need to check if a file have write * permissions. */ /* check parameters */ if (oldp && oldlenp) { /* read mode */ fp = fopen(filename, "r"); if (fp == NULL) { fprintf(stderr, "%s fopen error reading filename %s\n", __FUNCTION__, filename); return -1; } if (fscanf(fp, "%ld", &d) != 1) { ret = -1; } else if (*oldlenp == sizeof(int)) { int dst = d; memcpy(oldp, &dst, *oldlenp); } else if (*oldlenp == sizeof(long)) { memcpy(oldp, &d, *oldlenp); } else { fprintf(stderr, "unknown paramerer len %d\n", (int)*oldlenp); } fclose(fp); } if (newp && newlen) { /* write */ fp = fopen(filename, "w"); if (fp == NULL) { fprintf(stderr, "%s fopen error writing filename %s\n", __FUNCTION__, filename); return -1; } if (newlen == sizeof(int)) { if (fprintf(fp, "%d", *(int *)newp) < 1) ret = -1; } else if (newlen == sizeof(long)) { if (fprintf(fp, "%ld", *(long *)newp) < 1) ret = -1; } else { fprintf(stderr, "unknown paramerer len %d\n", (int)newlen); } fclose(fp); } return ret; #endif /* __linux__ */ } #ifdef _WIN32 /* * On windows, set/getsockopt are mapped to DeviceIoControl() */ int wnd_setsockopt(int s, int level, int sopt_name, const void *optval, socklen_t optlen) { size_t len = sizeof (struct sockopt) + optlen; struct sockopt *sock; DWORD n; BOOL result; HANDLE _dev_h = (HANDLE)s; /* allocate a data structure for communication */ sock = malloc(len); if (sock == NULL) return -1; sock->sopt_dir = SOPT_SET; sock->sopt_name = sopt_name; sock->sopt_valsize = optlen; sock->sopt_val = (void *)(sock+1); memcpy(sock->sopt_val, optval, optlen); result = DeviceIoControl (_dev_h, IP_FW_SETSOCKOPT, sock, len, NULL, 0, &n, NULL); free (sock); return (result ? 0 : -1); } int wnd_getsockopt(int s, int level, int sopt_name, void *optval, socklen_t *optlen) { size_t len = sizeof (struct sockopt) + *optlen; struct sockopt *sock; DWORD n; BOOL result; HANDLE _dev_h = (HANDLE)s; sock = malloc(len); if (sock == NULL) return -1; sock->sopt_dir = SOPT_GET; sock->sopt_name = sopt_name; sock->sopt_valsize = *optlen; sock->sopt_val = (void *)(sock+1); memcpy (sock->sopt_val, optval, *optlen); result = DeviceIoControl (_dev_h, IP_FW_GETSOCKOPT, sock, len, sock, len, &n, NULL); //printf("len = %i, returned = %u, valsize = %i\n",len,n,sock->sopt_valsize); *optlen = sock->sopt_valsize; memcpy (optval, sock->sopt_val, *optlen); free (sock); return (result ? 0 : -1); } int my_socket(int domain, int ty, int proto) { TCHAR *pcCommPort = TEXT("\\\\.\\Ipfw"); HANDLE _dev_h = INVALID_HANDLE_VALUE; /* Special Handling For Accessing Device On Windows 2000 Terminal Server See Microsoft KB Article 259131 */ if (_dev_h == INVALID_HANDLE_VALUE) { _dev_h = CreateFile (pcCommPort, GENERIC_READ | GENERIC_WRITE, 0, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); } if (_dev_h == INVALID_HANDLE_VALUE) { printf("%s failed %u, cannot talk to kernel module\n", __FUNCTION__, (unsigned)GetLastError()); return -1; } return (int)_dev_h; } struct hostent* gethostbyname2(const char *name, int af) { return gethostbyname(name); } struct ether_addr* ether_aton(const char *a) { fprintf(stderr, "%s empty\n", __FUNCTION__); return NULL; } #ifdef TCC int opterr = 1, /* if error message should be printed */ optind = 1, /* index into parent argv vector */ optopt, /* character checked for validity */ optreset; /* reset getopt */ char *optarg; /* argument associated with option */ #define BADCH (int)'?' #define BADARG (int)':' #define EMSG "" #define PROGNAME "ipfw" /* * getopt -- * Parse argc/argv argument vector. */ int getopt(nargc, nargv, ostr) int nargc; char * const nargv[]; const char *ostr; { static char *place = EMSG; /* option letter processing */ char *oli; /* option letter list index */ if (optreset || *place == 0) { /* update scanning pointer */ optreset = 0; place = nargv[optind]; if (optind >= nargc || *place++ != '-') { /* Argument is absent or is not an option */ place = EMSG; return (-1); } optopt = *place++; if (optopt == '-' && *place == 0) { /* "--" => end of options */ ++optind; place = EMSG; return (-1); } if (optopt == 0) { /* Solitary '-', treat as a '-' option if the program (eg su) is looking for it. */ place = EMSG; if (strchr(ostr, '-') == NULL) return (-1); optopt = '-'; } } else optopt = *place++; /* See if option letter is one the caller wanted... */ if (optopt == ':' || (oli = strchr(ostr, optopt)) == NULL) { if (*place == 0) ++optind; if (opterr && *ostr != ':') (void)fprintf(stderr, "%s: illegal option -- %c\n", PROGNAME, optopt); return (BADCH); } /* Does this option need an argument? */ if (oli[1] != ':') { /* don't need argument */ optarg = NULL; if (*place == 0) ++optind; } else { /* Option-argument is either the rest of this argument or the entire next argument. */ if (*place) optarg = place; else if (nargc > ++optind) optarg = nargv[optind]; else { /* option-argument absent */ place = EMSG; if (*ostr == ':') return (BADARG); if (opterr) (void)fprintf(stderr, "%s: option requires an argument -- %c\n", PROGNAME, optopt); return (BADCH); } place = EMSG; ++optind; } return (optopt); /* return option letter */ } //static FILE *err_file = stderr; void verrx(int ex, int eval, const char *fmt, va_list ap) { fprintf(stderr, "%s: ", PROGNAME); if (fmt != NULL) vfprintf(stderr, fmt, ap); fprintf(stderr, "\n"); if (ex) exit(eval); } void errx(int eval, const char *fmt, ...) { va_list ap; va_start(ap, fmt); verrx(1, eval, fmt, ap); va_end(ap); } void warnx(const char *fmt, ...) { va_list ap; va_start(ap, fmt); verrx(0, 0, fmt, ap); va_end(ap); } char * strsep(char **stringp, const char *delim) { char *s; const char *spanp; int c, sc; char *tok; if ((s = *stringp) == NULL) return (NULL); for (tok = s;;) { c = *s++; spanp = delim; do { if ((sc = *spanp++) == c) { if (c == 0) s = NULL; else s[-1] = 0; *stringp = s; return (tok); } } while (sc != 0); } /* NOTREACHED */ } static unsigned char tolower(unsigned char c) { return (c >= 'A' && c <= 'Z') ? c + 'a' - 'A' : c; } static int isdigit(unsigned char c) { return (c >= '0' && c <= '9'); } static int isxdigit(unsigned char c) { return (strchr("0123456789ABCDEFabcdef", c) ? 1 : 0); } static int isspace(unsigned char c) { return (strchr(" \t\n\r", c) ? 1 : 0); } static int isascii(unsigned char c) { return (c < 128); } static int islower(unsigned char c) { return (c >= 'a' && c <= 'z'); } int strcasecmp(const char *s1, const char *s2) { const unsigned char *us1 = (const unsigned char *)s1, *us2 = (const unsigned char *)s2; while (tolower(*us1) == tolower(*us2++)) if (*us1++ == '\0') return (0); return (tolower(*us1) - tolower(*--us2)); } intmax_t strtoimax(const char * restrict nptr, char ** restrict endptr, int base) { return strtol(nptr, endptr,base); } void setservent(int a) { } #define NS_INADDRSZ 128 int inet_pton(int af, const char *src, void *dst) { static const char digits[] = "0123456789"; int saw_digit, octets, ch; u_char tmp[NS_INADDRSZ], *tp; if (af != AF_INET) { errno = EINVAL; return -1; } saw_digit = 0; octets = 0; *(tp = tmp) = 0; while ((ch = *src++) != '\0') { const char *pch; if ((pch = strchr(digits, ch)) != NULL) { u_int new = *tp * 10 + (pch - digits); if (saw_digit && *tp == 0) return (0); if (new > 255) return (0); *tp = new; if (!saw_digit) { if (++octets > 4) return (0); saw_digit = 1; } } else if (ch == '.' && saw_digit) { if (octets == 4) return (0); *++tp = 0; saw_digit = 0; } else return (0); } if (octets < 4) return (0); memcpy(dst, tmp, NS_INADDRSZ); return (1); } const char * inet_ntop(int af, const void *_src, char *dst, socklen_t size) { static const char fmt[] = "%u.%u.%u.%u"; char tmp[sizeof "255.255.255.255"]; const u_char *src = _src; int l; if (af != AF_INET) { errno = EINVAL; return NULL; } l = snprintf(tmp, sizeof(tmp), fmt, src[0], src[1], src[2], src[3]); if (l <= 0 || (socklen_t) l >= size) { errno = ENOSPC; return (NULL); } strlcpy(dst, tmp, size); return (dst); } /*% * Check whether "cp" is a valid ascii representation * of an Internet address and convert to a binary address. * Returns 1 if the address is valid, 0 if not. * This replaces inet_addr, the return value from which * cannot distinguish between failure and a local broadcast address. */ int inet_aton(const char *cp, struct in_addr *addr) { u_long val; int base, n; char c; u_int8_t parts[4]; u_int8_t *pp = parts; int digit; c = *cp; for (;;) { /* * Collect number up to ``.''. * Values are specified as for C: * 0x=hex, 0=octal, isdigit=decimal. */ if (!isdigit((unsigned char)c)) return (0); val = 0; base = 10; digit = 0; if (c == '0') { c = *++cp; if (c == 'x' || c == 'X') base = 16, c = *++cp; else { base = 8; digit = 1 ; } } for (;;) { if (isascii(c) && isdigit((unsigned char)c)) { if (base == 8 && (c == '8' || c == '9')) return (0); val = (val * base) + (c - '0'); c = *++cp; digit = 1; } else if (base == 16 && isascii(c) && isxdigit((unsigned char)c)) { val = (val << 4) | (c + 10 - (islower((unsigned char)c) ? 'a' : 'A')); c = *++cp; digit = 1; } else break; } if (c == '.') { /* * Internet format: * a.b.c.d * a.b.c (with c treated as 16 bits) * a.b (with b treated as 24 bits) */ if (pp >= parts + 3 || val > 0xffU) return (0); *pp++ = val; c = *++cp; } else break; } /* * Check for trailing characters. */ if (c != '\0' && (!isascii(c) || !isspace((unsigned char)c))) return (0); /* * Did we get a valid digit? */ if (!digit) return (0); /* * Concoct the address according to * the number of parts specified. */ n = pp - parts + 1; switch (n) { case 1: /*%< a -- 32 bits */ break; case 2: /*%< a.b -- 8.24 bits */ if (val > 0xffffffU) return (0); val |= parts[0] << 24; break; case 3: /*%< a.b.c -- 8.8.16 bits */ if (val > 0xffffU) return (0); val |= (parts[0] << 24) | (parts[1] << 16); break; case 4: /*%< a.b.c.d -- 8.8.8.8 bits */ if (val > 0xffU) return (0); val |= (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8); break; } if (addr != NULL) addr->s_addr = htonl(val); return (1); } #endif /* TCC */ #endif /* _WIN32 */ ================================================ FILE: ipfw/humanize_number.c ================================================ /* $NetBSD: humanize_number.c,v 1.13 2007/12/14 17:26:19 christos Exp $ */ /* * Copyright (c) 1997, 1998, 1999, 2002 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, by Luke Mewburn and by Tomas Svensson. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the NetBSD * Foundation, Inc. and its contributors. * 4. Neither the name of The NetBSD Foundation nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ // #include __FBSDID("$FreeBSD: src/lib/libutil/humanize_number.c,v 1.2.10.1 2008/04/20 16:29:01 antoine Exp $"); #include #include #include #include #include #include // #include //#include int humanize_number(char *buf, size_t len, int64_t bytes, const char *suffix, int scale, int flags) { const char *prefixes, *sep; int b, i, r, maxscale, s1, s2, sign; int64_t divisor, max; size_t baselen; assert(buf != NULL); assert(suffix != NULL); assert(scale >= 0); if (flags & HN_DIVISOR_1000) { /* SI for decimal multiplies */ divisor = 1000; if (flags & HN_B) prefixes = "B\0k\0M\0G\0T\0P\0E"; else prefixes = "\0\0k\0M\0G\0T\0P\0E"; } else { /* * binary multiplies * XXX IEC 60027-2 recommends Ki, Mi, Gi... */ divisor = 1024; if (flags & HN_B) prefixes = "B\0K\0M\0G\0T\0P\0E"; else prefixes = "\0\0K\0M\0G\0T\0P\0E"; } #define SCALE2PREFIX(scale) (&prefixes[(scale) << 1]) maxscale = 7; if (scale >= maxscale && (scale & (HN_AUTOSCALE | HN_GETSCALE)) == 0) return (-1); if (buf == NULL || suffix == NULL) return (-1); if (len > 0) buf[0] = '\0'; if (bytes < 0) { sign = -1; bytes *= -100; baselen = 3; /* sign, digit, prefix */ } else { sign = 1; bytes *= 100; baselen = 2; /* digit, prefix */ } if (flags & HN_NOSPACE) sep = ""; else { sep = " "; baselen++; } baselen += strlen(suffix); /* Check if enough room for `x y' + suffix + `\0' */ if (len < baselen + 1) return (-1); if (scale & (HN_AUTOSCALE | HN_GETSCALE)) { /* See if there is additional columns can be used. */ for (max = 100, i = len - baselen; i-- > 0;) max *= 10; /* * Divide the number until it fits the given column. * If there will be an overflow by the rounding below, * divide once more. */ for (i = 0; bytes >= max - 50 && i < maxscale; i++) bytes /= divisor; if (scale & HN_GETSCALE) return (i); } else for (i = 0; i < scale && i < maxscale; i++) bytes /= divisor; /* If a value <= 9.9 after rounding and ... */ if (bytes < 995 && i > 0 && flags & HN_DECIMAL) { /* baselen + \0 + .N */ if (len < baselen + 1 + 2) return (-1); b = ((int)bytes + 5) / 10; s1 = b / 10; s2 = b % 10; r = snprintf(buf, len, "%d%s%d%s%s%s", sign * s1, ".", s2, sep, SCALE2PREFIX(i), suffix); } else r = snprintf(buf, len, "%" PRId64 "%s%s%s", sign * ((bytes + 50) / 100), sep, SCALE2PREFIX(i), suffix); return (r); } ================================================ FILE: ipfw/include/alias.h ================================================ #ifndef _ALIAS_H_ #define _ALIAS_H_ #define LIBALIAS_BUF_SIZE 128 /* * If PKT_ALIAS_LOG is set, a message will be printed to /var/log/alias.log * every time a link is created or deleted. This is useful for debugging. */ #define PKT_ALIAS_LOG 0x01 /* * If PKT_ALIAS_DENY_INCOMING is set, then incoming connections (e.g. to ftp, * telnet or web servers will be prevented by the aliasing mechanism. */ #define PKT_ALIAS_DENY_INCOMING 0x02 /* * If PKT_ALIAS_SAME_PORTS is set, packets will be attempted sent from the * same port as they originated on. This allows e.g. rsh to work *99% of the * time*, but _not_ 100% (it will be slightly flakey instead of not working * at all). This mode bit is set by PacketAliasInit(), so it is a default * mode of operation. */ #define PKT_ALIAS_SAME_PORTS 0x04 /* * If PKT_ALIAS_USE_SOCKETS is set, then when partially specified links (e.g. * destination port and/or address is zero), the packet aliasing engine will * attempt to allocate a socket for the aliasing port it chooses. This will * avoid interference with the host machine. Fully specified links do not * require this. This bit is set after a call to PacketAliasInit(), so it is * a default mode of operation. */ #ifndef NO_USE_SOCKETS #define PKT_ALIAS_USE_SOCKETS 0x08 #endif /*- * If PKT_ALIAS_UNREGISTERED_ONLY is set, then only packets with * unregistered source addresses will be aliased. Private * addresses are those in the following ranges: * * 10.0.0.0 -> 10.255.255.255 * 172.16.0.0 -> 172.31.255.255 * 192.168.0.0 -> 192.168.255.255 */ #define PKT_ALIAS_UNREGISTERED_ONLY 0x10 /* * If PKT_ALIAS_RESET_ON_ADDR_CHANGE is set, then the table of dynamic * aliasing links will be reset whenever PacketAliasSetAddress() changes the * default aliasing address. If the default aliasing address is left * unchanged by this function call, then the table of dynamic aliasing links * will be left intact. This bit is set after a call to PacketAliasInit(). */ #define PKT_ALIAS_RESET_ON_ADDR_CHANGE 0x20 /* * If PKT_ALIAS_PROXY_ONLY is set, then NAT will be disabled and only * transparent proxying is performed. */ #define PKT_ALIAS_PROXY_ONLY 0x40 /* * If PKT_ALIAS_REVERSE is set, the actions of PacketAliasIn() and * PacketAliasOut() are reversed. */ #define PKT_ALIAS_REVERSE 0x80 #endif /* !_ALIAS_H_ */ ================================================ FILE: ipfw/include/net/if_dl.h ================================================ /*- * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_dl.h 8.1 (Berkeley) 6/10/93 * $FreeBSD: src/sys/net/if_dl.h,v 1.14 2005/01/07 01:45:34 imp Exp $ */ #ifndef _NET_IF_DL_H_ #define _NET_IF_DL_H_ /* * A Link-Level Sockaddr may specify the interface in one of two * ways: either by means of a system-provided index number (computed * anew and possibly differently on every reboot), or by a human-readable * string such as "il0" (for managerial convenience). * * Census taking actions, such as something akin to SIOCGCONF would return * both the index and the human name. * * High volume transactions (such as giving a link-level ``from'' address * in a recvfrom or recvmsg call) may be likely only to provide the indexed * form, (which requires fewer copy operations and less space). * * The form and interpretation of the link-level address is purely a matter * of convention between the device driver and its consumers; however, it is * expected that all drivers for an interface of a given if_type will agree. */ /* * Structure of a Link-Level sockaddr: */ struct sockaddr_dl { u_char sdl_len; /* Total length of sockaddr */ u_char sdl_family; /* AF_LINK */ u_short sdl_index; /* if != 0, system given index for interface */ u_char sdl_type; /* interface type */ u_char sdl_nlen; /* interface name length, no trailing 0 reqd. */ u_char sdl_alen; /* link level address length */ u_char sdl_slen; /* link layer selector length */ char sdl_data[46]; /* minimum work area, can be larger; contains both if name and ll address */ }; #define LLADDR(s) ((caddr_t)((s)->sdl_data + (s)->sdl_nlen)) #ifndef _KERNEL #include __BEGIN_DECLS void link_addr(const char *, struct sockaddr_dl *); char *link_ntoa(const struct sockaddr_dl *); __END_DECLS #endif /* !_KERNEL */ #endif ================================================ FILE: ipfw/include/net/pfvar.h ================================================ #ifndef _PF_VAR_H_ #define _PF_VAR_H_ /* * replacement for FreeBSD's pfqueue.h */ #include #define DIOCSTARTALTQ _IO ('D', 42) #define DIOCSTOPALTQ _IO ('D', 43) struct pf_altq { TAILQ_ENTRY(pf_altq) entries; /* ... */ u_int32_t qid; /* return value */ #define PF_QNAME_SIZE 64 char qname[PF_QNAME_SIZE]; /* queue name */ }; struct pfioc_altq { u_int32_t action; u_int32_t ticket; u_int32_t nr; struct pf_altq altq; }; #define DIOCGETALTQS _IOWR('D', 47, struct pfioc_altq) #define DIOCGETALTQ _IOWR('D', 48, struct pfioc_altq) #endif /* !_PF_VAR_H */ ================================================ FILE: ipfw/include/timeconv.h ================================================ /* * simple override for _long_to_time() */ #ifndef _TIMECONV_H_ #define _TIMECONV_H_ static __inline time_t _long_to_time(long tlong) { if (sizeof(long) == sizeof(__int32_t)) return((time_t)(__int32_t)(tlong)); return((time_t)tlong); } #endif /* _TIMECONV_H_ */ ================================================ FILE: ipfw/ipfw.8 ================================================ .\" .\" $FreeBSD$ .\" .Dd October 25, 2012 .Dt IPFW 8 .Os .Sh NAME .Nm ipfw .Nd User interface for firewall, traffic shaper, packet scheduler, in-kernel NAT. .Sh SYNOPSIS .Ss FIREWALL CONFIGURATION .Nm .Op Fl cq .Cm add .Ar rule .Nm .Op Fl acdefnNStT .Op Cm set Ar N .Brq Cm list | show .Op Ar rule | first-last ... .Nm .Op Fl f | q .Op Cm set Ar N .Cm flush .Nm .Op Fl q .Op Cm set Ar N .Brq Cm delete | zero | resetlog .Op Ar number ... .Pp .Nm .Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ... .Nm .Cm set move .Op Cm rule .Ar number Cm to Ar number .Nm .Cm set swap Ar number number .Nm .Cm set show .Ss SYSCTL SHORTCUTS .Nm .Cm enable .Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive .Nm .Cm disable .Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive .Ss LOOKUP TABLES .Nm .Cm table Ar number Cm add Ar addr Ns Oo / Ns Ar masklen Oc Op Ar value .Nm .Cm table Ar number Cm delete Ar addr Ns Op / Ns Ar masklen .Nm .Cm table .Brq Ar number | all .Cm flush .Nm .Cm table .Brq Ar number | all .Cm list .Ss DUMMYNET CONFIGURATION (TRAFFIC SHAPER AND PACKET SCHEDULER) .Nm .Brq Cm pipe | queue | sched .Ar number .Cm config .Ar config-options .Nm .Op Fl s Op Ar field .Brq Cm pipe | queue | sched .Brq Cm delete | list | show .Op Ar number ... .Ss IN-KERNEL NAT .Nm .Op Fl q .Cm nat .Ar number .Cm config .Ar config-options .Pp .Nm .Op Fl cfnNqS .Oo .Fl p Ar preproc .Oo .Ar preproc-flags .Oc .Oc .Ar pathname .Sh DESCRIPTION The .Nm utility is the user interface for controlling the .Xr ipfw 4 firewall, the .Xr dummynet 4 traffic shaper/packet scheduler, and the in-kernel NAT services. .Pp A firewall configuration, or .Em ruleset , is made of a list of .Em rules numbered from 1 to 65535. Packets are passed to the firewall from a number of different places in the protocol stack (depending on the source and destination of the packet, it is possible for the firewall to be invoked multiple times on the same packet). The packet passed to the firewall is compared against each of the rules in the .Em ruleset , in rule-number order (multiple rules with the same number are permitted, in which case they are processed in order of insertion). When a match is found, the action corresponding to the matching rule is performed. .Pp Depending on the action and certain system settings, packets can be reinjected into the firewall at some rule after the matching one for further processing. .Pp A ruleset always includes a .Em default rule (numbered 65535) which cannot be modified or deleted, and matches all packets. The action associated with the .Em default rule can be either .Cm deny or .Cm allow depending on how the kernel is configured. .Pp If the ruleset includes one or more rules with the .Cm keep-state or .Cm limit option, the firewall will have a .Em stateful behaviour, i.e., upon a match it will create .Em dynamic rules , i.e., rules that match packets with the same 5-tuple (protocol, source and destination addresses and ports) as the packet which caused their creation. Dynamic rules, which have a limited lifetime, are checked at the first occurrence of a .Cm check-state , .Cm keep-state or .Cm limit rule, and are typically used to open the firewall on-demand to legitimate traffic only. See the .Sx STATEFUL FIREWALL and .Sx EXAMPLES Sections below for more information on the stateful behaviour of .Nm . .Pp All rules (including dynamic ones) have a few associated counters: a packet count, a byte count, a log count and a timestamp indicating the time of the last match. Counters can be displayed or reset with .Nm commands. .Pp Each rule belongs to one of 32 different .Em sets , and there are .Nm commands to atomically manipulate sets, such as enable, disable, swap sets, move all rules in a set to another one, delete all rules in a set. These can be useful to install temporary configurations, or to test them. See Section .Sx SETS OF RULES for more information on .Em sets . .Pp Rules can be added with the .Cm add command; deleted individually or in groups with the .Cm delete command, and globally (except those in set 31) with the .Cm flush command; displayed, optionally with the content of the counters, using the .Cm show and .Cm list commands. Finally, counters can be reset with the .Cm zero and .Cm resetlog commands. .Pp .Ss COMMAND OPTIONS The following general options are available when invoking .Nm : .Bl -tag -width indent .It Fl a Show counter values when listing rules. The .Cm show command implies this option. .It Fl b Only show the action and the comment, not the body of a rule. Implies .Fl c . .It Fl c When entering or showing rules, print them in compact form, i.e., omitting the "ip from any to any" string when this does not carry any additional information. .It Fl d When listing, show dynamic rules in addition to static ones. .It Fl e When listing and .Fl d is specified, also show expired dynamic rules. .It Fl f Do not ask for confirmation for commands that can cause problems if misused, i.e., .Cm flush . If there is no tty associated with the process, this is implied. .It Fl i When listing a table (see the .Sx LOOKUP TABLES section below for more information on lookup tables), format values as IP addresses. By default, values are shown as integers. .It Fl n Only check syntax of the command strings, without actually passing them to the kernel. .It Fl N Try to resolve addresses and service names in output. .It Fl q Be quiet when executing the .Cm add , .Cm nat , .Cm zero , .Cm resetlog or .Cm flush commands; (implies .Fl f ) . This is useful when updating rulesets by executing multiple .Nm commands in a script (e.g., .Ql sh\ /etc/rc.firewall ) , or by processing a file with many .Nm rules across a remote login session. It also stops a table add or delete from failing if the entry already exists or is not present. .Pp The reason why this option may be important is that for some of these actions, .Nm may print a message; if the action results in blocking the traffic to the remote client, the remote login session will be closed and the rest of the ruleset will not be processed. Access to the console would then be required to recover. .It Fl S When listing rules, show the .Em set each rule belongs to. If this flag is not specified, disabled rules will not be listed. .It Fl s Op Ar field When listing pipes, sort according to one of the four counters (total or current packets or bytes). .It Fl t When listing, show last match timestamp converted with ctime(). .It Fl T When listing, show last match timestamp as seconds from the epoch. This form can be more convenient for postprocessing by scripts. .El .Ss LIST OF RULES AND PREPROCESSING To ease configuration, rules can be put into a file which is processed using .Nm as shown in the last synopsis line. An absolute .Ar pathname must be used. The file will be read line by line and applied as arguments to the .Nm utility. .Pp Optionally, a preprocessor can be specified using .Fl p Ar preproc where .Ar pathname is to be piped through. Useful preprocessors include .Xr cpp 1 and .Xr m4 1 . If .Ar preproc does not start with a slash .Pq Ql / as its first character, the usual .Ev PATH name search is performed. Care should be taken with this in environments where not all file systems are mounted (yet) by the time .Nm is being run (e.g.\& when they are mounted over NFS). Once .Fl p has been specified, any additional arguments are passed on to the preprocessor for interpretation. This allows for flexible configuration files (like conditionalizing them on the local hostname) and the use of macros to centralize frequently required arguments like IP addresses. .Ss TRAFFIC SHAPER CONFIGURATION The .Nm .Cm pipe , queue and .Cm sched commands are used to configure the traffic shaper and packet scheduler. See the .Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION Section below for details. .Pp If the world and the kernel get out of sync the .Nm ABI may break, preventing you from being able to add any rules. This can adversely affect the booting process. You can use .Nm .Cm disable .Cm firewall to temporarily disable the firewall to regain access to the network, allowing you to fix the problem. .Sh PACKET FLOW A packet is checked against the active ruleset in multiple places in the protocol stack, under control of several sysctl variables. These places and variables are shown below, and it is important to have this picture in mind in order to design a correct ruleset. .Bd -literal -offset indent ^ to upper layers V | | +----------->-----------+ ^ V [ip(6)_input] [ip(6)_output] net.inet(6).ip(6).fw.enable=1 | | ^ V [ether_demux] [ether_output_frame] net.link.ether.ipfw=1 | | +-->--[bdg_forward]-->--+ net.link.bridge.ipfw=1 ^ V | to devices | .Ed .Pp The number of times the same packet goes through the firewall can vary between 0 and 4 depending on packet source and destination, and system configuration. .Pp Note that as packets flow through the stack, headers can be stripped or added to it, and so they may or may not be available for inspection. E.g., incoming packets will include the MAC header when .Nm is invoked from .Cm ether_demux() , but the same packets will have the MAC header stripped off when .Nm is invoked from .Cm ip_input() or .Cm ip6_input() . .Pp Also note that each packet is always checked against the complete ruleset, irrespective of the place where the check occurs, or the source of the packet. If a rule contains some match patterns or actions which are not valid for the place of invocation (e.g.\& trying to match a MAC header within .Cm ip_input or .Cm ip6_input ), the match pattern will not match, but a .Cm not operator in front of such patterns .Em will cause the pattern to .Em always match on those packets. It is thus the responsibility of the programmer, if necessary, to write a suitable ruleset to differentiate among the possible places. .Cm skipto rules can be useful here, as an example: .Bd -literal -offset indent # packets from ether_demux or bdg_forward ipfw add 10 skipto 1000 all from any to any layer2 in # packets from ip_input ipfw add 10 skipto 2000 all from any to any not layer2 in # packets from ip_output ipfw add 10 skipto 3000 all from any to any not layer2 out # packets from ether_output_frame ipfw add 10 skipto 4000 all from any to any layer2 out .Ed .Pp (yes, at the moment there is no way to differentiate between ether_demux and bdg_forward). .Sh SYNTAX In general, each keyword or argument must be provided as a separate command line argument, with no leading or trailing spaces. Keywords are case-sensitive, whereas arguments may or may not be case-sensitive depending on their nature (e.g.\& uid's are, hostnames are not). .Pp Some arguments (e.g., port or address lists) are comma-separated lists of values. In this case, spaces after commas ',' are allowed to make the line more readable. You can also put the entire command (including flags) into a single argument. E.g., the following forms are equivalent: .Bd -literal -offset indent ipfw -q add deny src-ip 10.0.0.0/24,127.0.0.1/8 ipfw -q add deny src-ip 10.0.0.0/24, 127.0.0.1/8 ipfw "-q add deny src-ip 10.0.0.0/24, 127.0.0.1/8" .Ed .Sh RULE FORMAT The format of firewall rules is the following: .Bd -ragged -offset indent .Bk -words .Op Ar rule_number .Op Cm set Ar set_number .Op Cm prob Ar match_probability .Ar action .Op Cm log Op Cm logamount Ar number .Op Cm altq Ar queue .Oo .Bro Cm tag | untag .Brc Ar number .Oc .Ar body .Ek .Ed .Pp where the body of the rule specifies which information is used for filtering packets, among the following: .Pp .Bl -tag -width "Source and dest. addresses and ports" -offset XXX -compact .It Layer-2 header fields When available .It IPv4 and IPv6 Protocol TCP, UDP, ICMP, etc. .It Source and dest. addresses and ports .It Direction See Section .Sx PACKET FLOW .It Transmit and receive interface By name or address .It Misc. IP header fields Version, type of service, datagram length, identification, fragment flag (non-zero IP offset), Time To Live .It IP options .It IPv6 Extension headers Fragmentation, Hop-by-Hop options, Routing Headers, Source routing rthdr0, Mobile IPv6 rthdr2, IPSec options. .It IPv6 Flow-ID .It Misc. TCP header fields TCP flags (SYN, FIN, ACK, RST, etc.), sequence number, acknowledgment number, window .It TCP options .It ICMP types for ICMP packets .It ICMP6 types for ICMP6 packets .It User/group ID When the packet can be associated with a local socket. .It Divert status Whether a packet came from a divert socket (e.g., .Xr natd 8 ) . .It Fib annotation state Whether a packet has been tagged for using a specific FIB (routing table) in future forwarding decisions. .El .Pp Note that some of the above information, e.g.\& source MAC or IP addresses and TCP/UDP ports, can be easily spoofed, so filtering on those fields alone might not guarantee the desired results. .Bl -tag -width indent .It Ar rule_number Each rule is associated with a .Ar rule_number in the range 1..65535, with the latter reserved for the .Em default rule. Rules are checked sequentially by rule number. Multiple rules can have the same number, in which case they are checked (and listed) according to the order in which they have been added. If a rule is entered without specifying a number, the kernel will assign one in such a way that the rule becomes the last one before the .Em default rule. Automatic rule numbers are assigned by incrementing the last non-default rule number by the value of the sysctl variable .Ar net.inet.ip.fw.autoinc_step which defaults to 100. If this is not possible (e.g.\& because we would go beyond the maximum allowed rule number), the number of the last non-default value is used instead. .It Cm set Ar set_number Each rule is associated with a .Ar set_number in the range 0..31. Sets can be individually disabled and enabled, so this parameter is of fundamental importance for atomic ruleset manipulation. It can be also used to simplify deletion of groups of rules. If a rule is entered without specifying a set number, set 0 will be used. .br Set 31 is special in that it cannot be disabled, and rules in set 31 are not deleted by the .Nm ipfw flush command (but you can delete them with the .Nm ipfw delete set 31 command). Set 31 is also used for the .Em default rule. .It Cm prob Ar match_probability A match is only declared with the specified probability (floating point number between 0 and 1). This can be useful for a number of applications such as random packet drop or (in conjunction with .Nm dummynet ) to simulate the effect of multiple paths leading to out-of-order packet delivery. .Pp Note: this condition is checked before any other condition, including ones such as keep-state or check-state which might have side effects. .It Cm log Op Cm logamount Ar number Packets matching a rule with the .Cm log keyword will be made available for logging in two ways: if the sysctl variable .Va net.inet.ip.fw.verbose is set to 0 (default), one can use .Xr bpf 4 attached to the .Li ipfw0 pseudo interface. This pseudo interface can be created after a boot manually by using the following command: .Bd -literal -offset indent # ifconfig ipfw0 create .Ed .Pp Or, automatically at boot time by adding the following line to the .Xr rc.conf 5 file: .Bd -literal -offset indent firewall_logif="YES" .Ed .Pp There is no overhead if no .Xr bpf 4 is attached to the pseudo interface. .Pp If .Va net.inet.ip.fw.verbose is set to 1, packets will be logged to .Xr syslogd 8 with a .Dv LOG_SECURITY facility up to a maximum of .Cm logamount packets. If no .Cm logamount is specified, the limit is taken from the sysctl variable .Va net.inet.ip.fw.verbose_limit . In both cases, a value of 0 means unlimited logging. .Pp Once the limit is reached, logging can be re-enabled by clearing the logging counter or the packet counter for that entry, see the .Cm resetlog command. .Pp Note: logging is done after all other packet matching conditions have been successfully verified, and before performing the final action (accept, deny, etc.) on the packet. .It Cm tag Ar number When a packet matches a rule with the .Cm tag keyword, the numeric tag for the given .Ar number in the range 1..65534 will be attached to the packet. The tag acts as an internal marker (it is not sent out over the wire) that can be used to identify these packets later on. This can be used, for example, to provide trust between interfaces and to start doing policy-based filtering. A packet can have multiple tags at the same time. Tags are "sticky", meaning once a tag is applied to a packet by a matching rule it exists until explicit removal. Tags are kept with the packet everywhere within the kernel, but are lost when packet leaves the kernel, for example, on transmitting packet out to the network or sending packet to a .Xr divert 4 socket. .Pp To check for previously applied tags, use the .Cm tagged rule option. To delete previously applied tag, use the .Cm untag keyword. .Pp Note: since tags are kept with the packet everywhere in kernelspace, they can be set and unset anywhere in the kernel network subsystem (using the .Xr mbuf_tags 9 facility), not only by means of the .Xr ipfw 4 .Cm tag and .Cm untag keywords. For example, there can be a specialized .Xr netgraph 4 node doing traffic analyzing and tagging for later inspecting in firewall. .It Cm untag Ar number When a packet matches a rule with the .Cm untag keyword, the tag with the number .Ar number is searched among the tags attached to this packet and, if found, removed from it. Other tags bound to packet, if present, are left untouched. .It Cm altq Ar queue When a packet matches a rule with the .Cm altq keyword, the ALTQ identifier for the given .Ar queue (see .Xr altq 4 ) will be attached. Note that this ALTQ tag is only meaningful for packets going "out" of IPFW, and not being rejected or going to divert sockets. Note that if there is insufficient memory at the time the packet is processed, it will not be tagged, so it is wise to make your ALTQ "default" queue policy account for this. If multiple .Cm altq rules match a single packet, only the first one adds the ALTQ classification tag. In doing so, traffic may be shaped by using .Cm count Cm altq Ar queue rules for classification early in the ruleset, then later applying the filtering decision. For example, .Cm check-state and .Cm keep-state rules may come later and provide the actual filtering decisions in addition to the fallback ALTQ tag. .Pp You must run .Xr pfctl 8 to set up the queues before IPFW will be able to look them up by name, and if the ALTQ disciplines are rearranged, the rules in containing the queue identifiers in the kernel will likely have gone stale and need to be reloaded. Stale queue identifiers will probably result in misclassification. .Pp All system ALTQ processing can be turned on or off via .Nm .Cm enable Ar altq and .Nm .Cm disable Ar altq . The usage of .Va net.inet.ip.fw.one_pass is irrelevant to ALTQ traffic shaping, as the actual rule action is followed always after adding an ALTQ tag. .El .Ss RULE ACTIONS A rule can be associated with one of the following actions, which will be executed when the packet matches the body of the rule. .Bl -tag -width indent .It Cm allow | accept | pass | permit Allow packets that match rule. The search terminates. .It Cm check-state Checks the packet against the dynamic ruleset. If a match is found, execute the action associated with the rule which generated this dynamic rule, otherwise move to the next rule. .br .Cm Check-state rules do not have a body. If no .Cm check-state rule is found, the dynamic ruleset is checked at the first .Cm keep-state or .Cm limit rule. .It Cm count Update counters for all packets that match rule. The search continues with the next rule. .It Cm deny | drop Discard packets that match this rule. The search terminates. .It Cm divert Ar port Divert packets that match this rule to the .Xr divert 4 socket bound to port .Ar port . The search terminates. .It Cm fwd | forward Ar ipaddr | tablearg Ns Op , Ns Ar port Change the next-hop on matching packets to .Ar ipaddr , which can be an IP address or a host name. For IPv4, the next hop can also be supplied by the last table looked up for the packet by using the .Cm tablearg keyword instead of an explicit address. The search terminates if this rule matches. .Pp If .Ar ipaddr is a local address, then matching packets will be forwarded to .Ar port (or the port number in the packet if one is not specified in the rule) on the local machine. .br If .Ar ipaddr is not a local address, then the port number (if specified) is ignored, and the packet will be forwarded to the remote address, using the route as found in the local routing table for that IP. .br A .Ar fwd rule will not match layer-2 packets (those received on ether_input, ether_output, or bridged). .br The .Cm fwd action does not change the contents of the packet at all. In particular, the destination address remains unmodified, so packets forwarded to another system will usually be rejected by that system unless there is a matching rule on that system to capture them. For packets forwarded locally, the local address of the socket will be set to the original destination address of the packet. This makes the .Xr netstat 1 entry look rather weird but is intended for use with transparent proxy servers. .It Cm nat Ar nat_nr | tablearg Pass packet to a nat instance (for network address translation, address redirect, etc.): see the .Sx NETWORK ADDRESS TRANSLATION (NAT) Section for further information. .It Cm pipe Ar pipe_nr Pass packet to a .Nm dummynet .Dq pipe (for bandwidth limitation, delay, etc.). See the .Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION Section for further information. The search terminates; however, on exit from the pipe and if the .Xr sysctl 8 variable .Va net.inet.ip.fw.one_pass is not set, the packet is passed again to the firewall code starting from the next rule. .It Cm queue Ar queue_nr Pass packet to a .Nm dummynet .Dq queue (for bandwidth limitation using WF2Q+). .It Cm reject (Deprecated). Synonym for .Cm unreach host . .It Cm reset Discard packets that match this rule, and if the packet is a TCP packet, try to send a TCP reset (RST) notice. The search terminates. .It Cm reset6 Discard packets that match this rule, and if the packet is a TCP packet, try to send a TCP reset (RST) notice. The search terminates. .It Cm skipto Ar number | tablearg Skip all subsequent rules numbered less than .Ar number . The search continues with the first rule numbered .Ar number or higher. It is possible to use the .Cm tablearg keyword with a skipto for a .Em computed skipto, but care should be used, as no destination caching is possible in this case so the rules are always walked to find it, starting from the .Cm skipto . .It Cm call Ar number | tablearg The current rule number is saved in the internal stack and ruleset processing continues with the first rule numbered .Ar number or higher. If later a rule with the .Cm return action is encountered, the processing returns to the first rule with number of this .Cm call rule plus one or higher (the same behaviour as with packets returning from .Xr divert 4 socket after a .Cm divert action). This could be used to make somewhat like an assembly language .Dq subroutine calls to rules with common checks for different interfaces, etc. .Pp Rule with any number could be called, not just forward jumps as with .Cm skipto . So, to prevent endless loops in case of mistakes, both .Cm call and .Cm return actions don't do any jumps and simply go to the next rule if memory cannot be allocated or stack overflowed/underflowed. .Pp Internally stack for rule numbers is implemented using .Xr mbuf_tags 9 facility and currently has size of 16 entries. As mbuf tags are lost when packet leaves the kernel, .Cm divert should not be used in subroutines to avoid endless loops and other undesired effects. .It Cm return Takes rule number saved to internal stack by the last .Cm call action and returns ruleset processing to the first rule with number greater than number of corresponding .Cm call rule. See description of the .Cm call action for more details. .Pp Note that .Cm return rules usually end a .Dq subroutine and thus are unconditional, but .Nm command-line utility currently requires every action except .Cm check-state to have body. While it is sometimes useful to return only on some packets, usually you want to print just .Dq return for readability. A workaround for this is to use new syntax and .Fl c switch: .Bd -literal -offset indent # Add a rule without actual body ipfw add 2999 return via any # List rules without "from any to any" part ipfw -c list .Ed .Pp This cosmetic annoyance may be fixed in future releases. .It Cm tee Ar port Send a copy of packets matching this rule to the .Xr divert 4 socket bound to port .Ar port . The search continues with the next rule. .It Cm unreach Ar code Discard packets that match this rule, and try to send an ICMP unreachable notice with code .Ar code , where .Ar code is a number from 0 to 255, or one of these aliases: .Cm net , host , protocol , port , .Cm needfrag , srcfail , net-unknown , host-unknown , .Cm isolated , net-prohib , host-prohib , tosnet , .Cm toshost , filter-prohib , host-precedence or .Cm precedence-cutoff . The search terminates. .It Cm unreach6 Ar code Discard packets that match this rule, and try to send an ICMPv6 unreachable notice with code .Ar code , where .Ar code is a number from 0, 1, 3 or 4, or one of these aliases: .Cm no-route, admin-prohib, address or .Cm port . The search terminates. .It Cm netgraph Ar cookie Divert packet into netgraph with given .Ar cookie . The search terminates. If packet is later returned from netgraph it is either accepted or continues with the next rule, depending on .Va net.inet.ip.fw.one_pass sysctl variable. .It Cm ngtee Ar cookie A copy of packet is diverted into netgraph, original packet continues with the next rule. See .Xr ng_ipfw 4 for more information on .Cm netgraph and .Cm ngtee actions. .It Cm setfib Ar fibnum | tablearg The packet is tagged so as to use the FIB (routing table) .Ar fibnum in any subsequent forwarding decisions. In the current implementation, this is limited to the values 0 through 15, see .Xr setfib 2 . Processing continues at the next rule. It is possible to use the .Cm tablearg keyword with setfib. If the tablearg value is not within the compiled range of fibs, the packet's fib is set to 0. .It Cm setdscp Ar DSCP | number | tablearg Set specified DiffServ codepoint for an IPv4/IPv6 packet. Processing continues at the next rule. Supported values are: .Pp .Cm CS0 .Pq Dv 000000 , .Cm CS1 .Pq Dv 001000 , .Cm CS2 .Pq Dv 010000 , .Cm CS3 .Pq Dv 011000 , .Cm CS4 .Pq Dv 100000 , .Cm CS5 .Pq Dv 101000 , .Cm CS6 .Pq Dv 110000 , .Cm CS7 .Pq Dv 111000 , .Cm AF11 .Pq Dv 001010 , .Cm AF12 .Pq Dv 001100 , .Cm AF13 .Pq Dv 001110 , .Cm AF21 .Pq Dv 010010 , .Cm AF22 .Pq Dv 010100 , .Cm AF23 .Pq Dv 010110 , .Cm AF31 .Pq Dv 011010 , .Cm AF32 .Pq Dv 011100 , .Cm AF33 .Pq Dv 011110 , .Cm AF41 .Pq Dv 100010 , .Cm AF42 .Pq Dv 100100 , .Cm AF43 .Pq Dv 100110 , .Cm EF .Pq Dv 101110 , .Cm BE .Pq Dv 000000 . Additionally, DSCP value can be specified by number (0..64). It is also possible to use the .Cm tablearg keyword with setdscp. If the tablearg value is not within the 0..64 range, lower 6 bits of supplied value are used. .It Cm reass Queue and reassemble IP fragments. If the packet is not fragmented, counters are updated and processing continues with the next rule. If the packet is the last logical fragment, the packet is reassembled and, if .Va net.inet.ip.fw.one_pass is set to 0, processing continues with the next rule. Otherwise, the packet is allowed to pass and the search terminates. If the packet is a fragment in the middle of a logical group of fragments, it is consumed and processing stops immediately. .Pp Fragment handling can be tuned via .Va net.inet.ip.maxfragpackets and .Va net.inet.ip.maxfragsperpacket which limit, respectively, the maximum number of processable fragments (default: 800) and the maximum number of fragments per packet (default: 16). .Pp NOTA BENE: since fragments do not contain port numbers, they should be avoided with the .Nm reass rule. Alternatively, direction-based (like .Nm in / .Nm out ) and source-based (like .Nm via ) match patterns can be used to select fragments. .Pp Usually a simple rule like: .Bd -literal -offset indent # reassemble incoming fragments ipfw add reass all from any to any in .Ed .Pp is all you need at the beginning of your ruleset. .El .Ss RULE BODY The body of a rule contains zero or more patterns (such as specific source and destination addresses or ports, protocol options, incoming or outgoing interfaces, etc.) that the packet must match in order to be recognised. In general, the patterns are connected by (implicit) .Cm and operators -- i.e., all must match in order for the rule to match. Individual patterns can be prefixed by the .Cm not operator to reverse the result of the match, as in .Pp .Dl "ipfw add 100 allow ip from not 1.2.3.4 to any" .Pp Additionally, sets of alternative match patterns .Pq Em or-blocks can be constructed by putting the patterns in lists enclosed between parentheses ( ) or braces { }, and using the .Cm or operator as follows: .Pp .Dl "ipfw add 100 allow ip from { x or not y or z } to any" .Pp Only one level of parentheses is allowed. Beware that most shells have special meanings for parentheses or braces, so it is advisable to put a backslash \\ in front of them to prevent such interpretations. .Pp The body of a rule must in general include a source and destination address specifier. The keyword .Ar any can be used in various places to specify that the content of a required field is irrelevant. .Pp The rule body has the following format: .Bd -ragged -offset indent .Op Ar proto Cm from Ar src Cm to Ar dst .Op Ar options .Ed .Pp The first part (proto from src to dst) is for backward compatibility with earlier versions of .Fx . In modern .Fx any match pattern (including MAC headers, IP protocols, addresses and ports) can be specified in the .Ar options section. .Pp Rule fields have the following meaning: .Bl -tag -width indent .It Ar proto : protocol | Cm { Ar protocol Cm or ... } .It Ar protocol : Oo Cm not Oc Ar protocol-name | protocol-number An IP protocol specified by number or name (for a complete list see .Pa /etc/protocols ) , or one of the following keywords: .Bl -tag -width indent .It Cm ip4 | ipv4 Matches IPv4 packets. .It Cm ip6 | ipv6 Matches IPv6 packets. .It Cm ip | all Matches any packet. .El .Pp The .Cm ipv6 in .Cm proto option will be treated as inner protocol. And, the .Cm ipv4 is not available in .Cm proto option. .Pp The .Cm { Ar protocol Cm or ... } format (an .Em or-block ) is provided for convenience only but its use is deprecated. .It Ar src No and Ar dst : Bro Cm addr | Cm { Ar addr Cm or ... } Brc Op Oo Cm not Oc Ar ports An address (or a list, see below) optionally followed by .Ar ports specifiers. .Pp The second format .Em ( or-block with multiple addresses) is provided for convenience only and its use is discouraged. .It Ar addr : Oo Cm not Oc Bro .Cm any | me | me6 | .Cm table Ns Pq Ar number Ns Op , Ns Ar value .Ar | addr-list | addr-set .Brc .Bl -tag -width indent .It Cm any matches any IP address. .It Cm me matches any IP address configured on an interface in the system. .It Cm me6 matches any IPv6 address configured on an interface in the system. The address list is evaluated at the time the packet is analysed. .It Cm table Ns Pq Ar number Ns Op , Ns Ar value Matches any IPv4 address for which an entry exists in the lookup table .Ar number . If an optional 32-bit unsigned .Ar value is also specified, an entry will match only if it has this value. See the .Sx LOOKUP TABLES section below for more information on lookup tables. .El .It Ar addr-list : ip-addr Ns Op Ns , Ns Ar addr-list .It Ar ip-addr : A host or subnet address specified in one of the following ways: .Bl -tag -width indent .It Ar numeric-ip | hostname Matches a single IPv4 address, specified as dotted-quad or a hostname. Hostnames are resolved at the time the rule is added to the firewall list. .It Ar addr Ns / Ns Ar masklen Matches all addresses with base .Ar addr (specified as an IP address, a network number, or a hostname) and mask width of .Cm masklen bits. As an example, 1.2.3.4/25 or 1.2.3.0/25 will match all IP numbers from 1.2.3.0 to 1.2.3.127 . .It Ar addr Ns : Ns Ar mask Matches all addresses with base .Ar addr (specified as an IP address, a network number, or a hostname) and the mask of .Ar mask , specified as a dotted quad. As an example, 1.2.3.4:255.0.255.0 or 1.0.3.0:255.0.255.0 will match 1.*.3.*. This form is advised only for non-contiguous masks. It is better to resort to the .Ar addr Ns / Ns Ar masklen format for contiguous masks, which is more compact and less error-prone. .El .It Ar addr-set : addr Ns Oo Ns / Ns Ar masklen Oc Ns Cm { Ns Ar list Ns Cm } .It Ar list : Bro Ar num | num-num Brc Ns Op Ns , Ns Ar list Matches all addresses with base address .Ar addr (specified as an IP address, a network number, or a hostname) and whose last byte is in the list between braces { } . Note that there must be no spaces between braces and numbers (spaces after commas are allowed). Elements of the list can be specified as single entries or ranges. The .Ar masklen field is used to limit the size of the set of addresses, and can have any value between 24 and 32. If not specified, it will be assumed as 24. .br This format is particularly useful to handle sparse address sets within a single rule. Because the matching occurs using a bitmask, it takes constant time and dramatically reduces the complexity of rulesets. .br As an example, an address specified as 1.2.3.4/24{128,35-55,89} or 1.2.3.0/24{128,35-55,89} will match the following IP addresses: .br 1.2.3.128, 1.2.3.35 to 1.2.3.55, 1.2.3.89 . .It Ar addr6-list : ip6-addr Ns Op Ns , Ns Ar addr6-list .It Ar ip6-addr : A host or subnet specified one of the following ways: .Bl -tag -width indent .It Ar numeric-ip | hostname Matches a single IPv6 address as allowed by .Xr inet_pton 3 or a hostname. Hostnames are resolved at the time the rule is added to the firewall list. .It Ar addr Ns / Ns Ar masklen Matches all IPv6 addresses with base .Ar addr (specified as allowed by .Xr inet_pton or a hostname) and mask width of .Cm masklen bits. .El .Pp No support for sets of IPv6 addresses is provided because IPv6 addresses are typically random past the initial prefix. .It Ar ports : Bro Ar port | port Ns \&- Ns Ar port Ns Brc Ns Op , Ns Ar ports For protocols which support port numbers (such as TCP and UDP), optional .Cm ports may be specified as one or more ports or port ranges, separated by commas but no spaces, and an optional .Cm not operator. The .Ql \&- notation specifies a range of ports (including boundaries). .Pp Service names (from .Pa /etc/services ) may be used instead of numeric port values. The length of the port list is limited to 30 ports or ranges, though one can specify larger ranges by using an .Em or-block in the .Cm options section of the rule. .Pp A backslash .Pq Ql \e can be used to escape the dash .Pq Ql - character in a service name (from a shell, the backslash must be typed twice to avoid the shell itself interpreting it as an escape character). .Pp .Dl "ipfw add count tcp from any ftp\e\e-data-ftp to any" .Pp Fragmented packets which have a non-zero offset (i.e., not the first fragment) will never match a rule which has one or more port specifications. See the .Cm frag option for details on matching fragmented packets. .El .Ss RULE OPTIONS (MATCH PATTERNS) Additional match patterns can be used within rules. Zero or more of these so-called .Em options can be present in a rule, optionally prefixed by the .Cm not operand, and possibly grouped into .Em or-blocks . .Pp The following match patterns can be used (listed in alphabetical order): .Bl -tag -width indent .It Cm // this is a comment. Inserts the specified text as a comment in the rule. Everything following // is considered as a comment and stored in the rule. You can have comment-only rules, which are listed as having a .Cm count action followed by the comment. .It Cm bridged Alias for .Cm layer2 . .It Cm diverted Matches only packets generated by a divert socket. .It Cm diverted-loopback Matches only packets coming from a divert socket back into the IP stack input for delivery. .It Cm diverted-output Matches only packets going from a divert socket back outward to the IP stack output for delivery. .It Cm dst-ip Ar ip-address Matches IPv4 packets whose destination IP is one of the address(es) specified as argument. .It Bro Cm dst-ip6 | dst-ipv6 Brc Ar ip6-address Matches IPv6 packets whose destination IP is one of the address(es) specified as argument. .It Cm dst-port Ar ports Matches IP packets whose destination port is one of the port(s) specified as argument. .It Cm established Matches TCP packets that have the RST or ACK bits set. .It Cm ext6hdr Ar header Matches IPv6 packets containing the extended header given by .Ar header . Supported headers are: .Pp Fragment, .Pq Cm frag , Hop-to-hop options .Pq Cm hopopt , any type of Routing Header .Pq Cm route , Source routing Routing Header Type 0 .Pq Cm rthdr0 , Mobile IPv6 Routing Header Type 2 .Pq Cm rthdr2 , Destination options .Pq Cm dstopt , IPSec authentication headers .Pq Cm ah , and IPsec encapsulated security payload headers .Pq Cm esp . .It Cm fib Ar fibnum Matches a packet that has been tagged to use the given FIB (routing table) number. .It Cm flow-id Ar labels Matches IPv6 packets containing any of the flow labels given in .Ar labels . .Ar labels is a comma separated list of numeric flow labels. .It Cm frag Matches packets that are fragments and not the first fragment of an IP datagram. Note that these packets will not have the next protocol header (e.g.\& TCP, UDP) so options that look into these headers cannot match. .It Cm gid Ar group Matches all TCP or UDP packets sent by or received for a .Ar group . A .Ar group may be specified by name or number. .It Cm jail Ar prisonID Matches all TCP or UDP packets sent by or received for the jail whos prison ID is .Ar prisonID . .It Cm icmptypes Ar types Matches ICMP packets whose ICMP type is in the list .Ar types . The list may be specified as any combination of individual types (numeric) separated by commas. .Em Ranges are not allowed . The supported ICMP types are: .Pp echo reply .Pq Cm 0 , destination unreachable .Pq Cm 3 , source quench .Pq Cm 4 , redirect .Pq Cm 5 , echo request .Pq Cm 8 , router advertisement .Pq Cm 9 , router solicitation .Pq Cm 10 , time-to-live exceeded .Pq Cm 11 , IP header bad .Pq Cm 12 , timestamp request .Pq Cm 13 , timestamp reply .Pq Cm 14 , information request .Pq Cm 15 , information reply .Pq Cm 16 , address mask request .Pq Cm 17 and address mask reply .Pq Cm 18 . .It Cm icmp6types Ar types Matches ICMP6 packets whose ICMP6 type is in the list of .Ar types . The list may be specified as any combination of individual types (numeric) separated by commas. .Em Ranges are not allowed . .It Cm in | out Matches incoming or outgoing packets, respectively. .Cm in and .Cm out are mutually exclusive (in fact, .Cm out is implemented as .Cm not in Ns No ). .It Cm ipid Ar id-list Matches IPv4 packets whose .Cm ip_id field has value included in .Ar id-list , which is either a single value or a list of values or ranges specified in the same way as .Ar ports . .It Cm iplen Ar len-list Matches IP packets whose total length, including header and data, is in the set .Ar len-list , which is either a single value or a list of values or ranges specified in the same way as .Ar ports . .It Cm ipoptions Ar spec Matches packets whose IPv4 header contains the comma separated list of options specified in .Ar spec . The supported IP options are: .Pp .Cm ssrr (strict source route), .Cm lsrr (loose source route), .Cm rr (record packet route) and .Cm ts (timestamp). The absence of a particular option may be denoted with a .Ql \&! . .It Cm ipprecedence Ar precedence Matches IPv4 packets whose precedence field is equal to .Ar precedence . .It Cm ipsec Matches packets that have IPSEC history associated with them (i.e., the packet comes encapsulated in IPSEC, the kernel has IPSEC support and IPSEC_FILTERTUNNEL option, and can correctly decapsulate it). .Pp Note that specifying .Cm ipsec is different from specifying .Cm proto Ar ipsec as the latter will only look at the specific IP protocol field, irrespective of IPSEC kernel support and the validity of the IPSEC data. .Pp Further note that this flag is silently ignored in kernels without IPSEC support. It does not affect rule processing when given and the rules are handled as if with no .Cm ipsec flag. .It Cm iptos Ar spec Matches IPv4 packets whose .Cm tos field contains the comma separated list of service types specified in .Ar spec . The supported IP types of service are: .Pp .Cm lowdelay .Pq Dv IPTOS_LOWDELAY , .Cm throughput .Pq Dv IPTOS_THROUGHPUT , .Cm reliability .Pq Dv IPTOS_RELIABILITY , .Cm mincost .Pq Dv IPTOS_MINCOST , .Cm congestion .Pq Dv IPTOS_ECN_CE . The absence of a particular type may be denoted with a .Ql \&! . .It Cm dscp spec Ns Op , Ns Ar spec Matches IPv4/IPv6 packets whose .Cm DS field value is contained in .Ar spec mask. Multiple values can be specified via the comma separated list. Value can be one of keywords used in .Cm setdscp action or exact number. .It Cm ipttl Ar ttl-list Matches IPv4 packets whose time to live is included in .Ar ttl-list , which is either a single value or a list of values or ranges specified in the same way as .Ar ports . .It Cm ipversion Ar ver Matches IP packets whose IP version field is .Ar ver . .It Cm keep-state Upon a match, the firewall will create a dynamic rule, whose default behaviour is to match bidirectional traffic between source and destination IP/port using the same protocol. The rule has a limited lifetime (controlled by a set of .Xr sysctl 8 variables), and the lifetime is refreshed every time a matching packet is found. .It Cm layer2 Matches only layer2 packets, i.e., those passed to .Nm from ether_demux() and ether_output_frame(). .It Cm limit Bro Cm src-addr | src-port | dst-addr | dst-port Brc Ar N The firewall will only allow .Ar N connections with the same set of parameters as specified in the rule. One or more of source and destination addresses and ports can be specified. Currently, only IPv4 flows are supported. .It Cm lookup Bro Cm dst-ip | dst-port | src-ip | src-port | uid | jail Brc Ar N Search an entry in lookup table .Ar N that matches the field specified as argument. If not found, the match fails. Otherwise, the match succeeds and .Cm tablearg is set to the value extracted from the table. .Pp This option can be useful to quickly dispatch traffic based on certain packet fields. See the .Sx LOOKUP TABLES section below for more information on lookup tables. .It Cm { MAC | mac } Ar dst-mac src-mac Match packets with a given .Ar dst-mac and .Ar src-mac addresses, specified as the .Cm any keyword (matching any MAC address), or six groups of hex digits separated by colons, and optionally followed by a mask indicating the significant bits. The mask may be specified using either of the following methods: .Bl -enum -width indent .It A slash .Pq / followed by the number of significant bits. For example, an address with 33 significant bits could be specified as: .Pp .Dl "MAC 10:20:30:40:50:60/33 any" .Pp .It An ampersand .Pq & followed by a bitmask specified as six groups of hex digits separated by colons. For example, an address in which the last 16 bits are significant could be specified as: .Pp .Dl "MAC 10:20:30:40:50:60&00:00:00:00:ff:ff any" .Pp Note that the ampersand character has a special meaning in many shells and should generally be escaped. .Pp .El Note that the order of MAC addresses (destination first, source second) is the same as on the wire, but the opposite of the one used for IP addresses. .It Cm mac-type Ar mac-type Matches packets whose Ethernet Type field corresponds to one of those specified as argument. .Ar mac-type is specified in the same way as .Cm port numbers (i.e., one or more comma-separated single values or ranges). You can use symbolic names for known values such as .Em vlan , ipv4, ipv6 . Values can be entered as decimal or hexadecimal (if prefixed by 0x), and they are always printed as hexadecimal (unless the .Cm -N option is used, in which case symbolic resolution will be attempted). .It Cm proto Ar protocol Matches packets with the corresponding IP protocol. .It Cm recv | xmit | via Brq Ar ifX | Ar if Ns Cm * | Ar table Ns Pq Ar number Ns Op , Ns Ar value | Ar ipno | Ar any Matches packets received, transmitted or going through, respectively, the interface specified by exact name .Po Ar ifX Pc , by device name .Po Ar if* Pc , by IP address, or through some interface. .Pp The .Cm via keyword causes the interface to always be checked. If .Cm recv or .Cm xmit is used instead of .Cm via , then only the receive or transmit interface (respectively) is checked. By specifying both, it is possible to match packets based on both receive and transmit interface, e.g.: .Pp .Dl "ipfw add deny ip from any to any out recv ed0 xmit ed1" .Pp The .Cm recv interface can be tested on either incoming or outgoing packets, while the .Cm xmit interface can only be tested on outgoing packets. So .Cm out is required (and .Cm in is invalid) whenever .Cm xmit is used. .Pp A packet might not have a receive or transmit interface: packets originating from the local host have no receive interface, while packets destined for the local host have no transmit interface. .It Cm setup Matches TCP packets that have the SYN bit set but no ACK bit. This is the short form of .Dq Li tcpflags\ syn,!ack . .It Cm sockarg Matches packets that are associated to a local socket and for which the SO_USER_COOKIE socket option has been set to a non-zero value. As a side effect, the value of the option is made available as .Cm tablearg value, which in turn can be used as .Cm skipto or .Cm pipe number. .It Cm src-ip Ar ip-address Matches IPv4 packets whose source IP is one of the address(es) specified as an argument. .It Cm src-ip6 Ar ip6-address Matches IPv6 packets whose source IP is one of the address(es) specified as an argument. .It Cm src-port Ar ports Matches IP packets whose source port is one of the port(s) specified as argument. .It Cm tagged Ar tag-list Matches packets whose tags are included in .Ar tag-list , which is either a single value or a list of values or ranges specified in the same way as .Ar ports . Tags can be applied to the packet using .Cm tag rule action parameter (see it's description for details on tags). .It Cm tcpack Ar ack TCP packets only. Match if the TCP header acknowledgment number field is set to .Ar ack . .It Cm tcpdatalen Ar tcpdatalen-list Matches TCP packets whose length of TCP data is .Ar tcpdatalen-list , which is either a single value or a list of values or ranges specified in the same way as .Ar ports . .It Cm tcpflags Ar spec TCP packets only. Match if the TCP header contains the comma separated list of flags specified in .Ar spec . The supported TCP flags are: .Pp .Cm fin , .Cm syn , .Cm rst , .Cm psh , .Cm ack and .Cm urg . The absence of a particular flag may be denoted with a .Ql \&! . A rule which contains a .Cm tcpflags specification can never match a fragmented packet which has a non-zero offset. See the .Cm frag option for details on matching fragmented packets. .It Cm tcpseq Ar seq TCP packets only. Match if the TCP header sequence number field is set to .Ar seq . .It Cm tcpwin Ar tcpwin-list Matches TCP packets whose header window field is set to .Ar tcpwin-list , which is either a single value or a list of values or ranges specified in the same way as .Ar ports . .It Cm tcpoptions Ar spec TCP packets only. Match if the TCP header contains the comma separated list of options specified in .Ar spec . The supported TCP options are: .Pp .Cm mss (maximum segment size), .Cm window (tcp window advertisement), .Cm sack (selective ack), .Cm ts (rfc1323 timestamp) and .Cm cc (rfc1644 t/tcp connection count). The absence of a particular option may be denoted with a .Ql \&! . .It Cm uid Ar user Match all TCP or UDP packets sent by or received for a .Ar user . A .Ar user may be matched by name or identification number. .It Cm verrevpath For incoming packets, a routing table lookup is done on the packet's source address. If the interface on which the packet entered the system matches the outgoing interface for the route, the packet matches. If the interfaces do not match up, the packet does not match. All outgoing packets or packets with no incoming interface match. .Pp The name and functionality of the option is intentionally similar to the Cisco IOS command: .Pp .Dl ip verify unicast reverse-path .Pp This option can be used to make anti-spoofing rules to reject all packets with source addresses not from this interface. See also the option .Cm antispoof . .It Cm versrcreach For incoming packets, a routing table lookup is done on the packet's source address. If a route to the source address exists, but not the default route or a blackhole/reject route, the packet matches. Otherwise, the packet does not match. All outgoing packets match. .Pp The name and functionality of the option is intentionally similar to the Cisco IOS command: .Pp .Dl ip verify unicast source reachable-via any .Pp This option can be used to make anti-spoofing rules to reject all packets whose source address is unreachable. .It Cm antispoof For incoming packets, the packet's source address is checked if it belongs to a directly connected network. If the network is directly connected, then the interface the packet came on in is compared to the interface the network is connected to. When incoming interface and directly connected interface are not the same, the packet does not match. Otherwise, the packet does match. All outgoing packets match. .Pp This option can be used to make anti-spoofing rules to reject all packets that pretend to be from a directly connected network but do not come in through that interface. This option is similar to but more restricted than .Cm verrevpath because it engages only on packets with source addresses of directly connected networks instead of all source addresses. .El .Sh LOOKUP TABLES Lookup tables are useful to handle large sparse sets of addresses or other search keys (e.g., ports, jail IDs, interface names). In the rest of this section we will use the term ``address''. There may be up to 65535 different lookup tables, numbered 0 to 65534. .Pp Each entry is represented by an .Ar addr Ns Op / Ns Ar masklen and will match all addresses with base .Ar addr (specified as an IPv4/IPv6 address, a hostname or an unsigned integer) and mask width of .Ar masklen bits. If .Ar masklen is not specified, it defaults to 32 for IPv4 and 128 for IPv6. When looking up an IP address in a table, the most specific entry will match. Associated with each entry is a 32-bit unsigned .Ar value , which can optionally be checked by a rule matching code. When adding an entry, if .Ar value is not specified, it defaults to 0. .Pp An entry can be added to a table .Pq Cm add , or removed from a table .Pq Cm delete . A table can be examined .Pq Cm list or flushed .Pq Cm flush . .Pp Internally, each table is stored in a Radix tree, the same way as the routing table (see .Xr route 4 ) . .Pp Lookup tables currently support only ports, jail IDs, IPv4/IPv6 addresses and interface names. Wildcards is not supported for interface names. .Pp The .Cm tablearg feature provides the ability to use a value, looked up in the table, as the argument for a rule action, action parameter or rule option. This can significantly reduce number of rules in some configurations. If two tables are used in a rule, the result of the second (destination) is used. The .Cm tablearg argument can be used with the following actions: .Cm nat, pipe , queue, divert, tee, netgraph, ngtee, fwd, skipto, setfib, action parameters: .Cm tag, untag, rule options: .Cm limit, tagged. .Pp When used with .Cm fwd it is possible to supply table entries with values that are in the form of IP addresses or hostnames. See the .Sx EXAMPLES Section for example usage of tables and the tablearg keyword. .Pp When used with the .Cm skipto action, the user should be aware that the code will walk the ruleset up to a rule equal to, or past, the given number, and should therefore try keep the ruleset compact between the skipto and the target rules. .Sh SETS OF RULES Each rule belongs to one of 32 different .Em sets , numbered 0 to 31. Set 31 is reserved for the default rule. .Pp By default, rules are put in set 0, unless you use the .Cm set N attribute when entering a new rule. Sets can be individually and atomically enabled or disabled, so this mechanism permits an easy way to store multiple configurations of the firewall and quickly (and atomically) switch between them. The command to enable/disable sets is .Bd -ragged -offset indent .Nm .Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ... .Ed .Pp where multiple .Cm enable or .Cm disable sections can be specified. Command execution is atomic on all the sets specified in the command. By default, all sets are enabled. .Pp When you disable a set, its rules behave as if they do not exist in the firewall configuration, with only one exception: .Bd -ragged -offset indent dynamic rules created from a rule before it had been disabled will still be active until they expire. In order to delete dynamic rules you have to explicitly delete the parent rule which generated them. .Ed .Pp The set number of rules can be changed with the command .Bd -ragged -offset indent .Nm .Cm set move .Brq Cm rule Ar rule-number | old-set .Cm to Ar new-set .Ed .Pp Also, you can atomically swap two rulesets with the command .Bd -ragged -offset indent .Nm .Cm set swap Ar first-set second-set .Ed .Pp See the .Sx EXAMPLES Section on some possible uses of sets of rules. .Sh STATEFUL FIREWALL Stateful operation is a way for the firewall to dynamically create rules for specific flows when packets that match a given pattern are detected. Support for stateful operation comes through the .Cm check-state , keep-state and .Cm limit options of .Nm rules . .Pp Dynamic rules are created when a packet matches a .Cm keep-state or .Cm limit rule, causing the creation of a .Em dynamic rule which will match all and only packets with a given .Em protocol between a .Em src-ip/src-port dst-ip/dst-port pair of addresses .Em ( src and .Em dst are used here only to denote the initial match addresses, but they are completely equivalent afterwards). Dynamic rules will be checked at the first .Cm check-state, keep-state or .Cm limit occurrence, and the action performed upon a match will be the same as in the parent rule. .Pp Note that no additional attributes other than protocol and IP addresses and ports are checked on dynamic rules. .Pp The typical use of dynamic rules is to keep a closed firewall configuration, but let the first TCP SYN packet from the inside network install a dynamic rule for the flow so that packets belonging to that session will be allowed through the firewall: .Pp .Dl "ipfw add check-state" .Dl "ipfw add allow tcp from my-subnet to any setup keep-state" .Dl "ipfw add deny tcp from any to any" .Pp A similar approach can be used for UDP, where an UDP packet coming from the inside will install a dynamic rule to let the response through the firewall: .Pp .Dl "ipfw add check-state" .Dl "ipfw add allow udp from my-subnet to any keep-state" .Dl "ipfw add deny udp from any to any" .Pp Dynamic rules expire after some time, which depends on the status of the flow and the setting of some .Cm sysctl variables. See Section .Sx SYSCTL VARIABLES for more details. For TCP sessions, dynamic rules can be instructed to periodically send keepalive packets to refresh the state of the rule when it is about to expire. .Pp See Section .Sx EXAMPLES for more examples on how to use dynamic rules. .Sh TRAFFIC SHAPER (DUMMYNET) CONFIGURATION .Nm is also the user interface for the .Nm dummynet traffic shaper, packet scheduler and network emulator, a subsystem that can artificially queue, delay or drop packets emulating the behaviour of certain network links or queueing systems. .Pp .Nm dummynet operates by first using the firewall to select packets using any match pattern that can be used in .Nm rules. Matching packets are then passed to either of two different objects, which implement the traffic regulation: .Bl -hang -offset XXXX .It Em pipe A .Em pipe emulates a .Em link with given bandwidth and propagation delay, driven by a FIFO scheduler and a single queue with programmable queue size and packet loss rate. Packets are appended to the queue as they come out from .Nm ipfw , and then transferred in FIFO order to the link at the desired rate. .It Em queue A .Em queue is an abstraction used to implement packet scheduling using one of several packet scheduling algorithms. Packets sent to a .Em queue are first grouped into flows according to a mask on the 5-tuple. Flows are then passed to the scheduler associated to the .Em queue , and each flow uses scheduling parameters (weight and others) as configured in the .Em queue itself. A scheduler in turn is connected to an emulated link, and arbitrates the link's bandwidth among backlogged flows according to weights and to the features of the scheduling algorithm in use. .El .Pp In practice, .Em pipes can be used to set hard limits to the bandwidth that a flow can use, whereas .Em queues can be used to determine how different flows share the available bandwidth. .Pp A graphical representation of the binding of queues, flows, schedulers and links is below. .Bd -literal -offset indent (flow_mask|sched_mask) sched_mask +---------+ weight Wx +-------------+ | |->-[flow]-->--| |-+ -->--| QUEUE x | ... | | | | |->-[flow]-->--| SCHEDuler N | | +---------+ | | | ... | +--[LINK N]-->-- +---------+ weight Wy | | +--[LINK N]-->-- | |->-[flow]-->--| | | -->--| QUEUE y | ... | | | | |->-[flow]-->--| | | +---------+ +-------------+ | +-------------+ .Ed It is important to understand the role of the SCHED_MASK and FLOW_MASK, which are configured through the commands .Dl "ipfw sched N config mask SCHED_MASK ..." and .Dl "ipfw queue X config mask FLOW_MASK ..." . .Pp The SCHED_MASK is used to assign flows to one or more scheduler instances, one for each value of the packet's 5-tuple after applying SCHED_MASK. As an example, using ``src-ip 0xffffff00'' creates one instance for each /24 destination subnet. .Pp The FLOW_MASK, together with the SCHED_MASK, is used to split packets into flows. As an example, using ``src-ip 0x000000ff'' together with the previous SCHED_MASK makes a flow for each individual source address. In turn, flows for each /24 subnet will be sent to the same scheduler instance. .Pp The above diagram holds even for the .Em pipe case, with the only restriction that a .Em pipe only supports a SCHED_MASK, and forces the use of a FIFO scheduler (these are for backward compatibility reasons; in fact, internally, a .Nm dummynet's pipe is implemented exactly as above). .Pp There are two modes of .Nm dummynet operation: .Dq normal and .Dq fast . The .Dq normal mode tries to emulate a real link: the .Nm dummynet scheduler ensures that the packet will not leave the pipe faster than it would on the real link with a given bandwidth. The .Dq fast mode allows certain packets to bypass the .Nm dummynet scheduler (if packet flow does not exceed pipe's bandwidth). This is the reason why the .Dq fast mode requires less CPU cycles per packet (on average) and packet latency can be significantly lower in comparison to a real link with the same bandwidth. The default mode is .Dq normal . The .Dq fast mode can be enabled by setting the .Va net.inet.ip.dummynet.io_fast .Xr sysctl 8 variable to a non-zero value. .Pp .Ss PIPE, QUEUE AND SCHEDULER CONFIGURATION The .Em pipe , .Em queue and .Em scheduler configuration commands are the following: .Bd -ragged -offset indent .Cm pipe Ar number Cm config Ar pipe-configuration .Pp .Cm queue Ar number Cm config Ar queue-configuration .Pp .Cm sched Ar number Cm config Ar sched-configuration .Ed .Pp The following parameters can be configured for a pipe: .Pp .Bl -tag -width indent -compact .It Cm bw Ar bandwidth | device Bandwidth, measured in .Sm off .Op Cm K | M .Brq Cm bit/s | Byte/s . .Sm on .Pp A value of 0 (default) means unlimited bandwidth. The unit must immediately follow the number, as in .Pp .Dl "ipfw pipe 1 config bw 300Kbit/s" .Pp If a device name is specified instead of a numeric value, as in .Pp .Dl "ipfw pipe 1 config bw tun0" .Pp then the transmit clock is supplied by the specified device. At the moment only the .Xr tun 4 device supports this functionality, for use in conjunction with .Xr ppp 8 . .Pp .It Cm delay Ar ms-delay Propagation delay, measured in milliseconds. The value is rounded to the next multiple of the clock tick (typically 10ms, but it is a good practice to run kernels with .Dq "options HZ=1000" to reduce the granularity to 1ms or less). The default value is 0, meaning no delay. .Pp .It Cm burst Ar size If the data to be sent exceeds the pipe's bandwidth limit (and the pipe was previously idle), up to .Ar size bytes of data are allowed to bypass the .Nm dummynet scheduler, and will be sent as fast as the physical link allows. Any additional data will be transmitted at the rate specified by the .Nm pipe bandwidth. The burst size depends on how long the pipe has been idle; the effective burst size is calculated as follows: MAX( .Ar size , .Nm bw * pipe_idle_time). .Pp .It Cm profile Ar filename A file specifying the additional overhead incurred in the transmission of a packet on the link. .Pp Some link types introduce extra delays in the transmission of a packet, e.g., because of MAC level framing, contention on the use of the channel, MAC level retransmissions and so on. From our point of view, the channel is effectively unavailable for this extra time, which is constant or variable depending on the link type. Additionally, packets may be dropped after this time (e.g., on a wireless link after too many retransmissions). We can model the additional delay with an empirical curve that represents its distribution. .Bd -literal -offset indent cumulative probability 1.0 ^ | L +-- loss-level x | ****** | * | ***** | * | ** | * +-------*-------------------> delay .Ed The empirical curve may have both vertical and horizontal lines. Vertical lines represent constant delay for a range of probabilities. Horizontal lines correspond to a discontinuity in the delay distribution: the pipe will use the largest delay for a given probability. .Pp The file format is the following, with whitespace acting as a separator and '#' indicating the beginning a comment: .Bl -tag -width indent .It Cm name Ar identifier optional name (listed by "ipfw pipe show") to identify the delay distribution; .It Cm bw Ar value the bandwidth used for the pipe. If not specified here, it must be present explicitly as a configuration parameter for the pipe; .It Cm loss-level Ar L the probability above which packets are lost. (0.0 <= L <= 1.0, default 1.0 i.e., no loss); .It Cm samples Ar N the number of samples used in the internal representation of the curve (2..1024; default 100); .It Cm "delay prob" | "prob delay" One of these two lines is mandatory and defines the format of the following lines with data points. .It Ar XXX Ar YYY 2 or more lines representing points in the curve, with either delay or probability first, according to the chosen format. The unit for delay is milliseconds. Data points do not need to be sorted. Also, the number of actual lines can be different from the value of the "samples" parameter: .Nm utility will sort and interpolate the curve as needed. .El .Pp Example of a profile file: .Bd -literal -offset indent name bla_bla_bla samples 100 loss-level 0.86 prob delay 0 200 # minimum overhead is 200ms 0.5 200 0.5 300 0.8 1000 0.9 1300 1 1300 #configuration file end .Ed .El .Pp The following parameters can be configured for a queue: .Pp .Bl -tag -width indent -compact .It Cm pipe Ar pipe_nr Connects a queue to the specified pipe. Multiple queues (with the same or different weights) can be connected to the same pipe, which specifies the aggregate rate for the set of queues. .Pp .It Cm weight Ar weight Specifies the weight to be used for flows matching this queue. The weight must be in the range 1..100, and defaults to 1. .El .Pp The following case-insensitive parameters can be configured for a scheduler: .Pp .Bl -tag -width indent -compact .It Cm type Ar {fifo | wf2q+ | rr | qfq} specifies the scheduling algorithm to use. .Bl -tag -width indent -compact .It Cm fifo is just a FIFO scheduler (which means that all packets are stored in the same queue as they arrive to the scheduler). FIFO has O(1) per-packet time complexity, with very low constants (estimate 60-80ns on a 2GHz desktop machine) but gives no service guarantees. .It Cm wf2q+ implements the WF2Q+ algorithm, which is a Weighted Fair Queueing algorithm which permits flows to share bandwidth according to their weights. Note that weights are not priorities; even a flow with a minuscule weight will never starve. WF2Q+ has O(log N) per-packet processing cost, where N is the number of flows, and is the default algorithm used by previous versions dummynet's queues. .It Cm rr implements the Deficit Round Robin algorithm, which has O(1) processing costs (roughly, 100-150ns per packet) and permits bandwidth allocation according to weights, but with poor service guarantees. .It Cm qfq implements the QFQ algorithm, which is a very fast variant of WF2Q+, with similar service guarantees and O(1) processing costs (roughly, 200-250ns per packet). .El .El .Pp In addition to the type, all parameters allowed for a pipe can also be specified for a scheduler. .Pp Finally, the following parameters can be configured for both pipes and queues: .Pp .Bl -tag -width XXXX -compact .It Cm buckets Ar hash-table-size Specifies the size of the hash table used for storing the various queues. Default value is 64 controlled by the .Xr sysctl 8 variable .Va net.inet.ip.dummynet.hash_size , allowed range is 16 to 65536. .Pp .It Cm mask Ar mask-specifier Packets sent to a given pipe or queue by an .Nm rule can be further classified into multiple flows, each of which is then sent to a different .Em dynamic pipe or queue. A flow identifier is constructed by masking the IP addresses, ports and protocol types as specified with the .Cm mask options in the configuration of the pipe or queue. For each different flow identifier, a new pipe or queue is created with the same parameters as the original object, and matching packets are sent to it. .Pp Thus, when .Em dynamic pipes are used, each flow will get the same bandwidth as defined by the pipe, whereas when .Em dynamic queues are used, each flow will share the parent's pipe bandwidth evenly with other flows generated by the same queue (note that other queues with different weights might be connected to the same pipe). .br Available mask specifiers are a combination of one or more of the following: .Pp .Cm dst-ip Ar mask , .Cm dst-ip6 Ar mask , .Cm src-ip Ar mask , .Cm src-ip6 Ar mask , .Cm dst-port Ar mask , .Cm src-port Ar mask , .Cm flow-id Ar mask , .Cm proto Ar mask or .Cm all , .Pp where the latter means all bits in all fields are significant. .Pp .It Cm noerror When a packet is dropped by a .Nm dummynet queue or pipe, the error is normally reported to the caller routine in the kernel, in the same way as it happens when a device queue fills up. Setting this option reports the packet as successfully delivered, which can be needed for some experimental setups where you want to simulate loss or congestion at a remote router. .Pp .It Cm plr Ar packet-loss-rate Packet loss rate. Argument .Ar packet-loss-rate is a floating-point number between 0 and 1, with 0 meaning no loss, 1 meaning 100% loss. The loss rate is internally represented on 31 bits. .Pp .It Cm queue Brq Ar slots | size Ns Cm Kbytes Queue size, in .Ar slots or .Cm KBytes . Default value is 50 slots, which is the typical queue size for Ethernet devices. Note that for slow speed links you should keep the queue size short or your traffic might be affected by a significant queueing delay. E.g., 50 max-sized ethernet packets (1500 bytes) mean 600Kbit or 20s of queue on a 30Kbit/s pipe. Even worse effects can result if you get packets from an interface with a much larger MTU, e.g.\& the loopback interface with its 16KB packets. The .Xr sysctl 8 variables .Em net.inet.ip.dummynet.pipe_byte_limit and .Em net.inet.ip.dummynet.pipe_slot_limit control the maximum lengths that can be specified. .Pp .It Cm red | gred Ar w_q Ns / Ns Ar min_th Ns / Ns Ar max_th Ns / Ns Ar max_p Make use of the RED (Random Early Detection) queue management algorithm. .Ar w_q and .Ar max_p are floating point numbers between 0 and 1 (0 not included), while .Ar min_th and .Ar max_th are integer numbers specifying thresholds for queue management (thresholds are computed in bytes if the queue has been defined in bytes, in slots otherwise). The .Nm dummynet also supports the gentle RED variant (gred). Three .Xr sysctl 8 variables can be used to control the RED behaviour: .Bl -tag -width indent .It Va net.inet.ip.dummynet.red_lookup_depth specifies the accuracy in computing the average queue when the link is idle (defaults to 256, must be greater than zero) .It Va net.inet.ip.dummynet.red_avg_pkt_size specifies the expected average packet size (defaults to 512, must be greater than zero) .It Va net.inet.ip.dummynet.red_max_pkt_size specifies the expected maximum packet size, only used when queue thresholds are in bytes (defaults to 1500, must be greater than zero). .El .El .Pp When used with IPv6 data, .Nm dummynet currently has several limitations. Information necessary to route link-local packets to an interface is not available after processing by .Nm dummynet so those packets are dropped in the output path. Care should be taken to ensure that link-local packets are not passed to .Nm dummynet . .Sh CHECKLIST Here are some important points to consider when designing your rules: .Bl -bullet .It Remember that you filter both packets going .Cm in and .Cm out . Most connections need packets going in both directions. .It Remember to test very carefully. It is a good idea to be near the console when doing this. If you cannot be near the console, use an auto-recovery script such as the one in .Pa /usr/share/examples/ipfw/change_rules.sh . .It Do not forget the loopback interface. .El .Sh FINE POINTS .Bl -bullet .It There are circumstances where fragmented datagrams are unconditionally dropped. TCP packets are dropped if they do not contain at least 20 bytes of TCP header, UDP packets are dropped if they do not contain a full 8 byte UDP header, and ICMP packets are dropped if they do not contain 4 bytes of ICMP header, enough to specify the ICMP type, code, and checksum. These packets are simply logged as .Dq pullup failed since there may not be enough good data in the packet to produce a meaningful log entry. .It Another type of packet is unconditionally dropped, a TCP packet with a fragment offset of one. This is a valid packet, but it only has one use, to try to circumvent firewalls. When logging is enabled, these packets are reported as being dropped by rule -1. .It If you are logged in over a network, loading the .Xr kld 4 version of .Nm is probably not as straightforward as you would think. The following command line is recommended: .Bd -literal -offset indent kldload ipfw && \e ipfw add 32000 allow ip from any to any .Ed .Pp Along the same lines, doing an .Bd -literal -offset indent ipfw flush .Ed .Pp in similar surroundings is also a bad idea. .It The .Nm filter list may not be modified if the system security level is set to 3 or higher (see .Xr init 8 for information on system security levels). .El .Sh PACKET DIVERSION A .Xr divert 4 socket bound to the specified port will receive all packets diverted to that port. If no socket is bound to the destination port, or if the divert module is not loaded, or if the kernel was not compiled with divert socket support, the packets are dropped. .Sh NETWORK ADDRESS TRANSLATION (NAT) .Nm support in-kernel NAT using the kernel version of .Xr libalias 3 . .Pp The nat configuration command is the following: .Bd -ragged -offset indent .Bk -words .Cm nat .Ar nat_number .Cm config .Ar nat-configuration .Ek .Ed .Pp The following parameters can be configured: .Bl -tag -width indent .It Cm ip Ar ip_address Define an ip address to use for aliasing. .It Cm if Ar nic Use ip address of NIC for aliasing, dynamically changing it if NIC's ip address changes. .It Cm log Enable logging on this nat instance. .It Cm deny_in Deny any incoming connection from outside world. .It Cm same_ports Try to leave the alias port numbers unchanged from the actual local port numbers. .It Cm unreg_only Traffic on the local network not originating from an unregistered address spaces will be ignored. .It Cm reset Reset table of the packet aliasing engine on address change. .It Cm reverse Reverse the way libalias handles aliasing. .It Cm proxy_only Obey transparent proxy rules only, packet aliasing is not performed. .It Cm skip_global Skip instance in case of global state lookup (see below). .El .Pp Some specials value can be supplied instead of .Va nat_number: .Bl -tag -width indent .It Cm global Looks up translation state in all configured nat instances. If an entry is found, packet is aliased according to that entry. If no entry was found in any of the instances, packet is passed unchanged, and no new entry will be created. See section .Sx MULTIPLE INSTANCES in .Xr natd 8 for more information. .It Cm tablearg Uses argument supplied in lookup table. See .Sx LOOKUP TABLES section below for more information on lookup tables. .El .Pp To let the packet continue after being (de)aliased, set the sysctl variable .Va net.inet.ip.fw.one_pass to 0. For more information about aliasing modes, refer to .Xr libalias 3 . See Section .Sx EXAMPLES for some examples about nat usage. .Ss REDIRECT AND LSNAT SUPPORT IN IPFW Redirect and LSNAT support follow closely the syntax used in .Xr natd 8 . See Section .Sx EXAMPLES for some examples on how to do redirect and lsnat. .Ss SCTP NAT SUPPORT SCTP nat can be configured in a similar manner to TCP through the .Nm command line tool. The main difference is that .Nm sctp nat does not do port translation. Since the local and global side ports will be the same, there is no need to specify both. Ports are redirected as follows: .Bd -ragged -offset indent .Bk -words .Cm nat .Ar nat_number .Cm config if .Ar nic .Cm redirect_port sctp .Ar ip_address [,addr_list] {[port | port-port] [,ports]} .Ek .Ed .Pp Most .Nm sctp nat configuration can be done in real-time through the .Xr sysctl 8 interface. All may be changed dynamically, though the hash_table size will only change for new .Nm nat instances. See .Sx SYSCTL VARIABLES for more info. .Sh LOADER TUNABLES Tunables can be set in .Xr loader 8 prompt, .Xr loader.conf 5 or .Xr kenv 1 before ipfw module gets loaded. .Bl -tag -width indent .It Va net.inet.ip.fw.default_to_accept: No 0 Defines ipfw last rule behavior. This value overrides .Cd "options IPFW_DEFAULT_TO_(ACCEPT|DENY)" from kernel configuration file. .It Va net.inet.ip.fw.tables_max: No 128 Defines number of tables available in ipfw. Number cannot exceed 65534. .El .Sh SYSCTL VARIABLES A set of .Xr sysctl 8 variables controls the behaviour of the firewall and associated modules .Pq Nm dummynet , bridge , sctp nat . These are shown below together with their default value (but always check with the .Xr sysctl 8 command what value is actually in use) and meaning: .Bl -tag -width indent .It Va net.inet.ip.alias.sctp.accept_global_ootb_addip: No 0 Defines how the .Nm nat responds to receipt of global OOTB ASCONF-AddIP: .Bl -tag -width indent .It Cm 0 No response (unless a partially matching association exists - ports and vtags match but global address does not) .It Cm 1 .Nm nat will accept and process all OOTB global AddIP messages. .El .Pp Option 1 should never be selected as this forms a security risk. An attacker can establish multiple fake associations by sending AddIP messages. .It Va net.inet.ip.alias.sctp.chunk_proc_limit: No 5 Defines the maximum number of chunks in an SCTP packet that will be parsed for a packet that matches an existing association. This value is enforced to be greater or equal than .Cm net.inet.ip.alias.sctp.initialising_chunk_proc_limit . A high value is a DoS risk yet setting too low a value may result in important control chunks in the packet not being located and parsed. .It Va net.inet.ip.alias.sctp.error_on_ootb: No 1 Defines when the .Nm nat responds to any Out-of-the-Blue (OOTB) packets with ErrorM packets. An OOTB packet is a packet that arrives with no existing association registered in the .Nm nat and is not an INIT or ASCONF-AddIP packet: .Bl -tag -width indent .It Cm 0 ErrorM is never sent in response to OOTB packets. .It Cm 1 ErrorM is only sent to OOTB packets received on the local side. .It Cm 2 ErrorM is sent to the local side and on the global side ONLY if there is a partial match (ports and vtags match but the source global IP does not). This value is only useful if the .Nm nat is tracking global IP addresses. .It Cm 3 ErrorM is sent in response to all OOTB packets on both the local and global side (DoS risk). .El .Pp At the moment the default is 0, since the ErrorM packet is not yet supported by most SCTP stacks. When it is supported, and if not tracking global addresses, we recommend setting this value to 1 to allow multi-homed local hosts to function with the .Nm nat . To track global addresses, we recommend setting this value to 2 to allow global hosts to be informed when they need to (re)send an ASCONF-AddIP. Value 3 should never be chosen (except for debugging) as the .Nm nat will respond to all OOTB global packets (a DoS risk). .It Va net.inet.ip.alias.sctp.hashtable_size: No 2003 Size of hash tables used for .Nm nat lookups (100 < prime_number > 1000001). This value sets the .Nm hash table size for any future created .Nm nat instance and therefore must be set prior to creating a .Nm nat instance. The table sizes may be changed to suit specific needs. If there will be few concurrent associations, and memory is scarce, you may make these smaller. If there will be many thousands (or millions) of concurrent associations, you should make these larger. A prime number is best for the table size. The sysctl update function will adjust your input value to the next highest prime number. .It Va net.inet.ip.alias.sctp.holddown_time: No 0 Hold association in table for this many seconds after receiving a SHUTDOWN-COMPLETE. This allows endpoints to correct shutdown gracefully if a shutdown_complete is lost and retransmissions are required. .It Va net.inet.ip.alias.sctp.init_timer: No 15 Timeout value while waiting for (INIT-ACK|AddIP-ACK). This value cannot be 0. .It Va net.inet.ip.alias.sctp.initialising_chunk_proc_limit: No 2 Defines the maximum number of chunks in an SCTP packet that will be parsed when no existing association exists that matches that packet. Ideally this packet will only be an INIT or ASCONF-AddIP packet. A higher value may become a DoS risk as malformed packets can consume processing resources. .It Va net.inet.ip.alias.sctp.param_proc_limit: No 25 Defines the maximum number of parameters within a chunk that will be parsed in a packet. As for other similar sysctl variables, larger values pose a DoS risk. .It Va net.inet.ip.alias.sctp.log_level: No 0 Level of detail in the system log messages (0 \- minimal, 1 \- event, 2 \- info, 3 \- detail, 4 \- debug, 5 \- max debug). May be a good option in high loss environments. .It Va net.inet.ip.alias.sctp.shutdown_time: No 15 Timeout value while waiting for SHUTDOWN-COMPLETE. This value cannot be 0. .It Va net.inet.ip.alias.sctp.track_global_addresses: No 0 Enables/disables global IP address tracking within the .Nm nat and places an upper limit on the number of addresses tracked for each association: .Bl -tag -width indent .It Cm 0 Global tracking is disabled .It Cm >1 Enables tracking, the maximum number of addresses tracked for each association is limited to this value .El .Pp This variable is fully dynamic, the new value will be adopted for all newly arriving associations, existing associations are treated as they were previously. Global tracking will decrease the number of collisions within the .Nm nat at a cost of increased processing load, memory usage, complexity, and possible .Nm nat state problems in complex networks with multiple .Nm nats . We recommend not tracking global IP addresses, this will still result in a fully functional .Nm nat . .It Va net.inet.ip.alias.sctp.up_timer: No 300 Timeout value to keep an association up with no traffic. This value cannot be 0. .It Va net.inet.ip.dummynet.expire : No 1 Lazily delete dynamic pipes/queue once they have no pending traffic. You can disable this by setting the variable to 0, in which case the pipes/queues will only be deleted when the threshold is reached. .It Va net.inet.ip.dummynet.hash_size : No 64 Default size of the hash table used for dynamic pipes/queues. This value is used when no .Cm buckets option is specified when configuring a pipe/queue. .It Va net.inet.ip.dummynet.io_fast : No 0 If set to a non-zero value, the .Dq fast mode of .Nm dummynet operation (see above) is enabled. .It Va net.inet.ip.dummynet.io_pkt Number of packets passed to .Nm dummynet . .It Va net.inet.ip.dummynet.io_pkt_drop Number of packets dropped by .Nm dummynet . .It Va net.inet.ip.dummynet.io_pkt_fast Number of packets bypassed by the .Nm dummynet scheduler. .It Va net.inet.ip.dummynet.max_chain_len : No 16 Target value for the maximum number of pipes/queues in a hash bucket. The product .Cm max_chain_len*hash_size is used to determine the threshold over which empty pipes/queues will be expired even when .Cm net.inet.ip.dummynet.expire=0 . .It Va net.inet.ip.dummynet.red_lookup_depth : No 256 .It Va net.inet.ip.dummynet.red_avg_pkt_size : No 512 .It Va net.inet.ip.dummynet.red_max_pkt_size : No 1500 Parameters used in the computations of the drop probability for the RED algorithm. .It Va net.inet.ip.dummynet.pipe_byte_limit : No 1048576 .It Va net.inet.ip.dummynet.pipe_slot_limit : No 100 The maximum queue size that can be specified in bytes or packets. These limits prevent accidental exhaustion of resources such as mbufs. If you raise these limits, you should make sure the system is configured so that sufficient resources are available. .It Va net.inet.ip.fw.autoinc_step : No 100 Delta between rule numbers when auto-generating them. The value must be in the range 1..1000. .It Va net.inet.ip.fw.curr_dyn_buckets : Va net.inet.ip.fw.dyn_buckets The current number of buckets in the hash table for dynamic rules (readonly). .It Va net.inet.ip.fw.debug : No 1 Controls debugging messages produced by .Nm . .It Va net.inet.ip.fw.default_rule : No 65535 The default rule number (read-only). By the design of .Nm , the default rule is the last one, so its number can also serve as the highest number allowed for a rule. .It Va net.inet.ip.fw.dyn_buckets : No 256 The number of buckets in the hash table for dynamic rules. Must be a power of 2, up to 65536. It only takes effect when all dynamic rules have expired, so you are advised to use a .Cm flush command to make sure that the hash table is resized. .It Va net.inet.ip.fw.dyn_count : No 3 Current number of dynamic rules (read-only). .It Va net.inet.ip.fw.dyn_keepalive : No 1 Enables generation of keepalive packets for .Cm keep-state rules on TCP sessions. A keepalive is generated to both sides of the connection every 5 seconds for the last 20 seconds of the lifetime of the rule. .It Va net.inet.ip.fw.dyn_max : No 8192 Maximum number of dynamic rules. When you hit this limit, no more dynamic rules can be installed until old ones expire. .It Va net.inet.ip.fw.dyn_ack_lifetime : No 300 .It Va net.inet.ip.fw.dyn_syn_lifetime : No 20 .It Va net.inet.ip.fw.dyn_fin_lifetime : No 1 .It Va net.inet.ip.fw.dyn_rst_lifetime : No 1 .It Va net.inet.ip.fw.dyn_udp_lifetime : No 5 .It Va net.inet.ip.fw.dyn_short_lifetime : No 30 These variables control the lifetime, in seconds, of dynamic rules. Upon the initial SYN exchange the lifetime is kept short, then increased after both SYN have been seen, then decreased again during the final FIN exchange or when a RST is received. Both .Em dyn_fin_lifetime and .Em dyn_rst_lifetime must be strictly lower than 5 seconds, the period of repetition of keepalives. The firewall enforces that. .It Va net.inet.ip.fw.dyn_keep_states: No 0 Keep dynamic states on rule/set deletion. States are relinked to default rule (65535). This can be handly for ruleset reload. Turned off by default. .It Va net.inet.ip.fw.enable : No 1 Enables the firewall. Setting this variable to 0 lets you run your machine without firewall even if compiled in. .It Va net.inet6.ip6.fw.enable : No 1 provides the same functionality as above for the IPv6 case. .It Va net.inet.ip.fw.one_pass : No 1 When set, the packet exiting from the .Nm dummynet pipe or from .Xr ng_ipfw 4 node is not passed though the firewall again. Otherwise, after an action, the packet is reinjected into the firewall at the next rule. .It Va net.inet.ip.fw.tables_max : No 128 Maximum number of tables. .It Va net.inet.ip.fw.verbose : No 1 Enables verbose messages. .It Va net.inet.ip.fw.verbose_limit : No 0 Limits the number of messages produced by a verbose firewall. .It Va net.inet6.ip6.fw.deny_unknown_exthdrs : No 1 If enabled packets with unknown IPv6 Extension Headers will be denied. .It Va net.link.ether.ipfw : No 0 Controls whether layer-2 packets are passed to .Nm . Default is no. .It Va net.link.bridge.ipfw : No 0 Controls whether bridged packets are passed to .Nm . Default is no. .El .Sh EXAMPLES There are far too many possible uses of .Nm so this Section will only give a small set of examples. .Pp .Ss BASIC PACKET FILTERING This command adds an entry which denies all tcp packets from .Em cracker.evil.org to the telnet port of .Em wolf.tambov.su from being forwarded by the host: .Pp .Dl "ipfw add deny tcp from cracker.evil.org to wolf.tambov.su telnet" .Pp This one disallows any connection from the entire cracker's network to my host: .Pp .Dl "ipfw add deny ip from 123.45.67.0/24 to my.host.org" .Pp A first and efficient way to limit access (not using dynamic rules) is the use of the following rules: .Pp .Dl "ipfw add allow tcp from any to any established" .Dl "ipfw add allow tcp from net1 portlist1 to net2 portlist2 setup" .Dl "ipfw add allow tcp from net3 portlist3 to net3 portlist3 setup" .Dl "..." .Dl "ipfw add deny tcp from any to any" .Pp The first rule will be a quick match for normal TCP packets, but it will not match the initial SYN packet, which will be matched by the .Cm setup rules only for selected source/destination pairs. All other SYN packets will be rejected by the final .Cm deny rule. .Pp If you administer one or more subnets, you can take advantage of the address sets and or-blocks and write extremely compact rulesets which selectively enable services to blocks of clients, as below: .Pp .Dl "goodguys=\*q{ 10.1.2.0/24{20,35,66,18} or 10.2.3.0/28{6,3,11} }\*q" .Dl "badguys=\*q10.1.2.0/24{8,38,60}\*q" .Dl "" .Dl "ipfw add allow ip from ${goodguys} to any" .Dl "ipfw add deny ip from ${badguys} to any" .Dl "... normal policies ..." .Pp The .Cm verrevpath option could be used to do automated anti-spoofing by adding the following to the top of a ruleset: .Pp .Dl "ipfw add deny ip from any to any not verrevpath in" .Pp This rule drops all incoming packets that appear to be coming to the system on the wrong interface. For example, a packet with a source address belonging to a host on a protected internal network would be dropped if it tried to enter the system from an external interface. .Pp The .Cm antispoof option could be used to do similar but more restricted anti-spoofing by adding the following to the top of a ruleset: .Pp .Dl "ipfw add deny ip from any to any not antispoof in" .Pp This rule drops all incoming packets that appear to be coming from another directly connected system but on the wrong interface. For example, a packet with a source address of .Li 192.168.0.0/24 , configured on .Li fxp0 , but coming in on .Li fxp1 would be dropped. .Pp The .Cm setdscp option could be used to (re)mark user traffic, by adding the following to the appropriate place in ruleset: .Pp .Dl "ipfw add setdscp be ip from any to any dscp af11,af21" .Ss DYNAMIC RULES In order to protect a site from flood attacks involving fake TCP packets, it is safer to use dynamic rules: .Pp .Dl "ipfw add check-state" .Dl "ipfw add deny tcp from any to any established" .Dl "ipfw add allow tcp from my-net to any setup keep-state" .Pp This will let the firewall install dynamic rules only for those connection which start with a regular SYN packet coming from the inside of our network. Dynamic rules are checked when encountering the first occurrence of a .Cm check-state , .Cm keep-state or .Cm limit rule. A .Cm check-state rule should usually be placed near the beginning of the ruleset to minimize the amount of work scanning the ruleset. Your mileage may vary. .Pp To limit the number of connections a user can open you can use the following type of rules: .Pp .Dl "ipfw add allow tcp from my-net/24 to any setup limit src-addr 10" .Dl "ipfw add allow tcp from any to me setup limit src-addr 4" .Pp The former (assuming it runs on a gateway) will allow each host on a /24 network to open at most 10 TCP connections. The latter can be placed on a server to make sure that a single client does not use more than 4 simultaneous connections. .Pp .Em BEWARE : stateful rules can be subject to denial-of-service attacks by a SYN-flood which opens a huge number of dynamic rules. The effects of such attacks can be partially limited by acting on a set of .Xr sysctl 8 variables which control the operation of the firewall. .Pp Here is a good usage of the .Cm list command to see accounting records and timestamp information: .Pp .Dl ipfw -at list .Pp or in short form without timestamps: .Pp .Dl ipfw -a list .Pp which is equivalent to: .Pp .Dl ipfw show .Pp Next rule diverts all incoming packets from 192.168.2.0/24 to divert port 5000: .Pp .Dl ipfw divert 5000 ip from 192.168.2.0/24 to any in .Ss TRAFFIC SHAPING The following rules show some of the applications of .Nm and .Nm dummynet for simulations and the like. .Pp This rule drops random incoming packets with a probability of 5%: .Pp .Dl "ipfw add prob 0.05 deny ip from any to any in" .Pp A similar effect can be achieved making use of .Nm dummynet pipes: .Pp .Dl "ipfw add pipe 10 ip from any to any" .Dl "ipfw pipe 10 config plr 0.05" .Pp We can use pipes to artificially limit bandwidth, e.g.\& on a machine acting as a router, if we want to limit traffic from local clients on 192.168.2.0/24 we do: .Pp .Dl "ipfw add pipe 1 ip from 192.168.2.0/24 to any out" .Dl "ipfw pipe 1 config bw 300Kbit/s queue 50KBytes" .Pp note that we use the .Cm out modifier so that the rule is not used twice. Remember in fact that .Nm rules are checked both on incoming and outgoing packets. .Pp Should we want to simulate a bidirectional link with bandwidth limitations, the correct way is the following: .Pp .Dl "ipfw add pipe 1 ip from any to any out" .Dl "ipfw add pipe 2 ip from any to any in" .Dl "ipfw pipe 1 config bw 64Kbit/s queue 10Kbytes" .Dl "ipfw pipe 2 config bw 64Kbit/s queue 10Kbytes" .Pp The above can be very useful, e.g.\& if you want to see how your fancy Web page will look for a residential user who is connected only through a slow link. You should not use only one pipe for both directions, unless you want to simulate a half-duplex medium (e.g.\& AppleTalk, Ethernet, IRDA). It is not necessary that both pipes have the same configuration, so we can also simulate asymmetric links. .Pp Should we want to verify network performance with the RED queue management algorithm: .Pp .Dl "ipfw add pipe 1 ip from any to any" .Dl "ipfw pipe 1 config bw 500Kbit/s queue 100 red 0.002/30/80/0.1" .Pp Another typical application of the traffic shaper is to introduce some delay in the communication. This can significantly affect applications which do a lot of Remote Procedure Calls, and where the round-trip-time of the connection often becomes a limiting factor much more than bandwidth: .Pp .Dl "ipfw add pipe 1 ip from any to any out" .Dl "ipfw add pipe 2 ip from any to any in" .Dl "ipfw pipe 1 config delay 250ms bw 1Mbit/s" .Dl "ipfw pipe 2 config delay 250ms bw 1Mbit/s" .Pp Per-flow queueing can be useful for a variety of purposes. A very simple one is counting traffic: .Pp .Dl "ipfw add pipe 1 tcp from any to any" .Dl "ipfw add pipe 1 udp from any to any" .Dl "ipfw add pipe 1 ip from any to any" .Dl "ipfw pipe 1 config mask all" .Pp The above set of rules will create queues (and collect statistics) for all traffic. Because the pipes have no limitations, the only effect is collecting statistics. Note that we need 3 rules, not just the last one, because when .Nm tries to match IP packets it will not consider ports, so we would not see connections on separate ports as different ones. .Pp A more sophisticated example is limiting the outbound traffic on a net with per-host limits, rather than per-network limits: .Pp .Dl "ipfw add pipe 1 ip from 192.168.2.0/24 to any out" .Dl "ipfw add pipe 2 ip from any to 192.168.2.0/24 in" .Dl "ipfw pipe 1 config mask src-ip 0x000000ff bw 200Kbit/s queue 20Kbytes" .Dl "ipfw pipe 2 config mask dst-ip 0x000000ff bw 200Kbit/s queue 20Kbytes" .Ss LOOKUP TABLES In the following example, we need to create several traffic bandwidth classes and we need different hosts/networks to fall into different classes. We create one pipe for each class and configure them accordingly. Then we create a single table and fill it with IP subnets and addresses. For each subnet/host we set the argument equal to the number of the pipe that it should use. Then we classify traffic using a single rule: .Pp .Dl "ipfw pipe 1 config bw 1000Kbyte/s" .Dl "ipfw pipe 4 config bw 4000Kbyte/s" .Dl "..." .Dl "ipfw table 1 add 192.168.2.0/24 1" .Dl "ipfw table 1 add 192.168.0.0/27 4" .Dl "ipfw table 1 add 192.168.0.2 1" .Dl "..." .Dl "ipfw add pipe tablearg ip from table(1) to any" .Pp Using the .Cm fwd action, the table entries may include hostnames and IP addresses. .Pp .Dl "ipfw table 1 add 192.168.2.0/24 10.23.2.1" .Dl "ipfw table 1 add 192.168.0.0/27 router1.dmz" .Dl "..." .Dl "ipfw add 100 fwd tablearg ip from any to table(1)" .Pp In the following example per-interface firewall is created: .Pp .Dl "ipfw table 10 add vlan20 12000" .Dl "ipfw table 10 add vlan30 13000" .Dl "ipfw table 20 add vlan20 22000" .Dl "ipfw table 20 add vlan30 23000" .Dl ".." .Dl "ipfw add 100 ipfw skipto tablearg ip from any to any recv 'table(10)' in" .Dl "ipfw add 200 ipfw skipto tablearg ip from any to any xmit 'table(10)' out" .Ss SETS OF RULES To add a set of rules atomically, e.g.\& set 18: .Pp .Dl "ipfw set disable 18" .Dl "ipfw add NN set 18 ... # repeat as needed" .Dl "ipfw set enable 18" .Pp To delete a set of rules atomically the command is simply: .Pp .Dl "ipfw delete set 18" .Pp To test a ruleset and disable it and regain control if something goes wrong: .Pp .Dl "ipfw set disable 18" .Dl "ipfw add NN set 18 ... # repeat as needed" .Dl "ipfw set enable 18; echo done; sleep 30 && ipfw set disable 18" .Pp Here if everything goes well, you press control-C before the "sleep" terminates, and your ruleset will be left active. Otherwise, e.g.\& if you cannot access your box, the ruleset will be disabled after the sleep terminates thus restoring the previous situation. .Pp To show rules of the specific set: .Pp .Dl "ipfw set 18 show" .Pp To show rules of the disabled set: .Pp .Dl "ipfw -S set 18 show" .Pp To clear a specific rule counters of the specific set: .Pp .Dl "ipfw set 18 zero NN" .Pp To delete a specific rule of the specific set: .Pp .Dl "ipfw set 18 delete NN" .Ss NAT, REDIRECT AND LSNAT First redirect all the traffic to nat instance 123: .Pp .Dl "ipfw add nat 123 all from any to any" .Pp Then to configure nat instance 123 to alias all the outgoing traffic with ip 192.168.0.123, blocking all incoming connections, trying to keep same ports on both sides, clearing aliasing table on address change and keeping a log of traffic/link statistics: .Pp .Dl "ipfw nat 123 config ip 192.168.0.123 log deny_in reset same_ports" .Pp Or to change address of instance 123, aliasing table will be cleared (see reset option): .Pp .Dl "ipfw nat 123 config ip 10.0.0.1" .Pp To see configuration of nat instance 123: .Pp .Dl "ipfw nat 123 show config" .Pp To show logs of all the instances in range 111-999: .Pp .Dl "ipfw nat 111-999 show" .Pp To see configurations of all instances: .Pp .Dl "ipfw nat show config" .Pp Or a redirect rule with mixed modes could looks like: .Pp .Dl "ipfw nat 123 config redirect_addr 10.0.0.1 10.0.0.66" .Dl " redirect_port tcp 192.168.0.1:80 500" .Dl " redirect_proto udp 192.168.1.43 192.168.1.1" .Dl " redirect_addr 192.168.0.10,192.168.0.11" .Dl " 10.0.0.100 # LSNAT" .Dl " redirect_port tcp 192.168.0.1:80,192.168.0.10:22" .Dl " 500 # LSNAT" .Pp or it could be split in: .Pp .Dl "ipfw nat 1 config redirect_addr 10.0.0.1 10.0.0.66" .Dl "ipfw nat 2 config redirect_port tcp 192.168.0.1:80 500" .Dl "ipfw nat 3 config redirect_proto udp 192.168.1.43 192.168.1.1" .Dl "ipfw nat 4 config redirect_addr 192.168.0.10,192.168.0.11,192.168.0.12" .Dl " 10.0.0.100" .Dl "ipfw nat 5 config redirect_port tcp" .Dl " 192.168.0.1:80,192.168.0.10:22,192.168.0.20:25 500" .Sh SEE ALSO .Xr cpp 1 , .Xr m4 1 , .Xr altq 4 , .Xr divert 4 , .Xr dummynet 4 , .Xr if_bridge 4 , .Xr ip 4 , .Xr ipfirewall 4 , .Xr ng_ipfw 4 , .Xr protocols 5 , .Xr services 5 , .Xr init 8 , .Xr kldload 8 , .Xr reboot 8 , .Xr sysctl 8 , .Xr syslogd 8 .Sh HISTORY The .Nm utility first appeared in .Fx 2.0 . .Nm dummynet was introduced in .Fx 2.2.8 . Stateful extensions were introduced in .Fx 4.0 . .Nm ipfw2 was introduced in Summer 2002. .Sh AUTHORS .An Ugen J. S. Antsilevich , .An Poul-Henning Kamp , .An Alex Nash , .An Archie Cobbs , .An Luigi Rizzo . .Pp .An -nosplit API based upon code written by .An Daniel Boulet for BSDI. .Pp Dummynet has been introduced by Luigi Rizzo in 1997-1998. .Pp Some early work (1999-2000) on the .Nm dummynet traffic shaper supported by Akamba Corp. .Pp The ipfw core (ipfw2) has been completely redesigned and reimplemented by Luigi Rizzo in summer 2002. Further actions and options have been added by various developer over the years. .Pp .An -nosplit In-kernel NAT support written by .An Paolo Pisati Aq piso@FreeBSD.org as part of a Summer of Code 2005 project. .Pp SCTP .Nm nat support has been developed by .An The Centre for Advanced Internet Architectures (CAIA) Aq http://www.caia.swin.edu.au . The primary developers and maintainers are David Hayes and Jason But. For further information visit: .Aq http://www.caia.swin.edu.au/urp/SONATA .Pp Delay profiles have been developed by Alessandro Cerri and Luigi Rizzo, supported by the European Commission within Projects Onelab and Onelab2. .Sh BUGS The syntax has grown over the years and sometimes it might be confusing. Unfortunately, backward compatibility prevents cleaning up mistakes made in the definition of the syntax. .Pp .Em !!! WARNING !!! .Pp Misconfiguring the firewall can put your computer in an unusable state, possibly shutting down network services and requiring console access to regain control of it. .Pp Incoming packet fragments diverted by .Cm divert are reassembled before delivery to the socket. The action used on those packet is the one from the rule which matches the first fragment of the packet. .Pp Packets diverted to userland, and then reinserted by a userland process may lose various packet attributes. The packet source interface name will be preserved if it is shorter than 8 bytes and the userland process saves and reuses the sockaddr_in (as does .Xr natd 8 ) ; otherwise, it may be lost. If a packet is reinserted in this manner, later rules may be incorrectly applied, making the order of .Cm divert rules in the rule sequence very important. .Pp Dummynet drops all packets with IPv6 link-local addresses. .Pp Rules using .Cm uid or .Cm gid may not behave as expected. In particular, incoming SYN packets may have no uid or gid associated with them since they do not yet belong to a TCP connection, and the uid/gid associated with a packet may not be as expected if the associated process calls .Xr setuid 2 or similar system calls. .Pp Rule syntax is subject to the command line environment and some patterns may need to be escaped with the backslash character or quoted appropriately. .Pp Due to the architecture of .Xr libalias 3 , ipfw nat is not compatible with the TCP segmentation offloading (TSO). Thus, to reliably nat your network traffic, please disable TSO on your NICs using .Xr ifconfig 8 . .Pp ICMP error messages are not implicitly matched by dynamic rules for the respective conversations. To avoid failures of network error detection and path MTU discovery, ICMP error messages may need to be allowed explicitly through static rules. .Pp Rules using .Cm call and .Cm return actions may lead to confusing behaviour if ruleset has mistakes, and/or interaction with other subsystems (netgraph, dummynet, etc.) is used. One possible case for this is packet leaving .Nm in subroutine on the input pass, while later on output encountering unpaired .Cm return first. As the call stack is kept intact after input pass, packet will suddenly return to the rule number used on input pass, not on output one. Order of processing should be checked carefully to avoid such mistakes. ================================================ FILE: ipfw/ipfw2.c ================================================ /* * Copyright (c) 2002-2003 Luigi Rizzo * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * * Idea and grammar partially left from: * Copyright (c) 1993 Daniel Boulet * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. * * NEW command line interface for IP firewall facility * * $FreeBSD: head/sbin/ipfw/ipfw2.c 206843 2010-04-19 15:11:45Z luigi $ */ #include #include #include #include #include #include "ipfw2.h" #include #include #include #include #include #include #include #include #include #include #include /* ctime */ #include /* _long_to_time */ #include #include #include /* offsetof */ #include #include /* only IFNAMSIZ */ #include #include /* only n_short, n_long */ #include #include #include #include #include struct cmdline_opts co; /* global options */ int resvd_set_number = RESVD_SET; int ipfw_socket = -1; #ifndef s6_addr32 #define s6_addr32 __u6_addr.__u6_addr32 #endif #define GET_UINT_ARG(arg, min, max, tok, s_x) do { \ if (!av[0]) \ errx(EX_USAGE, "%s: missing argument", match_value(s_x, tok)); \ if (_substrcmp(*av, "tablearg") == 0) { \ arg = IP_FW_TABLEARG; \ break; \ } \ \ { \ long _xval; \ char *end; \ \ _xval = strtol(*av, &end, 10); \ \ if (!isdigit(**av) || *end != '\0' || (_xval == 0 && errno == EINVAL)) \ errx(EX_DATAERR, "%s: invalid argument: %s", \ match_value(s_x, tok), *av); \ \ if (errno == ERANGE || _xval < min || _xval > max) \ errx(EX_DATAERR, "%s: argument is out of range (%u..%u): %s", \ match_value(s_x, tok), min, max, *av); \ \ if (_xval == IP_FW_TABLEARG) \ errx(EX_DATAERR, "%s: illegal argument value: %s", \ match_value(s_x, tok), *av); \ arg = _xval; \ } \ } while (0) static void PRINT_UINT_ARG(const char *str, uint32_t arg) { if (str != NULL) printf("%s",str); if (arg == IP_FW_TABLEARG) printf("tablearg"); else printf("%u", arg); } static struct _s_x f_tcpflags[] = { { "syn", TH_SYN }, { "fin", TH_FIN }, { "ack", TH_ACK }, { "psh", TH_PUSH }, { "rst", TH_RST }, { "urg", TH_URG }, { "tcp flag", 0 }, { NULL, 0 } }; static struct _s_x f_tcpopts[] = { { "mss", IP_FW_TCPOPT_MSS }, { "maxseg", IP_FW_TCPOPT_MSS }, { "window", IP_FW_TCPOPT_WINDOW }, { "sack", IP_FW_TCPOPT_SACK }, { "ts", IP_FW_TCPOPT_TS }, { "timestamp", IP_FW_TCPOPT_TS }, { "cc", IP_FW_TCPOPT_CC }, { "tcp option", 0 }, { NULL, 0 } }; /* * IP options span the range 0 to 255 so we need to remap them * (though in fact only the low 5 bits are significant). */ static struct _s_x f_ipopts[] = { { "ssrr", IP_FW_IPOPT_SSRR}, { "lsrr", IP_FW_IPOPT_LSRR}, { "rr", IP_FW_IPOPT_RR}, { "ts", IP_FW_IPOPT_TS}, { "ip option", 0 }, { NULL, 0 } }; static struct _s_x f_iptos[] = { { "lowdelay", IPTOS_LOWDELAY}, { "throughput", IPTOS_THROUGHPUT}, { "reliability", IPTOS_RELIABILITY}, { "mincost", IPTOS_MINCOST}, { "congestion", IPTOS_ECN_CE}, { "ecntransport", IPTOS_ECN_ECT0}, { "ip tos option", 0}, { NULL, 0 } }; static struct _s_x limit_masks[] = { {"all", DYN_SRC_ADDR|DYN_SRC_PORT|DYN_DST_ADDR|DYN_DST_PORT}, {"src-addr", DYN_SRC_ADDR}, {"src-port", DYN_SRC_PORT}, {"dst-addr", DYN_DST_ADDR}, {"dst-port", DYN_DST_PORT}, {NULL, 0} }; /* * we use IPPROTO_ETHERTYPE as a fake protocol id to call the print routines * This is only used in this code. */ #define IPPROTO_ETHERTYPE 0x1000 static struct _s_x ether_types[] = { /* * Note, we cannot use "-:&/" in the names because they are field * separators in the type specifications. Also, we use s = NULL as * end-delimiter, because a type of 0 can be legal. */ { "ip", 0x0800 }, { "ipv4", 0x0800 }, { "ipv6", 0x86dd }, { "arp", 0x0806 }, { "rarp", 0x8035 }, { "vlan", 0x8100 }, { "loop", 0x9000 }, { "trail", 0x1000 }, { "at", 0x809b }, { "atalk", 0x809b }, { "aarp", 0x80f3 }, { "pppoe_disc", 0x8863 }, { "pppoe_sess", 0x8864 }, { "ipx_8022", 0x00E0 }, { "ipx_8023", 0x0000 }, { "ipx_ii", 0x8137 }, { "ipx_snap", 0x8137 }, { "ipx", 0x8137 }, { "ns", 0x0600 }, { NULL, 0 } }; static struct _s_x rule_actions[] = { { "accept", TOK_ACCEPT }, { "pass", TOK_ACCEPT }, { "allow", TOK_ACCEPT }, { "permit", TOK_ACCEPT }, { "count", TOK_COUNT }, { "pipe", TOK_PIPE }, { "queue", TOK_QUEUE }, { "divert", TOK_DIVERT }, { "tee", TOK_TEE }, { "netgraph", TOK_NETGRAPH }, { "ngtee", TOK_NGTEE }, { "fwd", TOK_FORWARD }, { "forward", TOK_FORWARD }, { "skipto", TOK_SKIPTO }, { "deny", TOK_DENY }, { "drop", TOK_DENY }, { "reject", TOK_REJECT }, { "reset6", TOK_RESET6 }, { "reset", TOK_RESET }, { "unreach6", TOK_UNREACH6 }, { "unreach", TOK_UNREACH }, { "check-state", TOK_CHECKSTATE }, { "//", TOK_COMMENT }, { "nat", TOK_NAT }, { "reass", TOK_REASS }, { "setfib", TOK_SETFIB }, { "call", TOK_CALL }, { "return", TOK_RETURN }, { NULL, 0 } /* terminator */ }; static struct _s_x rule_action_params[] = { { "altq", TOK_ALTQ }, { "log", TOK_LOG }, { "tag", TOK_TAG }, { "untag", TOK_UNTAG }, { NULL, 0 } /* terminator */ }; /* * The 'lookup' instruction accepts one of the following arguments. * -1 is a terminator for the list. * Arguments are passed as v[1] in O_DST_LOOKUP options. */ static int lookup_key[] = { TOK_DSTIP, TOK_SRCIP, TOK_DSTPORT, TOK_SRCPORT, TOK_UID, TOK_JAIL, TOK_DSCP, -1 }; static struct _s_x rule_options[] = { { "tagged", TOK_TAGGED }, { "uid", TOK_UID }, { "gid", TOK_GID }, { "jail", TOK_JAIL }, { "in", TOK_IN }, { "limit", TOK_LIMIT }, { "keep-state", TOK_KEEPSTATE }, { "bridged", TOK_LAYER2 }, { "layer2", TOK_LAYER2 }, { "out", TOK_OUT }, { "diverted", TOK_DIVERTED }, { "diverted-loopback", TOK_DIVERTEDLOOPBACK }, { "diverted-output", TOK_DIVERTEDOUTPUT }, { "xmit", TOK_XMIT }, { "recv", TOK_RECV }, { "via", TOK_VIA }, { "fragment", TOK_FRAG }, { "frag", TOK_FRAG }, { "fib", TOK_FIB }, { "ipoptions", TOK_IPOPTS }, { "ipopts", TOK_IPOPTS }, { "iplen", TOK_IPLEN }, { "ipid", TOK_IPID }, { "ipprecedence", TOK_IPPRECEDENCE }, { "dscp", TOK_DSCP }, { "iptos", TOK_IPTOS }, { "ipttl", TOK_IPTTL }, { "ipversion", TOK_IPVER }, { "ipver", TOK_IPVER }, { "estab", TOK_ESTAB }, { "established", TOK_ESTAB }, { "setup", TOK_SETUP }, { "sockarg", TOK_SOCKARG }, { "tcpdatalen", TOK_TCPDATALEN }, { "tcpflags", TOK_TCPFLAGS }, { "tcpflgs", TOK_TCPFLAGS }, { "tcpoptions", TOK_TCPOPTS }, { "tcpopts", TOK_TCPOPTS }, { "tcpseq", TOK_TCPSEQ }, { "tcpack", TOK_TCPACK }, { "tcpwin", TOK_TCPWIN }, { "icmptype", TOK_ICMPTYPES }, { "icmptypes", TOK_ICMPTYPES }, { "dst-ip", TOK_DSTIP }, { "src-ip", TOK_SRCIP }, { "dst-port", TOK_DSTPORT }, { "src-port", TOK_SRCPORT }, { "proto", TOK_PROTO }, { "MAC", TOK_MAC }, { "mac", TOK_MAC }, { "mac-type", TOK_MACTYPE }, { "verrevpath", TOK_VERREVPATH }, { "versrcreach", TOK_VERSRCREACH }, { "antispoof", TOK_ANTISPOOF }, { "ipsec", TOK_IPSEC }, { "icmp6type", TOK_ICMP6TYPES }, { "icmp6types", TOK_ICMP6TYPES }, { "ext6hdr", TOK_EXT6HDR}, { "flow-id", TOK_FLOWID}, { "ipv6", TOK_IPV6}, { "ip6", TOK_IPV6}, { "ipv4", TOK_IPV4}, { "ip4", TOK_IPV4}, { "dst-ipv6", TOK_DSTIP6}, { "dst-ip6", TOK_DSTIP6}, { "src-ipv6", TOK_SRCIP6}, { "src-ip6", TOK_SRCIP6}, { "lookup", TOK_LOOKUP}, { "//", TOK_COMMENT }, { "not", TOK_NOT }, /* pseudo option */ { "!", /* escape ? */ TOK_NOT }, /* pseudo option */ { "or", TOK_OR }, /* pseudo option */ { "|", /* escape */ TOK_OR }, /* pseudo option */ { "{", TOK_STARTBRACE }, /* pseudo option */ { "(", TOK_STARTBRACE }, /* pseudo option */ { "}", TOK_ENDBRACE }, /* pseudo option */ { ")", TOK_ENDBRACE }, /* pseudo option */ { NULL, 0 } /* terminator */ }; /* * Helper routine to print a possibly unaligned uint64_t on * various platform. If width > 0, print the value with * the desired width, followed by a space; * otherwise, return the required width. */ int pr_u64(uint64_t *pd, int width) { #ifdef TCC #define U64_FMT "I64" #else #define U64_FMT "llu" #endif uint64_t u; unsigned long long d; bcopy (pd, &u, sizeof(u)); d = u; return (width > 0) ? printf("%*" U64_FMT " ", width, d) : snprintf(NULL, 0, "%" U64_FMT, d) ; #undef U64_FMT } void * safe_calloc(size_t number, size_t size) { void *ret = calloc(number, size); if (ret == NULL) err(EX_OSERR, "calloc"); return ret; } void * safe_realloc(void *ptr, size_t size) { void *ret = realloc(ptr, size); if (ret == NULL) err(EX_OSERR, "realloc"); return ret; } /* * conditionally runs the command. * Selected options or negative -> getsockopt */ int do_cmd(int optname, void *optval, uintptr_t optlen) { int i; if (co.test_only) return 0; if (ipfw_socket == -1) ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); if (ipfw_socket < 0) err(EX_UNAVAILABLE, "socket"); if (optname == IP_FW_GET || optname == IP_DUMMYNET_GET || optname == IP_FW_ADD || optname == IP_FW_TABLE_LIST || optname == IP_FW_TABLE_GETSIZE || optname == IP_FW_NAT_GET_CONFIG || optname < 0 || optname == IP_FW_NAT_GET_LOG) { if (optname < 0) optname = -optname; i = getsockopt(ipfw_socket, IPPROTO_IP, optname, optval, (socklen_t *)optlen); } else { i = setsockopt(ipfw_socket, IPPROTO_IP, optname, optval, optlen); } return i; } #if 0 // XXX still unused /* * do_setcmd3 - pass ipfw control cmd to kernel * @optname: option name * @optval: pointer to option data * @optlen: option length * * Function encapsulates option value in IP_FW3 socket option * and calls setsockopt(). * Function returns 0 on success or -1 otherwise. */ static int do_setcmd3(int optname, void *optval, socklen_t optlen) { socklen_t len; ip_fw3_opheader *op3; if (co.test_only) return (0); if (ipfw_socket == -1) ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); if (ipfw_socket < 0) err(EX_UNAVAILABLE, "socket"); len = sizeof(ip_fw3_opheader) + optlen; op3 = alloca(len); /* Zero reserved fields */ memset(op3, 0, sizeof(ip_fw3_opheader)); memcpy(op3 + 1, optval, optlen); op3->opcode = optname; return setsockopt(ipfw_socket, IPPROTO_IP, IP_FW3, op3, len); } #endif // XXX still unused /** * match_token takes a table and a string, returns the value associated * with the string (-1 in case of failure). */ int match_token(struct _s_x *table, char *string) { struct _s_x *pt; uint i = strlen(string); for (pt = table ; i && pt->s != NULL ; pt++) if (strlen(pt->s) == i && !bcmp(string, pt->s, i)) return pt->x; return -1; } /** * match_value takes a table and a value, returns the string associated * with the value (NULL in case of failure). */ char const * match_value(struct _s_x *p, int value) { for (; p->s != NULL; p++) if (p->x == value) return p->s; return NULL; } /* * _substrcmp takes two strings and returns 1 if they do not match, * and 0 if they match exactly or the first string is a sub-string * of the second. A warning is printed to stderr in the case that the * first string is a sub-string of the second. * * This function will be removed in the future through the usual * deprecation process. */ int _substrcmp(const char *str1, const char* str2) { if (strncmp(str1, str2, strlen(str1)) != 0) return 1; if (strlen(str1) != strlen(str2)) warnx("DEPRECATED: '%s' matched '%s' as a sub-string", str1, str2); return 0; } /* * _substrcmp2 takes three strings and returns 1 if the first two do not match, * and 0 if they match exactly or the second string is a sub-string * of the first. A warning is printed to stderr in the case that the * first string does not match the third. * * This function exists to warn about the bizarre construction * strncmp(str, "by", 2) which is used to allow people to use a shortcut * for "bytes". The problem is that in addition to accepting "by", * "byt", "byte", and "bytes", it also excepts "by_rabid_dogs" and any * other string beginning with "by". * * This function will be removed in the future through the usual * deprecation process. */ int _substrcmp2(const char *str1, const char* str2, const char* str3) { if (strncmp(str1, str2, strlen(str2)) != 0) return 1; if (strcmp(str1, str3) != 0) warnx("DEPRECATED: '%s' matched '%s'", str1, str3); return 0; } /* * prints one port, symbolic or numeric */ static void print_port(int proto, uint16_t port) { if (proto == IPPROTO_ETHERTYPE) { char const *s; if (co.do_resolv && (s = match_value(ether_types, port)) ) printf("%s", s); else printf("0x%04x", port); } else { struct servent *se = NULL; if (co.do_resolv) { struct protoent *pe = getprotobynumber(proto); se = getservbyport(htons(port), pe ? pe->p_name : NULL); } if (se) printf("%s", se->s_name); else printf("%d", port); } } static struct _s_x _port_name[] = { {"dst-port", O_IP_DSTPORT}, {"src-port", O_IP_SRCPORT}, {"ipid", O_IPID}, {"iplen", O_IPLEN}, {"ipttl", O_IPTTL}, {"mac-type", O_MAC_TYPE}, {"tcpdatalen", O_TCPDATALEN}, {"tcpwin", O_TCPWIN}, {"tagged", O_TAGGED}, {NULL, 0} }; /* * Print the values in a list 16-bit items of the types above. * XXX todo: add support for mask. */ static void print_newports(ipfw_insn_u16 *cmd, int proto, int opcode) { uint16_t *p = cmd->ports; int i; char const *sep; if (opcode != 0) { sep = match_value(_port_name, opcode); if (sep == NULL) sep = "???"; printf (" %s", sep); } sep = " "; for (i = F_LEN((ipfw_insn *)cmd) - 1; i > 0; i--, p += 2) { printf("%s", sep); print_port(proto, p[0]); if (p[0] != p[1]) { printf("-"); print_port(proto, p[1]); } sep = ","; } } /* * Like strtol, but also translates service names into port numbers * for some protocols. * In particular: * proto == -1 disables the protocol check; * proto == IPPROTO_ETHERTYPE looks up an internal table * proto == matches the values there. * Returns *end == s in case the parameter is not found. */ static int strtoport(char *s, char **end, int base, int proto) { char *p, *buf; char *s1; int i; *end = s; /* default - not found */ if (*s == '\0') return 0; /* not found */ if (isdigit(*s)) return strtol(s, end, base); /* * find separator. '\\' escapes the next char. */ for (s1 = s; *s1 && (isalnum(*s1) || *s1 == '\\') ; s1++) if (*s1 == '\\' && s1[1] != '\0') s1++; buf = safe_calloc(s1 - s + 1, 1); /* * copy into a buffer skipping backslashes */ for (p = s, i = 0; p != s1 ; p++) if (*p != '\\') buf[i++] = *p; buf[i++] = '\0'; if (proto == IPPROTO_ETHERTYPE) { i = match_token(ether_types, buf); free(buf); if (i != -1) { /* found */ *end = s1; return i; } } else { struct protoent *pe = NULL; struct servent *se; if (proto != 0) pe = getprotobynumber(proto); setservent(1); se = getservbyname(buf, pe ? pe->p_name : NULL); free(buf); if (se != NULL) { *end = s1; return ntohs(se->s_port); } } return 0; /* not found */ } /* * Fill the body of the command with the list of port ranges. */ static int fill_newports(ipfw_insn_u16 *cmd, char *av, int proto) { uint16_t a, b, *p = cmd->ports; int i = 0; char *s = av; while (*s) { a = strtoport(av, &s, 0, proto); if (s == av) /* empty or invalid argument */ return (0); switch (*s) { case '-': /* a range */ av = s + 1; b = strtoport(av, &s, 0, proto); /* Reject expressions like '1-abc' or '1-2-3'. */ if (s == av || (*s != ',' && *s != '\0')) return (0); p[0] = a; p[1] = b; break; case ',': /* comma separated list */ case '\0': p[0] = p[1] = a; break; default: warnx("port list: invalid separator <%c> in <%s>", *s, av); return (0); } i++; p += 2; av = s + 1; } if (i > 0) { if (i + 1 > F_LEN_MASK) errx(EX_DATAERR, "too many ports/ranges\n"); cmd->o.len |= i + 1; /* leave F_NOT and F_OR untouched */ } return (i); } static struct _s_x icmpcodes[] = { { "net", ICMP_UNREACH_NET }, { "host", ICMP_UNREACH_HOST }, { "protocol", ICMP_UNREACH_PROTOCOL }, { "port", ICMP_UNREACH_PORT }, { "needfrag", ICMP_UNREACH_NEEDFRAG }, { "srcfail", ICMP_UNREACH_SRCFAIL }, { "net-unknown", ICMP_UNREACH_NET_UNKNOWN }, { "host-unknown", ICMP_UNREACH_HOST_UNKNOWN }, { "isolated", ICMP_UNREACH_ISOLATED }, { "net-prohib", ICMP_UNREACH_NET_PROHIB }, { "host-prohib", ICMP_UNREACH_HOST_PROHIB }, { "tosnet", ICMP_UNREACH_TOSNET }, { "toshost", ICMP_UNREACH_TOSHOST }, { "filter-prohib", ICMP_UNREACH_FILTER_PROHIB }, { "host-precedence", ICMP_UNREACH_HOST_PRECEDENCE }, { "precedence-cutoff", ICMP_UNREACH_PRECEDENCE_CUTOFF }, { NULL, 0 } }; static void fill_reject_code(u_short *codep, char *str) { int val; char *s; val = strtoul(str, &s, 0); if (s == str || *s != '\0' || val >= 0x100) val = match_token(icmpcodes, str); if (val < 0) errx(EX_DATAERR, "unknown ICMP unreachable code ``%s''", str); *codep = val; return; } static void print_reject_code(uint16_t code) { char const *s = match_value(icmpcodes, code); if (s != NULL) printf("unreach %s", s); else printf("unreach %u", code); } /* * Returns the number of bits set (from left) in a contiguous bitmask, * or -1 if the mask is not contiguous. * XXX this needs a proper fix. * This effectively works on masks in big-endian (network) format. * when compiled on little endian architectures. * * First bit is bit 7 of the first byte -- note, for MAC addresses, * the first bit on the wire is bit 0 of the first byte. * len is the max length in bits. */ int contigmask(uint8_t *p, int len) { int i, n; for (i=0; iarg1 & 0xff; uint8_t clear = (cmd->arg1 >> 8) & 0xff; if (list == f_tcpflags && set == TH_SYN && clear == TH_ACK) { printf(" setup"); return; } printf(" %s ", name); for (i=0; list[i].x != 0; i++) { if (set & list[i].x) { set &= ~list[i].x; printf("%s%s", comma, list[i].s); comma = ","; } if (clear & list[i].x) { clear &= ~list[i].x; printf("%s!%s", comma, list[i].s); comma = ","; } } } /* * Print the ip address contained in a command. */ static void print_ip(ipfw_insn_ip *cmd, char const *s) { struct hostent *he = NULL; uint32_t len = F_LEN((ipfw_insn *)cmd); uint32_t *a = ((ipfw_insn_u32 *)cmd)->d; if (cmd->o.opcode == O_IP_DST_LOOKUP && len > F_INSN_SIZE(ipfw_insn_u32)) { uint32_t d = a[1]; const char *arg = ""; if (d < sizeof(lookup_key)/sizeof(lookup_key[0])) arg = match_value(rule_options, lookup_key[d]); printf("%s lookup %s %d", cmd->o.len & F_NOT ? " not": "", arg, cmd->o.arg1); return; } printf("%s%s ", cmd->o.len & F_NOT ? " not": "", s); if (cmd->o.opcode == O_IP_SRC_ME || cmd->o.opcode == O_IP_DST_ME) { printf("me"); return; } if (cmd->o.opcode == O_IP_SRC_LOOKUP || cmd->o.opcode == O_IP_DST_LOOKUP) { printf("table(%u", ((ipfw_insn *)cmd)->arg1); if (len == F_INSN_SIZE(ipfw_insn_u32)) printf(",%u", *a); printf(")"); return; } if (cmd->o.opcode == O_IP_SRC_SET || cmd->o.opcode == O_IP_DST_SET) { uint32_t x, *map = (uint32_t *)&(cmd->mask); int i, j; char comma = '{'; x = cmd->o.arg1 - 1; x = htonl( ~x ); cmd->addr.s_addr = htonl(cmd->addr.s_addr); printf("%s/%d", inet_ntoa(cmd->addr), contigmask((uint8_t *)&x, 32)); x = cmd->addr.s_addr = htonl(cmd->addr.s_addr); x &= 0xff; /* base */ /* * Print bits and ranges. * Locate first bit set (i), then locate first bit unset (j). * If we have 3+ consecutive bits set, then print them as a * range, otherwise only print the initial bit and rescan. */ for (i=0; i < cmd->o.arg1; i++) if (map[i/32] & (1<<(i & 31))) { for (j=i+1; j < cmd->o.arg1; j++) if (!(map[ j/32] & (1<<(j & 31)))) break; printf("%c%d", comma, i+x); if (j>i+2) { /* range has at least 3 elements */ printf("-%d", j-1+x); i = j-1; } comma = ','; } printf("}"); return; } /* * len == 2 indicates a single IP, whereas lists of 1 or more * addr/mask pairs have len = (2n+1). We convert len to n so we * use that to count the number of entries. */ for (len = len / 2; len > 0; len--, a += 2) { int mb = /* mask length */ (cmd->o.opcode == O_IP_SRC || cmd->o.opcode == O_IP_DST) ? 32 : contigmask((uint8_t *)&(a[1]), 32); if (mb == 32 && co.do_resolv) he = gethostbyaddr((char *)&(a[0]), sizeof(u_long), AF_INET); if (he != NULL) /* resolved to name */ printf("%s", he->h_name); else if (mb == 0) /* any */ printf("any"); else { /* numeric IP followed by some kind of mask */ printf("%s", inet_ntoa( *((struct in_addr *)&a[0]) ) ); if (mb < 0) printf(":%s", inet_ntoa( *((struct in_addr *)&a[1]) ) ); else if (mb < 32) printf("/%d", mb); } if (len > 1) printf(","); } } /* * prints a MAC address/mask pair */ static void print_mac(uint8_t *addr, uint8_t *mask) { int l = contigmask(mask, 48); if (l == 0) printf(" any"); else { printf(" %02x:%02x:%02x:%02x:%02x:%02x", addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); if (l == -1) printf("&%02x:%02x:%02x:%02x:%02x:%02x", mask[0], mask[1], mask[2], mask[3], mask[4], mask[5]); else if (l < 48) printf("/%d", l); } } static void fill_icmptypes(ipfw_insn_u32 *cmd, char *av) { uint8_t type; cmd->d[0] = 0; while (*av) { if (*av == ',') av++; type = strtoul(av, &av, 0); if (*av != ',' && *av != '\0') errx(EX_DATAERR, "invalid ICMP type"); if (type > 31) errx(EX_DATAERR, "ICMP type out of range"); cmd->d[0] |= 1 << type; } cmd->o.opcode = O_ICMPTYPE; cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); } static void print_icmptypes(ipfw_insn_u32 *cmd) { int i; char sep= ' '; printf(" icmptypes"); for (i = 0; i < 32; i++) { if ( (cmd->d[0] & (1 << (i))) == 0) continue; printf("%c%d", sep, i); sep = ','; } } /* * show_ipfw() prints the body of an ipfw rule. * Because the standard rule has at least proto src_ip dst_ip, we use * a helper function to produce these entries if not provided explicitly. * The first argument is the list of fields we have, the second is * the list of fields we want to be printed. * * Special cases if we have provided a MAC header: * + if the rule does not contain IP addresses/ports, do not print them; * + if the rule does not contain an IP proto, print "all" instead of "ip"; * * Once we have 'have_options', IP header fields are printed as options. */ #define HAVE_PROTO 0x0001 #define HAVE_SRCIP 0x0002 #define HAVE_DSTIP 0x0004 #define HAVE_PROTO4 0x0008 #define HAVE_PROTO6 0x0010 #define HAVE_IP 0x0100 #define HAVE_OPTIONS 0x8000 static void show_prerequisites(int *flags, int want, int cmd) { (void)cmd; /* UNUSED */ if (co.comment_only) return; if ( (*flags & HAVE_IP) == HAVE_IP) *flags |= HAVE_OPTIONS; if ( !(*flags & HAVE_OPTIONS)) { if ( !(*flags & HAVE_PROTO) && (want & HAVE_PROTO)) { if ( (*flags & HAVE_PROTO4)) printf(" ip4"); else if ( (*flags & HAVE_PROTO6)) printf(" ip6"); else printf(" ip"); } if ( !(*flags & HAVE_SRCIP) && (want & HAVE_SRCIP)) printf(" from any"); if ( !(*flags & HAVE_DSTIP) && (want & HAVE_DSTIP)) printf(" to any"); } *flags |= want; } static void show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth) { static int twidth = 0; int l; ipfw_insn *cmd, *tagptr = NULL; const char *comment = NULL; /* ptr to comment if we have one */ int proto = 0; /* default */ int flags = 0; /* prerequisites */ ipfw_insn_log *logptr = NULL; /* set if we find an O_LOG */ ipfw_insn_altq *altqptr = NULL; /* set if we find an O_ALTQ */ int or_block = 0; /* we are in an or block */ uint32_t set_disable; bcopy(&rule->next_rule, &set_disable, sizeof(set_disable)); if (set_disable & (1 << rule->set)) { /* disabled */ if (!co.show_sets) return; else printf("# DISABLED "); } printf("%05u ", rule->rulenum); if (pcwidth > 0 || bcwidth > 0) { pr_u64(&rule->pcnt, pcwidth); pr_u64(&rule->bcnt, bcwidth); } if (co.do_time == 2) printf("%10u ", rule->timestamp); else if (co.do_time == 1) { char timestr[30]; time_t t = (time_t)0; if (twidth == 0) { strcpy(timestr, ctime(&t)); *strchr(timestr, '\n') = '\0'; twidth = strlen(timestr); } if (rule->timestamp) { t = _long_to_time(rule->timestamp); strcpy(timestr, ctime(&t)); *strchr(timestr, '\n') = '\0'; printf("%s ", timestr); } else { printf("%*s", twidth, " "); } } if (co.show_sets) printf("set %d ", rule->set); /* * print the optional "match probability" */ if (rule->cmd_len > 0) { cmd = rule->cmd ; if (cmd->opcode == O_PROB) { ipfw_insn_u32 *p = (ipfw_insn_u32 *)cmd; double d = 1.0 * p->d[0]; d = (d / 0x7fffffff); printf("prob %f ", d); } } /* * first print actions */ for (l = rule->cmd_len - rule->act_ofs, cmd = ACTION_PTR(rule); l > 0 ; l -= F_LEN(cmd), cmd += F_LEN(cmd)) { switch(cmd->opcode) { case O_CHECK_STATE: printf("check-state"); /* avoid printing anything else */ flags = HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP | HAVE_IP; break; case O_ACCEPT: printf("allow"); break; case O_COUNT: printf("count"); break; case O_DENY: printf("deny"); break; case O_REJECT: if (cmd->arg1 == ICMP_REJECT_RST) printf("reset"); else if (cmd->arg1 == ICMP_UNREACH_HOST) printf("reject"); else print_reject_code(cmd->arg1); break; case O_UNREACH6: if (cmd->arg1 == ICMP6_UNREACH_RST) printf("reset6"); else print_unreach6_code(cmd->arg1); break; case O_SKIPTO: PRINT_UINT_ARG("skipto ", cmd->arg1); break; case O_PIPE: PRINT_UINT_ARG("pipe ", cmd->arg1); break; case O_QUEUE: PRINT_UINT_ARG("queue ", cmd->arg1); break; case O_DIVERT: PRINT_UINT_ARG("divert ", cmd->arg1); break; case O_TEE: PRINT_UINT_ARG("tee ", cmd->arg1); break; case O_NETGRAPH: PRINT_UINT_ARG("netgraph ", cmd->arg1); break; case O_NGTEE: PRINT_UINT_ARG("ngtee ", cmd->arg1); break; case O_FORWARD_IP: { ipfw_insn_sa *s = (ipfw_insn_sa *)cmd; if (s->sa.sin_addr.s_addr == INADDR_ANY) { printf("fwd tablearg"); } else { printf("fwd %s", inet_ntoa(s->sa.sin_addr)); } if (s->sa.sin_port) printf(",%d", s->sa.sin_port); } break; #if 0 // XXX unused yet case O_FORWARD_IP6: { char buf[4 + INET6_ADDRSTRLEN + 1]; ipfw_insn_sa6 *s = (ipfw_insn_sa6 *)cmd; printf("fwd %s", inet_ntop(AF_INET6, &s->sa.sin6_addr, buf, sizeof(buf))); if (s->sa.sin6_port) printf(",%d", s->sa.sin6_port); } break; #endif // XXX unused yet case O_LOG: /* O_LOG is printed last */ logptr = (ipfw_insn_log *)cmd; break; case O_ALTQ: /* O_ALTQ is printed after O_LOG */ altqptr = (ipfw_insn_altq *)cmd; break; case O_TAG: tagptr = cmd; break; case O_NAT: if (cmd->arg1 != 0) PRINT_UINT_ARG("nat ", cmd->arg1); else printf("nat global"); break; case O_SETFIB: PRINT_UINT_ARG("setfib ", cmd->arg1); break; case O_REASS: printf("reass"); break; case O_CALLRETURN: if (cmd->len & F_NOT) printf("return"); else PRINT_UINT_ARG("call ", cmd->arg1); break; default: printf("** unrecognized action %d len %d ", cmd->opcode, cmd->len); } } if (logptr) { if (logptr->max_log > 0) printf(" log logamount %d", logptr->max_log); else printf(" log"); } #ifndef NO_ALTQ if (altqptr) { print_altq_cmd(altqptr); } #endif if (tagptr) { if (tagptr->len & F_NOT) PRINT_UINT_ARG(" untag ", tagptr->arg1); else PRINT_UINT_ARG(" tag ", tagptr->arg1); } /* * then print the body. */ for (l = rule->act_ofs, cmd = rule->cmd ; l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) { if ((cmd->len & F_OR) || (cmd->len & F_NOT)) continue; if (cmd->opcode == O_IP4) { flags |= HAVE_PROTO4; break; } else if (cmd->opcode == O_IP6) { flags |= HAVE_PROTO6; break; } } if (rule->_pad & 1) { /* empty rules before options */ if (!co.do_compact) { show_prerequisites(&flags, HAVE_PROTO, 0); printf(" from any to any"); } flags |= HAVE_IP | HAVE_OPTIONS | HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP; } if (co.comment_only) comment = "..."; for (l = rule->act_ofs, cmd = rule->cmd ; l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) { /* useful alias */ ipfw_insn_u32 *cmd32 = (ipfw_insn_u32 *)cmd; if (co.comment_only) { if (cmd->opcode != O_NOP) continue; printf(" // %s\n", (char *)(cmd + 1)); return; } show_prerequisites(&flags, 0, cmd->opcode); switch(cmd->opcode) { case O_PROB: break; /* done already */ case O_PROBE_STATE: break; /* no need to print anything here */ case O_IP_SRC: case O_IP_SRC_LOOKUP: case O_IP_SRC_MASK: case O_IP_SRC_ME: case O_IP_SRC_SET: show_prerequisites(&flags, HAVE_PROTO, 0); if (!(flags & HAVE_SRCIP)) printf(" from"); if ((cmd->len & F_OR) && !or_block) printf(" {"); print_ip((ipfw_insn_ip *)cmd, (flags & HAVE_OPTIONS) ? " src-ip" : ""); flags |= HAVE_SRCIP; break; case O_IP_DST: case O_IP_DST_LOOKUP: case O_IP_DST_MASK: case O_IP_DST_ME: case O_IP_DST_SET: show_prerequisites(&flags, HAVE_PROTO|HAVE_SRCIP, 0); if (!(flags & HAVE_DSTIP)) printf(" to"); if ((cmd->len & F_OR) && !or_block) printf(" {"); print_ip((ipfw_insn_ip *)cmd, (flags & HAVE_OPTIONS) ? " dst-ip" : ""); flags |= HAVE_DSTIP; break; case O_IP6_SRC: case O_IP6_SRC_MASK: case O_IP6_SRC_ME: show_prerequisites(&flags, HAVE_PROTO, 0); if (!(flags & HAVE_SRCIP)) printf(" from"); if ((cmd->len & F_OR) && !or_block) printf(" {"); print_ip6((ipfw_insn_ip6 *)cmd, (flags & HAVE_OPTIONS) ? " src-ip6" : ""); flags |= HAVE_SRCIP | HAVE_PROTO; break; case O_IP6_DST: case O_IP6_DST_MASK: case O_IP6_DST_ME: show_prerequisites(&flags, HAVE_PROTO|HAVE_SRCIP, 0); if (!(flags & HAVE_DSTIP)) printf(" to"); if ((cmd->len & F_OR) && !or_block) printf(" {"); print_ip6((ipfw_insn_ip6 *)cmd, (flags & HAVE_OPTIONS) ? " dst-ip6" : ""); flags |= HAVE_DSTIP; break; case O_FLOW6ID: print_flow6id( (ipfw_insn_u32 *) cmd ); flags |= HAVE_OPTIONS; break; case O_IP_DSTPORT: show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP | HAVE_IP, 0); case O_IP_SRCPORT: if (flags & HAVE_DSTIP) flags |= HAVE_IP; show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP, 0); if ((cmd->len & F_OR) && !or_block) printf(" {"); if (cmd->len & F_NOT) printf(" not"); print_newports((ipfw_insn_u16 *)cmd, proto, (flags & HAVE_OPTIONS) ? cmd->opcode : 0); break; case O_PROTO: { struct protoent *pe = NULL; if ((cmd->len & F_OR) && !or_block) printf(" {"); if (cmd->len & F_NOT) printf(" not"); proto = cmd->arg1; pe = getprotobynumber(cmd->arg1); if ((flags & (HAVE_PROTO4 | HAVE_PROTO6)) && !(flags & HAVE_PROTO)) show_prerequisites(&flags, HAVE_PROTO | HAVE_IP | HAVE_SRCIP | HAVE_DSTIP | HAVE_OPTIONS, 0); if (flags & HAVE_OPTIONS) printf(" proto"); if (pe) printf(" %s", pe->p_name); else printf(" %u", cmd->arg1); } flags |= HAVE_PROTO; break; default: /*options ... */ if (!(cmd->len & (F_OR|F_NOT))) if (((cmd->opcode == O_IP6) && (flags & HAVE_PROTO6)) || ((cmd->opcode == O_IP4) && (flags & HAVE_PROTO4))) break; show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP | HAVE_IP | HAVE_OPTIONS, 0); if ((cmd->len & F_OR) && !or_block) printf(" {"); if (cmd->len & F_NOT && cmd->opcode != O_IN) printf(" not"); switch(cmd->opcode) { case O_MACADDR2: { ipfw_insn_mac *m = (ipfw_insn_mac *)cmd; printf(" MAC"); print_mac(m->addr, m->mask); print_mac(m->addr + 6, m->mask + 6); } break; case O_MAC_TYPE: print_newports((ipfw_insn_u16 *)cmd, IPPROTO_ETHERTYPE, cmd->opcode); break; case O_FRAG: printf(" frag"); break; case O_FIB: printf(" fib %u", cmd->arg1 ); break; case O_SOCKARG: printf(" sockarg"); break; case O_IN: printf(cmd->len & F_NOT ? " out" : " in"); break; case O_DIVERTED: switch (cmd->arg1) { case 3: printf(" diverted"); break; case 1: printf(" diverted-loopback"); break; case 2: printf(" diverted-output"); break; default: printf(" diverted-?<%u>", cmd->arg1); break; } break; case O_LAYER2: printf(" layer2"); break; case O_XMIT: case O_RECV: case O_VIA: { char const *s; ipfw_insn_if *cmdif = (ipfw_insn_if *)cmd; if (cmd->opcode == O_XMIT) s = "xmit"; else if (cmd->opcode == O_RECV) s = "recv"; else /* if (cmd->opcode == O_VIA) */ s = "via"; if (cmdif->name[0] == '\0') printf(" %s %s", s, inet_ntoa(cmdif->p.ip)); else printf(" %s %s", s, cmdif->name); break; } case O_IPID: if (F_LEN(cmd) == 1) printf(" ipid %u", cmd->arg1 ); else print_newports((ipfw_insn_u16 *)cmd, 0, O_IPID); break; case O_IPTTL: if (F_LEN(cmd) == 1) printf(" ipttl %u", cmd->arg1 ); else print_newports((ipfw_insn_u16 *)cmd, 0, O_IPTTL); break; case O_IPVER: printf(" ipver %u", cmd->arg1 ); break; case O_IPPRECEDENCE: printf(" ipprecedence %u", (cmd->arg1) >> 5 ); break; case O_IPLEN: if (F_LEN(cmd) == 1) printf(" iplen %u", cmd->arg1 ); else print_newports((ipfw_insn_u16 *)cmd, 0, O_IPLEN); break; case O_IPOPT: print_flags("ipoptions", cmd, f_ipopts); break; case O_IPTOS: print_flags("iptos", cmd, f_iptos); break; case O_ICMPTYPE: print_icmptypes((ipfw_insn_u32 *)cmd); break; case O_ESTAB: printf(" established"); break; case O_TCPDATALEN: if (F_LEN(cmd) == 1) printf(" tcpdatalen %u", cmd->arg1 ); else print_newports((ipfw_insn_u16 *)cmd, 0, O_TCPDATALEN); break; case O_TCPFLAGS: print_flags("tcpflags", cmd, f_tcpflags); break; case O_TCPOPTS: print_flags("tcpoptions", cmd, f_tcpopts); break; case O_TCPWIN: printf(" tcpwin %d", ntohs(cmd->arg1)); break; case O_TCPACK: printf(" tcpack %d", ntohl(cmd32->d[0])); break; case O_TCPSEQ: printf(" tcpseq %d", ntohl(cmd32->d[0])); break; case O_UID: { struct passwd *pwd = getpwuid(cmd32->d[0]); if (pwd) printf(" uid %s", pwd->pw_name); else printf(" uid %u", cmd32->d[0]); } break; case O_GID: { struct group *grp = getgrgid(cmd32->d[0]); if (grp) printf(" gid %s", grp->gr_name); else printf(" gid %u", cmd32->d[0]); } break; case O_JAIL: printf(" jail %d", cmd32->d[0]); break; case O_VERREVPATH: printf(" verrevpath"); break; case O_VERSRCREACH: printf(" versrcreach"); break; case O_ANTISPOOF: printf(" antispoof"); break; case O_IPSEC: printf(" ipsec"); break; case O_NOP: comment = (char *)(cmd + 1); break; case O_KEEP_STATE: printf(" keep-state"); break; case O_LIMIT: { struct _s_x *p = limit_masks; ipfw_insn_limit *c = (ipfw_insn_limit *)cmd; uint8_t x = c->limit_mask; char const *comma = " "; printf(" limit"); for (; p->x != 0 ; p++) if ((x & p->x) == p->x) { x &= ~p->x; printf("%s%s", comma, p->s); comma = ","; } PRINT_UINT_ARG(" ", c->conn_limit); break; } case O_IP6: printf(" ip6"); break; case O_IP4: printf(" ip4"); break; case O_ICMP6TYPE: print_icmp6types((ipfw_insn_u32 *)cmd); break; case O_EXT_HDR: print_ext6hdr( (ipfw_insn *) cmd ); break; case O_TAGGED: if (F_LEN(cmd) == 1) PRINT_UINT_ARG(" tagged ", cmd->arg1); else print_newports((ipfw_insn_u16 *)cmd, 0, O_TAGGED); break; default: printf(" [opcode %d len %d]", cmd->opcode, cmd->len); } } if (cmd->len & F_OR) { printf(" or"); or_block = 1; } else if (or_block) { printf(" }"); or_block = 0; } } show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP | HAVE_IP, 0); if (comment) printf(" // %s", comment); printf("\n"); } static void show_dyn_ipfw(ipfw_dyn_rule *d, int pcwidth, int bcwidth) { struct protoent *pe; struct in_addr a; uint16_t rulenum; char buf[INET6_ADDRSTRLEN]; if (!co.do_expired) { if (!d->expire && !(d->dyn_type == O_LIMIT_PARENT)) return; } bcopy(&d->rule, &rulenum, sizeof(rulenum)); printf("%05d", rulenum); if (pcwidth > 0 || bcwidth > 0) { printf(" "); pr_u64(&d->pcnt, pcwidth); pr_u64(&d->bcnt, bcwidth); printf("(%ds)", d->expire); } switch (d->dyn_type) { case O_LIMIT_PARENT: printf(" PARENT %d", d->count); break; case O_LIMIT: printf(" LIMIT"); break; case O_KEEP_STATE: /* bidir, no mask */ printf(" STATE"); break; } if ((pe = getprotobynumber(d->id.proto)) != NULL) printf(" %s", pe->p_name); else printf(" proto %u", d->id.proto); if (d->id.addr_type == 4) { a.s_addr = htonl(d->id.src_ip); printf(" %s %d", inet_ntoa(a), d->id.src_port); a.s_addr = htonl(d->id.dst_ip); printf(" <-> %s %d", inet_ntoa(a), d->id.dst_port); } else if (d->id.addr_type == 6) { printf(" %s %d", inet_ntop(AF_INET6, &d->id.src_ip6, buf, sizeof(buf)), d->id.src_port); printf(" <-> %s %d", inet_ntop(AF_INET6, &d->id.dst_ip6, buf, sizeof(buf)), d->id.dst_port); } else printf(" UNKNOWN <-> UNKNOWN\n"); printf("\n"); } /* * This one handles all set-related commands * ipfw set { show | enable | disable } * ipfw set swap X Y * ipfw set move X to Y * ipfw set move rule X to Y */ void ipfw_sets_handler(char *av[]) { uint32_t set_disable, masks[2]; int i, nbytes; uint16_t rulenum; uint8_t cmd, new_set; av++; if (av[0] == NULL) errx(EX_USAGE, "set needs command"); if (_substrcmp(*av, "show") == 0) { void *data = NULL; char const *msg; int nalloc; nalloc = nbytes = sizeof(struct ip_fw); while (nbytes >= nalloc) { if (data) free(data); nalloc = nalloc * 2 + 200; nbytes = nalloc; data = safe_calloc(1, nbytes); if (do_cmd(IP_FW_GET, data, (uintptr_t)&nbytes) < 0) err(EX_OSERR, "getsockopt(IP_FW_GET)"); } bcopy(&((struct ip_fw *)data)->next_rule, &set_disable, sizeof(set_disable)); for (i = 0, msg = "disable" ; i < RESVD_SET; i++) if ((set_disable & (1< RESVD_SET) errx(EX_DATAERR, "invalid set number %s\n", av[0]); if (!isdigit(*(av[1])) || new_set > RESVD_SET) errx(EX_DATAERR, "invalid set number %s\n", av[1]); masks[0] = (4 << 24) | (new_set << 16) | (rulenum); i = do_cmd(IP_FW_DEL, masks, sizeof(uint32_t)); } else if (_substrcmp(*av, "move") == 0) { av++; if (av[0] && _substrcmp(*av, "rule") == 0) { cmd = 2; av++; } else cmd = 3; if (av[0] == NULL || av[1] == NULL || av[2] == NULL || av[3] != NULL || _substrcmp(av[1], "to") != 0) errx(EX_USAGE, "syntax: set move [rule] X to Y\n"); rulenum = atoi(av[0]); new_set = atoi(av[2]); if (!isdigit(*(av[0])) || (cmd == 3 && rulenum > RESVD_SET) || (cmd == 2 && rulenum == IPFW_DEFAULT_RULE) ) errx(EX_DATAERR, "invalid source number %s\n", av[0]); if (!isdigit(*(av[2])) || new_set > RESVD_SET) errx(EX_DATAERR, "invalid dest. set %s\n", av[1]); masks[0] = (cmd << 24) | (new_set << 16) | (rulenum); i = do_cmd(IP_FW_DEL, masks, sizeof(uint32_t)); } else if (_substrcmp(*av, "disable") == 0 || _substrcmp(*av, "enable") == 0 ) { int which = _substrcmp(*av, "enable") == 0 ? 1 : 0; av++; masks[0] = masks[1] = 0; while (av[0]) { if (isdigit(**av)) { i = atoi(*av); if (i < 0 || i > RESVD_SET) errx(EX_DATAERR, "invalid set number %d\n", i); masks[which] |= (1<= nalloc) { nalloc = nalloc * 2 + 200; nbytes = nalloc; data = safe_realloc(data, nbytes); if (do_cmd(ocmd, data, (uintptr_t)&nbytes) < 0) err(EX_OSERR, "getsockopt(IP_%s_GET)", co.do_pipe ? "DUMMYNET" : "FW"); } /* * Count static rules. They have variable size so we * need to scan the list to count them. */ for (nstat = 1, r = data, lim = (char *)data + nbytes; r->rulenum < IPFW_DEFAULT_RULE && (char *)r < lim; ++nstat, r = NEXT(r) ) ; /* nothing */ /* * Count dynamic rules. This is easier as they have * fixed size. */ r = NEXT(r); dynrules = (ipfw_dyn_rule *)r ; n = (char *)r - (char *)data; ndyn = (nbytes - n) / sizeof *dynrules; /* if showing stats, figure out column widths ahead of time */ bcwidth = pcwidth = 0; if (show_counters) { for (n = 0, r = data; n < nstat; n++, r = NEXT(r)) { /* skip rules from another set */ if (co.use_set && r->set != co.use_set - 1) continue; /* packet counter */ width = pr_u64(&r->pcnt, 0); if (width > pcwidth) pcwidth = width; /* byte counter */ width = pr_u64(&r->bcnt, 0); if (width > bcwidth) bcwidth = width; } } if (co.do_dynamic && ndyn) { for (n = 0, d = dynrules; n < ndyn; n++, d++) { if (co.use_set) { /* skip rules from another set */ bcopy((char *)&d->rule + sizeof(uint16_t), &set, sizeof(uint8_t)); if (set != co.use_set - 1) continue; } width = pr_u64(&d->pcnt, 0); if (width > pcwidth) pcwidth = width; width = pr_u64(&d->bcnt, 0); if (width > bcwidth) bcwidth = width; } } /* if no rule numbers were specified, list all rules */ if (ac == 0) { for (n = 0, r = data; n < nstat; n++, r = NEXT(r)) { if (co.use_set && r->set != co.use_set - 1) continue; show_ipfw(r, pcwidth, bcwidth); } if (co.do_dynamic && ndyn) { printf("## Dynamic rules (%d):\n", ndyn); for (n = 0, d = dynrules; n < ndyn; n++, d++) { if (co.use_set) { bcopy((char *)&d->rule + sizeof(uint16_t), &set, sizeof(uint8_t)); if (set != co.use_set - 1) continue; } show_dyn_ipfw(d, pcwidth, bcwidth); } } goto done; } /* display specific rules requested on command line */ for (lac = ac, lav = av; lac != 0; lac--) { /* convert command line rule # */ last = rnum = strtoul(*lav++, &endptr, 10); if (*endptr == '-') last = strtoul(endptr+1, &endptr, 10); if (*endptr) { exitval = EX_USAGE; warnx("invalid rule number: %s", *(lav - 1)); continue; } for (n = seen = 0, r = data; n < nstat; n++, r = NEXT(r) ) { if (r->rulenum > last) break; if (co.use_set && r->set != co.use_set - 1) continue; if (r->rulenum >= rnum && r->rulenum <= last) { show_ipfw(r, pcwidth, bcwidth); seen = 1; } } if (!seen) { /* give precedence to other error(s) */ if (exitval == EX_OK) exitval = EX_UNAVAILABLE; warnx("rule %lu does not exist", rnum); } } if (co.do_dynamic && ndyn) { printf("## Dynamic rules:\n"); for (lac = ac, lav = av; lac != 0; lac--) { last = rnum = strtoul(*lav++, &endptr, 10); if (*endptr == '-') last = strtoul(endptr+1, &endptr, 10); if (*endptr) /* already warned */ continue; for (n = 0, d = dynrules; n < ndyn; n++, d++) { uint16_t rulenum; bcopy(&d->rule, &rulenum, sizeof(rulenum)); if (rulenum > rnum) break; if (co.use_set) { bcopy((char *)&d->rule + sizeof(uint16_t), &set, sizeof(uint8_t)); if (set != co.use_set - 1) continue; } if (r->rulenum >= rnum && r->rulenum <= last) show_dyn_ipfw(d, pcwidth, bcwidth); } } } ac = 0; done: free(data); if (exitval != EX_OK) exit(exitval); #undef NEXT } static int lookup_host (char *host, struct in_addr *ipaddr) { struct hostent *he; if (!inet_aton(host, ipaddr)) { if ((he = gethostbyname(host)) == NULL) return(-1); *ipaddr = *(struct in_addr *)he->h_addr_list[0]; } return(0); } /* * fills the addr and mask fields in the instruction as appropriate from av. * Update length as appropriate. * The following formats are allowed: * me returns O_IP_*_ME * 1.2.3.4 single IP address * 1.2.3.4:5.6.7.8 address:mask * 1.2.3.4/24 address/mask * 1.2.3.4/26{1,6,5,4,23} set of addresses in a subnet * We can have multiple comma-separated address/mask entries. */ static void fill_ip(ipfw_insn_ip *cmd, char *av) { int len = 0; uint32_t *d = ((ipfw_insn_u32 *)cmd)->d; cmd->o.len &= ~F_LEN_MASK; /* zero len */ if (_substrcmp(av, "any") == 0) return; if (_substrcmp(av, "me") == 0) { cmd->o.len |= F_INSN_SIZE(ipfw_insn); return; } if (strncmp(av, "table(", 6) == 0) { char *p = strchr(av + 6, ','); if (p) *p++ = '\0'; cmd->o.opcode = O_IP_DST_LOOKUP; cmd->o.arg1 = strtoul(av + 6, NULL, 0); if (p) { cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); d[0] = strtoul(p, NULL, 0); } else cmd->o.len |= F_INSN_SIZE(ipfw_insn); return; } while (av) { /* * After the address we can have '/' or ':' indicating a mask, * ',' indicating another address follows, '{' indicating a * set of addresses of unspecified size. */ char *t = NULL, *p = strpbrk(av, "/:,{"); int masklen; char md, nd = '\0'; if (p) { md = *p; *p++ = '\0'; if ((t = strpbrk(p, ",{")) != NULL) { nd = *t; *t = '\0'; } } else md = '\0'; if (lookup_host(av, (struct in_addr *)&d[0]) != 0) errx(EX_NOHOST, "hostname ``%s'' unknown", av); switch (md) { case ':': if (!inet_aton(p, (struct in_addr *)&d[1])) errx(EX_DATAERR, "bad netmask ``%s''", p); break; case '/': masklen = atoi(p); if (masklen == 0) d[1] = htonl(0); /* mask */ else if (masklen > 32) errx(EX_DATAERR, "bad width ``%s''", p); else d[1] = htonl(~0 << (32 - masklen)); break; case '{': /* no mask, assume /24 and put back the '{' */ d[1] = htonl(~0 << (32 - 24)); *(--p) = md; break; case ',': /* single address plus continuation */ *(--p) = md; /* FALLTHROUGH */ case 0: /* initialization value */ default: d[1] = htonl(~0); /* force /32 */ break; } d[0] &= d[1]; /* mask base address with mask */ if (t) *t = nd; /* find next separator */ if (p) p = strpbrk(p, ",{"); if (p && *p == '{') { /* * We have a set of addresses. They are stored as follows: * arg1 is the set size (powers of 2, 2..256) * addr is the base address IN HOST FORMAT * mask.. is an array of arg1 bits (rounded up to * the next multiple of 32) with bits set * for each host in the map. */ uint32_t *map = (uint32_t *)&cmd->mask; int low, high; int i = contigmask((uint8_t *)&(d[1]), 32); if (len > 0) errx(EX_DATAERR, "address set cannot be in a list"); if (i < 24 || i > 31) errx(EX_DATAERR, "invalid set with mask %d\n", i); cmd->o.arg1 = 1<<(32-i); /* map length */ d[0] = ntohl(d[0]); /* base addr in host format */ cmd->o.opcode = O_IP_DST_SET; /* default */ cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + (cmd->o.arg1+31)/32; for (i = 0; i < (cmd->o.arg1+31)/32 ; i++) map[i] = 0; /* clear map */ av = p + 1; low = d[0] & 0xff; high = low + cmd->o.arg1 - 1; /* * Here, i stores the previous value when we specify a range * of addresses within a mask, e.g. 45-63. i = -1 means we * have no previous value. */ i = -1; /* previous value in a range */ while (isdigit(*av)) { char *s; int a = strtol(av, &s, 0); if (s == av) { /* no parameter */ if (*av != '}') errx(EX_DATAERR, "set not closed\n"); if (i != -1) errx(EX_DATAERR, "incomplete range %d-", i); break; } if (a < low || a > high) errx(EX_DATAERR, "addr %d out of range [%d-%d]\n", a, low, high); a -= low; if (i == -1) /* no previous in range */ i = a; else { /* check that range is valid */ if (i > a) errx(EX_DATAERR, "invalid range %d-%d", i+low, a+low); if (*s == '-') errx(EX_DATAERR, "double '-' in range"); } for (; i <= a; i++) map[i/32] |= 1<<(i & 31); i = -1; if (*s == '-') i = a; else if (*s == '}') break; av = s+1; } return; } av = p; if (av) /* then *av must be a ',' */ av++; /* Check this entry */ if (d[1] == 0) { /* "any", specified as x.x.x.x/0 */ /* * 'any' turns the entire list into a NOP. * 'not any' never matches, so it is removed from the * list unless it is the only item, in which case we * report an error. */ if (cmd->o.len & F_NOT) { /* "not any" never matches */ if (av == NULL && len == 0) /* only this entry */ errx(EX_DATAERR, "not any never matches"); } /* else do nothing and skip this entry */ return; } /* A single IP can be stored in an optimized format */ if (d[1] == (uint32_t)~0 && av == NULL && len == 0) { cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); return; } len += 2; /* two words... */ d += 2; } /* end while */ if (len + 1 > F_LEN_MASK) errx(EX_DATAERR, "address list too long"); cmd->o.len |= len+1; } /* n2mask sets n bits of the mask */ void n2mask(struct in6_addr *mask, int n) { static int minimask[9] = { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff }; u_char *p; memset(mask, 0, sizeof(struct in6_addr)); p = (u_char *) mask; for (; n > 0; p++, n -= 8) { if (n >= 8) *p = 0xff; else *p = minimask[n]; } return; } /* * helper function to process a set of flags and set bits in the * appropriate masks. */ static void fill_flags(ipfw_insn *cmd, enum ipfw_opcodes opcode, struct _s_x *flags, char *p) { uint8_t set=0, clear=0; while (p && *p) { char *q; /* points to the separator */ int val; uint8_t *which; /* mask we are working on */ if (*p == '!') { p++; which = &clear; } else which = &set; q = strchr(p, ','); if (q) *q++ = '\0'; val = match_token(flags, p); if (val <= 0) errx(EX_DATAERR, "invalid flag %s", p); *which |= (uint8_t)val; p = q; } cmd->opcode = opcode; cmd->len = (cmd->len & (F_NOT | F_OR)) | 1; cmd->arg1 = (set & 0xff) | ( (clear & 0xff) << 8); } void ipfw_delete(char *av[]) { uint32_t rulenum; int i; int exitval = EX_OK; int do_set = 0; av++; NEED1("missing rule specification"); if ( *av && _substrcmp(*av, "set") == 0) { /* Do not allow using the following syntax: * ipfw set N delete set M */ if (co.use_set) errx(EX_DATAERR, "invalid syntax"); do_set = 1; /* delete set */ av++; } /* Rule number */ while (*av && isdigit(**av)) { i = atoi(*av); av++; if (co.do_nat) { exitval = do_cmd(IP_FW_NAT_DEL, &i, sizeof i); if (exitval) { exitval = EX_UNAVAILABLE; warn("rule %u not available", i); } } else if (co.do_pipe) { exitval = ipfw_delete_pipe(co.do_pipe, i); } else { if (co.use_set) rulenum = (i & 0xffff) | (5 << 24) | ((co.use_set - 1) << 16); else rulenum = (i & 0xffff) | (do_set << 24); i = do_cmd(IP_FW_DEL, &rulenum, sizeof rulenum); if (i) { exitval = EX_UNAVAILABLE; warn("rule %u: setsockopt(IP_FW_DEL)", rulenum); } } } if (exitval != EX_OK) exit(exitval); } /* * fill the interface structure. We do not check the name as we can * create interfaces dynamically, so checking them at insert time * makes relatively little sense. * Interface names containing '*', '?', or '[' are assumed to be shell * patterns which match interfaces. */ static void fill_iface(ipfw_insn_if *cmd, char *arg) { cmd->name[0] = '\0'; cmd->o.len |= F_INSN_SIZE(ipfw_insn_if); /* Parse the interface or address */ if (strcmp(arg, "any") == 0) cmd->o.len = 0; /* effectively ignore this command */ else if (!isdigit(*arg)) { strlcpy(cmd->name, arg, sizeof(cmd->name)); cmd->p.glob = strpbrk(arg, "*?[") != NULL ? 1 : 0; } else if (!inet_aton(arg, &cmd->p.ip)) errx(EX_DATAERR, "bad ip address ``%s''", arg); } static void get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask) { int i; size_t l; char *ap, *ptr, *optr; struct ether_addr *mac; const char *macset = "0123456789abcdefABCDEF:"; if (strcmp(p, "any") == 0) { for (i = 0; i < ETHER_ADDR_LEN; i++) addr[i] = mask[i] = 0; return; } optr = ptr = strdup(p); if ((ap = strsep(&ptr, "&/")) != NULL && *ap != 0) { l = strlen(ap); if (strspn(ap, macset) != l || (mac = ether_aton(ap)) == NULL) errx(EX_DATAERR, "Incorrect MAC address"); bcopy(mac, addr, ETHER_ADDR_LEN); } else errx(EX_DATAERR, "Incorrect MAC address"); if (ptr != NULL) { /* we have mask? */ if (p[ptr - optr - 1] == '/') { /* mask len */ long ml = strtol(ptr, &ap, 10); if (*ap != 0 || ml > ETHER_ADDR_LEN * 8 || ml < 0) errx(EX_DATAERR, "Incorrect mask length"); for (i = 0; ml > 0 && i < ETHER_ADDR_LEN; ml -= 8, i++) mask[i] = (ml >= 8) ? 0xff: (~0) << (8 - ml); } else { /* mask */ l = strlen(ptr); if (strspn(ptr, macset) != l || (mac = ether_aton(ptr)) == NULL) errx(EX_DATAERR, "Incorrect mask"); bcopy(mac, mask, ETHER_ADDR_LEN); } } else { /* default mask: ff:ff:ff:ff:ff:ff */ for (i = 0; i < ETHER_ADDR_LEN; i++) mask[i] = 0xff; } for (i = 0; i < ETHER_ADDR_LEN; i++) addr[i] &= mask[i]; free(optr); } /* * helper function, updates the pointer to cmd with the length * of the current command, and also cleans up the first word of * the new command in case it has been clobbered before. */ static ipfw_insn * next_cmd(ipfw_insn *cmd) { cmd += F_LEN(cmd); bzero(cmd, sizeof(*cmd)); return cmd; } /* * Takes arguments and copies them into a comment */ static void fill_comment(ipfw_insn *cmd, char **av) { int i, l; char *p = (char *)(cmd + 1); cmd->opcode = O_NOP; cmd->len = (cmd->len & (F_NOT | F_OR)); /* Compute length of comment string. */ for (i = 0, l = 0; av[i] != NULL; i++) l += strlen(av[i]) + 1; if (l == 0) return; if (l > 84) errx(EX_DATAERR, "comment too long (max 80 chars)"); l = 1 + (l+3)/4; cmd->len = (cmd->len & (F_NOT | F_OR)) | l; for (i = 0; av[i] != NULL; i++) { strcpy(p, av[i]); p += strlen(av[i]); *p++ = ' '; } *(--p) = '\0'; } /* * A function to fill simple commands of size 1. * Existing flags are preserved. */ static void fill_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, int flags, uint16_t arg) { cmd->opcode = opcode; cmd->len = ((cmd->len | flags) & (F_NOT | F_OR)) | 1; cmd->arg1 = arg; } /* * Fetch and add the MAC address and type, with masks. This generates one or * two microinstructions, and returns the pointer to the last one. */ static ipfw_insn * add_mac(ipfw_insn *cmd, char *av[]) { ipfw_insn_mac *mac; if ( ( av[0] == NULL ) || ( av[1] == NULL ) ) errx(EX_DATAERR, "MAC dst src"); cmd->opcode = O_MACADDR2; cmd->len = (cmd->len & (F_NOT | F_OR)) | F_INSN_SIZE(ipfw_insn_mac); mac = (ipfw_insn_mac *)cmd; get_mac_addr_mask(av[0], mac->addr, mac->mask); /* dst */ get_mac_addr_mask(av[1], &(mac->addr[ETHER_ADDR_LEN]), &(mac->mask[ETHER_ADDR_LEN])); /* src */ return cmd; } static ipfw_insn * add_mactype(ipfw_insn *cmd, char *av) { if (!av) errx(EX_DATAERR, "missing MAC type"); if (strcmp(av, "any") != 0) { /* we have a non-null type */ fill_newports((ipfw_insn_u16 *)cmd, av, IPPROTO_ETHERTYPE); cmd->opcode = O_MAC_TYPE; return cmd; } else return NULL; } static ipfw_insn * add_proto0(ipfw_insn *cmd, char *av, u_char *protop) { struct protoent *pe; char *ep; int proto; proto = strtol(av, &ep, 10); if (*ep != '\0' || proto <= 0) { if ((pe = getprotobyname(av)) == NULL) return NULL; proto = pe->p_proto; } fill_cmd(cmd, O_PROTO, 0, proto); *protop = proto; return cmd; } static ipfw_insn * add_proto(ipfw_insn *cmd, char *av, u_char *protop) { u_char proto = IPPROTO_IP; if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) ; /* do not set O_IP4 nor O_IP6 */ else if (strcmp(av, "ip4") == 0) /* explicit "just IPv4" rule */ fill_cmd(cmd, O_IP4, 0, 0); else if (strcmp(av, "ip6") == 0) { /* explicit "just IPv6" rule */ proto = IPPROTO_IPV6; fill_cmd(cmd, O_IP6, 0, 0); } else return add_proto0(cmd, av, protop); *protop = proto; return cmd; } static ipfw_insn * add_proto_compat(ipfw_insn *cmd, char *av, u_char *protop) { u_char proto = IPPROTO_IP; if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) ; /* do not set O_IP4 nor O_IP6 */ else if (strcmp(av, "ipv4") == 0 || strcmp(av, "ip4") == 0) /* explicit "just IPv4" rule */ fill_cmd(cmd, O_IP4, 0, 0); else if (strcmp(av, "ipv6") == 0 || strcmp(av, "ip6") == 0) { /* explicit "just IPv6" rule */ proto = IPPROTO_IPV6; fill_cmd(cmd, O_IP6, 0, 0); } else return add_proto0(cmd, av, protop); *protop = proto; return cmd; } static ipfw_insn * add_srcip(ipfw_insn *cmd, char *av) { fill_ip((ipfw_insn_ip *)cmd, av); if (cmd->opcode == O_IP_DST_SET) /* set */ cmd->opcode = O_IP_SRC_SET; else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ cmd->opcode = O_IP_SRC_LOOKUP; else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) /* me */ cmd->opcode = O_IP_SRC_ME; else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) /* one IP */ cmd->opcode = O_IP_SRC; else /* addr/mask */ cmd->opcode = O_IP_SRC_MASK; return cmd; } static ipfw_insn * add_dstip(ipfw_insn *cmd, char *av) { fill_ip((ipfw_insn_ip *)cmd, av); if (cmd->opcode == O_IP_DST_SET) /* set */ ; else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ ; else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) /* me */ cmd->opcode = O_IP_DST_ME; else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) /* one IP */ cmd->opcode = O_IP_DST; else /* addr/mask */ cmd->opcode = O_IP_DST_MASK; return cmd; } static ipfw_insn * add_ports(ipfw_insn *cmd, char *av, u_char proto, int opcode) { /* XXX "any" is trapped before. Perhaps "to" */ if (_substrcmp(av, "any") == 0) { return NULL; } else if (fill_newports((ipfw_insn_u16 *)cmd, av, proto)) { /* XXX todo: check that we have a protocol with ports */ cmd->opcode = opcode; return cmd; } return NULL; } static ipfw_insn * add_src(ipfw_insn *cmd, char *av, u_char proto) { struct in6_addr a; char *host, *ch; ipfw_insn *ret = NULL; if ((host = strdup(av)) == NULL) return NULL; if ((ch = strrchr(host, '/')) != NULL) *ch = '\0'; if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || inet_pton(AF_INET6, host, &a) == 1) ret = add_srcip6(cmd, av); /* XXX: should check for IPv4, not !IPv6 */ if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || inet_pton(AF_INET6, host, &a) != 1)) ret = add_srcip(cmd, av); if (ret == NULL && strcmp(av, "any") != 0) ret = cmd; free(host); return ret; } static ipfw_insn * add_dst(ipfw_insn *cmd, char *av, u_char proto) { struct in6_addr a; char *host, *ch; ipfw_insn *ret = NULL; if ((host = strdup(av)) == NULL) return NULL; if ((ch = strrchr(host, '/')) != NULL) *ch = '\0'; if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || inet_pton(AF_INET6, host, &a) == 1) ret = add_dstip6(cmd, av); /* XXX: should check for IPv4, not !IPv6 */ if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || inet_pton(AF_INET6, host, &a) != 1)) ret = add_dstip(cmd, av); if (ret == NULL && strcmp(av, "any") != 0) ret = cmd; free(host); return ret; } /* * Parse arguments and assemble the microinstructions which make up a rule. * Rules are added into the 'rulebuf' and then copied in the correct order * into the actual rule. * * The syntax for a rule starts with the action, followed by * optional action parameters, and the various match patterns. * In the assembled microcode, the first opcode must be an O_PROBE_STATE * (generated if the rule includes a keep-state option), then the * various match patterns, log/altq actions, and the actual action. * */ void ipfw_add(char *av[]) { /* * rules are added into the 'rulebuf' and then copied in * the correct order into the actual rule. * Some things that need to go out of order (prob, action etc.) * go into actbuf[]. */ static uint32_t rulebuf[255], actbuf[255], cmdbuf[255]; ipfw_insn *src, *dst, *cmd, *action, *prev=NULL; ipfw_insn *first_cmd; /* first match pattern */ struct ip_fw *rule; /* * various flags used to record that we entered some fields. */ ipfw_insn *have_state = NULL; /* check-state or keep-state */ ipfw_insn *have_log = NULL, *have_altq = NULL, *have_tag = NULL; size_t len; int i; int open_par = 0; /* open parenthesis ( */ /* proto is here because it is used to fetch ports */ u_char proto = IPPROTO_IP; /* default protocol */ double match_prob = 1; /* match probability, default is always match */ bzero(actbuf, sizeof(actbuf)); /* actions go here */ bzero(cmdbuf, sizeof(cmdbuf)); bzero(rulebuf, sizeof(rulebuf)); rule = (struct ip_fw *)rulebuf; cmd = (ipfw_insn *)cmdbuf; action = (ipfw_insn *)actbuf; av++; /* [rule N] -- Rule number optional */ if (av[0] && isdigit(**av)) { rule->rulenum = atoi(*av); av++; } /* [set N] -- set number (0..RESVD_SET), optional */ if (av[0] && av[1] && _substrcmp(*av, "set") == 0) { int set = strtoul(av[1], NULL, 10); if (set < 0 || set > RESVD_SET) errx(EX_DATAERR, "illegal set %s", av[1]); rule->set = set; av += 2; } /* [prob D] -- match probability, optional */ if (av[0] && av[1] && _substrcmp(*av, "prob") == 0) { match_prob = strtod(av[1], NULL); if (match_prob <= 0 || match_prob > 1) errx(EX_DATAERR, "illegal match prob. %s", av[1]); av += 2; } /* action -- mandatory */ NEED1("missing action"); i = match_token(rule_actions, *av); av++; action->len = 1; /* default */ switch(i) { case TOK_CHECKSTATE: have_state = action; action->opcode = O_CHECK_STATE; break; case TOK_ACCEPT: action->opcode = O_ACCEPT; break; case TOK_DENY: action->opcode = O_DENY; action->arg1 = 0; break; case TOK_REJECT: action->opcode = O_REJECT; action->arg1 = ICMP_UNREACH_HOST; break; case TOK_RESET: action->opcode = O_REJECT; action->arg1 = ICMP_REJECT_RST; break; case TOK_RESET6: action->opcode = O_UNREACH6; action->arg1 = ICMP6_UNREACH_RST; break; case TOK_UNREACH: action->opcode = O_REJECT; NEED1("missing reject code"); fill_reject_code(&action->arg1, *av); av++; break; case TOK_UNREACH6: action->opcode = O_UNREACH6; NEED1("missing unreach code"); fill_unreach6_code(&action->arg1, *av); av++; break; case TOK_COUNT: action->opcode = O_COUNT; break; case TOK_NAT: action->opcode = O_NAT; action->len = F_INSN_SIZE(ipfw_insn_nat); goto chkarg; case TOK_QUEUE: action->opcode = O_QUEUE; goto chkarg; case TOK_PIPE: action->opcode = O_PIPE; goto chkarg; case TOK_SKIPTO: action->opcode = O_SKIPTO; goto chkarg; case TOK_NETGRAPH: action->opcode = O_NETGRAPH; goto chkarg; case TOK_NGTEE: action->opcode = O_NGTEE; goto chkarg; case TOK_DIVERT: action->opcode = O_DIVERT; goto chkarg; case TOK_TEE: action->opcode = O_TEE; goto chkarg; case TOK_CALL: action->opcode = O_CALLRETURN; chkarg: if (!av[0]) errx(EX_USAGE, "missing argument for %s", *(av - 1)); if (isdigit(**av)) { action->arg1 = strtoul(*av, NULL, 10); if (action->arg1 <= 0 || action->arg1 >= IP_FW_TABLEARG) errx(EX_DATAERR, "illegal argument for %s", *(av - 1)); } else if (_substrcmp(*av, "tablearg") == 0) { action->arg1 = IP_FW_TABLEARG; } else if (i == TOK_DIVERT || i == TOK_TEE) { struct servent *s; setservent(1); s = getservbyname(av[0], "divert"); if (s != NULL) action->arg1 = ntohs(s->s_port); else errx(EX_DATAERR, "illegal divert/tee port"); } else errx(EX_DATAERR, "illegal argument for %s", *(av - 1)); av++; break; case TOK_FORWARD: { ipfw_insn_sa *p = (ipfw_insn_sa *)action; char *s, *end; NEED1("missing forward address[:port]"); action->opcode = O_FORWARD_IP; action->len = F_INSN_SIZE(ipfw_insn_sa); /* * In the kernel we assume AF_INET and use only * sin_port and sin_addr. Remember to set sin_len as * the routing code seems to use it too. */ p->sa.sin_family = AF_INET; p->sa.sin_len = sizeof(struct sockaddr_in); p->sa.sin_port = 0; /* * locate the address-port separator (':' or ',') */ s = strchr(*av, ':'); if (s == NULL) s = strchr(*av, ','); if (s != NULL) { *(s++) = '\0'; i = strtoport(s, &end, 0 /* base */, 0 /* proto */); if (s == end) errx(EX_DATAERR, "illegal forwarding port ``%s''", s); p->sa.sin_port = (u_short)i; } if (_substrcmp(*av, "tablearg") == 0) p->sa.sin_addr.s_addr = INADDR_ANY; else lookup_host(*av, &(p->sa.sin_addr)); av++; break; } case TOK_COMMENT: /* pretend it is a 'count' rule followed by the comment */ action->opcode = O_COUNT; av--; /* go back... */ break; case TOK_SETFIB: { int numfibs; size_t intsize = sizeof(int); action->opcode = O_SETFIB; NEED1("missing fib number"); action->arg1 = strtoul(*av, NULL, 10); if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1) errx(EX_DATAERR, "fibs not suported.\n"); if (action->arg1 >= numfibs) /* Temporary */ errx(EX_DATAERR, "fib too large.\n"); av++; break; } case TOK_REASS: action->opcode = O_REASS; break; case TOK_RETURN: fill_cmd(action, O_CALLRETURN, F_NOT, 0); break; default: errx(EX_DATAERR, "invalid action %s\n", av[-1]); } action = next_cmd(action); /* * [altq queuename] -- altq tag, optional * [log [logamount N]] -- log, optional * * If they exist, it go first in the cmdbuf, but then it is * skipped in the copy section to the end of the buffer. */ while (av[0] != NULL && (i = match_token(rule_action_params, *av)) != -1) { av++; switch (i) { case TOK_LOG: { ipfw_insn_log *c = (ipfw_insn_log *)cmd; int l; if (have_log) errx(EX_DATAERR, "log cannot be specified more than once"); have_log = (ipfw_insn *)c; cmd->len = F_INSN_SIZE(ipfw_insn_log); cmd->opcode = O_LOG; if (av[0] && _substrcmp(*av, "logamount") == 0) { av++; NEED1("logamount requires argument"); l = atoi(*av); if (l < 0) errx(EX_DATAERR, "logamount must be positive"); c->max_log = l; av++; } else { len = sizeof(c->max_log); if (sysctlbyname("net.inet.ip.fw.verbose_limit", &c->max_log, &len, NULL, 0) == -1) errx(1, "sysctlbyname(\"%s\")", "net.inet.ip.fw.verbose_limit"); } } break; #ifndef NO_ALTQ case TOK_ALTQ: { ipfw_insn_altq *a = (ipfw_insn_altq *)cmd; NEED1("missing altq queue name"); if (have_altq) errx(EX_DATAERR, "altq cannot be specified more than once"); have_altq = (ipfw_insn *)a; cmd->len = F_INSN_SIZE(ipfw_insn_altq); cmd->opcode = O_ALTQ; a->qid = altq_name_to_qid(*av); av++; } break; #endif case TOK_TAG: case TOK_UNTAG: { uint16_t tag; if (have_tag) errx(EX_USAGE, "tag and untag cannot be " "specified more than once"); GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX, i, rule_action_params); have_tag = cmd; fill_cmd(cmd, O_TAG, (i == TOK_TAG) ? 0: F_NOT, tag); av++; break; } default: abort(); } cmd = next_cmd(cmd); } if (have_state) /* must be a check-state, we are done */ goto done; #define OR_START(target) \ if (av[0] && (*av[0] == '(' || *av[0] == '{')) { \ if (open_par) \ errx(EX_USAGE, "nested \"(\" not allowed\n"); \ prev = NULL; \ open_par = 1; \ if ( (av[0])[1] == '\0') { \ av++; \ } else \ (*av)++; \ } \ target: \ #define CLOSE_PAR \ if (open_par) { \ if (av[0] && ( \ strcmp(*av, ")") == 0 || \ strcmp(*av, "}") == 0)) { \ prev = NULL; \ open_par = 0; \ av++; \ } else \ errx(EX_USAGE, "missing \")\"\n"); \ } #define NOT_BLOCK \ if (av[0] && _substrcmp(*av, "not") == 0) { \ if (cmd->len & F_NOT) \ errx(EX_USAGE, "double \"not\" not allowed\n"); \ cmd->len |= F_NOT; \ av++; \ } #define OR_BLOCK(target) \ if (av[0] && _substrcmp(*av, "or") == 0) { \ if (prev == NULL || open_par == 0) \ errx(EX_DATAERR, "invalid OR block"); \ prev->len |= F_OR; \ av++; \ goto target; \ } \ CLOSE_PAR; first_cmd = cmd; #if 0 /* * MAC addresses, optional. * If we have this, we skip the part "proto from src to dst" * and jump straight to the option parsing. */ NOT_BLOCK; NEED1("missing protocol"); if (_substrcmp(*av, "MAC") == 0 || _substrcmp(*av, "mac") == 0) { av++; /* the "MAC" keyword */ add_mac(cmd, av); /* exits in case of errors */ cmd = next_cmd(cmd); av += 2; /* dst-mac and src-mac */ NOT_BLOCK; NEED1("missing mac type"); if (add_mactype(cmd, av[0])) cmd = next_cmd(cmd); av++; /* any or mac-type */ goto read_options; } #endif /* * protocol, mandatory */ OR_START(get_proto); NOT_BLOCK; NEED1("missing protocol"); if (add_proto_compat(cmd, *av, &proto)) { av++; if (F_LEN(cmd) != 0) { prev = cmd; cmd = next_cmd(cmd); } } else if (first_cmd != cmd) { errx(EX_DATAERR, "invalid protocol ``%s''", *av); } else goto read_options; OR_BLOCK(get_proto); /* * "from", mandatory */ if ((av[0] == NULL) || _substrcmp(*av, "from") != 0) errx(EX_USAGE, "missing ``from''"); av++; /* * source IP, mandatory */ OR_START(source_ip); NOT_BLOCK; /* optional "not" */ NEED1("missing source address"); if (add_src(cmd, *av, proto)) { av++; if (F_LEN(cmd) != 0) { /* ! any */ prev = cmd; cmd = next_cmd(cmd); } } else errx(EX_USAGE, "bad source address %s", *av); OR_BLOCK(source_ip); /* * source ports, optional */ NOT_BLOCK; /* optional "not" */ if ( av[0] != NULL ) { if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_SRCPORT)) { av++; if (F_LEN(cmd) != 0) cmd = next_cmd(cmd); } } /* * "to", mandatory */ if ( (av[0] == NULL) || _substrcmp(*av, "to") != 0 ) errx(EX_USAGE, "missing ``to''"); av++; /* * destination, mandatory */ OR_START(dest_ip); NOT_BLOCK; /* optional "not" */ NEED1("missing dst address"); if (add_dst(cmd, *av, proto)) { av++; if (F_LEN(cmd) != 0) { /* ! any */ prev = cmd; cmd = next_cmd(cmd); } } else errx( EX_USAGE, "bad destination address %s", *av); OR_BLOCK(dest_ip); /* * dest. ports, optional */ NOT_BLOCK; /* optional "not" */ if (av[0]) { if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_DSTPORT)) { av++; if (F_LEN(cmd) != 0) cmd = next_cmd(cmd); } } read_options: if (av[0] && first_cmd == cmd) { /* * nothing specified so far, store in the rule to ease * printout later. */ rule->_pad = 1; } prev = NULL; while ( av[0] != NULL ) { char *s; ipfw_insn_u32 *cmd32; /* alias for cmd */ s = *av; cmd32 = (ipfw_insn_u32 *)cmd; if (*s == '!') { /* alternate syntax for NOT */ if (cmd->len & F_NOT) errx(EX_USAGE, "double \"not\" not allowed\n"); cmd->len = F_NOT; s++; } i = match_token(rule_options, s); av++; switch(i) { case TOK_NOT: if (cmd->len & F_NOT) errx(EX_USAGE, "double \"not\" not allowed\n"); cmd->len = F_NOT; break; case TOK_OR: if (open_par == 0 || prev == NULL) errx(EX_USAGE, "invalid \"or\" block\n"); prev->len |= F_OR; break; case TOK_STARTBRACE: if (open_par) errx(EX_USAGE, "+nested \"(\" not allowed\n"); open_par = 1; break; case TOK_ENDBRACE: if (!open_par) errx(EX_USAGE, "+missing \")\"\n"); open_par = 0; prev = NULL; break; case TOK_IN: fill_cmd(cmd, O_IN, 0, 0); break; case TOK_OUT: cmd->len ^= F_NOT; /* toggle F_NOT */ fill_cmd(cmd, O_IN, 0, 0); break; case TOK_DIVERTED: fill_cmd(cmd, O_DIVERTED, 0, 3); break; case TOK_DIVERTEDLOOPBACK: fill_cmd(cmd, O_DIVERTED, 0, 1); break; case TOK_DIVERTEDOUTPUT: fill_cmd(cmd, O_DIVERTED, 0, 2); break; case TOK_FRAG: fill_cmd(cmd, O_FRAG, 0, 0); break; case TOK_LAYER2: fill_cmd(cmd, O_LAYER2, 0, 0); break; case TOK_XMIT: case TOK_RECV: case TOK_VIA: NEED1("recv, xmit, via require interface name" " or address"); fill_iface((ipfw_insn_if *)cmd, av[0]); av++; if (F_LEN(cmd) == 0) /* not a valid address */ break; if (i == TOK_XMIT) cmd->opcode = O_XMIT; else if (i == TOK_RECV) cmd->opcode = O_RECV; else if (i == TOK_VIA) cmd->opcode = O_VIA; break; case TOK_ICMPTYPES: NEED1("icmptypes requires list of types"); fill_icmptypes((ipfw_insn_u32 *)cmd, *av); av++; break; case TOK_ICMP6TYPES: NEED1("icmptypes requires list of types"); fill_icmp6types((ipfw_insn_icmp6 *)cmd, *av); av++; break; case TOK_IPTTL: NEED1("ipttl requires TTL"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_IPTTL)) errx(EX_DATAERR, "invalid ipttl %s", *av); } else fill_cmd(cmd, O_IPTTL, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_IPID: NEED1("ipid requires id"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_IPID)) errx(EX_DATAERR, "invalid ipid %s", *av); } else fill_cmd(cmd, O_IPID, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_IPLEN: NEED1("iplen requires length"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_IPLEN)) errx(EX_DATAERR, "invalid ip len %s", *av); } else fill_cmd(cmd, O_IPLEN, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_IPVER: NEED1("ipver requires version"); fill_cmd(cmd, O_IPVER, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_IPPRECEDENCE: NEED1("ipprecedence requires value"); fill_cmd(cmd, O_IPPRECEDENCE, 0, (strtoul(*av, NULL, 0) & 7) << 5); av++; break; case TOK_IPOPTS: NEED1("missing argument for ipoptions"); fill_flags(cmd, O_IPOPT, f_ipopts, *av); av++; break; case TOK_IPTOS: NEED1("missing argument for iptos"); fill_flags(cmd, O_IPTOS, f_iptos, *av); av++; break; case TOK_UID: NEED1("uid requires argument"); { char *end; uid_t uid; struct passwd *pwd; cmd->opcode = O_UID; uid = strtoul(*av, &end, 0); pwd = (*end == '\0') ? getpwuid(uid) : getpwnam(*av); if (pwd == NULL) errx(EX_DATAERR, "uid \"%s\" nonexistent", *av); cmd32->d[0] = pwd->pw_uid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); av++; } break; case TOK_GID: NEED1("gid requires argument"); { char *end; gid_t gid; struct group *grp; cmd->opcode = O_GID; gid = strtoul(*av, &end, 0); grp = (*end == '\0') ? getgrgid(gid) : getgrnam(*av); if (grp == NULL) errx(EX_DATAERR, "gid \"%s\" nonexistent", *av); cmd32->d[0] = grp->gr_gid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); av++; } break; case TOK_JAIL: NEED1("jail requires argument"); { char *end; int jid; cmd->opcode = O_JAIL; jid = (int)strtol(*av, &end, 0); if (jid < 0 || *end != '\0') errx(EX_DATAERR, "jail requires prison ID"); cmd32->d[0] = (uint32_t)jid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); av++; } break; case TOK_ESTAB: fill_cmd(cmd, O_ESTAB, 0, 0); break; case TOK_SETUP: fill_cmd(cmd, O_TCPFLAGS, 0, (TH_SYN) | ( (TH_ACK) & 0xff) <<8 ); break; case TOK_TCPDATALEN: NEED1("tcpdatalen requires length"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_TCPDATALEN)) errx(EX_DATAERR, "invalid tcpdata len %s", *av); } else fill_cmd(cmd, O_TCPDATALEN, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_TCPOPTS: NEED1("missing argument for tcpoptions"); fill_flags(cmd, O_TCPOPTS, f_tcpopts, *av); av++; break; case TOK_TCPSEQ: case TOK_TCPACK: NEED1("tcpseq/tcpack requires argument"); cmd->len = F_INSN_SIZE(ipfw_insn_u32); cmd->opcode = (i == TOK_TCPSEQ) ? O_TCPSEQ : O_TCPACK; cmd32->d[0] = htonl(strtoul(*av, NULL, 0)); av++; break; case TOK_TCPWIN: NEED1("tcpwin requires length"); fill_cmd(cmd, O_TCPWIN, 0, htons(strtoul(*av, NULL, 0))); av++; break; case TOK_TCPFLAGS: NEED1("missing argument for tcpflags"); cmd->opcode = O_TCPFLAGS; fill_flags(cmd, O_TCPFLAGS, f_tcpflags, *av); av++; break; case TOK_KEEPSTATE: if (open_par) errx(EX_USAGE, "keep-state cannot be part " "of an or block"); if (have_state) errx(EX_USAGE, "only one of keep-state " "and limit is allowed"); have_state = cmd; fill_cmd(cmd, O_KEEP_STATE, 0, 0); break; case TOK_LIMIT: { ipfw_insn_limit *c = (ipfw_insn_limit *)cmd; int val; if (open_par) errx(EX_USAGE, "limit cannot be part of an or block"); if (have_state) errx(EX_USAGE, "only one of keep-state and " "limit is allowed"); have_state = cmd; cmd->len = F_INSN_SIZE(ipfw_insn_limit); cmd->opcode = O_LIMIT; c->limit_mask = c->conn_limit = 0; while ( av[0] != NULL ) { if ((val = match_token(limit_masks, *av)) <= 0) break; c->limit_mask |= val; av++; } if (c->limit_mask == 0) errx(EX_USAGE, "limit: missing limit mask"); GET_UINT_ARG(c->conn_limit, IPFW_ARG_MIN, IPFW_ARG_MAX, TOK_LIMIT, rule_options); av++; break; } case TOK_PROTO: NEED1("missing protocol"); if (add_proto(cmd, *av, &proto)) { av++; } else errx(EX_DATAERR, "invalid protocol ``%s''", *av); break; case TOK_SRCIP: NEED1("missing source IP"); if (add_srcip(cmd, *av)) { av++; } break; case TOK_DSTIP: NEED1("missing destination IP"); if (add_dstip(cmd, *av)) { av++; } break; case TOK_SRCIP6: NEED1("missing source IP6"); if (add_srcip6(cmd, *av)) { av++; } break; case TOK_DSTIP6: NEED1("missing destination IP6"); if (add_dstip6(cmd, *av)) { av++; } break; case TOK_SRCPORT: NEED1("missing source port"); if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_SRCPORT)) { av++; } else errx(EX_DATAERR, "invalid source port %s", *av); break; case TOK_DSTPORT: NEED1("missing destination port"); if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_DSTPORT)) { av++; } else errx(EX_DATAERR, "invalid destination port %s", *av); break; case TOK_MAC: if (add_mac(cmd, av)) av += 2; break; case TOK_MACTYPE: NEED1("missing mac type"); if (!add_mactype(cmd, *av)) errx(EX_DATAERR, "invalid mac type %s", *av); av++; break; case TOK_VERREVPATH: fill_cmd(cmd, O_VERREVPATH, 0, 0); break; case TOK_VERSRCREACH: fill_cmd(cmd, O_VERSRCREACH, 0, 0); break; case TOK_ANTISPOOF: fill_cmd(cmd, O_ANTISPOOF, 0, 0); break; case TOK_IPSEC: fill_cmd(cmd, O_IPSEC, 0, 0); break; case TOK_IPV6: fill_cmd(cmd, O_IP6, 0, 0); break; case TOK_IPV4: fill_cmd(cmd, O_IP4, 0, 0); break; case TOK_EXT6HDR: fill_ext6hdr( cmd, *av ); av++; break; case TOK_FLOWID: if (proto != IPPROTO_IPV6 ) errx( EX_USAGE, "flow-id filter is active " "only for ipv6 protocol\n"); fill_flow6( (ipfw_insn_u32 *) cmd, *av ); av++; break; case TOK_COMMENT: fill_comment(cmd, av); av[0]=NULL; break; case TOK_TAGGED: if (av[0] && strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_TAGGED)) errx(EX_DATAERR, "tagged: invalid tag" " list: %s", *av); } else { uint16_t tag; GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX, TOK_TAGGED, rule_options); fill_cmd(cmd, O_TAGGED, 0, tag); } av++; break; case TOK_FIB: NEED1("fib requires fib number"); fill_cmd(cmd, O_FIB, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_SOCKARG: fill_cmd(cmd, O_SOCKARG, 0, 0); break; case TOK_LOOKUP: { ipfw_insn_u32 *c = (ipfw_insn_u32 *)cmd; char *p; int j; if (!av[0] || !av[1]) errx(EX_USAGE, "format: lookup argument tablenum"); cmd->opcode = O_IP_DST_LOOKUP; cmd->len |= F_INSN_SIZE(ipfw_insn) + 2; i = match_token(rule_options, *av); for (j = 0; lookup_key[j] >= 0 ; j++) { if (i == lookup_key[j]) break; } if (lookup_key[j] <= 0) errx(EX_USAGE, "format: cannot lookup on %s", *av); __PAST_END(c->d, 1) = j; // i converted to option av++; cmd->arg1 = strtoul(*av, &p, 0); if (p && *p) errx(EX_USAGE, "format: lookup argument tablenum"); av++; } break; default: errx(EX_USAGE, "unrecognised option [%d] %s\n", i, s); } if (F_LEN(cmd) > 0) { /* prepare to advance */ prev = cmd; cmd = next_cmd(cmd); } } done: /* * Now copy stuff into the rule. * If we have a keep-state option, the first instruction * must be a PROBE_STATE (which is generated here). * If we have a LOG option, it was stored as the first command, * and now must be moved to the top of the action part. */ dst = (ipfw_insn *)rule->cmd; /* * First thing to write into the command stream is the match probability. */ if (match_prob != 1) { /* 1 means always match */ dst->opcode = O_PROB; dst->len = 2; *((int32_t *)(dst+1)) = (int32_t)(match_prob * 0x7fffffff); dst += dst->len; } /* * generate O_PROBE_STATE if necessary */ if (have_state && have_state->opcode != O_CHECK_STATE) { fill_cmd(dst, O_PROBE_STATE, 0, 0); dst = next_cmd(dst); } /* copy all commands but O_LOG, O_KEEP_STATE, O_LIMIT, O_ALTQ, O_TAG */ for (src = (ipfw_insn *)cmdbuf; src != cmd; src += i) { i = F_LEN(src); switch (src->opcode) { case O_LOG: case O_KEEP_STATE: case O_LIMIT: case O_ALTQ: case O_TAG: break; default: bcopy(src, dst, i * sizeof(uint32_t)); dst += i; } } /* * put back the have_state command as last opcode */ if (have_state && have_state->opcode != O_CHECK_STATE) { i = F_LEN(have_state); bcopy(have_state, dst, i * sizeof(uint32_t)); dst += i; } /* * start action section */ rule->act_ofs = dst - rule->cmd; /* put back O_LOG, O_ALTQ, O_TAG if necessary */ if (have_log) { i = F_LEN(have_log); bcopy(have_log, dst, i * sizeof(uint32_t)); dst += i; } if (have_altq) { i = F_LEN(have_altq); bcopy(have_altq, dst, i * sizeof(uint32_t)); dst += i; } if (have_tag) { i = F_LEN(have_tag); bcopy(have_tag, dst, i * sizeof(uint32_t)); dst += i; } /* * copy all other actions */ for (src = (ipfw_insn *)actbuf; src != action; src += i) { i = F_LEN(src); bcopy(src, dst, i * sizeof(uint32_t)); dst += i; } rule->cmd_len = (uint32_t *)dst - (uint32_t *)(rule->cmd); i = (char *)dst - (char *)rule; if (do_cmd(IP_FW_ADD, rule, (uintptr_t)&i) == -1) err(EX_UNAVAILABLE, "getsockopt(%s)", "IP_FW_ADD"); if (!co.do_quiet) show_ipfw(rule, 0, 0); } /* * clear the counters or the log counters. */ void ipfw_zero(int ac, char *av[], int optname /* 0 = IP_FW_ZERO, 1 = IP_FW_RESETLOG */) { uint32_t arg, saved_arg; int failed = EX_OK; char const *errstr; char const *name = optname ? "RESETLOG" : "ZERO"; optname = optname ? IP_FW_RESETLOG : IP_FW_ZERO; av++; ac--; if (!ac) { /* clear all entries */ if (do_cmd(optname, NULL, 0) < 0) err(EX_UNAVAILABLE, "setsockopt(IP_FW_%s)", name); if (!co.do_quiet) printf("%s.\n", optname == IP_FW_ZERO ? "Accounting cleared":"Logging counts reset"); return; } while (ac) { /* Rule number */ if (isdigit(**av)) { arg = strtonum(*av, 0, 0xffff, &errstr); if (errstr) errx(EX_DATAERR, "invalid rule number %s\n", *av); saved_arg = arg; if (co.use_set) arg |= (1 << 24) | ((co.use_set - 1) << 16); av++; ac--; if (do_cmd(optname, &arg, sizeof(arg))) { warn("rule %u: setsockopt(IP_FW_%s)", saved_arg, name); failed = EX_UNAVAILABLE; } else if (!co.do_quiet) printf("Entry %d %s.\n", saved_arg, optname == IP_FW_ZERO ? "cleared" : "logging count reset"); } else { errx(EX_USAGE, "invalid rule number ``%s''", *av); } } if (failed != EX_OK) exit(failed); } void ipfw_flush(int force) { int cmd = co.do_pipe ? IP_DUMMYNET_FLUSH : IP_FW_FLUSH; if (!force && !co.do_quiet) { /* need to ask user */ int c; printf("Are you sure? [yn] "); fflush(stdout); do { c = toupper(getc(stdin)); while (c != '\n' && getc(stdin) != '\n') if (feof(stdin)) return; /* and do not flush */ } while (c != 'Y' && c != 'N'); printf("\n"); if (c == 'N') /* user said no */ return; } if (co.do_pipe) { dummynet_flush(); return; } /* `ipfw set N flush` - is the same that `ipfw delete set N` */ if (co.use_set) { uint32_t arg = ((co.use_set - 1) & 0xffff) | (1 << 24); if (do_cmd(IP_FW_DEL, &arg, sizeof(arg)) < 0) err(EX_UNAVAILABLE, "setsockopt(IP_FW_DEL)"); } else if (do_cmd(cmd, NULL, 0) < 0) err(EX_UNAVAILABLE, "setsockopt(IP_%s_FLUSH)", co.do_pipe ? "DUMMYNET" : "FW"); if (!co.do_quiet) printf("Flushed all %s.\n", co.do_pipe ? "pipes" : "rules"); } static void table_list(ipfw_table_entry ent, int need_header); /* * This one handles all table-related commands * ipfw table N add addr[/masklen] [value] * ipfw table N delete addr[/masklen] * ipfw table {N | all} flush * ipfw table {N | all} list */ void ipfw_table_handler(int ac, char *av[]) { ipfw_table_entry ent; int do_add; int is_all; size_t len; char *p; uint32_t a; uint32_t tables_max; len = sizeof(tables_max); if (sysctlbyname("net.inet.ip.fw.tables_max", &tables_max, &len, NULL, 0) == -1) { #ifdef IPFW_TABLES_MAX warn("Warn: Failed to get the max tables number via sysctl. " "Using the compiled in defaults. \nThe reason was"); tables_max = IPFW_TABLES_MAX; #else errx(1, "Failed sysctlbyname(\"net.inet.ip.fw.tables_max\")"); #endif } ac--; av++; if (ac && isdigit(**av)) { ent.tbl = atoi(*av); is_all = 0; ac--; av++; } else if (ac && _substrcmp(*av, "all") == 0) { ent.tbl = 0; is_all = 1; ac--; av++; } else errx(EX_USAGE, "table number or 'all' keyword required"); if (ent.tbl >= tables_max) errx(EX_USAGE, "The table number exceeds the maximum allowed " "value (%d)", tables_max - 1); NEED1("table needs command"); if (is_all && _substrcmp(*av, "list") != 0 && _substrcmp(*av, "flush") != 0) errx(EX_USAGE, "table number required"); if (_substrcmp(*av, "add") == 0 || _substrcmp(*av, "delete") == 0) { do_add = **av == 'a'; ac--; av++; if (!ac) errx(EX_USAGE, "IP address required"); p = strchr(*av, '/'); if (p) { *p++ = '\0'; ent.masklen = atoi(p); if (ent.masklen > 32) errx(EX_DATAERR, "bad width ``%s''", p); } else ent.masklen = 32; if (lookup_host(*av, (struct in_addr *)&ent.addr) != 0) errx(EX_NOHOST, "hostname ``%s'' unknown", *av); ac--; av++; if (do_add && ac) { unsigned int tval; /* isdigit is a bit of a hack here.. */ if (strchr(*av, (int)'.') == NULL && isdigit(**av)) { ent.value = strtoul(*av, NULL, 0); } else { if (lookup_host(*av, (struct in_addr *)&tval) == 0) { /* The value must be stored in host order * * so that the values < 65k can be distinguished */ ent.value = ntohl(tval); } else { errx(EX_NOHOST, "hostname ``%s'' unknown", *av); } } } else ent.value = 0; if (do_cmd(do_add ? IP_FW_TABLE_ADD : IP_FW_TABLE_DEL, &ent, sizeof(ent)) < 0) { /* If running silent, don't bomb out on these errors. */ if (!(co.do_quiet && (errno == (do_add ? EEXIST : ESRCH)))) err(EX_OSERR, "setsockopt(IP_FW_TABLE_%s)", do_add ? "ADD" : "DEL"); /* In silent mode, react to a failed add by deleting */ if (do_add) { do_cmd(IP_FW_TABLE_DEL, &ent, sizeof(ent)); if (do_cmd(IP_FW_TABLE_ADD, &ent, sizeof(ent)) < 0) err(EX_OSERR, "setsockopt(IP_FW_TABLE_ADD)"); } } } else if (_substrcmp(*av, "flush") == 0) { a = is_all ? tables_max : (uint32_t)(ent.tbl + 1); do { if (do_cmd(IP_FW_TABLE_FLUSH, &ent.tbl, sizeof(ent.tbl)) < 0) err(EX_OSERR, "setsockopt(IP_FW_TABLE_FLUSH)"); } while (++ent.tbl < a); } else if (_substrcmp(*av, "list") == 0) { a = is_all ? tables_max : (uint32_t)(ent.tbl + 1); do { table_list(ent, is_all); } while (++ent.tbl < a); } else errx(EX_USAGE, "invalid table command %s", *av); } static void table_list(ipfw_table_entry ent, int need_header) { ipfw_table *tbl; socklen_t l; uint32_t a; a = ent.tbl; l = sizeof(a); if (do_cmd(IP_FW_TABLE_GETSIZE, &a, (uintptr_t)&l) < 0) err(EX_OSERR, "getsockopt(IP_FW_TABLE_GETSIZE)"); /* If a is zero we have nothing to do, the table is empty. */ if (a == 0) return; l = sizeof(*tbl) + a * sizeof(ipfw_table_entry); tbl = safe_calloc(1, l); tbl->tbl = ent.tbl; if (do_cmd(IP_FW_TABLE_LIST, tbl, (uintptr_t)&l) < 0) err(EX_OSERR, "getsockopt(IP_FW_TABLE_LIST)"); if (tbl->cnt && need_header) printf("---table(%d)---\n", tbl->tbl); for (a = 0; a < tbl->cnt; a++) { unsigned int tval; tval = tbl->ent[a].value; if (co.do_value_as_ip) { char tbuf[128]; strncpy(tbuf, inet_ntoa(*(struct in_addr *) &tbl->ent[a].addr), 127); /* inet_ntoa expects network order */ tval = htonl(tval); printf("%s/%u %s\n", tbuf, tbl->ent[a].masklen, inet_ntoa(*(struct in_addr *)&tval)); } else { printf("%s/%u %u\n", inet_ntoa(*(struct in_addr *)&tbl->ent[a].addr), tbl->ent[a].masklen, tval); } } free(tbl); } ================================================ FILE: ipfw/ipfw2.h ================================================ /* * Copyright (c) 2002-2003 Luigi Rizzo * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * * Idea and grammar partially left from: * Copyright (c) 1993 Daniel Boulet * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. * * NEW command line interface for IP firewall facility * * $FreeBSD: head/sbin/ipfw/ipfw2.h 206843 2010-04-19 15:11:45Z luigi $ */ /* * Options that can be set on the command line. * When reading commands from a file, a subset of the options can also * be applied globally by specifying them before the file name. * After that, each line can contain its own option that changes * the global value. * XXX The context is not restored after each line. */ struct cmdline_opts { /* boolean options: */ int do_value_as_ip; /* show table value as IP */ int do_resolv; /* try to resolve all ip to names */ int do_time; /* Show time stamps */ int do_quiet; /* Be quiet in add and flush */ int do_pipe; /* this cmd refers to a pipe/queue/sched */ int do_nat; /* this cmd refers to a nat config */ int do_dynamic; /* display dynamic rules */ int do_expired; /* display expired dynamic rules */ int do_compact; /* show rules in compact mode */ int do_force; /* do not ask for confirmation */ int show_sets; /* display the set each rule belongs to */ int test_only; /* only check syntax */ int comment_only; /* only print action and comment */ int verbose; /* be verbose on some commands */ /* The options below can have multiple values. */ int do_sort; /* field to sort results (0 = no) */ /* valid fields are 1 and above */ int use_set; /* work with specified set number */ /* 0 means all sets, otherwise apply to set use_set - 1 */ }; extern struct cmdline_opts co; /* * _s_x is a structure that stores a string <-> token pairs, used in * various places in the parser. Entries are stored in arrays, * with an entry with s=NULL as terminator. * The search routines are match_token() and match_value(). * Often, an element with x=0 contains an error string. * */ struct _s_x { char const *s; int x; }; enum tokens { TOK_NULL=0, TOK_OR, TOK_NOT, TOK_STARTBRACE, TOK_ENDBRACE, TOK_ACCEPT, TOK_COUNT, TOK_PIPE, TOK_LINK, TOK_QUEUE, TOK_FLOWSET, TOK_SCHED, TOK_DIVERT, TOK_TEE, TOK_NETGRAPH, TOK_NGTEE, TOK_FORWARD, TOK_SKIPTO, TOK_DENY, TOK_REJECT, TOK_RESET, TOK_UNREACH, TOK_CHECKSTATE, TOK_NAT, TOK_REASS, TOK_CALL, TOK_RETURN, TOK_ALTQ, TOK_LOG, TOK_TAG, TOK_UNTAG, TOK_TAGGED, TOK_UID, TOK_GID, TOK_JAIL, TOK_IN, TOK_LIMIT, TOK_KEEPSTATE, TOK_LAYER2, TOK_OUT, TOK_DIVERTED, TOK_DIVERTEDLOOPBACK, TOK_DIVERTEDOUTPUT, TOK_XMIT, TOK_RECV, TOK_VIA, TOK_FRAG, TOK_IPOPTS, TOK_IPLEN, TOK_IPID, TOK_IPPRECEDENCE, TOK_DSCP, TOK_IPTOS, TOK_IPTTL, TOK_IPVER, TOK_ESTAB, TOK_SETUP, TOK_TCPDATALEN, TOK_TCPFLAGS, TOK_TCPOPTS, TOK_TCPSEQ, TOK_TCPACK, TOK_TCPWIN, TOK_ICMPTYPES, TOK_MAC, TOK_MACTYPE, TOK_VERREVPATH, TOK_VERSRCREACH, TOK_ANTISPOOF, TOK_IPSEC, TOK_COMMENT, TOK_PLR, TOK_NOERROR, TOK_BUCKETS, TOK_DSTIP, TOK_SRCIP, TOK_DSTPORT, TOK_SRCPORT, TOK_ALL, TOK_MASK, TOK_FLOW_MASK, TOK_SCHED_MASK, TOK_BW, TOK_DELAY, TOK_PROFILE, TOK_BURST, TOK_RED, TOK_GRED, TOK_DROPTAIL, TOK_PROTO, /* dummynet tokens */ TOK_WEIGHT, TOK_LMAX, TOK_PRI, TOK_TYPE, TOK_SLOTSIZE, TOK_IP, TOK_IF, TOK_ALOG, TOK_DENY_INC, TOK_SAME_PORTS, TOK_UNREG_ONLY, TOK_SKIP_GLOBAL, TOK_RESET_ADDR, TOK_ALIAS_REV, TOK_PROXY_ONLY, TOK_REDIR_ADDR, TOK_REDIR_PORT, TOK_REDIR_PROTO, TOK_IPV6, TOK_FLOWID, TOK_ICMP6TYPES, TOK_EXT6HDR, TOK_DSTIP6, TOK_SRCIP6, TOK_IPV4, TOK_UNREACH6, TOK_RESET6, TOK_FIB, TOK_SETFIB, TOK_LOOKUP, TOK_SOCKARG, }; /* * the following macro returns an error message if we run out of * arguments. */ #define NEED(_p, msg) {if (!_p) errx(EX_USAGE, msg);} #define NEED1(msg) {if (!(*av)) errx(EX_USAGE, msg);} int pr_u64(uint64_t *pd, int width); /* memory allocation support */ void *safe_calloc(size_t number, size_t size); void *safe_realloc(void *ptr, size_t size); /* string comparison functions used for historical compatibility */ int _substrcmp(const char *str1, const char* str2); int _substrcmp2(const char *str1, const char* str2, const char* str3); /* utility functions */ int match_token(struct _s_x *table, char *string); char const *match_value(struct _s_x *p, int value); int do_cmd(int optname, void *optval, uintptr_t optlen); struct in6_addr; void n2mask(struct in6_addr *mask, int n); int contigmask(uint8_t *p, int len); /* * Forward declarations to avoid include way too many headers. * C does not allow duplicated typedefs, so we use the base struct * that the typedef points to. * Should the typedefs use a different type, the compiler will * still detect the change when compiling the body of the * functions involved, so we do not lose error checking. */ struct _ipfw_insn; struct _ipfw_insn_altq; struct _ipfw_insn_u32; struct _ipfw_insn_ip6; struct _ipfw_insn_icmp6; /* * The reserved set numer. This is a constant in ip_fw.h * but we store it in a variable so other files do not depend * in that header just for one constant. */ extern int resvd_set_number; /* first-level command handlers */ void ipfw_add(char *av[]); void ipfw_show_nat(int ac, char **av); void ipfw_config_pipe(int ac, char **av); void ipfw_config_nat(int ac, char **av); void ipfw_sets_handler(char *av[]); void ipfw_table_handler(int ac, char *av[]); void ipfw_sysctl_handler(char *av[], int which); void ipfw_delete(char *av[]); void ipfw_flush(int force); void ipfw_zero(int ac, char *av[], int optname); void ipfw_list(int ac, char *av[], int show_counters); /* altq.c */ void altq_set_enabled(int enabled); u_int32_t altq_name_to_qid(const char *name); void print_altq_cmd(struct _ipfw_insn_altq *altqptr); /* dummynet.c */ void dummynet_list(int ac, char *av[], int show_counters); void dummynet_flush(void); int ipfw_delete_pipe(int pipe_or_queue, int n); /* ipv6.c */ void print_unreach6_code(uint16_t code); void print_ip6(struct _ipfw_insn_ip6 *cmd, char const *s); void print_flow6id(struct _ipfw_insn_u32 *cmd); void print_icmp6types(struct _ipfw_insn_u32 *cmd); void print_ext6hdr(struct _ipfw_insn *cmd ); struct _ipfw_insn *add_srcip6(struct _ipfw_insn *cmd, char *av); struct _ipfw_insn *add_dstip6(struct _ipfw_insn *cmd, char *av); void fill_flow6(struct _ipfw_insn_u32 *cmd, char *av ); void fill_unreach6_code(u_short *codep, char *str); void fill_icmp6types(struct _ipfw_insn_icmp6 *cmd, char *av); int fill_ext6hdr(struct _ipfw_insn *cmd, char *av); ================================================ FILE: ipfw/ipv6.c ================================================ /* * Copyright (c) 2002-2003 Luigi Rizzo * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * * Idea and grammar partially left from: * Copyright (c) 1993 Daniel Boulet * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. * * NEW command line interface for IP firewall facility * * $FreeBSD: user/luigi/ipfw3-head/sbin/ipfw/ipv6.c 187770 2009-01-27 12:01:30Z luigi $ * * ipv6 support */ #include #include #include "ipfw2.h" #include #include #include #include #include #include #include #include #include #include #include #include #include static struct _s_x icmp6codes[] = { { "no-route", ICMP6_DST_UNREACH_NOROUTE }, { "admin-prohib", ICMP6_DST_UNREACH_ADMIN }, { "address", ICMP6_DST_UNREACH_ADDR }, { "port", ICMP6_DST_UNREACH_NOPORT }, { NULL, 0 } }; void fill_unreach6_code(u_short *codep, char *str) { int val; char *s; val = strtoul(str, &s, 0); if (s == str || *s != '\0' || val >= 0x100) val = match_token(icmp6codes, str); if (val < 0) errx(EX_DATAERR, "unknown ICMPv6 unreachable code ``%s''", str); *codep = val; return; } void print_unreach6_code(uint16_t code) { char const *s = match_value(icmp6codes, code); if (s != NULL) printf("unreach6 %s", s); else printf("unreach6 %u", code); } /* * Print the ip address contained in a command. */ void print_ip6(ipfw_insn_ip6 *cmd, char const *s) { struct hostent *he = NULL; int len = F_LEN((ipfw_insn *) cmd) - 1; struct in6_addr *a = &(cmd->addr6); char trad[255]; printf("%s%s ", cmd->o.len & F_NOT ? " not": "", s); if (cmd->o.opcode == O_IP6_SRC_ME || cmd->o.opcode == O_IP6_DST_ME) { printf("me6"); return; } if (cmd->o.opcode == O_IP6) { printf(" ip6"); return; } /* * len == 4 indicates a single IP, whereas lists of 1 or more * addr/mask pairs have len = (2n+1). We convert len to n so we * use that to count the number of entries. */ for (len = len / 4; len > 0; len -= 2, a += 2) { int mb = /* mask length */ (cmd->o.opcode == O_IP6_SRC || cmd->o.opcode == O_IP6_DST) ? 128 : contigmask((uint8_t *)&(a[1]), 128); if (mb == 128 && co.do_resolv) he = gethostbyaddr((char *)a, sizeof(*a), AF_INET6); if (he != NULL) /* resolved to name */ printf("%s", he->h_name); else if (mb == 0) /* any */ printf("any"); else { /* numeric IP followed by some kind of mask */ if (inet_ntop(AF_INET6, a, trad, sizeof( trad ) ) == NULL) printf("Error ntop in print_ip6\n"); printf("%s", trad ); if (mb < 0) /* XXX not really legal... */ printf(":%s", inet_ntop(AF_INET6, &a[1], trad, sizeof(trad))); else if (mb < 128) printf("/%d", mb); } if (len > 2) printf(","); } } void fill_icmp6types(ipfw_insn_icmp6 *cmd, char *av) { uint8_t type; bzero(cmd, sizeof(*cmd)); while (*av) { if (*av == ',') av++; type = strtoul(av, &av, 0); if (*av != ',' && *av != '\0') errx(EX_DATAERR, "invalid ICMP6 type"); /* * XXX: shouldn't this be 0xFF? I can't see any reason why * we shouldn't be able to filter all possiable values * regardless of the ability of the rest of the kernel to do * anything useful with them. */ if (type > ICMP6_MAXTYPE) errx(EX_DATAERR, "ICMP6 type out of range"); cmd->d[type / 32] |= ( 1 << (type % 32)); } cmd->o.opcode = O_ICMP6TYPE; cmd->o.len |= F_INSN_SIZE(ipfw_insn_icmp6); } void print_icmp6types(ipfw_insn_u32 *cmd) { int i, j; char sep= ' '; printf(" ip6 icmp6types"); for (i = 0; i < 7; i++) for (j=0; j < 32; ++j) { if ( (cmd->d[i] & (1 << (j))) == 0) continue; printf("%c%d", sep, (i*32 + j)); sep = ','; } } void print_flow6id( ipfw_insn_u32 *cmd) { uint16_t i, limit = cmd->o.arg1; char sep = ','; printf(" flow-id "); for( i=0; i < limit; ++i) { if (i == limit - 1) sep = ' '; printf("%d%c", cmd->d[i], sep); } } /* structure and define for the extension header in ipv6 */ static struct _s_x ext6hdrcodes[] = { { "frag", EXT_FRAGMENT }, { "hopopt", EXT_HOPOPTS }, { "route", EXT_ROUTING }, { "dstopt", EXT_DSTOPTS }, { "ah", EXT_AH }, { "esp", EXT_ESP }, { "rthdr0", EXT_RTHDR0 }, { "rthdr2", EXT_RTHDR2 }, { NULL, 0 } }; /* fills command for the extension header filtering */ int fill_ext6hdr( ipfw_insn *cmd, char *av) { int tok; char *s = av; cmd->arg1 = 0; while(s) { av = strsep( &s, ",") ; tok = match_token(ext6hdrcodes, av); switch (tok) { case EXT_FRAGMENT: cmd->arg1 |= EXT_FRAGMENT; break; case EXT_HOPOPTS: cmd->arg1 |= EXT_HOPOPTS; break; case EXT_ROUTING: cmd->arg1 |= EXT_ROUTING; break; case EXT_DSTOPTS: cmd->arg1 |= EXT_DSTOPTS; break; case EXT_AH: cmd->arg1 |= EXT_AH; break; case EXT_ESP: cmd->arg1 |= EXT_ESP; break; case EXT_RTHDR0: cmd->arg1 |= EXT_RTHDR0; break; case EXT_RTHDR2: cmd->arg1 |= EXT_RTHDR2; break; default: errx( EX_DATAERR, "invalid option for ipv6 exten header" ); break; } } if (cmd->arg1 == 0 ) return 0; cmd->opcode = O_EXT_HDR; cmd->len |= F_INSN_SIZE( ipfw_insn ); return 1; } void print_ext6hdr( ipfw_insn *cmd ) { char sep = ' '; printf(" extension header:"); if (cmd->arg1 & EXT_FRAGMENT ) { printf("%cfragmentation", sep); sep = ','; } if (cmd->arg1 & EXT_HOPOPTS ) { printf("%chop options", sep); sep = ','; } if (cmd->arg1 & EXT_ROUTING ) { printf("%crouting options", sep); sep = ','; } if (cmd->arg1 & EXT_RTHDR0 ) { printf("%crthdr0", sep); sep = ','; } if (cmd->arg1 & EXT_RTHDR2 ) { printf("%crthdr2", sep); sep = ','; } if (cmd->arg1 & EXT_DSTOPTS ) { printf("%cdestination options", sep); sep = ','; } if (cmd->arg1 & EXT_AH ) { printf("%cauthentication header", sep); sep = ','; } if (cmd->arg1 & EXT_ESP ) { printf("%cencapsulated security payload", sep); } } /* Try to find ipv6 address by hostname */ static int lookup_host6 (char *host, struct in6_addr *ip6addr) { struct hostent *he; if (!inet_pton(AF_INET6, host, ip6addr)) { if ((he = gethostbyname2(host, AF_INET6)) == NULL) return(-1); memcpy(ip6addr, he->h_addr_list[0], sizeof( struct in6_addr)); } return(0); } /* * fill the addr and mask fields in the instruction as appropriate from av. * Update length as appropriate. * The following formats are allowed: * any matches any IP6. Actually returns an empty instruction. * me returns O_IP6_*_ME * * 03f1::234:123:0342 single IP6 addres * 03f1::234:123:0342/24 address/mask * 03f1::234:123:0342/24,03f1::234:123:0343/ List of address * * Set of address (as in ipv6) not supported because ipv6 address * are typically random past the initial prefix. * Return 1 on success, 0 on failure. */ static int fill_ip6(ipfw_insn_ip6 *cmd, char *av) { int len = 0; struct in6_addr *d = &(cmd->addr6); /* * Needed for multiple address. * Note d[1] points to struct in6_add r mask6 of cmd */ cmd->o.len &= ~F_LEN_MASK; /* zero len */ if (strcmp(av, "any") == 0) return (1); if (strcmp(av, "me") == 0) { /* Set the data for "me" opt*/ cmd->o.len |= F_INSN_SIZE(ipfw_insn); return (1); } if (strcmp(av, "me6") == 0) { /* Set the data for "me" opt*/ cmd->o.len |= F_INSN_SIZE(ipfw_insn); return (1); } av = strdup(av); while (av) { /* * After the address we can have '/' indicating a mask, * or ',' indicating another address follows. */ char *p; int masklen; char md = '\0'; if ((p = strpbrk(av, "/,")) ) { md = *p; /* save the separator */ *p = '\0'; /* terminate address string */ p++; /* and skip past it */ } /* now p points to NULL, mask or next entry */ /* lookup stores address in *d as a side effect */ if (lookup_host6(av, d) != 0) { /* XXX: failed. Free memory and go */ errx(EX_DATAERR, "bad address \"%s\"", av); } /* next, look at the mask, if any */ masklen = (md == '/') ? atoi(p) : 128; if (masklen > 128 || masklen < 0) errx(EX_DATAERR, "bad width \"%s\''", p); else n2mask(&d[1], masklen); APPLY_MASK(d, &d[1]) /* mask base address with mask */ /* find next separator */ if (md == '/') { /* find separator past the mask */ p = strpbrk(p, ","); if (p != NULL) p++; } av = p; /* Check this entry */ if (masklen == 0) { /* * 'any' turns the entire list into a NOP. * 'not any' never matches, so it is removed from the * list unless it is the only item, in which case we * report an error. */ if (cmd->o.len & F_NOT && av == NULL && len == 0) errx(EX_DATAERR, "not any never matches"); continue; } /* * A single IP can be stored alone */ if (masklen == 128 && av == NULL && len == 0) { len = F_INSN_SIZE(struct in6_addr); break; } /* Update length and pointer to arguments */ len += F_INSN_SIZE(struct in6_addr)*2; d += 2; } /* end while */ /* * Total length of the command, remember that 1 is the size of * the base command. */ if (len + 1 > F_LEN_MASK) errx(EX_DATAERR, "address list too long"); cmd->o.len |= len+1; free(av); return (1); } /* * fills command for ipv6 flow-id filtering * note that the 20 bit flow number is stored in a array of u_int32_t * it's supported lists of flow-id, so in the o.arg1 we store how many * additional flow-id we want to filter, the basic is 1 */ void fill_flow6( ipfw_insn_u32 *cmd, char *av ) { u_int32_t type; /* Current flow number */ u_int16_t nflow = 0; /* Current flow index */ char *s = av; cmd->d[0] = 0; /* Initializing the base number*/ while (s) { av = strsep( &s, ",") ; type = strtoul(av, &av, 0); if (*av != ',' && *av != '\0') errx(EX_DATAERR, "invalid ipv6 flow number %s", av); if (type > 0xfffff) errx(EX_DATAERR, "flow number out of range %s", av); cmd->d[nflow] |= type; nflow++; } if( nflow > 0 ) { cmd->o.opcode = O_FLOW6ID; cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + nflow; cmd->o.arg1 = nflow; } else { errx(EX_DATAERR, "invalid ipv6 flow number %s", av); } } ipfw_insn * add_srcip6(ipfw_insn *cmd, char *av) { fill_ip6((ipfw_insn_ip6 *)cmd, av); if (F_LEN(cmd) == 0) { /* any */ } else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) { /* "me" */ cmd->opcode = O_IP6_SRC_ME; } else if (F_LEN(cmd) == (F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn))) { /* single IP, no mask*/ cmd->opcode = O_IP6_SRC; } else { /* addr/mask opt */ cmd->opcode = O_IP6_SRC_MASK; } return cmd; } ipfw_insn * add_dstip6(ipfw_insn *cmd, char *av) { fill_ip6((ipfw_insn_ip6 *)cmd, av); if (F_LEN(cmd) == 0) { /* any */ } else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) { /* "me" */ cmd->opcode = O_IP6_DST_ME; } else if (F_LEN(cmd) == (F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn))) { /* single IP, no mask*/ cmd->opcode = O_IP6_DST; } else { /* addr/mask opt */ cmd->opcode = O_IP6_DST_MASK; } return cmd; } ================================================ FILE: ipfw/main.c ================================================ /* * Copyright (c) 2002-2003,2010 Luigi Rizzo * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * * Idea and grammar partially left from: * Copyright (c) 1993 Daniel Boulet * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. * * Command line interface for IP firewall facility * * $FreeBSD: head/sbin/ipfw/main.c 206494 2010-04-12 08:27:53Z luigi $ */ #include #include #include #include #include #include #include #include #include #include #include "ipfw2.h" static void help(void) { fprintf(stderr, "ipfw syntax summary (but please do read the ipfw(8) manpage):\n\n" "\tipfw [-abcdefhnNqStTv] \n\n" "where is one of the following:\n\n" "add [num] [set N] [prob x] RULE-BODY\n" "{pipe|queue} N config PIPE-BODY\n" "[pipe|queue] {zero|delete|show} [N{,N}]\n" "nat N config {ip IPADDR|if IFNAME|log|deny_in|same_ports|unreg_only|reset|\n" " reverse|proxy_only|redirect_addr linkspec|\n" " redirect_port linkspec|redirect_proto linkspec}\n" "set [disable N... enable N...] | move [rule] X to Y | swap X Y | show\n" "set N {show|list|zero|resetlog|delete} [N{,N}] | flush\n" "table N {add ip[/bits] [value] | delete ip[/bits] | flush | list}\n" "table all {flush | list}\n" "\n" "RULE-BODY: check-state [PARAMS] | ACTION [PARAMS] ADDR [OPTION_LIST]\n" "ACTION: check-state | allow | count | deny | unreach{,6} CODE |\n" " skipto N | {divert|tee} PORT | forward ADDR |\n" " pipe N | queue N | nat N | setfib FIB | reass\n" "PARAMS: [log [logamount LOGLIMIT]] [altq QUEUE_NAME]\n" "ADDR: [ MAC dst src ether_type ] \n" " [ ip from IPADDR [ PORT ] to IPADDR [ PORTLIST ] ]\n" " [ ipv6|ip6 from IP6ADDR [ PORT ] to IP6ADDR [ PORTLIST ] ]\n" "IPADDR: [not] { any | me | ip/bits{x,y,z} | table(t[,v]) | IPLIST }\n" "IP6ADDR: [not] { any | me | me6 | ip6/bits | IP6LIST }\n" "IP6LIST: { ip6 | ip6/bits }[,IP6LIST]\n" "IPLIST: { ip | ip/bits | ip:mask }[,IPLIST]\n" "OPTION_LIST: OPTION [OPTION_LIST]\n" "OPTION: bridged | diverted | diverted-loopback | diverted-output |\n" " {dst-ip|src-ip} IPADDR | {dst-ip6|src-ip6|dst-ipv6|src-ipv6} IP6ADDR |\n" " {dst-port|src-port} LIST |\n" " estab | frag | {gid|uid} N | icmptypes LIST | in | out | ipid LIST |\n" " iplen LIST | ipoptions SPEC | ipprecedence | ipsec | iptos SPEC |\n" " ipttl LIST | ipversion VER | keep-state | layer2 | limit ... |\n" " icmp6types LIST | ext6hdr LIST | flow-id N[,N] | fib FIB |\n" " mac ... | mac-type LIST | proto LIST | {recv|xmit|via} {IF|IPADDR} |\n" " setup | {tcpack|tcpseq|tcpwin} NN | tcpflags SPEC | tcpoptions SPEC |\n" " tcpdatalen LIST | verrevpath | versrcreach | antispoof\n" ); exit(0); } /* * Called with the arguments, including program name because getopt * wants it to be present. * Returns 0 if successful, 1 if empty command, errx() in case of errors. * First thing we do is process parameters creating an argv[] array * which includes the program name and a NULL entry at the end. * If we are called with a single string, we split it on whitespace. * Also, arguments with a trailing ',' are joined to the next one. * The pointers (av[]) and data are in a single chunk of memory. * av[0] points to the original program name, all other entries * point into the allocated chunk. */ static int ipfw_main(int oldac, char **oldav) { int ch, ac; const char *errstr; char **av, **save_av; int do_acct = 0; /* Show packet/byte count */ int try_next = 0; /* set if pipe cmd not found */ int av_size; /* compute the av size */ char *av_p; /* used to build the av list */ #define WHITESP " \t\f\v\n\r" if (oldac < 2) return 1; /* need at least one argument */ if (oldac == 2) { /* * If we are called with one argument, try to split it into * words for subsequent parsing. Spaces after a ',' are * removed by copying the string in-place. */ char *arg = oldav[1]; /* The string is the first arg. */ int l = strlen(arg); int copy = 0; /* 1 if we need to copy, 0 otherwise */ int i, j; for (i = j = 0; i < l; i++) { if (arg[i] == '#') /* comment marker */ break; if (copy) { arg[j++] = arg[i]; copy = !strchr("," WHITESP, arg[i]); } else { copy = !strchr(WHITESP, arg[i]); if (copy) arg[j++] = arg[i]; } } if (!copy && j > 0) /* last char was a 'blank', remove it */ j--; l = j; /* the new argument length */ arg[j++] = '\0'; if (l == 0) /* empty string! */ return 1; /* * First, count number of arguments. Because of the previous * processing, this is just the number of blanks plus 1. */ for (i = 0, ac = 1; i < l; i++) if (strchr(WHITESP, arg[i]) != NULL) ac++; /* * Allocate the argument list structure as a single block * of memory, containing pointers and the argument * strings. We include one entry for the program name * because getopt expects it, and a NULL at the end * to simplify further parsing. */ ac++; /* add 1 for the program name */ av_size = (ac+1) * sizeof(char *) + l + 1; av = safe_calloc(av_size, 1); /* * Init the argument pointer to the end of the array * and copy arguments from arg[] to av[]. For each one, * j is the initial character, i is the one past the end. */ av_p = (char *)&av[ac+1]; for (ac = 1, i = j = 0; i < l; i++) { if (strchr(WHITESP, arg[i]) != NULL || i == l-1) { if (i == l-1) i++; bcopy(arg+j, av_p, i-j); av[ac] = av_p; av_p += i-j; /* the length of the string */ *av_p++ = '\0'; ac++; j = i + 1; } } } else { /* * If an argument ends with ',' join with the next one. */ int first, i, l=0; /* * Allocate the argument list structure as a single block * of memory, containing both pointers and the argument * strings. We include some space for the program name * because getopt expects it. * We add an extra pointer to the end of the array, * to make simpler further parsing. */ for (i=0; i= 2 && !strcmp(av[1], "sysctl")) { char *s; int i; if (ac != 3) { printf( "sysctl emulation usage:\n" " ipfw sysctl name[=value]\n" " ipfw sysctl -a\n"); return 0; } s = strchr(av[2], '='); if (s == NULL) { s = !strcmp(av[2], "-a") ? NULL : av[2]; sysctlbyname(s, NULL, NULL, NULL, 0); } else { /* ipfw sysctl x.y.z=value */ /* assume an INT value, will extend later */ if (s[1] == '\0') { printf("ipfw sysctl: missing value\n\n"); return 0; } *s = '\0'; i = strtol(s+1, NULL, 0); sysctlbyname(av[2], NULL, NULL, &i, sizeof(int)); } return 0; } #endif /* Save arguments for final freeing of memory. */ save_av = av; optind = optreset = 1; /* restart getopt() */ while ((ch = getopt(ac, av, "abcdefhinNp:qs:STtv")) != -1) switch (ch) { case 'a': do_acct = 1; break; case 'b': co.comment_only = 1; co.do_compact = 1; break; case 'c': co.do_compact = 1; break; case 'd': co.do_dynamic = 1; break; case 'e': co.do_expired = 1; break; case 'f': co.do_force = 1; break; case 'h': /* help */ free(save_av); help(); break; /* NOTREACHED */ case 'i': co.do_value_as_ip = 1; break; case 'n': co.test_only = 1; break; case 'N': co.do_resolv = 1; break; case 'q': co.do_quiet = 1; break; case 'p': errx(EX_USAGE, "An absolute pathname must be used " "with -p option."); /* NOTREACHED */ case 's': /* sort */ co.do_sort = atoi(optarg); break; case 'S': co.show_sets = 1; break; case 't': co.do_time = 1; break; case 'T': co.do_time = 2; /* numeric timestamp */ break; case 'v': /* verbose */ co.verbose = 1; break; default: free(save_av); return 1; } ac -= optind; av += optind; NEED1("bad arguments, for usage summary ``ipfw''"); /* * An undocumented behaviour of ipfw1 was to allow rule numbers first, * e.g. "100 add allow ..." instead of "add 100 allow ...". * In case, swap first and second argument to get the normal form. */ if (ac > 1 && isdigit(*av[0])) { char *p = av[0]; av[0] = av[1]; av[1] = p; } /* * Optional: pipe, queue or nat. */ co.do_nat = 0; co.do_pipe = 0; co.use_set = 0; if (!strncmp(*av, "nat", strlen(*av))) co.do_nat = 1; else if (!strncmp(*av, "pipe", strlen(*av))) co.do_pipe = 1; else if (_substrcmp(*av, "queue") == 0) co.do_pipe = 2; else if (_substrcmp(*av, "flowset") == 0) co.do_pipe = 2; else if (_substrcmp(*av, "sched") == 0) co.do_pipe = 3; else if (!strncmp(*av, "set", strlen(*av))) { if (ac > 1 && isdigit(av[1][0])) { co.use_set = strtonum(av[1], 0, resvd_set_number, &errstr); if (errstr) errx(EX_DATAERR, "invalid set number %s\n", av[1]); ac -= 2; av += 2; co.use_set++; } } if (co.do_pipe || co.do_nat) { ac--; av++; } NEED1("missing command"); /* * For pipes, queues and nats we normally say 'nat|pipe NN config' * but the code is easier to parse as 'nat|pipe config NN' * so we swap the two arguments. */ if ((co.do_pipe || co.do_nat) && ac > 1 && isdigit(*av[0])) { char *p = av[0]; av[0] = av[1]; av[1] = p; } if (co.use_set == 0) { if (_substrcmp(*av, "add") == 0) ipfw_add(av); else if (co.do_nat && _substrcmp(*av, "show") == 0) ipfw_show_nat(ac, av); else if (co.do_pipe && _substrcmp(*av, "config") == 0) ipfw_config_pipe(ac, av); else if (co.do_nat && _substrcmp(*av, "config") == 0) ipfw_config_nat(ac, av); else if (_substrcmp(*av, "set") == 0) ipfw_sets_handler(av); else if (_substrcmp(*av, "table") == 0) ipfw_table_handler(ac, av); else if (_substrcmp(*av, "enable") == 0) ipfw_sysctl_handler(av, 1); else if (_substrcmp(*av, "disable") == 0) ipfw_sysctl_handler(av, 0); else try_next = 1; } if (co.use_set || try_next) { if (_substrcmp(*av, "delete") == 0) ipfw_delete(av); else if (_substrcmp(*av, "flush") == 0) ipfw_flush(co.do_force); else if (_substrcmp(*av, "zero") == 0) ipfw_zero(ac, av, 0 /* IP_FW_ZERO */); else if (_substrcmp(*av, "resetlog") == 0) ipfw_zero(ac, av, 1 /* IP_FW_RESETLOG */); else if (_substrcmp(*av, "print") == 0 || _substrcmp(*av, "list") == 0) ipfw_list(ac, av, do_acct); else if (_substrcmp(*av, "show") == 0) ipfw_list(ac, av, 1 /* show counters */); else errx(EX_USAGE, "bad command `%s'", *av); } /* Free memory allocated in the argument parsing. */ free(save_av); return 0; } static void ipfw_readfile(int ac, char *av[]) { #define MAX_ARGS 32 char buf[4096]; char *progname = av[0]; /* original program name */ const char *cmd = NULL; /* preprocessor name, if any */ const char *filename = av[ac-1]; /* file to read */ int c, lineno=0; FILE *f = NULL; pid_t preproc = 0; while ((c = getopt(ac, av, "cfNnp:qS")) != -1) { switch(c) { case 'c': co.do_compact = 1; break; case 'f': co.do_force = 1; break; case 'N': co.do_resolv = 1; break; case 'n': co.test_only = 1; break; case 'p': /* * ipfw -p cmd [args] filename * * We are done with getopt(). All arguments * except the filename go to the preprocessor, * so we need to do the following: * - check that a filename is actually present; * - advance av by optind-1 to skip arguments * already processed; * - decrease ac by optind, to remove the args * already processed and the final filename; * - set the last entry in av[] to NULL so * popen() can detect the end of the array; * - set optind=ac to let getopt() terminate. */ if (optind == ac) errx(EX_USAGE, "no filename argument"); cmd = optarg; av[ac-1] = NULL; av += optind - 1; ac -= optind; optind = ac; break; case 'q': co.do_quiet = 1; break; case 'S': co.show_sets = 1; break; default: errx(EX_USAGE, "bad arguments, for usage" " summary ``ipfw''"); } } if (cmd == NULL && ac != optind + 1) errx(EX_USAGE, "extraneous filename arguments %s", av[ac-1]); if ((f = fopen(filename, "r")) == NULL) err(EX_UNAVAILABLE, "fopen: %s", filename); if (cmd != NULL) { /* pipe through preprocessor */ int pipedes[2]; if (pipe(pipedes) == -1) err(EX_OSERR, "cannot create pipe"); preproc = fork(); if (preproc == -1) err(EX_OSERR, "cannot fork"); if (preproc == 0) { /* * Child, will run the preprocessor with the * file on stdin and the pipe on stdout. */ if (dup2(fileno(f), 0) == -1 || dup2(pipedes[1], 1) == -1) err(EX_OSERR, "dup2()"); fclose(f); close(pipedes[1]); close(pipedes[0]); execvp(cmd, av); err(EX_OSERR, "execvp(%s) failed", cmd); } else { /* parent, will reopen f as the pipe */ fclose(f); close(pipedes[1]); if ((f = fdopen(pipedes[0], "r")) == NULL) { int savederrno = errno; (void)kill(preproc, SIGTERM); errno = savederrno; err(EX_OSERR, "fdopen()"); } } } while (fgets(buf, sizeof(buf), f)) { /* read commands */ char linename[20]; char *args[2]; lineno++; snprintf(linename, sizeof(linename), "Line %d", lineno); setprogname(linename); /* XXX */ args[0] = progname; args[1] = buf; ipfw_main(2, args); } fclose(f); if (cmd != NULL) { int status; if (waitpid(preproc, &status, 0) == -1) errx(EX_OSERR, "waitpid()"); if (WIFEXITED(status) && WEXITSTATUS(status) != EX_OK) errx(EX_UNAVAILABLE, "preprocessor exited with status %d", WEXITSTATUS(status)); else if (WIFSIGNALED(status)) errx(EX_UNAVAILABLE, "preprocessor exited with signal %d", WTERMSIG(status)); } } int main(int ac, char *av[]) { #if defined(_WIN32) && defined(TCC) { WSADATA wsaData; int ret=0; unsigned short wVersionRequested = MAKEWORD(2, 2); ret = WSAStartup(wVersionRequested, &wsaData); if (ret != 0) { /* Tell the user that we could not find a usable */ /* Winsock DLL. */ printf("WSAStartup failed with error: %d\n", ret); return 1; } } #endif /* * If the last argument is an absolute pathname, interpret it * as a file to be preprocessed. */ if (ac > 1 && av[ac - 1][0] == '/') { if (access(av[ac - 1], R_OK) == 0) ipfw_readfile(ac, av); else err(EX_USAGE, "pathname: %s", av[ac - 1]); } else { if (ipfw_main(ac, av)) { errx(EX_USAGE, "usage: ipfw [options]\n" "do \"ipfw -h\" or \"man ipfw\" for details"); } } return EX_OK; } ================================================ FILE: ipfw/qsort.c ================================================ /*- * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if defined(LIBC_SCCS) && !defined(lint) static char sccsid[] = "@(#)qsort.c 8.1 (Berkeley) 6/4/93"; #endif /* LIBC_SCCS and not lint */ #include __FBSDID("$FreeBSD: src/lib/libc/stdlib/qsort.c,v 1.15 2008/01/14 09:21:34 das Exp $"); #include #ifdef I_AM_QSORT_R typedef int cmp_t(void *, const void *, const void *); #else typedef int cmp_t(const void *, const void *); #endif static inline char *med3(char *, char *, char *, cmp_t *, void *); static inline void swapfunc(char *, char *, int, int); #define min(a, b) (a) < (b) ? a : b /* * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function". */ #define swapcode(TYPE, parmi, parmj, n) { \ long i = (n) / sizeof (TYPE); \ TYPE *pi = (TYPE *) (parmi); \ TYPE *pj = (TYPE *) (parmj); \ do { \ TYPE t = *pi; \ *pi++ = *pj; \ *pj++ = t; \ } while (--i > 0); \ } #define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \ es % sizeof(long) ? 2 : es == sizeof(long)? 0 : 1; static inline void swapfunc(a, b, n, swaptype) char *a, *b; int n, swaptype; { if(swaptype <= 1) swapcode(long, a, b, n) else swapcode(char, a, b, n) } #define swap(a, b) \ if (swaptype == 0) { \ long t = *(long *)(a); \ *(long *)(a) = *(long *)(b); \ *(long *)(b) = t; \ } else \ swapfunc(a, b, es, swaptype) #define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n, swaptype) #ifdef I_AM_QSORT_R #define CMP(t, x, y) (cmp((t), (x), (y))) #else #define CMP(t, x, y) (cmp((x), (y))) #endif static inline char * med3(char *a, char *b, char *c, cmp_t *cmp, void *thunk #ifndef I_AM_QSORT_R __unused // XXX what ? #endif ) { return CMP(thunk, a, b) < 0 ? (CMP(thunk, b, c) < 0 ? b : (CMP(thunk, a, c) < 0 ? c : a )) :(CMP(thunk, b, c) > 0 ? b : (CMP(thunk, a, c) < 0 ? a : c )); } #ifdef I_AM_QSORT_R void qsort_r(void *a, size_t n, size_t es, void *thunk, cmp_t *cmp) #else #define thunk NULL void qsort(void *a, size_t n, size_t es, cmp_t *cmp) #endif { char *pa, *pb, *pc, *pd, *pl, *pm, *pn; size_t d, r; int cmp_result; int swaptype, swap_cnt; loop: SWAPINIT(a, es); swap_cnt = 0; if (n < 7) { for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es) for (pl = pm; pl > (char *)a && CMP(thunk, pl - es, pl) > 0; pl -= es) swap(pl, pl - es); return; } pm = (char *)a + (n / 2) * es; if (n > 7) { pl = a; pn = (char *)a + (n - 1) * es; if (n > 40) { d = (n / 8) * es; pl = med3(pl, pl + d, pl + 2 * d, cmp, thunk); pm = med3(pm - d, pm, pm + d, cmp, thunk); pn = med3(pn - 2 * d, pn - d, pn, cmp, thunk); } pm = med3(pl, pm, pn, cmp, thunk); } swap(a, pm); pa = pb = (char *)a + es; pc = pd = (char *)a + (n - 1) * es; for (;;) { while (pb <= pc && (cmp_result = CMP(thunk, pb, a)) <= 0) { if (cmp_result == 0) { swap_cnt = 1; swap(pa, pb); pa += es; } pb += es; } while (pb <= pc && (cmp_result = CMP(thunk, pc, a)) >= 0) { if (cmp_result == 0) { swap_cnt = 1; swap(pc, pd); pd -= es; } pc -= es; } if (pb > pc) break; swap(pb, pc); swap_cnt = 1; pb += es; pc -= es; } if (swap_cnt == 0) { /* Switch to insertion sort */ for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es) for (pl = pm; pl > (char *)a && CMP(thunk, pl - es, pl) > 0; pl -= es) swap(pl, pl - es); return; } pn = (char *)a + n * es; r = min(pa - (char *)a, pb - pa); vecswap(a, pb - r, r); r = min(pd - pc, pn - pd - es); vecswap(pb, pn - r, r); if ((r = pb - pa) > es) #ifdef I_AM_QSORT_R qsort_r(a, r / es, es, thunk, cmp); #else qsort(a, r / es, es, cmp); #endif if ((r = pd - pc) > es) { /* Iterate rather than recurse to save stack space */ a = pn - r; n = r / es; goto loop; } /* qsort(pn - r, r / es, es, cmp);*/ } ================================================ FILE: ipfw/qsort_r.c ================================================ /* * This file is in the public domain. Originally written by Garrett * A. Wollman. * * $FreeBSD: src/lib/libc/stdlib/qsort_r.c,v 1.1 2002/09/10 02:04:49 wollman Exp $ */ #define I_AM_QSORT_R #include "qsort.c" ================================================ FILE: ipfw/rule_test.sh ================================================ #/bin/bash COMMAND=ipfw echo .########## Set $COMMAND mode .########## $COMMAND add allow ip from any to any $COMMAND -q flush echo .########## empty rules .########## $COMMAND list $COMMAND add allow ip from any to any $COMMAND add allow ip from any to { 1.2.3.4 or 2.3.4.5 } $COMMAND add allow { dst-ip 1.2.3.4 or dst-ip 2.3.4.5 } echo .########## listing 3 rules .########## $COMMAND list $COMMAND delete 200 echo .########## listing 2 rules .########## $COMMAND list $COMMAND table 10 add 1.2.3.4 $COMMAND table 10 add 1.2.3.5 $COMMAND table 10 add 1.2.3.6 $COMMAND table 10 add 1.2.3.7/13 $COMMAND table 10 add 1.2.3.7/20 $COMMAND table 10 add 1.2.3.7/28 echo .########## listing table 10 with 6 elements .########## $COMMAND table 10 list $COMMAND table 10 delete 1.2.3.6 echo .########## listing table 10 with 5 elements .########## $COMMAND table 10 list $COMMAND table 10 flush echo .########## table 10 empty .########## $COMMAND table 10 list echo .########## move rule 100 to set 1 300 to 3 .########## $COMMAND set move rule 100 to 1 $COMMAND set move rule 300 to 3 $COMMAND -S show echo .########## move rule 200 to 2 but 200 do not exist .###### $COMMAND set move rule 200 to 2 echo .########## add some rules .########## $COMMAND add 200 queue 2 proto ip $COMMAND add 300 queue 5 proto ip $COMMAND add 400 queue 40 proto ip $COMMAND add 400 queue 50 proto ip echo .########## move rule 200 to 2 .###### $COMMAND set move rule 200 to 2 echo .########## move rule 400 to 5 .###### $COMMAND set move rule 400 to 5 echo .########## set 5 show 2 rules .###### $COMMAND set 5 show echo .########## flush set 5 .###### $COMMAND -q set 5 flush echo .########## set 5 show 0 rule .###### $COMMAND set 5 show echo .########## disable set 1 .###### $COMMAND set disable 1 echo .########## show all rules except set 1 .###### $COMMAND -S show echo .########## enable set 1 .###### $COMMAND set enable 1 echo .########## show all rules .###### $COMMAND -S show ================================================ FILE: ipfw/ws2_32.def ================================================ LIBRARY ws2_32.dll EXPORTS FreeAddrInfoW GetAddrInfoW GetNameInfoW WEP WPUCompleteOverlappedRequest WSAAccept WSAAddressToStringA WSAAddressToStringW WSAAsyncGetHostByAddr WSAAsyncGetHostByName WSAAsyncGetProtoByName WSAAsyncGetProtoByNumber WSAAsyncGetServByName WSAAsyncGetServByPort WSAAsyncSelect WSACancelAsyncRequest WSACancelBlockingCall WSACleanup WSACloseEvent WSAConnect WSACreateEvent WSADuplicateSocketA WSADuplicateSocketW WSAEnumNameSpaceProvidersA WSAEnumNameSpaceProvidersW WSAEnumNetworkEvents WSAEnumProtocolsA WSAEnumProtocolsW WSAEventSelect WSAGetLastError WSAGetOverlappedResult WSAGetQOSByName WSAGetServiceClassInfoA WSAGetServiceClassInfoW WSAGetServiceClassNameByClassIdA WSAGetServiceClassNameByClassIdW WSAHtonl WSAHtons WSAInstallServiceClassA WSAInstallServiceClassW WSAIoctl WSAIsBlocking WSAJoinLeaf WSALookupServiceBeginA WSALookupServiceBeginW WSALookupServiceEnd WSALookupServiceNextA WSALookupServiceNextW WSANSPIoctl WSANtohl WSANtohs WSAProviderConfigChange WSARecv WSARecvDisconnect WSARecvFrom WSARemoveServiceClass WSAResetEvent WSASend WSASendDisconnect WSASendTo WSASetBlockingHook WSASetEvent WSASetLastError WSASetServiceA WSASetServiceW WSASocketA WSASocketW WSAStartup WSAStringToAddressA WSAStringToAddressW WSAUnhookBlockingHook WSAWaitForMultipleEvents WSApSetPostRoutine WSCDeinstallProvider WSCEnableNSProvider WSCEnumProtocols WSCGetProviderPath WSCInstallNameSpace WSCInstallProvider WSCUnInstallNameSpace WSCUpdateProvider WSCWriteNameSpaceOrder WSCWriteProviderOrder __WSAFDIsSet accept bind closesocket connect freeaddrinfo getaddrinfo gethostbyaddr gethostbyname gethostname getnameinfo getpeername getprotobyname getprotobynumber getservbyname getservbyport getsockname getsockopt htonl htons inet_addr inet_ntoa ioctlsocket listen ntohl ntohs recv recvfrom select send sendto setsockopt shutdown socket ================================================ FILE: kipfw/Makefile ================================================ # $Id: Makefile 12257 2013-04-26 21:13:24Z luigi $ # gnu Makefile to build linux/Windows module for ipfw+dummynet. # # The defaults are set to build without modifications on PlanetLab # and possibly 2.6 versions. # On Windows, we use gnu-make and MSC # Some variables need to have specific names, because they are used # by the build infrastructure on Linux and OpenWrt. They are: # # ccflags-y additional $(CC) flags # M used by Kbuild, we must set it to `pwd` # obj-m list of .o modules to build # $(MOD)-y for each $MOD in obj-m, the list of objects # obj-y same as above, for openwrt # O_TARGET the link target, for openwrt # EXTRA_CFLAGS as the name says... in openwrt # EXTRA_CFLAGS is used in 2.6.22 module kernel compilation too # KERNELPATH the path to the kernel sources or headers # (on planetlab it is set already by the build system, # for other systems we take KSRC which is either guessed # or taken from the command line. # # Not sure about this (the name might be reserved) # ipfw-cflags our flags for building the module # # Other variables are only private and can be renamed. They include: # # VER linux version we are building for (2.4 2.6 or openwrt) # #--- # # The windows files (passthru etc.) are modified version of the # examples found in the $(DDK)/src/network/ndis/passthru/driver/ # They can be re-created using the 'ndis-glue' target in the IPFW3_ROOT ?= $(PWD)/.. include $(IPFW3_ROOT)/Makefile.inc TARGET = kipfw # lets default for 2.6 for planetlab builds VER ?= 2.6 # $(warning ########## linux dir is $(LINUX_DIR) ###########) # $(warning ########## KERNELPATH is $(KERNELPATH) ###########) #--- General values for all types of build --- # obj-m is the target module obj-m := ipfw_mod.o #-- the list of source files. IPFW_SRCS is our own name. # Original ipfw and dummynet sources + FreeBSD stuff, IPFW_SRCS := ip_fw2.c ip_fw_pfil.c ip_fw_sockopt.c IPFW_SRCS += ip_fw_dynamic.c ip_fw_table.c ip_fw_log.c IPFW_SRCS += radix.c in_cksum.c IPFW_SRCS += ip_dummynet.c ip_dn_io.c ip_dn_glue.c IPFW_SRCS += dn_heap.c IPFW_SRCS += dn_sched_fifo.c dn_sched_wf2q.c IPFW_SRCS += dn_sched_rr.c dn_sched_qfq.c IPFW_SRCS += dn_sched_prio.c # Module glue and functions missing in linux IPFW_SRCS += ipfw2_mod.c bsd_compat.c # generic cflags used on all systems #ipfw-cflags += -DIPFW_HASHTABLES ipfw-cflags += -DIPFIREWALL_DEFAULT_TO_ACCEPT # _BSD_SOURCE enables __FAVOR_BSD (udp/tcp bsd structs instead of posix) ipfw-cflags += -D_BSD_SOURCE ipfw-cflags += -DKERNEL_MODULE # build linux kernel module # the two header trees for empty and override files ipfw-cflags += -I $(M)/include_e ipfw-cflags += -I $(M)/../sys ipfw-cflags += -include $(M)/../glue.h # headers ipfw-cflags += -include $(M)/missing.h # headers $(warning ------ arch $(OSARCH) goals $(MAKECMDGOALS) -----------) ifeq ($(OSARCH),Windows) #--- { Windows block ifeq ($(VER),win64) $(warning ---- building for 64-bit windows ---) win_arch= -DAMD64=1 else win_arch= -Di386=1 endif M ?= $(shell pwd) WIN_SRCS += md_win.c WIN_SRCS += miniport.c protocol.c passthru.c debug.c #compiler, linker, target, sources and objects #DDK is exported from the root makefile #DDK = C:/WinDDK/7600.16385.1 CSOURCES = $(IPFW_SRCS) $(WIN_SRCS) COBJS := $(CSOURCES:.c=.obj) COBJS := $(addprefix $(OBJDIR)/,$(COBJS)) #include paths INCLUDE_PATHS = -Ii386 -I../sys -Iinclude_e -I. # INCLUDE_PATHS += -I$(OBJDIR) INCLUDE_PATHS += -I$(DDK)/inc/api INCLUDE_PATHS += -I$(DDK)/inc/ddk INCLUDE_PATHS += -I$(DDK)/inc/crt # #preprocessor MS defines PREPROC = -D_X86_=1 -Di386=1 -DSTD_CALL -DCONDITION_HANDLING=1 PREPROC += -DNT_UP=0 -DNT_INST=0 -DWIN32=100 -D_NT1X_=100 -DWINNT=1 PREPROC += -D_WIN32_WINNT=0x0501 -DWINVER=0x0501 -D_WIN32_IE=0x0603 PREPROC += -DWIN32_LEAN_AND_MEAN=1 PREPROC += -D__BUILDMACHINE__=WinDDK -DFPO=0 -D_DLL=1 PREPROC += -DNDIS_MINIPORT_DRIVER -DNDIS_WDM=1 PREPROC += -DNDIS51_MINIPORT=1 -DNDIS51=1 PREPROC += -DMSC_NOOPT -DNTDDI_VERSION=0x05010200 PREPROC += -DKMDF_MAJOR_VERSION_STRING=01 -DKMDF_MINOR_VERSION_STRING=009 #PREPROC += -DDBG=1 #debug PREPROC += -DNDEBUG #always up, seems no effect, possibly no debug? PREPROC += -DDEVL=1 #always up, seems no effect #macroing module name, WARNING: must match the one in .inf files PREPROC += -DMODULENAME=Ipfw #our defines OUR_PREPROC = -D_KERNEL -DKERNEL_MODULE -DKLD_MODULE OUR_PREPROC += -D__BSD_VISIBLE -DIPFIREWALL_DEFAULT_TO_ACCEPT OUR_PREPROC += -D__LITTLE_ENDIAN -DSYSCTL_NODE -DEMULATE_SYSCTL ifeq ($(TCC),) # Microsoft C compiler CC = $(DDK)/bin/x86/x86/cl.exe LD = $(DDK)/bin/x86/x86/link.exe # #complier options CFLAGS = -Fo$(OBJDIR)/ -c -FC -Zc:wchar_t- CFLAGS += -Zl -Zp8 -Gy -Gm- -GF -cbstring -Gz -hotpatch -EHs-c- CFLAGS += -W2 # -W3 gives too many conversion errors CFLAGS += -GR- -GF -GS -Zi # XXX do we need this ? CFLAGS += -Fd$(OBJDIR)/ CFLAGS += -wd4603 -wd4627 -typedil- CFLAGS += -FI $(DDK)/inc/api/warning.h CFLAGS += -FI winmissing.h CFLAGS += -FI missing.h # headers CFLAGS += -FI ../glue.h # headers #optimization options OPTIMIZE = -Od -Oi -Oy- #linker options LDFLAGS = /MERGE:_PAGE=PAGE /MERGE:_TEXT=.text LDFLAGS += /SECTION:INIT,d /OPT:REF /OPT:ICF LDFLAGS += /IGNORE:4198,4010,4037,4039,4065,4070,4078,4087,4089,4221 LDFLAGS += /INCREMENTAL:NO /release /NODEFAULTLIB /WX LDFLAGS += /debug /debugtype:cv,fixup,pdata LDFLAGS += /version:6.1 /osversion:6.1 /functionpadmin:5 LDFLAGS += /safeseh /pdbcompress LDFLAGS += /STACK:0x40000,0x1000 /driver /base:0x10000 /align:0x80 LDFLAGS += /stub:$(DDK)\\lib\\wxp\\stub512.com LDFLAGS += /subsystem:native,5.01 /entry:GsDriverEntry@8 LDFLAGS += /out:$(OBJDIR)/ipfw.sys #libraries to build against LIBS = $(DDK)/lib/wxp/i386/BufferOverflowK.lib LIBS += $(DDK)/lib/wxp/i386/ntoskrnl.lib LIBS += $(DDK)/lib/wxp/i386/hal.lib LIBS += $(DDK)/lib/wxp/i386/wmilib.lib LIBS += $(DDK)/lib/wxp/i386/ndis.lib LIBS += $(DDK)/lib/wxp/i386/sehupd.lib else # use tcc. not working yet for the kernel module. # TCC points to the root of tcc tree CC=$(TCC)/bin/wintcc EXTRA_CFLAGS += -DTCC -I.. EXTRA_CFLAGS += -I$(TCC)/include/winapi -I$(TCC)/include EXTRA_CFLAGS += -nostdinc CFLAGS += -include winmissing.h -include missing.h -include ../glue.h CFLAGS += -I../../inc/api -I../../inc/ddk -I../../inc/crt CFLAGS += -DRC_INVOKED endif # use tcc #empty include directory to be built M ?= $(shell pwd) EFILES_asm += div64.h EFILES_linux += if.h random.h errno.h EFILES_net += if_types.h inet_hashtables.h route.h #targets all: $(TARGET) $(TARGET): include_e # XXX dangerous rm -rf $(OBJDIR) mkdir -p $(OBJDIR) $(MSG) " CC [$(CC)] $(CSOURCES)" $(HIDE) $(CC) $(INCLUDE_PATHS) $(PREPROC) $(OUR_PREPROC) $(CFLAGS) $(OPTIMIZE) $(CSOURCES) $(MSG) " LD [$(LD)] $(COBJS)" $(HIDE) $(LD) $(LDFLAGS) $(COBJS) $(LIBS) else # } { linux variables and targets # extract version number (hex, aXXYY). Newer linuxes have a different dir # if not set, use the version from the installed system KERNELPATH ?= $(KSRC) LIN_VER := $(shell V=linux/version.h; G=. ; \ [ -f $(KERNELPATH)/include/$${V} ] || G=generated/uapi ;\ grep LINUX_VERSION_CODE $(KERNELPATH)/include/$${G}/linux/version.h | \ awk '{printf "%03x%02x", $$3/256, $$3%256} ') $(warning ------------- linux version $(LIN_VER) (hex) ------------) # We have three sections: OpenWrt, Linux 2.4 and Linux 2.6 ifeq ($(LIN_VER),openwrt) #--- { The Makefile section for openwrt --- # this was used on openwrt, but not anymore $(error ------ build on openwrt ---------- ) # We do not include a dependency on include_e as it is called # by Makefile.openwrt in Build/Prepare M=. obj-y := $(IPFW_SRCS:%.c=%.o) O_TARGET := $(obj-m) # xcflags-y is a temporary variable where we store build options xcflags-y += -O1 -DLINUX_24 xcflags-y += -g EXTRA_CFLAGS := $(xcflags-y) $(ipfw-cflags) -DSYSCTL_NODE -DEMULATE_SYSCTL # we should not export anything #export-objs := ipfw2_mod.o -include $(TOPDIR)/Rules.make endif # ---- } end openwrt version ifneq ($(shell echo $(LIN_VER)|grep '2.4'),) #--- { # Makefile section for the linux 2.4 version # tested on linux-2.4.35.4, does not work with 2.4.37 # # guess the kernel path -- or is it under /lib/modules ? KERNELPATH ?= $(KSRC) # We need to figure out the gcc include directory, if not # set by the user through MYGCC_INCLUDE # Find compiler version (3rd field in last line returned by gcc -v) # e.g. gcc version 4.3.2 (Debian 4.3.2-1.1) MYGCC_VER ?= $(shell $(CC) -v 2>&1 |tail -n 1 | cut -d " " -f 3) # We don't know the exact directory under /usr/lib/gcc so we guess MYGCC_INCLUDE ?= $(shell echo /usr/lib/gcc/*/$(MYGCC_VER) | cut -d " " -f 1)/include $(warning "---- gcc includes guessed to $(MYGCC_INCLUDE)") # additional warning WARN += -Wall -Wundef WARN += -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing WARN += -fno-common -Werror-implicit-function-declaration # WARN += -O2 -fno-stack-protector -m32 -msoft-float -mregparm=3 # -mregparm=3 gives a printk error WARN += -m32 -msoft-float # -mregparm=3 #WARN += -freg-struct-return -mpreferred-stack-boundary=2 WARN += -Wno-sign-compare WARN += -Wdeclaration-after-statement ifneq ($(MYGCC_VER),3.4.6) WARN += -Wno-pointer-sign endif ccflags-y += -O1 -DLINUX_24 CFLAGS = -DMODULE -D__KERNEL__ -nostdinc \ -isystem ${KERNELPATH}/include -isystem $(MYGCC_INCLUDE) \ ${ccflags-y} # The Main target all: mod24 else # --- } { linux 2.6 and newer $(warning --- build 2.6 and newer target $(TARGET) ----) # This is the Makefile section for Linux 2.6.x including planetlab ifeq ($(IPFW_PLANETLAB),1) $(warning "---- Building for PlanetLab") ipfw-cflags += -DIPFW_PLANETLAB # PlanetLab compilation endif WARN := -O1 -Wall -Werror -DDEBUG_SPINLOCK -DDEBUG_MUTEXES # The main target # Required by GCC 4.6 ccflags-y += -Wno-unused-but-set-variable ifeq ($(shell if [ -z $(LIN_VER) ] ; then echo "true"; fi),true) $(warning "---- Perhaps you miss a (cd $(KERNELPATH); make oldconfig; make prepare; make scripts)"); endif # Required by kernel < 2.6.23, ccflags-y is used on newer version ifeq ($(shell [ "$(LIN_VER)" \< "20617" ] && echo "true"),true) EXTRA_CFLAGS += $(ccflags-y) endif $(warning $(shell [ "$(LIN_VER)" \< "2061c" ] && \ [ `$(MAKE) -version | head -1 | cut -d " " -f 3` != '3.81' ] && \ echo "**** need make 3.81 *****") ) # $(warning make is $(MAKE) version is $(shell $(MAKE) -version | head -1) ) #--- openwrt ? ifeq ($(_VER),xx-openwrt) $(warning ----------------------- compiling for openwrt -----) M=. obj-y := $(IPFW_SRCS:%.c=%.o) O_TARGET := $(obj-m) # xcflags-y is a temporary variable where we store build options xcflags-y += -O1 xcflags-y += -g EXTRA_CFLAGS := $(xcflags-y) $(ipfw-cflags) -DSYSCTL_NODE -DEMULATE_SYSCTL endif #---- end openwrt all: $(TARGET) $(TARGET): include_e echo "xxxxxxxxxxxxx $(MAKE) -C $(KERNELPATH) V=$(V) M=`pwd` modules" $(MAKE) -C $(KERNELPATH) V=$(V) M=`pwd` modules endif # } --- linux 2.6 and newer #-- back to the common section for linux # the list of objects used to build the module ipfw_mod-y = $(IPFW_SRCS:%.c=%.o) # additional $(CC) flags ccflags-y += $(WARN) ccflags-y += $(ipfw-cflags) # if we really want debug symbols... ccflags-y += -g mod24: include_e $(obj-m) $(obj-m): $(ipfw_mod-y) $(LD) $(LDFLAGS) -m elf_i386 -r -o $@ $^ # M is the current directory, used in recursive builds # so we allow it to be overridden M ?= $(shell pwd) endif # } ----- end of the non-Windows block ifneq ($(OBJDIR),mia) $(error objdir set to $(OBJDIR)) endif #--- various common targets clean: -@rm -f *.o *.ko Module.symvers *.mod.c -@# rm -rf $(OBJDIR) -@rm -rf include_e distclean: clean -@rm -f .*cmd modules.order opt_* -@rm -rf .tmp_versions .*.o.d _CL_* # support to create empty dirs and files in include_e/ # EFILES_foo/bar is the list of files to be created in foo/bar # (/ and . are allowed in gmake variable names) EFILES_. += opt_inet.h opt_inet6.h opt_ipfw.h opt_ipsec.h opt_mpath.h EFILES_. += opt_mbuf_stress_test.h opt_param.h opt_ipdivert.h EFILES_altq += if_altq.h EFILES_arpa += inet.h EFILES_machine += in_cksum.h EFILES_net += ethernet.h netisr.h pf_mtag.h bpf.h if_types.h vnet.h EFILES_netinet += ether.h icmp6.h if_ether.h in.h in_pcb.h in_var.h EFILES_netinet += in_systm.h ip_carp.h ip_var.h pim.h EFILES_netinet += sctp.h tcp_timer.h tcpip.h udp_var.h EFILES_netinet6 += ip6_var.h EFILES_sys += _lock.h _rwlock.h rmlock.h _mutex.h jail.h EFILES_sys += condvar.h eventhandler.h domain.h EFILES_sys += limits.h lock.h mutex.h priv.h EFILES_sys += proc.h rwlock.h socket.h socketvar.h EFILES_sys += sysctl.h time.h ucred.h # first make a list of directories from variable names EDIRS= $(subst EFILES_,,$(filter EFILES_%,$(.VARIABLES))) # then prepend the directory name to individual files. # $(empty) serves to interpret the following space literally, # and the ": = " substitution packs spaces into one. EFILES = $(foreach i,$(EDIRS),$(subst $(empty) , $(i)/, $(EFILES_$(i): = ))) include_e: -@rm -rf $(M)/include_e opt_* -@mkdir -p $(M)/include_e -@(cd $(M)/include_e; mkdir -p $(EDIRS); touch $(EFILES) ) #--- some other targets for testing purposes test_radix: test_radix.o radix.o test_lookup: ip_fw_lookup.o test_radix test_lookup: CFLAGS=-Wall -Werror -O1 ================================================ FILE: kipfw/bsd_compat.c ================================================ /* * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id: bsd_compat.c 11530 2012-08-01 10:29:32Z luigi $ * * kernel variables and functions that are not available in linux. */ #include #include /* do_div on 2.4 */ #include /* get_random_bytes on 2.4 */ #include #include #include /* * gettimeofday would be in sys/time.h but it is not * visible if _KERNEL is defined */ int gettimeofday(struct timeval *, struct timezone *); int ticks; /* kernel ticks counter */ int hz = 1000; /* default clock time */ long tick = 1000; /* XXX is this 100000/hz ? */ int bootverbose = 0; struct timeval boottime; int ip_defttl = 64; /* XXX set default value */ int max_linkhdr = 16; int fw_one_pass = 1; u_long in_ifaddrhmask; /* mask for hash table */ struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ u_int rt_numfibs = RT_NUMFIBS; /* * pfil hook support. * We make pfil_head_get return a non-null pointer, which is then ignored * in our 'add-hook' routines. */ struct pfil_head; typedef int (pfil_hook_t) (void *, struct mbuf **, struct ifnet *, int, struct inpcb *); struct pfil_head * pfil_head_get(int proto, u_long flags) { static int dummy; return (struct pfil_head *)&dummy; } int pfil_add_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h) { return 0; } int pfil_remove_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h) { return 0; } /* define empty body for kernel function */ int priv_check(struct thread *td, int priv) { return 0; } int securelevel_ge(struct ucred *cr, int level) { return 0; } int sysctl_handle_int(SYSCTL_HANDLER_ARGS) { return 0; } int sysctl_handle_long(SYSCTL_HANDLER_ARGS) { return 0; } void ether_demux(struct ifnet *ifp, struct mbuf *m) { return; } int ether_output_frame(struct ifnet *ifp, struct mbuf *m) { return 0; } void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum) { return; } void icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu) { return; } u_short in_cksum_skip(struct mbuf *m, int len, int skip) { return 0; } u_short in_cksum_hdr(struct ip *ip) { return 0; } /* * we don't really reassemble, just return whatever we had. */ struct mbuf * ip_reass(struct mbuf *clone) { return clone; } #ifdef INP_LOCK_ASSERT #undef INP_LOCK_ASSERT #define INP_LOCK_ASSERT(a) #endif /* credentials check */ #include #ifdef __linux__ int cred_check(void *_insn, int proto, struct ifnet *oif, struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp, struct sk_buff *skb) { int match = 0; ipfw_insn_u32 *insn = (ipfw_insn_u32 *)_insn; if (*ugid_lookupp == 0) { /* actively lookup and copy in cache */ /* returns null if any element of the chain up to file is null. * if sk != NULL then we also have a reference */ *ugid_lookupp = linux_lookup(proto, src_ip.s_addr, htons(src_port), dst_ip.s_addr, htons(dst_port), skb, oif ? 1 : 0, u); } if (*ugid_lookupp < 0) return 0; if (insn->o.opcode == O_UID) match = (u->uid == (uid_t)insn->d[0]); else if (insn->o.opcode == O_JAIL) match = (u->xid == (uid_t)insn->d[0]); else if (insn->o.opcode == O_GID) match = (u->gid == (uid_t)insn->d[0]); return match; } #endif /* __linux__ */ int jailed(struct ucred *cred) { return 0; } /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). If subnetsarelocal * is true, this includes other subnets of the local net. * Otherwise, it includes only the directly-connected (sub)nets. */ int in_localaddr(struct in_addr in) { return 1; } int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) { size_t valsize = sopt->sopt_valsize; if (len < valsize) sopt->sopt_valsize = valsize = len; //printf("copyout buf = %p, sopt = %p, soptval = %p, len = %d \n", buf, sopt, sopt->sopt_val, len); bcopy(buf, sopt->sopt_val, valsize); return 0; } /* * copy data from userland to kernel */ int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) { size_t valsize = sopt->sopt_valsize; if (valsize < minlen) return EINVAL; if (valsize > len) sopt->sopt_valsize = valsize = len; //printf("copyin buf = %p, sopt = %p, soptval = %p, len = %d \n", buf, sopt, sopt->sopt_val, len); bcopy(sopt->sopt_val, buf, valsize); return 0; } void getmicrouptime(struct timeval *tv) { do_gettimeofday(tv); } #include char * inet_ntoa_r(struct in_addr ina, char *buf) { #ifdef _WIN32 #else unsigned char *ucp = (unsigned char *)&ina; sprintf(buf, "%d.%d.%d.%d", ucp[0] & 0xff, ucp[1] & 0xff, ucp[2] & 0xff, ucp[3] & 0xff); #endif return buf; } char * inet_ntoa(struct in_addr ina) { static char buf[16]; return inet_ntoa_r(ina, buf); } int random(void) { #ifdef _WIN32 static unsigned long seed; if (seed == 0) { LARGE_INTEGER tm; KeQuerySystemTime(&tm); seed = tm.LowPart; } return RtlRandomEx(&seed) & 0x7fffffff; #else int r; get_random_bytes(&r, sizeof(r)); return r & 0x7fffffff; #endif } /* * do_div really does a u64 / u32 bit division. * we save the sign and convert to uint befor calling. * We are safe just because we always call it with small operands. */ int64_t div64(int64_t a, int64_t b) { #ifdef _WIN32 int a1 = a, b1 = b; return a1/b1; #else uint64_t ua, ub; int sign = ((a>0)?1:-1) * ((b>0)?1:-1); ua = ((a>0)?a:-a); ub = ((b>0)?b:-b); do_div(ua, ub); return sign*ua; #endif } #ifdef __MIPSEL__ size_t strlcpy(char *dst, const char *src, size_t siz) { char *d = dst; const char *s = src; size_t n = siz; /* Copy as many bytes as will fit */ if (n != 0 && --n != 0) { do { if ((*d++ = *s++) == 0) break; } while (--n != 0); } /* Not enough room in dst, add NUL and traverse rest of src */ if (n == 0) { if (siz != 0) *d = '\0'; /* NUL-terminate dst */ while (*s++) ; } return(s - src - 1); /* count does not include NUL */ } #endif // __MIPSEL__ /* * compact version of fnmatch. */ int fnmatch(const char *pattern, const char *string, int flags) { char s; if (!string || !pattern) return 1; /* no match */ while ( (s = *string++) ) { char p = *pattern++; if (p == '\0') /* pattern is over, no match */ return 1; if (p == '*') /* wildcard, match */ return 0; if (p == '.' || p == s) /* char match, continue */ continue; return 1; /* no match */ } /* end of string, make sure the pattern is over too */ if (*pattern == '\0' || *pattern == '*') return 0; return 1; /* no match */ } /* * linux 2.6.33 defines these functions to access to * skbuff internal structures. Define the missing * function for the previous versions too. */ #ifdef linux #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31) inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) { skb->dst = dst; } inline struct dst_entry *skb_dst(const struct sk_buff *skb) { return (struct dst_entry *)skb->dst; } #endif /* < 2.6.31 */ #endif /* linux */ /* support for sysctl emulation. * XXX this is actually MI code that should be enabled also on openwrt */ #ifdef EMULATE_SYSCTL static struct sysctltable GST; int kesysctl_emu_get(struct sockopt* sopt) { struct dn_id* oid = sopt->sopt_val; struct sysctlhead* entry; int sizeneeded = sizeof(struct dn_id) + GST.totalsize + sizeof(struct sysctlhead); unsigned char* pstring; unsigned char* pdata; int i; if (sopt->sopt_valsize < sizeneeded) { // this is a probe to retrieve the space needed for // a dump of the sysctl table oid->id = sizeneeded; sopt->sopt_valsize = sizeof(struct dn_id); return 0; } entry = (struct sysctlhead*)(oid+1); for( i=0; iblocklen = GST.entry[i].head.blocklen; entry->namelen = GST.entry[i].head.namelen; entry->flags = GST.entry[i].head.flags; entry->datalen = GST.entry[i].head.datalen; pdata = (unsigned char*)(entry+1); pstring = pdata+GST.entry[i].head.datalen; bcopy(GST.entry[i].data, pdata, GST.entry[i].head.datalen); bcopy(GST.entry[i].name, pstring, GST.entry[i].head.namelen); entry = (struct sysctlhead*) ((unsigned char*)(entry) + GST.entry[i].head.blocklen); } sopt->sopt_valsize = sizeneeded; return 0; } int kesysctl_emu_set(void* p, int l) { struct sysctlhead* entry; unsigned char* pdata; unsigned char* pstring; int i = 0; entry = (struct sysctlhead*)(((struct dn_id*)p)+1); pdata = (unsigned char*)(entry+1); pstring = pdata + entry->datalen; for (i=0; idatalen != GST.entry[i].head.datalen) { printf("%s: len mismatch, user %d vs kernel %d\n", __FUNCTION__, entry->datalen, GST.entry[i].head.datalen); return -1; } // check access (at the moment flags handles only the R/W rights //later on will be type + access if( (GST.entry[i].head.flags & 3) == CTLFLAG_RD) { printf("%s: the entry %s is read only\n", __FUNCTION__,GST.entry[i].name); return -1; } bcopy(pdata, GST.entry[i].data, GST.entry[i].head.datalen); return 0; } printf("%s: match not found\n",__FUNCTION__); return 0; } /* convert all _ to . until the first . */ static void underscoretopoint(char* s) { for (; *s && *s != '.'; s++) if (*s == '_') *s = '.'; } static int formatnames() { int i; int size=0; char* name; for (i=0; i> 2, GST.entry[i].head.flags & 0x00000003); printf("data %i\n", *(int*)(GST.entry[i].data)); printf("datalen %i\n", GST.entry[i].head.datalen); printf("blocklen %i\n", GST.entry[i].head.blocklen); } } void sysctl_addgroup_f1(); void sysctl_addgroup_f2(); void sysctl_addgroup_f3(); void sysctl_addgroup_f4(); void keinit_GST() { int ret; sysctl_addgroup_f1(); sysctl_addgroup_f2(); sysctl_addgroup_f3(); sysctl_addgroup_f4(); ret = formatnames(); if (ret != 0) printf("conversion of names failed for some reason\n"); //dumpGST(); printf("*** Global Sysctl Table entries = %i, total size = %i ***\n", GST.count, GST.totalsize); } void keexit_GST() { if (GST.namebuffer != NULL) free(GST.namebuffer,0); bzero(&GST, sizeof(GST)); } void sysctl_pushback(char* name, int flags, int datalen, void* data) { if (GST.count >= GST_HARD_LIMIT) { printf("WARNING: global sysctl table full, this entry will not be added," "please recompile the module increasing the table size\n"); return; } GST.entry[GST.count].head.namelen = strlen(name)+1; //add space for '\0' GST.entry[GST.count].name = name; GST.entry[GST.count].head.flags = flags; GST.entry[GST.count].data = data; GST.entry[GST.count].head.datalen = datalen; GST.entry[GST.count].head.blocklen = ((sizeof(struct sysctlhead) + GST.entry[GST.count].head.namelen + GST.entry[GST.count].head.datalen)+3) & ~3; GST.totalsize += GST.entry[GST.count].head.blocklen; GST.count++; } #endif /* EMULATE_SYSCTL */ ================================================ FILE: kipfw/debug.c ================================================ #include const char* texify_cmd(int i) { if (i==110) return("IP_FW_ADD"); if (i==111) return("IP_FW_DEL"); if (i==112) return("IP_FW_FLUSH"); if (i==113) return("IP_FW_ZERO"); if (i==114) return("IP_FW_GET"); if (i==115) return("IP_FW_RESETLOG"); if (i==116) return("IP_FW_NAT_CFG"); if (i==117) return("IP_FW_NAT_DEL"); if (i==118) return("IP_FW_NAT_GET_CONFIG"); if (i==119) return("IP_FW_NAT_GET_LOG"); if (i==120) return("IP_DUMMYNET_CONFIGURE"); if (i==121) return("IP_DUMMYNET_DEL"); if (i==122) return("IP_DUMMYNET_FLUSH"); if (i==124) return("IP_DUMMYNET_GET"); if (i==108) return("IP_FW3"); if (i==109) return("IP_DUMMYNET3"); return ("BOH"); } const char* texify_proto(unsigned int p) { if (p==1) return("ICMP"); if (p==6) return("TCP"); if (p==17) return("UDP"); return("OTHER"); } void hexdump(unsigned char* addr, int len, const char *msg) { int i; const int cicli = len/8; const int resto = len%8; unsigned char d[8]; DbgPrint("%s at %p len %d\n", msg, addr, len); for (i=0; i<=cicli; i++) { bzero(d, 8); bcopy(addr+i*8, d, i < cicli ? 8 : resto); DbgPrint("%04X %02X %02X %02X %02X %02X %02X %02X %02X\n", i*8, d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]); } DbgPrint("\n"); } ================================================ FILE: kipfw/ipfw2_mod.c ================================================ /* * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id: ipfw2_mod.c 12501 2014-01-10 01:09:14Z luigi $ * * The main interface to build ipfw+dummynet as a linux module. * (and possibly as a windows module as well, though that part * is not complete yet). * * The control interface uses the sockopt mechanism * on a socket(AF_INET, SOCK_RAW, IPPROTO_RAW). * * The data interface uses the netfilter interface, at the moment * hooked to the PRE_ROUTING and POST_ROUTING hooks. * Unfortunately the netfilter interface is a moving target, * so we need a set of macros to adapt to the various cases. * * In the netfilter hook we just mark packet as 'QUEUE' and then * let the queue handler to do the whole work (filtering and * possibly emulation). * As we receive packets, we wrap them with an mbuf descriptor * so the existing ipfw+dummynet code runs unmodified. */ #include #include /* sizeof struct mbuf */ #include /* NGROUPS */ #ifndef D #define ND(fmt, ...) do {} while (0) #define D1(fmt, ...) do {} while (0) #define D(fmt, ...) printf("%-10s " fmt "\n", \ __FUNCTION__, ## __VA_ARGS__) #endif #ifdef __linux__ #include #include #ifndef CONFIG_NETFILTER #error should configure netfilter (broken on 2.6.26 and below ?) #endif #include #include /* NF_IP_PRI_FILTER */ #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25) #include /* nf_queue */ #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) #define __read_mostly #endif #endif /* !__linux__ */ #include /* in_addr */ #include /* ip_fw_ctl_t, ip_fw_chk_t */ #include /* ip_fw_ctl_t, ip_fw_chk_t */ #include /* ip_dn_ctl_t, ip_dn_io_t */ #include /* PFIL_IN, PFIL_OUT */ #ifdef __linux__ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,13) /* XXX was < 2.6.0: inet_hashtables.h is introduced in 2.6.14 */ // #warning --- inet_hashtables not present on 2.4 #include #include #include static inline int inet_iif(const struct sk_buff *skb) { return ((struct rtable *)skb->dst)->rt_iif; } #else #include /* inet_lookup */ #endif #endif /* __linux__ */ #include /* inet_iif */ /* * Here we allocate some global variables used in the firewall. */ //ip_dn_ctl_t *ip_dn_ctl_ptr; int (*ip_dn_ctl_ptr)(struct sockopt *); ip_fw_ctl_t *ip_fw_ctl_ptr; int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa); ip_fw_chk_t *ip_fw_chk_ptr; void (*bridge_dn_p)(struct mbuf *, struct ifnet *); /* Divert hooks. */ void (*ip_divert_ptr)(struct mbuf *m, int incoming); /* ng_ipfw hooks. */ ng_ipfw_input_t *ng_ipfw_input_p = NULL; /*--- * Glue code to implement the registration of children with the parent. * Each child should call my_mod_register() when linking, so that * module_init() and module_exit() can call init_children() and * fini_children() to provide the necessary initialization. * We use the same mechanism for MODULE_ and SYSINIT_. * The former only get a pointer to the moduledata, * the latter have two function pointers (init/uninit) */ #include struct mod_args { const char *name; int order; struct moduledata *mod; void (*init)(void), (*uninit)(void); }; static unsigned int mod_idx; static struct mod_args mods[10]; /* hard limit to 10 modules */ int my_mod_register(const char *name, int order, struct moduledata *mod, void *init, void *uninit); /* * my_mod_register should be called automatically as the init * functions in the submodules. Unfortunately this compiler/linker * trick is not supported yet so we call it manually. */ int my_mod_register(const char *name, int order, struct moduledata *mod, void *init, void *uninit) { struct mod_args m; m.name = name; m.order = order; m.mod = mod; m.init = init; m.uninit = uninit; printf("%s %s called\n", __FUNCTION__, name); if (mod_idx < sizeof(mods) / sizeof(mods[0])) mods[mod_idx++] = m; return 0; } static void init_children(void) { unsigned int i; /* Call the functions registered at init time. */ printf("%s mod_idx value %d\n", __FUNCTION__, mod_idx); for (i = 0; i < mod_idx; i++) { struct mod_args *m = &mods[i]; printf("+++ start module %d %s %s at %p order 0x%x\n", i, m->name, m->mod ? m->mod->name : "SYSINIT", m->mod, m->order); if (m->mod && m->mod->evhand) m->mod->evhand(NULL, MOD_LOAD, m->mod->priv); else if (m->init) m->init(); } } static void fini_children(void) { int i; /* Call the functions registered at init time. */ for (i = mod_idx - 1; i >= 0; i--) { struct mod_args *m = &mods[i]; printf("+++ end module %d %s %s at %p order 0x%x\n", i, m->name, m->mod ? m->mod->name : "SYSINIT", m->mod, m->order); if (m->mod && m->mod->evhand) m->mod->evhand(NULL, MOD_UNLOAD, m->mod->priv); else if (m->uninit) m->uninit(); } } /*--- end of module binding helper functions ---*/ /*--- * Control hooks: * ipfw_ctl_h() is a wrapper for linux to FreeBSD sockopt call convention. * then call the ipfw handler in order to manage requests. * In turn this is called by the linux set/get handlers. */ static int ipfw_ctl_h(struct sockopt *s, int cmd, int dir, int len, void __user *user) { struct thread t; int ret = EINVAL; memset(s, 0, sizeof(*s)); s->sopt_name = cmd; s->sopt_dir = dir; s->sopt_valsize = len; s->sopt_val = user; /* sopt_td is not used but it is referenced */ memset(&t, 0, sizeof(t)); s->sopt_td = &t; //printf("%s called with cmd %d len %d sopt %p user %p\n", __FUNCTION__, cmd, len, s, user); if (ip_fw_ctl_ptr && cmd != IP_DUMMYNET3 && (cmd == IP_FW3 || cmd < IP_DUMMYNET_CONFIGURE)) ret = ip_fw_ctl_ptr(s); else if (ip_dn_ctl_ptr && (cmd == IP_DUMMYNET3 || cmd >= IP_DUMMYNET_CONFIGURE)) ret = ip_dn_ctl_ptr(s); return -ret; /* errors are < 0 on linux */ } #ifdef linux /* * Convert an mbuf into an skbuff * At the moment this only works for ip packets fully contained * in a single mbuf. We assume that on entry ip_len and ip_off are * in host format, and the ip checksum is not computed. */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) /* check boundary */ int dst_output(struct skbuff *s) { return 0; } struct sk_buff * mbuf2skbuff(struct mbuf* m) { return NULL; } #else struct sk_buff * mbuf2skbuff(struct mbuf* m) { struct sk_buff *skb; size_t len = m->m_pkthdr.len; /* used to lookup the routing table */ struct rtable *r; struct flowi fl; int ret = 0; /* success for ip_route_output_key() */ struct ip *ip = mtod(m, struct ip *); /* XXX ip_output has ip_len and ip_off in network format, * linux expects host format */ ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); ip->ip_sum = 0; ip->ip_sum = in_cksum(m, ip->ip_hl<<2); /* fill flowi struct, we need just the dst addr, see XXX */ bzero(&fl, sizeof(fl)); flow_daddr.daddr = ip->ip_dst.s_addr; /* * ip_route_output_key() should increment * r->u.dst.__use and call a dst_hold(dst) * XXX verify how we release the resources. */ #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,38) /* check boundary */ r = ip_route_output_key(&init_net, &fl.u.ip4); #elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26) /* check boundary */ ret = ip_route_output_key(&init_net, &r, &fl); #else ret = ip_route_output_key(&r, &fl); #endif if (ret != 0 || r == NULL ) { printf("NO ROUTE FOUND\n"); return NULL; } /* allocate the skbuff and the data */ skb = alloc_skb(len + sizeof(struct ethhdr), GFP_ATOMIC); if (skb == NULL) { printf("%s: can not allocate SKB buffers.\n", __FUNCTION__); return NULL; } skb->protocol = htons(ETH_P_IP); // XXX 8 or 16 bit ? /* sk_dst_set XXX take the lock (?) */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) skb_dst_set(skb, &r->u.dst); #else skb_dst_set(skb, &r->dst); #endif skb->dev = skb_dst(skb)->dev; /* reserve space for ethernet header */ skb_reserve(skb, sizeof(struct ethhdr)); #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) skb_reset_network_header(skb); // skb->network_header = skb->data - skb->head #else skb->nh.raw = skb->data; #endif /* set skbuff tail pointers and copy content */ skb_put(skb, len); memcpy(skb->data, m->m_data, len); return skb; } #endif /* linux 2.6+ */ #endif /* linux */ /* * This function is called to reinject packets to the * kernel stack within the linux netfilter system * or to send a new created mbuf. * In the first case we have a valid sk_buff pointer * encapsulated within the fake mbuf, so we can call * the reinject function trough netisr_dispatch. * In the last case we need to build a sk_buff from scratch, * before sending out the packet. */ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { (void)opt; (void)ro; (void)flags; (void)imo; (void)inp; /* UNUSED */ if ( m->m_skb != NULL ) { /* reinjected packet, just call dispatch */ ND("sending... "); netisr_dispatch(0, m); } else { /* self-generated packet, wrap as appropriate and send */ #ifdef __linux__ struct sk_buff *skb = mbuf2skbuff(m); if (skb != NULL) dst_output(skb); #else /* Windows */ D("unimplemented."); #endif FREE_PKT(m); } return 0; } /* * setsockopt hook has no return value other than the error code. */ int do_ipfw_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { struct sockopt s; /* pass arguments */ (void)sk; /* UNUSED */ return ipfw_ctl_h(&s, cmd, SOPT_SET, len, user); } /* * getsockopt can can return a block of data in response. */ int do_ipfw_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { struct sockopt s; /* pass arguments */ int ret = ipfw_ctl_h(&s, cmd, SOPT_GET, *len, user); (void)sk; /* UNUSED */ *len = s.sopt_valsize; /* return length back to the caller */ return ret; } #ifdef __linux__ /* * declare our [get|set]sockopt hooks */ static struct nf_sockopt_ops ipfw_sockopts = { .pf = PF_INET, .set_optmin = _IPFW_SOCKOPT_BASE, .set_optmax = _IPFW_SOCKOPT_END, .set = do_ipfw_set_ctl, .get_optmin = _IPFW_SOCKOPT_BASE, .get_optmax = _IPFW_SOCKOPT_END, .get = do_ipfw_get_ctl, #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) .owner = THIS_MODULE, #endif }; /*---- * We need a number of macros to adapt to the various APIs in * different linux versions. Among them: * * - the hook names change between macros (NF_IP*) and enum NF_INET_* * * - the second argument to the netfilter hook is * struct sk_buff ** in kernels <= 2.6.22 * struct sk_buff * in kernels > 2.6.22 * * - NF_STOP is not defined before 2.6 so we remap it to NF_ACCEPT * * - the packet descriptor passed to the queue handler is * struct nf_info in kernels <= 2.6.24 * struct nf_queue_entry in kernels <= 2.6.24 * * - the arguments to the queue handler also change; */ /* * declare hook to grab packets from the netfilter interface. * The NF_* names change in different versions of linux, in some * cases they are #defines, in others they are enum, so we * need to adapt. */ #ifndef NF_IP_PRE_ROUTING #define NF_IP_PRE_ROUTING NF_INET_PRE_ROUTING #endif #ifndef NF_IP_POST_ROUTING #define NF_IP_POST_ROUTING NF_INET_POST_ROUTING #endif /* * ipfw hooks into the POST_ROUTING and the PRE_ROUTING chains. * PlanetLab sets skb_tag to the slice id in the LOCAL_INPUT and * POST_ROUTING chains, so if we want to use that information we * need to hook the LOCAL_INPUT chain instead of the PRE_ROUTING. * However at the moment the skb_tag info is not reliable so * we stay with the standard hooks. */ #if 0 // defined(IPFW_PLANETLAB) #define IPFW_HOOK_IN NF_IP_LOCAL_IN #else #define IPFW_HOOK_IN NF_IP_PRE_ROUTING #endif /* * The main netfilter hook. * To make life simple, we queue everything and then do all the * decision in the queue handler. * * XXX note that in 2.4 and up to 2.6.22 the skbuf is passed as sk_buff** * so we have an #ifdef to set the proper argument type. */ static unsigned int call_ipfw( #if LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) unsigned int hooknum, #else const struct nf_hook_ops *hooknum, #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) // in 2.6.22 we have ** struct sk_buff **skb, #else struct sk_buff *skb, #endif const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { (void)hooknum; (void)skb; (void)in; (void)out; (void)okfn; /* UNUSED */ return NF_QUEUE; } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12) /* XXX was 2.6.0 */ #define NF_STOP NF_ACCEPT #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) /* * nf_queue_entry is a recent addition, in previous versions * of the code the struct is called nf_info. */ #define nf_queue_entry nf_info /* for simplicity */ /* also, 2.4 and perhaps something else have different arguments */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) /* XXX unsure */ /* on 2.4 we use nf_info */ #define QH_ARGS struct sk_buff *skb, struct nf_info *info, void *data #else /* 2.6.14. 2.6.24 */ #define QH_ARGS struct sk_buff *skb, struct nf_info *info, unsigned int qnum, void *data #endif #define DEFINE_SKB /* nothing, already an argument */ #define REINJECT(_inf, _verd) nf_reinject(skb, _inf, _verd) #else /* 2.6.25 and above */ #define QH_ARGS struct nf_queue_entry *info, unsigned int queuenum #define DEFINE_SKB struct sk_buff *skb = info->skb; #define REINJECT(_inf, _verd) nf_reinject(_inf, _verd) #endif /* * used by dummynet when dropping packets * XXX use dummynet_send() */ void reinject_drop(struct mbuf* m) { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) /* unsure on the exact boundary */ struct sk_buff *skb = (struct sk_buff *)m; #endif REINJECT(m->queue_entry, NF_DROP); } /* * The real call to the firewall. nf_queue_entry points to the skbuf, * and eventually we need to return both through nf_reinject(). */ static int ipfw2_queue_handler(QH_ARGS) { DEFINE_SKB /* no semicolon here, goes in the macro */ int ret = 0; /* return value */ struct mbuf *m; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) if (skb->nh.iph == NULL) { printf("null dp, len %d reinject now\n", skb->len); REINJECT(info, NF_ACCEPT); return 0; } #endif m = malloc(sizeof(*m), 0, 0); if (m == NULL) { printf("malloc fail, len %d reinject now\n", skb->len); REINJECT(info, NF_ACCEPT); return 0; } m->m_skb = skb; m->m_len = skb->len; /* len from ip header to end */ m->m_pkthdr.len = skb->len; /* total packet len */ m->m_pkthdr.rcvif = info->indev; m->queue_entry = info; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) /* XXX was 2.6.0 */ m->m_data = (char *)skb->nh.iph; #else m->m_data = (char *)skb_network_header(skb); // XXX unsigned ? */ #endif /* XXX add the interface */ if (info->hook == IPFW_HOOK_IN) { ret = ipfw_check_hook(NULL, &m, info->indev, PFIL_IN, NULL); } else { ret = ipfw_check_hook(NULL, &m, info->outdev, PFIL_OUT, NULL); } if (m != NULL) { /* Accept. reinject and free the mbuf */ REINJECT(info, NF_ACCEPT); m_freem(m); } else if (ret == 0) { /* dummynet has kept the packet, will reinject later. */ } else { /* * Packet dropped by ipfw or dummynet. Nothing to do as * FREE_PKT already did a reinject as NF_DROP */ } return 0; } struct route; struct ip_moptions; struct inpcb; /* XXX should include prototypes for netisr_dispatch and ip_output */ /* * The reinjection routine after a packet comes out from dummynet. * We must update the skb timestamp so ping reports the right time. * This routine is also used (with num == -1) as FREE_PKT. XXX */ void netisr_dispatch(int num, struct mbuf *m) { struct nf_queue_entry *info = m->queue_entry; struct sk_buff *skb = m->m_skb; /* always used */ /* * This function can be called by the FREE_PKT() * used when ipfw generate their own mbuf packets * or by the mbuf2skbuff() function. */ m_freem(m); /* XXX check * info is null in the case of a real mbuf * (one created by the ipfw code without a * valid sk_buff pointer */ if (info == NULL) return; #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) // XXX above 2.6.x ? __net_timestamp(skb); /* update timestamp */ #endif /* XXX to obey one-pass, possibly call the queue handler here */ REINJECT(info, ((num == -1)?NF_DROP:NF_STOP)); /* accept but no more firewall */ } /* * socket lookup function for linux. * This code is used to associate uid, gid, jail/xid to packets, * and store the info in a cache *ugp where they can be accessed quickly. * The function returns 1 if the info is found, -1 otherwise. * * We do this only on selected protocols: TCP, ... * * The chain is the following * sk_buff* sock* socket* file* * skb -> sk ->sk_socket->file ->f_owner ->pid * skb -> sk ->sk_socket->file ->f_uid (direct) * skb -> sk ->sk_socket->file ->f_cred->fsuid (2.6.29+) * * Related headers: * linux/skbuff.h struct skbuff * net/sock.h struct sock * linux/net.h struct socket * linux/fs.h struct file * * With vserver we may have sk->sk_xid and sk->sk_nid that * which we store in fw_groups[1] (matches O_JAIL) and fw_groups[2] * (no matches yet) * * Note- for locally generated, outgoing packets we should not need * need a lookup because the sk_buff already points to the socket where * the info is. */ extern struct inet_hashinfo tcp_hashinfo; int linux_lookup(const int proto, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, struct sk_buff *skb, int dir, struct bsd_ucred *u) { #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,13) /* XXX was 2.6.0 */ return -1; #else struct sock *sk; int ret = -1; /* default return value */ int st = -1; /* state */ if (proto != IPPROTO_TCP) /* XXX extend for UDP */ return -1; if ((dir ? (void *)skb_dst(skb) : (void *)skb->dev) == NULL) { panic(" -- this should not happen\n"); return -1; } if (skb->sk) { sk = skb->sk; } else { /* * Try a lookup. On a match, sk has a refcount that we must * release on exit (we know it because skb->sk = NULL). * * inet_lookup above 2.6.24 has an additional 'net' parameter * so we use a macro to conditionally supply it. * swap dst and src depending on the direction. */ #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,24) #define _OPT_NET_ARG #else #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) /* there is no dev_net() on 2.6.25 */ #define _OPT_NET_ARG (skb->dev->nd_net), #else /* 2.6.26 and above */ #define _OPT_NET_ARG dev_net(skb->dev), #endif #endif sk = (dir) ? /* dir != 0 on output */ inet_lookup(_OPT_NET_ARG &tcp_hashinfo, daddr, dport, saddr, sport, // match outgoing inet_iif(skb)) : inet_lookup(_OPT_NET_ARG &tcp_hashinfo, saddr, sport, daddr, dport, // match incoming skb->dev->ifindex); #undef _OPT_NET_ARG if (sk == NULL) /* no match, nothing to be done */ return -1; } ret = 1; /* retrying won't make things better */ st = sk->sk_state; #ifdef CONFIG_VSERVER u->xid = sk->sk_xid; u->nid = sk->sk_nid; #else u->xid = u->nid = 0; #endif /* * Exclude tcp states where sk points to a inet_timewait_sock which * has no sk_socket field (surely TCP_TIME_WAIT, perhaps more). * To be safe, use a whitelist and not a blacklist. * Before dereferencing sk_socket grab a lock on sk_callback_lock. * * Once again we need conditional code because the UID and GID * location changes between kernels. */ #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,28) /* use the current's real uid/gid */ #define _CURR_UID f_uid #define _CURR_GID f_gid #else /* 2.6.29 and above */ /* use the current's file access real uid/gid */ #define _CURR_UID f_cred->fsuid #define _CURR_GID f_cred->fsgid #endif #define GOOD_STATES ( \ (1<sk_callback_lock); if (sk->sk_socket && sk->sk_socket->file) { //u->uid = sk->sk_socket->file->_CURR_UID; //u->gid = sk->sk_socket->file->_CURR_GID; } read_unlock_bh(&sk->sk_callback_lock); } else { u->uid = u->gid = 0; } if (!skb->sk) /* return the reference that came from the lookup */ sock_put(sk); #undef GOOD_STATES #undef _CURR_UID #undef _CURR_GID return ret; #endif /* LINUX > 2.4 */ } /* * Now prepare to hook the various functions. * Linux 2.4 has a different API so we need some adaptation * for register and unregister hooks * * the unregister function changed arguments between 2.6.22 and 2.6.24 */ #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14) struct nf_queue_handler ipfw2_queue_handler_desc = { .outfn = ipfw2_queue_handler, #if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,2) .name = "ipfw2 dummynet queue", #endif }; #if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,2) #define REG_QH_ARG(pf, fn) pf, &(fn ## _desc) #else #define REG_QH_ARG(pf, fn) &(fn ## _desc) #endif #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) /* XXX was 2.6.0 */ static int nf_register_hooks(struct nf_hook_ops *ops, int n) { int i, ret = 0; for (i = 0; i < n; i++) { ret = nf_register_hook(ops + i); if (ret < 0) break; } return ret; } static void nf_unregister_hooks(struct nf_hook_ops *ops, int n) { int i; for (i = 0; i < n; i++) { nf_unregister_hook(ops + i); } } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) /* XXX was 2.6.0 */ #define REG_QH_ARG(pf, fn) pf, fn, NULL #endif #define UNREG_QH_ARG(pf, fn) //fn /* argument for nf_[un]register_queue_handler */ #define SET_MOD_OWNER #else /* linux > 2.6.17 */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) #define UNREG_QH_ARG(pf, fn) //fn #elif LINUX_VERSION_CODE < KERNEL_VERSION(3,8,2) #define UNREG_QH_ARG(pf, fn) pf, &(fn ## _desc) #else #define UNREG_QH_ARG(pf, fn) #endif /* 2.6.0 < LINUX > 2.6.24 */ #define SET_MOD_OWNER .owner = THIS_MODULE, #endif /* !LINUX < 2.6.0 */ static struct nf_hook_ops ipfw_ops[] __read_mostly = { { .hook = call_ipfw, .pf = PF_INET, .hooknum = IPFW_HOOK_IN, .priority = NF_IP_PRI_FILTER, SET_MOD_OWNER }, { .hook = call_ipfw, .pf = PF_INET, .hooknum = NF_IP_POST_ROUTING, .priority = NF_IP_PRI_FILTER, SET_MOD_OWNER }, }; #endif /* __linux__ */ /* descriptors for the children, until i find a way for the * linker to produce them */ extern moduledata_t *moddesc_ipfw; extern moduledata_t *moddesc_dummynet; extern moduledata_t *moddesc_dn_fifo; extern moduledata_t *moddesc_dn_wf2qp; extern moduledata_t *moddesc_dn_rr; extern moduledata_t *moddesc_dn_qfq; extern moduledata_t *moddesc_dn_prio; extern void *sysinit_ipfw_init; extern void *sysuninit_ipfw_destroy; extern void *sysinit_vnet_ipfw_init; extern void *sysuninit_vnet_ipfw_uninit; /* * Module glue - init and exit function. */ int __init ipfw_module_init(void) { int ret = 0; #ifdef _WIN32 unsigned long resolution; #endif rn_init(64); my_mod_register("ipfw", 1, moddesc_ipfw, NULL, NULL); my_mod_register("sy_ipfw", 2, NULL, sysinit_ipfw_init, sysuninit_ipfw_destroy); my_mod_register("sy_Vnet_ipfw", 3, NULL, sysinit_vnet_ipfw_init, sysuninit_vnet_ipfw_uninit); my_mod_register("dummynet", 4, moddesc_dummynet, NULL, NULL); my_mod_register("dn_fifo", 5, moddesc_dn_fifo, NULL, NULL); my_mod_register("dn_wf2qp", 6, moddesc_dn_wf2qp, NULL, NULL); my_mod_register("dn_rr", 7, moddesc_dn_rr, NULL, NULL); my_mod_register("dn_qfq", 8, moddesc_dn_qfq, NULL, NULL); my_mod_register("dn_prio", 9, moddesc_dn_prio, NULL, NULL); init_children(); #ifdef _WIN32 resolution = ExSetTimerResolution(1, TRUE); printf("*** ExSetTimerResolution: resolution set to %d n-sec ***\n",resolution); #endif #ifdef EMULATE_SYSCTL keinit_GST(); #endif #ifdef __linux__ /* sockopt register, in order to talk with user space */ ret = nf_register_sockopt(&ipfw_sockopts); if (ret < 0) { printf("error %d in nf_register_sockopt\n", ret); goto clean_modules; } /* queue handler registration, in order to get network * packet under a private queue */ #if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,2) ret = #endif nf_register_queue_handler(REG_QH_ARG(PF_INET, ipfw2_queue_handler) ); if (ret < 0) /* queue busy */ goto unregister_sockopt; ret = nf_register_hooks(ipfw_ops, ARRAY_SIZE(ipfw_ops)); if (ret < 0) goto unregister_sockopt; printf("%s loaded\n", __FUNCTION__); return 0; /* handle errors on load */ unregister_sockopt: nf_unregister_queue_handler(UNREG_QH_ARG(PF_INET, ipfw2_queue_handler) ); nf_unregister_sockopt(&ipfw_sockopts); clean_modules: fini_children(); printf("%s error\n", __FUNCTION__); #endif /* __linux__ */ return ret; } /* module shutdown */ void __exit ipfw_module_exit(void) { #ifdef EMULATE_SYSCTL keexit_GST(); #endif #ifdef _WIN32 ExSetTimerResolution(0,FALSE); #else /* linux hook */ nf_unregister_hooks(ipfw_ops, ARRAY_SIZE(ipfw_ops)); /* maybe drain the queue before unregistering ? */ nf_unregister_queue_handler(UNREG_QH_ARG(PF_INET, ipfw2_queue_handler) ); nf_unregister_sockopt(&ipfw_sockopts); #endif /* __linux__ */ fini_children(); printf("%s unloaded\n", __FUNCTION__); } #ifdef __linux__ module_init(ipfw_module_init) module_exit(ipfw_module_exit) MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */ #endif ================================================ FILE: kipfw/md_win.c ================================================ /* * Copyright (C) 2010 Luigi Rizzo, Francesco Magno, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * kernel variables and functions that are not available in Windows. */ #include /* provides PFIL_IN and PFIL_OUT */ #include #include /* in_addr */ #include #include #include /* credentials check */ int cred_check(void *_insn, int proto, struct ifnet *oif, struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp, struct sk_buff *skb) { return 0; } /* * as good as anywhere, place here the missing calls */ void * my_alloc(int size) { void *_ret = ExAllocatePoolWithTag(NonPagedPool, size, 'wfpi'); if (_ret) memset(_ret, 0, size); return _ret; } void panic(const char *fmt, ...) { printf("%s", fmt); for (;;); } int securelevel = 0; int ffs(int bits) { int i; if (bits == 0) return (0); for (i = 1; ; i++, bits >>= 1) { if (bits & 1) break; } return (i); } void do_gettimeofday(struct timeval *tv) { static LARGE_INTEGER prevtime; //system time in 100-nsec resolution static LARGE_INTEGER prevcount; //RTC counter value static LARGE_INTEGER freq; //frequency LARGE_INTEGER currtime; LARGE_INTEGER currcount; if (prevtime.QuadPart == 0) { //first time we ask for system time KeQuerySystemTime(&prevtime); prevcount = KeQueryPerformanceCounter(&freq); currtime.QuadPart = prevtime.QuadPart; } else { KeQuerySystemTime(&currtime); currcount = KeQueryPerformanceCounter(&freq); if (currtime.QuadPart == prevtime.QuadPart) { //time has NOT changed, calculate time using ticks and DO NOT update LONGLONG difftime = 0; //difference in 100-nsec LONGLONG diffcount = 0; //clock count difference //printf("time has NOT changed\n"); diffcount = currcount.QuadPart - prevcount.QuadPart; diffcount *= 10000000; difftime = diffcount / freq.QuadPart; currtime.QuadPart += difftime; } else { //time has changed, update and return SystemTime //printf("time has changed\n"); prevtime.QuadPart = currtime.QuadPart; prevcount.QuadPart = currcount.QuadPart; } } currtime.QuadPart /= 10; //convert in usec tv->tv_sec = currtime.QuadPart / (LONGLONG)1000000; tv->tv_usec = currtime.QuadPart % (LONGLONG)1000000; //printf("sec %d usec %d\n",tv->tv_sec, tv->tv_usec); } int time_uptime_w32() { int ret; LARGE_INTEGER tm; KeQuerySystemTime(&tm); ret = (int)(tm.QuadPart / (LONGLONG)1000000); return ret; } /* * Windows version of firewall hook. We receive a partial copy of * the packet which points to the original buffers. In output, * the refcount has been already incremented. * The function reconstructs * the whole packet in a contiguous memory area, builds a fake mbuf, * calls the firewall, does the eventual cleaning and returns * to MiniportSend or ProtocolReceive, which will silently return * (dropping packet) or continue its execution (allowing packet). * The memory area contains: * - the fake mbuf, filled with data needed by ipfw, and information * for reinjection * - the packet data */ void hexdump(PUCHAR,int, const char *); static char _if_in[] = "incoming"; static char _if_out[] = "outgoing"; int ipfw2_qhandler_w32(PNDIS_PACKET pNdisPacket, int direction, NDIS_HANDLE Context) { unsigned int BufferCount = 0; unsigned TotalPacketLength = 0; PNDIS_BUFFER pCurrentBuffer = NULL; PNDIS_BUFFER pNextBuffer = NULL; struct mbuf* m; unsigned char* payload = NULL; unsigned int ofs, l; unsigned short EtherType = 0; unsigned int i = 0; int ret = 0; PNDIS_BUFFER pNdisBuffer, old_head, old_tail; NDIS_HANDLE PacketPool; PADAPT pAdapt; NDIS_STATUS Status; /* In NDIS, packets are a chain of NDIS_BUFFER. We query * the packet to get a pointer of chain's head, the length * of the chain, and the length of the packet itself. * Then allocate a buffer for the mbuf and the payload. */ NdisQueryPacket(pNdisPacket, NULL, &BufferCount, &pCurrentBuffer, &TotalPacketLength); m = malloc(sizeof(struct mbuf) + TotalPacketLength, 0, 0 ); if (m == NULL) //resource shortage, drop the packet goto drop_pkt; /* set mbuf fields to point past the MAC header. * Also set additional W32 info */ payload = (unsigned char*)(m + 1); m->m_len = m->m_pkthdr.len = TotalPacketLength-14; m->m_pkthdr.rcvif = (void *)((direction==INCOMING) ? _if_in : NULL); m->m_data = payload + 14; /* past the MAC header */ m->direction = direction; m->context = Context; m->pkt = pNdisPacket; /* m_skb != NULL is used in the ip_output routine to check * for packets that come from the stack and differentiate * from those internally generated by ipfw. * The pointer is not used, just needs to be non-null. */ m->m_skb = (void *)pNdisPacket; /* * Now copy the data from the Windows buffers to the mbuf. */ for (i=0, ofs = 0; i < BufferCount; i++) { unsigned char* src; NdisQueryBufferSafe(pCurrentBuffer, &src, &l, NormalPagePriority); bcopy(src, payload + ofs, l); ofs += l; NdisGetNextBuffer(pCurrentBuffer, &pNextBuffer); pCurrentBuffer = pNextBuffer; } /* * Identify EtherType. If the packet is not IP, simply allow * and don't bother the firewall. XXX should be done before. */ EtherType = *(unsigned short*)(payload + 12); EtherType = RtlUshortByteSwap(EtherType); if (EtherType != 0x0800) { //DbgPrint("ethertype = %X, skipping ipfw\n",EtherType); free(m, 0); return PASS; } /* * Now build a buffer descriptor to replace the original chain. */ pAdapt = Context; PacketPool = direction == OUTGOING ? pAdapt->SendPacketPoolHandle : pAdapt->RecvPacketPoolHandle; NdisAllocateBuffer(&Status, &pNdisBuffer, PacketPool, payload, m->m_pkthdr.len+14); if (Status != NDIS_STATUS_SUCCESS) goto drop_pkt; /* * Save the old buffer pointers, and put the new one * into the chain. */ pNdisBuffer->Next = NULL; old_head = NDIS_PACKET_FIRST_NDIS_BUFFER(pNdisPacket); old_tail = NDIS_PACKET_LAST_NDIS_BUFFER(pNdisPacket); NdisReinitializePacket(pNdisPacket); NdisChainBufferAtFront(pNdisPacket, pNdisBuffer); #if 0 if (direction == INCOMING) { DBGPRINT(("incoming: proto %u (%s), src %08X, dst %08X, sport %u, dport %u, len %u\n", *(payload+14+9), texify_proto(*(payload+14+9)), *(unsigned int*)(payload+14+12), *(unsigned int*)(payload+14+16), ntohs((*((unsigned short int*)(payload+14+20)))), ntohs((*((unsigned short int*)(payload+14+22)))), TotalPacketLength)); } else { DBGPRINT(("outgoing: proto %u (%s), src %08X, dst %08X, sport %u, dport %u, len %u\n", *(payload+14+9), texify_proto(*(payload+14+9)), *(unsigned int*)(payload+14+12), *(unsigned int*)(payload+14+16), ntohs((*((unsigned short int*)(payload+14+20)))), ntohs((*((unsigned short int*)(payload+14+22)))), TotalPacketLength)); } #endif if (direction == INCOMING) ret = ipfw_check_hook(NULL, &m, NULL, PFIL_IN, NULL); else ret = ipfw_check_hook(NULL, &m, (struct ifnet*)_if_out, PFIL_OUT, NULL); if (m != NULL) { /* Accept. Restore the old buffer chain, free * the mbuf and return PASS. */ //DBGPRINT(("accepted\n")); NdisReinitializePacket(pNdisPacket); NDIS_PACKET_FIRST_NDIS_BUFFER(pNdisPacket) = old_head; NDIS_PACKET_LAST_NDIS_BUFFER(pNdisPacket) = old_tail; NdisFreeBuffer(pNdisBuffer); m_freem(m); return PASS; } else if (ret == 0) { /* dummynet has kept the packet, will reinject later. */ //DBGPRINT(("kept by dummynet\n")); return DUMMYNET; } else { /* * Packet dropped by ipfw or dummynet. Nothing to do as * FREE_PKT already freed the fake mbuf */ //DBGPRINT(("dropped by dummynet, ret = %i\n", ret)); return DROP; } drop_pkt: /* for some reason we cannot proceed. Free any resources * including those received from above, and return * faking success. XXX this must be fixed later. */ NdisFreePacket(pNdisPacket); return DROP; } /* * Windows reinjection function. * The packet is already available as m->pkt, so we only * need to send it to the right place. * Normally a ndis intermediate driver allocates * a fresh descriptor, while the actual data's ownership is * retained by the protocol, or the miniport below. * Since an intermediate driver behaves as a miniport driver * at the upper edge (towards the protocol), and as a protocol * driver at the lower edge (towards the NIC), when we handle a * packet we have a reserved area in both directions (we can use * only one for each direction at our own discretion). * Normally this area is used to save a pointer to the original * packet, so when the driver is done with it, the original descriptor * can be retrieved, and the resources freed (packet descriptor, * buffer descriptor(s) and the actual data). In our driver this * area is used to mark the reinjected packets as 'orphan', because * the original descriptor is gone long ago. This way we can handle * correctly the resource freeing when the callback function * is called by NDIS. */ void netisr_dispatch(int num, struct mbuf *m) { unsigned char* payload = (unsigned char*)(m+1); PADAPT pAdapt = m->context; NDIS_STATUS Status; PNDIS_PACKET pPacket = m->pkt; PNDIS_BUFFER pNdisBuffer; NDIS_HANDLE PacketPool; if (num < 0) goto drop_pkt; //debug print #if 0 DbgPrint("reinject %s\n", m->direction == OUTGOING ? "outgoing" : "incoming"); #endif NdisAcquireSpinLock(&pAdapt->Lock); if (m->direction == OUTGOING) { //we must first check if the adapter is going down, // in this case abort the reinjection if (pAdapt->PTDeviceState > NdisDeviceStateD0) { pAdapt->OutstandingSends--; // XXX should we notify up ? NdisReleaseSpinLock(&pAdapt->Lock); goto drop_pkt; } } else { /* if the upper miniport edge is not initialized or * the miniport edge is in low power state, abort * XXX we should notify the error. */ if (!pAdapt->MiniportHandle || pAdapt->MPDeviceState > NdisDeviceStateD0) { NdisReleaseSpinLock(&pAdapt->Lock); goto drop_pkt; } } NdisReleaseSpinLock(&pAdapt->Lock); if (m->direction == OUTGOING) { PSEND_RSVD SendRsvd; /* use the 8-bytes protocol reserved area, the first * field is used to mark/the packet as 'orphan', the * second stores the pointer to the mbuf, so in the * the SendComplete handler we know that this is a * reinjected packet and can free correctly. */ SendRsvd = (PSEND_RSVD)(pPacket->ProtocolReserved); SendRsvd->OriginalPkt = NULL; SendRsvd->pMbuf = m; //do the actual send NdisSend(&Status, pAdapt->BindingHandle, pPacket); if (Status != NDIS_STATUS_PENDING) { /* done, call the callback now */ PtSendComplete(m->context, m->pkt, Status); } return; /* unconditional return here. */ } else { /* There's no need to check the 8-bytes miniport * reserved area since the path going up will be always * syncronous, and all the cleanup will be done inline. * If the reinjected packed comes from a PtReceivePacket, * there will be no callback. * Otherwise PtReceiveComplete will be called but will just * return since all the cleaning is alreqady done */ // do the actual receive. ULONG Proc = KeGetCurrentProcessorNumber(); pAdapt->ReceivedIndicationFlags[Proc] = TRUE; NdisMEthIndicateReceive(pAdapt->MiniportHandle, NULL, payload, 14, payload+14, m->m_len, m->m_len); NdisMEthIndicateReceiveComplete(pAdapt->MiniportHandle); pAdapt->ReceivedIndicationFlags[Proc] = FALSE; } drop_pkt: /* NDIS_PACKET exists and must be freed only if * the packet come from a PtReceivePacket, oherwise * m->pkt will ne null. */ if (m->pkt != NULL) { NdisUnchainBufferAtFront(m->pkt, &pNdisBuffer); NdisFreeBuffer(pNdisBuffer); NdisFreePacket(m->pkt); } m_freem(m); } void win_freem(void *); /* wrapper for m_freem() for protocol.c */ void win_freem(void *_m) { struct mbuf *m = _m; m_freem(m); } /* * not implemented in linux. * taken from /usr/src/lib/libc/string/strlcpy.c */ size_t strlcpy(char *dst, const char *src, size_t siz) { char *d = dst; const char *s = src; size_t n = siz; /* Copy as many bytes as will fit */ if (n != 0 && --n != 0) { do { if ((*d++ = *s++) == 0) break; } while (--n != 0); } /* Not enough room in dst, add NUL and traverse rest of src */ if (n == 0) { if (siz != 0) *d = '\0'; /* NUL-terminate dst */ while (*s++) ; } return(s - src - 1); /* count does not include NUL */ } void CleanupReinjected(PNDIS_PACKET Packet, struct mbuf* m, PADAPT pAdapt) { PNDIS_BUFFER pNdisBuffer; NdisQueryPacket(Packet, NULL, NULL, &pNdisBuffer, NULL); NdisUnchainBufferAtFront(Packet, &pNdisBuffer); NdisFreeBuffer(pNdisBuffer); win_freem(m); NdisFreePacket(Packet); ADAPT_DECR_PENDING_SENDS(pAdapt); } int ipfw2_qhandler_w32_oldstyle(int direction, NDIS_HANDLE ProtocolBindingContext, unsigned char* HeaderBuffer, unsigned int HeaderBufferSize, unsigned char* LookAheadBuffer, unsigned int LookAheadBufferSize, unsigned int PacketSize) { struct mbuf* m; unsigned char* payload = NULL; unsigned short EtherType = 0; int ret = 0; /* We are in a special case when NIC signals an incoming * packet using old style calls. This is done passing * a pointer to the MAC header and a pointer to the * rest of the packet. * We simply allocate space for the mbuf and the * subsequent payload section. */ m = malloc(sizeof(struct mbuf) + HeaderBufferSize + LookAheadBufferSize, 0, 0 ); if (m == NULL) //resource shortage, drop the packet return DROP; /* set mbuf fields to point past the MAC header. * Also set additional W32 info. * m->pkt here is set to null because the notification * from the NIC has come with a header+loolahead buffer, * no NDIS_PACKET has been provided. */ payload = (unsigned char*)(m + 1); m->m_len = m->m_pkthdr.len = HeaderBufferSize+LookAheadBufferSize-14; m->m_data = payload + 14; /* past the MAC header */ m->direction = direction; m->context = ProtocolBindingContext; m->pkt = NULL; /* * Now copy the data from the Windows buffers to the mbuf. */ bcopy(HeaderBuffer, payload, HeaderBufferSize); bcopy(LookAheadBuffer, payload+HeaderBufferSize, LookAheadBufferSize); //hexdump(payload,HeaderBufferSize+LookAheadBufferSize,"qhandler"); /* * Identify EtherType. If the packet is not IP, simply allow * and don't bother the firewall. XXX should be done before. */ EtherType = *(unsigned short*)(payload + 12); EtherType = RtlUshortByteSwap(EtherType); if (EtherType != 0x0800) { //DbgPrint("ethertype = %X, skipping ipfw\n",EtherType); free(m, 0); return PASS; } //DbgPrint("incoming_raw: proto %u (%s), src %08X, dst %08X, sport %u, dport %u, len %u\n", *(payload+14+9), texify_proto(*(payload+14+9)), *(unsigned int*)(payload+14+12), *(unsigned int*)(payload+14+16), ntohs((*((unsigned short int*)(payload+14+20)))), ntohs((*((unsigned short int*)(payload+14+22)))), HeaderBufferSize+LookAheadBufferSize); /* Query the firewall */ ret = ipfw_check_hook(NULL, &m, NULL, PFIL_IN, NULL); if (m != NULL) { /* Accept. Free the mbuf and return PASS. */ //DbgPrint("accepted\n"); m_freem(m); return PASS; } else if (ret == 0) { /* dummynet has kept the packet, will reinject later. */ //DbgPrint("kept by dummynet\n"); return DUMMYNET; } else { /* * Packet dropped by ipfw or dummynet. Nothing to do as * FREE_PKT already freed the fake mbuf */ //DbgPrint("dropped by dummynet, ret = %i\n", ret); return DROP; } } /* forward declaration because those functions are used only here, * no point to make them visible in passthru/protocol/miniport */ int do_ipfw_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len); int do_ipfw_get_ctl(struct sock *sk, int cmd, void __user *user, int *len); NTSTATUS DevIoControl( IN PDEVICE_OBJECT pDeviceObject, IN PIRP pIrp ) /*++ Routine Description: This is the dispatch routine for handling device ioctl requests. Arguments: pDeviceObject - Pointer to the device object. pIrp - Pointer to the request packet. Return Value: Status is returned. --*/ { PIO_STACK_LOCATION pIrpSp; NTSTATUS NtStatus = STATUS_SUCCESS; unsigned long BytesReturned = 0; unsigned long FunctionCode; unsigned long len; struct sockopt *sopt; int ret = 0; UNREFERENCED_PARAMETER(pDeviceObject); pIrpSp = IoGetCurrentIrpStackLocation(pIrp); /* * Using METHOD_BUFFERED as communication method, the userland * side calls DeviceIoControl passing an input buffer and an output * and their respective length (ipfw uses the same length for both). * The system creates a single I/O buffer, with len=max(inlen,outlen). * In the kernel we can read information from this buffer (which is * directly accessible), overwrite it with our results, and set * IoStatus.Information with the number of bytes that the system must * copy back to userland. * In our sockopt emulation, the initial part of the buffer contains * a struct sockopt, followed by the data area. */ len = pIrpSp->Parameters.DeviceIoControl.InputBufferLength; if (len < sizeof(struct sockopt)) { return STATUS_NOT_SUPPORTED; // XXX find better value } sopt = pIrp->AssociatedIrp.SystemBuffer; FunctionCode = pIrpSp->Parameters.DeviceIoControl.IoControlCode; len = sopt->sopt_valsize; switch (FunctionCode) { case IP_FW_SETSOCKOPT: ret = do_ipfw_set_ctl(NULL, sopt->sopt_name, sopt+1, len); break; case IP_FW_GETSOCKOPT: ret = do_ipfw_get_ctl(NULL, sopt->sopt_name, sopt+1, &len); sopt->sopt_valsize = len; //sanity check on len if (len + sizeof(struct sockopt) <= pIrpSp->Parameters.DeviceIoControl.InputBufferLength) BytesReturned = len + sizeof(struct sockopt); else BytesReturned = pIrpSp->Parameters.DeviceIoControl.InputBufferLength; break; default: NtStatus = STATUS_NOT_SUPPORTED; break; } pIrp->IoStatus.Information = BytesReturned; pIrp->IoStatus.Status = NtStatus; IoCompleteRequest(pIrp, IO_NO_INCREMENT); return NtStatus; } void dummynet(void * unused); void ipfw_tick(void * vnetx); VOID dummynet_dpc( __in struct _KDPC *Dpc, __in_opt PVOID DeferredContext, __in_opt PVOID SystemArgument1, __in_opt PVOID SystemArgument2 ) { dummynet(NULL); } VOID ipfw_dpc( __in struct _KDPC *Dpc, __in_opt PVOID DeferredContext, __in_opt PVOID SystemArgument1, __in_opt PVOID SystemArgument2 ) { ipfw_tick(DeferredContext); } ================================================ FILE: kipfw/missing.h ================================================ /* * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id: missing.h 12256 2013-04-26 21:12:44Z luigi $ * * Header for kernel variables and functions that are not available in * userland. */ #ifndef _MISSING_H_ #define _MISSING_H_ #include #ifdef linux #include #include #include #endif /* linux */ /* portability features, to be set before the rest: */ #define HAVE_NET_IPLEN /* iplen/ipoff in net format */ #define WITHOUT_BPF /* do not use bpf logging */ #ifdef _WIN32 #ifndef DEFINE_SPINLOCK #define DEFINE_SPINLOCK(x) FAST_MUTEX x #endif /* spinlock --> Guarded Mutex KGUARDED_MUTEX */ /* http://www.reactos.org/wiki/index.php/Guarded_Mutex */ #define spin_lock_init(_l) #define spin_lock_bh(_l) #define spin_unlock_bh(_l) #include /* bsd-compat.c */ #include /* bsd-compat.c */ #include /* local version */ #define INADDR_TO_IFP(a, b) b = NULL #else /* __linux__ */ #define MALLOC_DECLARE(x) /* nothing */ #include /* do_gettimeofday */ #include /* local version */ struct inpcb; /* * Kernel locking support. * FreeBSD uses mtx in dummynet.c and struct rwlock ip_fw2.c * * In linux we use spinlock_bh to implement both. * For 'struct rwlock' we need an #ifdef to change it to spinlock_t */ #ifndef DEFINE_SPINLOCK /* this is for linux 2.4 */ #define DEFINE_SPINLOCK(x) spinlock_t x = SPIN_LOCK_UNLOCKED #endif #define rw_assert(a, b) #define rw_destroy(_l) #define rw_init(_l, msg) spin_lock_init(_l) #define rw_rlock(_l) spin_lock_bh(_l) #define rw_runlock(_l) spin_unlock_bh(_l) #define rw_wlock(_l) spin_lock_bh(_l) #define rw_wunlock(_l) spin_unlock_bh(_l) #define rw_init_flags(_l, s, v) #define mtx_assert(a, b) #define mtx_destroy(m) #define mtx_init(m, a,b,c) spin_lock_init(m) #define mtx_lock(_l) spin_lock_bh(_l) #define mtx_unlock(_l) spin_unlock_bh(_l) #endif /* __linux__ */ /* end of locking support */ /* * Reference to an ipfw rule that can be carried outside critical sections. * A rule is identified by rulenum:rule_id which is ordered. * In version chain_id the rule can be found in slot 'slot', so * we don't need a lookup if chain_id == chain->id. * * On exit from the firewall this structure refers to the rule after * the matching one (slot points to the new rule; rulenum:rule_id-1 * is the matching rule), and additional info (e.g. info often contains * the insn argument or tablearg in the low 16 bits, in host format). * On entry, the structure is valid if slot>0, and refers to the starting * rules. 'info' contains the reason for reinject, e.g. divert port, * divert direction, and so on. */ struct ipfw_rule_ref { uint32_t slot; /* slot for matching rule */ uint32_t rulenum; /* matching rule number */ uint32_t rule_id; /* matching rule id */ uint32_t chain_id; /* ruleset id */ uint32_t info; /* see below */ }; enum { IPFW_INFO_MASK = 0x0000ffff, IPFW_INFO_OUT = 0x00000000, /* outgoing, just for convenience */ IPFW_INFO_IN = 0x80000000, /* incoming, overloads dir */ IPFW_ONEPASS = 0x40000000, /* One-pass, do not reinject */ IPFW_IS_MASK = 0x30000000, /* which source ? */ IPFW_IS_DIVERT = 0x20000000, IPFW_IS_DUMMYNET =0x10000000, IPFW_IS_PIPE = 0x08000000, /* pipe=1, queue = 0 */ }; /* in netinet/in.h */ #define in_nullhost(x) ((x).s_addr == INADDR_ANY) /* bzero not present on linux, but this should go in glue.h */ #define bzero(s, n) memset(s, 0, n) #define bcmp(p1, p2, n) memcmp(p1, p2, n) /* ethernet stuff */ #define ETHERTYPE_IP 0x0800 /* IP protocol */ //#define ETHER_ADDR_LEN 6 /* length of an Ethernet address */ struct ether_header { u_char ether_dhost[ETHER_ADDR_LEN]; u_char ether_shost[ETHER_ADDR_LEN]; u_short ether_type; }; #define ETHER_TYPE_LEN 2 /* length of the Ethernet type field */ #define ETHER_HDR_LEN (ETHER_ADDR_LEN*2+ETHER_TYPE_LEN) /* * Historically, BSD keeps ip_len and ip_off in host format * when doing layer 3 processing, and this often requires * to translate the format back and forth. * To make the process explicit, we define a couple of macros * that also take into account the fact that at some point * we may want to keep those fields always in net format. */ #if (BYTE_ORDER == BIG_ENDIAN) || defined(HAVE_NET_IPLEN) #define SET_NET_IPLEN(p) do {} while (0) #define SET_HOST_IPLEN(p) do {} while (0) #else /* never on linux */ #define SET_NET_IPLEN(p) do { \ struct ip *h_ip = (p); \ h_ip->ip_len = htons(h_ip->ip_len); \ h_ip->ip_off = htons(h_ip->ip_off); \ } while (0) #define SET_HOST_IPLEN(p) do { \ struct ip *h_ip = (p); \ h_ip->ip_len = ntohs(h_ip->ip_len); \ h_ip->ip_off = ntohs(h_ip->ip_off); \ } while (0) #endif /* !HAVE_NET_IPLEN */ /* ip_dummynet.c */ #define __FreeBSD_version 500035 #ifdef __linux__ struct moduledata; int my_mod_register(const char *name, int order, struct moduledata *mod, void *init, void *uninit); /* define some macro for ip_dummynet */ struct malloc_type { }; #define MALLOC_DEFINE(type, shortdesc, longdesc) \ struct malloc_type type[1]; void *md_dummy_ ## type = type #define CTASSERT(x) /* log... does not use the first argument */ #define LOG_ERR 0x100 #define LOG_INFO 0x200 #define log(_level, fmt, arg...) do { \ int _qwerty=_level;(void)_qwerty; printk(KERN_ERR fmt, ##arg); } while (0) /* * gettimeofday would be in sys/time.h but it is not * visible if _KERNEL is defined */ int gettimeofday(struct timeval *, struct timezone *); #else /* _WIN32 */ #define MALLOC_DEFINE(a,b,c) #endif /* _WIN32 */ extern int hz; extern long tick; /* exists in 2.4 but not in 2.6 */ extern int bootverbose; extern struct timeval boottime; /* The time_uptime a FreeBSD variable increased each second */ #ifdef __linux__ #if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,37) /* revise boundaries */ #define time_uptime get_seconds() #else /* OpenWRT */ #define time_uptime CURRENT_TIME #endif #else /* WIN32 */ #define time_uptime time_uptime_w32() #endif extern int max_linkhdr; extern int ip_defttl; extern u_long in_ifaddrhmask; /* mask for hash table */ extern struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ /*-------------------------------------------------*/ /* define, includes and functions missing in linux */ /* include and define */ #include /* inet_ntoa */ struct mbuf; /* used by ip_dummynet.c */ void reinject_drop(struct mbuf* m); #include /* error define */ #include /* IFNAMESIZ */ void rn_init(int); /* * some network structure can be defined in the bsd way * by using the _FAVOR_BSD definition. This is not true * for icmp structure. * XXX struct icmp contains bsd names in * /usr/include/netinet/ip_icmp.h */ #ifdef __linux__ #define icmp_code code #define icmp_type type /* linux in6_addr has no member __u6_addr * replace the whole structure ? */ #define __u6_addr in6_u #define __u6_addr32 u6_addr32 #endif /* __linux__ */ /* defined in linux/sctp.h with no bsd definition */ struct sctphdr { uint16_t src_port; /* source port */ uint16_t dest_port; /* destination port */ uint32_t v_tag; /* verification tag of packet */ uint32_t checksum; /* Adler32 C-Sum */ /* chunks follow... */ }; /* missing definition */ #define TH_FIN 0x01 #define TH_SYN 0x02 #define TH_RST 0x04 #define TH_ACK 0x10 #define RTF_CLONING 0x100 /* generate new routes on use */ #define IPPROTO_OSPFIGP 89 /* OSPFIGP */ #define IPPROTO_CARP 112 /* CARP */ #ifndef _WIN32 #define IPPROTO_IPV4 IPPROTO_IPIP /* for compatibility */ #endif #define CARP_VERSION 2 #define CARP_ADVERTISEMENT 0x01 #define PRIV_NETINET_IPFW 491 /* Administer IPFW firewall. */ #define IP_FORWARDING 0x1 /* most of ip header exists */ #define NETISR_IP 2 /* same as AF_INET */ #define PRIV_NETINET_DUMMYNET 494 /* Administer DUMMYNET. */ extern int securelevel; struct carp_header { #if BYTE_ORDER == LITTLE_ENDIAN u_int8_t carp_type:4, carp_version:4; #endif #if BYTE_ORDER == BIG_ENDIAN u_int8_t carp_version:4, carp_type:4; #endif }; struct pim { int dummy; /* windows compiler does not like empty definition */ }; #ifndef _WIN32 struct route { struct rtentry *ro_rt; struct sockaddr ro_dst; }; #endif struct ifaltq { void *ifq_head; }; /* * ifnet->if_snd is used in ip_dummynet.c to take the transmission * clock. */ #if defined( __linux__) #define if_xname name #define if_snd XXX /* search local the ip addresses, used for the "me" keyword */ #include #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) #define INADDR_TO_IFP(ip, b) \ b = ip_dev_find(ip.s_addr) #else #define INADDR_TO_IFP(ip, b) \ b = ip_dev_find((struct net *)&init_net, ip.s_addr) #endif #elif defined( _WIN32 ) /* used in ip_dummynet.c */ struct ifnet { char if_xname[IFNAMSIZ]; /* external name (name + unit) */ // struct ifaltq if_snd; /* output queue (includes altq) */ }; struct net_device { char if_xname[IFNAMSIZ]; /* external name (name + unit) */ }; #endif /* involves mbufs */ int in_cksum(struct mbuf *m, int len); #define divert_cookie(mtag) 0 #define divert_info(mtag) 0 #define pf_find_mtag(a) NULL #define pf_get_mtag(a) NULL #ifndef _WIN32 #define AF_LINK AF_ASH /* ? our sys/socket.h */ #endif /* we don't pullup, either success or free and fail */ #define m_pullup(m, x) \ ((m)->m_len >= x ? (m) : (FREE_PKT(m), NULL)) struct pf_mtag { void *hdr; /* saved hdr pos in mbuf, for ECN */ sa_family_t af; /* for ECN */ u_int32_t qid; /* queue id */ }; #if 0 // ndef radix /* radix stuff in radix.h and radix.c */ struct radix_node { caddr_t rn_key; /* object of search */ caddr_t rn_mask; /* netmask, if present */ }; #endif /* !radix */ /* missing kernel functions */ char *inet_ntoa(struct in_addr ina); int random(void); /* * Return the risult of a/b * * this is used in linux kernel space, * since the 64bit division needs to * be done using a macro */ int64_t div64(int64_t a, int64_t b); char * inet_ntoa_r(struct in_addr ina, char *buf); /* from bsd sys/queue.h */ #define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = TAILQ_FIRST((head)); \ (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ (var) = (tvar)) #define SLIST_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = SLIST_FIRST((head)); \ (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ (var) = (tvar)) /* depending of linux version */ #ifndef ETHERTYPE_IPV6 #define ETHERTYPE_IPV6 0x86dd /* IP protocol version 6 */ #endif /*-------------------------------------------------*/ #define RT_NUMFIBS 1 extern u_int rt_numfibs; /* involves kernel locking function */ #ifdef RTFREE #undef RTFREE #define RTFREE(a) fprintf(stderr, "RTFREE: commented out locks\n"); #endif void getmicrouptime(struct timeval *tv); /* from sys/netinet/ip_output.c */ struct ip_moptions; struct route; struct ip; struct mbuf *ip_reass(struct mbuf *); u_short in_cksum_hdr(struct ip *); int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp); /* from net/netisr.c */ void netisr_dispatch(int num, struct mbuf *m); /* definition moved in missing.c */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len); int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen); /* defined in session.c */ int priv_check(struct thread *td, int priv); /* struct ucred is in linux/socket.h and has pid, uid, gid. * We need a 'bsd_ucred' to store also the extra info */ struct bsd_ucred { uid_t uid; gid_t gid; uint32_t xid; uint32_t nid; }; int cred_check(void *insn, int proto, struct ifnet *oif, struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp, struct sk_buff *skb); int securelevel_ge(struct ucred *cr, int level); struct sysctl_oid; struct sysctl_req; #ifdef _WIN32 #define module_param_named(_name, _var, _ty, _perm) #else /* !_WIN32 */ /* Linux 2.4 is mostly for openwrt */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) #include /* generic_ffs() used in ip_fw2.c */ typedef uint32_t __be32; typedef uint16_t __be16; struct sock; struct net; struct inet_hashinfo; struct sock *inet_lookup( struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif); struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif); #endif /* Linux < 2.6 */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) && \ LINUX_VERSION_CODE > KERNEL_VERSION(2,6,16) /* XXX NOT sure, in 2.6.9 give an error */ #define module_param_named(_name, _var, _ty, _perm) \ //module_param(_name, _ty, 0644) #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) typedef unsigned long uintptr_t; #ifdef __i386__ static inline unsigned long __fls(unsigned long word) { asm("bsr %1,%0" : "=r" (word) : "rm" (word)); return word; } #endif #endif /* LINUX < 2.6.25 */ #endif /* !_WIN32 so maybe __linux__ */ #if defined (__linux__) && !defined (EMULATE_SYSCTL) #define SYSCTL_DECL(_1) #define SYSCTL_OID(_1, _2, _3, _4, _5, _6, _7, _8) #define SYSCTL_NODE(_1, _2, _3, _4, _5, _6) #define _SYSCTL_BASE(_name, _var, _ty, _perm) \ module_param_named(_name, *(_var), _ty, \ ( (_perm) == CTLFLAG_RD) ? 0444: 0644 ) #define SYSCTL_PROC(_base, _oid, _name, _mode, _var, _val, _desc, _a, _b) #define SYSCTL_INT(_base, _oid, _name, _mode, _var, _val, _desc) \ _SYSCTL_BASE(_name, _var, int, _mode) #define SYSCTL_LONG(_base, _oid, _name, _mode, _var, _val, _desc) \ _SYSCTL_BASE(_name, _var, long, _mode) #define SYSCTL_ULONG(_base, _oid, _name, _mode, _var, _val, _desc) \ _SYSCTL_BASE(_name, _var, ulong, _mode) #define SYSCTL_UINT(_base, _oid, _name, _mode, _var, _val, _desc) \ _SYSCTL_BASE(_name, _var, uint, _mode) #define TUNABLE_INT(_name, _ptr) #define SYSCTL_VNET_PROC SYSCTL_PROC #define SYSCTL_VNET_INT SYSCTL_INT #define SYSCTL_VNET_UINT SYSCTL_UINT #endif #define SYSCTL_HANDLER_ARGS \ struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req int sysctl_handle_int(SYSCTL_HANDLER_ARGS); int sysctl_handle_long(SYSCTL_HANDLER_ARGS); void ether_demux(struct ifnet *ifp, struct mbuf *m); int ether_output_frame(struct ifnet *ifp, struct mbuf *m); void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum); void icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu); void rtfree(struct rtentry *rt); u_short in_cksum_skip(struct mbuf *m, int len, int skip); #ifdef INP_LOCK_ASSERT #undef INP_LOCK_ASSERT #define INP_LOCK_ASSERT(a) #endif int jailed(struct ucred *cred); /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). If subnetsarelocal * is true, this includes other subnets of the local net. * Otherwise, it includes only the directly-connected (sub)nets. */ int in_localaddr(struct in_addr in); /* the prototype is already in the headers */ //int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); int fnmatch(const char *pattern, const char *string, int flags); int linux_lookup(const int proto, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, struct sk_buff *skb, int dir, struct bsd_ucred *u); /* vnet wrappers, in vnet.h and ip_var.h */ //int ipfw_init(void); //void ipfw_destroy(void); #define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */ #define MTAG_IPFW_RULE 1262273568 /* rule reference */ struct ip_fw_args; extern int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa); #define curvnet NULL #define CURVNET_SET(_v) #define CURVNET_RESTORE() #define VNET_ASSERT(condition) #define VNET_NAME(n) n #define VNET_DECLARE(t, n) extern t n #define VNET_DEFINE(t, n) t n #define _VNET_PTR(b, n) &VNET_NAME(n) /* * Virtualized global variable accessor macros. */ #define VNET_VNET_PTR(vnet, n) (&(n)) #define VNET_VNET(vnet, n) (n) #define VNET_PTR(n) (&(n)) #define VNET(n) (n) VNET_DECLARE(int, ip_defttl); #define V_ip_defttl VNET(ip_defttl); int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp); /* hooks for divert */ extern void (*ip_divert_ptr)(struct mbuf *m, int incoming); extern int (*ip_dn_ctl_ptr)(struct sockopt *); typedef int ip_fw_ctl_t(struct sockopt *); extern ip_fw_ctl_t *ip_fw_ctl_ptr; /* netgraph prototypes */ typedef int ng_ipfw_input_t(struct mbuf **, int, struct ip_fw_args *, int); extern ng_ipfw_input_t *ng_ipfw_input_p; /* For kernel ipfw_ether and ipfw_bridge. */ struct ip_fw_args; typedef int ip_fw_chk_t(struct ip_fw_args *args); extern ip_fw_chk_t *ip_fw_chk_ptr; #define V_ip_fw_chk_ptr VNET(ip_fw_chk_ptr) #define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr) #define V_tcbinfo VNET(tcbinfo) #define V_udbinfo VNET(udbinfo) #endif /* !_MISSING_H_ */ ================================================ FILE: kipfw/mysetenv.sh ================================================ #!/bin/bash # bash script to set a suitable environment to call MSVC's build # to build a 64-bit version of the kernel. # # inspired by C:/winddk/7600.16385.1/bin/setenv.bat # see http://www.osronline.com/ddkx/ddtools/build_ref_0kqb.htm ############################################################# # edit theese variables to meet your configuration # # - DRIVE is the hard drive letter where DDK is installed # # - DDK is the path to the DDK's root directory # # - CYGDDK is the complete cygwin path to DDK # ############################################################# if [ $# -ne 3 ]; then echo "invalid params" && exit 1 fi DRIVE=$1 DDK=$2 CYGDDK=/cygdrive/c/${DDK} TARGETOS=$3 MYDIR=`pwd` # XXX luigi if [ "$TARGETOS" = "wnet" ]; then export DDK_TARGET_OS=WinNET export _NT_TARGET_VERSION=0x502 fi if [ "$TARGETOS" = "wlh" ]; then export DDK_TARGET_OS=WinLH export _NT_TARGET_VERSION=0x600 fi if [ "$TARGETOS" = "win7" ]; then export DDK_TARGET_OS=Win7 export _NT_TARGET_VERSION=0x601 fi ############################################################# # don't edit anything else below this point # ############################################################# D=${DRIVE}${DDK} DB=${D}/bin DI=${D}/inc DL=${D}/lib export AMD64=1 export ATL_INC_PATH=$DI # defaults to DDKROOT/inc export ATL_INC_ROOT=$DI # XXX redundant ? export ATL_LIB_PATH=${DL}/atl/* export BASEDIR=$D # default export BUFFER_OVERFLOW_CHECKS=1 export BUILD_ALLOW_COMPILER_WARNINGS=1 export BUILD_ALT_DIR=chk_${TARGETOS}_AMD64 export BUILD_DEFAULT="-ei -nmake -i -nosqm" # can go on the command line export BUILD_DEFAULT_TARGETS="-amd64" # can also go on the command line export BUILD_MAKE_PROGRAM=nmake.exe # default to nmake export BUILD_MULTIPROCESSOR=1 # parallel make, same as -M export BUILD_OPTIONS=" ~imca ~toastpkg" export COFFBASE_TXT_FILE=${DB}/coffbase.txt export CPU=AMD64 export CRT_INC_PATH=${DI}/crt # default export CRT_LIB_PATH=${DL}/crt/* # not default, it seems uses lib/{wnet,win7}/* export DDKBUILDENV=chk # checked or free export DDK_INC_PATH=${DI}/ddk export DDK_LIB_DEST=${DL}/${TARGETOS} export DDK_LIB_PATH=${DL}/${TARGETOS}/* export DEPRECATE_DDK_FUNCTIONS=1 export DRIVER_INC_PATH=${DI}/ddk export HALKIT_INC_PATH=${DI}/ddk export HALKIT_LIB_PATH=${DL}/${TARGETOS}/* export IFSKIT_INC_PATH=${DI}/ddk export IFSKIT_LIB_DEST=${DL}/${TARGETOS} export IFSKIT_LIB_PATH=${DL}/${TARGETOS}/* export Include=${DI}/api export KMDF_INC_PATH=${DI}/wdf/kmdf export KMDF_LIB_PATH=${DL}/wdf/kmdf/* export LANGUAGE_NEUTRAL=0 export Lib=${DL} export LINK_LIB_IGNORE=4198 export MFC_INC_PATH=${DI}/mfc42 export MFC_LIB_PATH=${DL}/mfc/* export MSC_OPTIMIZATION="/Od /Oi" export NEW_CRTS=1 export NO_BINPLACE=TRUE export NO_BROWSER_FILE=TRUE export NTDBGFILES=1 export NTDEBUG=ntsd export NTDEBUGTYPE=both # need NTMAKEENV to point to the binary dir export NTMAKEENV=${DB} export OAK_INC_PATH=${DI}/api export PATH="${CYGDDK}/bin/amd64:${CYGDDK}/tools/sdv/bin:${CYGDDK}/tools/pfd/bin/bin/x86_AMD64\ :${CYGDDK}/bin/SelfSign:${CYGDDK}/bin/x86/amd64:${CYGDDK}/bin/x86\ :${CYGDDK}/tools/pfd/bin/bin/AMD64:${CYGDDK}/tools/tracing/amd64:$PATH" export PATHEXT=".COM;.EXE;.BAT;.CMD;.VBS;.VBE;.JS;.JSE;.WSF;.WSH;.MSC" export PROJECT_ROOT=${D}/src export PUBLIC_ROOT=${D} export RAZZLETOOLPATH=${DB} export RCNOFONTMAP=1 export SDK_INC_PATH=${DI}/api export SDK_LIB_DEST=${DL}/${TARGETOS} export SDK_LIB_PATH=${DL}/${TARGETOS}/* export SDV=${D}/tools/sdv export separate_object_root=FALSE export TEMP=tmpbuild export TMP=tmpbuild export UMDF_INC_PATH=${DI}/wdf/umdf export USE_OBJECT_ROOT=1 export WDM_INC_PATH=${DI}/ddk export WPP_CONFIG_PATH=${DB}/wppconfig export _AMD64bit=true export _BUILDARCH=AMD64 export _BuildType=chk export _NTDRIVE=${DRIVE} export _NTROOT=${DDK} # # --- XXX note, it spams C:/winddk/7600.16385.1/build.dat # -c: delete objs, -e: generare build.* logfiles, -f rescan sources, -g color errors unset MAKEFLAGS echo "emv ${MAKE} flags ${MAKEFLAGS}" cd kipfw-mod && build -cefg echo "done" #cp objchk_${TARGETOS}_amd64/amd64/ipfw.sys ../binary/ipfw.sys ================================================ FILE: kipfw/netipfw.inf ================================================ ; version section [Version] Signature = "$Windows NT$" Class = NetService ClassGUID = {4D36E974-E325-11CE-BFC1-08002BE10318} Provider = %Unipi% DriverVer = 08/12/2012,3.0.1.1 ; manufacturer section [Manufacturer] %Unipi% = UNIPI,NTx86,NTamd64 ; control flags section ; optional, unused in netipfw.inf inf, used in netipfw_m.inf [ControlFlags] ; models section [UNIPI] ; Win2k %Desc% = Ipfw.ndi, unipi_ipfw [UNIPI.NTx86] ;For WinXP and later %Desc% = Ipfw.ndi, unipi_ipfw [UNIPI.NTamd64] ;For x64 %Desc% = Ipfw.ndi, unipi_ipfw ; ddinstall section [Ipfw.ndi] AddReg = Ipfw.ndi.AddReg, Ipfw.AddReg Characteristics = 0x4410 ; NCF_FILTER | NCF_NDIS_PROTOCOL !--Filter Specific--!! CopyFiles = Ipfw.Files.Sys CopyInf = netipfw_m.inf ; remove section [Ipfw.ndi.Remove] DelFiles = Ipfw.Files.Sys ;ddinstall.services section [Ipfw.ndi.Services] AddService = Ipfw,,Ipfw.AddService [Ipfw.AddService] DisplayName = %ServiceDesc% ServiceType = 1 ;SERVICE_KERNEL_DRIVER StartType = 3 ;SERVICE_DEMAND_START ErrorControl = 1 ;SERVICE_ERROR_NORMAL ServiceBinary = %12%\ipfw.sys AddReg = Ipfw.AddService.AddReg [Ipfw.AddService.AddReg] ;file copy related sections [SourceDisksNames] 1=%DiskDescription%,"",, [SourceDisksFiles] ipfw.sys=1 [DestinationDirs] DefaultDestDir = 12 Ipfw.Files.Sys = 12 ; %windir%\System32\drivers ; ddinstall->copyfiles points here [Ipfw.Files.Sys] ipfw.sys,,,2 ; ddinstall->addreg points here [Ipfw.ndi.AddReg] HKR, Ndi, HelpText, , %HELP% ; this is displayed at the bottom of the General page of the Connection Properties dialog box HKR, Ndi, FilterClass, , failover HKR, Ndi, FilterDeviceInfId, , unipi_ipfwmp HKR, Ndi, Service, , Ipfw HKR, Ndi\Interfaces, UpperRange, , noupper HKR, Ndi\Interfaces, LowerRange, , nolower HKR, Ndi\Interfaces, FilterMediaTypes, , "ethernet, tokenring, fddi, wan" ;strings section [Strings] Unipi = "Unipi" DiskDescription = "Ipfw Driver Disk" Desc = "ipfw+dummynet" HELP = "This is ipfw and dummynet network emulator, developed by unipi.it" ServiceDesc = "ipfw service" ================================================ FILE: kipfw/netipfw_m.inf ================================================ ; version section [Version] Signature = "$Windows NT$" Class = Net ClassGUID = {4D36E972-E325-11CE-BFC1-08002BE10318} Provider = %Unipi% DriverVer = 08/12/2012,3.0.1.1 ; control flags section ; optional, unused in netipfw.inf inf, used in netipfw_m.inf [ControlFlags] ExcludeFromSelect = unipi_ipfwmp ; destinationdirs section, optional [DestinationDirs] DefaultDestDir=12 ; No files to copy ; manufacturer section [Manufacturer] %Unipi% = UNIPI,NTx86,NTamd64 ; models section [UNIPI] ; Win2k %Desc% = IpfwMP.ndi, unipi_ipfwmp [UNIPI.NTx86] ;For WinXP and later %Desc% = IpfwMP.ndi, unipi_ipfwmp [UNIPI.NTamd64] ;For x64 %Desc% = IpfwMP.ndi, unipi_ipfwmp ; ddinstall section [IpfwMP.ndi] AddReg = IpfwMP.ndi.AddReg Characteristics = 0x29 ;NCF_NOT_USER_REMOVABLE | NCF_VIRTUAL | NCF_HIDDEN ; ddinstall->addreg points here [IpfwMP.ndi.AddReg] HKR, Ndi, Service, 0, IpfwMP ;ddinstall.services section [IpfwMP.ndi.Services] AddService = IpfwMP,0x2, IpfwMP.AddService [IpfwMP.AddService] ServiceType = 1 ;SERVICE_KERNEL_DRIVER StartType = 3 ;SERVICE_DEMAND_START ErrorControl = 1 ;SERVICE_ERROR_NORMAL ServiceBinary = %12%\ipfw.sys AddReg = IpfwMP.AddService.AddReg [IpfwMP.AddService.AddReg] ; None [Strings] Unipi = "Unipi" Desc = "Ipfw Miniport" ================================================ FILE: kipfw/sources ================================================ TARGETNAME=ipfw TARGETTYPE=DRIVER C_DEFINES=$(C_DEFINES) -DNDIS_MINIPORT_DRIVER -DNDIS_WDM=1 MSC_WARNING_LEVEL=/W2 # The driver is built in the XP or .NET build environment # So let us build NDIS 5.1 version. C_DEFINES=$(C_DEFINES) -DNDIS51_MINIPORT=1 C_DEFINES=$(C_DEFINES) -DNDIS51=1 # Enable dummynet preprocessing macros C_DEFINES=$(C_DEFINES) /D_WIN32 /DMODULENAME=Ipfw /D_BSD_SOURCE /DKERNEL_MODULE /D_KERNEL /DKLD_MODULE /D__BSD_VISIBLE /DIPFIREWALL_DEFAULT_TO_ACCEPT /D__LITTLE_ENDIAN /DSYSCTL_NODE /DEMULATE_SYSCTL -FIwinmissing.h -FImissing.h -FI../glue.h /DWIN32_LEAN_AND_MEAN=1 TARGETLIBS=$(DDK_LIB_PATH)\ndis.lib INCLUDES= include_e ; ../sys SOURCES= ip_fw2.c ip_fw_pfil.c ip_fw_sockopt.c ip_fw_dynamic.c ip_fw_table.c ip_fw_log.c radix.c in_cksum.c ip_dummynet.c ip_dn_io.c ip_dn_glue.c dn_heap.c dn_sched_fifo.c dn_sched_wf2q.c dn_sched_rr.c dn_sched_qfq.c dn_sched_prio.c ipfw2_mod.c bsd_compat.c md_win.c miniport.c protocol.c passthru.c debug.c ================================================ FILE: kipfw/win-passthru.diff ================================================ diff -ubwrp original_passthru/miniport.c kipfw/miniport.c --- original_passthru/miniport.c 2012-08-01 14:34:15.096679600 +0200 +++ kipfw/miniport.c 2012-08-01 14:34:11.377929600 +0200 @@ -223,6 +223,7 @@ Return Value: // // Use NDIS 5.1 packet stacking: // + if (0) // XXX IPFW - make sure we don't go in here { PNDIS_PACKET_STACK pStack; BOOLEAN Remaining; @@ -347,6 +348,25 @@ Return Value: MediaSpecificInfo, MediaSpecificInfoSize); } +#if 1 /* IPFW: query the firewall */ + /* if dummynet keeps the packet, we mimic success. + * otherwise continue as usual. + */ + { + int ret = ipfw2_qhandler_w32(MyPacket, OUTGOING, + MiniportAdapterContext); + if (ret != PASS) { + if (ret == DROP) + return NDIS_STATUS_FAILURE; + else { //dummynet kept the packet +#ifndef WIN9X + NdisIMCopySendCompletePerPacketInfo (Packet, MyPacket); +#endif + return NDIS_STATUS_SUCCESS; //otherwise simply continue + } + } + } +#endif /* end of IPFW code */ NdisSend(&Status, pAdapt->BindingHandle, diff -ubwrp original_passthru/passthru.c kipfw/passthru.c --- original_passthru/passthru.c 2012-08-01 14:34:15.268554600 +0200 +++ kipfw/passthru.c 2012-08-01 14:34:11.534179600 +0200 @@ -47,8 +47,15 @@ NDIS_HANDLE NdisWrapperHandle; // To support ioctls from user-mode: // -#define LINKNAME_STRING L"\\DosDevices\\Passthru" -#define NTDEVICE_STRING L"\\Device\\Passthru" +#define STR2(x) #x +#define STR(x) STR2(x) +#define DOSPREFIX "\\DosDevices\\" +#define NTPREFIX "\\Device\\" +#define WIDEN2(x) L ## x +#define WIDEN(x) WIDEN2(x) +#define LINKNAME_STRING WIDEN(DOSPREFIX) WIDEN(STR(MODULENAME)) +#define NTDEVICE_STRING WIDEN(NTPREFIX) WIDEN(STR(MODULENAME)) +#define PROTOCOLNAME_STRING WIDEN(STR(MODULENAME)) NDIS_HANDLE NdisDeviceHandle = NULL; PDEVICE_OBJECT ControlDeviceObject = NULL; @@ -136,8 +143,8 @@ Return Value: // Either the Send or the SendPackets handler should be specified. // If SendPackets handler is specified, SendHandler is ignored // - MChars.SendHandler = NULL; // MPSend; - MChars.SendPacketsHandler = MPSendPackets; + MChars.SendHandler = MPSend; // IPFW: use MPSend, not SendPackets + MChars.SendPacketsHandler = NULL; Status = NdisIMRegisterLayeredMiniport(NdisWrapperHandle, &MChars, @@ -165,7 +172,7 @@ Return Value: // This is needed to ensure that NDIS can correctly determine // the binding and call us to bind to miniports below. // - NdisInitUnicodeString(&Name, L"Passthru"); // Protocol name + NdisInitUnicodeString(&Name, PROTOCOLNAME_STRING); // Protocol name PChars.Name = Name; PChars.OpenAdapterCompleteHandler = PtOpenAdapterComplete; PChars.CloseAdapterCompleteHandler = PtCloseAdapterComplete; @@ -205,6 +212,8 @@ Return Value: NdisTerminateWrapper(NdisWrapperHandle, NULL); } + ipfw_module_init(); // IPFW - start the system + return(Status); } @@ -276,7 +285,8 @@ Return Value: DispatchTable[IRP_MJ_CREATE] = PtDispatch; DispatchTable[IRP_MJ_CLEANUP] = PtDispatch; DispatchTable[IRP_MJ_CLOSE] = PtDispatch; - DispatchTable[IRP_MJ_DEVICE_CONTROL] = PtDispatch; + // IPFW we use DevIoControl ? + DispatchTable[IRP_MJ_DEVICE_CONTROL] = DevIoControl; NdisInitUnicodeString(&DeviceName, NTDEVICE_STRING); @@ -453,6 +463,7 @@ PtUnload( NdisFreeSpinLock(&GlobalLock); + ipfw_module_exit(); // IPFW unloading dummynet + DBGPRINT(("PtUnload: done!\n")); } - diff -ubwrp original_passthru/passthru.h kipfw/passthru.h --- original_passthru/passthru.h 2012-08-01 14:34:15.049804600 +0200 +++ kipfw/passthru.h 2012-08-01 14:34:11.362304600 +0200 @@ -61,6 +61,13 @@ PtDispatch( IN PIRP Irp ); +DRIVER_DISPATCH DevIoControl; +NTSTATUS +DevIoControl( + IN PDEVICE_OBJECT pDeviceObject, + IN PIRP pIrp + ); + NDIS_STATUS PtRegisterDevice( VOID @@ -366,6 +373,7 @@ PtDereferenceAdapt( typedef struct _SEND_RSVD { PNDIS_PACKET OriginalPkt; + struct mbuf* pMbuf; // IPFW extension, reference to the mbuf } SEND_RSVD, *PSEND_RSVD; // @@ -376,6 +384,7 @@ typedef struct _SEND_RSVD typedef struct _RECV_RSVD { PNDIS_PACKET OriginalPkt; + struct mbuf* pMbuf; // IPFW extension, reference to the mbuf } RECV_RSVD, *PRECV_RSVD; C_ASSERT(sizeof(RECV_RSVD) <= sizeof(((PNDIS_PACKET)0)->MiniportReserved)); @@ -475,3 +484,17 @@ IsIMDeviceStateOn( */ #define IsIMDeviceStateOn(_pP) ((_pP)->MPDeviceState == NdisDeviceStateD0 && (_pP)->PTDeviceState == NdisDeviceStateD0 ) +#include "winmissing.h" + +int ipfw_module_init(void); +void ipfw_module_exit(void); +int ipfw2_qhandler_w32(PNDIS_PACKET pNdisPacket, int direction, + NDIS_HANDLE Context); +int ipfw2_qhandler_w32_oldstyle(int direction, NDIS_HANDLE ProtocolBindingContext, + unsigned char* HeaderBuffer, unsigned int HeaderBufferSize, + unsigned char* LookAheadBuffer, unsigned int LookAheadBufferSize, + unsigned int PacketSize); +void CleanupReinjected(PNDIS_PACKET Packet, struct mbuf* m, PADAPT pAdapt); +void hexdump(PUCHAR,int, const char *); +void my_init(); +void my_exit(); \ Manca newline alla fine del file Solo in original_passthru: passthru.htm Solo in original_passthru: passthru.rc diff -ubwrp original_passthru/protocol.c kipfw/protocol.c --- original_passthru/protocol.c 2012-08-01 14:34:15.112304600 +0200 +++ kipfw/protocol.c 2012-08-01 14:34:11.409179600 +0200 @@ -841,6 +841,14 @@ Return Value: SendRsvd = (PSEND_RSVD)(Packet->ProtocolReserved); Pkt = SendRsvd->OriginalPkt; +#if 1 // IPFW - new code + //DbgPrint("SendComplete: packet %p pkt %p\n", Packet, Pkt); + if (Pkt == NULL) { //this is a reinjected packet, with no 'father' + CleanupReinjected(Packet, SendRsvd->pMbuf, pAdapt); + return; + } +#endif /* IPFW */ + #ifndef WIN9X NdisIMCopySendCompletePerPacketInfo (Pkt, Packet); #endif @@ -1021,6 +1029,13 @@ Return Value: if (pAdapt->MiniportHandle != NULL) { +#if 1 /* IPFW: query the firewall */ + int ret; + ret = ipfw2_qhandler_w32(MyPacket, INCOMING, + ProtocolBindingContext); + if (ret != PASS) + return 0; //otherwise simply continue +#endif /* end of IPFW code */ NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1); } @@ -1055,6 +1070,13 @@ Return Value: { case NdisMedium802_3: case NdisMediumWan: + //DbgPrint("EthIndicateReceive context %p, header at %p len %u, lookahead at %p len %u, packetsize %u\n",ProtocolBindingContext,HeaderBuffer,HeaderBufferSize,LookAheadBuffer,LookAheadBufferSize,PacketSize); + //hexdump(HeaderBuffer,HeaderBufferSize+LookAheadBufferSize,"EthIndicateReceive"); + { + int ret = ipfw2_qhandler_w32_oldstyle(INCOMING, ProtocolBindingContext, HeaderBuffer, HeaderBufferSize, LookAheadBuffer, LookAheadBufferSize, PacketSize); + if (ret != PASS) + return NDIS_STATUS_SUCCESS; + } NdisMEthIndicateReceive(pAdapt->MiniportHandle, MacReceiveContext, HeaderBuffer, @@ -1120,6 +1142,21 @@ Return Value: PADAPT pAdapt =(PADAPT)ProtocolBindingContext; ULONG Proc = KeGetCurrentProcessorNumber(); + /* Warning: this is a poor implementation of the PtReceiveComplete + * made by MS, and it's a well known (but never fixed) issue. + * Since the ProcessorNumber here can be different from the one + * that processed the PtReceive, sometimes NdisMEthIndicateReceiveComplete + * will not be called, causing poor performance in the incoming traffic. + * In our driver, PtReceive is called for IP packets ONLY by particulary + * old NIC drivers, and the poor performance can be seen even + * in traffic not handled by ipfw or dummynet. + * Fortunately, this is quite rare, all the incoming IP packets + * will arrive through PtReceivePacket, and this callback will never + * be called. For reinjected traffic, a workaround is done + * commuting the ReceivedIndicationFlag and calling + * NdisMEthIndicateReceiveComplete manually for each packet. + */ + if (((pAdapt->MiniportHandle != NULL) && (pAdapt->MPDeviceState == NdisDeviceStateD0)) && (pAdapt->ReceivedIndicationFlags[Proc])) @@ -1199,7 +1236,7 @@ Return Value: // See also: PtReceive(). // (VOID)NdisIMGetCurrentPacketStack(Packet, &Remaining); - if (Remaining) + if (0 && Remaining) { // // We can reuse "Packet". Indicate it up and be done with it. @@ -1247,6 +1284,13 @@ Return Value: if (pAdapt->MiniportHandle != NULL) { +#if 1 /* IPFW: query the firewall */ + int ret; + ret = ipfw2_qhandler_w32(MyPacket, INCOMING, + ProtocolBindingContext); + if (ret != PASS) + return 0; //otherwise simply continue +#endif /* end of IPFW code */ NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1); } ================================================ FILE: kipfw/winmissing.h ================================================ /* * Copyright (c) 2010 Francesco Magno, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id: winmissing.h 11647 2012-08-06 23:20:21Z luigi $ * definitions and other things needed to build freebsd kernel * modules in Windows (with the MSVC compiler) */ #ifndef _WINMISSING_H_ #define _WINMISSING_H_ #include #include #include #include #include #include typedef UCHAR u_char; typedef UCHAR u_int8_t; typedef UCHAR uint8_t; typedef USHORT u_short; typedef USHORT u_int16_t; typedef USHORT uint16_t; typedef USHORT n_short; typedef UINT u_int; typedef INT32 int32_t; typedef UINT32 u_int32_t; typedef UINT32 uint32_t; typedef ULONG u_long; typedef ULONG n_long; typedef UINT64 uint64_t; typedef UINT64 u_int64_t; typedef INT64 int64_t; typedef UINT32 in_addr_t; typedef UCHAR sa_family_t; typedef USHORT in_port_t; typedef UINT32 __gid_t; typedef UINT32 gid_t; typedef UINT32 __uid_t; typedef UINT32 uid_t; typedef ULONG n_time; typedef char* caddr_t; /* linux_lookup uses __be32 and __be16 in the prototype */ typedef uint32_t __be32; /* XXX __u32 __bitwise __be32 */ typedef uint16_t __be16; /* XXX */ //*** DEBUG STUFF *** /* * To see the debugging messages you need DbgView http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx */ #define printf DbgPrint #define log(lev, ...) DbgPrint(__VA_ARGS__) const char* texify_cmd(int i); const char* texify_proto(unsigned int p); //*** end DEBUG STUFF *** #define snprintf _snprintf #define timespec timeval struct timeval { long tv_sec; long tv_usec; }; struct in_addr { in_addr_t s_addr; }; struct sockaddr_in { uint8_t sin_len; sa_family_t sin_family; in_port_t sin_port; struct in_addr sin_addr; char sin_zero[8]; }; /* XXX watch out, windows names are actually longer */ #define IFNAMSIZ 16 #define IF_NAMESIZE 16 #define ETHER_ADDR_LEN 6 /* we do not include the windows headers for in6_addr so * we need to provide our own definition for the kernel. */ struct in6_addr { union { uint8_t __u6_addr8[16]; uint16_t __u6_addr16[8]; uint32_t __u6_addr32[4]; } __u6_addr; /* 128-bit IP6 address */ }; #define htons(x) RtlUshortByteSwap(x) #define ntohs(x) RtlUshortByteSwap(x) #define htonl(x) RtlUlongByteSwap(x) #define ntohl(x) RtlUlongByteSwap(x) #define ENOSPC 28 /* No space left on device */ #define EOPNOTSUPP 45 /* Operation not supported */ #define EACCES 13 /* Permission denied */ #define ENOENT 2 /* No such file or directory */ #define EINVAL 22 /* Invalid argument */ #define EPROTONOSUPPORT 43 /* Protocol not supported */ #define ENOMEM 12 /* Cannot allocate memory */ #define EEXIST 17 /* File exists */ #define ESRCH 3 #define ENOBUFS 55 /* No buffer space available */ #define EBUSY 16 /* Module busy */ #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) #define __packed #define __aligned(x); #define __user #define __init #define __exit #define __func__ __FUNCTION__ #define inline __inline struct sockaddr_in6 { int dummy; }; //SPINLOCKS #define DEFINE_SPINLOCK(x) NDIS_SPIN_LOCK x #define mtx_init(m,a,b,c) NdisAllocateSpinLock(m) #define mtx_lock(_l) NdisAcquireSpinLock(_l) #define mtx_unlock(_l) NdisReleaseSpinLock(_l) #define mtx_destroy(m) NdisFreeSpinLock(m) #define mtx_assert(a, b) #define rw_rlock(_l) NdisAcquireSpinLock(_l) #define rw_runlock(_l) NdisReleaseSpinLock(_l) #define rw_assert(a, b) #define rw_wlock(_l) NdisAcquireSpinLock(_l) #define rw_wunlock(_l) NdisReleaseSpinLock(_l) #define rw_destroy(_l) NdisFreeSpinLock(_l) #define rw_init(_l, msg) NdisAllocateSpinLock(_l) #define rw_init_flags(_l, s, v) NdisAllocateSpinLock(_l) #define rwlock_t NDIS_SPIN_LOCK #define spinlock_t NDIS_SPIN_LOCK #define s6_addr __u6_addr.__u6_addr8 struct icmphdr { u_char icmp_type; /* type of message, see below */ u_char icmp_code; /* type sub code */ u_short icmp_cksum; /* ones complement cksum of struct */ }; #define ICMP_ECHO 8 /* echo service */ #define IPOPT_OPTVAL 0 /* option ID */ #define IPOPT_OLEN 1 /* option length */ #define IPOPT_EOL 0 /* end of option list */ #define IPOPT_NOP 1 /* no operation */ #define IPOPT_LSRR 131 /* loose source route */ #define IPOPT_SSRR 137 /* strict source route */ #define IPOPT_RR 7 /* record packet route */ #define IPOPT_TS 68 /* timestamp */ #define IPPROTO_ICMP 1 /* control message protocol */ #define IPPROTO_TCP 6 /* tcp */ #define IPPROTO_UDP 17 /* user datagram protocol */ #define IPPROTO_ICMPV6 58 /* ICMP6 */ #define IPPROTO_SCTP 132 /* SCTP */ #define IPPROTO_HOPOPTS 0 /* IP6 hop-by-hop options */ #define IPPROTO_ROUTING 43 /* IP6 routing header */ #define IPPROTO_FRAGMENT 44 /* IP6 fragmentation header */ #define IPPROTO_DSTOPTS 60 /* IP6 destination option */ #define IPPROTO_AH 51 /* IP6 Auth Header */ #define IPPROTO_ESP 50 /* IP6 Encap Sec. Payload */ #define IPPROTO_NONE 59 /* IP6 no next header */ #define IPPROTO_PIM 103 /* Protocol Independent Mcast */ #define IPPROTO_IPV6 41 #define IPPROTO_IPV4 4 /* IPv4 encapsulation */ #define INADDR_ANY (uint32_t)0x00000000 #define AF_INET 2 /* internetwork: UDP, TCP, etc. */ #define AF_LINK 18 /* Link layer interface */ #define IN_CLASSD(i) (((uint32_t)(i) & 0xf0000000) == 0xe0000000) #define IN_MULTICAST(i) IN_CLASSD(i) #define DROP 0 #define PASS 1 #define DUMMYNET 2 #define INCOMING 0 #define OUTGOING 1 size_t strlcpy(char *dst, const char *src, size_t siz); void do_gettimeofday(struct timeval *tv); int ffs(int bits); int time_uptime_w32(); #endif /* _WINMISSING_H_ */ ================================================ FILE: planetlab/Makefile.planetlab ================================================ # $Id: Makefile 11687 2012-08-12 20:51:25Z luigi $ # # Top level makefile for building ipfw/dummynet (kernel and userspace). # You can run it manually or also under the Planetlab build. # Planetlab wants also the 'install' target. # # To build on system with non standard Kernel sources or userland files, # you should run this with # # make KERNELPATH=/path/to/linux-2.x.y.z USRDIR=/path/to/usr # # We assume that $(USRDIR) contains include/ and lib/ used to build userland. # include Makefile.inc DATE ?= $(shell date +%Y%m%d) SNAPSHOT_NAME=$(DATE)-ipfw3.tgz BINDIST=$(DATE)-dummynet-linux.tgz WINDIST=$(DATE)-dummynet-windows.zip .PHONY: ipfw kipfw ########################################### # windows x86 and x64 specific variables # ########################################### # DRIVE must be the hard drive letter where DDK is installed # DDKDIR must be the path to the DDK root directory, without drive letter # TARGETOS (x64 only) must be one of the following: # wnet -> windows server 2003 # wlh -> windows vista and windows server 2008 # win7 -> windows 7 # future version must be added here export WIN64 export DDK export DRIVE export DDKDIR DRIVE ?= C: DDKDIR ?= /WinDDK/7600.16385.1 DDK = $(DRIVE)$(DDKDIR) TARGETOS=win7 _all: all clean distclean: -@(cd ipfw && $(MAKE) $(@) ) -@rm -rf kipfw-mod binary64/[A-hj-z]* all: kipfw ipfw @# -- windows only ifeq ($(OSARCH),Windows) # copy files ifeq ($(WIN64),) -@ cp ipfw/ipfw.exe kipfw-mod/$(OBJDIR)/ipfw.sys binary/ -@ cp kipfw/*.inf binary/ else -@ cp binary/* binary64/ -@ cp ipfw/ipfw.exe kipfw-mod/objchk_win7_amd64/amd64/ipfw.sys binary64/ endif # WIN64 endif # Windows win64: $(MAKE) WIN64=1 # kipfw-src prepares the sources for the kernel part. # The windows files (passthru etc.) are modified version of the # examples found in the $(DDK)/src/network/ndis/passthru/driver/ # They can be re-created using the 'ndis-glue' target # # We need a sed trick to remove newlines from the patchfile. ndis-glue: -@mkdir -p kipfw-mod cp $(DDK)/src/network/ndis/passthru/driver/*.[ch] kipfw-mod cat kipfw/win-passthru.diff | sed "s/$$(printf '\r')//g" | (cd kipfw-mod; patch ) kipfw-src: -@rm -rf kipfw-mod -@mkdir -p kipfw-mod -@cp -Rp kipfw/* kipfw-mod -@cp `find sys -name \*.c` kipfw-mod -@(cd kipfw-mod && $(MAKE) include_e) ifeq ($(OSARCH),Windows) make ndis-glue endif snapshot: $(MAKE) distclean (cd ..; tar cvzhf /tmp/$(SNAPSHOT_NAME) --exclude .svn \ --exclude README.openwrt --exclude tags --exclude NOTES \ --exclude tcc-0.9.25-bsd \ --exclude original_passthru \ --exclude ipfw3.diff --exclude add_rules \ --exclude test --exclude test_ \ ipfw3-2012 ) bindist: $(MAKE) clean $(MAKE) all tar cvzf /tmp/$(BINDIST) ipfw/ipfw ipfw/ipfw.8 kipfw-mod/ipfw_mod.ko windist: $(MAKE) clean -$(MAKE) all -rm /tmp/$(WINDIST) zip -r /tmp/$(WINDIST) binary -x \*.svn\* ipfw: @(cd ipfw && $(MAKE) $(@) ) kipfw: kipfw-src ifeq ($(WIN64),) # linux or windows 32 bit @(cd kipfw-mod && $(MAKE) $(@) ) else #--- windows 64 bit, we use build.exe and nmake rm -f kipfw-mod/Makefile mkdir kipfw-mod/tmpbuild # check mysetenv.sh bash kipfw/mysetenv.sh $(DRIVE) $(DDKDIR) $(TARGETOS) endif IPF3_REPO ?= svn+ssh://some.host/some/path/ipfw3-2012 planetlab_update: # clean and create a local working directory rm -rf /tmp/pl-tmp mkdir -p /tmp/pl-tmp/pl mkdir -p /tmp/pl-tmp/ol2 # get the trunk version of the PlanetLab repository # to specify the sshkey use the .ssh/config file (cd /tmp/pl-tmp/pl; \ svn co svn+ssh://svn.planet-lab.org/svn/ipfw/trunk) # get an updated copy of the main ipfw repository (cd /tmp/pl-tmp/ol2; svn export $(IPFW3_REPO) ) # copy the new version over the old one (cd /tmp/pl-tmp; cp -rP ol2/ipfw3/* pl/trunk) # files cleanup in the old version (cd /tmp/pl-tmp; diff -r ol2/ipfw3 pl/trunk | \ grep -v "svn" | awk '{print $$3 $$4}' | \ sed 's/:/\//' | xargs rm -rf) # local adjustments here rm -rf /tmp/pl-tmp/pl/trunk/planetlab/check_planetlab_sync # commit to the remote repo @echo "Please, revise the update with the commands:" @echo "(cd /tmp/pl-tmp/pl/trunk; svn diff)" @echo "(cd /tmp/pl-tmp/pl/trunk; svn status)" @echo "and commit with:" @echo "(cd /tmp/pl-tmp/pl/trunk; svn ci -m 'Update from the mail ipfw repo.')" openwrt_release: # create a temporary directory $(eval TMPDIR := $(shell mktemp -d -p /tmp/ ipfw3_openwrt_XXXXX)) # create the source destination directory $(eval IPFWDIR := ipfw3-$(DATE)) $(eval DSTDIR := $(TMPDIR)/$(IPFWDIR)) mkdir $(DSTDIR) # copy the package, clean objects and svn info cp -r ./ipfw ./kipfw-mod glue.h Makefile ./configuration README $(DSTDIR) (cd $(DSTDIR); make -s distclean; find . -name .svn | xargs rm -rf) (cd $(TMPDIR); tar czf $(IPFWDIR).tar.gz $(IPFWDIR)) # create the port files in /tmp/ipfw3-port $(eval PORTDIR := $(TMPDIR)/ipfw3) mkdir -p $(PORTDIR)/patches # generate the Makefile, PKG_VERSION and PKG_MD5SUM md5sum $(DSTDIR).tar.gz | cut -d ' ' -f 1 > $(TMPDIR)/md5sum cat ./OPENWRT/Makefile | \ sed s/PKG_VERSION:=/PKG_VERSION:=$(DATE)/ | \ sed s/PKG_MD5SUM:=/PKG_MD5SUM:=`cat $(TMPDIR)/md5sum`/ \ > $(PORTDIR)/Makefile @echo "" @echo "The openwrt port is in $(TMPDIR)/ipfw3-port" @echo "The source file should be copied to the public server:" @echo "scp $(DSTDIR).tar.gz marta@info.iet.unipi.it:~marta/public_html/dummynet" @echo "after this the temporary directory $(TMPDIR) can be removed." install: diff: -@(diff -upr $(BSD_HEAD)/sbin/ipfw ipfw) -@(diff -upr $(BSD_HEAD)/sys sys) ================================================ FILE: planetlab/check_planetlab_sync ================================================ #!/bin/sh # # This script is used to check the sync of the local repo # with the remote planetlab repository tmpfile=/tmp/chech_planetlab_sync.tmp # check for local copy sync svn diff > /tmp/chech_planetlab_sync.tmp if [ -s $tmpfile ] ; then echo "Local repo unsynced, can not continue" exit -1 rm $tmpfile fi # export remote copy svn --force export http://svn.planet-lab.org/svn/ipfw/trunk ./ >> /dev/null # check diffs again, output to the user svn diff svn status | grep -v check_planetlab_sync ================================================ FILE: planetlab/ipfw ================================================ #!/bin/sh # # ipfw init the emulation service # # chkconfig: 2345 09 91 # description: ipfw init and shutdown # # Source function library. . /etc/init.d/functions IPFW=ipfw IPFW_BACKEND=/vsys/ipfw-be IPFW_MOD=ipfw_mod if [ ! -x /sbin/$IPFW ] || [ ! -x ${IPFW_BACKEND} ]; then echo -n "/sbin/$IPFW does not exist."; warning; echo exit 0 fi # Load the ipfw module, and initialize netconfig start() { # load the module modprobe $IPFW_MOD >& /dev/null let ret=$?; [ $ret -eq 0 ] && success || failure # init netconfig echo "super dbcleanup" | ${IPFW_BACKEND} root >& /dev/null echo "super init" | ${IPFW_BACKEND} root >& /dev/null return $ret } stop() { # clean netconfig stuff echo "super dbcleanup" | ${IPFW_BACKEND} root >& /dev/null echo "Unloading $IPFW_MOD module: " # unload the ipfw module rmmod ${IPFW_MOD} let ret=$?; [ $ret -eq 0 ] && success || failure return $ret } # echo the ipfw status status() { # check for module presence grep '^ipfw_mod$' /proc/modules >& /dev/null || echo "ipfw not loaded" && return 0 # Show active users USERS=$(grep BLOCK /tmp/ff | wc -l) echo "ipfw is loaded and there are currently ${USERS} with active emulation." return 0 } # main case "$1" in start) start RETVAL=$? ;; stop) stop RETVAL=$? ;; restart) stop start RETVAL=$? ;; status) status RETVAL=$? ;; *) echo $"Usage: $0 {start|stop|restart|status}" exit 1 ;; esac exit $RETVAL ================================================ FILE: planetlab/ipfw.cron ================================================ # Runs every 5 minutes and clean ipfw expired rules # $Id: ipfw.cron 6069 2010-04-15 09:35:33Z marta $ */5 * * * * root echo "super killexpired" | /vsys/ipfw-be root > /dev/null 2>&1 ================================================ FILE: planetlab/ipfwroot.spec ================================================ # # Marta Carbone # 2009 - Universita` di Pisa # License is BSD. # kernel_release, kernel_version and kernel_arch are expected to be set by the build to e.g. # kernel_release : 24.onelab (24 is then the planetlab taglevel) # kernel_version : 2.6.27.57 | 2.6.32 (57 in the 27 case is the patch level) # kernel_arch : i686 | x86_64 # the 2012 release was pulled from http://info.iet.unipi.it/~marta/dummynet/ipfw3-20120610.tar.gz # seel also http://sourceforge.net/p/dummynet/code # in 2013 Marta has moved to sourceforge at # git clone git://git.code.sf.net/p/dummynet/code your read-only code %define name ipfwroot %define version 3 %define taglevel 1 # when no planetlab kernel is being built, kernel_version is defined but empty %define _with_planetlab_kernel %{?kernel_version:1}%{!?kernel_version:0} # we need to make sure that this rpm gets upgraded when the kernel release changes %if %{_with_planetlab_kernel} # with the planetlab kernel %define pl_kernel_taglevel %( echo %{kernel_release} | cut -d. -f1 ) %define ipfw_release %{kernel_version}.%{pl_kernel_taglevel} %else # with the stock kernel # this line below #%define ipfw_release %( rpm -q --qf "%{version}" kernel-headers ) # causes recursive macro definition no matter how much you quote %define percent % %define braop \{ %define bracl \} %define kernel_version %( rpm -q --qf %{percent}%{braop}version%{bracl} kernel-headers ) %define kernel_release %( rpm -q --qf %{percent}%{braop}release%{bracl} kernel-headers ) %define kernel_arch %( rpm -q --qf %{percent}%{braop}arch%{bracl} kernel-headers ) %define ipfw_release %{kernel_version}.%{kernel_release} %endif %define release %{ipfw_release}.%{taglevel}%{?pldistro:.%{pldistro}}%{?date:.%{date}} # guess which convention is used; k27 and before used dash, k32 uses dot %define kernelpath_dash /usr/src/kernels/%{kernel_version}-%{kernel_release}-%{kernel_arch} %define kernelpath_dot /usr/src/kernels/%{kernel_version}-%{kernel_release}.%{kernel_arch} %define kernelpath %( [ -d %{kernelpath_dot} ] && echo %{kernelpath_dot} || echo %{kernelpath_dash} ) # the k32 kernel currently builds e.g. /lib/modules/2.6.32-0.onelab.2010.12.07-i686 # the k27 and before does not have the -i686 part %define kernel_id_old %{kernel_version}-%{kernel_release} %define kernel_id_new %{kernel_version}-%{kernel_release}.%{kernel_arch} %define kernel_id %( [ -d %{kernelpath_dot} ] && echo %{kernel_id_new} || echo %{kernel_id_old} ) Summary: ipfw and dummynet for Linux Name: %{name} Version: %{version} Release: %{release} License: BSD Group: System Environment/Kernel Source0: %{name}-%{version}.tar.bz2 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-buildroot Requires: kernel = %{kernel_version}-%{kernel_release} # in fedora20 cronie does not provide vixie-cronie anymore, just cronie Requires: cronie Requires: vsys-scripts Obsoletes: ipfw Vendor: unipi Packager: PlanetLab # XXX ask Distribution: PlanetLab %{plrelease} URL: %{SCMURL} %description ipfw is the Linux port of the FreeBSD ipfw and dummynet packages %prep %setup %build # clean the rpm build directory rm -rf $RPM_BUILD_ROOT %__make KERNELPATH=%kernelpath clean %__make KERNELPATH=%kernelpath IPFW_PLANETLAB=1 %install install -D -m 755 kipfw-mod/ipfw_mod.ko $RPM_BUILD_ROOT/lib/modules/%{kernel_id}/net/netfilter/ipfw_mod.ko install -D -m 755 ipfw/ipfw $RPM_BUILD_ROOT/sbin/ipfw install -D -m 644 planetlab/ipfw.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/ipfw.cron install -D -m 755 planetlab/ipfw $RPM_BUILD_ROOT/etc/rc.d/init.d/ipfw %clean rm -rf $RPM_BUILD_ROOT %post ### this script is also triggered while the node image is being created at build-time # some parts of the script do not make sense in this context # this is why the build exports PL_BOOTCD=1 in such cases depmod -a %{kernel_id} /sbin/chkconfig --add ipfw # start the service if not building [ -z "$PL_BOOTCD" ] && service ipfw start %postun # stop the service if not building [ -z "$PL_BOOTCD" ] && service ipfw stop # here there is a list of the final installation directories %files %defattr(-,root,root) %dir /lib/modules/%{kernel_id} /lib/modules/%{kernel_id}/net/netfilter/ipfw_mod.ko /sbin/ipfw %{_sysconfdir}/cron.d/ipfw.cron /etc/rc.d/init.d/ipfw %changelog * Mon Jul 09 2012 Thierry Parmentelat - ipfw-20120610-2 - cosmetic changes only in specfile * Fri Jun 15 2012 Thierry Parmentelat - ipfw-20120610-1 - integrated ipfw3 as of 20120610 from upstream * Mon Oct 24 2011 Thierry Parmentelat - ipfw-0.9-23 - for building against k32 on f8 * Sun Oct 02 2011 Thierry Parmentelat - ipfw-0.9-22 - rpm version number has the kernel taglevel embedded * Fri Jun 10 2011 Thierry Parmentelat - ipfw-0.9-21 - build tweaks for gcc-4.6 on f15 * Sun Jan 23 2011 Thierry Parmentelat - ipfw-0.9-20 - tweaks for compiling on k32/64 bits * Wed Dec 08 2010 Thierry Parmentelat - ipfw-0.9-19 - fix detection of kernel conventions * Tue Dec 07 2010 Thierry Parmentelat - ipfw-0.9-18 - guess conventions for either <=k27 or >=k32 * Tue Jun 15 2010 Baris Metin - ipfw-0.9-17 - testing git only module-tag * Tue Jun 15 2010 Baris Metin - ipfw-0.9-16 - tagging ipfw to test module-tools on (pure) git * Wed May 12 2010 Talip Baris Metin - ipfw-0.9-15 - tagging for obsoletes * Tue Apr 27 2010 Thierry Parmentelat - ipfw-0.9-13 - Update to the ipfw3 version of the dummynet code. * Mon Apr 12 2010 Thierry Parmentelat - ipfw-0.9-11 - add ipfw initialization script to chkconfig * Wed Mar 03 2010 Talip Baris Metin - ipfw-0.9-10 - - Load module at installation - Marta * Mon Jan 11 2010 Thierry Parmentelat - ipfw-0.9-9 - consistent with vsys-scripts-0.95-13 * Mon Jan 11 2010 Marta Carbone - Integrated the ipfw rules cleanup into the backend * Sat Jan 09 2010 Thierry Parmentelat - ipfw-0.9-8 - builds on 2.6.22 & 2.6.27 - for 32 and 64 bits * Wed Jan 06 2010 Marta Carbone - move to dummynet2, added support for table lookup - added the vsys-script dependencies and the ipfw initialization * Tue Dec 15 2009 Marta Carbone - more work on the radix code, added sysctl read/write support * Sun Nov 29 2009 Thierry Parmentelat - ipfw-0.9-7 - added missing qsort.c - tag 0.9-6 was broken * Thu Nov 26 2009 Thierry Parmentelat - ipfw-0.9-6 - root: removed goto into the main ipfw switch, enabled slice_id matching - slice: completely move netconfig checks into the backend * Mon Nov 09 2009 Thierry Parmentelat - ipfw-0.9-5 - additional features on matching packets, including uid match * Mon Sep 07 2009 Thierry Parmentelat - ipfw-0.9-4 - on behalf of Marta Carbone, more options and features * Thu Jul 23 2009 Thierry Parmentelat - ipfw-0.9-3 - fixed memory usage issue * Wed Jul 15 2009 Thierry Parmentelat - ipfw-0.9-2 - patch for building on x86_64 * Thu Jun 25 2009 Marta Carbone - post installation removed for deployment, moved manpages to the slice package * Fri Apr 17 2009 Marta Carbone - Initial release ================================================ FILE: planetlab/ipfwslice.spec ================================================ # # TODO: # restart crond # modprobe ipfw_mod.ko (depmod ?) # # Marta Carbone # 2009 - Universita` di Pisa # License is BSD. # the 2012 release was pulled from http://info.iet.unipi.it/~marta/dummynet/ipfw3-20120610.tar.gz # seel also http://sourceforge.net/p/dummynet/code # in 2013 Marta has moved to sourceforge at # git clone git://git.code.sf.net/p/dummynet/code your read-only code %define name ipfwslice %define version 3 %define taglevel 1 %define release %{taglevel}%{?pldistro:.%{pldistro}}%{?date:.%{date}} Summary: ipfw and dummynet for Linux Name: %{name} Version: %{version} Release: %{release} License: BSD Group: System Environment/Kernel Source0: %{name}-%{version}.tar.bz2 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-buildroot Vendor: unipi Packager: PlanetLab Distribution: PlanetLab %{plrelease} URL: %{SCMURL} %description the frontend part of the ipfw planetlab package %prep %setup %build rm -rf $RPM_BUILD_ROOT %install install -D -m 755 planetlab/netconfig $RPM_BUILD_ROOT/sbin/netconfig install -D -m 755 planetlab/ipfw.8.gz $RPM_BUILD_ROOT/%{_mandir}/man8/ipfw.8.gz %clean rm -rf $RPM_BUILD_ROOT # here there is a list of the final installation directories %files %defattr(-,root,root) /sbin/netconfig %{_mandir}/man8/ipfw.8* %changelog * Mon Jul 09 2012 Thierry Parmentelat - ipfw-20120610-2 - cosmetic changes only in specfile * Fri Jun 15 2012 Thierry Parmentelat - ipfw-20120610-1 - integrated ipfw3 as of 20120610 from upstream * Mon Oct 24 2011 Thierry Parmentelat - ipfw-0.9-23 - for building against k32 on f8 * Sun Oct 02 2011 Thierry Parmentelat - ipfw-0.9-22 - rpm version number has the kernel taglevel embedded * Fri Jun 10 2011 Thierry Parmentelat - ipfw-0.9-21 - build tweaks for gcc-4.6 on f15 * Sun Jan 23 2011 Thierry Parmentelat - ipfw-0.9-20 - tweaks for compiling on k32/64 bits * Wed Dec 08 2010 Thierry Parmentelat - ipfw-0.9-19 - fix detection of kernel conventions * Tue Dec 07 2010 Thierry Parmentelat - ipfw-0.9-18 - guess conventions for either <=k27 or >=k32 * Tue Jun 15 2010 Baris Metin - ipfw-0.9-17 - testing git only module-tag * Tue Jun 15 2010 Baris Metin - ipfw-0.9-16 - tagging ipfw to test module-tools on (pure) git * Wed May 12 2010 Talip Baris Metin - ipfw-0.9-15 - tagging for obsoletes * Tue Apr 27 2010 Thierry Parmentelat - ipfw-0.9-13 - Update to the ipfw3 version of the dummynet code. * Mon Apr 12 2010 Thierry Parmentelat - ipfw-0.9-11 - add ipfw initialization script to chkconfig * Wed Mar 03 2010 Talip Baris Metin - ipfw-0.9-10 - - Load module at installation - Marta * Mon Jan 11 2010 Thierry Parmentelat - ipfw-0.9-9 - consistent with vsys-scripts-0.95-13 * Sat Jan 09 2010 Thierry Parmentelat - ipfw-0.9-8 - builds on 2.6.22 & 2.6.27 - for 32 and 64 bits * Tue Dec 15 2009 Marta Carbone - more work on the radix code, added sysctl read/write support * Sun Nov 29 2009 Thierry Parmentelat - ipfw-0.9-7 - added missing qsort.c - tag 0.9-6 was broken * Thu Nov 26 2009 Thierry Parmentelat - ipfw-0.9-6 - root: removed goto into the main ipfw switch, enabled slice_id matching - slice: completely move netconfig checks into the backend * Mon Nov 09 2009 Thierry Parmentelat - ipfw-0.9-5 - additional features on matching packets, including uid match * Mon Sep 07 2009 Thierry Parmentelat - ipfw-0.9-4 - on behalf of Marta Carbone, more options and features * Thu Jul 23 2009 Thierry Parmentelat - ipfw-0.9-3 - fixed memory usage issue * Wed Jul 15 2009 Thierry Parmentelat - ipfw-0.9-2 - patch for building on x86_64 * Thu Jun 25 2009 Marta Carbone - Initial release ================================================ FILE: planetlab/netconfig ================================================ #!/bin/sh # # Marta Carbone, Luigi Rizzo # Copyright (C) 2009 Universita` di Pisa # $Id: netconfig 4533 2009-12-16 14:39:23Z luigi $ # # This script is the frontend to be used with the vsys system. # It simply passes information to the backend and gets back the reply PIPE_IN=/vsys/ipfw-be.in PIPE_OUT=/vsys/ipfw-be.out sudo sh -c "echo $* >> ${PIPE_IN}" sudo sh -c "cat ${PIPE_OUT}" ================================================ FILE: planetlab/planetlab-tags.mk ================================================ # $Id: planetlab-tags.mk 7450 2010-10-18 11:17:43Z marta $ # These are good to build the ipfw modules from svn on kernels 2.6.22 # and are used to fetch files from the onelab2 repository. linux-2.6-SVNBRANCH := 22 linux-2.6-SVNPATH := http://svn.planet-lab.org/svn/linux-2.6/tags/linux-2.6-22-39-1 ipfwsrc-SVNPATH := svn+ssh://luigi%40onelab2.iet.unipi.it/home/svn/ports-luigi/dummynet-branches/ipfw3 ================================================ FILE: planetlab/planetlab.mk ================================================ # $Id: planetlab.mk 4533 2009-12-16 14:39:23Z luigi $ # .mk file to build a module kernel-MODULES := linux-2.6 kernel-SPEC := kernel-2.6.spec kernel-BUILD-FROM-SRPM := yes ifeq "$(HOSTARCH)" "i386" kernel-RPMFLAGS:= --target i686 else kernel-RPMFLAGS:= --target $(HOSTARCH) endif ALL += kernel ipfwroot-MODULES := ipfwsrc ipfwroot-SPEC := planetlab/ipfwroot.spec ipfwroot-DEPEND-DEVEL-RPMS := kernel-devel ipfwroot-SPECVARS = kernel_version=$(kernel.rpm-version) \ kernel_release=$(kernel.rpm-release) \ kernel_arch=$(kernel.rpm-arch) ALL += ipfwroot ipfwslice-MODULES := ipfwsrc ipfwslice-SPEC := planetlab/ipfwslice.spec ipfwslice-SPECVARS = kernel_version=$(kernel.rpm-version) \ kernel_release=$(kernel.rpm-release) \ kernel_arch=$(kernel.rpm-arch) ALL += ipfwslice ================================================ FILE: planetlab/sample_hook ================================================ #!/bin/sh # # Marta Carbone # 2009 - Universita` di Pisa # # This is a sample hook file in charge to collect # statistical information on netconfig usage. It dumps # on a log file slicename, port and the configuration string # used to configure a dummynet experiment. # # Each time a user configure a dummynet port, this file # will be executed. # The following variables will be passed as argument: # # ${SLICE} ${PORT} ${CONFIG_STRING} # ${SLICE} The slicename executing the netconfig command # ${PORT} The port to be configured # ${CONFIG_STRING} The configuration string # # Note that this script can get additional information # by executing the ipfw command, e.g. # ipfw list # list of installed rules # ipfw show # list of rules and statistical information # ipfw pipe show # list of pipes # # a complete list of ipfw commands is available at: # http://www.freebsd.org/cgi/man.cgi?query=ipfw&sektion=8 # logfile LOG_FILE=/tmp/ipfw_hook.log echo -e `date` >> ${LOG_FILE} echo "$*" >> ${LOG_FILE} ================================================ FILE: sys/net/if.h ================================================ #include ================================================ FILE: sys/net/pfil.h ================================================ /* $FreeBSD: src/sys/net/pfil.h,v 1.16 2007/06/08 12:43:25 gallatin Exp $ */ /* $NetBSD: pfil.h,v 1.22 2003/06/23 12:57:08 martin Exp $ */ /*- * Copyright (c) 1996 Matthew R. Green * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NET_PFIL_H_ #define _NET_PFIL_H_ #include #include #include #include #include #include struct mbuf; struct ifnet; struct inpcb; /* * The packet filter hooks are designed for anything to call them to * possibly intercept the packet. */ struct packet_filter_hook { TAILQ_ENTRY(packet_filter_hook) pfil_link; int (*pfil_func)(void *, struct mbuf **, struct ifnet *, int, struct inpcb *); void *pfil_arg; }; #define PFIL_IN 0x00000001 #define PFIL_OUT 0x00000002 #define PFIL_WAITOK 0x00000004 #define PFIL_ALL (PFIL_IN|PFIL_OUT) typedef TAILQ_HEAD(pfil_list, packet_filter_hook) pfil_list_t; #define PFIL_TYPE_AF 1 /* key is AF_* type */ #define PFIL_TYPE_IFNET 2 /* key is ifnet pointer */ struct pfil_head { pfil_list_t ph_in; pfil_list_t ph_out; int ph_type; int ph_nhooks; #if defined( __linux__ ) || defined( _WIN32 ) rwlock_t ph_mtx; #else struct rmlock ph_lock; #endif union { u_long phu_val; void *phu_ptr; } ph_un; #define ph_af ph_un.phu_val #define ph_ifnet ph_un.phu_ptr LIST_ENTRY(pfil_head) ph_list; }; int pfil_add_hook(int (*func)(void *, struct mbuf **, struct ifnet *, int, struct inpcb *), void *, int, struct pfil_head *); int pfil_remove_hook(int (*func)(void *, struct mbuf **, struct ifnet *, int, struct inpcb *), void *, int, struct pfil_head *); int pfil_run_hooks(struct pfil_head *, struct mbuf **, struct ifnet *, int, struct inpcb *inp); int pfil_head_register(struct pfil_head *); int pfil_head_unregister(struct pfil_head *); struct pfil_head *pfil_head_get(int, u_long); #define PFIL_HOOKED(p) ((p)->ph_nhooks > 0) #define PFIL_LOCK_INIT(p) \ rm_init_flags(&(p)->ph_lock, "PFil hook read/write mutex", RM_RECURSE) #define PFIL_LOCK_DESTROY(p) rm_destroy(&(p)->ph_lock) #define PFIL_RLOCK(p, t) rm_rlock(&(p)->ph_lock, (t)) #define PFIL_WLOCK(p) rm_wlock(&(p)->ph_lock) #define PFIL_RUNLOCK(p, t) rm_runlock(&(p)->ph_lock, (t)) #define PFIL_WUNLOCK(p) rm_wunlock(&(p)->ph_lock) #define PFIL_LIST_LOCK() mtx_lock(&pfil_global_lock) #define PFIL_LIST_UNLOCK() mtx_unlock(&pfil_global_lock) static __inline struct packet_filter_hook * pfil_hook_get(int dir, struct pfil_head *ph) { if (dir == PFIL_IN) return (TAILQ_FIRST(&ph->ph_in)); else if (dir == PFIL_OUT) return (TAILQ_FIRST(&ph->ph_out)); else return (NULL); } #endif /* _NET_PFIL_H_ */ ================================================ FILE: sys/net/radix.c ================================================ /*- * Copyright (c) 1988, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)radix.c 8.5 (Berkeley) 5/19/95 * $FreeBSD: head/sys/net/radix.c 200354 2009-12-10 10:34:30Z luigi $ */ /* * Routines to build and maintain radix trees for routing lookups. */ #include #ifdef _KERNEL #include #include #include #include #include #include #include #include "opt_mpath.h" #ifdef RADIX_MPATH #include #endif #else /* !_KERNEL */ #include #include #include #define log(x, arg...) fprintf(stderr, ## arg) #define panic(x) fprintf(stderr, "PANIC: %s", x), exit(1) #define min(a, b) ((a) < (b) ? (a) : (b) ) #include #endif /* !_KERNEL */ static int rn_walktree_from(struct radix_node_head *h, void *a, void *m, walktree_f_t *f, void *w); static int rn_walktree(struct radix_node_head *, walktree_f_t *, void *); static struct radix_node *rn_insert(void *, struct radix_node_head *, int *, struct radix_node [2]), *rn_newpair(void *, int, struct radix_node[2]), *rn_search(void *, struct radix_node *), *rn_search_m(void *, struct radix_node *, void *); static int max_keylen; static struct radix_mask *rn_mkfreelist; static struct radix_node_head *mask_rnhead; /* * Work area -- the following point to 3 buffers of size max_keylen, * allocated in this order in a block of memory malloc'ed by rn_init. * rn_zeros, rn_ones are set in rn_init and used in readonly afterwards. * addmask_key is used in rn_addmask in rw mode and not thread-safe. */ static char *rn_zeros, *rn_ones, *addmask_key; #define MKGet(m) { \ if (rn_mkfreelist) { \ m = rn_mkfreelist; \ rn_mkfreelist = (m)->rm_mklist; \ } else \ R_Malloc(m, struct radix_mask *, sizeof (struct radix_mask)); } #define MKFree(m) { (m)->rm_mklist = rn_mkfreelist; rn_mkfreelist = (m);} #define rn_masktop (mask_rnhead->rnh_treetop) static int rn_lexobetter(void *m_arg, void *n_arg); static struct radix_mask * rn_new_radix_mask(struct radix_node *tt, struct radix_mask *next); static int rn_satisfies_leaf(char *trial, struct radix_node *leaf, int skip); /* * The data structure for the keys is a radix tree with one way * branching removed. The index rn_bit at an internal node n represents a bit * position to be tested. The tree is arranged so that all descendants * of a node n have keys whose bits all agree up to position rn_bit - 1. * (We say the index of n is rn_bit.) * * There is at least one descendant which has a one bit at position rn_bit, * and at least one with a zero there. * * A route is determined by a pair of key and mask. We require that the * bit-wise logical and of the key and mask to be the key. * We define the index of a route to associated with the mask to be * the first bit number in the mask where 0 occurs (with bit number 0 * representing the highest order bit). * * We say a mask is normal if every bit is 0, past the index of the mask. * If a node n has a descendant (k, m) with index(m) == index(n) == rn_bit, * and m is a normal mask, then the route applies to every descendant of n. * If the index(m) < rn_bit, this implies the trailing last few bits of k * before bit b are all 0, (and hence consequently true of every descendant * of n), so the route applies to all descendants of the node as well. * * Similar logic shows that a non-normal mask m such that * index(m) <= index(n) could potentially apply to many children of n. * Thus, for each non-host route, we attach its mask to a list at an internal * node as high in the tree as we can go. * * The present version of the code makes use of normal routes in short- * circuiting an explict mask and compare operation when testing whether * a key satisfies a normal route, and also in remembering the unique leaf * that governs a subtree. */ /* * Most of the functions in this code assume that the key/mask arguments * are sockaddr-like structures, where the first byte is an u_char * indicating the size of the entire structure. * * To make the assumption more explicit, we use the LEN() macro to access * this field. It is safe to pass an expression with side effects * to LEN() as the argument is evaluated only once. * We cast the result to int as this is the dominant usage. */ #define LEN(x) ( (int) (*(const u_char *)(x)) ) /* * XXX THIS NEEDS TO BE FIXED * In the code, pointers to keys and masks are passed as either * 'void *' (because callers use to pass pointers of various kinds), or * 'caddr_t' (which is fine for pointer arithmetics, but not very * clean when you dereference it to access data). Furthermore, caddr_t * is really 'char *', while the natural type to operate on keys and * masks would be 'u_char'. This mismatch require a lot of casts and * intermediate variables to adapt types that clutter the code. */ /* * Search a node in the tree matching the key. */ static struct radix_node * rn_search(v_arg, head) void *v_arg; struct radix_node *head; { register struct radix_node *x; register caddr_t v; for (x = head, v = v_arg; x->rn_bit >= 0;) { if (x->rn_bmask & v[x->rn_offset]) x = x->rn_right; else x = x->rn_left; } return (x); } /* * Same as above, but with an additional mask. * XXX note this function is used only once. */ static struct radix_node * rn_search_m(v_arg, head, m_arg) struct radix_node *head; void *v_arg, *m_arg; { register struct radix_node *x; register caddr_t v = v_arg, m = m_arg; for (x = head; x->rn_bit >= 0;) { if ((x->rn_bmask & m[x->rn_offset]) && (x->rn_bmask & v[x->rn_offset])) x = x->rn_right; else x = x->rn_left; } return x; } int rn_refines(m_arg, n_arg) void *m_arg, *n_arg; { register caddr_t m = m_arg, n = n_arg; register caddr_t lim, lim2 = lim = n + LEN(n); int longer = LEN(n++) - LEN(m++); int masks_are_equal = 1; if (longer > 0) lim -= longer; while (n < lim) { if (*n & ~(*m)) return 0; if (*n++ != *m++) masks_are_equal = 0; } while (n < lim2) if (*n++) return 0; if (masks_are_equal && (longer < 0)) for (lim2 = m - longer; m < lim2; ) if (*m++) return 1; return (!masks_are_equal); } struct radix_node * rn_lookup(v_arg, m_arg, head) void *v_arg, *m_arg; struct radix_node_head *head; { register struct radix_node *x; caddr_t netmask = 0; if (m_arg) { x = rn_addmask(m_arg, 1, head->rnh_treetop->rn_offset); if (x == 0) return (0); netmask = x->rn_key; } x = rn_match(v_arg, head); if (x && netmask) { while (x && x->rn_mask != netmask) x = x->rn_dupedkey; } return x; } static int rn_satisfies_leaf(trial, leaf, skip) char *trial; register struct radix_node *leaf; int skip; { register char *cp = trial, *cp2 = leaf->rn_key, *cp3 = leaf->rn_mask; char *cplim; int length = min(LEN(cp), LEN(cp2)); if (cp3 == NULL) cp3 = rn_ones; else length = min(length, LEN(cp3)); cplim = cp + length; cp3 += skip; cp2 += skip; for (cp += skip; cp < cplim; cp++, cp2++, cp3++) if ((*cp ^ *cp2) & *cp3) return 0; return 1; } struct radix_node * rn_match(v_arg, head) void *v_arg; struct radix_node_head *head; { caddr_t v = v_arg; register struct radix_node *t = head->rnh_treetop, *x; register caddr_t cp = v, cp2; caddr_t cplim; struct radix_node *saved_t, *top = t; int off = t->rn_offset, vlen = LEN(cp), matched_off; register int test, b, rn_bit; /* * Open code rn_search(v, top) to avoid overhead of extra * subroutine call. */ for (; t->rn_bit >= 0; ) { if (t->rn_bmask & cp[t->rn_offset]) t = t->rn_right; else t = t->rn_left; } /* * See if we match exactly as a host destination * or at least learn how many bits match, for normal mask finesse. * * It doesn't hurt us to limit how many bytes to check * to the length of the mask, since if it matches we had a genuine * match and the leaf we have is the most specific one anyway; * if it didn't match with a shorter length it would fail * with a long one. This wins big for class B&C netmasks which * are probably the most common case... */ if (t->rn_mask) vlen = *(u_char *)t->rn_mask; cp += off; cp2 = t->rn_key + off; cplim = v + vlen; for (; cp < cplim; cp++, cp2++) if (*cp != *cp2) goto on1; /* * This extra grot is in case we are explicitly asked * to look up the default. Ugh! * * Never return the root node itself, it seems to cause a * lot of confusion. */ if (t->rn_flags & RNF_ROOT) t = t->rn_dupedkey; return t; on1: test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */ for (b = 7; (test >>= 1) > 0;) b--; matched_off = cp - v; b += matched_off << 3; rn_bit = -1 - b; /* * If there is a host route in a duped-key chain, it will be first. */ if ((saved_t = t)->rn_mask == 0) t = t->rn_dupedkey; for (; t; t = t->rn_dupedkey) /* * Even if we don't match exactly as a host, * we may match if the leaf we wound up at is * a route to a net. */ if (t->rn_flags & RNF_NORMAL) { if (rn_bit <= t->rn_bit) return t; } else if (rn_satisfies_leaf(v, t, matched_off)) return t; t = saved_t; /* start searching up the tree */ do { register struct radix_mask *m; t = t->rn_parent; m = t->rn_mklist; /* * If non-contiguous masks ever become important * we can restore the masking and open coding of * the search and satisfaction test and put the * calculation of "off" back before the "do". */ while (m) { if (m->rm_flags & RNF_NORMAL) { if (rn_bit <= m->rm_bit) return (m->rm_leaf); } else { off = min(t->rn_offset, matched_off); x = rn_search_m(v, t, m->rm_mask); while (x && x->rn_mask != m->rm_mask) x = x->rn_dupedkey; if (x && rn_satisfies_leaf(v, x, off)) return x; } m = m->rm_mklist; } } while (t != top); return 0; } #ifdef RN_DEBUG int rn_nodenum; struct radix_node *rn_clist; int rn_saveinfo; int rn_debug = 1; #endif /* * Whenever we add a new leaf to the tree, we also add a parent node, * so we allocate them as an array of two elements: the first one must be * the leaf (see RNTORT() in route.c), the second one is the parent. * This routine initializes the relevant fields of the nodes, so that * the leaf is the left child of the parent node, and both nodes have * (almost) all all fields filled as appropriate. * (XXX some fields are left unset, see the '#if 0' section). * The function returns a pointer to the parent node. */ static struct radix_node * rn_newpair(v, b, nodes) void *v; int b; struct radix_node nodes[2]; { register struct radix_node *tt = nodes, *t = tt + 1; t->rn_bit = b; t->rn_bmask = 0x80 >> (b & 7); t->rn_left = tt; t->rn_offset = b >> 3; #if 0 /* XXX perhaps we should fill these fields as well. */ t->rn_parent = t->rn_right = NULL; tt->rn_mask = NULL; tt->rn_dupedkey = NULL; tt->rn_bmask = 0; #endif tt->rn_bit = -1; tt->rn_key = (caddr_t)v; tt->rn_parent = t; tt->rn_flags = t->rn_flags = RNF_ACTIVE; tt->rn_mklist = t->rn_mklist = 0; #ifdef RN_DEBUG tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt; #endif return t; } static struct radix_node * rn_insert(v_arg, head, dupentry, nodes) void *v_arg; struct radix_node_head *head; int *dupentry; struct radix_node nodes[2]; { caddr_t v = v_arg; struct radix_node *top = head->rnh_treetop; int head_off = top->rn_offset, vlen = LEN(v); register struct radix_node *t = rn_search(v_arg, top); register caddr_t cp = v + head_off; register int b; struct radix_node *tt; /* * Find first bit at which v and t->rn_key differ */ { register caddr_t cp2 = t->rn_key + head_off; register int cmp_res; caddr_t cplim = v + vlen; while (cp < cplim) if (*cp2++ != *cp++) goto on1; *dupentry = 1; return t; on1: *dupentry = 0; cmp_res = (cp[-1] ^ cp2[-1]) & 0xff; for (b = (cp - v) << 3; cmp_res; b--) cmp_res >>= 1; } { register struct radix_node *p, *x = top; cp = v; do { p = x; if (cp[x->rn_offset] & x->rn_bmask) x = x->rn_right; else x = x->rn_left; } while (b > (unsigned) x->rn_bit); /* x->rn_bit < b && x->rn_bit >= 0 */ #ifdef RN_DEBUG if (rn_debug) log(LOG_DEBUG, "rn_insert: Going In:\n"), traverse(p); #endif t = rn_newpair(v_arg, b, nodes); tt = t->rn_left; if ((cp[p->rn_offset] & p->rn_bmask) == 0) p->rn_left = t; else p->rn_right = t; x->rn_parent = t; t->rn_parent = p; /* frees x, p as temp vars below */ if ((cp[t->rn_offset] & t->rn_bmask) == 0) { t->rn_right = x; } else { t->rn_right = tt; t->rn_left = x; } #ifdef RN_DEBUG if (rn_debug) log(LOG_DEBUG, "rn_insert: Coming Out:\n"), traverse(p); #endif } return (tt); } struct radix_node * rn_addmask(n_arg, search, skip) int search, skip; void *n_arg; { caddr_t netmask = (caddr_t)n_arg; register struct radix_node *x; register caddr_t cp, cplim; register int b = 0, mlen, j; int maskduplicated, m0, isnormal; struct radix_node *saved_x; static int last_zeroed = 0; if ((mlen = LEN(netmask)) > max_keylen) mlen = max_keylen; if (skip == 0) skip = 1; if (mlen <= skip) return (mask_rnhead->rnh_nodes); if (skip > 1) bcopy(rn_ones + 1, addmask_key + 1, skip - 1); if ((m0 = mlen) > skip) bcopy(netmask + skip, addmask_key + skip, mlen - skip); /* * Trim trailing zeroes. */ for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0;) cp--; mlen = cp - addmask_key; if (mlen <= skip) { if (m0 >= last_zeroed) last_zeroed = mlen; return (mask_rnhead->rnh_nodes); } if (m0 < last_zeroed) bzero(addmask_key + m0, last_zeroed - m0); *addmask_key = last_zeroed = mlen; x = rn_search(addmask_key, rn_masktop); if (bcmp(addmask_key, x->rn_key, mlen) != 0) x = 0; if (x || search) return (x); R_Zalloc(x, struct radix_node *, max_keylen + 2 * sizeof (*x)); if ((saved_x = x) == 0) return (0); netmask = cp = (caddr_t)(x + 2); bcopy(addmask_key, cp, mlen); x = rn_insert(cp, mask_rnhead, &maskduplicated, x); if (maskduplicated) { log(LOG_ERR, "rn_addmask: mask impossibly already in tree"); Free(saved_x); return (x); } /* * Calculate index of mask, and check for normalcy. * First find the first byte with a 0 bit, then if there are * more bits left (remember we already trimmed the trailing 0's), * the pattern must be one of those in normal_chars[], or we have * a non-contiguous mask. */ cplim = netmask + mlen; isnormal = 1; for (cp = netmask + skip; (cp < cplim) && *(u_char *)cp == 0xff;) cp++; if (cp != cplim) { static char normal_chars[] = { 0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; for (j = 0x80; (j & *cp) != 0; j >>= 1) b++; if (*cp != normal_chars[b] || cp != (cplim - 1)) isnormal = 0; } b += (cp - netmask) << 3; x->rn_bit = -1 - b; if (isnormal) x->rn_flags |= RNF_NORMAL; return (x); } static int /* XXX: arbitrary ordering for non-contiguous masks */ rn_lexobetter(m_arg, n_arg) void *m_arg, *n_arg; { register u_char *mp = m_arg, *np = n_arg, *lim; if (LEN(mp) > LEN(np)) return 1; /* not really, but need to check longer one first */ if (LEN(mp) == LEN(np)) for (lim = mp + LEN(mp); mp < lim;) if (*mp++ > *np++) return 1; return 0; } static struct radix_mask * rn_new_radix_mask(tt, next) register struct radix_node *tt; register struct radix_mask *next; { register struct radix_mask *m; MKGet(m); if (m == 0) { log(LOG_ERR, "Mask for route not entered\n"); return (0); } bzero(m, sizeof *m); m->rm_bit = tt->rn_bit; m->rm_flags = tt->rn_flags; if (tt->rn_flags & RNF_NORMAL) m->rm_leaf = tt; else m->rm_mask = tt->rn_mask; m->rm_mklist = next; tt->rn_mklist = m; return m; } struct radix_node * rn_addroute(v_arg, n_arg, head, treenodes) void *v_arg, *n_arg; struct radix_node_head *head; struct radix_node treenodes[2]; { caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg; register struct radix_node *t, *x = 0, *tt; struct radix_node *saved_tt, *top = head->rnh_treetop; short b = 0, b_leaf = 0; int keyduplicated; caddr_t mmask; struct radix_mask *m, **mp; /* * In dealing with non-contiguous masks, there may be * many different routes which have the same mask. * We will find it useful to have a unique pointer to * the mask to speed avoiding duplicate references at * nodes and possibly save time in calculating indices. */ if (netmask) { if ((x = rn_addmask(netmask, 0, top->rn_offset)) == 0) return (0); b_leaf = x->rn_bit; b = -1 - x->rn_bit; netmask = x->rn_key; } /* * Deal with duplicated keys: attach node to previous instance */ saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes); if (keyduplicated) { for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) { #ifdef RADIX_MPATH /* permit multipath, if enabled for the family */ if (rn_mpath_capable(head) && netmask == tt->rn_mask) { /* * go down to the end of multipaths, so that * new entry goes into the end of rn_dupedkey * chain. */ do { t = tt; tt = tt->rn_dupedkey; } while (tt && t->rn_mask == tt->rn_mask); break; } #endif if (tt->rn_mask == netmask) return (0); if (netmask == 0 || (tt->rn_mask && ((b_leaf < tt->rn_bit) /* index(netmask) > node */ || rn_refines(netmask, tt->rn_mask) || rn_lexobetter(netmask, tt->rn_mask)))) break; } /* * If the mask is not duplicated, we wouldn't * find it among possible duplicate key entries * anyway, so the above test doesn't hurt. * * We sort the masks for a duplicated key the same way as * in a masklist -- most specific to least specific. * This may require the unfortunate nuisance of relocating * the head of the list. * * We also reverse, or doubly link the list through the * parent pointer. */ if (tt == saved_tt) { struct radix_node *xx = x; /* link in at head of list */ (tt = treenodes)->rn_dupedkey = t; tt->rn_flags = t->rn_flags; tt->rn_parent = x = t->rn_parent; t->rn_parent = tt; /* parent */ if (x->rn_left == t) x->rn_left = tt; else x->rn_right = tt; saved_tt = tt; x = xx; } else { (tt = treenodes)->rn_dupedkey = t->rn_dupedkey; t->rn_dupedkey = tt; tt->rn_parent = t; /* parent */ if (tt->rn_dupedkey) /* parent */ tt->rn_dupedkey->rn_parent = tt; /* parent */ } #ifdef RN_DEBUG t=tt+1; tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt; #endif tt->rn_key = (caddr_t) v; tt->rn_bit = -1; tt->rn_flags = RNF_ACTIVE; } /* * Put mask in tree. */ if (netmask) { tt->rn_mask = netmask; tt->rn_bit = x->rn_bit; tt->rn_flags |= x->rn_flags & RNF_NORMAL; } t = saved_tt->rn_parent; if (keyduplicated) goto on2; b_leaf = -1 - t->rn_bit; if (t->rn_right == saved_tt) x = t->rn_left; else x = t->rn_right; /* Promote general routes from below */ if (x->rn_bit < 0) { for (mp = &t->rn_mklist; x; x = x->rn_dupedkey) if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) { *mp = m = rn_new_radix_mask(x, 0); if (m) mp = &m->rm_mklist; } } else if (x->rn_mklist) { /* * Skip over masks whose index is > that of new node */ for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) if (m->rm_bit >= b_leaf) break; t->rn_mklist = m; *mp = 0; } on2: /* Add new route to highest possible ancestor's list */ if ((netmask == 0) || (b > t->rn_bit )) return tt; /* can't lift at all */ b_leaf = tt->rn_bit; do { x = t; t = t->rn_parent; } while (b <= t->rn_bit && x != top); /* * Search through routes associated with node to * insert new route according to index. * Need same criteria as when sorting dupedkeys to avoid * double loop on deletion. */ for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) { if (m->rm_bit < b_leaf) continue; if (m->rm_bit > b_leaf) break; if (m->rm_flags & RNF_NORMAL) { mmask = m->rm_leaf->rn_mask; if (tt->rn_flags & RNF_NORMAL) { #if !defined(RADIX_MPATH) log(LOG_ERR, "Non-unique normal route, mask not entered\n"); #endif return tt; } } else mmask = m->rm_mask; if (mmask == netmask) { m->rm_refs++; tt->rn_mklist = m; return tt; } if (rn_refines(netmask, mmask) || rn_lexobetter(netmask, mmask)) break; } *mp = rn_new_radix_mask(tt, *mp); return tt; } struct radix_node * rn_delete(v_arg, netmask_arg, head) void *v_arg, *netmask_arg; struct radix_node_head *head; { register struct radix_node *t, *p, *x, *tt; struct radix_mask *m, *saved_m, **mp; struct radix_node *dupedkey, *saved_tt, *top; caddr_t v, netmask; int b, head_off, vlen; v = v_arg; netmask = netmask_arg; x = head->rnh_treetop; tt = rn_search(v, x); head_off = x->rn_offset; vlen = LEN(v); saved_tt = tt; top = x; if (tt == 0 || bcmp(v + head_off, tt->rn_key + head_off, vlen - head_off)) return (0); /* * Delete our route from mask lists. */ if (netmask) { if ((x = rn_addmask(netmask, 1, head_off)) == 0) return (0); netmask = x->rn_key; while (tt->rn_mask != netmask) if ((tt = tt->rn_dupedkey) == 0) return (0); } if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0) goto on1; if (tt->rn_flags & RNF_NORMAL) { if (m->rm_leaf != tt || m->rm_refs > 0) { log(LOG_ERR, "rn_delete: inconsistent annotation\n"); return 0; /* dangling ref could cause disaster */ } } else { if (m->rm_mask != tt->rn_mask) { log(LOG_ERR, "rn_delete: inconsistent annotation\n"); goto on1; } if (--m->rm_refs >= 0) goto on1; } b = -1 - tt->rn_bit; t = saved_tt->rn_parent; if (b > t->rn_bit) goto on1; /* Wasn't lifted at all */ do { x = t; t = t->rn_parent; } while (b <= t->rn_bit && x != top); for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) if (m == saved_m) { *mp = m->rm_mklist; MKFree(m); break; } if (m == 0) { log(LOG_ERR, "rn_delete: couldn't find our annotation\n"); if (tt->rn_flags & RNF_NORMAL) return (0); /* Dangling ref to us */ } on1: /* * Eliminate us from tree */ if (tt->rn_flags & RNF_ROOT) return (0); #ifdef RN_DEBUG /* Get us out of the creation list */ for (t = rn_clist; t && t->rn_ybro != tt; t = t->rn_ybro) {} if (t) t->rn_ybro = tt->rn_ybro; #endif t = tt->rn_parent; dupedkey = saved_tt->rn_dupedkey; if (dupedkey) { /* * Here, tt is the deletion target and * saved_tt is the head of the dupekey chain. */ if (tt == saved_tt) { /* remove from head of chain */ x = dupedkey; x->rn_parent = t; if (t->rn_left == tt) t->rn_left = x; else t->rn_right = x; } else { /* find node in front of tt on the chain */ for (x = p = saved_tt; p && p->rn_dupedkey != tt;) p = p->rn_dupedkey; if (p) { p->rn_dupedkey = tt->rn_dupedkey; if (tt->rn_dupedkey) /* parent */ tt->rn_dupedkey->rn_parent = p; /* parent */ } else log(LOG_ERR, "rn_delete: couldn't find us\n"); } t = tt + 1; if (t->rn_flags & RNF_ACTIVE) { #ifndef RN_DEBUG *++x = *t; p = t->rn_parent; #else b = t->rn_info; *++x = *t; t->rn_info = b; p = t->rn_parent; #endif if (p->rn_left == t) p->rn_left = x; else p->rn_right = x; x->rn_left->rn_parent = x; x->rn_right->rn_parent = x; } goto out; } if (t->rn_left == tt) x = t->rn_right; else x = t->rn_left; p = t->rn_parent; if (p->rn_right == t) p->rn_right = x; else p->rn_left = x; x->rn_parent = p; /* * Demote routes attached to us. */ if (t->rn_mklist) { if (x->rn_bit >= 0) { for (mp = &x->rn_mklist; (m = *mp);) mp = &m->rm_mklist; *mp = t->rn_mklist; } else { /* If there are any key,mask pairs in a sibling duped-key chain, some subset will appear sorted in the same order attached to our mklist */ for (m = t->rn_mklist; m && x; x = x->rn_dupedkey) if (m == x->rn_mklist) { struct radix_mask *mm = m->rm_mklist; x->rn_mklist = 0; if (--(m->rm_refs) < 0) MKFree(m); m = mm; } if (m) log(LOG_ERR, "rn_delete: Orphaned Mask %p at %p\n", m, x); } } /* * We may be holding an active internal node in the tree. */ x = tt + 1; if (t != x) { #ifndef RN_DEBUG *t = *x; #else b = t->rn_info; *t = *x; t->rn_info = b; #endif t->rn_left->rn_parent = t; t->rn_right->rn_parent = t; p = x->rn_parent; if (p->rn_left == x) p->rn_left = t; else p->rn_right = t; } out: tt->rn_flags &= ~RNF_ACTIVE; tt[1].rn_flags &= ~RNF_ACTIVE; return (tt); } /* * This is the same as rn_walktree() except for the parameters and the * exit. */ static int rn_walktree_from(h, a, m, f, w) struct radix_node_head *h; void *a, *m; walktree_f_t *f; void *w; { int error; struct radix_node *base, *next; u_char *xa = (u_char *)a; u_char *xm = (u_char *)m; register struct radix_node *rn, *last = 0 /* shut up gcc */; int stopping = 0; int lastb; /* * rn_search_m is sort-of-open-coded here. We cannot use the * function because we need to keep track of the last node seen. */ /* printf("about to search\n"); */ for (rn = h->rnh_treetop; rn->rn_bit >= 0; ) { last = rn; /* printf("rn_bit %d, rn_bmask %x, xm[rn_offset] %x\n", rn->rn_bit, rn->rn_bmask, xm[rn->rn_offset]); */ if (!(rn->rn_bmask & xm[rn->rn_offset])) { break; } if (rn->rn_bmask & xa[rn->rn_offset]) { rn = rn->rn_right; } else { rn = rn->rn_left; } } /* printf("done searching\n"); */ /* * Two cases: either we stepped off the end of our mask, * in which case last == rn, or we reached a leaf, in which * case we want to start from the last node we looked at. * Either way, last is the node we want to start from. */ rn = last; lastb = rn->rn_bit; /* printf("rn %p, lastb %d\n", rn, lastb);*/ /* * This gets complicated because we may delete the node * while applying the function f to it, so we need to calculate * the successor node in advance. */ while (rn->rn_bit >= 0) rn = rn->rn_left; while (!stopping) { /* printf("node %p (%d)\n", rn, rn->rn_bit); */ base = rn; /* If at right child go back up, otherwise, go right */ while (rn->rn_parent->rn_right == rn && !(rn->rn_flags & RNF_ROOT)) { rn = rn->rn_parent; /* if went up beyond last, stop */ if (rn->rn_bit <= lastb) { stopping = 1; /* printf("up too far\n"); */ /* * XXX we should jump to the 'Process leaves' * part, because the values of 'rn' and 'next' * we compute will not be used. Not a big deal * because this loop will terminate, but it is * inefficient and hard to understand! */ } } /* * At the top of the tree, no need to traverse the right * half, prevent the traversal of the entire tree in the * case of default route. */ if (rn->rn_parent->rn_flags & RNF_ROOT) stopping = 1; /* Find the next *leaf* since next node might vanish, too */ for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;) rn = rn->rn_left; next = rn; /* Process leaves */ while ((rn = base) != 0) { base = rn->rn_dupedkey; /* printf("leaf %p\n", rn); */ if (!(rn->rn_flags & RNF_ROOT) && (error = (*f)(rn, w))) return (error); } rn = next; if (rn->rn_flags & RNF_ROOT) { /* printf("root, stopping"); */ stopping = 1; } } return 0; } static int rn_walktree(h, f, w) struct radix_node_head *h; walktree_f_t *f; void *w; { int error; struct radix_node *base, *next; register struct radix_node *rn = h->rnh_treetop; /* * This gets complicated because we may delete the node * while applying the function f to it, so we need to calculate * the successor node in advance. */ /* First time through node, go left */ while (rn->rn_bit >= 0) rn = rn->rn_left; for (;;) { base = rn; /* If at right child go back up, otherwise, go right */ while (rn->rn_parent->rn_right == rn && (rn->rn_flags & RNF_ROOT) == 0) rn = rn->rn_parent; /* Find the next *leaf* since next node might vanish, too */ for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;) rn = rn->rn_left; next = rn; /* Process leaves */ while ((rn = base)) { base = rn->rn_dupedkey; if (!(rn->rn_flags & RNF_ROOT) && (error = (*f)(rn, w))) return (error); } rn = next; if (rn->rn_flags & RNF_ROOT) return (0); } /* NOTREACHED */ } /* * Allocate and initialize an empty tree. This has 3 nodes, which are * part of the radix_node_head (in the order ) and are * marked RNF_ROOT so they cannot be freed. * The leaves have all-zero and all-one keys, with significant * bits starting at 'off'. * Return 1 on success, 0 on error. */ int rn_inithead(head, off) void **head; int off; { register struct radix_node_head *rnh; register struct radix_node *t, *tt, *ttt; if (*head) return (1); R_Zalloc(rnh, struct radix_node_head *, sizeof (*rnh)); if (rnh == 0) return (0); #ifdef _KERNEL RADIX_NODE_HEAD_LOCK_INIT(rnh); #endif *head = rnh; t = rn_newpair(rn_zeros, off, rnh->rnh_nodes); ttt = rnh->rnh_nodes + 2; t->rn_right = ttt; t->rn_parent = t; tt = t->rn_left; /* ... which in turn is rnh->rnh_nodes */ tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE; tt->rn_bit = -1 - off; *ttt = *tt; ttt->rn_key = rn_ones; rnh->rnh_addaddr = rn_addroute; rnh->rnh_deladdr = rn_delete; rnh->rnh_matchaddr = rn_match; rnh->rnh_lookup = rn_lookup; rnh->rnh_walktree = rn_walktree; rnh->rnh_walktree_from = rn_walktree_from; rnh->rnh_treetop = t; return (1); } int rn_detachhead(void **head) { struct radix_node_head *rnh; KASSERT((head != NULL && *head != NULL), ("%s: head already freed", __func__)); rnh = *head; /* Free nodes. */ Free(rnh); *head = NULL; return (1); } void rn_init(int maxk) { char *cp, *cplim; max_keylen = maxk; if (max_keylen == 0) { log(LOG_ERR, "rn_init: radix functions require max_keylen be set\n"); return; } R_Malloc(rn_zeros, char *, 3 * max_keylen); if (rn_zeros == NULL) panic("rn_init"); bzero(rn_zeros, 3 * max_keylen); rn_ones = cp = rn_zeros + max_keylen; addmask_key = cplim = rn_ones + max_keylen; while (cp < cplim) *cp++ = -1; if (rn_inithead((void **)(void *)&mask_rnhead, 0) == 0) panic("rn_init 2"); } ================================================ FILE: sys/net/radix.h ================================================ /*- * Copyright (c) 1988, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)radix.h 8.2 (Berkeley) 10/31/94 * $FreeBSD: head/sys/net/radix.h 185747 2008-12-07 21:15:43Z kmacy $ */ #ifndef _RADIX_H_ #define _RADIX_H_ #ifdef _KERNEL #include #include #include #endif #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_RTABLE); #endif /* * Radix search tree node layout. */ struct radix_node { struct radix_mask *rn_mklist; /* list of masks contained in subtree */ struct radix_node *rn_parent; /* parent */ short rn_bit; /* bit offset; -1-index(netmask) */ char rn_bmask; /* node: mask for bit test*/ u_char rn_flags; /* enumerated next */ #define RNF_NORMAL 1 /* leaf contains normal route */ #define RNF_ROOT 2 /* leaf is root leaf for tree */ #define RNF_ACTIVE 4 /* This node is alive (for rtfree) */ union { struct { /* leaf only data: */ caddr_t rn_Key; /* object of search */ caddr_t rn_Mask; /* netmask, if present */ struct radix_node *rn_Dupedkey; } rn_leaf; struct { /* node only data: */ int rn_Off; /* where to start compare */ struct radix_node *rn_L;/* progeny */ struct radix_node *rn_R;/* progeny */ } rn_node; } rn_u; #ifdef RN_DEBUG int rn_info; struct radix_node *rn_twin; struct radix_node *rn_ybro; #endif }; #define rn_dupedkey rn_u.rn_leaf.rn_Dupedkey #define rn_key rn_u.rn_leaf.rn_Key #define rn_mask rn_u.rn_leaf.rn_Mask #define rn_offset rn_u.rn_node.rn_Off #define rn_left rn_u.rn_node.rn_L #define rn_right rn_u.rn_node.rn_R /* * Annotations to tree concerning potential routes applying to subtrees. */ struct radix_mask { short rm_bit; /* bit offset; -1-index(netmask) */ char rm_unused; /* cf. rn_bmask */ u_char rm_flags; /* cf. rn_flags */ struct radix_mask *rm_mklist; /* more masks to try */ union { caddr_t rmu_mask; /* the mask */ struct radix_node *rmu_leaf; /* for normal routes */ } rm_rmu; int rm_refs; /* # of references to this struct */ }; #define rm_mask rm_rmu.rmu_mask #define rm_leaf rm_rmu.rmu_leaf /* extra field would make 32 bytes */ typedef int walktree_f_t(struct radix_node *, void *); struct radix_node_head { struct radix_node *rnh_treetop; u_int rnh_gen; /* generation counter */ int rnh_multipath; /* multipath capable ? */ int rnh_addrsize; /* permit, but not require fixed keys */ int rnh_pktsize; /* permit, but not require fixed keys */ struct radix_node *(*rnh_addaddr) /* add based on sockaddr */ (void *v, void *mask, struct radix_node_head *head, struct radix_node nodes[]); struct radix_node *(*rnh_addpkt) /* add based on packet hdr */ (void *v, void *mask, struct radix_node_head *head, struct radix_node nodes[]); struct radix_node *(*rnh_deladdr) /* remove based on sockaddr */ (void *v, void *mask, struct radix_node_head *head); struct radix_node *(*rnh_delpkt) /* remove based on packet hdr */ (void *v, void *mask, struct radix_node_head *head); struct radix_node *(*rnh_matchaddr) /* locate based on sockaddr */ (void *v, struct radix_node_head *head); struct radix_node *(*rnh_lookup) /* locate based on sockaddr */ (void *v, void *mask, struct radix_node_head *head); struct radix_node *(*rnh_matchpkt) /* locate based on packet hdr */ (void *v, struct radix_node_head *head); int (*rnh_walktree) /* traverse tree */ (struct radix_node_head *head, walktree_f_t *f, void *w); int (*rnh_walktree_from) /* traverse tree below a */ (struct radix_node_head *head, void *a, void *m, walktree_f_t *f, void *w); void (*rnh_close) /* do something when the last ref drops */ (struct radix_node *rn, struct radix_node_head *head); struct radix_node rnh_nodes[3]; /* empty tree for common case */ #ifdef _KERNEL #if defined( __linux__ ) || defined( _WIN32 ) spinlock_t rnh_lock; #else struct rwlock rnh_lock; /* locks entire radix tree */ #endif /* !__linux__ */ #endif }; #ifndef _KERNEL #define R_Malloc(p, t, n) (p = (t) malloc((unsigned int)(n))) #define R_Zalloc(p, t, n) (p = (t) calloc(1,(unsigned int)(n))) #define Free(p) free((char *)p); #else #define R_Malloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT)) #define R_Zalloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT | M_ZERO)) #define Free(p) free((caddr_t)p, M_RTABLE); #define RADIX_NODE_HEAD_LOCK_INIT(rnh) \ rw_init_flags(&(rnh)->rnh_lock, "radix node head", 0) #define RADIX_NODE_HEAD_LOCK(rnh) rw_wlock(&(rnh)->rnh_lock) #define RADIX_NODE_HEAD_UNLOCK(rnh) rw_wunlock(&(rnh)->rnh_lock) #define RADIX_NODE_HEAD_RLOCK(rnh) rw_rlock(&(rnh)->rnh_lock) #define RADIX_NODE_HEAD_RUNLOCK(rnh) rw_runlock(&(rnh)->rnh_lock) #define RADIX_NODE_HEAD_LOCK_TRY_UPGRADE(rnh) rw_try_upgrade(&(rnh)->rnh_lock) #define RADIX_NODE_HEAD_DESTROY(rnh) rw_destroy(&(rnh)->rnh_lock) #define RADIX_NODE_HEAD_LOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_LOCKED) #define RADIX_NODE_HEAD_WLOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_WLOCKED) #endif /* _KERNEL */ void rn_init(int); int rn_inithead(void **, int); int rn_detachhead(void **); int rn_refines(void *, void *); struct radix_node *rn_addmask(void *, int, int), *rn_addroute (void *, void *, struct radix_node_head *, struct radix_node [2]), *rn_delete(void *, void *, struct radix_node_head *), *rn_lookup (void *v_arg, void *m_arg, struct radix_node_head *head), *rn_match(void *, struct radix_node_head *); #endif /* _RADIX_H_ */ ================================================ FILE: sys/netgraph/ng_ipfw.h ================================================ /*- * Copyright 2005, Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD: src/sys/netgraph/ng_ipfw.h,v 1.2 2006/02/17 09:42:49 glebius Exp $ */ #ifndef _NG_IPFW_H #define _NG_IPFW_H #define NG_IPFW_NODE_TYPE "ipfw" #define NGM_IPFW_COOKIE 1105988990 #endif /* _NG_IPFW_H */ ================================================ FILE: sys/netinet/in_cksum.c ================================================ /*- * Copyright (c) 1988, 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD: src/sys/netinet/in_cksum.c,v 1.10 2007/10/07 20:44:22 silby Exp $"); #include #include /* * Checksum routine for Internet Protocol family headers (Portable Version). * * This routine is very heavily used in the network * code and should be modified for each CPU to be as fast as possible. */ #define ADDCARRY(x) (x > 65535 ? x -= 65535 : x) #define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);} int in_cksum(struct mbuf *m, int len) { register u_short *w; register int sum = 0; register int mlen = 0; int byte_swapped = 0; union { char c[2]; u_short s; } s_util; union { u_short s[2]; long l; } l_util; for (;m && len; m = m->m_next) { if (m->m_len == 0) continue; w = mtod(m, u_short *); if (mlen == -1) { /* * The first byte of this mbuf is the continuation * of a word spanning between this mbuf and the * last mbuf. * * s_util.c[0] is already saved when scanning previous * mbuf. */ s_util.c[1] = *(char *)w; sum += s_util.s; w = (u_short *)((char *)w + 1); mlen = m->m_len - 1; len--; } else mlen = m->m_len; if (len < mlen) mlen = len; len -= mlen; /* * Force to even boundary. */ if ((1 & (uintptr_t) w) && (mlen > 0)) { REDUCE; sum <<= 8; s_util.c[0] = *(u_char *)w; w = (u_short *)((char *)w + 1); mlen--; byte_swapped = 1; } /* * Unroll the loop to make overhead from * branches &c small. */ while ((mlen -= 32) >= 0) { sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11]; sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15]; w += 16; } mlen += 32; while ((mlen -= 8) >= 0) { sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; w += 4; } mlen += 8; if (mlen == 0 && byte_swapped == 0) continue; REDUCE; while ((mlen -= 2) >= 0) { sum += *w++; } if (byte_swapped) { REDUCE; sum <<= 8; byte_swapped = 0; if (mlen == -1) { s_util.c[1] = *(char *)w; sum += s_util.s; mlen = 0; } else mlen = -1; } else if (mlen == -1) s_util.c[0] = *(char *)w; } if (len) printf("cksum: out of data\n"); if (mlen == -1) { /* The last mbuf has odd # of bytes. Follow the standard (the odd byte may be shifted left by 8 bits or not as determined by endian-ness of the machine) */ s_util.c[1] = 0; sum += s_util.s; } REDUCE; return (~sum & 0xffff); } ================================================ FILE: sys/netinet/ip.h ================================================ #ifndef _NETINET_IP_H_ #define _NETINET_IP_H_ #define LITTLE_ENDIAN 1234 #define BIG_ENDIAN 4321 #if defined(__BIG_ENDIAN) #define BYTE_ORDER BIG_ENDIAN //#warning we are in bigendian #elif defined(__LITTLE_ENDIAN) //#warning we are in littleendian #define BYTE_ORDER LITTLE_ENDIAN #else #error no platform #endif /* XXX endiannes doesn't belong here */ // #define LITTLE_ENDIAN 1234 // #define BIG_ENDIAN 4321 // #define BYTE_ORDER LITTLE_ENDIAN /* * Structure of an internet header, naked of options. */ struct ip { #if BYTE_ORDER == LITTLE_ENDIAN u_char ip_hl:4, /* header length */ ip_v:4; /* version */ #endif #if BYTE_ORDER == BIG_ENDIAN u_char ip_v:4, /* version */ ip_hl:4; /* header length */ #endif u_char ip_tos; /* type of service */ u_short ip_len; /* total length */ u_short ip_id; /* identification */ u_short ip_off; /* fragment offset field */ #define IP_RF 0x8000 /* reserved fragment flag */ #define IP_DF 0x4000 /* dont fragment flag */ #define IP_MF 0x2000 /* more fragments flag */ #define IP_OFFMASK 0x1fff /* mask for fragmenting bits */ u_char ip_ttl; /* time to live */ u_char ip_p; /* protocol */ u_short ip_sum; /* checksum */ struct in_addr ip_src,ip_dst; /* source and dest address */ } __packed __aligned(4); #define IPTOS_LOWDELAY 0x10 #endif /* _NETINET_IP_H_ */ ================================================ FILE: sys/netinet/ip6.h ================================================ #ifndef _NETINET_IP6_H_ #define _NETINET_IP6_H_ #define IN6_ARE_ADDR_EQUAL(a, b) \ (memcmp(&(a)->s6_addr[0], &(b)->s6_addr[0], sizeof(struct in6_addr)) == 0) struct ip6_hdr { union { struct ip6_hdrctl { u_int32_t ip6_un1_flow; /* 20 bits of flow-ID */ u_int16_t ip6_un1_plen; /* payload length */ u_int8_t ip6_un1_nxt; /* next header */ u_int8_t ip6_un1_hlim; /* hop limit */ } ip6_un1; u_int8_t ip6_un2_vfc; /* 4 bits version, top 4 bits class */ } ip6_ctlun; struct in6_addr ip6_src; /* source address */ struct in6_addr ip6_dst; /* destination address */ }; #define ip6_nxt ip6_ctlun.ip6_un1.ip6_un1_nxt #define ip6_flow ip6_ctlun.ip6_un1.ip6_un1_flow struct icmp6_hdr { u_int8_t icmp6_type; /* type field */ u_int8_t icmp6_code; /* code field */ u_int16_t icmp6_cksum; /* checksum field */ union { u_int32_t icmp6_un_data32[1]; /* type-specific field */ u_int16_t icmp6_un_data16[2]; /* type-specific field */ u_int8_t icmp6_un_data8[4]; /* type-specific field */ } icmp6_dataun; }; struct ip6_hbh { u_int8_t ip6h_nxt; /* next header */ u_int8_t ip6h_len; /* length in units of 8 octets */ /* followed by options */ }; struct ip6_rthdr { u_int8_t ip6r_nxt; /* next header */ u_int8_t ip6r_len; /* length in units of 8 octets */ u_int8_t ip6r_type; /* routing type */ u_int8_t ip6r_segleft; /* segments left */ /* followed by routing type specific data */ }; struct ip6_frag { u_int8_t ip6f_nxt; /* next header */ u_int8_t ip6f_reserved; /* reserved field */ u_int16_t ip6f_offlg; /* offset, reserved, and flag */ u_int32_t ip6f_ident; /* identification */ }; #define IP6F_OFF_MASK 0xfff8 /* mask out offset from _offlg */ #define IP6F_MORE_FRAG 0x0001 /* more-fragments flag */ struct ip6_ext { u_int8_t ip6e_nxt; u_int8_t ip6e_len; }; #endif /* _NETINET_IP6_H_ */ ================================================ FILE: sys/netinet/ip_dummynet.h ================================================ /*- * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa * Portions Copyright (c) 2000 Akamba Corp. * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD: user/luigi/ipfw3-head/sys/netinet/ip_dummynet.h 203321 2010-01-31 21:39:25Z luigi $ */ #ifndef _IP_DUMMYNET_H #define _IP_DUMMYNET_H /* * Definition of the kernel-userland API for dummynet. * * Setsockopt() and getsockopt() pass a batch of objects, each * of them starting with a "struct dn_id" which should fully identify * the object and its relation with others in the sequence. * The first object in each request should have * type= DN_CMD_*, id = DN_API_VERSION. * For other objects, type and subtype specify the object, len indicates * the total length including the header, and 'id' identifies the specific * object. * * Most objects are numbered with an identifier in the range 1..65535. * DN_MAX_ID indicates the first value outside the range. */ #define DN_API_VERSION 12500000 #define DN_MAX_ID 0x10000 struct dn_id { uint16_t len; /* total obj len including this header */ uint8_t type; uint8_t subtype; uint32_t id; /* generic id */ }; /* * These values are in the type field of struct dn_id. * To preserve the ABI, never rearrange the list or delete * entries with the exception of DN_LAST */ enum { DN_NONE = 0, DN_LINK = 1, DN_FS, DN_SCH, DN_SCH_I, DN_QUEUE, DN_DELAY_LINE, DN_PROFILE, DN_FLOW, /* struct dn_flow */ DN_TEXT, /* opaque text is the object */ DN_CMD_CONFIG = 0x80, /* objects follow */ DN_CMD_DELETE, /* subtype + list of entries */ DN_CMD_GET, /* subtype + list of entries */ DN_CMD_FLUSH, /* for compatibility with FreeBSD 7.2/8 */ DN_COMPAT_PIPE, DN_COMPAT_QUEUE, DN_GET_COMPAT, /* special commands for emulation of sysctl variables */ DN_SYSCTL_GET, DN_SYSCTL_SET, DN_LAST, }; enum { /* subtype for schedulers, flowset and the like */ DN_SCHED_UNKNOWN = 0, DN_SCHED_FIFO = 1, DN_SCHED_WF2QP = 2, /* others are in individual modules */ }; enum { /* user flags */ DN_HAVE_MASK = 0x0001, /* fs or sched has a mask */ DN_NOERROR = 0x0002, /* do not report errors */ DN_QHT_HASH = 0x0004, /* qht is a hash table */ DN_QSIZE_BYTES = 0x0008, /* queue size is in bytes */ DN_HAS_PROFILE = 0x0010, /* a link has a profile */ DN_IS_RED = 0x0020, DN_IS_GENTLE_RED= 0x0040, DN_PIPE_CMD = 0x1000, /* pipe config... */ }; /* * link template. */ struct dn_link { struct dn_id oid; /* * Userland sets bw and delay in bits/s and milliseconds. * The kernel converts this back and forth to bits/tick and ticks. * XXX what about burst ? */ int32_t link_nr; int bandwidth; /* bit/s or bits/tick. */ int delay; /* ms and ticks */ uint64_t burst; /* scaled. bits*Hz XXX */ }; /* * A flowset, which is a template for flows. Contains parameters * from the command line: id, target scheduler, queue sizes, plr, * flow masks, buckets for the flow hash, and possibly scheduler- * specific parameters (weight, quantum and so on). */ struct dn_fs { struct dn_id oid; uint32_t fs_nr; /* the flowset number */ uint32_t flags; /* userland flags */ int qsize; /* queue size in slots or bytes */ int32_t plr; /* PLR, pkt loss rate (2^31-1 means 100%) */ uint32_t buckets; /* buckets used for the queue hash table */ struct ipfw_flow_id flow_mask; uint32_t sched_nr; /* the scheduler we attach to */ /* generic scheduler parameters. Leave them at -1 if unset. * Now we use 0: weight, 1: lmax, 2: priority */ int par[4]; /* RED/GRED parameters. * weight and probabilities are in the range 0..1 represented * in fixed point arithmetic with SCALE_RED decimal bits. */ #define SCALE_RED 16 #define SCALE(x) ( (x) << SCALE_RED ) #define SCALE_VAL(x) ( (x) >> SCALE_RED ) #define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) int w_q ; /* queue weight (scaled) */ int max_th ; /* maximum threshold for queue (scaled) */ int min_th ; /* minimum threshold for queue (scaled) */ int max_p ; /* maximum value for p_b (scaled) */ }; /* * dn_flow collects flow_id and stats for queues and scheduler * instances, and is used to pass these info to userland. * oid.type/oid.subtype describe the object, oid.id is number * of the parent object. */ struct dn_flow { struct dn_id oid; struct ipfw_flow_id fid; uint64_t tot_pkts; /* statistics counters */ uint64_t tot_bytes; uint32_t length; /* Queue length, in packets */ uint32_t len_bytes; /* Queue length, in bytes */ uint32_t drops; }; /* * Scheduler template, mostly indicating the name, number, * sched_mask and buckets. */ struct dn_sch { struct dn_id oid; uint32_t sched_nr; /* N, scheduler number */ uint32_t buckets; /* number of buckets for the instances */ uint32_t flags; /* have_mask, ... */ char name[16]; /* null terminated */ /* mask to select the appropriate scheduler instance */ struct ipfw_flow_id sched_mask; /* M */ }; /* A delay profile is attached to a link. * Note that a profile, as any other object, cannot be longer than 2^16 */ #define ED_MAX_SAMPLES_NO 1024 struct dn_profile { struct dn_id oid; /* fields to simulate a delay profile */ #define ED_MAX_NAME_LEN 32 char name[ED_MAX_NAME_LEN]; int link_nr; int loss_level; int bandwidth; // XXX use link bandwidth? int samples_no; /* actual len of samples[] */ int samples[0]; /* may be shorter */ }; /* * Overall structure of dummynet In dummynet, packets are selected with the firewall rules, and passed to two different objects: PIPE or QUEUE (bad name). A QUEUE defines a classifier, which groups packets into flows according to a 'mask', puts them into independent queues (one per flow) with configurable size and queue management policy, and passes flows to a scheduler: (flow_mask|sched_mask) sched_mask +---------+ weight Wx +-------------+ | |->-[flow]-->--| |-+ -->--| QUEUE x | ... | | | | |->-[flow]-->--| SCHEDuler N | | +---------+ | | | ... | +--[LINK N]-->-- +---------+ weight Wy | | +--[LINK N]-->-- | |->-[flow]-->--| | | -->--| QUEUE y | ... | | | | |->-[flow]-->--| | | +---------+ +-------------+ | +-------------+ Many QUEUE objects can connect to the same scheduler, each QUEUE object can have its own set of parameters. In turn, the SCHEDuler 'forks' multiple instances according to a 'sched_mask', each instance manages its own set of queues and transmits on a private instance of a configurable LINK. A PIPE is a simplified version of the above, where there is no flow_mask, and each scheduler instance handles a single queue. The following data structures (visible from userland) describe the objects used by dummynet: + dn_link, contains the main configuration parameters related to delay and bandwidth; + dn_profile describes a delay profile; + dn_flow describes the flow status (flow id, statistics) + dn_sch describes a scheduler + dn_fs describes a flowset (msk, weight, queue parameters) * */ #endif /* _IP_DUMMYNET_H */ ================================================ FILE: sys/netinet/ip_fw.h ================================================ /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD: user/luigi/ipfw3-head/sys/netinet/ip_fw.h 202072 2010-01-11 10:12:35Z luigi $ */ #ifndef _IPFW2_H #define _IPFW2_H /* * The default rule number. By the design of ip_fw, the default rule * is the last one, so its number can also serve as the highest number * allowed for a rule. The ip_fw code relies on both meanings of this * constant. */ #define IPFW_DEFAULT_RULE 65535 /* * The number of ipfw tables. The maximum allowed table number is the * (IPFW_TABLES_MAX - 1). */ #define IPFW_TABLES_MAX 128 /* * Most commands (queue, pipe, tag, untag, limit...) can have a 16-bit * argument between 1 and 65534. The value 0 is unused, the value * 65535 (IP_FW_TABLEARG) is used to represent 'tablearg', i.e. the * can be 1..65534, or 65535 to indicate the use of a 'tablearg' * result of the most recent table() lookup. * Note that 16bit is only a historical limit, resulting from * the use of a 16-bit fields for that value. In reality, we can have * 2^32 pipes, queues, tag values and so on, and use 0 as a tablearg. */ #define IPFW_ARG_MIN 1 #define IPFW_ARG_MAX 65534 #define IP_FW_TABLEARG 65535 /* XXX should use 0 */ /* * Number of entries in the call stack of the call/return commands. * Call stack currently is an uint16_t array with rule numbers. */ #define IPFW_CALLSTACK_SIZE 16 /* IP_FW3 header/opcodes */ typedef struct _ip_fw3_opheader { uint16_t opcode; /* Operation opcode */ uint16_t reserved[3]; /* Align to 64-bit boundary */ } ip_fw3_opheader; /* IPFW extented tables support XXX what namespace ? */ #define IP_FW_TABLE_XADD 86 /* add entry */ #define IP_FW_TABLE_XDEL 87 /* delete entry */ #define IP_FW_TABLE_XGETSIZE 88 /* get table size */ #define IP_FW_TABLE_XLIST 89 /* list table contents */ /* * The kernel representation of ipfw rules is made of a list of * 'instructions' (for all practical purposes equivalent to BPF * instructions), which specify which fields of the packet * (or its metadata) should be analysed. * * Each instruction is stored in a structure which begins with * "ipfw_insn", and can contain extra fields depending on the * instruction type (listed below). * Note that the code is written so that individual instructions * have a size which is a multiple of 32 bits. This means that, if * such structures contain pointers or other 64-bit entities, * (there is just one instance now) they may end up unaligned on * 64-bit architectures, so the must be handled with care. * * "enum ipfw_opcodes" are the opcodes supported. We can have up * to 256 different opcodes. When adding new opcodes, they should * be appended to the end of the opcode list before O_LAST_OPCODE, * this will prevent the ABI from being broken, otherwise users * will have to recompile ipfw(8) when they update the kernel. */ enum ipfw_opcodes { /* arguments (4 byte each) */ O_NOP, O_IP_SRC, /* u32 = IP */ O_IP_SRC_MASK, /* ip = IP/mask */ O_IP_SRC_ME, /* none */ O_IP_SRC_SET, /* u32=base, arg1=len, bitmap */ O_IP_DST, /* u32 = IP */ O_IP_DST_MASK, /* ip = IP/mask */ O_IP_DST_ME, /* none */ O_IP_DST_SET, /* u32=base, arg1=len, bitmap */ O_IP_SRCPORT, /* (n)port list:mask 4 byte ea */ O_IP_DSTPORT, /* (n)port list:mask 4 byte ea */ O_PROTO, /* arg1=protocol */ O_MACADDR2, /* 2 mac addr:mask */ O_MAC_TYPE, /* same as srcport */ O_LAYER2, /* none */ O_IN, /* none */ O_FRAG, /* none */ O_RECV, /* none */ O_XMIT, /* none */ O_VIA, /* none */ O_IPOPT, /* arg1 = 2*u8 bitmap */ O_IPLEN, /* arg1 = len */ O_IPID, /* arg1 = id */ O_IPTOS, /* arg1 = id */ O_IPPRECEDENCE, /* arg1 = precedence << 5 */ O_IPTTL, /* arg1 = TTL */ O_IPVER, /* arg1 = version */ O_UID, /* u32 = id */ O_GID, /* u32 = id */ O_ESTAB, /* none (tcp established) */ O_TCPFLAGS, /* arg1 = 2*u8 bitmap */ O_TCPWIN, /* arg1 = desired win */ O_TCPSEQ, /* u32 = desired seq. */ O_TCPACK, /* u32 = desired seq. */ O_ICMPTYPE, /* u32 = icmp bitmap */ O_TCPOPTS, /* arg1 = 2*u8 bitmap */ O_VERREVPATH, /* none */ O_VERSRCREACH, /* none */ O_PROBE_STATE, /* none */ O_KEEP_STATE, /* none */ O_LIMIT, /* ipfw_insn_limit */ O_LIMIT_PARENT, /* dyn_type, not an opcode. */ /* * These are really 'actions'. */ O_LOG, /* ipfw_insn_log */ O_PROB, /* u32 = match probability */ O_CHECK_STATE, /* none */ O_ACCEPT, /* none */ O_DENY, /* none */ O_REJECT, /* arg1=icmp arg (same as deny) */ O_COUNT, /* none */ O_SKIPTO, /* arg1=next rule number */ O_PIPE, /* arg1=pipe number */ O_QUEUE, /* arg1=queue number */ O_DIVERT, /* arg1=port number */ O_TEE, /* arg1=port number */ O_FORWARD_IP, /* fwd sockaddr */ O_FORWARD_MAC, /* fwd mac */ O_NAT, /* nope */ O_REASS, /* none */ /* * More opcodes. */ O_IPSEC, /* has ipsec history */ O_IP_SRC_LOOKUP, /* arg1=table number, u32=value */ O_IP_DST_LOOKUP, /* arg1=table number, u32=value */ O_ANTISPOOF, /* none */ O_JAIL, /* u32 = id */ O_ALTQ, /* u32 = altq classif. qid */ O_DIVERTED, /* arg1=bitmap (1:loop, 2:out) */ O_TCPDATALEN, /* arg1 = tcp data len */ O_IP6_SRC, /* address without mask */ O_IP6_SRC_ME, /* my addresses */ O_IP6_SRC_MASK, /* address with the mask */ O_IP6_DST, O_IP6_DST_ME, O_IP6_DST_MASK, O_FLOW6ID, /* for flow id tag in the ipv6 pkt */ O_ICMP6TYPE, /* icmp6 packet type filtering */ O_EXT_HDR, /* filtering for ipv6 extension header */ O_IP6, /* * actions for ng_ipfw */ O_NETGRAPH, /* send to ng_ipfw */ O_NGTEE, /* copy to ng_ipfw */ O_IP4, O_UNREACH6, /* arg1=icmpv6 code arg (deny) */ O_TAG, /* arg1=tag number */ O_TAGGED, /* arg1=tag number */ O_SETFIB, /* arg1=FIB number */ O_FIB, /* arg1=FIB desired fib number */ O_SOCKARG, /* socket argument */ O_CALLRETURN, /* arg1=called rule number */ O_FORWARD_IP6, /* fwd sockaddr_in6 */ O_LAST_OPCODE /* not an opcode! */ }; /* * The extension header are filtered only for presence using a bit * vector with a flag for each header. */ #define EXT_FRAGMENT 0x1 #define EXT_HOPOPTS 0x2 #define EXT_ROUTING 0x4 #define EXT_AH 0x8 #define EXT_ESP 0x10 #define EXT_DSTOPTS 0x20 #define EXT_RTHDR0 0x40 #define EXT_RTHDR2 0x80 /* * Template for instructions. * * ipfw_insn is used for all instructions which require no operands, * a single 16-bit value (arg1), or a couple of 8-bit values. * * For other instructions which require different/larger arguments * we have derived structures, ipfw_insn_*. * * The size of the instruction (in 32-bit words) is in the low * 6 bits of "len". The 2 remaining bits are used to implement * NOT and OR on individual instructions. Given a type, you can * compute the length to be put in "len" using F_INSN_SIZE(t) * * F_NOT negates the match result of the instruction. * * F_OR is used to build or blocks. By default, instructions * are evaluated as part of a logical AND. An "or" block * { X or Y or Z } contains F_OR set in all but the last * instruction of the block. A match will cause the code * to skip past the last instruction of the block. * * NOTA BENE: in a couple of places we assume that * sizeof(ipfw_insn) == sizeof(u_int32_t) * this needs to be fixed. * */ typedef struct _ipfw_insn { /* template for instructions */ u_int8_t opcode; u_int8_t len; /* number of 32-bit words */ #define F_NOT 0x80 #define F_OR 0x40 #define F_LEN_MASK 0x3f #define F_LEN(cmd) ((cmd)->len & F_LEN_MASK) u_int16_t arg1; } ipfw_insn; /* * The F_INSN_SIZE(type) computes the size, in 4-byte words, of * a given type. */ #define F_INSN_SIZE(t) ((sizeof (t))/sizeof(u_int32_t)) /* * This is used to store an array of 16-bit entries (ports etc.) */ typedef struct _ipfw_insn_u16 { ipfw_insn o; u_int16_t ports[2]; /* there may be more */ } ipfw_insn_u16; /* * This is used to store an array of 32-bit entries * (uid, single IPv4 addresses etc.) */ typedef struct _ipfw_insn_u32 { ipfw_insn o; u_int32_t d[1]; /* one or more */ } ipfw_insn_u32; /* * This is used to store IP addr-mask pairs. */ typedef struct _ipfw_insn_ip { ipfw_insn o; struct in_addr addr; struct in_addr mask; } ipfw_insn_ip; /* * This is used to forward to a given address (ip). */ typedef struct _ipfw_insn_sa { ipfw_insn o; struct sockaddr_in sa; } ipfw_insn_sa; /* * This is used to forward to a given address (ipv6). */ typedef struct _ipfw_insn_sa6 { ipfw_insn o; struct sockaddr_in6 sa; } ipfw_insn_sa6; /* * This is used for MAC addr-mask pairs. */ typedef struct _ipfw_insn_mac { ipfw_insn o; u_char addr[12]; /* dst[6] + src[6] */ u_char mask[12]; /* dst[6] + src[6] */ } ipfw_insn_mac; /* * This is used for interface match rules (recv xx, xmit xx). */ typedef struct _ipfw_insn_if { ipfw_insn o; union { struct in_addr ip; int glob; } p; char name[IFNAMSIZ]; } ipfw_insn_if; /* * This is used for storing an altq queue id number. */ typedef struct _ipfw_insn_altq { ipfw_insn o; u_int32_t qid; } ipfw_insn_altq; /* * This is used for limit rules. */ typedef struct _ipfw_insn_limit { ipfw_insn o; u_int8_t _pad; u_int8_t limit_mask; /* combination of DYN_* below */ #define DYN_SRC_ADDR 0x1 #define DYN_SRC_PORT 0x2 #define DYN_DST_ADDR 0x4 #define DYN_DST_PORT 0x8 u_int16_t conn_limit; } ipfw_insn_limit; /* * This is used for log instructions. */ typedef struct _ipfw_insn_log { ipfw_insn o; u_int32_t max_log; /* how many do we log -- 0 = all */ u_int32_t log_left; /* how many left to log */ } ipfw_insn_log; /* * Data structures required by both ipfw(8) and ipfw(4) but not part of the * management API are protected by IPFW_INTERNAL. */ #ifdef IPFW_INTERNAL /* Server pool support (LSNAT). */ struct cfg_spool { LIST_ENTRY(cfg_spool) _next; /* chain of spool instances */ struct in_addr addr; u_short port; }; #endif /* Redirect modes id. */ #define REDIR_ADDR 0x01 #define REDIR_PORT 0x02 #define REDIR_PROTO 0x04 #ifdef IPFW_INTERNAL /* Nat redirect configuration. */ struct cfg_redir { LIST_ENTRY(cfg_redir) _next; /* chain of redir instances */ u_int16_t mode; /* type of redirect mode */ struct in_addr laddr; /* local ip address */ struct in_addr paddr; /* public ip address */ struct in_addr raddr; /* remote ip address */ u_short lport; /* local port */ u_short pport; /* public port */ u_short rport; /* remote port */ u_short pport_cnt; /* number of public ports */ u_short rport_cnt; /* number of remote ports */ int proto; /* protocol: tcp/udp */ struct alias_link **alink; /* num of entry in spool chain */ u_int16_t spool_cnt; /* chain of spool instances */ LIST_HEAD(spool_chain, cfg_spool) spool_chain; }; #endif #define NAT_BUF_LEN 1024 #ifdef IPFW_INTERNAL /* Nat configuration data struct. */ struct cfg_nat { /* chain of nat instances */ LIST_ENTRY(cfg_nat) _next; int id; /* nat id */ struct in_addr ip; /* nat ip address */ char if_name[IF_NAMESIZE]; /* interface name */ int mode; /* aliasing mode */ struct libalias *lib; /* libalias instance */ /* number of entry in spool chain */ int redir_cnt; /* chain of redir instances */ LIST_HEAD(redir_chain, cfg_redir) redir_chain; }; #endif #define SOF_NAT sizeof(struct cfg_nat) #define SOF_REDIR sizeof(struct cfg_redir) #define SOF_SPOOL sizeof(struct cfg_spool) /* Nat command. */ typedef struct _ipfw_insn_nat { ipfw_insn o; struct cfg_nat *nat; } ipfw_insn_nat; /* Apply ipv6 mask on ipv6 addr */ #define APPLY_MASK(addr,mask) \ (addr)->__u6_addr.__u6_addr32[0] &= (mask)->__u6_addr.__u6_addr32[0]; \ (addr)->__u6_addr.__u6_addr32[1] &= (mask)->__u6_addr.__u6_addr32[1]; \ (addr)->__u6_addr.__u6_addr32[2] &= (mask)->__u6_addr.__u6_addr32[2]; \ (addr)->__u6_addr.__u6_addr32[3] &= (mask)->__u6_addr.__u6_addr32[3]; /* Structure for ipv6 */ typedef struct _ipfw_insn_ip6 { ipfw_insn o; struct in6_addr addr6; struct in6_addr mask6; } ipfw_insn_ip6; /* Used to support icmp6 types */ typedef struct _ipfw_insn_icmp6 { ipfw_insn o; uint32_t d[7]; /* XXX This number si related to the netinet/icmp6.h * define ICMP6_MAXTYPE * as follows: n = ICMP6_MAXTYPE/32 + 1 * Actually is 203 */ } ipfw_insn_icmp6; /* * Here we have the structure representing an ipfw rule. * * It starts with a general area (with link fields and counters) * followed by an array of one or more instructions, which the code * accesses as an array of 32-bit values. * * Given a rule pointer r: * * r->cmd is the start of the first instruction. * ACTION_PTR(r) is the start of the first action (things to do * once a rule matched). * * When assembling instruction, remember the following: * * + if a rule has a "keep-state" (or "limit") option, then the * first instruction (at r->cmd) MUST BE an O_PROBE_STATE * + if a rule has a "log" option, then the first action * (at ACTION_PTR(r)) MUST be O_LOG * + if a rule has an "altq" option, it comes after "log" * + if a rule has an O_TAG option, it comes after "log" and "altq" * * NOTE: we use a simple linked list of rules because we never need * to delete a rule without scanning the list. We do not use * queue(3) macros for portability and readability. */ struct ip_fw { #ifdef _X64EMU int32_t pad1; #endif struct ip_fw *x_next; /* linked list of rules */ #ifdef _X64EMU int32_t pad2; #endif struct ip_fw *next_rule; /* ptr to next [skipto] rule */ /* 'next_rule' is used to pass up 'set_disable' status */ uint16_t act_ofs; /* offset of action in 32-bit units */ uint16_t cmd_len; /* # of 32-bit words in cmd */ uint16_t rulenum; /* rule number */ uint8_t set; /* rule set (0..31) */ #define RESVD_SET 31 /* set for default and persistent rules */ uint8_t _pad; /* padding */ uint32_t id; /* rule id */ /* These fields are present in all rules. */ uint64_t pcnt; /* Packet counter */ uint64_t bcnt; /* Byte counter */ uint32_t timestamp; /* tv_sec of last match */ ipfw_insn cmd[1]; /* storage for commands */ }; #define ACTION_PTR(rule) \ (ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) ) #define RULESIZE(rule) (sizeof(struct ip_fw) + \ ((struct ip_fw *)(rule))->cmd_len * 4 - 4) #if 1 // should be moved to in.h /* * This structure is used as a flow mask and a flow id for various * parts of the code. * addr_type is used in userland and kernel to mark the address type. * fib is used in the kernel to record the fib in use. * _flags is used in the kernel to store tcp flags for dynamic rules. */ struct ipfw_flow_id { uint32_t dst_ip; uint32_t src_ip; uint16_t dst_port; uint16_t src_port; uint8_t fib; uint8_t proto; uint8_t _flags; /* protocol-specific flags */ uint8_t addr_type; /* 4=ip4, 6=ip6, 1=ether ? */ struct in6_addr dst_ip6; struct in6_addr src_ip6; uint32_t flow_id6; uint32_t extra; /* queue/pipe or frag_id */ }; #endif #define IS_IP6_FLOW_ID(id) ((id)->addr_type == 6) /* * Dynamic ipfw rule. */ typedef struct _ipfw_dyn_rule ipfw_dyn_rule; struct _ipfw_dyn_rule { ipfw_dyn_rule *next; /* linked list of rules. */ struct ip_fw *rule; /* pointer to rule */ /* 'rule' is used to pass up the rule number (from the parent) */ ipfw_dyn_rule *parent; /* pointer to parent rule */ u_int64_t pcnt; /* packet match counter */ u_int64_t bcnt; /* byte match counter */ struct ipfw_flow_id id; /* (masked) flow id */ u_int32_t expire; /* expire time */ u_int32_t bucket; /* which bucket in hash table */ u_int32_t state; /* state of this rule (typically a * combination of TCP flags) */ u_int32_t ack_fwd; /* most recent ACKs in forward */ u_int32_t ack_rev; /* and reverse directions (used */ /* to generate keepalives) */ u_int16_t dyn_type; /* rule type */ u_int16_t count; /* refcount */ }; /* * Definitions for IP option names. */ #define IP_FW_IPOPT_LSRR 0x01 #define IP_FW_IPOPT_SSRR 0x02 #define IP_FW_IPOPT_RR 0x04 #define IP_FW_IPOPT_TS 0x08 /* * Definitions for TCP option names. */ #define IP_FW_TCPOPT_MSS 0x01 #define IP_FW_TCPOPT_WINDOW 0x02 #define IP_FW_TCPOPT_SACK 0x04 #define IP_FW_TCPOPT_TS 0x08 #define IP_FW_TCPOPT_CC 0x10 #define ICMP_REJECT_RST 0x100 /* fake ICMP code (send a TCP RST) */ #define ICMP6_UNREACH_RST 0x100 /* fake ICMPv6 code (send a TCP RST) */ /* * These are used for lookup tables. */ #define IPFW_TABLE_CIDR 1 /* Table for holding IPv4/IPv6 prefixes */ #define IPFW_TABLE_INTERFACE 2 /* Table for holding interface names */ #define IPFW_TABLE_MAXTYPE 2 /* Maximum valid number */ typedef struct _ipfw_table_entry { in_addr_t addr; /* network address */ u_int32_t value; /* value */ u_int16_t tbl; /* table number */ u_int8_t masklen; /* mask length */ } ipfw_table_entry; typedef struct _ipfw_table_xentry { uint16_t len; /* Total entry length */ uint8_t type; /* entry type */ uint8_t masklen; /* mask length */ uint16_t tbl; /* table number */ uint32_t value; /* value */ union { /* Longest field needs to be aligned by 4-byte boundary */ struct in6_addr addr6; /* IPv6 address */ char iface[IF_NAMESIZE]; /* interface name */ } k; } ipfw_table_xentry; typedef struct _ipfw_table { u_int32_t size; /* size of entries in bytes */ u_int32_t cnt; /* # of entries */ u_int16_t tbl; /* table number */ ipfw_table_entry ent[0]; /* entries */ } ipfw_table; typedef struct _ipfw_xtable { ip_fw3_opheader opheader; /* eXtended tables are controlled via IP_FW3 */ uint32_t size; /* size of entries in bytes */ uint32_t cnt; /* # of entries */ uint16_t tbl; /* table number */ uint8_t type; /* table type */ ipfw_table_xentry xent[0]; /* entries */ } ipfw_xtable; #endif /* _IPFW2_H */ ================================================ FILE: sys/netinet/ip_icmp.h ================================================ /* * additional define not present in linux * should go in glue.h */ #ifndef _NETINET_IP_ICMP_H_ #define _NETINET_IP_ICMP_H_ #define ICMP_MAXTYPE 40 /* defined as 18 in compat.h */ #define ICMP_ROUTERSOLICIT 10 /* router solicitation */ #define ICMP_TSTAMP 13 /* timestamp request */ #define ICMP_IREQ 15 /* information request */ #define ICMP_MASKREQ 17 /* address mask request */ #define ICMP_UNREACH_HOST 1 /* bad host */ #define ICMP_UNREACH 3 /* dest unreachable, codes: */ #endif /* _NETINET_IP_ICMP_H_ */ ================================================ FILE: sys/netinet/ipfw/dn_heap.c ================================================ /*- * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Binary heap and hash tables, used in dummynet * * $Id: dn_heap.c 11480 2012-07-31 08:02:00Z luigi $ */ #include #include #ifdef _KERNEL __FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/dn_heap.c 203279 2010-01-31 12:20:29Z luigi $"); #include #include #include #include #ifndef log #define log(x, arg...) #endif #else /* !_KERNEL */ #include #include #include #include #include "dn_heap.h" #define log(x, arg...) fprintf(stderr, ## arg) #define panic(x...) fprintf(stderr, ## x), exit(1) #define MALLOC_DEFINE(a, b, c) static void *my_malloc(int s) { return malloc(s); } static void my_free(void *p) { free(p); } #define malloc(s, t, w) my_malloc(s) #define free(p, t) my_free(p) #endif /* !_KERNEL */ MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap"); /* * Heap management functions. * * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. * Some macros help finding parent/children so we can optimize them. * * heap_init() is called to expand the heap when needed. * Increment size in blocks of 16 entries. * Returns 1 on error, 0 on success */ #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) #define HEAP_LEFT(x) ( (x)+(x) + 1 ) #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } #define HEAP_INCREMENT 15 static int heap_resize(struct dn_heap *h, unsigned int new_size) { struct dn_heap_entry *p; if (h->size >= new_size ) /* have enough room */ return 0; #if 1 /* round to the next power of 2 */ new_size |= new_size >> 1; new_size |= new_size >> 2; new_size |= new_size >> 4; new_size |= new_size >> 8; new_size |= new_size >> 16; #else new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT; #endif p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT); if (p == NULL) { printf("--- %s, resize %d failed\n", __func__, new_size ); return 1; /* error */ } if (h->size > 0) { bcopy(h->p, p, h->size * sizeof(*p) ); free(h->p, M_DN_HEAP); } h->p = p; h->size = new_size; return 0; } int heap_init(struct dn_heap *h, int size, int ofs) { if (heap_resize(h, size)) return 1; h->elements = 0; h->ofs = ofs; return 0; } /* * Insert element in heap. Normally, p != NULL, we insert p in * a new position and bubble up. If p == NULL, then the element is * already in place, and key is the position where to start the * bubble-up. * Returns 1 on failure (cannot allocate new heap entry) * * If ofs > 0 the position (index, int) of the element in the heap is * also stored in the element itself at the given offset in bytes. */ #define SET_OFFSET(h, i) do { \ if (h->ofs > 0) \ *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i; \ } while (0) /* * RESET_OFFSET is used for sanity checks. It sets ofs * to an invalid value. */ #define RESET_OFFSET(h, i) do { \ if (h->ofs > 0) \ *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16; \ } while (0) int heap_insert(struct dn_heap *h, uint64_t key1, void *p) { int son = h->elements; //log("%s key %llu p %p\n", __FUNCTION__, key1, p); if (p == NULL) { /* data already there, set starting point */ son = key1; } else { /* insert new element at the end, possibly resize */ son = h->elements; if (son == h->size) /* need resize... */ // XXX expand by 16 or so if (heap_resize(h, h->elements+16) ) return 1; /* failure... */ h->p[son].object = p; h->p[son].key = key1; h->elements++; } /* make sure that son >= father along the path */ while (son > 0) { int father = HEAP_FATHER(son); struct dn_heap_entry tmp; if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) break; /* found right position */ /* son smaller than father, swap and repeat */ HEAP_SWAP(h->p[son], h->p[father], tmp); SET_OFFSET(h, son); son = father; } SET_OFFSET(h, son); return 0; } /* * remove top element from heap, or obj if obj != NULL */ void heap_extract(struct dn_heap *h, void *obj) { int child, father, max = h->elements - 1; if (max < 0) { printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h); return; } if (obj == NULL) father = 0; /* default: move up smallest child */ else { /* extract specific element, index is at offset */ if (h->ofs <= 0) panic("%s: extract from middle not set on %p\n", __FUNCTION__, h); father = *((int *)((char *)obj + h->ofs)); if (father < 0 || father >= h->elements) { panic("%s: father %d out of bound 0..%d\n", __FUNCTION__, father, h->elements); } } /* * below, father is the index of the empty element, which * we replace at each step with the smallest child until we * reach the bottom level. */ // XXX why removing RESET_OFFSET increases runtime by 10% ? RESET_OFFSET(h, father); while ( (child = HEAP_LEFT(father)) <= max ) { if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) child++; /* take right child, otherwise left */ h->p[father] = h->p[child]; SET_OFFSET(h, father); father = child; } h->elements--; if (father != max) { /* * Fill hole with last entry and bubble up, * reusing the insert code */ h->p[father] = h->p[max]; heap_insert(h, father, NULL); } } #if 0 /* * change object position and update references * XXX this one is never used! */ static void heap_move(struct dn_heap *h, uint64_t new_key, void *object) { int temp, i, max = h->elements-1; struct dn_heap_entry *p, buf; if (h->ofs <= 0) panic("cannot move items on this heap"); p = h->p; /* shortcut */ i = *((int *)((char *)object + h->ofs)); if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */ p[i].key = new_key; for (; i>0 && DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key); i = temp ) { /* bubble up */ HEAP_SWAP(p[i], p[temp], buf); SET_OFFSET(h, i); } } else { /* must move down */ p[i].key = new_key; while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */ if (temp != max && DN_KEY_LT(p[temp+1].key, p[temp].key)) temp++; /* select child with min key */ if (DN_KEY_LT(>p[temp].key, new_key)) { /* go down */ HEAP_SWAP(p[i], p[temp], buf); SET_OFFSET(h, i); } else break; i = temp; } } SET_OFFSET(h, i); } #endif /* heap_move, unused */ /* * heapify() will reorganize data inside an array to maintain the * heap property. It is needed when we delete a bunch of entries. */ static void heapify(struct dn_heap *h) { int i; for (i = 0; i < h->elements; i++ ) heap_insert(h, i , NULL); } int heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t), uintptr_t arg) { int i, ret, found; for (i = found = 0 ; i < h->elements ;) { ret = fn(h->p[i].object, arg); if (ret & HEAP_SCAN_DEL) { h->elements-- ; h->p[i] = h->p[h->elements] ; found++ ; } else i++ ; if (ret & HEAP_SCAN_END) break; } if (found) heapify(h); return found; } /* * cleanup the heap and free data structure */ void heap_free(struct dn_heap *h) { if (h->size >0 ) free(h->p, M_DN_HEAP); bzero(h, sizeof(*h) ); } /* * hash table support. */ struct dn_ht { int buckets; /* how many buckets, really buckets - 1*/ int entries; /* how many entries */ int ofs; /* offset of link field */ uint32_t (*hash)(uintptr_t, int, void *arg); int (*match)(void *_el, uintptr_t key, int, void *); void *(*newh)(uintptr_t, int, void *); void **ht; /* bucket heads */ }; /* * Initialize, allocating bucket pointers inline. * Recycle previous record if possible. * If the 'newh' function is not supplied, we assume that the * key passed to ht_find is the same object to be stored in. */ struct dn_ht * dn_ht_init(struct dn_ht *ht, int buckets, int ofs, uint32_t (*h)(uintptr_t, int, void *), int (*match)(void *, uintptr_t, int, void *), void *(*newh)(uintptr_t, int, void *)) { int l; /* * Notes about rounding bucket size to a power of two. * Given the original bucket size, we compute the nearest lower and * higher power of two, minus 1 (respectively b_min and b_max) because * this value will be used to do an AND with the index returned * by hash function. * To choice between these two values, the original bucket size is * compared with b_min. If the original size is greater than 4/3 b_min, * we round the bucket size to b_max, else to b_min. * This ratio try to round to the nearest power of two, advantaging * the greater size if the different between two power is relatively * big. * Rounding the bucket size to a power of two avoid the use of * module when calculating the correct bucket. * The ht->buckets variable store the bucket size - 1 to simply * do an AND between the index returned by hash function and ht->bucket * instead of a module. */ int b_min; /* min buckets */ int b_max; /* max buckets */ int b_ori; /* original buckets */ if (h == NULL || match == NULL) { printf("--- missing hash or match function"); return NULL; } if (buckets < 1 || buckets > 65536) return NULL; b_ori = buckets; /* calculate next power of 2, - 1*/ buckets |= buckets >> 1; buckets |= buckets >> 2; buckets |= buckets >> 4; buckets |= buckets >> 8; buckets |= buckets >> 16; b_max = buckets; /* Next power */ b_min = buckets >> 1; /* Previous power */ /* Calculate the 'nearest' bucket size */ if (b_min * 4000 / 3000 < b_ori) buckets = b_max; else buckets = b_min; if (ht) { /* see if we can reuse */ if (buckets <= ht->buckets) { ht->buckets = buckets; } else { /* free pointers if not allocated inline */ if (ht->ht != (void *)(ht + 1)) free(ht->ht, M_DN_HEAP); free(ht, M_DN_HEAP); ht = NULL; } } if (ht == NULL) { /* Allocate buckets + 1 entries because buckets is use to * do the AND with the index returned by hash function */ l = sizeof(*ht) + (buckets + 1) * sizeof(void **); ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO); } if (ht) { ht->ht = (void **)(ht + 1); ht->buckets = buckets; ht->ofs = ofs; ht->hash = h; ht->match = match; ht->newh = newh; } return ht; } /* dummy callback for dn_ht_free to unlink all */ static int do_del(void *obj, void *arg) { return DNHT_SCAN_DEL; } void dn_ht_free(struct dn_ht *ht, int flags) { if (ht == NULL) return; if (flags & DNHT_REMOVE) { (void)dn_ht_scan(ht, do_del, NULL); } else { if (ht->ht && ht->ht != (void *)(ht + 1)) free(ht->ht, M_DN_HEAP); free(ht, M_DN_HEAP); } } int dn_ht_entries(struct dn_ht *ht) { return ht ? ht->entries : 0; } /* * Helper function to scan a bucket in the hash table, it * can only be called on a non-empty bucket for a valid table. * * In lookup and scan, consider ht->ht[i] as pointing to the tail * of the queue (head is NEXTP(tail). The 'empty' value is irrelevant. * While searching, start analysing p = head, end when p == tail. * Note that 'tail' is a cache of the _original_ ht->ht[i] * and is used to check for loop termination. If you remove * it, you must also adjust 'p' when deleting the 'tail' element. */ #define NEXT(_h, _p) *((void **)((char *)(_p) + (_h)->ofs)) static int dn_ht_scan_body(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *), void *arg) { int ret, found = 0, i = *bucket; void *tail, *pp, *p, *nextp; pp = tail = ht->ht[i]; do { p = NEXT(ht, pp); nextp = NEXT(ht, p); ret = fn(p, arg); if ((ret & DNHT_SCAN_DEL) == 0) { pp = p; /* prepare for next loop */ } else { found++; ht->entries--; /* skip current element */ if (pp != p) /* pp == p implies p == tail */ NEXT(ht, pp) = nextp; if (p == tail) ht->ht[i] = (pp != p) ? pp : NULL; } if (ret & DNHT_SCAN_END) { /* Update ht->ht[i] before returning */ ht->ht[i] = (ht->ht[i] == NULL) ? NULL : pp; return found; } } while (p != tail); (*bucket)++; return found; } /* * lookup and optionally create or delete element. * This is an optimized version of the scan so it is coded * inline. */ void * dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg) { int i, found; void *tail, *pp, *p; /* pp is the prev element, pp is current */ if (ht == NULL) /* easy on an empty hash */ return NULL; i = (ht->buckets == 1) ? 0 : (ht->hash(key, flags, arg) & ht->buckets); pp = tail = ht->ht[i]; if (tail) { /* non empty, try a lookup */ do { p = NEXT(ht, pp); found = (flags & DNHT_MATCH_PTR) ? key == (uintptr_t)p : ht->match(p, key, flags, arg); if (!found) continue; if (flags & DNHT_REMOVE) { ht->entries--; if (p != pp) /* skip current element */ NEXT(ht, pp) = NEXT(ht, p); if (p == tail) ht->ht[i] = (pp != p) ? pp : NULL; } return p; } while ( (pp = p) != tail); } /* not found */ if ((flags & DNHT_INSERT) == 0) return NULL; p = ht->newh ? ht->newh(key, flags, arg) : (void *)key; if (p) { ht->entries++; if (tail == NULL) { ht->ht[i] = NEXT(ht, p) = p; } else { NEXT(ht, p) = NEXT(ht, tail); NEXT(ht, tail) = p; } } return p; } /* * do a scan with the option to delete the object. * Similar to the lookup, but the match function is different, * and we extract 'next' before running the callback because * the element may be destroyed there. */ int dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg) { int i, bucket, found = 0; if (ht == NULL || fn == NULL) return 0; for (i = 0; i <= ht->buckets; i++) { if (ht->ht[i] == NULL) continue; /* empty bucket */ bucket = i; found += dn_ht_scan_body(ht, &bucket, fn, arg); if (bucket == i) /* early exit */ return found; } return found; } /* * Similar to dn_ht_scan(), except that the scan is performed only * in the bucket 'bucket'. The function returns a correct bucket number if * the original is invalid. * If the callback returns DNHT_SCAN_END, the function move the ht->ht[i] * pointer to the last entry processed. Moreover, the bucket number passed * by caller is decremented, because usually the caller increment it. */ int dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *), void *arg) { if (ht == NULL || fn == NULL) return 0; if (*bucket > ht->buckets || *bucket < 0) *bucket = 0; if (ht->ht[*bucket] == NULL) { (*bucket)++; return 0; } else return dn_ht_scan_body(ht, bucket, fn, arg); } ================================================ FILE: sys/netinet/ipfw/dn_heap.h ================================================ /*- * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Binary heap and hash tables, header file * * $FreeBSD: head/sys/netinet/ipfw/dn_heap.h 204865 2010-03-08 11:27:08Z luigi $ */ #ifndef _IP_DN_HEAP_H #define _IP_DN_HEAP_H #define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) #define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) /* * This module implements a binary heap supporting random extraction. * * A heap entry contains an uint64_t key and a pointer to object. * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b' * * The heap is a struct dn_heap plus a dynamically allocated * array of dn_heap_entry entries. 'size' represents the size of * the array, 'elements' count entries in use. The topmost * element has the smallest key. * The heap supports ordered insert, and extract from the top. * To extract an object from the middle of the heap, we the object * must reserve an 'int32_t' to store the position of the object * in the heap itself, and the location of this field must be * passed as an argument to heap_init() -- use -1 if the feature * is not used. */ struct dn_heap_entry { uint64_t key; /* sorting key, smallest comes first */ void *object; /* object pointer */ }; struct dn_heap { int size; /* the size of the array */ int elements; /* elements in use */ int ofs; /* offset in the object of heap index */ struct dn_heap_entry *p; /* array of "size" entries */ }; enum { HEAP_SCAN_DEL = 1, HEAP_SCAN_END = 2, }; /* * heap_init() reinitializes the heap setting the size and the offset * of the index for random extraction (use -1 if not used). * The 'elements' counter is set to 0. * * SET_HEAP_OFS() indicates where, in the object, is stored the index * for random extractions from the heap. * * heap_free() frees the memory associated to a heap. * * heap_insert() adds a key-pointer pair to the heap * * HEAP_TOP() returns a pointer to the top element of the heap, * but makes no checks on its existance (XXX should we change ?) * * heap_extract() removes the entry at the top, returing the pointer. * (the key should have been read before). * * heap_scan() invokes a callback on each entry of the heap. * The callback can return a combination of HEAP_SCAN_DEL and * HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must * be removed, and HEAP_SCAN_END means to terminate the scan. * heap_scan() returns the number of elements removed. * Because the order is not guaranteed, we should use heap_scan() * only as a last resort mechanism. */ #define HEAP_TOP(h) ((h)->p) #define SET_HEAP_OFS(h, n) do { (h)->ofs = n; } while (0) int heap_init(struct dn_heap *h, int size, int ofs); int heap_insert(struct dn_heap *h, uint64_t key1, void *p); void heap_extract(struct dn_heap *h, void *obj); void heap_free(struct dn_heap *h); int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t); /*------------------------------------------------------ * This module implements a generic hash table with support for * running callbacks on the entire table. To avoid allocating * memory during hash table operations, objects must reserve * space for a link field. XXX if the heap is moderately full, * an SLIST suffices, and we can tolerate the cost of a hash * computation on each removal. * * dn_ht_init() initializes the table, setting the number of * buckets, the offset of the link field, the main callbacks. * Callbacks are: * * hash(key, flags, arg) called to return a bucket index. * match(obj, key, flags, arg) called to determine if key * matches the current 'obj' in the heap * newh(key, flags, arg) optional, used to allocate a new * object during insertions. * * dn_ht_free() frees the heap or unlink elements. * DNHT_REMOVE unlink elements, 0 frees the heap. * You need two calls to do both. * * dn_ht_find() is the main lookup function, which can also be * used to insert or delete elements in the hash table. * The final 'arg' is passed to all callbacks. * * dn_ht_scan() is used to invoke a callback on all entries of * the heap, or possibly on just one bucket. The callback * is invoked with a pointer to the object, and must return * one of DNHT_SCAN_DEL or DNHT_SCAN_END to request the * removal of the object from the heap and the end of the * scan, respectively. * * dn_ht_scan_bucket() is similar to dn_ht_scan(), except that it scans * only the specific bucket of the table. The bucket is a in-out * parameter and return a valid bucket number if the original * is invalid. * * A combination of flags can be used to modify the operation * of the dn_ht_find(), and of the callbacks: * * DNHT_KEY_IS_OBJ means the key is the object pointer. * It is usally of interest for the hash and match functions. * * DNHT_MATCH_PTR during a lookup, match pointers instead * of calling match(). Normally used when removing specific * entries. Does not imply KEY_IS_OBJ as the latter _is_ used * by the match function. * * DNHT_INSERT insert the element if not found. * Calls new() to allocates a new object unless * DNHT_KEY_IS_OBJ is set. * * DNHT_UNIQUE only insert if object not found. * XXX should it imply DNHT_INSERT ? * * DNHT_REMOVE remove objects if we find them. */ struct dn_ht; /* should be opaque */ struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs, uint32_t (*hash)(uintptr_t, int, void *), int (*match)(void *, uintptr_t, int, void *), void *(*newh)(uintptr_t, int, void *)); void dn_ht_free(struct dn_ht *, int flags); void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *); int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *); int dn_ht_scan_bucket(struct dn_ht *, int * , int (*)(void *, void *), void *); int dn_ht_entries(struct dn_ht *); enum { /* flags values. * first two are returned by the scan callback to indicate * to delete the matching element or to end the scan */ DNHT_SCAN_DEL = 0x0001, DNHT_SCAN_END = 0x0002, DNHT_KEY_IS_OBJ = 0x0004, /* key is the obj pointer */ DNHT_MATCH_PTR = 0x0008, /* match by pointer, not match() */ DNHT_INSERT = 0x0010, /* insert if not found */ DNHT_UNIQUE = 0x0020, /* report error if already there */ DNHT_REMOVE = 0x0040, /* remove on find or dn_ht_free */ }; #endif /* _IP_DN_HEAP_H */ ================================================ FILE: sys/netinet/ipfw/dn_sched.h ================================================ /* * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * The API to write a packet scheduling algorithm for dummynet. * * $FreeBSD: head/sys/netinet/ipfw/dn_sched.h 204591 2010-03-02 17:40:48Z luigi $ */ #ifndef _DN_SCHED_H #define _DN_SCHED_H #define DN_MULTIQUEUE 0x01 /* * Descriptor for a scheduling algorithm. * Contains all function pointers for a given scheduler * This is typically created when a module is loaded, and stored * in a global list of schedulers. */ struct dn_alg { uint32_t type; /* the scheduler type */ const char *name; /* scheduler name */ uint32_t flags; /* DN_MULTIQUEUE if supports multiple queues */ /* * The following define the size of 3 optional data structures * that may need to be allocated at runtime, and are appended * to each of the base data structures: scheduler, sched.inst, * and queue. We don't have a per-flowset structure. */ /* + parameters attached to the template, e.g. * default queue sizes, weights, quantum size, and so on; */ size_t schk_datalen; /* + per-instance parameters, such as timestamps, * containers for queues, etc; */ size_t si_datalen; size_t q_datalen; /* per-queue parameters (e.g. S,F) */ /* * Methods implemented by the scheduler: * enqueue enqueue packet 'm' on scheduler 's', queue 'q'. * q is NULL for !MULTIQUEUE. * Return 0 on success, 1 on drop (packet consumed anyways). * Note that q should be interpreted only as a hint * on the flow that the mbuf belongs to: while a * scheduler will normally enqueue m into q, it is ok * to leave q alone and put the mbuf elsewhere. * This function is called in two cases: * - when a new packet arrives to the scheduler; * - when a scheduler is reconfigured. In this case the * call is issued by the new_queue callback, with a * non empty queue (q) and m pointing to the first * mbuf in the queue. For this reason, the function * should internally check for (m != q->mq.head) * before calling dn_enqueue(). * * dequeue Called when scheduler instance 's' can * dequeue a packet. Return NULL if none are available. * XXX what about non work-conserving ? * * config called on 'sched X config ...', normally writes * in the area of size sch_arg * * destroy called on 'sched delete', frees everything * in sch_arg (other parts are handled by more specific * functions) * * new_sched called when a new instance is created, e.g. * to create the local queue for !MULTIQUEUE, set V or * copy parameters for WFQ, and so on. * * free_sched called when deleting an instance, cleans * extra data in the per-instance area. * * new_fsk called when a flowset is linked to a scheduler, * e.g. to validate parameters such as weights etc. * free_fsk when a flowset is unlinked from a scheduler. * (probably unnecessary) * * new_queue called to set the per-queue parameters, * e.g. S and F, adjust sum of weights in the parent, etc. * * The new_queue callback is normally called from when * creating a new queue. In some cases (such as a * scheduler change or reconfiguration) it can be called * with a non empty queue. In this case, the queue * In case of non empty queue, the new_queue callback could * need to call the enqueue function. In this case, * the callback should eventually call enqueue() passing * as m the first element in the queue. * * free_queue actions related to a queue removal, e.g. undo * all the above. If the queue has data in it, also remove * from the scheduler. This can e.g. happen during a reconfigure. * If safe == 1 remove the queue only if the scheduler no longer * need it, otherwise delete it even if the scheduler is using * it. Usually, the flag safe is set when the drain routine is * running to delete idle queues. */ int (*enqueue)(struct dn_sch_inst *, struct dn_queue *, struct mbuf *); struct mbuf * (*dequeue)(struct dn_sch_inst *); int (*config)(struct dn_schk *); int (*destroy)(struct dn_schk*); int (*new_sched)(struct dn_sch_inst *); int (*free_sched)(struct dn_sch_inst *); int (*new_fsk)(struct dn_fsk *f); int (*free_fsk)(struct dn_fsk *f); int (*new_queue)(struct dn_queue *q); int (*free_queue)(struct dn_queue *q, int safe); /* run-time fields */ int ref_count; /* XXX number of instances in the system */ SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */ }; /* MSVC does not support initializers so we need this ugly macro */ #ifdef _WIN32 #define _SI(fld) #else #define _SI(fld) fld #endif /* * Additionally, dummynet exports some functions and macros * to be used by schedulers: */ void dn_free_pkts(struct mbuf *mnext); int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop); /* bound a variable between min and max */ int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg); /* * Extract the head of a queue, update stats. Must be the very last * thing done on a dequeue as the queue itself may go away. */ static __inline struct mbuf* dn_dequeue(struct dn_queue *q) { struct mbuf *m = q->mq.head; if (m == NULL) return NULL; q->mq.head = m->m_nextpkt; /* Update stats for the queue */ q->ni.length--; q->ni.len_bytes -= m->m_pkthdr.len; /* When the queue becomes idle, update idle_time (used by RED) * and also update the count of idle queues (for garbage collection). */ if (q->ni.length == 0) { dn_cfg.idle_queue++; q->q_time = dn_cfg.curr_time; } if (q->_si) { struct dn_flow *ni = &(q->_si->ni); /* update stats for the scheduler instance, and keep track * of idle scheduler instances if needed */ ni->length--; ni->len_bytes -= m->m_pkthdr.len; if (ni->length == 0) dn_cfg.idle_si++; } return m; } int dn_sched_modevent(module_t mod, int cmd, void *arg); #define DECLARE_DNSCHED_MODULE(name, dnsched) \ static moduledata_t name##_mod = { \ #name, dn_sched_modevent, dnsched \ }; \ DECLARE_MODULE(name, name##_mod, \ SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \ MODULE_DEPEND(name, dummynet, 3, 3, 3); #endif /* _DN_SCHED_H */ ================================================ FILE: sys/netinet/ipfw/dn_sched_fifo.c ================================================ /* * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id: dn_sched_fifo.c 11480 2012-07-31 08:02:00Z luigi $ */ #ifdef _KERNEL #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include /* ipfw_rule_ref */ #include /* flow_id */ #include #include #include #include #else #include #endif /* * This file implements a FIFO scheduler for a single queue. * The queue is allocated as part of the scheduler instance, * and there is a single flowset is in the template which stores * queue size and policy. * Enqueue and dequeue use the default library functions. */ static int fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m) { /* XXX if called with q != NULL and m=NULL, this is a * re-enqueue from an existing scheduler, which we should * handle. */ return dn_enqueue((struct dn_queue *)(si+1), m, 0); } static struct mbuf * fifo_dequeue(struct dn_sch_inst *si) { return dn_dequeue((struct dn_queue *)(si + 1)); } static int fifo_new_sched(struct dn_sch_inst *si) { /* This scheduler instance contains the queue */ struct dn_queue *q = (struct dn_queue *)(si + 1); set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); q->_si = si; q->fs = si->sched->fs; return 0; } static int fifo_free_sched(struct dn_sch_inst *si) { struct dn_queue *q = (struct dn_queue *)(si + 1); dn_free_pkts(q->mq.head); bzero(q, sizeof(*q)); return 0; } /* * FIFO scheduler descriptor * contains the type of the scheduler, the name, the size of extra * data structures, and function pointers. */ static struct dn_alg fifo_desc = { _SI( .type = ) DN_SCHED_FIFO, _SI( .name = ) "FIFO", _SI( .flags = ) 0, _SI( .schk_datalen = ) 0, _SI( .si_datalen = ) sizeof(struct dn_queue), _SI( .q_datalen = ) 0, _SI( .enqueue = ) fifo_enqueue, _SI( .dequeue = ) fifo_dequeue, _SI( .config = ) NULL, _SI( .destroy = ) NULL, _SI( .new_sched = ) fifo_new_sched, _SI( .free_sched = ) fifo_free_sched, _SI( .new_fsk = ) NULL, _SI( .free_fsk = ) NULL, _SI( .new_queue = ) NULL, _SI( .free_queue = ) NULL, }; DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc); ================================================ FILE: sys/netinet/ipfw/dn_sched_prio.c ================================================ /* * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id: dn_sched_prio.c 11480 2012-07-31 08:02:00Z luigi $ */ #ifdef _KERNEL #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include /* ipfw_rule_ref */ #include /* flow_id */ #include #include #include #include #else #include #endif #define DN_SCHED_PRIO 5 //XXX #if !defined(_KERNEL) || !defined(__linux__) #define test_bit(ix, pData) ((*pData) & (1<<(ix))) #define __set_bit(ix, pData) (*pData) |= (1<<(ix)) #define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) #endif #ifdef __MIPSEL__ #define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) #endif /* Size of the array of queues pointers. */ #define BITMAP_T unsigned long #define MAXPRIO (sizeof(BITMAP_T) * 8) /* * The scheduler instance contains an array of pointers to queues, * one for each priority, and a bitmap listing backlogged queues. */ struct prio_si { BITMAP_T bitmap; /* array bitmap */ struct dn_queue *q_array[MAXPRIO]; /* Array of queues pointers */ }; /* * If a queue with the same priority is already backlogged, use * that one instead of the queue passed as argument. */ static int prio_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) { struct prio_si *si = (struct prio_si *)(_si + 1); int prio = q->fs->fs.par[0]; if (test_bit(prio, &si->bitmap) == 0) { /* No queue with this priority, insert */ __set_bit(prio, &si->bitmap); si->q_array[prio] = q; } else { /* use the existing queue */ q = si->q_array[prio]; } if (dn_enqueue(q, m, 0)) return 1; return 0; } /* * Packets are dequeued only from the highest priority queue. * The function ffs() return the lowest bit in the bitmap that rapresent * the array index (-1) which contains the pointer to the highest priority * queue. * After the dequeue, if this queue become empty, it is index is removed * from the bitmap. * Scheduler is idle if the bitmap is empty * * NOTE: highest priority is 0, lowest is sched->max_prio_q */ static struct mbuf * prio_dequeue(struct dn_sch_inst *_si) { struct prio_si *si = (struct prio_si *)(_si + 1); struct mbuf *m; struct dn_queue *q; int prio; if (si->bitmap == 0) /* scheduler idle */ return NULL; prio = ffs(si->bitmap) - 1; /* Take the highest priority queue in the scheduler */ q = si->q_array[prio]; // assert(q) m = dn_dequeue(q); if (q->mq.head == NULL) { /* Queue is now empty, remove from scheduler * and mark it */ si->q_array[prio] = NULL; __clear_bit(prio, &si->bitmap); } return m; } static int prio_new_sched(struct dn_sch_inst *_si) { struct prio_si *si = (struct prio_si *)(_si + 1); bzero(si->q_array, sizeof(si->q_array)); si->bitmap = 0; return 0; } static int prio_new_fsk(struct dn_fsk *fs) { /* Check if the prioritiy is between 0 and MAXPRIO-1 */ ipdn_bound_var(&fs->fs.par[0], 0, 0, MAXPRIO - 1, "PRIO priority"); return 0; } static int prio_new_queue(struct dn_queue *q) { struct prio_si *si = (struct prio_si *)(q->_si + 1); int prio = q->fs->fs.par[0]; struct dn_queue *oldq; q->ni.oid.subtype = DN_SCHED_PRIO; if (q->mq.head == NULL) return 0; /* Queue already full, must insert in the scheduler or append * mbufs to existing queue. This partly duplicates prio_enqueue */ if (test_bit(prio, &si->bitmap) == 0) { /* No queue with this priority, insert */ __set_bit(prio, &si->bitmap); si->q_array[prio] = q; } else if ( (oldq = si->q_array[prio]) != q) { /* must append to the existing queue. * can simply append q->mq.head to q2->... * and add the counters to those of q2 */ oldq->mq.tail->m_nextpkt = q->mq.head; oldq->mq.tail = q->mq.tail; oldq->ni.length += q->ni.length; q->ni.length = 0; oldq->ni.len_bytes += q->ni.len_bytes; q->ni.len_bytes = 0; q->mq.tail = q->mq.head = NULL; } return 0; } static int prio_free_queue(struct dn_queue *q, int safe) { int prio = q->fs->fs.par[0]; struct prio_si *si = (struct prio_si *)(q->_si + 1); if (si->q_array[prio] == q) { si->q_array[prio] = NULL; __clear_bit(prio, &si->bitmap); } return 0; } static struct dn_alg prio_desc = { _SI( .type = ) DN_SCHED_PRIO, _SI( .name = ) "PRIO", _SI( .flags = ) DN_MULTIQUEUE, /* we need extra space in the si and the queue */ _SI( .schk_datalen = ) 0, _SI( .si_datalen = ) sizeof(struct prio_si), _SI( .q_datalen = ) 0, _SI( .enqueue = ) prio_enqueue, _SI( .dequeue = ) prio_dequeue, _SI( .config = ) NULL, _SI( .destroy = ) NULL, _SI( .new_sched = ) prio_new_sched, _SI( .free_sched = ) NULL, _SI( .new_fsk = ) prio_new_fsk, _SI( .free_fsk = ) NULL, _SI( .new_queue = ) prio_new_queue, _SI( .free_queue = ) prio_free_queue, }; DECLARE_DNSCHED_MODULE(dn_prio, &prio_desc); ================================================ FILE: sys/netinet/ipfw/dn_sched_qfq.c ================================================ /* * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id: dn_sched_qfq.c 11656 2012-08-07 08:39:06Z luigi $ */ #ifdef _KERNEL #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include /* ipfw_rule_ref */ #include /* flow_id */ #include #include #include #include #else #include #endif #ifdef QFQ_DEBUG struct qfq_sched; static void dump_sched(struct qfq_sched *q, const char *msg); #define NO(x) x #else #define NO(x) #endif #define DN_SCHED_QFQ 4 // XXX Where? typedef unsigned long bitmap; /* * bitmaps ops are critical. Some linux versions have __fls * and the bitmap ops. Some machines have ffs */ #if defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24)) int fls(unsigned int n) { int i = 0; for (i = 0; n > 0; n >>= 1, i++) ; return i; } #endif #if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24)) static inline unsigned long __fls(unsigned long word) { return fls(word) - 1; } #endif #if !defined(_KERNEL) || !defined(__linux__) #ifdef QFQ_DEBUG int test_bit(int ix, bitmap *p) { if (ix < 0 || ix > 31) D("bad index %d", ix); return *p & (1< 31) D("bad index %d", ix); *p |= (1< 31) D("bad index %d", ix); *p &= ~(1<index = 0 *.__grp->slot_shift where MIN_SLOT_SHIFT is derived by difference from the others. The max group index corresponds to Lmax/w_min, where Lmax=1<group mapping. Class weights are * in the range [1, QFQ_MAX_WEIGHT], we to map each class i to the * group with the smallest index that can support the L_i / r_i * configured for the class. * * grp->index is the index of the group; and grp->slot_shift * is the shift for the corresponding (scaled) sigma_i. * * When computing the group index, we do (len<i_wsum) #define IWSUM ((1< 0; } /* Round a precise timestamp to its slotted value. */ static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift) { return ts & ~((1ULL << shift) - 1); } /* return the pointer to the group with lowest index in the bitmap */ static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, unsigned long bitmap) { int index = ffs(bitmap) - 1; // zero-based return &q->groups[index]; } /* * Calculate a flow index, given its weight and maximum packet length. * index = log_2(maxlen/weight) but we need to apply the scaling. * This is used only once at flow creation. */ static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen) { uint64_t slot_size = (uint64_t)maxlen *inv_w; unsigned long size_map; int index = 0; size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT); if (!size_map) goto out; index = __fls(size_map) + 1; // basically a log_2() index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); if (index < 0) index = 0; out: ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index); return index; } /*---- end support functions ----*/ /*-------- API calls --------------------------------*/ /* * Validate and copy parameters from flowset. */ static int qfq_new_queue(struct dn_queue *_q) { struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); struct qfq_class *cl = (struct qfq_class *)_q; int i; uint32_t w; /* approximated weight */ /* import parameters from the flowset. They should be correct * already. */ w = _q->fs->fs.par[0]; cl->lmax = _q->fs->fs.par[1]; if (!w || w > QFQ_MAX_WEIGHT) { w = 1; D("rounding weight to 1"); } cl->inv_w = ONE_FP/w; w = ONE_FP/cl->inv_w; if (q->wsum + w > QFQ_MAX_WSUM) return EINVAL; i = qfq_calc_index(cl->inv_w, cl->lmax); cl->grp = &q->groups[i]; q->wsum += w; // XXX cl->S = q->V; ? // XXX compute q->i_wsum return 0; } /* remove an empty queue */ static int qfq_free_queue(struct dn_queue *_q, int safe) { struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); struct qfq_class *cl = (struct qfq_class *)_q; if (cl->inv_w) { q->wsum -= ONE_FP/cl->inv_w; cl->inv_w = 0; /* reset weight to avoid run twice */ } return 0; } /* Calculate a mask to mimic what would be ffs_from(). */ static inline unsigned long mask_from(unsigned long bitmap, int from) { return bitmap & ~((1UL << from) - 1); } /* * The state computation relies on ER=0, IR=1, EB=2, IB=3 * First compute eligibility comparing grp->S, q->V, * then check if someone is blocking us and possibly add EB */ static inline unsigned int qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp) { /* if S > V we are not eligible */ unsigned int state = qfq_gt(grp->S, q->V); unsigned long mask = mask_from(q->bitmaps[ER], grp->index); struct qfq_group *next; if (mask) { next = qfq_ffs(q, mask); if (qfq_gt(grp->F, next->F)) state |= EB; } return state; } /* * In principle * q->bitmaps[dst] |= q->bitmaps[src] & mask; * q->bitmaps[src] &= ~mask; * but we should make sure that src != dst */ static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst) { q->bitmaps[dst] |= q->bitmaps[src] & mask; q->bitmaps[src] &= ~mask; } static inline void qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish) { unsigned long mask = mask_from(q->bitmaps[ER], index + 1); struct qfq_group *next; if (mask) { next = qfq_ffs(q, mask); if (!qfq_gt(next->F, old_finish)) return; } mask = (1UL << index) - 1; qfq_move_groups(q, mask, EB, ER); qfq_move_groups(q, mask, IB, IR); } /* * perhaps * old_V ^= q->V; old_V >>= QFQ_MIN_SLOT_SHIFT; if (old_V) { ... } * */ static inline void qfq_make_eligible(struct qfq_sched *q, uint64_t old_V) { unsigned long mask, vslot, old_vslot; vslot = q->V >> QFQ_MIN_SLOT_SHIFT; old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; if (vslot != old_vslot) { mask = (2UL << (__fls(vslot ^ old_vslot))) - 1; qfq_move_groups(q, mask, IR, ER); qfq_move_groups(q, mask, IB, EB); } } /* * XXX we should make sure that slot becomes less than 32. * This is guaranteed by the input values. * roundedS is always cl->S rounded on grp->slot_shift bits. */ static inline void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS) { uint64_t slot = (roundedS - grp->S) >> grp->slot_shift; unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS; cl->next = grp->slots[i]; grp->slots[i] = cl; __set_bit(slot, &grp->full_slots); } /* * remove the entry from the slot */ static inline void qfq_front_slot_remove(struct qfq_group *grp) { struct qfq_class **h = &grp->slots[grp->front]; *h = (*h)->next; if (!*h) __clear_bit(0, &grp->full_slots); } /* * Returns the first full queue in a group. As a side effect, * adjust the bucket list so the first non-empty bucket is at * position 0 in full_slots. */ static inline struct qfq_class * qfq_slot_scan(struct qfq_group *grp) { int i; ND("grp %d full %x", grp->index, grp->full_slots); if (!grp->full_slots) return NULL; i = ffs(grp->full_slots) - 1; // zero-based if (i > 0) { grp->front = (grp->front + i) % QFQ_MAX_SLOTS; grp->full_slots >>= i; } return grp->slots[grp->front]; } /* * adjust the bucket list. When the start time of a group decreases, * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to * move the objects. The mask of occupied slots must be shifted * because we use ffs() to find the first non-empty slot. * This covers decreases in the group's start time, but what about * increases of the start time ? * Here too we should make sure that i is less than 32 */ static inline void qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS) { unsigned int i = (grp->S - roundedS) >> grp->slot_shift; grp->full_slots <<= i; grp->front = (grp->front - i) % QFQ_MAX_SLOTS; } static inline void qfq_update_eligible(struct qfq_sched *q, uint64_t old_V) { bitmap ineligible; ineligible = q->bitmaps[IR] | q->bitmaps[IB]; if (ineligible) { if (!q->bitmaps[ER]) { struct qfq_group *grp; grp = qfq_ffs(q, ineligible); if (qfq_gt(grp->S, q->V)) q->V = grp->S; } qfq_make_eligible(q, old_V); } } /* * Updates the class, returns true if also the group needs to be updated. */ static inline int qfq_update_class(struct qfq_sched *q, struct qfq_group *grp, struct qfq_class *cl) { cl->S = cl->F; if (cl->_q.mq.head == NULL) { qfq_front_slot_remove(grp); } else { unsigned int len; uint64_t roundedS; len = cl->_q.mq.head->m_pkthdr.len; cl->F = cl->S + (uint64_t)len * cl->inv_w; roundedS = qfq_round_down(cl->S, grp->slot_shift); if (roundedS == grp->S) return 0; qfq_front_slot_remove(grp); qfq_slot_insert(grp, cl, roundedS); } return 1; } static struct mbuf * qfq_dequeue(struct dn_sch_inst *si) { struct qfq_sched *q = (struct qfq_sched *)(si + 1); struct qfq_group *grp; struct qfq_class *cl; struct mbuf *m; uint64_t old_V; NO(q->loops++;) if (!q->bitmaps[ER]) { NO(if (q->queued) dump_sched(q, "start dequeue");) return NULL; } grp = qfq_ffs(q, q->bitmaps[ER]); cl = grp->slots[grp->front]; /* extract from the first bucket in the bucket list */ m = dn_dequeue(&cl->_q); if (!m) { D("BUG/* non-workconserving leaf */"); return NULL; } NO(q->queued--;) old_V = q->V; q->V += (uint64_t)m->m_pkthdr.len * IWSUM; ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V); if (qfq_update_class(q, grp, cl)) { uint64_t old_F = grp->F; cl = qfq_slot_scan(grp); if (!cl) { /* group gone, remove from ER */ __clear_bit(grp->index, &q->bitmaps[ER]); // grp->S = grp->F + 1; // XXX debugging only } else { uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift); unsigned int s; if (grp->S == roundedS) goto skip_unblock; grp->S = roundedS; grp->F = roundedS + (2ULL << grp->slot_shift); /* remove from ER and put in the new set */ __clear_bit(grp->index, &q->bitmaps[ER]); s = qfq_calc_state(q, grp); __set_bit(grp->index, &q->bitmaps[s]); } /* we need to unblock even if the group has gone away */ qfq_unblock_groups(q, grp->index, old_F); } skip_unblock: qfq_update_eligible(q, old_V); NO(if (!q->bitmaps[ER] && q->queued) dump_sched(q, "end dequeue");) return m; } /* * Assign a reasonable start time for a new flow k in group i. * Admissible values for \hat(F) are multiples of \sigma_i * no greater than V+\sigma_i . Larger values mean that * we had a wraparound so we consider the timestamp to be stale. * * If F is not stale and F >= V then we set S = F. * Otherwise we should assign S = V, but this may violate * the ordering in ER. So, if we have groups in ER, set S to * the F_j of the first group j which would be blocking us. * We are guaranteed not to move S backward because * otherwise our group i would still be blocked. */ static inline void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) { unsigned long mask; uint64_t limit, roundedF; int slot_shift = cl->grp->slot_shift; roundedF = qfq_round_down(cl->F, slot_shift); limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift); if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { /* timestamp was stale */ mask = mask_from(q->bitmaps[ER], cl->grp->index); if (mask) { struct qfq_group *next = qfq_ffs(q, mask); if (qfq_gt(roundedF, next->F)) { cl->S = next->F; return; } } cl->S = q->V; } else { /* timestamp is not stale */ cl->S = cl->F; } } static int qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m) { struct qfq_sched *q = (struct qfq_sched *)(si + 1); struct qfq_group *grp; struct qfq_class *cl = (struct qfq_class *)_q; uint64_t roundedS; int s; NO(q->loops++;) DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len, _q, cl->inv_w, cl->grp->index); /* XXX verify that the packet obeys the parameters */ if (m != _q->mq.head) { if (dn_enqueue(_q, m, 0)) /* packet was dropped */ return 1; NO(q->queued++;) if (m != _q->mq.head) return 0; } /* If reach this point, queue q was idle */ grp = cl->grp; qfq_update_start(q, cl); /* adjust start time */ /* compute new finish time and rounded start. */ cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w; roundedS = qfq_round_down(cl->S, grp->slot_shift); /* * insert cl in the correct bucket. * If cl->S >= grp->S we don't need to adjust the * bucket list and simply go to the insertion phase. * Otherwise grp->S is decreasing, we must make room * in the bucket list, and also recompute the group state. * Finally, if there were no flows in this group and nobody * was in ER make sure to adjust V. */ if (grp->full_slots) { if (!qfq_gt(grp->S, cl->S)) goto skip_update; /* create a slot for this cl->S */ qfq_slot_rotate(q, grp, roundedS); /* group was surely ineligible, remove */ __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[IB]); } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V)) q->V = roundedS; grp->S = roundedS; grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i s = qfq_calc_state(q, grp); __set_bit(grp->index, &q->bitmaps[s]); ND("new state %d 0x%x", s, q->bitmaps[s]); ND("S %llx F %llx V %llx", cl->S, cl->F, q->V); skip_update: qfq_slot_insert(grp, cl, roundedS); return 0; } #if 0 static inline void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, struct qfq_class *cl, struct qfq_class **pprev) { unsigned int i, offset; uint64_t roundedS; roundedS = qfq_round_down(cl->S, grp->slot_shift); offset = (roundedS - grp->S) >> grp->slot_shift; i = (grp->front + offset) % QFQ_MAX_SLOTS; #ifdef notyet if (!pprev) { pprev = &grp->slots[i]; while (*pprev && *pprev != cl) pprev = &(*pprev)->next; } #endif *pprev = cl->next; if (!grp->slots[i]) __clear_bit(offset, &grp->full_slots); } /* * called to forcibly destroy a queue. * If the queue is not in the front bucket, or if it has * other queues in the front bucket, we can simply remove * the queue with no other side effects. * Otherwise we must propagate the event up. * XXX description to be completed. */ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl, struct qfq_class **pprev) { struct qfq_group *grp = &q->groups[cl->index]; unsigned long mask; uint64_t roundedS; int s; cl->F = cl->S; // not needed if the class goes away. qfq_slot_remove(q, grp, cl, pprev); if (!grp->full_slots) { /* nothing left in the group, remove from all sets. * Do ER last because if we were blocking other groups * we must unblock them. */ __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[EB]); __clear_bit(grp->index, &q->bitmaps[IB]); if (test_bit(grp->index, &q->bitmaps[ER]) && !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) { mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); if (mask) mask = ~((1UL << __fls(mask)) - 1); else mask = ~0UL; qfq_move_groups(q, mask, EB, ER); qfq_move_groups(q, mask, IB, IR); } __clear_bit(grp->index, &q->bitmaps[ER]); } else if (!grp->slots[grp->front]) { cl = qfq_slot_scan(grp); roundedS = qfq_round_down(cl->S, grp->slot_shift); if (grp->S != roundedS) { __clear_bit(grp->index, &q->bitmaps[ER]); __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[EB]); __clear_bit(grp->index, &q->bitmaps[IB]); grp->S = roundedS; grp->F = roundedS + (2ULL << grp->slot_shift); s = qfq_calc_state(q, grp); __set_bit(grp->index, &q->bitmaps[s]); } } qfq_update_eligible(q, q->V); } #endif static int qfq_new_fsk(struct dn_fsk *f) { ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight"); ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen"); ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]); return 0; } /* * initialize a new scheduler instance */ static int qfq_new_sched(struct dn_sch_inst *si) { struct qfq_sched *q = (struct qfq_sched *)(si + 1); struct qfq_group *grp; int i; for (i = 0; i <= QFQ_MAX_INDEX; i++) { grp = &q->groups[i]; grp->index = i; grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS - (QFQ_MAX_INDEX - i); } return 0; } /* * QFQ scheduler descriptor */ static struct dn_alg qfq_desc = { _SI( .type = ) DN_SCHED_QFQ, _SI( .name = ) "QFQ", _SI( .flags = ) DN_MULTIQUEUE, _SI( .schk_datalen = ) 0, _SI( .si_datalen = ) sizeof(struct qfq_sched), _SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue), _SI( .enqueue = ) qfq_enqueue, _SI( .dequeue = ) qfq_dequeue, _SI( .config = ) NULL, _SI( .destroy = ) NULL, _SI( .new_sched = ) qfq_new_sched, _SI( .free_sched = ) NULL, _SI( .new_fsk = ) qfq_new_fsk, _SI( .free_fsk = ) NULL, _SI( .new_queue = ) qfq_new_queue, _SI( .free_queue = ) qfq_free_queue, }; DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc); #ifdef QFQ_DEBUG static void dump_groups(struct qfq_sched *q, uint32_t mask) { int i, j; for (i = 0; i < QFQ_MAX_INDEX + 1; i++) { struct qfq_group *g = &q->groups[i]; if (0 == (mask & (1<slots[j]) D(" bucket %d %p", j, g->slots[j]); } D("full_slots 0x%x", g->full_slots); D(" %2d S 0x%20llx F 0x%llx %c", i, g->S, g->F, mask & (1<loops, q->queued, q->V); D(" ER 0x%08x", q->bitmaps[ER]); D(" EB 0x%08x", q->bitmaps[EB]); D(" IR 0x%08x", q->bitmaps[IR]); D(" IB 0x%08x", q->bitmaps[IB]); dump_groups(q, 0xffffffff); }; #endif /* QFQ_DEBUG */ ================================================ FILE: sys/netinet/ipfw/dn_sched_rr.c ================================================ /* * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id: dn_sched_rr.c 11480 2012-07-31 08:02:00Z luigi $ */ #ifdef _KERNEL #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include /* ipfw_rule_ref */ #include /* flow_id */ #include #include #include #include #else #include #endif #define DN_SCHED_RR 3 // XXX Where? struct rr_queue { struct dn_queue q; /* Standard queue */ int status; /* 1: queue is in the list */ int credit; /* Number of bytes to transmit */ int quantum; /* quantum * C */ struct rr_queue *qnext; /* */ }; /* struct rr_schk contains global config parameters * and is right after dn_schk */ struct rr_schk { int min_q; /* Min quantum */ int max_q; /* Max quantum */ int q_bytes; /* Bytes per quantum */ }; /* per-instance round robin list, right after dn_sch_inst */ struct rr_si { struct rr_queue *head, *tail; /* Pointer to current queue */ }; /* Append a queue to the rr list */ static inline void rr_append(struct rr_queue *q, struct rr_si *si) { q->status = 1; /* mark as in-rr_list */ q->credit = q->quantum; /* initialize credit */ /* append to the tail */ if (si->head == NULL) si->head = q; else si->tail->qnext = q; si->tail = q; /* advance the tail pointer */ q->qnext = si->head; /* make it circular */ } /* Remove the head queue from circular list. */ static inline void rr_remove_head(struct rr_si *si) { if (si->head == NULL) return; /* empty queue */ si->head->status = 0; if (si->head == si->tail) { si->head = si->tail = NULL; return; } si->head = si->head->qnext; si->tail->qnext = si->head; } /* Remove a queue from circular list. * XXX see if ti can be merge with remove_queue() */ static inline void remove_queue_q(struct rr_queue *q, struct rr_si *si) { struct rr_queue *prev; if (q->status != 1) return; if (q == si->head) { rr_remove_head(si); return; } for (prev = si->head; prev; prev = prev->qnext) { if (prev->qnext != q) continue; prev->qnext = q->qnext; if (q == si->tail) si->tail = prev; q->status = 0; break; } } static inline void next_pointer(struct rr_si *si) { if (si->head == NULL) return; /* empty queue */ si->head = si->head->qnext; si->tail = si->tail->qnext; } static int rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) { struct rr_si *si; struct rr_queue *rrq; if (m != q->mq.head) { if (dn_enqueue(q, m, 0)) /* packet was dropped */ return 1; if (m != q->mq.head) return 0; } /* If reach this point, queue q was idle */ si = (struct rr_si *)(_si + 1); rrq = (struct rr_queue *)q; if (rrq->status == 1) /* Queue is already in the queue list */ return 0; /* Insert the queue in the queue list */ rr_append(rrq, si); return 0; } static struct mbuf * rr_dequeue(struct dn_sch_inst *_si) { /* Access scheduler instance private data */ struct rr_si *si = (struct rr_si *)(_si + 1); struct rr_queue *rrq; uint64_t len; while ( (rrq = si->head) ) { struct mbuf *m = rrq->q.mq.head; if ( m == NULL) { /* empty queue, remove from list */ rr_remove_head(si); continue; } len = m->m_pkthdr.len; if (len > rrq->credit) { /* Packet too big */ rrq->credit += rrq->quantum; /* Try next queue */ next_pointer(si); } else { rrq->credit -= len; return dn_dequeue(&rrq->q); } } /* no packet to dequeue*/ return NULL; } static int rr_config(struct dn_schk *_schk) { struct rr_schk *schk = (struct rr_schk *)(_schk + 1); ND("called"); /* use reasonable quantums (64..2k bytes, default 1500) */ schk->min_q = 64; schk->max_q = 2048; schk->q_bytes = 1500; /* quantum */ return 0; } static int rr_new_sched(struct dn_sch_inst *_si) { struct rr_si *si = (struct rr_si *)(_si + 1); ND("called"); si->head = si->tail = NULL; return 0; } static int rr_free_sched(struct dn_sch_inst *_si) { ND("called"); /* Nothing to do? */ return 0; } static int rr_new_fsk(struct dn_fsk *fs) { struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1); /* par[0] is the weight, par[1] is the quantum step */ ipdn_bound_var(&fs->fs.par[0], 1, 1, 65536, "RR weight"); ipdn_bound_var(&fs->fs.par[1], schk->q_bytes, schk->min_q, schk->max_q, "RR quantum"); return 0; } static int rr_new_queue(struct dn_queue *_q) { struct rr_queue *q = (struct rr_queue *)_q; _q->ni.oid.subtype = DN_SCHED_RR; q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1]; ND("called, q->quantum %d", q->quantum); q->credit = q->quantum; q->status = 0; if (_q->mq.head != NULL) { /* Queue NOT empty, insert in the queue list */ rr_append(q, (struct rr_si *)(_q->_si + 1)); } return 0; } static int rr_free_queue(struct dn_queue *_q, int safe) { struct rr_queue *q = (struct rr_queue *)_q; ND("called"); if (safe) /* Delete only if status == 0 */ return q->status; if (q->status == 1) { struct rr_si *si = (struct rr_si *)(_q->_si + 1); remove_queue_q(q, si); } return 0; } /* * RR scheduler descriptor * contains the type of the scheduler, the name, the size of the * structures and function pointers. */ static struct dn_alg rr_desc = { _SI( .type = ) DN_SCHED_RR, _SI( .name = ) "RR", _SI( .flags = ) DN_MULTIQUEUE, _SI( .schk_datalen = ) 0, _SI( .si_datalen = ) sizeof(struct rr_si), _SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue), _SI( .enqueue = ) rr_enqueue, _SI( .dequeue = ) rr_dequeue, _SI( .config = ) rr_config, _SI( .destroy = ) NULL, _SI( .new_sched = ) rr_new_sched, _SI( .free_sched = ) rr_free_sched, _SI( .new_fsk = ) rr_new_fsk, _SI( .free_fsk = ) NULL, _SI( .new_queue = ) rr_new_queue, _SI( .free_queue = ) rr_free_queue, }; DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc); ================================================ FILE: sys/netinet/ipfw/dn_sched_wf2q.c ================================================ /* * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id: dn_sched_wf2q.c 11480 2012-07-31 08:02:00Z luigi $ */ #ifdef _KERNEL #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include /* ipfw_rule_ref */ #include /* flow_id */ #include #include #include #include #else #include #endif #ifndef MAX64 #define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x) #endif /* * timestamps are computed on 64 bit using fixed point arithmetic. * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len * and sum of weights, respectively. FRAC_BITS is the number of * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w * using an unsigned 32-bit division, and to avoid wraparounds we need * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64 * As an example * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19 */ #ifndef FRAC_BITS #define FRAC_BITS 28 /* shift for fixed point arithmetic */ #define ONE_FP (1UL << FRAC_BITS) #endif /* * Private information for the scheduler instance: * sch_heap (key is Finish time) returns the next queue to serve * ne_heap (key is Start time) stores not-eligible queues * idle_heap (key=start/finish time) stores idle flows. It must * support extract-from-middle. * A flow is only in 1 of the three heaps. * XXX todo: use a more efficient data structure, e.g. a tree sorted * by F with min_subtree(S) in each node */ struct wf2qp_si { struct dn_heap sch_heap; /* top extract - key Finish time */ struct dn_heap ne_heap; /* top extract - key Start time */ struct dn_heap idle_heap; /* random extract - key Start=Finish time */ uint64_t V; /* virtual time */ uint32_t inv_wsum; /* inverse of sum of weights */ uint32_t wsum; /* sum of weights */ }; struct wf2qp_queue { struct dn_queue _q; uint64_t S, F; /* start time, finish time */ uint32_t inv_w; /* ONE_FP / weight */ int32_t heap_pos; /* position (index) of struct in heap */ }; /* * This file implements a WF2Q+ scheduler as it has been in dummynet * since 2000. * The scheduler supports per-flow queues and has O(log N) complexity. * * WF2Q+ needs to drain entries from the idle heap so that we * can keep the sum of weights up to date. We can do it whenever * we get a chance, or periodically, or following some other * strategy. The function idle_check() drains at most N elements * from the idle heap. */ static void idle_check(struct wf2qp_si *si, int n, int force) { struct dn_heap *h = &si->idle_heap; while (n-- > 0 && h->elements > 0 && (force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) { struct dn_queue *q = HEAP_TOP(h)->object; struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; heap_extract(h, NULL); /* XXX to let the flowset delete the queue we should * mark it as 'unused' by the scheduler. */ alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */ si->wsum -= q->fs->fs.par[0]; /* adjust sum of weights */ if (si->wsum > 0) si->inv_wsum = ONE_FP/si->wsum; } } static int wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) { struct dn_fsk *fs = q->fs; struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); struct wf2qp_queue *alg_fq; uint64_t len = m->m_pkthdr.len; if (m != q->mq.head) { if (dn_enqueue(q, m, 0)) /* packet was dropped */ return 1; if (m != q->mq.head) /* queue was already busy */ return 0; } /* If reach this point, queue q was idle */ alg_fq = (struct wf2qp_queue *)q; if (DN_KEY_LT(alg_fq->F, alg_fq->S)) { /* Fbrand new queue. */ alg_fq->S = si->V; /* init start time */ si->wsum += fs->fs.par[0]; /* add weight of new queue. */ si->inv_wsum = ONE_FP/si->wsum; } else { /* if it was idle then it was in the idle heap */ heap_extract(&si->idle_heap, q); alg_fq->S = MAX64(alg_fq->F, si->V); /* compute new S */ } alg_fq->F = alg_fq->S + len * alg_fq->inv_w; /* if nothing is backlogged, make sure this flow is eligible */ if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0) si->V = MAX64(alg_fq->S, si->V); /* * Look at eligibility. A flow is not eligibile if S>V (when * this happens, it means that there is some other flow already * scheduled for the same pipe, so the sch_heap cannot be * empty). If the flow is not eligible we just store it in the * ne_heap. Otherwise, we store in the sch_heap. * Note that for all flows in sch_heap (SCH), S_i <= V, * and for all flows in ne_heap (NEH), S_i > V. * So when we need to compute max(V, min(S_i)) forall i in * SCH+NEH, we only need to look into NEH. */ if (DN_KEY_LT(si->V, alg_fq->S)) { /* S>V means flow Not eligible. */ if (si->sch_heap.elements == 0) D("++ ouch! not eligible but empty scheduler!"); heap_insert(&si->ne_heap, alg_fq->S, q); } else { heap_insert(&si->sch_heap, alg_fq->F, q); } return 0; } /* XXX invariant: sch > 0 || V >= min(S in neh) */ static struct mbuf * wf2qp_dequeue(struct dn_sch_inst *_si) { /* Access scheduler instance private data */ struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); struct mbuf *m; struct dn_queue *q; struct dn_heap *sch = &si->sch_heap; struct dn_heap *neh = &si->ne_heap; struct wf2qp_queue *alg_fq; if (sch->elements == 0 && neh->elements == 0) { /* we have nothing to do. We could kill the idle heap * altogether and reset V */ idle_check(si, 0x7fffffff, 1); si->V = 0; si->wsum = 0; /* should be set already */ return NULL; /* quick return if nothing to do */ } idle_check(si, 1, 0); /* drain something from the idle heap */ /* make sure at least one element is eligible, bumping V * and moving entries that have become eligible. * We need to repeat the first part twice, before and * after extracting the candidate, or enqueue() will * find the data structure in a wrong state. */ m = NULL; for(;;) { /* * Compute V = max(V, min(S_i)). Remember that all elements * in sch have by definition S_i <= V so if sch is not empty, * V is surely the max and we must not update it. Conversely, * if sch is empty we only need to look at neh. * We don't need to move the queues, as it will be done at the * next enqueue */ if (sch->elements == 0 && neh->elements > 0) { si->V = MAX64(si->V, HEAP_TOP(neh)->key); } while (neh->elements > 0 && DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) { q = HEAP_TOP(neh)->object; alg_fq = (struct wf2qp_queue *)q; heap_extract(neh, NULL); heap_insert(sch, alg_fq->F, q); } if (m) /* pkt found in previous iteration */ break; /* ok we have at least one eligible pkt */ q = HEAP_TOP(sch)->object; alg_fq = (struct wf2qp_queue *)q; m = dn_dequeue(q); heap_extract(sch, NULL); /* Remove queue from heap. */ si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum; alg_fq->S = alg_fq->F; /* Update start time. */ if (q->mq.head == 0) { /* not backlogged any more. */ heap_insert(&si->idle_heap, alg_fq->F, q); } else { /* Still backlogged. */ /* Update F, store in neh or sch */ uint64_t len = q->mq.head->m_pkthdr.len; alg_fq->F += len * alg_fq->inv_w; if (DN_KEY_LEQ(alg_fq->S, si->V)) { heap_insert(sch, alg_fq->F, q); } else { heap_insert(neh, alg_fq->S, q); } } } return m; } static int wf2qp_new_sched(struct dn_sch_inst *_si) { struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); int ofs = offsetof(struct wf2qp_queue, heap_pos); /* all heaps support extract from middle */ if (heap_init(&si->idle_heap, 16, ofs) || heap_init(&si->sch_heap, 16, ofs) || heap_init(&si->ne_heap, 16, ofs)) { heap_free(&si->ne_heap); heap_free(&si->sch_heap); heap_free(&si->idle_heap); return ENOMEM; } return 0; } static int wf2qp_free_sched(struct dn_sch_inst *_si) { struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); heap_free(&si->sch_heap); heap_free(&si->ne_heap); heap_free(&si->idle_heap); return 0; } static int wf2qp_new_fsk(struct dn_fsk *fs) { ipdn_bound_var(&fs->fs.par[0], 1, 1, 100, "WF2Q+ weight"); return 0; } static int wf2qp_new_queue(struct dn_queue *_q) { struct wf2qp_queue *q = (struct wf2qp_queue *)_q; _q->ni.oid.subtype = DN_SCHED_WF2QP; q->F = 0; /* not strictly necessary */ q->S = q->F + 1; /* mark timestamp as invalid. */ q->inv_w = ONE_FP / _q->fs->fs.par[0]; if (_q->mq.head != NULL) { wf2qp_enqueue(_q->_si, _q, _q->mq.head); } return 0; } /* * Called when the infrastructure removes a queue (e.g. flowset * is reconfigured). Nothing to do if we did not 'own' the queue, * otherwise remove it from the right heap and adjust the sum * of weights. */ static int wf2qp_free_queue(struct dn_queue *q, int safe) { struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1); if (alg_fq->S >= alg_fq->F + 1) return 0; /* nothing to do, not in any heap */ /* queue is in a scheduler heap */ if (safe) /* do not delete in safe mode */ return 1; si->wsum -= q->fs->fs.par[0]; if (si->wsum > 0) si->inv_wsum = ONE_FP/si->wsum; /* extract from the heap. XXX TODO we may need to adjust V * to make sure the invariants hold. */ if (q->mq.head == NULL) { heap_extract(&si->idle_heap, q); } else if (DN_KEY_LT(si->V, alg_fq->S)) { heap_extract(&si->ne_heap, q); } else { heap_extract(&si->sch_heap, q); } return 0; } /* * WF2Q+ scheduler descriptor * contains the type of the scheduler, the name, the size of the * structures and function pointers. */ static struct dn_alg wf2qp_desc = { _SI( .type = ) DN_SCHED_WF2QP, _SI( .name = ) "WF2Q+", _SI( .flags = ) DN_MULTIQUEUE, /* we need extra space in the si and the queue */ _SI( .schk_datalen = ) 0, _SI( .si_datalen = ) sizeof(struct wf2qp_si), _SI( .q_datalen = ) sizeof(struct wf2qp_queue) - sizeof(struct dn_queue), _SI( .enqueue = ) wf2qp_enqueue, _SI( .dequeue = ) wf2qp_dequeue, _SI( .config = ) NULL, _SI( .destroy = ) NULL, _SI( .new_sched = ) wf2qp_new_sched, _SI( .free_sched = ) wf2qp_free_sched, _SI( .new_fsk = ) wf2qp_new_fsk, _SI( .free_fsk = ) NULL, _SI( .new_queue = ) wf2qp_new_queue, _SI( .free_queue = ) wf2qp_free_queue, }; DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc); ================================================ FILE: sys/netinet/ipfw/ip_dn_glue.c ================================================ /*- * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id: ip_dn_glue.c 12500 2013-12-11 23:07:58Z luigi $ * * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8 */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ #include #include /* ip_output(), IP_FORWARDING */ #include #include #include #include #include #include /* FREEBSD7.2 ip_dummynet.h r191715*/ struct dn_heap_entry7 { int64_t key; /* sorting key. Topmost element is smallest one */ void *object; /* object pointer */ }; struct dn_heap7 { int size; int elements; int offset; /* XXX if > 0 this is the offset of direct ptr to obj */ struct dn_heap_entry7 *p; /* really an array of "size" entries */ }; /* Common to 7.2 and 8 */ struct dn_flow_set { SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */ u_short fs_nr ; /* flow_set number */ u_short flags_fs; #define DNOLD_HAVE_FLOW_MASK 0x0001 #define DNOLD_IS_RED 0x0002 #define DNOLD_IS_GENTLE_RED 0x0004 #define DNOLD_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */ #define DNOLD_NOERROR 0x0010 /* do not report ENOBUFS on drops */ #define DNOLD_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */ #define DNOLD_IS_PIPE 0x4000 #define DNOLD_IS_QUEUE 0x8000 struct dn_pipe7 *pipe ; /* pointer to parent pipe */ u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */ int weight ; /* WFQ queue weight */ int qsize ; /* queue size in slots or bytes */ int plr ; /* pkt loss rate (2^31-1 means 100%) */ struct ipfw_flow_id flow_mask ; /* hash table of queues onto this flow_set */ int rq_size ; /* number of slots */ int rq_elements ; /* active elements */ struct dn_flow_queue7 **rq; /* array of rq_size entries */ u_int32_t last_expired ; /* do not expire too frequently */ int backlogged ; /* #active queues for this flowset */ /* RED parameters */ #define SCALE_RED 16 #define SCALE(x) ( (x) << SCALE_RED ) #define SCALE_VAL(x) ( (x) >> SCALE_RED ) #define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) int w_q ; /* queue weight (scaled) */ int max_th ; /* maximum threshold for queue (scaled) */ int min_th ; /* minimum threshold for queue (scaled) */ int max_p ; /* maximum value for p_b (scaled) */ u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ u_int lookup_depth ; /* depth of lookup table */ int lookup_step ; /* granularity inside the lookup table */ int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ int avg_pkt_size ; /* medium packet size */ int max_pkt_size ; /* max packet size */ }; SLIST_HEAD(dn_flow_set_head, dn_flow_set); #define DN_IS_PIPE 0x4000 #define DN_IS_QUEUE 0x8000 struct dn_flow_queue7 { struct dn_flow_queue7 *next ; struct ipfw_flow_id id ; struct mbuf *head, *tail ; /* queue of packets */ u_int len ; u_int len_bytes ; u_long numbytes; u_int64_t tot_pkts ; /* statistics counters */ u_int64_t tot_bytes ; u_int32_t drops ; int hash_slot ; /* debugging/diagnostic */ /* RED parameters */ int avg ; /* average queue length est. (scaled) */ int count ; /* arrivals since last RED drop */ int random ; /* random value (scaled) */ u_int32_t q_time; /* start of queue idle time */ /* WF2Q+ support */ struct dn_flow_set *fs ; /* parent flow set */ int heap_pos ; /* position (index) of struct in heap */ int64_t sched_time ; /* current time when queue enters ready_heap */ int64_t S,F ; /* start time, finish time */ }; struct dn_pipe7 { /* a pipe */ SLIST_ENTRY(dn_pipe7) next; /* linked list in a hash slot */ int pipe_nr ; /* number */ int bandwidth; /* really, bytes/tick. */ int delay ; /* really, ticks */ struct mbuf *head, *tail ; /* packets in delay line */ /* WF2Q+ */ struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ int64_t V ; /* virtual time */ int sum; /* sum of weights of all active sessions */ int numbytes; int64_t sched_time ; /* time pipe was scheduled in ready_heap */ /* * When the tx clock come from an interface (if_name[0] != '\0'), its name * is stored below, whereas the ifp is filled when the rule is configured. */ char if_name[IFNAMSIZ]; struct ifnet *ifp ; int ready ; /* set if ifp != NULL and we got a signal from it */ struct dn_flow_set fs ; /* used with fixed-rate flows */ }; SLIST_HEAD(dn_pipe_head7, dn_pipe7); /* FREEBSD8 ip_dummynet.h r196045 */ struct dn_flow_queue8 { struct dn_flow_queue8 *next ; struct ipfw_flow_id id ; struct mbuf *head, *tail ; /* queue of packets */ u_int len ; u_int len_bytes ; uint64_t numbytes ; /* credit for transmission (dynamic queues) */ int64_t extra_bits; /* extra bits simulating unavailable channel */ u_int64_t tot_pkts ; /* statistics counters */ u_int64_t tot_bytes ; u_int32_t drops ; int hash_slot ; /* debugging/diagnostic */ /* RED parameters */ int avg ; /* average queue length est. (scaled) */ int count ; /* arrivals since last RED drop */ int random ; /* random value (scaled) */ int64_t idle_time; /* start of queue idle time */ /* WF2Q+ support */ struct dn_flow_set *fs ; /* parent flow set */ int heap_pos ; /* position (index) of struct in heap */ int64_t sched_time ; /* current time when queue enters ready_heap */ int64_t S,F ; /* start time, finish time */ }; struct dn_pipe8 { /* a pipe */ SLIST_ENTRY(dn_pipe8) next; /* linked list in a hash slot */ int pipe_nr ; /* number */ int bandwidth; /* really, bytes/tick. */ int delay ; /* really, ticks */ struct mbuf *head, *tail ; /* packets in delay line */ /* WF2Q+ */ struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ int64_t V ; /* virtual time */ int sum; /* sum of weights of all active sessions */ /* Same as in dn_flow_queue, numbytes can become large */ int64_t numbytes; /* bits I can transmit (more or less). */ uint64_t burst; /* burst size, scaled: bits * hz */ int64_t sched_time ; /* time pipe was scheduled in ready_heap */ int64_t idle_time; /* start of pipe idle time */ char if_name[IFNAMSIZ]; struct ifnet *ifp ; int ready ; /* set if ifp != NULL and we got a signal from it */ struct dn_flow_set fs ; /* used with fixed-rate flows */ /* fields to simulate a delay profile */ #define ED_MAX_NAME_LEN 32 char name[ED_MAX_NAME_LEN]; int loss_level; int samples_no; int *samples; }; #define ED_MAX_SAMPLES_NO 1024 struct dn_pipe_max8 { struct dn_pipe8 pipe; int samples[ED_MAX_SAMPLES_NO]; }; SLIST_HEAD(dn_pipe_head8, dn_pipe8); /* * Changes from 7.2 to 8: * dn_pipe: * numbytes from int to int64_t * add burst (int64_t) * add idle_time (int64_t) * add profile * add struct dn_pipe_max * add flag DN_HAS_PROFILE * * dn_flow_queue * numbytes from u_long to int64_t * add extra_bits (int64_t) * q_time from u_int32_t to int64_t and name idle_time * * dn_flow_set unchanged * */ /* NOTE:XXX copied from dummynet.c */ #define O_NEXT(p, len) ((void *)((char *)p + len)) static void oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) { oid->len = len; oid->type = type; oid->subtype = 0; oid->id = id; } /* make room in the buffer and move the pointer forward */ static void * o_next(struct dn_id **o, int len, int type) { struct dn_id *ret = *o; oid_fill(ret, len, type, 0); *o = O_NEXT(*o, len); return ret; } static size_t pipesize7 = sizeof(struct dn_pipe7); static size_t pipesize8 = sizeof(struct dn_pipe8); static size_t pipesizemax8 = sizeof(struct dn_pipe_max8); /* Indicate 'ipfw' version * 1: from FreeBSD 7.2 * 0: from FreeBSD 8 * -1: unknown (for now is unused) * * It is update when a IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request arrives * NOTE: if a IP_DUMMYNET_GET arrives and the 'ipfw' version is unknown, * it is suppose to be the FreeBSD 8 version. */ static int is7 = 0; static int convertflags2new(int src) { int dst = 0; if (src & DNOLD_HAVE_FLOW_MASK) dst |= DN_HAVE_MASK; if (src & DNOLD_QSIZE_IS_BYTES) dst |= DN_QSIZE_BYTES; if (src & DNOLD_NOERROR) dst |= DN_NOERROR; if (src & DNOLD_IS_RED) dst |= DN_IS_RED; if (src & DNOLD_IS_GENTLE_RED) dst |= DN_IS_GENTLE_RED; if (src & DNOLD_HAS_PROFILE) dst |= DN_HAS_PROFILE; return dst; } static int convertflags2old(int src) { int dst = 0; if (src & DN_HAVE_MASK) dst |= DNOLD_HAVE_FLOW_MASK; if (src & DN_IS_RED) dst |= DNOLD_IS_RED; if (src & DN_IS_GENTLE_RED) dst |= DNOLD_IS_GENTLE_RED; if (src & DN_NOERROR) dst |= DNOLD_NOERROR; if (src & DN_HAS_PROFILE) dst |= DNOLD_HAS_PROFILE; if (src & DN_QSIZE_BYTES) dst |= DNOLD_QSIZE_IS_BYTES; return dst; } static int dn_compat_del(void *v) { struct dn_pipe7 *p = (struct dn_pipe7 *) v; struct dn_pipe8 *p8 = (struct dn_pipe8 *) v; struct { struct dn_id oid; uintptr_t a[1]; /* add more if we want a list */ } cmd; /* XXX DN_API_VERSION ??? */ oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); if (is7) { if (p->pipe_nr == 0 && p->fs.fs_nr == 0) return EINVAL; if (p->pipe_nr != 0 && p->fs.fs_nr != 0) return EINVAL; } else { if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0) return EINVAL; if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0) return EINVAL; } if (p->pipe_nr != 0) { /* pipe x delete */ cmd.a[0] = p->pipe_nr; cmd.oid.subtype = DN_LINK; } else { /* queue x delete */ cmd.oid.subtype = DN_FS; cmd.a[0] = (is7) ? p->fs.fs_nr : p8->fs.fs_nr; } return do_config(&cmd, cmd.oid.len); } static int dn_compat_config_queue(struct dn_fs *fs, void* v) { struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; struct dn_flow_set *f; if (is7) f = &p7->fs; else f = &p8->fs; fs->fs_nr = f->fs_nr; fs->sched_nr = f->parent_nr; fs->flow_mask = f->flow_mask; fs->buckets = f->rq_size; fs->qsize = f->qsize; fs->plr = f->plr; fs->par[0] = f->weight; fs->flags = convertflags2new(f->flags_fs); if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) { fs->w_q = f->w_q; fs->max_th = f->max_th; fs->min_th = f->min_th; fs->max_p = f->max_p; } return 0; } static int dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p, struct dn_fs *fs, void* v) { struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; int i = p7->pipe_nr; sch->sched_nr = i; sch->oid.subtype = 0; p->link_nr = i; fs->fs_nr = i + 2*DN_MAX_ID; fs->sched_nr = i + DN_MAX_ID; /* Common to 7 and 8 */ p->bandwidth = p7->bandwidth; p->delay = p7->delay; if (!is7) { /* FreeBSD 8 has burst */ p->burst = p8->burst; } /* fill the fifo flowset */ dn_compat_config_queue(fs, v); fs->fs_nr = i + 2*DN_MAX_ID; fs->sched_nr = i + DN_MAX_ID; /* Move scheduler related parameter from fs to sch */ sch->buckets = fs->buckets; /*XXX*/ fs->buckets = 0; if (fs->flags & DN_HAVE_MASK) { sch->flags |= DN_HAVE_MASK; fs->flags &= ~DN_HAVE_MASK; sch->sched_mask = fs->flow_mask; bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id)); } return 0; } static int dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p, void *v) { struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]); pf->link_nr = p->link_nr; pf->loss_level = p8->loss_level; // pf->bandwidth = p->bandwidth; //XXX bandwidth redundant? pf->samples_no = p8->samples_no; strncpy(pf->name, p8->name,sizeof(pf->name)); bcopy(p8->samples, pf->samples, sizeof(pf->samples)); return 0; } /* * If p->pipe_nr != 0 the command is 'pipe x config', so need to create * the three main struct, else only a flowset is created */ static int dn_compat_configure(void *v) { struct dn_id *buf = NULL, *base; struct dn_sch *sch = NULL; struct dn_link *p = NULL; struct dn_fs *fs = NULL; struct dn_profile *pf = NULL; int lmax; int error; struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; int i; /* number of object to configure */ lmax = sizeof(struct dn_id); /* command header */ lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + sizeof(struct dn_fs) + sizeof(struct dn_profile); base = buf = malloc(lmax, M_DUMMYNET, M_WAITOK|M_ZERO); o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); base->id = DN_API_VERSION; /* pipe_nr is the same in p7 and p8 */ i = p7->pipe_nr; if (i != 0) { /* pipe config */ sch = o_next(&buf, sizeof(*sch), DN_SCH); p = o_next(&buf, sizeof(*p), DN_LINK); fs = o_next(&buf, sizeof(*fs), DN_FS); error = dn_compat_config_pipe(sch, p, fs, v); if (error) { free(buf, M_DUMMYNET); return error; } if (!is7 && p8->samples_no > 0) { /* Add profiles*/ pf = o_next(&buf, sizeof(*pf), DN_PROFILE); error = dn_compat_config_profile(pf, p, v); if (error) { free(buf, M_DUMMYNET); return error; } } } else { /* queue config */ fs = o_next(&buf, sizeof(*fs), DN_FS); error = dn_compat_config_queue(fs, v); if (error) { free(buf, M_DUMMYNET); return error; } } error = do_config(base, (char *)buf - (char *)base); if (buf) free(buf, M_DUMMYNET); return error; } int dn_compat_calc_size(void) { int need = 0; /* XXX use FreeBSD 8 struct size */ /* NOTE: * - half scheduler: schk_count/2 * - all flowset: fsk_count * - all flowset queues: queue_count * - all pipe queue: si_count */ need += dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2; need += dn_cfg.fsk_count * sizeof(struct dn_flow_set); need += dn_cfg.si_count * sizeof(struct dn_flow_queue8); need += dn_cfg.queue_count * sizeof(struct dn_flow_queue8); return need; } int dn_c_copy_q (void *_ni, void *arg) { struct copy_args *a = arg; struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start; struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start; struct dn_flow *ni = (struct dn_flow *)_ni; int size = 0; /* XXX hash slot not set */ /* No difference between 7.2/8 */ fq7->len = ni->length; fq7->len_bytes = ni->len_bytes; fq7->id = ni->fid; if (is7) { size = sizeof(struct dn_flow_queue7); fq7->tot_pkts = ni->tot_pkts; fq7->tot_bytes = ni->tot_bytes; fq7->drops = ni->drops; } else { size = sizeof(struct dn_flow_queue8); fq8->tot_pkts = ni->tot_pkts; fq8->tot_bytes = ni->tot_bytes; fq8->drops = ni->drops; } *a->start += size; return 0; } int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq) { struct dn_link *l = &s->link; struct dn_fsk *f = s->fs; struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start; struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start; struct dn_flow_set *fs; int size = 0; if (is7) { fs = &pipe7->fs; size = sizeof(struct dn_pipe7); } else { fs = &pipe8->fs; size = sizeof(struct dn_pipe8); } /* These 4 field are the same in pipe7 and pipe8 */ pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE; pipe7->bandwidth = l->bandwidth; pipe7->delay = l->delay * 1000 / hz; pipe7->pipe_nr = l->link_nr - DN_MAX_ID; if (!is7) { if (s->profile) { struct dn_profile *pf = s->profile; strncpy(pipe8->name, pf->name, sizeof(pf->name)); pipe8->loss_level = pf->loss_level; pipe8->samples_no = pf->samples_no; } pipe8->burst = div64(l->burst , 8 * hz); } fs->flow_mask = s->sch.sched_mask; fs->rq_size = s->sch.buckets ? s->sch.buckets : 1; fs->parent_nr = l->link_nr - DN_MAX_ID; fs->qsize = f->fs.qsize; fs->plr = f->fs.plr; fs->w_q = f->fs.w_q; fs->max_th = f->max_th; fs->min_th = f->min_th; fs->max_p = f->fs.max_p; fs->rq_elements = nq; fs->flags_fs = convertflags2old(f->fs.flags); *a->start += size; return 0; } int dn_compat_copy_pipe(struct copy_args *a, void *_o) { int have = a->end - *a->start; int need = 0; int pipe_size = sizeof(struct dn_pipe8); int queue_size = sizeof(struct dn_flow_queue8); int n_queue = 0; /* number of queues */ struct dn_schk *s = (struct dn_schk *)_o; /* calculate needed space: * - struct dn_pipe * - if there are instances, dn_queue * n_instances */ n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) : (s->siht ? 1 : 0)); need = pipe_size + queue_size * n_queue; if (have < need) { D("have %d < need %d", have, need); return 1; } /* copy pipe */ dn_c_copy_pipe(s, a, n_queue); /* copy queues */ if (s->sch.flags & DN_HAVE_MASK) dn_ht_scan(s->siht, dn_c_copy_q, a); else if (s->siht) dn_c_copy_q(s->siht, a); return 0; } int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq) { struct dn_flow_set *fs = (struct dn_flow_set *)*a->start; fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE; fs->fs_nr = f->fs.fs_nr; fs->qsize = f->fs.qsize; fs->plr = f->fs.plr; fs->w_q = f->fs.w_q; fs->max_th = f->max_th; fs->min_th = f->min_th; fs->max_p = f->fs.max_p; fs->flow_mask = f->fs.flow_mask; fs->rq_elements = nq; fs->rq_size = (f->fs.buckets ? f->fs.buckets : 1); fs->parent_nr = f->fs.sched_nr; fs->weight = f->fs.par[0]; fs->flags_fs = convertflags2old(f->fs.flags); *a->start += sizeof(struct dn_flow_set); return 0; } int dn_compat_copy_queue(struct copy_args *a, void *_o) { int have = a->end - *a->start; int need = 0; int fs_size = sizeof(struct dn_flow_set); int queue_size = sizeof(struct dn_flow_queue8); struct dn_fsk *fs = (struct dn_fsk *)_o; int n_queue = 0; /* number of queues */ n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0)); need = fs_size + queue_size * n_queue; if (have < need) { D("have < need"); return 1; } /* copy flowset */ dn_c_copy_fs(fs, a, n_queue); /* copy queues */ if (fs->fs.flags & DN_HAVE_MASK) dn_ht_scan(fs->qht, dn_c_copy_q, a); else if (fs->qht) dn_c_copy_q(fs->qht, a); return 0; } int copy_data_helper_compat(void *_o, void *_arg) { struct copy_args *a = _arg; if (a->type == DN_COMPAT_PIPE) { struct dn_schk *s = _o; if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) { return 0; /* not old type */ } /* copy pipe parameters, and if instance exists, copy * other parameters and eventually queues. */ if(dn_compat_copy_pipe(a, _o)) return DNHT_SCAN_END; } else if (a->type == DN_COMPAT_QUEUE) { struct dn_fsk *fs = _o; if (fs->fs.fs_nr >= DN_MAX_ID) return 0; if (dn_compat_copy_queue(a, _o)) return DNHT_SCAN_END; } return 0; } /* Main function to manage old requests */ int ip_dummynet_compat(struct sockopt *sopt) { int error=0; void *v = NULL; struct dn_id oid; /* Lenght of data, used to found ipfw version... */ int len = sopt->sopt_valsize; /* len can be 0 if command was dummynet_flush */ if (len == pipesize7) { D("setting compatibility with FreeBSD 7.2"); is7 = 1; } else if (len == pipesize8 || len == pipesizemax8) { D("setting compatibility with FreeBSD 8"); is7 = 0; } switch (sopt->sopt_name) { default: printf("dummynet: -- unknown option %d", sopt->sopt_name); error = EINVAL; break; case IP_DUMMYNET_FLUSH: oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); do_config(&oid, oid.len); break; case IP_DUMMYNET_DEL: v = malloc(len, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, v, len, len); if (error) break; error = dn_compat_del(v); free(v, M_TEMP); break; case IP_DUMMYNET_CONFIGURE: v = malloc(len, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, v, len, len); if (error) break; error = dn_compat_configure(v); free(v, M_TEMP); break; case IP_DUMMYNET_GET: { void *buf; int ret; int original_size = sopt->sopt_valsize; int size; ret = dummynet_get(sopt, &buf); if (ret) return 0;//XXX ? size = sopt->sopt_valsize; sopt->sopt_valsize = original_size; D("size=%d, buf=%p", size, buf); ret = sooptcopyout(sopt, buf, size); if (ret) printf(" %s ERROR sooptcopyout\n", __FUNCTION__); if (buf) free(buf, M_DUMMYNET); } } return error; } ================================================ FILE: sys/netinet/ipfw/ip_dn_io.c ================================================ /*- * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Dummynet portions related to packet handling. */ #include __FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_dn_io.c 203321 2010-01-31 21:39:25Z luigi $"); #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ #include #include #include #include /* ip_len, ip_off */ #include /* ip_output(), IP_FORWARDING */ #include #include #include #include #include #include #include /* various ether_* routines */ #include /* for ip6_input, ip6_output prototypes */ #include /* * We keep a private variable for the simulation time, but we could * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) * instead of dn_cfg.curr_time */ struct dn_parms dn_cfg; //VNET_DEFINE(struct dn_parms, _base_dn_cfg); static long tick_last; /* Last tick duration (usec). */ static long tick_delta; /* Last vs standard tick diff (usec). */ static long tick_delta_sum; /* Accumulated tick difference (usec).*/ static long tick_adjustment; /* Tick adjustments done. */ static long tick_lost; /* Lost(coalesced) ticks number. */ /* Adjusted vs non-adjusted curr_time difference (ticks). */ static long tick_diff; static unsigned long io_pkt; static unsigned long io_pkt_fast; static unsigned long io_pkt_drop; /* * We use a heap to store entities for which we have pending timer events. * The heap is checked at every tick and all entities with expired events * are extracted. */ MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); #ifdef SYSCTL_NODE SYSBEGIN(f4) SYSCTL_DECL(_net_inet); SYSCTL_DECL(_net_inet_ip); SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); /* wrapper to pass dn_cfg fields to SYSCTL_* */ //#define DC(x) (&(VNET_NAME(_base_dn_cfg).x)) #define DC(x) (&(dn_cfg.x)) /* parameters */ SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, CTLFLAG_RW, DC(hash_size), 0, "Default hash table size"); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit, CTLFLAG_RW, DC(slot_limit), 0, "Upper limit in slots for pipe queue."); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit, CTLFLAG_RW, DC(byte_limit), 0, "Upper limit in bytes for pipe queue."); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast, CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io."); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, DC(debug), 0, "Dummynet debug level"); /* RED parameters */ SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size"); /* time adjustment */ SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta, CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec)."); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum, CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec)."); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment, CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done."); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff, CTLFLAG_RD, &tick_diff, 0, "Adjusted vs non-adjusted curr_time difference (ticks)."); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost, CTLFLAG_RD, &tick_lost, 0, "Number of ticks coalesced by dummynet taskqueue."); /* Drain parameters */ SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire, CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes"); SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle, CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes"); SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_object, CTLFLAG_RW, DC(expire_object), 0, "Min # of objects before start drain routine"); SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, object_idle_tick, CTLFLAG_RD, DC(object_idle_tick), 0, "Time (in ticks) to cosiderer an object as idle"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, drain_ratio, CTLFLAG_RD, DC(drain_ratio), 0, "% of dummynet_task() to dedicate to drain routine"); /* statistics */ SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count, CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count, CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count, CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count, CTLFLAG_RD, DC(queue_count), 0, "Number of queues"); SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt, CTLFLAG_RD, &io_pkt, 0, "Number of packets passed to dummynet."); SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast, CTLFLAG_RD, &io_pkt_fast, 0, "Number of packets bypassed dummynet scheduler."); SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop, CTLFLAG_RD, &io_pkt_drop, 0, "Number of packets dropped by dummynet."); #undef DC SYSEND #endif static void dummynet_send(struct mbuf *); /* * Packets processed by dummynet have an mbuf tag associated with * them that carries their dummynet state. * Outside dummynet, only the 'rule' field is relevant, and it must * be at the beginning of the structure. */ struct dn_pkt_tag { struct ipfw_rule_ref rule; /* matching rule */ /* second part, dummynet specific */ int dn_dir; /* action when packet comes out.*/ /* see ip_fw_private.h */ uint64_t output_time; /* when the pkt is due for delivery*/ struct ifnet *ifp; /* interface, for ip_output */ struct _ip6dn_args ip6opt; /* XXX ipv6 options */ }; /* * Return the mbuf tag holding the dummynet state (it should * be the first one on the list). */ static struct dn_pkt_tag * dn_tag_get(struct mbuf *m) { struct m_tag *mtag = m_tag_first(m); KASSERT(mtag != NULL && mtag->m_tag_cookie == MTAG_ABI_COMPAT && mtag->m_tag_id == PACKET_TAG_DUMMYNET, ("packet on dummynet queue w/o dummynet tag!")); return (struct dn_pkt_tag *)(mtag+1); } static inline void mq_append(struct mq *q, struct mbuf *m) { if (q->head == NULL) q->head = m; else q->tail->m_nextpkt = m; q->tail = m; m->m_nextpkt = NULL; } /* * Dispose a list of packet. Use a functions so if we need to do * more work, this is a central point to do it. */ void dn_free_pkts(struct mbuf *mnext) { struct mbuf *m; while ((m = mnext) != NULL) { mnext = m->m_nextpkt; FREE_PKT(m); } } static int red_drops (struct dn_queue *q, int len) { /* * RED algorithm * * RED calculates the average queue size (avg) using a low-pass filter * with an exponential weighted (w_q) moving average: * avg <- (1-w_q) * avg + w_q * q_size * where q_size is the queue length (measured in bytes or * packets). * * If q_size == 0, we compute the idle time for the link, and set * avg = (1 - w_q)^(idle/s) * where s is the time needed for transmitting a medium-sized packet. * * Now, if avg < min_th the packet is enqueued. * If avg > max_th the packet is dropped. Otherwise, the packet is * dropped with probability P function of avg. */ struct dn_fsk *fs = q->fs; int64_t p_b = 0; /* Queue in bytes or packets? */ uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ? q->ni.len_bytes : q->ni.length; /* Average queue size estimation. */ if (q_size != 0) { /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */ int diff = SCALE(q_size) - q->avg; int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q); q->avg += (int)v; } else { /* * Queue is empty, find for how long the queue has been * empty and use a lookup table for computing * (1 - * w_q)^(idle_time/s) where s is the time to send a * (small) packet. * XXX check wraps... */ if (q->avg) { u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step); q->avg = (t < fs->lookup_depth) ? SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; } } /* Should i drop? */ if (q->avg < fs->min_th) { q->count = -1; return (0); /* accept packet */ } if (q->avg >= fs->max_th) { /* average queue >= max threshold */ if (fs->fs.flags & DN_IS_GENTLE_RED) { /* * According to Gentle-RED, if avg is greater than * max_th the packet is dropped with a probability * p_b = c_3 * avg - c_4 * where c_3 = (1 - max_p) / max_th * c_4 = 1 - 2 * max_p */ p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) - fs->c_4; } else { q->count = -1; return (1); } } else if (q->avg > fs->min_th) { /* * We compute p_b using the linear dropping function * p_b = c_1 * avg - c_2 * where c_1 = max_p / (max_th - min_th) * c_2 = max_p * min_th / (max_th - min_th) */ p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2; } if (fs->fs.flags & DN_QSIZE_BYTES) p_b = div64((p_b * len) , fs->max_pkt_size); if (++q->count == 0) q->random = random() & 0xffff; else { /* * q->count counts packets arrived since last drop, so a greater * value of q->count means a greater packet drop probability. */ if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) { q->count = 0; /* After a drop we calculate a new random value. */ q->random = random() & 0xffff; return (1); /* drop */ } } /* End of RED algorithm. */ return (0); /* accept */ } /* * Enqueue a packet in q, subject to space and queue management policy * (whose parameters are in q->fs). * Update stats for the queue and the scheduler. * Return 0 on success, 1 on drop. The packet is consumed anyways. */ int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) { struct dn_fs *f; struct dn_flow *ni; /* stats for scheduler instance */ uint64_t len; if (q->fs == NULL || q->_si == NULL) { printf("%s fs %p si %p, dropping\n", __FUNCTION__, q->fs, q->_si); FREE_PKT(m); return 1; } f = &(q->fs->fs); ni = &q->_si->ni; len = m->m_pkthdr.len; /* Update statistics, then check reasons to drop pkt. */ q->ni.tot_bytes += len; q->ni.tot_pkts++; ni->tot_bytes += len; ni->tot_pkts++; if (drop) goto drop; if (f->plr && random() < f->plr) goto drop; if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) goto drop; if (f->flags & DN_QSIZE_BYTES) { if (q->ni.len_bytes > f->qsize) goto drop; } else if (q->ni.length >= f->qsize) { goto drop; } mq_append(&q->mq, m); if (q->ni.length == 0) { /* queue was idle */ dn_cfg.idle_queue--; if (ni->length == 0) /* scheduler was idle */ dn_cfg.idle_si--; } q->ni.length++; q->ni.len_bytes += len; ni->length++; ni->len_bytes += len; return 0; drop: io_pkt_drop++; q->ni.drops++; ni->drops++; FREE_PKT(m); return 1; } /* * Fetch packets from the delay line which are due now. If there are * leftover packets, reinsert the delay line in the heap. * Runs under scheduler lock. */ static void transmit_event(struct mq *q, struct delay_line *dline, uint64_t now) { struct mbuf *m; struct dn_pkt_tag *pkt = NULL; dline->oid.subtype = 0; /* not in heap */ while ((m = dline->mq.head) != NULL) { pkt = dn_tag_get(m); if (!DN_KEY_LEQ(pkt->output_time, now)) break; dline->mq.head = m->m_nextpkt; mq_append(q, m); } if (m != NULL) { dline->oid.subtype = 1; /* in heap */ heap_insert(&dn_cfg.evheap, pkt->output_time, dline); } } /* * Convert the additional MAC overheads/delays into an equivalent * number of bits for the given data rate. The samples are * in milliseconds so we need to divide by 1000. */ static uint64_t extra_bits(struct mbuf *m, struct dn_schk *s) { int index; uint64_t bits; struct dn_profile *pf = s->profile; if (!pf || pf->samples_no == 0) return 0; index = random() % pf->samples_no; bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000); if (index >= pf->loss_level) { struct dn_pkt_tag *dt = dn_tag_get(m); if (dt) dt->dn_dir = DIR_DROP; } return bits; } /* * Send traffic from a scheduler instance due by 'now'. * Return a pointer to the head of the queue. */ static struct mbuf * serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now) { struct mq def_q; struct dn_schk *s = si->sched; struct mbuf *m = NULL; int delay_line_idle = (si->dline.mq.head == NULL); int done, bw; if (q == NULL) { q = &def_q; q->head = NULL; } bw = s->link.bandwidth; si->kflags &= ~DN_ACTIVE; if (bw > 0) si->credit += (now - si->sched_time) * bw; else si->credit = 0; si->sched_time = now; done = 0; while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) { uint64_t len_scaled; /* * Some schedulers might want wake up the scheduler later. * To suppor this the caller returns an mbuf with len < 0 * this will result in a new wake up of the scheduler * instance between m->m_pkthdr.len ticks. */ if (m->m_pkthdr.len < 0) { si->kflags |= DN_ACTIVE; heap_insert(&dn_cfg.evheap, now - m->m_pkthdr.len, si); if (delay_line_idle && done) transmit_event(q, &si->dline, now); return NULL; } /* a regular mbuf received */ done++; len_scaled = (bw == 0) ? 0 : hz * (m->m_pkthdr.len * 8 + extra_bits(m, s)); si->credit -= len_scaled; /* Move packet in the delay line */ dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay; mq_append(&si->dline.mq, m); } /* * If credit >= 0 the instance is idle, mark time. * Otherwise put back in the heap, and adjust the output * time of the last inserted packet, m, which was too early. */ if (si->credit >= 0) { si->idle_time = now; } else { uint64_t t; KASSERT (bw > 0, ("bw=0 and credit<0 ?")); t = div64(bw - 1 - si->credit, bw); if (m) dn_tag_get(m)->output_time += t; si->kflags |= DN_ACTIVE; heap_insert(&dn_cfg.evheap, now + t, si); } if (delay_line_idle && done) transmit_event(q, &si->dline, now); return q->head; } /* * Support function to read the TSC (or equivalent). We use this * high resolution timer to adapt the amount of work done for * expiring the clock. * Supports Linux and FreeBSD both i386 and amd64 platform * Supports OpenWRT mips architecture * * SMP no special works is needed in * - In linux 2.6 timers will always run in the same cpu that have added it.See * (http://book.opensourceproject.org.cn/kernel/kernel3rd/opensource/0596005652/understandlk-chp-6-sect-5.html) * - FreeBSD8 has a new callout_reset_on() with specify the cpu on which * the timer must be run * - Windows runs dummynet_task() on cpu0. * * - Linux 2.4 doesn't assure to run a timer in the same cpu every time. */ #ifdef HAVE_TSC uint64_t readTSC (void) { uint64_t a=0; #ifdef __linux__ /* Linux and openwrt have a macro to read the tsc for i386 and * amd64. * Openwrt have patched the kernel and allow use of tsc with mips * and other platforms * rdtscll() is a macro defined in include/asm-xxx/msr.h, * where xxx is the architecture (x86, mips). */ rdtscll(a); #elif defined(_WIN32) /* Microsoft recommends the use of KeQueryPerformanceCounter() * insteead of rdtsc(). */ KeQueryPerformanceCounter((PLARGE_INTEGER)&a); //XXX not tested! #elif defined(__FreeBSD__) /* FreeBSD (i386/amd64) has macro rdtsc() defined in machine/cpufunc.h. * We could use the macro instead of explicity assembly XXX */ return rdtsc(); #endif return a; } #endif /* HAVE_TSC */ /* * compute avg task period. * We could do something more complex, possibly. */ static void do_update_cycle(void) { #ifdef HAVE_TSC uint64_t tmp = readTSC(); #if defined (LINUX_24) && defined(CONFIG_SMP) /* on LINUX24 and SMP, we have no guarantees on which cpu runs * the timer callbacks. If the difference between new and * old value is negative, we assume that the values come from * different cpus so we adjust 'new' accordingly. */ if (tmp <= dn_cfg.cycle_task_new) dn_cfg.cycle_task_new = tmp - dn_cfg.cycle_task; #endif /* !(linux24 && SMP) */ dn_cfg.cycle_task_old = dn_cfg.cycle_task_new; dn_cfg.cycle_task_new = tmp; dn_cfg.cycle_task = dn_cfg.cycle_task_new - dn_cfg.cycle_task_old; /* Update the average * avg = (2^N * avg + new - avg ) / 2^N * avg * N==4 seems to be a good compromise between clock clock change * and 'spurious' cycle_task value */ #define DN_N 4 dn_cfg.cycle_task_avg = (dn_cfg.cycle_task_avg << DN_N) + dn_cfg.cycle_task - dn_cfg.cycle_task_avg; dn_cfg.cycle_task_avg = dn_cfg.cycle_task_avg >> DN_N; #undef DN_N #endif /* HAVE_TSC */ } static void do_drain(void) { #ifdef HAVE_TSC uint64_t dt_max; #endif if (!dn_cfg.expire || ++dn_cfg.expire_cycle < dn_cfg.expire) return; /* It's time to check if drain routines should be called */ dn_cfg.expire_cycle = 0; dn_cfg.idle_queue_wait = 0; dn_cfg.idle_si_wait = 0; /* Do a drain cycle even if there isn't time to do it */ #ifdef HAVE_TSC dt_max = dn_cfg.cycle_task_avg * dn_cfg.drain_ratio; #endif for (;;) { int done = 0; if (dn_cfg.idle_queue > dn_cfg.expire_object && dn_cfg.idle_queue_wait < dn_cfg.idle_queue) { dn_drain_queue(); done = 1; } if (dn_cfg.idle_si > dn_cfg.expire_object && dn_cfg.idle_si_wait < dn_cfg.idle_si) { dn_drain_scheduler(); done = 1; } /* time to end ? */ #ifndef HAVE_TSC /* If tsc does not exist, do only one drain cycle and exit */ break; #else /* Exit when nothing was done or we have consumed all time */ if ( (done == 0) || ((readTSC() - dn_cfg.cycle_task_new) * 100 > dt_max) ) break; #endif /* HAVE_TSC */ } } /* * The timer handler for dummynet. Time is computed in ticks, but * but the code is tolerant to the actual rate at which this is called. * Once complete, the function reschedules itself for the next tick. */ void dummynet_task(void *context, int pending) { struct timeval t; struct mq q = { NULL, NULL }; /* queue to accumulate results */ CURVNET_SET((struct vnet *)context); do_update_cycle(); /* compute avg. tick duration */ DN_BH_WLOCK(); /* Update number of lost(coalesced) ticks. */ tick_lost += pending - 1; getmicrouptime(&t); /* Last tick duration (usec). */ tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 + (t.tv_usec - dn_cfg.prev_t.tv_usec); /* Last tick vs standard tick difference (usec). */ tick_delta = (tick_last * hz - 1000000) / hz; /* Accumulated tick difference (usec). */ tick_delta_sum += tick_delta; dn_cfg.prev_t = t; /* * Adjust curr_time if the accumulated tick difference is * greater than the 'standard' tick. Since curr_time should * be monotonically increasing, we do positive adjustments * as required, and throttle curr_time in case of negative * adjustment. */ dn_cfg.curr_time++; if (tick_delta_sum - tick >= 0) { int diff = tick_delta_sum / tick; dn_cfg.curr_time += diff; tick_diff += diff; tick_delta_sum %= tick; tick_adjustment++; } else if (tick_delta_sum + tick <= 0) { dn_cfg.curr_time--; tick_diff--; tick_delta_sum += tick; tick_adjustment++; } /* serve pending events, accumulate in q */ for (;;) { struct dn_id *p; /* generic parameter to handler */ if (dn_cfg.evheap.elements == 0 || DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key)) break; p = HEAP_TOP(&dn_cfg.evheap)->object; heap_extract(&dn_cfg.evheap, NULL); if (p->type == DN_SCH_I) { serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time); } else { /* extracted a delay line */ transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time); } } do_drain(); DN_BH_WUNLOCK(); dn_reschedule(); if (q.head != NULL) dummynet_send(q.head); CURVNET_RESTORE(); } /* * forward a chain of packets to the proper destination. * This runs outside the dummynet lock. */ static void dummynet_send(struct mbuf *m) { struct mbuf *n; for (; m != NULL; m = n) { struct ifnet *ifp = NULL; /* gcc 3.4.6 complains */ struct m_tag *tag; int dst; n = m->m_nextpkt; m->m_nextpkt = NULL; tag = m_tag_first(m); if (tag == NULL) { /* should not happen */ dst = DIR_DROP; } else { struct dn_pkt_tag *pkt = dn_tag_get(m); /* extract the dummynet info, rename the tag * to carry reinject info. */ dst = pkt->dn_dir; ifp = pkt->ifp; tag->m_tag_cookie = MTAG_IPFW_RULE; tag->m_tag_id = 0; } switch (dst) { case DIR_OUT: SET_HOST_IPLEN(mtod(m, struct ip *)); ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); break ; case DIR_IN : /* put header in network format for ip_input() */ //SET_NET_IPLEN(mtod(m, struct ip *)); netisr_dispatch(NETISR_IP, m); break; #ifdef INET6 case DIR_IN | PROTO_IPV6: netisr_dispatch(NETISR_IPV6, m); break; case DIR_OUT | PROTO_IPV6: SET_HOST_IPLEN(mtod(m, struct ip *)); ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL); break; #endif case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */ if (bridge_dn_p != NULL) ((*bridge_dn_p)(m, ifp)); else printf("dummynet: if_bridge not loaded\n"); break; case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */ /* * The Ethernet code assumes the Ethernet header is * contiguous in the first mbuf header. * Insure this is true. */ if (m->m_len < ETHER_HDR_LEN && (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { printf("dummynet/ether: pullup failed, " "dropping packet\n"); break; } ether_demux(m->m_pkthdr.rcvif, m); break; case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */ ether_output_frame(ifp, m); break; case DIR_DROP: /* drop the packet after some time */ FREE_PKT(m); break; default: printf("dummynet: bad switch %d!\n", dst); FREE_PKT(m); break; } } } static inline int tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa) { struct dn_pkt_tag *dt; struct m_tag *mtag; mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*dt), M_NOWAIT | M_ZERO); if (mtag == NULL) return 1; /* Cannot allocate packet header. */ m_tag_prepend(m, mtag); /* Attach to mbuf chain. */ dt = (struct dn_pkt_tag *)(mtag + 1); dt->rule = fwa->rule; dt->rule.info &= IPFW_ONEPASS; /* only keep this info */ dt->dn_dir = dir; dt->ifp = fwa->oif; /* dt->output tame is updated as we move through */ dt->output_time = dn_cfg.curr_time; return 0; } /* * dummynet hook for packets. * We use the argument to locate the flowset fs and the sched_set sch * associated to it. The we apply flow_mask and sched_mask to * determine the queue and scheduler instances. * * dir where shall we send the packet after dummynet. * *m0 the mbuf with the packet * ifp the 'ifp' parameter from the caller. * NULL in ip_input, destination interface in ip_output, */ int dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa) { struct mbuf *m = *m0; struct dn_fsk *fs = NULL; struct dn_sch_inst *si; struct dn_queue *q = NULL; /* default */ int fs_id = (fwa->rule.info & IPFW_INFO_MASK) + ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0); DN_BH_WLOCK(); io_pkt++; /* we could actually tag outside the lock, but who cares... */ if (tag_mbuf(m, dir, fwa)) goto dropit; if (dn_cfg.busy) { /* if the upper half is busy doing something expensive, * lets queue the packet and move forward */ mq_append(&dn_cfg.pending, m); m = *m0 = NULL; /* consumed */ goto done; /* already active, nothing to do */ } /* XXX locate_flowset could be optimised with a direct ref. */ fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL); if (fs == NULL) goto dropit; /* This queue/pipe does not exist! */ if (fs->sched == NULL) /* should not happen */ goto dropit; /* * If the scheduler supports multiple queues, find the right one * (otherwise it will be ignored by enqueue). */ if (fs->sched->fp->flags & DN_MULTIQUEUE) { q = ipdn_q_find(fs, &(fwa->f_id)); if (q == NULL) goto dropit; /* The scheduler instance lookup is done only for new queue. * The callback q_new() will create the scheduler instance * if needed. */ si = q->_si; } else si = ipdn_si_find(fs->sched, &(fwa->f_id)); if (si == NULL) goto dropit; if (fs->sched->fp->enqueue(si, q, m)) { /* packet was dropped by enqueue() */ m = *m0 = NULL; goto dropit; } if (si->kflags & DN_ACTIVE) { m = *m0 = NULL; /* consumed */ goto done; /* already active, nothing to do */ } /* compute the initial allowance */ if (si->idle_time < dn_cfg.curr_time) { /* Do this only on the first packet on an idle pipe */ struct dn_link *p = &fs->sched->link; si->sched_time = dn_cfg.curr_time; si->credit = dn_cfg.io_fast ? p->bandwidth : 0; if (p->burst) { uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth; if (burst > p->burst) burst = p->burst; si->credit += burst; } } /* pass through scheduler and delay line */ m = serve_sched(NULL, si, dn_cfg.curr_time); /* optimization -- pass it back to ipfw for immediate send */ /* XXX Don't call dummynet_send() if scheduler return the packet * just enqueued. This avoid a lock order reversal. * */ if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) { /* fast io, rename the tag * to carry reinject info. */ struct m_tag *tag = m_tag_first(m); tag->m_tag_cookie = MTAG_IPFW_RULE; tag->m_tag_id = 0; io_pkt_fast++; if (m->m_nextpkt != NULL) { printf("dummynet: fast io: pkt chain detected!\n"); m->m_nextpkt = NULL; } m = NULL; } else { *m0 = NULL; } done: DN_BH_WUNLOCK(); if (m) dummynet_send(m); return 0; dropit: io_pkt_drop++; DN_BH_WUNLOCK(); if (m) FREE_PKT(m); *m0 = NULL; return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS; } ================================================ FILE: sys/netinet/ipfw/ip_dn_private.h ================================================ /*- * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * internal dummynet APIs. * * $FreeBSD: head/sys/netinet/ipfw/ip_dn_private.h 204591 2010-03-02 17:40:48Z luigi $ */ #ifndef _IP_DN_PRIVATE_H #define _IP_DN_PRIVATE_H /* debugging support * use ND() to remove debugging, D() to print a line, * DX(level, ...) to print above a certain level * If you redefine D() you are expected to redefine all. */ #ifndef D #define ND(fmt, ...) do {} while (0) #define D1(fmt, ...) do {} while (0) #define D(fmt, ...) printf("%-10s " fmt "\n", \ __FUNCTION__, ## __VA_ARGS__) #define DX(lev, fmt, ...) do { \ if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0) #endif MALLOC_DECLARE(M_DUMMYNET); #ifndef __linux__ #define div64(a, b) ((int64_t)(a) / (int64_t)(b)) #endif #define DN_LOCK_INIT() do { \ mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF); \ mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF); \ } while (0) #define DN_LOCK_DESTROY() do { \ mtx_destroy(&dn_cfg.uh_mtx); \ mtx_destroy(&dn_cfg.bh_mtx); \ } while (0) #if 0 /* not used yet */ #define DN_UH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) #define DN_UH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) #define DN_UH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) #define DN_UH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) #define DN_UH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) #endif #define DN_BH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) #define DN_BH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) #define DN_BH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) #define DN_BH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) #define DN_BH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) SLIST_HEAD(dn_schk_head, dn_schk); SLIST_HEAD(dn_sch_inst_head, dn_sch_inst); SLIST_HEAD(dn_fsk_head, dn_fsk); SLIST_HEAD(dn_queue_head, dn_queue); SLIST_HEAD(dn_alg_head, dn_alg); struct mq { /* a basic queue of packets*/ struct mbuf *head, *tail; }; static inline void set_oid(struct dn_id *o, int type, int len) { o->type = type; o->len = len; o->subtype = 0; }; uint64_t readTSC (void); /* * see if tsc (ot other timer) is supported. * - FreeBSD has rdtsc macro for i386 and amd64 * - Linux has rdtscll and/or rdtsc (also for openWRT patched kernel source) * - Windows has KeQueryPerformanceCounter() function that use tsc or other * timer */ #if defined(rdtscll) || defined(rdtsc) || defined(_WIN32) #define HAVE_TSC #endif /* * configuration and global data for a dummynet instance * * When a configuration is modified from userland, 'id' is incremented * so we can use the value to check for stale pointers. */ struct dn_parms { uint32_t id; /* configuration version */ /* defaults (sysctl-accessible) */ int red_lookup_depth; int red_avg_pkt_size; int red_max_pkt_size; int hash_size; int max_hash_size; long byte_limit; /* max queue sizes */ long slot_limit; int io_fast; int debug; /* timekeeping */ struct timeval prev_t; /* last time dummynet_tick ran */ struct dn_heap evheap; /* scheduled events */ /* counters of objects -- used for reporting space */ int schk_count; int si_count; int fsk_count; int queue_count; /* ticks and other stuff */ uint64_t curr_time; /* in ticks */ /* * Variables to manage the time spent in the drain routines. * max_drain is max the fraction of a tick (0..100) to be used * for draining. * We also need some variables to store the average number of * timecounter ticks between calls to the periodic task, etc. */ int drain_ratio; uint64_t cycle_task_new; /* TSC when dummynet_task() starts */ uint64_t cycle_task_old; /* TSC when prev. dummynet_task() starts */ uint64_t cycle_task; uint64_t cycle_task_avg; /* Moving average of cicle_task */ /* flowsets and schedulers are in hash tables, with 'hash_size' * buckets. fshash is looked up at every packet arrival * so better be generous if we expect many entries. */ struct dn_ht *fshash; struct dn_ht *schedhash; /* list of flowsets without a scheduler -- use sch_chain */ struct dn_fsk_head fsu; /* list of unlinked flowsets */ struct dn_alg_head schedlist; /* list of algorithms */ /* Counter of idle objects -- used by drain routine * We scan when idle_queue (or idle_si) > expire_object. * The drain routine is called every 'expire' cycles (the counter * used is expire_cycle). * We can disable the expire routine by setting expire to 0. * An object is kept alive for at least object_idle_tick after it * becomes idle. During the scan, we count the number of objects * that are idle but not ready in 'idle_si_wait' and 'idle_queue_wait' */ int idle_queue; int idle_queue_wait; /* idle but not expired yet */ int idle_si; int idle_si_wait; /* idle but not expired yet */ uint32_t expire_object; /* threshold for expires */ uint32_t expire; /* how often to expire */ uint32_t expire_cycle; uint32_t object_idle_tick; /* lifetime of objs */ uint32_t expire_object_examined; /* Burst of object examined */ /* drain_fs and drain_sch point to the next bucket to scan when * draining. */ uint32_t drain_fs; uint32_t drain_sch; int init_done; /* if the upper half is busy doing something long, * can set the busy flag and we will enqueue packets in * a queue for later processing. */ int busy; struct mq pending; #ifdef _KERNEL /* * This file is normally used in the kernel, unless we do * some userland tests, in which case we do not need a mtx. * uh_mtx arbitrates between system calls and also * protects fshash, schedhash and fsunlinked. * These structures are readonly for the lower half. * bh_mtx protects all other structures which may be * modified upon packet arrivals */ #if defined( __linux__ ) || defined( _WIN32 ) spinlock_t uh_mtx; spinlock_t bh_mtx; #else struct mtx uh_mtx; struct mtx bh_mtx; #endif #endif /* _KERNEL */ }; /* * Delay line, contains all packets on output from a link. * Every scheduler instance has one. */ struct delay_line { struct dn_id oid; struct dn_sch_inst *si; struct mq mq; }; /* * The kernel side of a flowset. It is linked in a hash table * of flowsets, and in a list of children of their parent scheduler. * qht is either the queue or (if HAVE_MASK) a hash table queues. * Note that the mask to use is the (flow_mask|sched_mask), which * changes as we attach/detach schedulers. So we store it here. * * XXX If we want to add scheduler-specific parameters, we need to * put them in external storage because the scheduler may not be * available when the fsk is created. */ struct dn_fsk { /* kernel side of a flowset */ struct dn_fs fs; SLIST_ENTRY(dn_fsk) fsk_next; /* hash chain for fshash */ struct ipfw_flow_id fsk_mask; /* qht is a hash table of queues, or just a single queue * a bit in fs.flags tells us which one */ struct dn_ht *qht; struct dn_schk *sched; /* Sched we are linked to */ SLIST_ENTRY(dn_fsk) sch_chain; /* list of fsk attached to sched */ /* bucket index used by drain routine to drain queues for this * flowset */ int drain_bucket; /* Parameter realted to RED / GRED */ /* original values are in dn_fs*/ int w_q ; /* queue weight (scaled) */ int max_th ; /* maximum threshold for queue (scaled) */ int min_th ; /* minimum threshold for queue (scaled) */ int max_p ; /* maximum value for p_b (scaled) */ u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ u_int lookup_depth ; /* depth of lookup table */ int lookup_step ; /* granularity inside the lookup table */ int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ int avg_pkt_size ; /* medium packet size */ int max_pkt_size ; /* max packet size */ }; /* * A queue is created as a child of a flowset unless it belongs to * a !MULTIQUEUE scheduler. It is normally in a hash table in the * flowset. fs always points to the parent flowset. * si normally points to the sch_inst, unless the flowset has been * detached from the scheduler -- in this case si == NULL and we * should not enqueue. */ struct dn_queue { struct dn_flow ni; /* oid, flow_id, stats */ struct mq mq; /* packets queue */ struct dn_sch_inst *_si; /* owner scheduler instance */ SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */ struct dn_fsk *fs; /* parent flowset. */ /* RED parameters */ int avg; /* average queue length est. (scaled) */ int count; /* arrivals since last RED drop */ int random; /* random value (scaled) */ uint64_t q_time; /* start of queue idle time */ }; /* * The kernel side of a scheduler. Contains the userland config, * a link, pointer to extra config arguments from command line, * kernel flags, and a pointer to the scheduler methods. * It is stored in a hash table, and holds a list of all * flowsets and scheduler instances. * XXX sch must be at the beginning, see schk_hash(). */ struct dn_schk { struct dn_sch sch; struct dn_alg *fp; /* Pointer to scheduler functions */ struct dn_link link; /* The link, embedded */ struct dn_profile *profile; /* delay profile, if any */ struct dn_id *cfg; /* extra config arguments */ SLIST_ENTRY(dn_schk) schk_next; /* hash chain for schedhash */ struct dn_fsk_head fsk_list; /* all fsk linked to me */ struct dn_fsk *fs; /* Flowset for !MULTIQUEUE */ /* bucket index used by the drain routine to drain the scheduler * instance for this flowset. */ int drain_bucket; /* Hash table of all instances (through sch.sched_mask) * or single instance if no mask. Always valid. */ struct dn_ht *siht; }; /* * Scheduler instance. * Contains variables and all queues relative to a this instance. * This struct is created a runtime. */ struct dn_sch_inst { struct dn_flow ni; /* oid, flowid and stats */ SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */ struct delay_line dline; struct dn_schk *sched; /* the template */ int kflags; /* DN_ACTIVE */ int64_t credit; /* bits I can transmit (more or less). */ uint64_t sched_time; /* time link was scheduled in ready_heap */ uint64_t idle_time; /* start of scheduler instance idle time */ /* q_count is the number of queues that this instance is using. * The counter is incremented or decremented when * a reference from the queue is created or deleted. * It is used to make sure that a scheduler instance can be safely * deleted by the drain routine. */ int q_count; }; /* kernel-side flags. Linux has DN_DELETE in fcntl.h */ enum { /* 1 and 2 are reserved for the SCAN flags */ DN_DESTROY = 0x0004, /* destroy */ DN_DELETE_FS = 0x0008, /* destroy flowset */ DN_DETACH = 0x0010, DN_ACTIVE = 0x0020, /* object is in evheap */ DN_F_DLINE = 0x0040, /* object is a delay line */ DN_DEL_SAFE = 0x0080, /* delete a queue only if no longer needed * by scheduler */ DN_QHT_IS_Q = 0x0100, /* in flowset, qht is a single queue */ }; extern struct dn_parms dn_cfg; //VNET_DECLARE(struct dn_parms, _base_dn_cfg); //#define dn_cfg VNET(_base_dn_cfg) int dummynet_io(struct mbuf **, int , struct ip_fw_args *); void dummynet_task(void *context, int pending); void dn_reschedule(void); struct dn_queue *ipdn_q_find(struct dn_fsk *, struct ipfw_flow_id *); struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *); /* * copy_range is a template for requests for ranges of pipes/queues/scheds. * The number of ranges is variable and can be derived by o.len. * As a default, we use a small number of entries so that the struct * fits easily on the stack and is sufficient for most common requests. */ #define DEFAULT_RANGES 5 struct copy_range { struct dn_id o; uint32_t r[ 2 * DEFAULT_RANGES ]; }; struct copy_args { char **start; char *end; int flags; int type; struct copy_range *extra; /* extra filtering */ }; struct sockopt; int ip_dummynet_compat(struct sockopt *sopt); int dummynet_get(struct sockopt *sopt, void **compat); int dn_c_copy_q (void *_ni, void *arg); int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq); int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq); int dn_compat_copy_queue(struct copy_args *a, void *_o); int dn_compat_copy_pipe(struct copy_args *a, void *_o); int copy_data_helper_compat(void *_o, void *_arg); int dn_compat_calc_size(void); int do_config(void *p, int l); /* function to drain idle object */ void dn_drain_scheduler(void); void dn_drain_queue(void); #endif /* _IP_DN_PRIVATE_H */ ================================================ FILE: sys/netinet/ipfw/ip_dummynet.c ================================================ /*- * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa * Portions Copyright (c) 2000 Akamba Corp. * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_dummynet.c 203340 2010-02-01 12:06:37Z luigi $"); /* * Configuration and internal object management for dummynet. */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ #include #include /* ip_output(), IP_FORWARDING */ #include #include #include #include #include #include /* which objects to copy */ #define DN_C_LINK 0x01 #define DN_C_SCH 0x02 #define DN_C_FLOW 0x04 #define DN_C_FS 0x08 #define DN_C_QUEUE 0x10 /* we use this argument in case of a schk_new */ struct schk_new_arg { struct dn_alg *fp; struct dn_sch *sch; }; /*---- callout hooks. ----*/ static struct callout dn_timeout; static struct task dn_task; static struct taskqueue *dn_tq = NULL; /* dummynet and ipfw_tick can't be static in windows */ void dummynet(void * arg) { (void)arg; /* UNUSED */ taskqueue_enqueue(dn_tq, &dn_task); } void dn_reschedule(void) { callout_reset_on(&dn_timeout, 1, dummynet, NULL, 0); } /*----- end of callout hooks -----*/ /* Return a scheduler descriptor given the type or name. */ static struct dn_alg * find_sched_type(int type, char *name) { struct dn_alg *d; SLIST_FOREACH(d, &dn_cfg.schedlist, next) { if (d->type == type || (name && !strcasecmp(d->name, name))) return d; } return NULL; /* not found */ } int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg) { int oldv = *v; const char *op = NULL; if (dflt < lo) dflt = lo; if (dflt > hi) dflt = hi; if (oldv < lo) { *v = dflt; op = "Bump"; } else if (oldv > hi) { *v = hi; op = "Clamp"; } else return *v; if (op && msg) printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); return *v; } /*---- flow_id mask, hash and compare functions ---*/ /* * The flow_id includes the 5-tuple, the queue/pipe number * which we store in the extra area in host order, * and for ipv6 also the flow_id6. * XXX see if we want the tos byte (can store in 'flags') */ static struct ipfw_flow_id * flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id) { int is_v6 = IS_IP6_FLOW_ID(id); id->dst_port &= mask->dst_port; id->src_port &= mask->src_port; id->proto &= mask->proto; id->extra &= mask->extra; if (is_v6) { APPLY_MASK(&id->dst_ip6, &mask->dst_ip6); APPLY_MASK(&id->src_ip6, &mask->src_ip6); id->flow_id6 &= mask->flow_id6; } else { id->dst_ip &= mask->dst_ip; id->src_ip &= mask->src_ip; } return id; } /* computes an OR of two masks, result in dst and also returned */ static struct ipfw_flow_id * flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst) { int is_v6 = IS_IP6_FLOW_ID(dst); dst->dst_port |= src->dst_port; dst->src_port |= src->src_port; dst->proto |= src->proto; dst->extra |= src->extra; if (is_v6) { #define OR_MASK(_d, _s) \ (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \ (_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \ (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \ (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3]; OR_MASK(&dst->dst_ip6, &src->dst_ip6); OR_MASK(&dst->src_ip6, &src->src_ip6); #undef OR_MASK dst->flow_id6 |= src->flow_id6; } else { dst->dst_ip |= src->dst_ip; dst->src_ip |= src->src_ip; } return dst; } static int nonzero_mask(struct ipfw_flow_id *m) { if (m->dst_port || m->src_port || m->proto || m->extra) return 1; if (IS_IP6_FLOW_ID(m)) { return m->dst_ip6.__u6_addr.__u6_addr32[0] || m->dst_ip6.__u6_addr.__u6_addr32[1] || m->dst_ip6.__u6_addr.__u6_addr32[2] || m->dst_ip6.__u6_addr.__u6_addr32[3] || m->src_ip6.__u6_addr.__u6_addr32[0] || m->src_ip6.__u6_addr.__u6_addr32[1] || m->src_ip6.__u6_addr.__u6_addr32[2] || m->src_ip6.__u6_addr.__u6_addr32[3] || m->flow_id6; } else { return m->dst_ip || m->src_ip; } } /* XXX we may want a better hash function */ static uint32_t flow_id_hash(struct ipfw_flow_id *id) { uint32_t i; if (IS_IP6_FLOW_ID(id)) { uint32_t *d = (uint32_t *)&id->dst_ip6; uint32_t *s = (uint32_t *)&id->src_ip6; i = (d[0] ) ^ (d[1]) ^ (d[2] ) ^ (d[3]) ^ (d[0] >> 15) ^ (d[1] >> 15) ^ (d[2] >> 15) ^ (d[3] >> 15) ^ (s[0] << 1) ^ (s[1] << 1) ^ (s[2] << 1) ^ (s[3] << 1) ^ (s[0] << 16) ^ (s[1] << 16) ^ (s[2] << 16) ^ (s[3] << 16) ^ (id->dst_port << 1) ^ (id->src_port) ^ (id->extra) ^ (id->proto ) ^ (id->flow_id6); } else { i = (id->dst_ip) ^ (id->dst_ip >> 15) ^ (id->src_ip << 1) ^ (id->src_ip >> 16) ^ (id->extra) ^ (id->dst_port << 1) ^ (id->src_port) ^ (id->proto); } return i; } /* Like bcmp, returns 0 if ids match, 1 otherwise. */ static int flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2) { int is_v6 = IS_IP6_FLOW_ID(id1); if (!is_v6) { if (IS_IP6_FLOW_ID(id2)) return 1; /* different address families */ return (id1->dst_ip == id2->dst_ip && id1->src_ip == id2->src_ip && id1->dst_port == id2->dst_port && id1->src_port == id2->src_port && id1->proto == id2->proto && id1->extra == id2->extra) ? 0 : 1; } /* the ipv6 case */ return ( !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) && !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) && id1->dst_port == id2->dst_port && id1->src_port == id2->src_port && id1->proto == id2->proto && id1->extra == id2->extra && id1->flow_id6 == id2->flow_id6) ? 0 : 1; } /*--------- end of flow-id mask, hash and compare ---------*/ /*--- support functions for the qht hashtable ---- * Entries are hashed by flow-id */ static uint32_t q_hash(uintptr_t key, int flags, void *arg) { /* compute the hash slot from the flow id */ struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? &((struct dn_queue *)key)->ni.fid : (struct ipfw_flow_id *)key; return flow_id_hash(id); } static int q_match(void *obj, uintptr_t key, int flags, void *arg) { struct dn_queue *o = (struct dn_queue *)obj; struct ipfw_flow_id *id2; if (flags & DNHT_KEY_IS_OBJ) { /* compare pointers */ id2 = &((struct dn_queue *)key)->ni.fid; } else { id2 = (struct ipfw_flow_id *)key; } return (0 == flow_id_cmp(&o->ni.fid, id2)); } /* * create a new queue instance for the given 'key'. */ static void * q_new(uintptr_t key, int flags, void *arg) { struct dn_queue *q, *template = arg; struct dn_fsk *fs = template->fs; int size = sizeof(*q) + fs->sched->fp->q_datalen; q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO); if (q == NULL) { D("no memory for new queue"); return NULL; } set_oid(&q->ni.oid, DN_QUEUE, size); if (fs->fs.flags & DN_QHT_HASH) q->ni.fid = *(struct ipfw_flow_id *)key; q->fs = fs; q->_si = ipdn_si_find(q->fs->sched, &(template->ni.fid)); if (q->_si == NULL) { D("no memory for new si"); free (q, M_DUMMYNET); return NULL; } q->_si->q_count++; if (fs->sched->fp->new_queue) fs->sched->fp->new_queue(q); dn_cfg.queue_count++; dn_cfg.idle_queue++; return q; } /* * Notify schedulers that a queue is going away. * If (flags & DN_DESTROY), also free the packets. * The version for callbacks is called q_delete_cb(). * Returns 1 if the queue is NOT deleted (usually when * the drain routine try to delete a queue that a scheduler * instance needs), 0 otherwise. * NOTE: flag DN_DEL_SAFE means that the queue should be * deleted only if the scheduler no longer needs it */ static int dn_delete_queue(struct dn_queue *q, int flags) { struct dn_fsk *fs = q->fs; // D("fs %p si %p\n", fs, q->_si); /* notify the parent scheduler that the queue is going away */ if (fs && fs->sched->fp->free_queue) if (fs->sched->fp->free_queue(q, flags & DN_DEL_SAFE) == 1) return 1; /* queue NOT deleted */ q->_si->q_count--; q->_si = NULL; if (flags & DN_DESTROY) { if (q->mq.head) dn_free_pkts(q->mq.head); else dn_cfg.idle_queue--; bzero(q, sizeof(*q)); // safety free(q, M_DUMMYNET); dn_cfg.queue_count--; } return 0; } static int q_delete_cb(void *q, void *arg) { int flags = (int)(uintptr_t)arg; dn_delete_queue(q, flags); return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0; } /* * calls dn_delete_queue/q_delete_cb on all queues, * which notifies the parent scheduler and possibly drains packets. * flags & DN_DESTROY: drains queues and destroy qht; */ static void qht_delete(struct dn_fsk *fs, int flags) { ND("fs %d start flags %d qht %p", fs->fs.fs_nr, flags, fs->qht); if (!fs->qht) return; if (fs->fs.flags & DN_QHT_HASH) { dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags); if (flags & DN_DESTROY) { dn_ht_free(fs->qht, 0); fs->qht = NULL; } } else { dn_delete_queue((struct dn_queue *)(fs->qht), flags); if (flags & DN_DESTROY) fs->qht = NULL; } } /* * Find and possibly create the queue for a MULTIQUEUE scheduler. * We never call it for !MULTIQUEUE (the queue is in the sch_inst). */ struct dn_queue * ipdn_q_find(struct dn_fsk *fs, struct ipfw_flow_id *id) { struct dn_queue template; template.fs = fs; if (fs->fs.flags & DN_QHT_HASH) { struct ipfw_flow_id masked_id; if (fs->qht == NULL) { fs->qht = dn_ht_init(NULL, fs->fs.buckets, offsetof(struct dn_queue, q_next), q_hash, q_match, q_new); if (fs->qht == NULL) return NULL; } masked_id = *id; flow_id_mask(&fs->fsk_mask, &masked_id); return dn_ht_find(fs->qht, (uintptr_t)&masked_id, DNHT_INSERT, &template); } else { if (fs->qht == NULL) fs->qht = q_new(0, 0, &template); return (struct dn_queue *)fs->qht; } } /*--- end of queue hash table ---*/ /*--- support functions for the sch_inst hashtable ---- * * These are hashed by flow-id */ static uint32_t si_hash(uintptr_t key, int flags, void *arg) { /* compute the hash slot from the flow id */ struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? &((struct dn_sch_inst *)key)->ni.fid : (struct ipfw_flow_id *)key; return flow_id_hash(id); } static int si_match(void *obj, uintptr_t key, int flags, void *arg) { struct dn_sch_inst *o = obj; struct ipfw_flow_id *id2; id2 = (flags & DNHT_KEY_IS_OBJ) ? &((struct dn_sch_inst *)key)->ni.fid : (struct ipfw_flow_id *)key; return flow_id_cmp(&o->ni.fid, id2) == 0; } static int si_reset_credit(void *_si, void *arg); // XXX si_new use this /* * create a new instance for the given 'key' * Allocate memory for instance, delay line and scheduler private data. */ static void * si_new(uintptr_t key, int flags, void *arg) { struct dn_schk *s = arg; struct dn_sch_inst *si; int l = sizeof(*si) + s->fp->si_datalen; si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); if (si == NULL) goto error; /* Set length only for the part passed up to userland. */ set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow)); set_oid(&(si->dline.oid), DN_DELAY_LINE, sizeof(struct delay_line)); /* mark si and dline as outside the event queue */ si->ni.oid.id = si->dline.oid.id = -1; si->sched = s; si->dline.si = si; if (s->fp->new_sched && s->fp->new_sched(si)) { D("new_sched error"); goto error; } if (s->sch.flags & DN_HAVE_MASK) si->ni.fid = *(struct ipfw_flow_id *)key; si_reset_credit(si, NULL); dn_cfg.si_count++; dn_cfg.idle_si++; return si; error: if (si) { bzero(si, sizeof(*si)); // safety free(si, M_DUMMYNET); } return NULL; } /* * Callback from siht to delete all scheduler instances. Remove * si and delay line from the system heap, destroy all queues. * We assume that all flowset have been notified and do not * point to us anymore. */ static int si_destroy(void *_si, void *arg) { struct dn_sch_inst *si = _si; struct dn_schk *s = si->sched; struct delay_line *dl = &si->dline; if (dl->oid.subtype) /* remove delay line from event heap */ heap_extract(&dn_cfg.evheap, dl); if (si->ni.length == 0) dn_cfg.idle_si--; dn_free_pkts(dl->mq.head); /* drain delay line */ if (si->kflags & DN_ACTIVE) /* remove si from event heap */ heap_extract(&dn_cfg.evheap, si); if (s->fp->free_sched) s->fp->free_sched(si); bzero(si, sizeof(*si)); /* safety */ free(si, M_DUMMYNET); dn_cfg.si_count--; return DNHT_SCAN_DEL; } /* * Find the scheduler instance for this packet. If we need to apply * a mask, do on a local copy of the flow_id to preserve the original. * Assume siht is always initialized if we have a mask. */ struct dn_sch_inst * ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id) { if (s->sch.flags & DN_HAVE_MASK) { struct ipfw_flow_id id_t = *id; flow_id_mask(&s->sch.sched_mask, &id_t); return dn_ht_find(s->siht, (uintptr_t)&id_t, DNHT_INSERT, s); } if (!s->siht) s->siht = si_new(0, 0, s); return (struct dn_sch_inst *)s->siht; } /* callback to flush credit for the scheduler instance */ static int si_reset_credit(void *_si, void *arg) { struct dn_sch_inst *si = _si; struct dn_link *p = &si->sched->link; si->idle_time = dn_cfg.curr_time; si->credit = p->burst + (dn_cfg.io_fast ? p->bandwidth : 0); return 0; } static void schk_reset_credit(struct dn_schk *s) { if (s->sch.flags & DN_HAVE_MASK) dn_ht_scan(s->siht, si_reset_credit, NULL); else if (s->siht) si_reset_credit(s->siht, NULL); } /*---- end of sch_inst hashtable ---------------------*/ /*------------------------------------------------------- * flowset hash (fshash) support. Entries are hashed by fs_nr. * New allocations are put in the fsunlinked list, from which * they are removed when they point to a specific scheduler. */ static uint32_t fsk_hash(uintptr_t key, int flags, void *arg) { uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : ((struct dn_fsk *)key)->fs.fs_nr; return ( (i>>8)^(i>>4)^i ); } static int fsk_match(void *obj, uintptr_t key, int flags, void *arg) { struct dn_fsk *fs = obj; int i = !(flags & DNHT_KEY_IS_OBJ) ? key : ((struct dn_fsk *)key)->fs.fs_nr; return (fs->fs.fs_nr == i); } static void * fsk_new(uintptr_t key, int flags, void *arg) { struct dn_fsk *fs; fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO); if (fs) { set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs)); dn_cfg.fsk_count++; fs->drain_bucket = 0; SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); } return fs; } /* * detach flowset from its current scheduler. Flags as follows: * DN_DETACH removes from the fsk_list * DN_DESTROY deletes individual queues * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked). */ static void fsk_detach(struct dn_fsk *fs, int flags) { if (flags & DN_DELETE_FS) flags |= DN_DESTROY; ND("fs %d from sched %d flags %s %s %s", fs->fs.fs_nr, fs->fs.sched_nr, (flags & DN_DELETE_FS) ? "DEL_FS":"", (flags & DN_DESTROY) ? "DEL":"", (flags & DN_DETACH) ? "DET":""); if (flags & DN_DETACH) { /* detach from the list */ struct dn_fsk_head *h; h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu; SLIST_REMOVE(h, fs, dn_fsk, sch_chain); } /* Free the RED parameters, they will be recomputed on * subsequent attach if needed. */ if (fs->w_q_lookup) free(fs->w_q_lookup, M_DUMMYNET); fs->w_q_lookup = NULL; qht_delete(fs, flags); if (fs->sched && fs->sched->fp->free_fsk) fs->sched->fp->free_fsk(fs); fs->sched = NULL; if (flags & DN_DELETE_FS) { bzero(fs, sizeof(*fs)); /* safety */ free(fs, M_DUMMYNET); dn_cfg.fsk_count--; } else { SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); } } /* * Detach or destroy all flowsets in a list. * flags specifies what to do: * DN_DESTROY: flush all queues * DN_DELETE_FS: DN_DESTROY + destroy flowset * DN_DELETE_FS implies DN_DESTROY */ static void fsk_detach_list(struct dn_fsk_head *h, int flags) { struct dn_fsk *fs; int n = 0; /* only for stats */ ND("head %p flags %x", h, flags); while ((fs = SLIST_FIRST(h))) { SLIST_REMOVE_HEAD(h, sch_chain); n++; fsk_detach(fs, flags); } ND("done %d flowsets", n); } /* * called on 'queue X delete' -- removes the flowset from fshash, * deletes all queues for the flowset, and removes the flowset. */ static int delete_fs(int i, int locked) { struct dn_fsk *fs; int err = 0; if (!locked) DN_BH_WLOCK(); fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL); if (dn_ht_entries(dn_cfg.fshash) == 0) { dn_ht_free(dn_cfg.fshash, 0); dn_cfg.fshash = NULL; } ND("fs %d found %p", i, fs); if (fs) { fsk_detach(fs, DN_DETACH | DN_DELETE_FS); err = 0; } else err = EINVAL; if (!locked) DN_BH_WUNLOCK(); return err; } /*----- end of flowset hashtable support -------------*/ /*------------------------------------------------------------ * Scheduler hash. When searching by index we pass sched_nr, * otherwise we pass struct dn_sch * which is the first field in * struct dn_schk so we can cast between the two. We use this trick * because in the create phase (but it should be fixed). */ static uint32_t schk_hash(uintptr_t key, int flags, void *_arg) { uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : ((struct dn_schk *)key)->sch.sched_nr; return ( (i>>8)^(i>>4)^i ); } static int schk_match(void *obj, uintptr_t key, int flags, void *_arg) { struct dn_schk *s = (struct dn_schk *)obj; int i = !(flags & DNHT_KEY_IS_OBJ) ? key : ((struct dn_schk *)key)->sch.sched_nr; return (s->sch.sched_nr == i); } /* * Create the entry and intialize with the sched hash if needed. * Leave s->fp unset so we can tell whether a dn_ht_find() returns * a new object or a previously existing one. */ static void * schk_new(uintptr_t key, int flags, void *arg) { struct schk_new_arg *a = arg; struct dn_schk *s; int l = sizeof(*s) +a->fp->schk_datalen; s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); if (s == NULL) return NULL; set_oid(&s->link.oid, DN_LINK, sizeof(s->link)); s->sch = *a->sch; // copy initial values s->link.link_nr = s->sch.sched_nr; SLIST_INIT(&s->fsk_list); /* initialize the hash table or create the single instance */ s->fp = a->fp; /* si_new needs this */ s->drain_bucket = 0; if (s->sch.flags & DN_HAVE_MASK) { s->siht = dn_ht_init(NULL, s->sch.buckets, offsetof(struct dn_sch_inst, si_next), si_hash, si_match, si_new); if (s->siht == NULL) { free(s, M_DUMMYNET); return NULL; } } s->fp = NULL; /* mark as a new scheduler */ dn_cfg.schk_count++; return s; } /* * Callback for sched delete. Notify all attached flowsets to * detach from the scheduler, destroy the internal flowset, and * all instances. The scheduler goes away too. * arg is 0 (only detach flowsets and destroy instances) * DN_DESTROY (detach & delete queues, delete schk) * or DN_DELETE_FS (delete queues and flowsets, delete schk) */ static int schk_delete_cb(void *obj, void *arg) { struct dn_schk *s = obj; #if 0 int a = (int)arg; ND("sched %d arg %s%s", s->sch.sched_nr, a&DN_DESTROY ? "DEL ":"", a&DN_DELETE_FS ? "DEL_FS":""); #endif fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0); /* no more flowset pointing to us now */ if (s->sch.flags & DN_HAVE_MASK) { dn_ht_scan(s->siht, si_destroy, NULL); dn_ht_free(s->siht, 0); } else if (s->siht) si_destroy(s->siht, NULL); if (s->profile) { free(s->profile, M_DUMMYNET); s->profile = NULL; } s->siht = NULL; if (s->fp->destroy) s->fp->destroy(s); bzero(s, sizeof(*s)); // safety free(obj, M_DUMMYNET); dn_cfg.schk_count--; return DNHT_SCAN_DEL; } /* * called on a 'sched X delete' command. Deletes a single scheduler. * This is done by removing from the schedhash, unlinking all * flowsets and deleting their traffic. */ static int delete_schk(int i) { struct dn_schk *s; s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); if (dn_ht_entries(dn_cfg.schedhash) == 0) { dn_ht_free(dn_cfg.schedhash, 0); dn_cfg.schedhash = NULL; } ND("%d %p", i, s); if (!s) return EINVAL; delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */ /* then detach flowsets, delete traffic */ schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY); return 0; } /*--- end of schk hashtable support ---*/ static int copy_obj(char **start, char *end, void *_o, const char *msg, int i) { struct dn_id *o = _o; int have = end - *start; if (have < o->len || o->len == 0 || o->type == 0) { D("(WARN) type %d %s %d have %d need %d", o->type, msg, i, have, o->len); return 1; } ND("type %d %s %d len %d", o->type, msg, i, o->len); bcopy(_o, *start, o->len); if (o->type == DN_LINK) { /* Adjust burst parameter for link */ struct dn_link *l = (struct dn_link *)*start; l->burst = div64(l->burst, 8 * hz); } else if (o->type == DN_SCH) { /* Set id->id to the number of instances */ struct dn_schk *s = _o; struct dn_id *id = (struct dn_id *)(*start); id->id = (s->sch.flags & DN_HAVE_MASK) ? dn_ht_entries(s->siht) : (s->siht ? 1 : 0); } *start += o->len; return 0; } /* Specific function to copy a queue. * Copies only the user-visible part of a queue (which is in * a struct dn_flow), and sets len accordingly. */ static int copy_obj_q(char **start, char *end, void *_o, const char *msg, int i) { struct dn_id *o = _o; int have = end - *start; int len = sizeof(struct dn_flow); /* see above comment */ if (have < len || o->len == 0 || o->type != DN_QUEUE) { D("ERROR type %d %s %d have %d need %d", o->type, msg, i, have, len); return 1; } ND("type %d %s %d len %d", o->type, msg, i, len); bcopy(_o, *start, len); ((struct dn_id*)(*start))->len = len; *start += len; return 0; } static int copy_q_cb(void *obj, void *arg) { struct dn_queue *q = obj; struct copy_args *a = arg; struct dn_flow *ni = (struct dn_flow *)(*a->start); if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1)) return DNHT_SCAN_END; ni->oid.type = DN_FLOW; /* override the DN_QUEUE */ ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL); return 0; } static int copy_q(struct copy_args *a, struct dn_fsk *fs, int flags) { if (!fs->qht) return 0; if (fs->fs.flags & DN_QHT_HASH) dn_ht_scan(fs->qht, copy_q_cb, a); else copy_q_cb(fs->qht, a); return 0; } /* * This routine only copies the initial part of a profile ? XXX * XXX marta: I think this routine is called to print a summary * of the pipe configuration and does not need to show the * profile samples list. */ static int copy_profile(struct copy_args *a, struct dn_profile *p) { int have = a->end - *a->start; /* XXX here we check for max length */ int profile_len = sizeof(struct dn_profile); if (p == NULL) return 0; if (have < profile_len) { D("error have %d need %d", have, profile_len); return 1; } bcopy(p, *a->start, profile_len); ((struct dn_id *)(*a->start))->len = profile_len; *a->start += profile_len; return 0; } static int copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags) { struct dn_fs *ufs = (struct dn_fs *)(*a->start); if (!fs) return 0; ND("flowset %d", fs->fs.fs_nr); if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr)) return DNHT_SCAN_END; ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ? dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0); if (flags) { /* copy queues */ copy_q(a, fs, 0); } return 0; } static int copy_si_cb(void *obj, void *arg) { struct dn_sch_inst *si = obj; struct copy_args *a = arg; struct dn_flow *ni = (struct dn_flow *)(*a->start); if (copy_obj(a->start, a->end, &si->ni, "inst", si->sched->sch.sched_nr)) return DNHT_SCAN_END; ni->oid.type = DN_FLOW; /* override the DN_SCH_I */ ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL); return 0; } static int copy_si(struct copy_args *a, struct dn_schk *s, int flags) { if (s->sch.flags & DN_HAVE_MASK) dn_ht_scan(s->siht, copy_si_cb, a); else if (s->siht) copy_si_cb(s->siht, a); return 0; } /* * compute a list of children of a scheduler and copy up */ static int copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags) { struct dn_fsk *fs; struct dn_id *o; uint32_t *p; int n = 0, space = sizeof(*o); SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { if (fs->fs.fs_nr < DN_MAX_ID) n++; } space += n * sizeof(uint32_t); DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n); if (a->end - *(a->start) < space) return DNHT_SCAN_END; o = (struct dn_id *)(*(a->start)); o->len = space; *a->start += o->len; o->type = DN_TEXT; p = (uint32_t *)(o+1); SLIST_FOREACH(fs, &s->fsk_list, sch_chain) if (fs->fs.fs_nr < DN_MAX_ID) *p++ = fs->fs.fs_nr; return 0; } static int copy_data_helper(void *_o, void *_arg) { struct copy_args *a = _arg; uint32_t *r = a->extra->r; /* start of first range */ uint32_t *lim; /* first invalid pointer */ int n; lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len); if (a->type == DN_LINK || a->type == DN_SCH) { /* pipe|sched show, we receive a dn_schk */ struct dn_schk *s = _o; n = s->sch.sched_nr; if (a->type == DN_SCH && n >= DN_MAX_ID) return 0; /* not a scheduler */ if (a->type == DN_LINK && n <= DN_MAX_ID) return 0; /* not a pipe */ /* see if the object is within one of our ranges */ for (;r < lim; r += 2) { if (n < r[0] || n > r[1]) continue; /* Found a valid entry, copy and we are done */ if (a->flags & DN_C_LINK) { if (copy_obj(a->start, a->end, &s->link, "link", n)) return DNHT_SCAN_END; if (copy_profile(a, s->profile)) return DNHT_SCAN_END; if (copy_flowset(a, s->fs, 0)) return DNHT_SCAN_END; } if (a->flags & DN_C_SCH) { if (copy_obj(a->start, a->end, &s->sch, "sched", n)) return DNHT_SCAN_END; /* list all attached flowsets */ if (copy_fsk_list(a, s, 0)) return DNHT_SCAN_END; } if (a->flags & DN_C_FLOW) copy_si(a, s, 0); break; } } else if (a->type == DN_FS) { /* queue show, skip internal flowsets */ struct dn_fsk *fs = _o; n = fs->fs.fs_nr; if (n >= DN_MAX_ID) return 0; /* see if the object is within one of our ranges */ for (;r < lim; r += 2) { if (n < r[0] || n > r[1]) continue; if (copy_flowset(a, fs, 0)) return DNHT_SCAN_END; copy_q(a, fs, 0); break; /* we are done */ } } return 0; } static inline struct dn_schk * locate_scheduler(int i) { return dn_ht_find(dn_cfg.schedhash, i, 0, NULL); } /* * red parameters are in fixed point arithmetic. */ static int config_red(struct dn_fsk *fs) { int64_t s, idle, weight, w0; int t, i; fs->w_q = fs->fs.w_q; fs->max_p = fs->fs.max_p; ND("called"); /* Doing stuff that was in userland */ i = fs->sched->link.bandwidth; s = (i <= 0) ? 0 : hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i; idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */ fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth); /* fs->lookup_step not scaled, */ if (!fs->lookup_step) fs->lookup_step = 1; w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled for (t = fs->lookup_step; t > 1; --t) weight = SCALE_MUL(weight, w0); fs->lookup_weight = (int)(weight); // scaled /* Now doing stuff that was in kerneland */ fs->min_th = SCALE(fs->fs.min_th); fs->max_th = SCALE(fs->fs.max_th); fs->c_1 = fs->max_p / (fs->fs.max_th - fs->fs.min_th); fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th)); if (fs->fs.flags & DN_IS_GENTLE_RED) { fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th; fs->c_4 = SCALE(1) - 2 * fs->max_p; } /* If the lookup table already exist, free and create it again. */ if (fs->w_q_lookup) { free(fs->w_q_lookup, M_DUMMYNET); fs->w_q_lookup = NULL; } if (dn_cfg.red_lookup_depth == 0) { printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth" "must be > 0\n"); fs->fs.flags &= ~DN_IS_RED; fs->fs.flags &= ~DN_IS_GENTLE_RED; return (EINVAL); } fs->lookup_depth = dn_cfg.red_lookup_depth; fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int), M_DUMMYNET, M_NOWAIT); if (fs->w_q_lookup == NULL) { printf("dummynet: sorry, cannot allocate red lookup table\n"); fs->fs.flags &= ~DN_IS_RED; fs->fs.flags &= ~DN_IS_GENTLE_RED; return(ENOSPC); } /* Fill the lookup table with (1 - w_q)^x */ fs->w_q_lookup[0] = SCALE(1) - fs->w_q; for (i = 1; i < fs->lookup_depth; i++) fs->w_q_lookup[i] = SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight); if (dn_cfg.red_avg_pkt_size < 1) dn_cfg.red_avg_pkt_size = 512; fs->avg_pkt_size = dn_cfg.red_avg_pkt_size; if (dn_cfg.red_max_pkt_size < 1) dn_cfg.red_max_pkt_size = 1500; fs->max_pkt_size = dn_cfg.red_max_pkt_size; ND("exit"); return 0; } /* Scan all flowset attached to this scheduler and update red */ static void update_red(struct dn_schk *s) { struct dn_fsk *fs; SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { if (fs && (fs->fs.flags & DN_IS_RED)) config_red(fs); } } /* attach flowset to scheduler s, possibly requeue */ static void fsk_attach(struct dn_fsk *fs, struct dn_schk *s) { ND("remove fs %d from fsunlinked, link to sched %d", fs->fs.fs_nr, s->sch.sched_nr); SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain); fs->sched = s; SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain); if (s->fp->new_fsk) s->fp->new_fsk(fs); /* XXX compute fsk_mask */ fs->fsk_mask = fs->fs.flow_mask; if (fs->sched->sch.flags & DN_HAVE_MASK) flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask); if (fs->qht) { /* * we must drain qht according to the old * type, and reinsert according to the new one. * The requeue is complex -- in general we need to * reclassify every single packet. * For the time being, let's hope qht is never set * when we reach this point. */ D("XXX TODO requeue from fs %d to sch %d", fs->fs.fs_nr, s->sch.sched_nr); fs->qht = NULL; } /* set the new type for qht */ if (nonzero_mask(&fs->fsk_mask)) fs->fs.flags |= DN_QHT_HASH; else fs->fs.flags &= ~DN_QHT_HASH; /* XXX config_red() can fail... */ if (fs->fs.flags & DN_IS_RED) config_red(fs); } /* update all flowsets which may refer to this scheduler */ static void update_fs(struct dn_schk *s) { struct dn_fsk *fs, *tmp; SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) { if (s->sch.sched_nr != fs->fs.sched_nr) { D("fs %d for sch %d not %d still unlinked", fs->fs.fs_nr, fs->fs.sched_nr, s->sch.sched_nr); continue; } fsk_attach(fs, s); } } /* * Configuration -- to preserve backward compatibility we use * the following scheme (N is 65536) * NUMBER SCHED LINK FLOWSET * 1 .. N-1 (1)WFQ (2)WFQ (3)queue * N+1 .. 2N-1 (4)FIFO (5)FIFO (6)FIFO for sched 1..N-1 * 2N+1 .. 3N-1 -- -- (7)FIFO for sched N+1..2N-1 * * "pipe i config" configures #1, #2 and #3 * "sched i config" configures #1 and possibly #6 * "queue i config" configures #3 * #1 is configured with 'pipe i config' or 'sched i config' * #2 is configured with 'pipe i config', and created if not * existing with 'sched i config' * #3 is configured with 'queue i config' * #4 is automatically configured after #1, can only be FIFO * #5 is automatically configured after #2 * #6 is automatically created when #1 is !MULTIQUEUE, * and can be updated. * #7 is automatically configured after #2 */ /* * configure a link (and its FIFO instance) */ static int config_link(struct dn_link *p, struct dn_id *arg) { int i; if (p->oid.len != sizeof(*p)) { D("invalid pipe len %d", p->oid.len); return EINVAL; } i = p->link_nr; if (i <= 0 || i >= DN_MAX_ID) return EINVAL; /* * The config program passes parameters as follows: * bw = bits/second (0 means no limits), * delay = ms, must be translated into ticks. * qsize = slots/bytes * burst ??? */ p->delay = (p->delay * hz) / 1000; /* Scale burst size: bytes -> bits * hz */ p->burst *= 8 * hz; DN_BH_WLOCK(); /* do it twice, base link and FIFO link */ for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { struct dn_schk *s = locate_scheduler(i); if (s == NULL) { DN_BH_WUNLOCK(); D("sched %d not found", i); return EINVAL; } /* remove profile if exists */ if (s->profile) { free(s->profile, M_DUMMYNET); s->profile = NULL; } /* copy all parameters */ s->link.oid = p->oid; s->link.link_nr = i; s->link.delay = p->delay; if (s->link.bandwidth != p->bandwidth) { /* XXX bandwidth changes, need to update red params */ s->link.bandwidth = p->bandwidth; update_red(s); } s->link.burst = p->burst; schk_reset_credit(s); } dn_cfg.id++; DN_BH_WUNLOCK(); return 0; } /* * configure a flowset. Can be called from inside with locked=1, */ static struct dn_fsk * config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked) { int i; struct dn_fsk *fs; if (nfs->oid.len != sizeof(*nfs)) { D("invalid flowset len %d", nfs->oid.len); return NULL; } i = nfs->fs_nr; if (i <= 0 || i >= 3*DN_MAX_ID) return NULL; ND("flowset %d", i); /* XXX other sanity checks */ if (nfs->flags & DN_QSIZE_BYTES) { ipdn_bound_var(&nfs->qsize, 16384, 1500, dn_cfg.byte_limit, NULL); // "queue byte size"); } else { ipdn_bound_var(&nfs->qsize, 50, 1, dn_cfg.slot_limit, NULL); // "queue slot size"); } if (nfs->flags & DN_HAVE_MASK) { /* make sure we have some buckets */ ipdn_bound_var((int *)&nfs->buckets, dn_cfg.hash_size, 1, dn_cfg.max_hash_size, "flowset buckets"); } else { nfs->buckets = 1; /* we only need 1 */ } if (!locked) DN_BH_WLOCK(); if (dn_cfg.fshash == NULL) dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size, offsetof(struct dn_fsk, fsk_next), fsk_hash, fsk_match, fsk_new); do { /* exit with break when done */ struct dn_schk *s; int flags = nfs->sched_nr ? DNHT_INSERT : 0; int j; int oldc = dn_cfg.fsk_count; fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL); if (fs == NULL) { D("missing sched for flowset %d", i); break; } /* grab some defaults from the existing one */ if (nfs->sched_nr == 0) /* reuse */ nfs->sched_nr = fs->fs.sched_nr; for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) { if (nfs->par[j] == -1) /* reuse */ nfs->par[j] = fs->fs.par[j]; } if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) { ND("flowset %d unchanged", i); break; /* no change, nothing to do */ } if (oldc != dn_cfg.fsk_count) /* new item */ dn_cfg.id++; s = locate_scheduler(nfs->sched_nr); /* detach from old scheduler if needed, preserving * queues if we need to reattach. Then update the * configuration, and possibly attach to the new sched. */ DX(2, "fs %d changed sched %d@%p to %d@%p", fs->fs.fs_nr, fs->fs.sched_nr, fs->sched, nfs->sched_nr, s); if (fs->sched) { int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY); flags |= DN_DESTROY; /* XXX temporary */ fsk_detach(fs, flags); } fs->fs = *nfs; /* copy configuration */ if (s != NULL) fsk_attach(fs, s); } while (0); if (!locked) DN_BH_WUNLOCK(); return fs; } /* * config/reconfig a scheduler and its FIFO variant. * For !MULTIQUEUE schedulers, also set up the flowset. * * On reconfigurations (detected because s->fp is set), * detach existing flowsets preserving traffic, preserve link, * and delete the old scheduler creating a new one. */ static int config_sched(struct dn_sch *_nsch, struct dn_id *arg) { struct dn_schk *s; struct schk_new_arg a; /* argument for schk_new */ int i; struct dn_link p; /* copy of oldlink */ struct dn_profile *pf = NULL; /* copy of old link profile */ /* Used to preserv mask parameter */ struct ipfw_flow_id new_mask; int new_buckets = 0; int new_flags = 0; int pipe_cmd; int err = ENOMEM; a.sch = _nsch; if (a.sch->oid.len != sizeof(*a.sch)) { D("bad sched len %d", a.sch->oid.len); return EINVAL; } i = a.sch->sched_nr; if (i <= 0 || i >= DN_MAX_ID) return EINVAL; /* make sure we have some buckets */ if (a.sch->flags & DN_HAVE_MASK) ipdn_bound_var((int *)&a.sch->buckets, dn_cfg.hash_size, 1, dn_cfg.max_hash_size, "sched buckets"); /* XXX other sanity checks */ bzero(&p, sizeof(p)); pipe_cmd = a.sch->flags & DN_PIPE_CMD; a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if is not set? if (pipe_cmd) { /* Copy mask parameter */ new_mask = a.sch->sched_mask; new_buckets = a.sch->buckets; new_flags = a.sch->flags; } DN_BH_WLOCK(); if (dn_cfg.schedhash == NULL) dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size, offsetof(struct dn_schk, schk_next), schk_hash, schk_match, schk_new); again: /* run twice, for wfq and fifo */ /* * lookup the type. If not supplied, use the previous one * or default to WF2Q+. Otherwise, return an error. */ dn_cfg.id++; a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name); if (a.fp != NULL) { /* found. Lookup or create entry */ s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a); } else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) { /* No type. search existing s* or retry with WF2Q+ */ s = dn_ht_find(dn_cfg.schedhash, i, 0, &a); if (s != NULL) { a.fp = s->fp; /* Scheduler exists, skip to FIFO scheduler * if command was pipe config... */ if (pipe_cmd) goto next; } else { /* New scheduler, create a wf2q+ with no mask * if command was pipe config... */ if (pipe_cmd) { /* clear mask parameter */ bzero(&a.sch->sched_mask, sizeof(new_mask)); a.sch->buckets = 0; a.sch->flags &= ~DN_HAVE_MASK; } a.sch->oid.subtype = DN_SCHED_WF2QP; goto again; } } else { D("invalid scheduler type %d %s", a.sch->oid.subtype, a.sch->name); err = EINVAL; goto error; } /* normalize name and subtype */ a.sch->oid.subtype = a.fp->type; bzero(a.sch->name, sizeof(a.sch->name)); strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name)); if (s == NULL) { D("cannot allocate scheduler %d", i); goto error; } /* restore existing link if any */ if (p.link_nr) { s->link = p; if (!pf || pf->link_nr != p.link_nr) { /* no saved value */ s->profile = NULL; /* XXX maybe not needed */ } else { size_t pf_size = sizeof(struct dn_profile) + s->profile->samples_no * sizeof(int); s->profile = malloc(pf_size, M_DUMMYNET, M_NOWAIT | M_ZERO); if (s->profile == NULL) { D("cannot allocate profile"); goto error; //XXX } bcopy(pf, s->profile, pf_size); } } p.link_nr = 0; if (s->fp == NULL) { DX(2, "sched %d new type %s", i, a.fp->name); } else if (s->fp != a.fp || bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) { /* already existing. */ DX(2, "sched %d type changed from %s to %s", i, s->fp->name, a.fp->name); DX(4, " type/sub %d/%d -> %d/%d", s->sch.oid.type, s->sch.oid.subtype, a.sch->oid.type, a.sch->oid.subtype); if (s->link.link_nr == 0) D("XXX WARNING link 0 for sched %d", i); p = s->link; /* preserve link */ if (s->profile) {/* preserve profile */ if (!pf) pf = malloc(sizeof(*pf), M_DUMMYNET, M_NOWAIT | M_ZERO); if (pf) /* XXX should issue a warning otherwise */ bcopy(s->profile, pf, sizeof(*pf)); } /* remove from the hash */ dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); /* Detach flowsets, preserve queues. */ // schk_delete_cb(s, NULL); // XXX temporarily, kill queues schk_delete_cb(s, (void *)DN_DESTROY); goto again; } else { DX(4, "sched %d unchanged type %s", i, a.fp->name); } /* complete initialization */ s->sch = *a.sch; s->fp = a.fp; s->cfg = arg; // XXX schk_reset_credit(s); /* create the internal flowset if needed, * trying to reuse existing ones if available */ if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) { s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL); if (!s->fs) { struct dn_fs fs; bzero(&fs, sizeof(fs)); set_oid(&fs.oid, DN_FS, sizeof(fs)); fs.fs_nr = i + DN_MAX_ID; fs.sched_nr = i; s->fs = config_fs(&fs, NULL, 1 /* locked */); } if (!s->fs) { schk_delete_cb(s, (void *)DN_DESTROY); D("error creating internal fs for %d", i); goto error; } } /* call init function after the flowset is created */ if (s->fp->config) s->fp->config(s); update_fs(s); next: if (i < DN_MAX_ID) { /* now configure the FIFO instance */ i += DN_MAX_ID; if (pipe_cmd) { /* Restore mask parameter for FIFO */ a.sch->sched_mask = new_mask; a.sch->buckets = new_buckets; a.sch->flags = new_flags; } else { /* sched config shouldn't modify the FIFO scheduler */ if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) { /* FIFO already exist, don't touch it */ err = 0; /* and this is not an error */ goto error; } } a.sch->sched_nr = i; a.sch->oid.subtype = DN_SCHED_FIFO; bzero(a.sch->name, sizeof(a.sch->name)); goto again; } err = 0; error: DN_BH_WUNLOCK(); if (pf) free(pf, M_DUMMYNET); return err; } /* * attach a profile to a link */ static int config_profile(struct dn_profile *pf, struct dn_id *arg) { struct dn_schk *s; int i, olen, err = 0; if (pf->oid.len < sizeof(*pf)) { D("short profile len %d", pf->oid.len); return EINVAL; } i = pf->link_nr; if (i <= 0 || i >= DN_MAX_ID) return EINVAL; /* XXX other sanity checks */ DN_BH_WLOCK(); for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { s = locate_scheduler(i); if (s == NULL) { err = EINVAL; break; } dn_cfg.id++; /* * If we had a profile and the new one does not fit, * or it is deleted, then we need to free memory. */ if (s->profile && (pf->samples_no == 0 || s->profile->oid.len < pf->oid.len)) { free(s->profile, M_DUMMYNET); s->profile = NULL; } if (pf->samples_no == 0) continue; /* * new profile, possibly allocate memory * and copy data. */ if (s->profile == NULL) s->profile = malloc(pf->oid.len, M_DUMMYNET, M_NOWAIT | M_ZERO); if (s->profile == NULL) { D("no memory for profile %d", i); err = ENOMEM; break; } /* preserve larger length XXX double check */ olen = s->profile->oid.len; if (olen < pf->oid.len) olen = pf->oid.len; bcopy(pf, s->profile, pf->oid.len); s->profile->oid.len = olen; } DN_BH_WUNLOCK(); return err; } /* * Delete all objects: */ static void dummynet_flush(void) { /* delete all schedulers and related links/queues/flowsets */ dn_ht_scan(dn_cfg.schedhash, schk_delete_cb, (void *)(uintptr_t)DN_DELETE_FS); /* delete all remaining (unlinked) flowsets */ DX(4, "still %d unlinked fs", dn_cfg.fsk_count); dn_ht_free(dn_cfg.fshash, DNHT_REMOVE); fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS); dn_ht_free(dn_cfg.schedhash, DNHT_REMOVE); /* Reinitialize system heap... */ heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); } /* * Main handler for configuration. We are guaranteed to be called * with an oid which is at least a dn_id. * - the first object is the command (config, delete, flush, ...) * - config_link must be issued after the corresponding config_sched * - parameters (DN_TXT) for an object must preceed the object * processed on a config_sched. */ int do_config(void *p, int l) { struct dn_id *next, *o; int err = 0, err2 = 0; struct dn_id *arg = NULL; uintptr_t *a; o = p; if (o->id != DN_API_VERSION) { D("invalid api version got %d need %d", o->id, DN_API_VERSION); return EINVAL; } for (; l >= sizeof(*o); o = next) { struct dn_id *prev = arg; if (o->len < sizeof(*o) || l < o->len) { D("bad len o->len %d len %d", o->len, l); err = EINVAL; break; } l -= o->len; next = (struct dn_id *)((char *)o + o->len); err = 0; switch (o->type) { default: D("cmd %d not implemented", o->type); break; #ifdef EMULATE_SYSCTL /* sysctl emulation. * if we recognize the command, jump to the correct * handler and return */ case DN_SYSCTL_SET: err = kesysctl_emu_set(p, l); return err; #endif case DN_CMD_CONFIG: /* simply a header */ break; case DN_CMD_DELETE: /* the argument is in the first uintptr_t after o */ a = (uintptr_t *)(o+1); if (o->len < sizeof(*o) + sizeof(*a)) { err = EINVAL; break; } switch (o->subtype) { case DN_LINK: /* delete base and derived schedulers */ DN_BH_WLOCK(); err = delete_schk(*a); err2 = delete_schk(*a + DN_MAX_ID); DN_BH_WUNLOCK(); if (!err) err = err2; break; default: D("invalid delete type %d", o->subtype); err = EINVAL; break; case DN_FS: err = (*a <1 || *a >= DN_MAX_ID) ? EINVAL : delete_fs(*a, 0) ; break; } break; case DN_CMD_FLUSH: DN_BH_WLOCK(); dummynet_flush(); DN_BH_WUNLOCK(); break; case DN_TEXT: /* store argument the next block */ prev = NULL; arg = o; break; case DN_LINK: err = config_link((struct dn_link *)o, arg); break; case DN_PROFILE: err = config_profile((struct dn_profile *)o, arg); break; case DN_SCH: err = config_sched((struct dn_sch *)o, arg); break; case DN_FS: err = (NULL==config_fs((struct dn_fs *)o, arg, 0)); break; } if (prev) arg = NULL; if (err != 0) break; } return err; } static int compute_space(struct dn_id *cmd, struct copy_args *a) { int x = 0, need = 0; int profile_size = sizeof(struct dn_profile); /* NOTE about compute space: * NP = dn_cfg.schk_count * NSI = dn_cfg.si_count * NF = dn_cfg.fsk_count * NQ = dn_cfg.queue_count * - ipfw pipe show * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler * link, scheduler template, flowset * integrated in scheduler and header * for flowset list * (NSI)*(dn_flow) all scheduler instance (includes * the queue instance) * - ipfw sched show * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler * link, scheduler template, flowset * integrated in scheduler and header * for flowset list * (NSI * dn_flow) all scheduler instances * (NF * sizeof(uint_32)) space for flowset list linked to scheduler * (NQ * dn_queue) all queue [XXXfor now not listed] * - ipfw queue show * (NF * dn_fs) all flowset * (NQ * dn_queue) all queues */ switch (cmd->subtype) { default: return -1; /* XXX where do LINK and SCH differ ? */ /* 'ipfw sched show' could list all queues associated to * a scheduler. This feature for now is disabled */ case DN_LINK: /* pipe show */ x = DN_C_LINK | DN_C_SCH | DN_C_FLOW; need += dn_cfg.schk_count * (sizeof(struct dn_fs) + profile_size) / 2; need += dn_cfg.fsk_count * sizeof(uint32_t); break; case DN_SCH: /* sched show */ need += dn_cfg.schk_count * (sizeof(struct dn_fs) + profile_size) / 2; need += dn_cfg.fsk_count * sizeof(uint32_t); x = DN_C_SCH | DN_C_LINK | DN_C_FLOW; break; case DN_FS: /* queue show */ x = DN_C_FS | DN_C_QUEUE; break; case DN_GET_COMPAT: /* compatibility mode */ need = dn_compat_calc_size(); break; } a->flags = x; if (x & DN_C_SCH) { need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2; /* NOT also, each fs might be attached to a sched */ need += dn_cfg.schk_count * sizeof(struct dn_id) / 2; } if (x & DN_C_FS) need += dn_cfg.fsk_count * sizeof(struct dn_fs); if (x & DN_C_LINK) { need += dn_cfg.schk_count * sizeof(struct dn_link) / 2; } /* * When exporting a queue to userland, only pass up the * struct dn_flow, which is the only visible part. */ if (x & DN_C_QUEUE) need += dn_cfg.queue_count * sizeof(struct dn_flow); if (x & DN_C_FLOW) need += dn_cfg.si_count * (sizeof(struct dn_flow)); return need; } /* * If compat != NULL dummynet_get is called in compatibility mode. * *compat will be the pointer to the buffer to pass to ipfw */ int dummynet_get(struct sockopt *sopt, void **compat) { int have, i, need, error; char *start = NULL, *buf; size_t sopt_valsize; struct dn_id *cmd; struct copy_args a; struct copy_range r; int l = sizeof(struct dn_id); bzero(&a, sizeof(a)); bzero(&r, sizeof(r)); /* save and restore original sopt_valsize around copyin */ sopt_valsize = sopt->sopt_valsize; cmd = &r.o; if (!compat) { /* copy at least an oid, and possibly a full object */ error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd)); sopt->sopt_valsize = sopt_valsize; if (error) goto done; l = cmd->len; #ifdef EMULATE_SYSCTL /* sysctl emulation. */ if (cmd->type == DN_SYSCTL_GET) return kesysctl_emu_get(sopt); #endif if (l > sizeof(r)) { /* request larger than default, allocate buffer */ cmd = malloc(l, M_DUMMYNET, M_WAITOK); error = sooptcopyin(sopt, cmd, l, l); sopt->sopt_valsize = sopt_valsize; if (error) goto done; } } else { /* compatibility */ error = 0; cmd->type = DN_CMD_GET; cmd->len = sizeof(struct dn_id); cmd->subtype = DN_GET_COMPAT; // cmd->id = sopt_valsize; D("compatibility mode"); } a.extra = (struct copy_range *)cmd; if (cmd->len == sizeof(*cmd)) { /* no range, create a default */ uint32_t *rp = (uint32_t *)(cmd + 1); cmd->len += 2* sizeof(uint32_t); rp[0] = 1; rp[1] = DN_MAX_ID - 1; if (cmd->subtype == DN_LINK) { rp[0] += DN_MAX_ID; rp[1] += DN_MAX_ID; } } /* Count space (under lock) and allocate (outside lock). * Exit with lock held if we manage to get enough buffer. * Try a few times then give up. */ for (have = 0, i = 0; i < 10; i++) { DN_BH_WLOCK(); need = compute_space(cmd, &a); /* if there is a range, ignore value from compute_space() */ if (l > sizeof(*cmd)) need = sopt_valsize - sizeof(*cmd); if (need < 0) { DN_BH_WUNLOCK(); error = EINVAL; goto done; } need += sizeof(*cmd); cmd->id = need; if (have >= need) break; DN_BH_WUNLOCK(); if (start) free(start, M_DUMMYNET); start = NULL; if (need > sopt_valsize) break; have = need; start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO); } if (start == NULL) { if (compat) { *compat = NULL; error = 1; // XXX } else { error = sooptcopyout(sopt, cmd, sizeof(*cmd)); } goto done; } ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, " "%d:%d si %d, %d:%d queues %d", dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH, dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK, dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS, dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I, dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE); sopt->sopt_valsize = sopt_valsize; a.type = cmd->subtype; if (compat == NULL) { bcopy(cmd, start, sizeof(*cmd)); ((struct dn_id*)(start))->len = sizeof(struct dn_id); buf = start + sizeof(*cmd); } else buf = start; a.start = &buf; a.end = start + have; /* start copying other objects */ if (compat) { a.type = DN_COMPAT_PIPE; dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a); a.type = DN_COMPAT_QUEUE; dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a); } else if (a.type == DN_FS) { dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a); } else { dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a); } DN_BH_WUNLOCK(); if (compat) { *compat = start; sopt->sopt_valsize = buf - start; /* free() is done by ip_dummynet_compat() */ start = NULL; //XXX hack } else { error = sooptcopyout(sopt, start, buf - start); } done: if (cmd && cmd != &r.o) free(cmd, M_DUMMYNET); if (start) free(start, M_DUMMYNET); return error; } /* * Functions to drain idle objects -- see dummynet_task() for some notes */ /* Callback called on scheduler instance to delete it if idle */ static int drain_scheduler_cb(void *_si, void *_arg) { struct dn_sch_inst *si = _si; int *arg = _arg; int empty; if ( (*arg++) > dn_cfg.expire_object_examined) return DNHT_SCAN_END; if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL) return 0; /* * if the scheduler is multiqueue, q_count also reflects empty * queues that point to si, so we need to check si->q_count to * tell whether we can remove the instance. */ if (si->ni.length == 0) { /* si was marked as idle: * remove it or increment idle_si_wait counter */ empty = (si->sched->fp->flags & DN_MULTIQUEUE) ? (si->q_count == 0) : 1; if (empty && (si->idle_time < dn_cfg.curr_time - dn_cfg.object_idle_tick)) return si_destroy(si, NULL); else dn_cfg.idle_si_wait++; } return 0; } /* Callback called on scheduler to check if it has instances */ static int drain_scheduler_sch_cb(void *_s, void *_arg) { struct dn_schk *s = _s; int *arg = _arg; if (s->sch.flags & DN_HAVE_MASK) { dn_ht_scan_bucket(s->siht, &s->drain_bucket, drain_scheduler_cb, _arg); } else { if (s->siht) { if (drain_scheduler_cb(s->siht, _arg) == DNHT_SCAN_DEL) s->siht = NULL; } } return ( (*arg++) > dn_cfg.expire_object_examined) ? DNHT_SCAN_END : 0; } /* Called every tick, try to delete a 'bucket' of scheduler */ void dn_drain_scheduler(void) { int arg = 0; dn_ht_scan_bucket(dn_cfg.schedhash, (int *)&dn_cfg.drain_sch, drain_scheduler_sch_cb, &arg); } /* Callback called on queue to delete if it is idle */ static int drain_queue_cb(void *_q, void *_arg) { struct dn_queue *q = _q; int *arg = _arg; if ( (*arg++) > dn_cfg.expire_object_examined) return DNHT_SCAN_END; if (q->ni.length == 0) { if (q->q_time < dn_cfg.curr_time - dn_cfg.object_idle_tick) { if (dn_delete_queue(q, DN_DESTROY | DN_DEL_SAFE) == 0) return DNHT_SCAN_DEL; /* queue is deleted */ } else dn_cfg.idle_queue_wait++; } return 0; /* queue isn't deleted */ } /* Callback called on flowset used to check if it has queues */ static int drain_queue_fs_cb(void *_fs, void *_arg) { struct dn_fsk *fs = _fs; int *arg = _arg; if (fs->fs.flags & DN_QHT_HASH) { /* Flowset has a hash table for queues */ dn_ht_scan_bucket(fs->qht, &fs->drain_bucket, drain_queue_cb, _arg); } else { /* No hash table for this flowset, null the pointer * if the queue is deleted */ if (fs->qht) { if (drain_queue_cb(fs->qht, _arg) == DNHT_SCAN_DEL) fs->qht = NULL; } } return ( (*arg++) > dn_cfg.expire_object_examined) ? DNHT_SCAN_END : 0; } /* Called every tick, try to delete a 'bucket' of queue */ void dn_drain_queue(void) { int arg = 0; /* scan a bucket of flowset */ dn_ht_scan_bucket(dn_cfg.fshash, (int *)&dn_cfg.drain_fs, drain_queue_fs_cb, &arg); } /* * Handler for the various dummynet socket options */ static int ip_dn_ctl(struct sockopt *sopt) { void *p = NULL; int error, l; error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET); if (error) return (error); /* Disallow sets in really-really secure mode. */ if (sopt->sopt_dir == SOPT_SET) { error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error) return (error); } switch (sopt->sopt_name) { default : D("dummynet: unknown option %d", sopt->sopt_name); error = EINVAL; break; case IP_DUMMYNET_FLUSH: case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: /* remove a pipe or queue */ case IP_DUMMYNET_GET: D("dummynet: compat option %d", sopt->sopt_name); error = ip_dummynet_compat(sopt); break; case IP_DUMMYNET3 : if (sopt->sopt_dir == SOPT_GET) { error = dummynet_get(sopt, NULL); break; } l = sopt->sopt_valsize; if (l < sizeof(struct dn_id) || l > 12000) { D("argument len %d invalid", l); break; } p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ? error = sooptcopyin(sopt, p, l, l); if (error) break ; error = do_config(p, l); break; } if (p != NULL) free(p, M_TEMP); return error ; } static void ip_dn_init(void) { if (dn_cfg.init_done) return; printf("DUMMYNET %p with IPv6 initialized (100409)\n", curvnet); dn_cfg.init_done = 1; /* Set defaults here. MSVC does not accept initializers, * and this is also useful for vimages */ /* queue limits */ dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */ dn_cfg.byte_limit = 1024 * 1024; dn_cfg.expire = 1; /* RED parameters */ dn_cfg.red_lookup_depth = 256; /* default lookup table depth */ dn_cfg.red_avg_pkt_size = 512; /* default medium packet size */ dn_cfg.red_max_pkt_size = 1500; /* default max packet size */ /* hash tables */ dn_cfg.max_hash_size = 1024; /* max in the hash tables */ if (dn_cfg.hash_size == 0) /* XXX or <= 0 ? */ dn_cfg.hash_size = 64; /* default hash size */ /* hash tables for schedulers and flowsets are created * when the first scheduler/flowset is inserted. * This is done to allow to use the right hash_size value. * When the last object is deleted, the table is destroyed, * so a new hash_size value can be used. * XXX rehash is not supported for now */ dn_cfg.schedhash = NULL; dn_cfg.fshash = NULL; /* bucket index to drain object */ dn_cfg.drain_fs = 0; dn_cfg.drain_sch = 0; if (dn_cfg.expire_object == 0) dn_cfg.expire_object = 50; if (dn_cfg.object_idle_tick == 0) dn_cfg.object_idle_tick = 1000; if (dn_cfg.expire_object_examined == 0) dn_cfg.expire_object_examined = 10; if (dn_cfg.drain_ratio == 0) dn_cfg.drain_ratio = 1; // XXX what if we don't have a tsc ? #ifdef HAVE_TSC dn_cfg.cycle_task_new = dn_cfg.cycle_task_old = readTSC(); #endif heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); SLIST_INIT(&dn_cfg.fsu); SLIST_INIT(&dn_cfg.schedlist); DN_LOCK_INIT(); TASK_INIT(&dn_task, 0, dummynet_task, curvnet); dn_tq = taskqueue_create_fast("dummynet", M_NOWAIT, taskqueue_thread_enqueue, &dn_tq); taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet"); callout_init(&dn_timeout, CALLOUT_MPSAFE); callout_reset_on(&dn_timeout, 1, dummynet, NULL, 0); /* Initialize curr_time adjustment mechanics. */ getmicrouptime(&dn_cfg.prev_t); } #ifdef KLD_MODULE static void ip_dn_destroy(int last) { callout_drain(&dn_timeout); DN_BH_WLOCK(); if (last) { ND("removing last instance\n"); ip_dn_ctl_ptr = NULL; ip_dn_io_ptr = NULL; } dummynet_flush(); DN_BH_WUNLOCK(); taskqueue_drain(dn_tq, &dn_task); taskqueue_free(dn_tq); dn_ht_free(dn_cfg.schedhash, 0); dn_ht_free(dn_cfg.fshash, 0); heap_free(&dn_cfg.evheap); DN_LOCK_DESTROY(); } #endif /* KLD_MODULE */ static int dummynet_modevent(module_t mod, int type, void *data) { if (type == MOD_LOAD) { if (ip_dn_io_ptr) { printf("DUMMYNET already loaded\n"); return EEXIST ; } ip_dn_init(); ip_dn_ctl_ptr = ip_dn_ctl; ip_dn_io_ptr = dummynet_io; return 0; } else if (type == MOD_UNLOAD) { #if !defined(KLD_MODULE) printf("dummynet statically compiled, cannot unload\n"); return EINVAL ; #else ip_dn_destroy(1 /* last */); return 0; #endif } else return EOPNOTSUPP; } /* modevent helpers for the modules */ static int load_dn_sched(struct dn_alg *d) { struct dn_alg *s; if (d == NULL) return 1; /* error */ ip_dn_init(); /* just in case, we need the lock */ /* Check that mandatory funcs exists */ if (d->enqueue == NULL || d->dequeue == NULL) { D("missing enqueue or dequeue for %s", d->name); return 1; } /* Search if scheduler already exists */ DN_BH_WLOCK(); SLIST_FOREACH(s, &dn_cfg.schedlist, next) { if (strcmp(s->name, d->name) == 0) { D("%s already loaded", d->name); break; /* scheduler already exists */ } } if (s == NULL) SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next); DN_BH_WUNLOCK(); D("dn_sched %s %sloaded", d->name, s ? "not ":""); return s ? 1 : 0; } static int unload_dn_sched(struct dn_alg *s) { struct dn_alg *tmp, *r; int err = EINVAL; ND("called for %s", s->name); DN_BH_WLOCK(); SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) { if (strcmp(s->name, r->name) != 0) continue; ND("ref_count = %d", r->ref_count); err = (r->ref_count != 0) ? EBUSY : 0; if (err == 0) SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next); break; } DN_BH_WUNLOCK(); D("dn_sched %s %sunloaded", s->name, err ? "not ":""); return err; } int dn_sched_modevent(module_t mod, int cmd, void *arg) { struct dn_alg *sch = arg; if (cmd == MOD_LOAD) return load_dn_sched(sch); else if (cmd == MOD_UNLOAD) return unload_dn_sched(sch); else return EINVAL; } static moduledata_t dummynet_mod = { "dummynet", dummynet_modevent, NULL }; #define DN_SI_SUB SI_SUB_PROTO_IFATTACHDOMAIN #define DN_MODEV_ORD (SI_ORDER_ANY - 128) /* after ipfw */ DECLARE_MODULE(dummynet, dummynet_mod, DN_SI_SUB, DN_MODEV_ORD); MODULE_DEPEND(dummynet, ipfw, 2, 2, 2); MODULE_VERSION(dummynet, 3); /* * Starting up. Done in order after dummynet_modevent() has been called. * VNET_SYSINIT is also called for each existing vnet and each new vnet. */ //VNET_SYSINIT(vnet_dn_init, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_init, NULL); /* * Shutdown handlers up shop. These are done in REVERSE ORDER, but still * after dummynet_modevent() has been called. Not called on reboot. * VNET_SYSUNINIT is also called for each exiting vnet as it exits. * or when the module is unloaded. */ //VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL); /* end of file */ ================================================ FILE: sys/netinet/ipfw/ip_fw2.c ================================================ /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw2.c 200601 2009-12-16 10:48:40Z luigi $"); /* * The FreeBSD IP packet firewall, main file */ #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_inet.h" #ifndef INET #error "IPFIREWALL requires INET" #endif /* INET */ #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include /* XXX for in_cksum */ #ifdef MAC #include #endif /* * static variables followed by global ones. * All ipfw global variables are here. */ /* ipfw_vnet_ready controls when we are open for business */ static VNET_DEFINE(int, ipfw_vnet_ready) = 0; #define V_ipfw_vnet_ready VNET(ipfw_vnet_ready) static VNET_DEFINE(int, fw_deny_unknown_exthdrs); #define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT static int default_to_accept = 1; #else static int default_to_accept; #endif VNET_DEFINE(int, autoinc_step); /* * Each rule belongs to one of 32 different sets (0..31). * The variable set_disable contains one bit per set. * If the bit is set, all rules in the corresponding set * are disabled. Set RESVD_SET(31) is reserved for the default rule * and rules that are not deleted by the flush command, * and CANNOT be disabled. * Rules in set RESVD_SET can only be deleted individually. */ VNET_DEFINE(u_int32_t, set_disable); #define V_set_disable VNET(set_disable) VNET_DEFINE(int, fw_verbose); /* counter for ipfw_log(NULL...) */ VNET_DEFINE(u_int64_t, norule_counter); VNET_DEFINE(int, verbose_limit); /* layer3_chain contains the list of rules for layer 3 */ VNET_DEFINE(struct ip_fw_chain, layer3_chain); ipfw_nat_t *ipfw_nat_ptr = NULL; struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_del_ptr; ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; #ifdef SYSCTL_NODE uint32_t dummy_def = IPFW_DEFAULT_RULE; uint32_t dummy_tables_max = IPFW_TABLES_MAX; SYSBEGIN(f3) SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, "Only do a single pass through ipfw when using dummynet(4)"); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, "Rule number auto-increment step"); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, "Log matches to ipfw rules"); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, "Set upper limit of matches of ipfw rules logged"); SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, &dummy_def, 0, "The default/max possible rule number."); SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD, &dummy_tables_max, 0, "The maximum number of tables."); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, &default_to_accept, 0, "Make the default rule accept all packets."); TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, "Number of static rules"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, "Deny packets with unknown IPv6 Extension Headers"); #endif /* INET6 */ SYSEND #endif /* SYSCTL_NODE */ /* * Some macros used in the various matching options. * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) static __inline int icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) { int type = icmp->icmp_type; return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<icmp_type; return (type <= ICMP_MAXTYPE && (TT & (1<arg1 or cmd->d[0]. * * We scan options and store the bits we find set. We succeed if * * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear * * The code is sometimes optimized not to store additional variables. */ static int flags_match(ipfw_insn *cmd, u_int8_t bits) { u_char want_clear; bits = ~bits; if ( ((cmd->arg1 & 0xff) & bits) != 0) return 0; /* some bits we want set were clear */ want_clear = (cmd->arg1 >> 8) & 0xff; if ( (want_clear & bits) != want_clear) return 0; /* some bits we want clear were set */ return 1; } static int ipopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(ip + 1); int x = (ip->ip_hl << 2) - sizeof (struct ip); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { optlen = cp[IPOPT_OLEN]; if (optlen <= 0 || optlen > x) return 0; /* invalid or truncated */ } switch (opt) { default: break; case IPOPT_LSRR: bits |= IP_FW_IPOPT_LSRR; break; case IPOPT_SSRR: bits |= IP_FW_IPOPT_SSRR; break; case IPOPT_RR: bits |= IP_FW_IPOPT_RR; break; case IPOPT_TS: bits |= IP_FW_IPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(tcp + 1); int x = (tcp->th_off << 2) - sizeof(struct tcphdr); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { optlen = cp[1]; if (optlen <= 0) break; } switch (opt) { default: break; case TCPOPT_MAXSEG: bits |= IP_FW_TCPOPT_MSS; break; case TCPOPT_WINDOW: bits |= IP_FW_TCPOPT_WINDOW; break; case TCPOPT_SACK_PERMITTED: case TCPOPT_SACK: bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_TIMESTAMP: bits |= IP_FW_TCPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) { if (ifp == NULL) /* no iface with this packet, match fails */ return 0; /* Check by name or by IP address */ if (cmd->name[0] != '\0') { /* match by name */ /* Check name */ if (cmd->p.glob) { if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) return(1); } else { if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) return(1); } } else { #ifdef __FreeBSD__ /* and OSX too ? */ struct ifaddr *ia; if_addr_rlock(ifp); TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET) continue; if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) { if_addr_runlock(ifp); return(1); /* match */ } } if_addr_runlock(ifp); #endif /* __FreeBSD__ */ } return(0); /* no match, fail ... */ } /* * The verify_path function checks if a route to the src exists and * if it is reachable via ifp (when provided). * * The 'verrevpath' option checks that the interface that an IP packet * arrives on is the same interface that traffic destined for the * packet's source address would be routed out of. * The 'versrcreach' option just checks that the source address is * reachable via any route (except default) in the routing table. * These two are a measure to block forged packets. This is also * commonly known as "anti-spoofing" or Unicast Reverse Path * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs * is purposely reminiscent of the Cisco IOS command, * * ip verify unicast reverse-path * ip verify unicast source reachable-via any * * which implements the same functionality. But note that the syntax * is misleading, and the check may be performed on all IP packets * whether unicast, multicast, or broadcast. */ static int verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) { #ifndef __FreeBSD__ return 0; #else struct route ro; struct sockaddr_in *dst; bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in *)&(ro.ro_dst); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = src; in_rtalloc_ign(&ro, 0, fib); if (ro.ro_rt == NULL) return 0; /* * If ifp is provided, check for equality with rtentry. * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * in order to pass packets injected back by if_simloop(): * if useloopback == 1 routing entry (via lo0) for our own address * may exist, so we need to handle routing assymetry. */ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { RTFREE(ro.ro_rt); return 0; } /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) { RTFREE(ro.ro_rt); return 0; } /* or if this is a blackhole/reject route */ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { RTFREE(ro.ro_rt); return 0; } /* found valid route */ RTFREE(ro.ro_rt); return 1; #endif /* __FreeBSD__ */ } #ifdef INET6 /* * ipv6 specific rules here... */ static __inline int icmp6type_match (int type, ipfw_insn_u32 *cmd) { return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); } static int flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) { int i; for (i=0; i <= cmd->o.arg1; ++i ) if (curr_flow == cmd->d[i] ) return 1; return 0; } /* support for IP6_*_ME opcodes */ static int search_ip6_addr_net (struct in6_addr * ip6_addr) { struct ifnet *mdc; struct ifaddr *mdc2; struct in6_ifaddr *fdm; struct in6_addr copia; TAILQ_FOREACH(mdc, &V_ifnet, if_link) { if_addr_rlock(mdc); TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) { if (mdc2->ifa_addr->sa_family == AF_INET6) { fdm = (struct in6_ifaddr *)mdc2; copia = fdm->ia_addr.sin6_addr; /* need for leaving scope_id in the sock_addr */ in6_clearscope(&copia); if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) { if_addr_runlock(mdc); return 1; } } } if_addr_runlock(mdc); } return 0; } static int verify_path6(struct in6_addr *src, struct ifnet *ifp) { struct route_in6 ro; struct sockaddr_in6 *dst; bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in6 * )&(ro.ro_dst); dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(*dst); dst->sin6_addr = *src; /* XXX MRT 0 for ipv6 at this time */ rtalloc_ign((struct route *)&ro, 0); if (ro.ro_rt == NULL) return 0; /* * if ifp is provided, check for equality with rtentry * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * to support the case of sending packets to an address of our own. * (where the former interface is the first argument of if_simloop() * (=ifp), the latter is lo0) */ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { RTFREE(ro.ro_rt); return 0; } /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) { RTFREE(ro.ro_rt); return 0; } /* or if this is a blackhole/reject route */ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { RTFREE(ro.ro_rt); return 0; } /* found valid route */ RTFREE(ro.ro_rt); return 1; } static int is_icmp6_query(int icmp6_type) { if ((icmp6_type <= ICMP6_MAXTYPE) && (icmp6_type == ICMP6_ECHO_REQUEST || icmp6_type == ICMP6_MEMBERSHIP_QUERY || icmp6_type == ICMP6_WRUREQUEST || icmp6_type == ICMP6_FQDN_QUERY || icmp6_type == ICMP6_NI_QUERY)) return (1); return (0); } static void send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) { struct mbuf *m; m = args->m; if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *tcp; tcp = (struct tcphdr *)((char *)ip6 + hlen); if ((tcp->th_flags & TH_RST) == 0) { struct mbuf *m0; m0 = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m0 != NULL) ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); } FREE_PKT(m); } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */ #if 0 /* * Unlike above, the mbufs need to line up with the ip6 hdr, * as the contents are read. We need to m_adj() the * needed amount. * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif icmp6_error(m, ICMP6_DST_UNREACH, code, 0); } else FREE_PKT(m); args->m = NULL; } #endif /* INET6 */ /* * sends a reject message, consuming the mbuf passed as an argument. */ static void send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) { #if 0 /* XXX When ip is not guaranteed to be at mtod() we will * need to account for this */ * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ /* We need the IP header in host order for icmp_error(). */ SET_HOST_IPLEN(ip); icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); } else if (args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = L3HDR(struct tcphdr, mtod(args->m, struct ip *)); if ( (tcp->th_flags & TH_RST) == 0) { struct mbuf *m; m = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); } FREE_PKT(args->m); } else FREE_PKT(args->m); args->m = NULL; } /* * Support for uid/gid/jail lookup. These tests are expensive * (because we may need to look into the list of active sockets) * so we cache the results. ugid_lookupp is 0 if we have not * yet done a lookup, 1 if we succeeded, and -1 if we tried * and failed. The function always returns the match value. * We could actually spare the variable and use *uc, setting * it to '(void *)check_uidgid if we have no info, NULL if * we tried and failed, or any other value if successful. */ static int check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, u_int16_t src_port, int *ugid_lookupp, struct ucred **uc, struct inpcb *inp) { #ifndef __FreeBSD__ return cred_check(insn, proto, oif, dst_ip, dst_port, src_ip, src_port, (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); #else /* FreeBSD */ struct inpcbinfo *pi; int wildcard; struct inpcb *pcb; int match; /* * Check to see if the UDP or TCP stack supplied us with * the PCB. If so, rather then holding a lock and looking * up the PCB, we can use the one that was supplied. */ if (inp && *ugid_lookupp == 0) { INP_LOCK_ASSERT(inp); if (inp->inp_socket != NULL) { *uc = crhold(inp->inp_cred); *ugid_lookupp = 1; } else *ugid_lookupp = -1; } /* * If we have already been here and the packet has no * PCB entry associated with it, then we can safely * assume that this is a no match. */ if (*ugid_lookupp == -1) return (0); if (proto == IPPROTO_TCP) { wildcard = 0; pi = &V_tcbinfo; } else if (proto == IPPROTO_UDP) { wildcard = INPLOOKUP_WILDCARD; pi = &V_udbinfo; } else return 0; match = 0; if (*ugid_lookupp == 0) { INP_INFO_RLOCK(pi); pcb = (oif) ? in_pcblookup_hash(pi, dst_ip, htons(dst_port), src_ip, htons(src_port), wildcard, oif) : in_pcblookup_hash(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), wildcard, NULL); if (pcb != NULL) { *uc = crhold(pcb->inp_cred); *ugid_lookupp = 1; } INP_INFO_RUNLOCK(pi); if (*ugid_lookupp == 0) { /* * We tried and failed, set the variable to -1 * so we will not try again on this packet. */ *ugid_lookupp = -1; return (0); } } if (insn->o.opcode == O_UID) match = ((*uc)->cr_uid == (uid_t)insn->d[0]); else if (insn->o.opcode == O_GID) match = groupmember((gid_t)insn->d[0], *uc); else if (insn->o.opcode == O_JAIL) match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); return match; #endif /* __FreeBSD__ */ } /* * Helper function to set args with info on the rule after the matching * one. slot is precise, whereas we guess rule_id as they are * assigned sequentially. */ static inline void set_match(struct ip_fw_args *args, int slot, struct ip_fw_chain *chain) { args->rule.chain_id = chain->id; args->rule.slot = slot + 1; /* we use 0 as a marker */ args->rule.rule_id = 1 + chain->map[slot]->id; args->rule.rulenum = chain->map[slot]->rulenum; } /* * The main check routine for the firewall. * * All arguments are in args so we can modify them and return them * back to the caller. * * Parameters: * * args->m (in/out) The packet; we set to NULL when/if we nuke it. * Starts with the IP header. * args->eh (in) Mac header if present, NULL for layer3 packet. * args->L3offset Number of bytes bypassed if we came from L2. * e.g. often sizeof(eh) ** NOTYET ** * args->oif Outgoing interface, NULL if packet is incoming. * The incoming interface is in the mbuf. (in) * args->divert_rule (in/out) * Skip up to the first rule past this rule number; * upon return, non-zero port number for divert or tee. * * args->rule Pointer to the last matching rule (in/out) * args->next_hop Socket we are forwarding to (out). * args->f_id Addresses grabbed from the packet (out) * args->rule.info a cookie depending on rule action * * Return value: * * IP_FW_PASS the packet must be accepted * IP_FW_DENY the packet must be dropped * IP_FW_DIVERT divert packet, port in m_tag * IP_FW_TEE tee packet, port in m_tag * IP_FW_DUMMYNET to dummynet, pipe in args->cookie * IP_FW_NETGRAPH into netgraph, cookie args->cookie * args->rule contains the matching rule, * args->rule.info has additional information. * */ int ipfw_chk(struct ip_fw_args *args) { /* * Local variables holding state while processing a packet: * * IMPORTANT NOTE: to speed up the processing of rules, there * are some assumption on the values of the variables, which * are documented here. Should you change them, please check * the implementation of the various instructions to make sure * that they still work. * * args->eh The MAC header. It is non-null for a layer2 * packet, it is NULL for a layer-3 packet. * **notyet** * args->L3offset Offset in the packet to the L3 (IP or equiv.) header. * * m | args->m Pointer to the mbuf, as received from the caller. * It may change if ipfw_chk() does an m_pullup, or if it * consumes the packet because it calls send_reject(). * XXX This has to change, so that ipfw_chk() never modifies * or consumes the buffer. * ip is the beginning of the ip(4 or 6) header. * Calculated by adding the L3offset to the start of data. * (Until we start using L3offset, the packet is * supposed to start with the ip header). */ struct mbuf *m = args->m; struct ip *ip = mtod(m, struct ip *); /* * For rules which contain uid/gid or jail constraints, cache * a copy of the users credentials after the pcb lookup has been * executed. This will speed up the processing of rules with * these types of constraints, as well as decrease contention * on pcb related locks. */ #ifndef __FreeBSD__ struct bsd_ucred ucred_cache; #else struct ucred *ucred_cache = NULL; #endif int ucred_lookup = 0; /* * oif | args->oif If NULL, ipfw_chk has been called on the * inbound path (ether_input, ip_input). * If non-NULL, ipfw_chk has been called on the outbound path * (ether_output, ip_output). */ struct ifnet *oif = args->oif; int f_pos = 0; /* index of current rule in the array */ int retval = 0; /* * hlen The length of the IP header. */ u_int hlen = 0; /* hlen >0 means we have an IP pkt */ /* * offset The offset of a fragment. offset != 0 means that * we have a fragment at this offset of an IPv4 packet. * offset == 0 means that (if this is an IPv4 packet) * this is the first or only fragment. * For IPv6 offset == 0 means there is no Fragment Header. * If offset != 0 for IPv6 always use correct mask to * get the correct offset because we add IP6F_MORE_FRAG * to be able to dectect the first fragment which would * otherwise have offset = 0. */ u_short offset = 0; /* * Local copies of addresses. They are only valid if we have * an IP packet. * * proto The protocol. Set to 0 for non-ip packets, * or to the protocol read from the packet otherwise. * proto != 0 means that we have an IPv4 packet. * * src_port, dst_port port numbers, in HOST format. Only * valid for TCP and UDP packets. * * src_ip, dst_ip ip addresses, in NETWORK format. * Only valid for IPv4 packets. */ uint8_t proto; uint16_t src_port = 0, dst_port = 0; /* NOTE: host format */ struct in_addr src_ip, dst_ip; /* NOTE: network format */ uint16_t iplen=0; int pktlen; uint16_t etype = 0; /* Host order stored ether type */ /* * dyn_dir = MATCH_UNKNOWN when rules unchecked, * MATCH_NONE when checked and not matched (q = NULL), * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) */ int dyn_dir = MATCH_UNKNOWN; ipfw_dyn_rule *q = NULL; struct ip_fw_chain *chain = &V_layer3_chain; /* * We store in ulp a pointer to the upper layer protocol header. * In the ipv4 case this is easy to determine from the header, * but for ipv6 we might have some additional headers in the middle. * ulp is NULL if not found. */ void *ulp = NULL; /* upper layer protocol pointer. */ /* XXX ipv6 variables */ int is_ipv6 = 0; uint8_t icmp6_type = 0; uint16_t ext_hd = 0; /* bits vector for extension header filtering */ /* end of ipv6 variables */ int is_ipv4 = 0; int done = 0; /* flag to exit the outer loop */ if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready)) return (IP_FW_PASS); /* accept */ dst_ip.s_addr = 0; /* make sure it is initialized */ src_ip.s_addr = 0; /* make sure it is initialized */ pktlen = m->m_pkthdr.len; args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */ proto = args->f_id.proto = 0; /* mark f_id invalid */ /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */ /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. WARNING: the * pointer might become stale after other pullups (but we never use it * this way). */ #define PULLUP_TO(_len, p, T) \ do { \ int x = (_len) + sizeof(T); \ if ((m)->m_len < x) { \ args->m = m = m_pullup(m, x); \ if (m == NULL) \ goto pullup_failed; \ } \ p = (mtod(m, char *) + (_len)); \ } while (0) /* * if we have an ether header, */ if (args->eh) etype = ntohs(args->eh->ether_type); /* Identify IP packets and fill up variables. */ if (pktlen >= sizeof(struct ip6_hdr) && (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; is_ipv6 = 1; args->f_id.addr_type = 6; hlen = sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; /* Search extension headers to find upper layer protocols */ while (ulp == NULL) { switch (proto) { case IPPROTO_ICMPV6: PULLUP_TO(hlen, ulp, struct icmp6_hdr); icmp6_type = ICMP6(ulp)->icmp6_type; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: PULLUP_TO(hlen, ulp, struct sctphdr); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_HOPOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); switch (((struct ip6_rthdr *)ulp)->ip6r_type) { case 0: ext_hd |= EXT_RTHDR0; break; case 2: ext_hd |= EXT_RTHDR2; break; default: printf("IPFW2: IPV6 - Unknown Routing " "Header type(%d)\n", ((struct ip6_rthdr *)ulp)->ip6r_type); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } ext_hd |= EXT_ROUTING; hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); ext_hd |= EXT_FRAGMENT; hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; /* Add IP6F_MORE_FRAG for offset of first * fragment to be != 0. */ offset |= ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_MORE_FRAG; if (offset == 0) { printf("IPFW2: IPV6 - Invalid Fragment " "Header\n"); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } args->f_id.extra = ntohl(((struct ip6_frag *)ulp)->ip6f_ident); ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_DSTOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); ext_hd |= EXT_AH; hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; case IPPROTO_ESP: /* RFC 2406 */ PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ /* Anything past Seq# is variable length and * data past this ext. header is encrypted. */ ext_hd |= EXT_ESP; break; case IPPROTO_NONE: /* RFC 2460 */ /* * Packet ends here, and IPv6 header has * already been pulled up. If ip6e_len!=0 * then octets must be ignored. */ ulp = ip; /* non-NULL to get out of loop. */ break; case IPPROTO_OSPFIGP: /* XXX OSPF header check? */ PULLUP_TO(hlen, ulp, struct ip6_ext); break; case IPPROTO_PIM: /* XXX PIM header check? */ PULLUP_TO(hlen, ulp, struct pim); break; case IPPROTO_CARP: PULLUP_TO(hlen, ulp, struct carp_header); if (((struct carp_header *)ulp)->carp_version != CARP_VERSION) return (IP_FW_DENY); if (((struct carp_header *)ulp)->carp_type != CARP_ADVERTISEMENT) return (IP_FW_DENY); break; case IPPROTO_IPV6: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip6_hdr); break; case IPPROTO_IPV4: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip); break; default: printf("IPFW2: IPV6 - Unknown Extension " "Header(%d), ext_hd=%x\n", proto, ext_hd); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); PULLUP_TO(hlen, ulp, struct ip6_ext); break; } /*switch */ } ip = mtod(m, struct ip *); ip6 = (struct ip6_hdr *)ip; args->f_id.src_ip6 = ip6->ip6_src; args->f_id.dst_ip6 = ip6->ip6_dst; args->f_id.src_ip = 0; args->f_id.dst_ip = 0; args->f_id.flow_id6 = ntohl(ip6->ip6_flow); } else if (pktlen >= sizeof(struct ip) && (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) { is_ipv4 = 1; hlen = ip->ip_hl << 2; args->f_id.addr_type = 4; /* * Collect parameters into local variables for faster matching. */ proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; offset = ntohs(ip->ip_off) & IP_OFFMASK; iplen = ntohs(ip->ip_len); pktlen = iplen < pktlen ? iplen : pktlen; if (offset == 0) { switch (proto) { case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_ICMP: PULLUP_TO(hlen, ulp, struct icmphdr); //args->f_id.flags = ICMP(ulp)->icmp_type; break; default: break; } } ip = mtod(m, struct ip *); args->f_id.src_ip = ntohl(src_ip.s_addr); args->f_id.dst_ip = ntohl(dst_ip.s_addr); } #undef PULLUP_TO if (proto) { /* we may have port numbers, store them */ args->f_id.proto = proto; args->f_id.src_port = src_port = ntohs(src_port); args->f_id.dst_port = dst_port = ntohs(dst_port); } IPFW_RLOCK(chain); if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */ IPFW_RUNLOCK(chain); return (IP_FW_PASS); /* accept */ } if (args->rule.slot) { /* * Packet has already been tagged as a result of a previous * match on rule args->rule aka args->rule_id (PIPE, QUEUE, * REASS, NETGRAPH, DIVERT/TEE...) * Validate the slot and continue from the next one * if still present, otherwise do a lookup. */ f_pos = (args->rule.chain_id == chain->id) ? args->rule.slot : ipfw_find_rule(chain, args->rule.rulenum, args->rule.rule_id); } else { f_pos = 0; } /* * Now scan the rules, and parse microinstructions for each rule. * We have two nested loops and an inner switch. Sometimes we * need to break out of one or both loops, or re-enter one of * the loops with updated variables. Loop variables are: * * f_pos (outer loop) points to the current rule. * On output it points to the matching rule. * done (outer loop) is used as a flag to break the loop. * l (inner loop) residual length of current rule. * cmd points to the current microinstruction. * * We break the inner loop by setting l=0 and possibly * cmdlen=0 if we don't want to advance cmd. * We break the outer loop by setting done=1 * We can restart the inner loop by setting l>0 and f_pos, f, cmd * as needed. */ for (; f_pos < chain->n_rules; f_pos++) { ipfw_insn *cmd; uint32_t tablearg = 0; int l, cmdlen, skip_or; /* skip rest of OR block */ struct ip_fw *f; f = chain->map[f_pos]; if (V_set_disable & (1 << f->set) ) continue; skip_or = 0; for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { int match; /* * check_body is a jump target used when we find a * CHECK_STATE, and need to jump to the body of * the target rule. */ /* check_body: */ cmdlen = F_LEN(cmd); /* * An OR block (insn_1 || .. || insn_n) has the * F_OR bit set in all but the last instruction. * The first match will set "skip_or", and cause * the following instructions to be skipped until * past the one with the F_OR bit clear. */ if (skip_or) { /* skip this instruction */ if ((cmd->len & F_OR) == 0) skip_or = 0; /* next one is good */ continue; } match = 0; /* set to 1 if we succeed */ switch (cmd->opcode) { /* * The first set of opcodes compares the packet's * fields with some pattern, setting 'match' if a * match is found. At the end of the loop there is * logic to deal with F_NOT and F_OR flags associated * with the opcode. */ case O_NOP: match = 1; break; case O_FORWARD_MAC: printf("ipfw: opcode %d unimplemented\n", cmd->opcode); break; case O_GID: case O_UID: case O_JAIL: /* * We only check offset == 0 && proto != 0, * as this ensures that we have a * packet with the ports info. */ if (offset!=0) break; if (is_ipv6) /* XXX to be fixed later */ break; if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) match = check_uidgid( (ipfw_insn_u32 *)cmd, proto, oif, dst_ip, dst_port, src_ip, src_port, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache, args->inp); #else (void *)&ucred_cache, (struct inpcb *)args->m); #endif break; case O_RECV: match = iface_match(m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); break; case O_XMIT: match = iface_match(oif, (ipfw_insn_if *)cmd); break; case O_VIA: match = iface_match(oif ? oif : m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); break; case O_MACADDR2: if (args->eh != NULL) { /* have MAC header */ u_int32_t *want = (u_int32_t *) ((ipfw_insn_mac *)cmd)->addr; u_int32_t *mask = (u_int32_t *) ((ipfw_insn_mac *)cmd)->mask; u_int32_t *hdr = (u_int32_t *)args->eh; match = ( want[0] == (hdr[0] & mask[0]) && want[1] == (hdr[1] & mask[1]) && want[2] == (hdr[2] & mask[2]) ); } break; case O_MAC_TYPE: if (args->eh != NULL) { u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (etype >= p[0] && etype <= p[1]); } break; case O_FRAG: match = (offset != 0); break; case O_IN: /* "out" is "not in" */ match = (oif == NULL); break; case O_LAYER2: match = (args->eh != NULL); break; case O_DIVERTED: { /* For diverted packets, args->rule.info * contains the divert port (in host format) * reason and direction. */ uint32_t i = args->rule.info; match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT && cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2); } break; case O_PROTO: /* * We do not allow an arg of 0 so the * check of "proto" only suffices. */ match = (proto == cmd->arg1); break; case O_IP_SRC: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == src_ip.s_addr); break; case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: if (is_ipv4) { uint32_t key = (cmd->opcode == O_IP_DST_LOOKUP) ? dst_ip.s_addr : src_ip.s_addr; uint32_t v = 0; if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) { /* generic lookup. The key must be * in 32bit big-endian format. */ v = ((ipfw_insn_u32 *)cmd)->d[1]; if (v == 0) key = dst_ip.s_addr; else if (v == 1) key = src_ip.s_addr; else if (v == 6) /* dscp */ key = (ip->ip_tos >> 2) & 0x3f; else if (offset != 0) break; else if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) break; else if (v == 2) key = htonl(dst_port); else if (v == 3) key = htonl(src_port); else if (v == 4 || v == 5) { check_uidgid( (ipfw_insn_u32 *)cmd, proto, oif, dst_ip, dst_port, src_ip, src_port, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache, args->inp); if (v == 4 /* O_UID */) key = ucred_cache->cr_uid; else if (v == 5 /* O_JAIL */) key = ucred_cache->cr_prison->pr_id; #else /* !__FreeBSD__ */ (void *)&ucred_cache, (struct inpcb *)args->m); if (v ==4 /* O_UID */) key = ucred_cache.uid; else if (v == 5 /* O_JAIL */) key = ucred_cache.xid; #endif /* !__FreeBSD__ */ key = htonl(key); } else break; } match = ipfw_lookup_table(chain, cmd->arg1, key, &v); if (!match) break; if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == v; else tablearg = v; } break; case O_IP_SRC_MASK: case O_IP_DST_MASK: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_MASK) ? dst_ip.s_addr : src_ip.s_addr; uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; int i = cmdlen-1; for (; !match && i>0; i-= 2, p+= 2) match = (p[0] == (a & p[1])); } break; case O_IP_SRC_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(src_ip, tif); match = (tif != NULL); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_SRC_ME: match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); #endif break; case O_IP_DST_SET: case O_IP_SRC_SET: if (is_ipv4) { u_int32_t *d = (u_int32_t *)(cmd+1); u_int32_t addr = cmd->opcode == O_IP_DST_SET ? args->f_id.dst_ip : args->f_id.src_ip; if (addr < d[0]) break; addr -= d[0]; /* subtract base */ match = (addr < cmd->arg1) && ( d[ 1 + (addr>>5)] & (1<<(addr & 0x1f)) ); } break; case O_IP_DST: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == dst_ip.s_addr); break; case O_IP_DST_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(dst_ip, tif); match = (tif != NULL); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_DST_ME: match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); #endif break; case O_IP_SRCPORT: case O_IP_DSTPORT: /* * offset == 0 && proto != 0 is enough * to guarantee that we have a * packet with port info. */ if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) && offset == 0) { u_int16_t x = (cmd->opcode == O_IP_SRCPORT) ? src_port : dst_port ; u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (x>=p[0] && x<=p[1]); } break; case O_ICMPTYPE: match = (offset == 0 && proto==IPPROTO_ICMP && icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); break; #ifdef INET6 case O_ICMP6TYPE: match = is_ipv6 && offset == 0 && proto==IPPROTO_ICMPV6 && icmp6type_match( ICMP6(ulp)->icmp6_type, (ipfw_insn_u32 *)cmd); break; #endif /* INET6 */ case O_IPOPT: match = (is_ipv4 && ipopts_match(ip, cmd) ); break; case O_IPVER: match = (is_ipv4 && cmd->arg1 == ip->ip_v); break; case O_IPID: case O_IPLEN: case O_IPTTL: if (is_ipv4) { /* only for IP packets */ uint16_t x; uint16_t *p; int i; if (cmd->opcode == O_IPLEN) x = iplen; else if (cmd->opcode == O_IPTTL) x = ip->ip_ttl; else /* must be IPID */ x = ntohs(ip->ip_id); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_IPPRECEDENCE: match = (is_ipv4 && (cmd->arg1 == (ip->ip_tos & 0xe0)) ); break; case O_IPTOS: match = (is_ipv4 && flags_match(cmd, ip->ip_tos)); break; case O_TCPDATALEN: if (proto == IPPROTO_TCP && offset == 0) { struct tcphdr *tcp; uint16_t x; uint16_t *p; int i; tcp = TCP(ulp); x = iplen - ((ip->ip_hl + tcp->th_off) << 2); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_TCPFLAGS: match = (proto == IPPROTO_TCP && offset == 0 && flags_match(cmd, TCP(ulp)->th_flags)); break; case O_TCPOPTS: match = (proto == IPPROTO_TCP && offset == 0 && tcpopts_match(TCP(ulp), cmd)); break; case O_TCPSEQ: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_seq); break; case O_TCPACK: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_ack); break; case O_TCPWIN: match = (proto == IPPROTO_TCP && offset == 0 && cmd->arg1 == TCP(ulp)->th_win); break; case O_ESTAB: /* reject packets which have SYN only */ /* XXX should i also check for TH_ACK ? */ match = (proto == IPPROTO_TCP && offset == 0 && (TCP(ulp)->th_flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); break; case O_ALTQ: { struct pf_mtag *at; ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; match = 1; at = pf_find_mtag(m); if (at != NULL && at->qid != 0) break; at = pf_get_mtag(m); if (at == NULL) { /* * Let the packet fall back to the * default ALTQ. */ break; } at->qid = altq->qid; if (is_ipv4) at->af = AF_INET; else at->af = AF_LINK; at->hdr = ip; break; } case O_LOG: ipfw_log(f, hlen, args, m, oif, offset, tablearg, ip); match = 1; break; case O_PROB: match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); break; case O_VERREVPATH: /* Outgoing packets automatically pass/match */ match = ((oif != NULL) || (m->m_pkthdr.rcvif == NULL) || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), m->m_pkthdr.rcvif) : #endif verify_path(src_ip, m->m_pkthdr.rcvif, args->f_id.fib))); break; case O_VERSRCREACH: /* Outgoing packets automatically pass/match */ match = (hlen > 0 && ((oif != NULL) || #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), NULL) : #endif verify_path(src_ip, NULL, args->f_id.fib))); break; case O_ANTISPOOF: /* Outgoing packets automatically pass/match */ if (oif == NULL && hlen > 0 && ( (is_ipv4 && in_localaddr(src_ip)) #ifdef INET6 || (is_ipv6 && in6_localaddr(&(args->f_id.src_ip6))) #endif )) match = #ifdef INET6 is_ipv6 ? verify_path6( &(args->f_id.src_ip6), m->m_pkthdr.rcvif) : #endif verify_path(src_ip, m->m_pkthdr.rcvif, args->f_id.fib); else match = 1; break; case O_IPSEC: #ifdef IPSEC match = (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); #endif /* otherwise no match */ break; #ifdef INET6 case O_IP6_SRC: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_DST: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if (is_ipv6) { int i = cmdlen - 1; struct in6_addr p; struct in6_addr *d = &((ipfw_insn_ip6 *)cmd)->addr6; for (; !match && i > 0; d += 2, i -= F_INSN_SIZE(struct in6_addr) * 2) { p = (cmd->opcode == O_IP6_SRC_MASK) ? args->f_id.src_ip6: args->f_id.dst_ip6; APPLY_MASK(&p, &d[1]); match = IN6_ARE_ADDR_EQUAL(&d[0], &p); } } break; case O_FLOW6ID: match = is_ipv6 && flow6id_match(args->f_id.flow_id6, (ipfw_insn_u32 *) cmd); break; case O_EXT_HDR: match = is_ipv6 && (ext_hd & ((ipfw_insn *) cmd)->arg1); break; case O_IP6: match = is_ipv6; break; #endif case O_IP4: match = is_ipv4; break; case O_TAG: { struct m_tag *mtag; uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; /* Packet is already tagged with this tag? */ mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); /* We have `untag' action when F_NOT flag is * present. And we must remove this mtag from * mbuf and reset `match' to zero (`match' will * be inversed later). * Otherwise we should allocate new mtag and * push it into mbuf. */ if (cmd->len & F_NOT) { /* `untag' action */ if (mtag != NULL) m_tag_delete(m, mtag); match = 0; } else if (mtag == NULL) { if ((mtag = m_tag_alloc(MTAG_IPFW, tag, 0, M_NOWAIT)) != NULL) m_tag_prepend(m, mtag); match = 1; } break; } case O_FIB: /* try match the specified fib */ if (args->f_id.fib == cmd->arg1) match = 1; break; case O_TAGGED: { struct m_tag *mtag; uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; if (cmdlen == 1) { match = m_tag_locate(m, MTAG_IPFW, tag, NULL) != NULL; break; } /* we have ranges */ for (mtag = m_tag_first(m); mtag != NULL && !match; mtag = m_tag_next(m, mtag)) { uint16_t *p; int i; if (mtag->m_tag_cookie != MTAG_IPFW) continue; p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for(; !match && i > 0; i--, p += 2) match = mtag->m_tag_id >= p[0] && mtag->m_tag_id <= p[1]; } break; } /* * The second set of opcodes represents 'actions', * i.e. the terminal part of a rule once the packet * matches all previous patterns. * Typically there is only one action for each rule, * and the opcode is stored at the end of the rule * (but there are exceptions -- see below). * * In general, here we set retval and terminate the * outer loop (would be a 'break 3' in some language, * but we need to set l=0, done=1) * * Exceptions: * O_COUNT and O_SKIPTO actions: * instead of terminating, we jump to the next rule * (setting l=0), or to the SKIPTO target (setting * f/f_len, cmd and l as needed), respectively. * * O_TAG, O_LOG and O_ALTQ action parameters: * perform some action and set match = 1; * * O_LIMIT and O_KEEP_STATE: these opcodes are * not real 'actions', and are stored right * before the 'action' part of the rule. * These opcodes try to install an entry in the * state tables; if successful, we continue with * the next opcode (match=1; break;), otherwise * the packet must be dropped (set retval, * break loops with l=0, done=1) * * O_PROBE_STATE and O_CHECK_STATE: these opcodes * cause a lookup of the state table, and a jump * to the 'action' part of the parent rule * if an entry is found, or * (CHECK_STATE only) a jump to the next rule if * the entry is not found. * The result of the lookup is cached so that * further instances of these opcodes become NOPs. * The jump to the next rule is done by setting * l=0, cmdlen=0. */ case O_LIMIT: case O_KEEP_STATE: if (ipfw_install_state(f, (ipfw_insn_limit *)cmd, args, tablearg)) { /* error or limit violation */ retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ } match = 1; break; case O_PROBE_STATE: case O_CHECK_STATE: /* * dynamic rules are checked at the first * keep-state or check-state occurrence, * with the result being stored in dyn_dir. * The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). */ if (dyn_dir == MATCH_UNKNOWN && (q = ipfw_lookup_dyn_rule(&args->f_id, &dyn_dir, proto == IPPROTO_TCP ? TCP(ulp) : NULL)) != NULL) { /* * Found dynamic entry, update stats * and jump to the 'action' part of * the parent rule by setting * f, cmd, l and clearing cmdlen. */ q->pcnt++; q->bcnt += pktlen; /* XXX we would like to have f_pos * readily accessible in the dynamic * rule, instead of having to * lookup q->rule. */ f = q->rule; f_pos = ipfw_find_rule(chain, f->rulenum, f->id); cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; ipfw_dyn_unlock(); cmdlen = 0; match = 1; break; } /* * Dynamic entry not found. If CHECK_STATE, * skip to next rule, if PROBE_STATE just * ignore and continue with next opcode. */ if (cmd->opcode == O_CHECK_STATE) l = 0; /* exit inner loop */ match = 1; break; case O_ACCEPT: retval = 0; /* accept */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_PIPE: case O_QUEUE: set_match(args, f_pos, chain); args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; if (cmd->opcode == O_PIPE) args->rule.info |= IPFW_IS_PIPE; if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = IP_FW_DUMMYNET; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_DIVERT: case O_TEE: if (args->eh) /* not on layer 2 */ break; /* otherwise this is terminal */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ retval = (cmd->opcode == O_DIVERT) ? IP_FW_DIVERT : IP_FW_TEE; set_match(args, f_pos, chain); args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; break; case O_COUNT: f->pcnt++; /* update stats */ f->bcnt += pktlen; f->timestamp = time_uptime; l = 0; /* exit inner loop */ break; case O_SKIPTO: f->pcnt++; /* update stats */ f->bcnt += pktlen; f->timestamp = time_uptime; /* If possible use cached f_pos (in f->next_rule), * whose version is written in f->next_rule * (horrible hacks to avoid changing the ABI). */ if (cmd->arg1 != IP_FW_TABLEARG && (uintptr_t)f->x_next == chain->id) { f_pos = (uintptr_t)f->next_rule; } else { int i = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; /* make sure we do not jump backward */ if (i <= f->rulenum) i = f->rulenum + 1; f_pos = ipfw_find_rule(chain, i, 0); /* update the cache */ if (cmd->arg1 != IP_FW_TABLEARG) { f->next_rule = (void *)(uintptr_t)f_pos; f->x_next = (void *)(uintptr_t)chain->id; } } /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the skipto rule. */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; match = 1; cmdlen = 0; skip_or = 0; continue; break; /* not reached */ case O_REJECT: /* * Drop the packet and send a reject notice * if the packet is not ICMP (or is an ICMP * query), and it is not multicast/broadcast. */ if (hlen > 0 && is_ipv4 && offset == 0 && (proto != IPPROTO_ICMP || is_icmp_query(ICMP(ulp))) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN_MULTICAST(ntohl(dst_ip.s_addr))) { send_reject(args, cmd->arg1, iplen, ip); m = args->m; } /* FALLTHROUGH */ #ifdef INET6 case O_UNREACH6: if (hlen > 0 && is_ipv6 && ((offset & IP6F_OFF_MASK) == 0) && (proto != IPPROTO_ICMPV6 || (is_icmp6_query(icmp6_type) == 1)) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) { send_reject6( args, cmd->arg1, hlen, (struct ip6_hdr *)ip); m = args->m; } /* FALLTHROUGH */ #endif case O_DENY: retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_FORWARD_IP: if (args->eh) /* not valid on layer2 pkts */ break; if (!q || dyn_dir == MATCH_FORWARD) { struct sockaddr_in *sa; sa = &(((ipfw_insn_sa *)cmd)->sa); if (sa->sin_addr.s_addr == INADDR_ANY) { bcopy(sa, &args->hopstore, sizeof(*sa)); args->hopstore.sin_addr.s_addr = htonl(tablearg); args->next_hop = &args->hopstore; } else { args->next_hop = sa; } } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_NETGRAPH: case O_NGTEE: set_match(args, f_pos, chain); args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = (cmd->opcode == O_NETGRAPH) ? IP_FW_NETGRAPH : IP_FW_NGTEE; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_SETFIB: f->pcnt++; /* update stats */ f->bcnt += pktlen; f->timestamp = time_uptime; M_SETFIB(m, cmd->arg1); args->f_id.fib = cmd->arg1; l = 0; /* exit inner loop */ break; case O_NAT: if (!IPFW_NAT_LOADED) { retval = IP_FW_DENY; } else { struct cfg_nat *t; int nat_id; set_match(args, f_pos, chain); t = ((ipfw_insn_nat *)cmd)->nat; if (t == NULL) { nat_id = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; t = (*lookup_nat_ptr)(&chain->nat, nat_id); if (t == NULL) { retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; } if (cmd->arg1 != IP_FW_TABLEARG) ((ipfw_insn_nat *)cmd)->nat = t; } retval = ipfw_nat_ptr(args, t, m); } l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_REASS: { int ip_off; f->pcnt++; f->bcnt += pktlen; l = 0; /* in any case exit inner loop */ ip_off = ntohs(ip->ip_off); /* if not fragmented, go to next rule */ if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) break; /* * ip_reass() expects len & off in host * byte order. */ SET_HOST_IPLEN(ip); args->m = m = ip_reass(m); /* * do IP header checksum fixup. */ if (m == NULL) { /* fragment got swallowed */ retval = IP_FW_DENY; } else { /* good, packet complete */ int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; SET_NET_IPLEN(ip); ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(m, hlen); retval = IP_FW_REASS; set_match(args, f_pos, chain); } done = 1; /* exit outer loop */ break; } default: panic("-- unknown opcode %d\n", cmd->opcode); } /* end of switch() on opcodes */ /* * if we get here with l=0, then match is irrelevant. */ if (cmd->len & F_NOT) match = !match; if (match) { if (cmd->len & F_OR) skip_or = 1; } else { if (!(cmd->len & F_OR)) /* not an OR block, */ break; /* try next rule */ } } /* end of inner loop, scan opcodes */ if (done) break; /* next_rule:; */ /* try next rule */ } /* end of outer for, scan rules */ if (done) { struct ip_fw *rule = chain->map[f_pos]; /* Update statistics */ rule->pcnt++; rule->bcnt += pktlen; rule->timestamp = time_uptime; } else { retval = IP_FW_DENY; printf("ipfw: ouch!, skip past end of rules, denying packet\n"); } IPFW_RUNLOCK(chain); #ifdef __FreeBSD__ if (ucred_cache != NULL) crfree(ucred_cache); #endif return (retval); pullup_failed: if (V_fw_verbose) printf("ipfw: pullup failed\n"); return (IP_FW_DENY); } /* * Module and VNET glue */ /* * Stuff that must be initialised only on boot or module load */ static int ipfw_init(void) { int error = 0; ipfw_dyn_attach(); /* * Only print out this stuff the first time around, * when called from the sysinit code. */ printf("ipfw2 " #ifdef INET6 "(+ipv6) " #endif "initialized, divert %s, nat %s, " "rule-based forwarding " #ifdef IPFIREWALL_FORWARD "enabled, " #else "disabled, " #endif "default to %s, logging ", #ifdef IPDIVERT "enabled", #else "loadable", #endif #ifdef IPFIREWALL_NAT "enabled", #else "loadable", #endif default_to_accept ? "accept" : "deny"); /* * Note: V_xxx variables can be accessed here but the vnet specific * initializer may not have been called yet for the VIMAGE case. * Tuneables will have been processed. We will print out values for * the default vnet. * XXX This should all be rationalized AFTER 8.0 */ if (V_fw_verbose == 0) printf("disabled\n"); else if (V_verbose_limit == 0) printf("unlimited\n"); else printf("limited to %d packets/entry by default\n", V_verbose_limit); ipfw_log_bpf(1); /* init */ return (error); } /* * Called for the removal of the last instance only on module unload. */ static void ipfw_destroy(void) { ipfw_log_bpf(0); /* uninit */ ipfw_dyn_detach(); printf("IP firewall unloaded\n"); } /* * Stuff that must be initialized for every instance * (including the first of course). */ static int vnet_ipfw_init(const void *unused) { int error; struct ip_fw *rule = NULL; struct ip_fw_chain *chain; chain = &V_layer3_chain; /* First set up some values that are compile time options */ V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ V_fw_deny_unknown_exthdrs = 1; #ifdef IPFIREWALL_VERBOSE V_fw_verbose = 1; #endif #ifdef IPFIREWALL_VERBOSE_LIMIT V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; #endif #ifdef IPFIREWALL_NAT LIST_INIT(&chain->nat); #endif /* insert the default rule and create the initial map */ chain->n_rules = 1; chain->static_len = sizeof(struct ip_fw); chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_NOWAIT | M_ZERO); if (chain->map) rule = malloc(chain->static_len, M_IPFW, M_NOWAIT | M_ZERO); if (rule == NULL) { if (chain->map) free(chain->map, M_IPFW); printf("ipfw2: ENOSPC initializing default rule " "(support disabled)\n"); return (ENOSPC); } error = ipfw_init_tables(chain); if (error) { panic("init_tables"); /* XXX Marko fix this ! */ } /* fill and insert the default rule */ rule->act_ofs = 0; rule->rulenum = IPFW_DEFAULT_RULE; rule->cmd_len = 1; rule->set = RESVD_SET; rule->cmd[0].len = 1; rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY; chain->rules = chain->default_rule = chain->map[0] = rule; chain->id = rule->id = 1; IPFW_LOCK_INIT(chain); ipfw_dyn_init(); /* First set up some values that are compile time options */ V_ipfw_vnet_ready = 1; /* Open for business */ /* * Hook the sockopt handler, and the layer2 (V_ip_fw_chk_ptr) * and pfil hooks for ipv4 and ipv6. Even if the latter two fail * we still keep the module alive because the sockopt and * layer2 paths are still useful. * ipfw[6]_hook return 0 on success, ENOENT on failure, * so we can ignore the exact return value and just set a flag. * * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so * changes in the underlying (per-vnet) variables trigger * immediate hook()/unhook() calls. * In layer2 we have the same behaviour, except that V_ether_ipfw * is checked on each packet because there are no pfil hooks. */ V_ip_fw_ctl_ptr = ipfw_ctl; V_ip_fw_chk_ptr = ipfw_chk; error = ipfw_attach_hooks(1); return (error); } /* * Called for the removal of each instance. */ static int vnet_ipfw_uninit(const void *unused) { struct ip_fw *reap, *rule; struct ip_fw_chain *chain = &V_layer3_chain; int i; V_ipfw_vnet_ready = 0; /* tell new callers to go away */ /* * disconnect from ipv4, ipv6, layer2 and sockopt. * Then grab, release and grab again the WLOCK so we make * sure the update is propagated and nobody will be in. */ (void)ipfw_attach_hooks(0 /* detach */); V_ip_fw_chk_ptr = NULL; V_ip_fw_ctl_ptr = NULL; IPFW_UH_WLOCK(chain); IPFW_UH_WUNLOCK(chain); IPFW_UH_WLOCK(chain); IPFW_WLOCK(chain); IPFW_WUNLOCK(chain); IPFW_WLOCK(chain); ipfw_dyn_uninit(0); /* run the callout_drain */ ipfw_destroy_tables(chain); reap = NULL; for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; rule->x_next = reap; reap = rule; } if (chain->map) free(chain->map, M_IPFW); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); if (reap != NULL) ipfw_reap_rules(reap); IPFW_LOCK_DESTROY(chain); ipfw_dyn_uninit(1); /* free the remaining parts */ return 0; } /* * Module event handler. * In general we have the choice of handling most of these events by the * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to * use the SYSINIT handlers as they are more capable of expressing the * flow of control during module and vnet operations, so this is just * a skeleton. Note there is no SYSINIT equivalent of the module * SHUTDOWN handler, but we don't have anything to do in that case anyhow. */ static int ipfw_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: /* Called once at module load or * system boot if compiled in. */ break; case MOD_QUIESCE: /* Called before unload. May veto unloading. */ break; case MOD_UNLOAD: /* Called during unload. */ break; case MOD_SHUTDOWN: /* Called during system shutdown. */ break; default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipfwmod = { "ipfw", ipfw_modevent, 0 }; /* Define startup order. */ #define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN #define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */ #define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */ #define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. */ DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER); MODULE_VERSION(ipfw, 2); /* should declare some dependencies here */ /* * Starting up. Done in order after ipfwmod() has been called. * VNET_SYSINIT is also called for each existing vnet and each new vnet. */ SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_init, NULL); VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_init, NULL); /* * Closing up shop. These are done in REVERSE ORDER, but still * after ipfwmod() has been called. Not called on reboot. * VNET_SYSUNINIT is also called for each exiting vnet as it exits. * or when the module is unloaded. */ SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_destroy, NULL); VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_uninit, NULL); /* end of file */ ================================================ FILE: sys/netinet/ipfw/ip_fw_dynamic.c ================================================ /*- * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_dynamic.c 200601 2009-12-16 10:48:40Z luigi $"); #define DEB(x) #define DDB(x) x /* * Dynamic rule support for ipfw */ #if !defined(KLD_MODULE) #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_ipdn.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #endif #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include #include #include #include #include /* ip_defttl */ #include #include #include #include #include /* IN6_ARE_ADDR_EQUAL */ #ifdef INET6 #include #include #endif #include /* XXX for in_cksum */ #ifdef MAC #include #endif /* * Description of dynamic rules. * * Dynamic rules are stored in lists accessed through a hash table * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can * be modified through the sysctl variable dyn_buckets which is * updated when the table becomes empty. * * XXX currently there is only one list, ipfw_dyn. * * When a packet is received, its address fields are first masked * with the mask defined for the rule, then hashed, then matched * against the entries in the corresponding list. * Dynamic rules can be used for different purposes: * + stateful rules; * + enforcing limits on the number of sessions; * + in-kernel NAT (not implemented yet) * * The lifetime of dynamic rules is regulated by dyn_*_lifetime, * measured in seconds and depending on the flags. * * The total number of dynamic rules is stored in dyn_count. * The max number of dynamic rules is dyn_max. When we reach * the maximum number of rules we do not create anymore. This is * done to avoid consuming too much memory, but also too much * time when searching on each packet (ideally, we should try instead * to put a limit on the length of the list on each bucket...). * * Each dynamic rule holds a pointer to the parent ipfw rule so * we know what action to perform. Dynamic rules are removed when * the parent rule is deleted. XXX we should make them survive. * * There are some limitations with dynamic rules -- we do not * obey the 'randomized match', and we do not do multiple * passes through the firewall. XXX check the latter!!! */ /* * Static variables followed by global ones */ static VNET_DEFINE(ipfw_dyn_rule **, ipfw_dyn_v); static VNET_DEFINE(u_int32_t, dyn_buckets); static VNET_DEFINE(u_int32_t, curr_dyn_buckets); static VNET_DEFINE(struct callout, ipfw_timeout); #define V_ipfw_dyn_v VNET(ipfw_dyn_v) #define V_dyn_buckets VNET(dyn_buckets) #define V_curr_dyn_buckets VNET(curr_dyn_buckets) #define V_ipfw_timeout VNET(ipfw_timeout) static uma_zone_t ipfw_dyn_rule_zone; #ifndef __FreeBSD__ DEFINE_SPINLOCK(ipfw_dyn_mtx); #else static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */ #endif #define IPFW_DYN_LOCK_INIT() \ mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF) #define IPFW_DYN_LOCK_DESTROY() mtx_destroy(&ipfw_dyn_mtx) #define IPFW_DYN_LOCK() mtx_lock(&ipfw_dyn_mtx) #define IPFW_DYN_UNLOCK() mtx_unlock(&ipfw_dyn_mtx) #define IPFW_DYN_LOCK_ASSERT() mtx_assert(&ipfw_dyn_mtx, MA_OWNED) void ipfw_dyn_unlock(void) { IPFW_DYN_UNLOCK(); } /* * Timeouts for various events in handing dynamic rules. */ static VNET_DEFINE(u_int32_t, dyn_ack_lifetime); static VNET_DEFINE(u_int32_t, dyn_syn_lifetime); static VNET_DEFINE(u_int32_t, dyn_fin_lifetime); static VNET_DEFINE(u_int32_t, dyn_rst_lifetime); static VNET_DEFINE(u_int32_t, dyn_udp_lifetime); static VNET_DEFINE(u_int32_t, dyn_short_lifetime); #define V_dyn_ack_lifetime VNET(dyn_ack_lifetime) #define V_dyn_syn_lifetime VNET(dyn_syn_lifetime) #define V_dyn_fin_lifetime VNET(dyn_fin_lifetime) #define V_dyn_rst_lifetime VNET(dyn_rst_lifetime) #define V_dyn_udp_lifetime VNET(dyn_udp_lifetime) #define V_dyn_short_lifetime VNET(dyn_short_lifetime) /* * Keepalives are sent if dyn_keepalive is set. They are sent every * dyn_keepalive_period seconds, in the last dyn_keepalive_interval * seconds of lifetime of a rule. * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower * than dyn_keepalive_period. */ static VNET_DEFINE(u_int32_t, dyn_keepalive_interval); static VNET_DEFINE(u_int32_t, dyn_keepalive_period); static VNET_DEFINE(u_int32_t, dyn_keepalive); #define V_dyn_keepalive_interval VNET(dyn_keepalive_interval) #define V_dyn_keepalive_period VNET(dyn_keepalive_period) #define V_dyn_keepalive VNET(dyn_keepalive) static VNET_DEFINE(u_int32_t, dyn_count); /* # of dynamic rules */ static VNET_DEFINE(u_int32_t, dyn_max); /* max # of dynamic rules */ #define V_dyn_count VNET(dyn_count) #define V_dyn_max VNET(dyn_max) #ifdef SYSCTL_NODE SYSBEGIN(f2) SYSCTL_DECL(_net_inet_ip_fw); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0, "Number of dyn. buckets"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0, "Current Number of dyn. buckets"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD, &VNET_NAME(dyn_count), 0, "Number of dyn. rules"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW, &VNET_NAME(dyn_max), 0, "Max number of dyn. rules"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0, "Lifetime of dyn. rules for acks"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0, "Lifetime of dyn. rules for syn"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0, "Lifetime of dyn. rules for fin"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0, "Lifetime of dyn. rules for rst"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0, "Lifetime of dyn. rules for UDP"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0, "Lifetime of dyn. rules for other situations"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0, "Enable keepalives for dyn. rules"); SYSEND #endif /* SYSCTL_NODE */ static __inline int hash_packet6(struct ipfw_flow_id *id) { u_int32_t i; i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^ (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^ (id->src_ip6.__u6_addr.__u6_addr32[2]) ^ (id->src_ip6.__u6_addr.__u6_addr32[3]) ^ (id->dst_port) ^ (id->src_port); return i; } /* * IMPORTANT: the hash function for dynamic rules must be commutative * in source and destination (ip,port), because rules are bidirectional * and we want to find both in the same bucket. */ static __inline int hash_packet(struct ipfw_flow_id *id) { u_int32_t i; #ifdef INET6 if (IS_IP6_FLOW_ID(id)) i = hash_packet6(id); else #endif /* INET6 */ i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); i &= (V_curr_dyn_buckets - 1); return i; } static __inline void unlink_dyn_rule_print(struct ipfw_flow_id *id) { struct in_addr da; #ifdef INET6 char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN]; #else char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; #endif #ifdef INET6 if (IS_IP6_FLOW_ID(id)) { ip6_sprintf(src, &id->src_ip6); ip6_sprintf(dst, &id->dst_ip6); } else #endif { da.s_addr = htonl(id->src_ip); inet_ntoa_r(da, src); da.s_addr = htonl(id->dst_ip); inet_ntoa_r(da, dst); } printf("ipfw: unlink entry %s %d -> %s %d, %d left\n", src, id->src_port, dst, id->dst_port, V_dyn_count - 1); } /** * unlink a dynamic rule from a chain. prev is a pointer to * the previous one, q is a pointer to the rule to delete, * head is a pointer to the head of the queue. * Modifies q and potentially also head. */ #define UNLINK_DYN_RULE(prev, head, q) { \ ipfw_dyn_rule *old_q = q; \ \ /* remove a refcount to the parent */ \ if (q->dyn_type == O_LIMIT) \ q->parent->count--; \ DEB(unlink_dyn_rule_print(&q->id);) \ if (prev != NULL) \ prev->next = q = q->next; \ else \ head = q = q->next; \ V_dyn_count--; \ uma_zfree(ipfw_dyn_rule_zone, old_q); } #define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) /** * Remove dynamic rules pointing to "rule", or all of them if rule == NULL. * * If keep_me == NULL, rules are deleted even if not expired, * otherwise only expired rules are removed. * * The value of the second parameter is also used to point to identify * a rule we absolutely do not want to remove (e.g. because we are * holding a reference to it -- this is the case with O_LIMIT_PARENT * rules). The pointer is only used for comparison, so any non-null * value will do. */ static void remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me) { static u_int32_t last_remove = 0; #define FORCE (keep_me == NULL) ipfw_dyn_rule *prev, *q; int i, pass = 0, max_pass = 0; IPFW_DYN_LOCK_ASSERT(); if (V_ipfw_dyn_v == NULL || V_dyn_count == 0) return; /* do not expire more than once per second, it is useless */ if (!FORCE && last_remove == time_uptime) return; last_remove = time_uptime; /* * because O_LIMIT refer to parent rules, during the first pass only * remove child and mark any pending LIMIT_PARENT, and remove * them in a second pass. */ next_pass: for (i = 0 ; i < V_curr_dyn_buckets ; i++) { for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) { /* * Logic can become complex here, so we split tests. */ if (q == keep_me) goto next; if (rule != NULL && rule != q->rule) goto next; /* not the one we are looking for */ if (q->dyn_type == O_LIMIT_PARENT) { /* * handle parent in the second pass, * record we need one. */ max_pass = 1; if (pass == 0) goto next; if (FORCE && q->count != 0 ) { /* XXX should not happen! */ printf("ipfw: OUCH! cannot remove rule," " count %d\n", q->count); } } else { if (!FORCE && !TIME_LEQ( q->expire, time_uptime )) goto next; } if (q->dyn_type != O_LIMIT_PARENT || !q->count) { UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q); continue; } next: prev=q; q=q->next; } } if (pass++ < max_pass) goto next_pass; } void ipfw_remove_dyn_children(struct ip_fw *rule) { IPFW_DYN_LOCK(); remove_dyn_rule(rule, NULL /* force removal */); IPFW_DYN_UNLOCK(); } /** * lookup a dynamic rule, locked version */ static ipfw_dyn_rule * lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { /* * stateful ipfw extensions. * Lookup into dynamic session queue */ #define MATCH_REVERSE 0 #define MATCH_FORWARD 1 #define MATCH_NONE 2 #define MATCH_UNKNOWN 3 int i, dir = MATCH_NONE; ipfw_dyn_rule *prev, *q=NULL; IPFW_DYN_LOCK_ASSERT(); if (V_ipfw_dyn_v == NULL) goto done; /* not found */ i = hash_packet( pkt ); for (prev=NULL, q = V_ipfw_dyn_v[i] ; q != NULL ; ) { if (q->dyn_type == O_LIMIT_PARENT && q->count) goto next; if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */ UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q); continue; } if (pkt->proto == q->id.proto && q->dyn_type != O_LIMIT_PARENT) { if (IS_IP6_FLOW_ID(pkt)) { if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), &(q->id.src_ip6)) && IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), &(q->id.dst_ip6)) && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port ) { dir = MATCH_FORWARD; break; } if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), &(q->id.dst_ip6)) && IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), &(q->id.src_ip6)) && pkt->src_port == q->id.dst_port && pkt->dst_port == q->id.src_port ) { dir = MATCH_REVERSE; break; } } else { if (pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port ) { dir = MATCH_FORWARD; break; } if (pkt->src_ip == q->id.dst_ip && pkt->dst_ip == q->id.src_ip && pkt->src_port == q->id.dst_port && pkt->dst_port == q->id.src_port ) { dir = MATCH_REVERSE; break; } } } next: prev = q; q = q->next; } if (q == NULL) goto done; /* q = NULL, not found */ if ( prev != NULL) { /* found and not in front */ prev->next = q->next; q->next = V_ipfw_dyn_v[i]; V_ipfw_dyn_v[i] = q; } if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ u_char flags = pkt->_flags & (TH_FIN|TH_SYN|TH_RST); #define BOTH_SYN (TH_SYN | (TH_SYN << 8)) #define BOTH_FIN (TH_FIN | (TH_FIN << 8)) q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8); switch (q->state) { case TH_SYN: /* opening */ q->expire = time_uptime + V_dyn_syn_lifetime; break; case BOTH_SYN: /* move to established */ case BOTH_SYN | TH_FIN : /* one side tries to close */ case BOTH_SYN | (TH_FIN << 8) : if (tcp) { #define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0) u_int32_t ack = ntohl(tcp->th_ack); if (dir == MATCH_FORWARD) { if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd)) q->ack_fwd = ack; else { /* ignore out-of-sequence */ break; } } else { if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev)) q->ack_rev = ack; else { /* ignore out-of-sequence */ break; } } } q->expire = time_uptime + V_dyn_ack_lifetime; break; case BOTH_SYN | BOTH_FIN: /* both sides closed */ if (V_dyn_fin_lifetime >= V_dyn_keepalive_period) V_dyn_fin_lifetime = V_dyn_keepalive_period - 1; q->expire = time_uptime + V_dyn_fin_lifetime; break; default: #if 0 /* * reset or some invalid combination, but can also * occur if we use keep-state the wrong way. */ if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) printf("invalid state: 0x%x\n", q->state); #endif if (V_dyn_rst_lifetime >= V_dyn_keepalive_period) V_dyn_rst_lifetime = V_dyn_keepalive_period - 1; q->expire = time_uptime + V_dyn_rst_lifetime; break; } } else if (pkt->proto == IPPROTO_UDP) { q->expire = time_uptime + V_dyn_udp_lifetime; } else { /* other protocols */ q->expire = time_uptime + V_dyn_short_lifetime; } done: if (match_direction) *match_direction = dir; return q; } ipfw_dyn_rule * ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { ipfw_dyn_rule *q; IPFW_DYN_LOCK(); q = lookup_dyn_rule_locked(pkt, match_direction, tcp); if (q == NULL) IPFW_DYN_UNLOCK(); /* NB: return table locked when q is not NULL */ return q; } static void realloc_dynamic_table(void) { IPFW_DYN_LOCK_ASSERT(); /* * Try reallocation, make sure we have a power of 2 and do * not allow more than 64k entries. In case of overflow, * default to 1024. */ if (V_dyn_buckets > 65536) V_dyn_buckets = 1024; if ((V_dyn_buckets & (V_dyn_buckets-1)) != 0) { /* not a power of 2 */ V_dyn_buckets = V_curr_dyn_buckets; /* reset */ return; } V_curr_dyn_buckets = V_dyn_buckets; if (V_ipfw_dyn_v != NULL) free(V_ipfw_dyn_v, M_IPFW); for (;;) { V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *), M_IPFW, M_NOWAIT | M_ZERO); if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2) break; V_curr_dyn_buckets /= 2; } } /** * Install state of type 'type' for a dynamic session. * The hash table contains two type of rules: * - regular rules (O_KEEP_STATE) * - rules for sessions with limited number of sess per user * (O_LIMIT). When they are created, the parent is * increased by 1, and decreased on delete. In this case, * the third parameter is the parent rule and not the chain. * - "parent" rules for the above (O_LIMIT_PARENT). */ static ipfw_dyn_rule * add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) { ipfw_dyn_rule *r; int i; IPFW_DYN_LOCK_ASSERT(); if (V_ipfw_dyn_v == NULL || (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) { realloc_dynamic_table(); if (V_ipfw_dyn_v == NULL) return NULL; /* failed ! */ } i = hash_packet(id); r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO); if (r == NULL) { printf ("ipfw: sorry cannot allocate state\n"); return NULL; } /* increase refcount on parent, and set pointer */ if (dyn_type == O_LIMIT) { ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule; if ( parent->dyn_type != O_LIMIT_PARENT) panic("invalid parent"); parent->count++; r->parent = parent; rule = parent->rule; } r->id = *id; r->expire = time_uptime + V_dyn_syn_lifetime; r->rule = rule; r->dyn_type = dyn_type; r->pcnt = r->bcnt = 0; r->count = 0; r->bucket = i; r->next = V_ipfw_dyn_v[i]; V_ipfw_dyn_v[i] = r; V_dyn_count++; DEB({ struct in_addr da; #ifdef INET6 char src[INET6_ADDRSTRLEN]; char dst[INET6_ADDRSTRLEN]; #else char src[INET_ADDRSTRLEN]; char dst[INET_ADDRSTRLEN]; #endif #ifdef INET6 if (IS_IP6_FLOW_ID(&(r->id))) { ip6_sprintf(src, &r->id.src_ip6); ip6_sprintf(dst, &r->id.dst_ip6); } else #endif { da.s_addr = htonl(r->id.src_ip); inet_ntoa_r(da, src); da.s_addr = htonl(r->id.dst_ip); inet_ntoa_r(da, dst); } printf("ipfw: add dyn entry ty %d %s %d -> %s %d, total %d\n", dyn_type, src, r->id.src_port, dst, r->id.dst_port, V_dyn_count); }) return r; } /** * lookup dynamic parent rule using pkt and rule as search keys. * If the lookup fails, then install one. */ static ipfw_dyn_rule * lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule) { ipfw_dyn_rule *q; int i; IPFW_DYN_LOCK_ASSERT(); if (V_ipfw_dyn_v) { int is_v6 = IS_IP6_FLOW_ID(pkt); i = hash_packet( pkt ); for (q = V_ipfw_dyn_v[i] ; q != NULL ; q=q->next) if (q->dyn_type == O_LIMIT_PARENT && rule== q->rule && pkt->proto == q->id.proto && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port && ( (is_v6 && IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), &(q->id.src_ip6)) && IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), &(q->id.dst_ip6))) || (!is_v6 && pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip) ) ) { q->expire = time_uptime + V_dyn_short_lifetime; DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);) return q; } } return add_dyn_rule(pkt, O_LIMIT_PARENT, rule); } /** * Install dynamic state for rule type cmd->o.opcode * * Returns 1 (failure) if state is not installed because of errors or because * session limitations are enforced. */ int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg) { static int last_log; ipfw_dyn_rule *q; struct in_addr da; #ifdef INET6 char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; #else char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; #endif src[0] = '\0'; dst[0] = '\0'; IPFW_DYN_LOCK(); DEB( #ifdef INET6 if (IS_IP6_FLOW_ID(&(args->f_id))) { ip6_sprintf(src, &args->f_id.src_ip6); ip6_sprintf(dst, &args->f_id.dst_ip6); } else #endif { da.s_addr = htonl(args->f_id.src_ip); inet_ntoa_r(da, src); da.s_addr = htonl(args->f_id.dst_ip); inet_ntoa_r(da, dst); } printf("ipfw: %s: type %d %s %u -> %s %u\n", __func__, cmd->o.opcode, src, args->f_id.src_port, dst, args->f_id.dst_port); src[0] = '\0'; dst[0] = '\0'; ) q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL); if (q != NULL) { /* should never occur */ if (last_log != time_uptime) { last_log = time_uptime; printf("ipfw: %s: entry already present, done\n", __func__); } IPFW_DYN_UNLOCK(); return (0); } if (V_dyn_count >= V_dyn_max) /* Run out of slots, try to remove any expired rule. */ remove_dyn_rule(NULL, (ipfw_dyn_rule *)1); if (V_dyn_count >= V_dyn_max) { if (last_log != time_uptime) { last_log = time_uptime; printf("ipfw: %s: Too many dynamic rules\n", __func__); } IPFW_DYN_UNLOCK(); return (1); /* cannot install, notify caller */ } switch (cmd->o.opcode) { case O_KEEP_STATE: /* bidir rule */ add_dyn_rule(&args->f_id, O_KEEP_STATE, rule); break; case O_LIMIT: { /* limit number of sessions */ struct ipfw_flow_id id; ipfw_dyn_rule *parent; uint32_t conn_limit; uint16_t limit_mask = cmd->limit_mask; conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ? tablearg : cmd->conn_limit; DEB( if (cmd->conn_limit == IP_FW_TABLEARG) printf("ipfw: %s: O_LIMIT rule, conn_limit: %u " "(tablearg)\n", __func__, conn_limit); else printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n", __func__, conn_limit); ) id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0; id.proto = args->f_id.proto; id.addr_type = args->f_id.addr_type; id.fib = M_GETFIB(args->m); if (IS_IP6_FLOW_ID (&(args->f_id))) { if (limit_mask & DYN_SRC_ADDR) id.src_ip6 = args->f_id.src_ip6; if (limit_mask & DYN_DST_ADDR) id.dst_ip6 = args->f_id.dst_ip6; } else { if (limit_mask & DYN_SRC_ADDR) id.src_ip = args->f_id.src_ip; if (limit_mask & DYN_DST_ADDR) id.dst_ip = args->f_id.dst_ip; } if (limit_mask & DYN_SRC_PORT) id.src_port = args->f_id.src_port; if (limit_mask & DYN_DST_PORT) id.dst_port = args->f_id.dst_port; if ((parent = lookup_dyn_parent(&id, rule)) == NULL) { printf("ipfw: %s: add parent failed\n", __func__); IPFW_DYN_UNLOCK(); return (1); } if (parent->count >= conn_limit) { /* See if we can remove some expired rule. */ remove_dyn_rule(rule, parent); if (parent->count >= conn_limit) { if (V_fw_verbose && last_log != time_uptime) { last_log = time_uptime; #ifdef INET6 /* * XXX IPv6 flows are not * supported yet. */ if (IS_IP6_FLOW_ID(&(args->f_id))) { char ip6buf[INET6_ADDRSTRLEN]; snprintf(src, sizeof(src), "[%s]", ip6_sprintf(ip6buf, &args->f_id.src_ip6)); snprintf(dst, sizeof(dst), "[%s]", ip6_sprintf(ip6buf, &args->f_id.dst_ip6)); } else #endif { da.s_addr = htonl(args->f_id.src_ip); inet_ntoa_r(da, src); da.s_addr = htonl(args->f_id.dst_ip); inet_ntoa_r(da, dst); } log(LOG_SECURITY | LOG_DEBUG, "ipfw: %d %s %s:%u -> %s:%u, %s\n", parent->rule->rulenum, "drop session", src, (args->f_id.src_port), dst, (args->f_id.dst_port), "too many entries"); } IPFW_DYN_UNLOCK(); return (1); } } add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent); break; } default: printf("ipfw: %s: unknown dynamic rule type %u\n", __func__, cmd->o.opcode); IPFW_DYN_UNLOCK(); return (1); } /* XXX just set lifetime */ lookup_dyn_rule_locked(&args->f_id, NULL, NULL); IPFW_DYN_UNLOCK(); return (0); } /* * Generate a TCP packet, containing either a RST or a keepalive. * When flags & TH_RST, we are sending a RST packet, because of a * "reset" action matched the packet. * Otherwise we are sending a keepalive, and flags & TH_ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required * so that MAC can label the reply appropriately. */ struct mbuf * ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) { struct mbuf *m = NULL; /* stupid compiler */ int len, dir; struct ip *h = NULL; /* stupid compiler */ #ifdef INET6 struct ip6_hdr *h6 = NULL; #endif struct tcphdr *th = NULL; MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) return (NULL); M_SETFIB(m, id->fib); #ifdef MAC if (replyto != NULL) mac_netinet_firewall_reply(replyto, m); else mac_netinet_firewall_send(m); #else (void)replyto; /* don't warn about unused arg */ #endif switch (id->addr_type) { case 4: len = sizeof(struct ip) + sizeof(struct tcphdr); break; #ifdef INET6 case 6: len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); break; #endif default: /* XXX: log me?!? */ FREE_PKT(m); return (NULL); } dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN); m->m_data += max_linkhdr; m->m_flags |= M_SKIP_FIREWALL; m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; bzero(m->m_data, len); switch (id->addr_type) { case 4: h = mtod(m, struct ip *); /* prepare for checksum */ h->ip_p = IPPROTO_TCP; h->ip_len = htons(sizeof(struct tcphdr)); if (dir) { h->ip_src.s_addr = htonl(id->src_ip); h->ip_dst.s_addr = htonl(id->dst_ip); } else { h->ip_src.s_addr = htonl(id->dst_ip); h->ip_dst.s_addr = htonl(id->src_ip); } th = (struct tcphdr *)(h + 1); break; #ifdef INET6 case 6: h6 = mtod(m, struct ip6_hdr *); /* prepare for checksum */ h6->ip6_nxt = IPPROTO_TCP; h6->ip6_plen = htons(sizeof(struct tcphdr)); if (dir) { h6->ip6_src = id->src_ip6; h6->ip6_dst = id->dst_ip6; } else { h6->ip6_src = id->dst_ip6; h6->ip6_dst = id->src_ip6; } th = (struct tcphdr *)(h6 + 1); break; #endif } if (dir) { th->th_sport = htons(id->src_port); th->th_dport = htons(id->dst_port); } else { th->th_sport = htons(id->dst_port); th->th_dport = htons(id->src_port); } th->th_off = sizeof(struct tcphdr) >> 2; if (flags & TH_RST) { if (flags & TH_ACK) { th->th_seq = htonl(ack); th->th_flags = TH_RST; } else { if (flags & TH_SYN) seq++; th->th_ack = htonl(seq); th->th_flags = TH_RST | TH_ACK; } } else { /* * Keepalive - use caller provided sequence numbers */ th->th_seq = htonl(seq); th->th_ack = htonl(ack); th->th_flags = TH_ACK; } switch (id->addr_type) { case 4: th->th_sum = in_cksum(m, len); /* finish the ip header */ h->ip_v = 4; h->ip_hl = sizeof(*h) >> 2; h->ip_tos = IPTOS_LOWDELAY; h->ip_off = 0; /* ip_len must be in host format for ip_output */ h->ip_len = len; h->ip_ttl = V_ip_defttl; h->ip_sum = 0; break; #ifdef INET6 case 6: th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6), sizeof(struct tcphdr)); /* finish the ip6 header */ h6->ip6_vfc |= IPV6_VERSION; h6->ip6_hlim = IPV6_DEFHLIM; break; #endif } return (m); } /* * This procedure is only used to handle keepalives. It is invoked * every dyn_keepalive_period */ /* dummynet() and ipfw_tick() can't be static in windows */ void ipfw_tick(void * vnetx) { struct mbuf *m0, *m, *mnext, **mtailp; #ifdef INET6 struct mbuf *m6, **m6_tailp; #endif int i; ipfw_dyn_rule *q; #ifdef VIMAGE struct vnet *vp = vnetx; #endif CURVNET_SET(vp); if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0) goto done; /* * We make a chain of packets to go out here -- not deferring * until after we drop the IPFW dynamic rule lock would result * in a lock order reversal with the normal packet input -> ipfw * call stack. */ m0 = NULL; mtailp = &m0; #ifdef INET6 m6 = NULL; m6_tailp = &m6; #endif IPFW_DYN_LOCK(); for (i = 0 ; i < V_curr_dyn_buckets ; i++) { for (q = V_ipfw_dyn_v[i] ; q ; q = q->next ) { if (q->dyn_type == O_LIMIT_PARENT) continue; if (q->id.proto != IPPROTO_TCP) continue; if ( (q->state & BOTH_SYN) != BOTH_SYN) continue; if (TIME_LEQ(time_uptime + V_dyn_keepalive_interval, q->expire)) continue; /* too early */ if (TIME_LEQ(q->expire, time_uptime)) continue; /* too late, rule expired */ m = ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN); mnext = ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1, q->ack_rev, 0); switch (q->id.addr_type) { case 4: if (m != NULL) { *mtailp = m; mtailp = &(*mtailp)->m_nextpkt; } if (mnext != NULL) { *mtailp = mnext; mtailp = &(*mtailp)->m_nextpkt; } break; #ifdef INET6 case 6: if (m != NULL) { *m6_tailp = m; m6_tailp = &(*m6_tailp)->m_nextpkt; } if (mnext != NULL) { *m6_tailp = mnext; m6_tailp = &(*m6_tailp)->m_nextpkt; } break; #endif } m = mnext = NULL; } } IPFW_DYN_UNLOCK(); for (m = mnext = m0; m != NULL; m = mnext) { mnext = m->m_nextpkt; m->m_nextpkt = NULL; ip_output(m, NULL, NULL, 0, NULL, NULL); } #ifdef INET6 for (m = mnext = m6; m != NULL; m = mnext) { mnext = m->m_nextpkt; m->m_nextpkt = NULL; ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } #endif done: callout_reset_on(&V_ipfw_timeout, V_dyn_keepalive_period * hz, ipfw_tick, vnetx, 0); CURVNET_RESTORE(); } void ipfw_dyn_attach(void) { ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); IPFW_DYN_LOCK_INIT(); } void ipfw_dyn_detach(void) { uma_zdestroy(ipfw_dyn_rule_zone); IPFW_DYN_LOCK_DESTROY(); } void ipfw_dyn_init(void) { V_ipfw_dyn_v = NULL; V_dyn_buckets = 256; /* must be power of 2 */ V_curr_dyn_buckets = 256; /* must be power of 2 */ V_dyn_ack_lifetime = 300; V_dyn_syn_lifetime = 20; V_dyn_fin_lifetime = 1; V_dyn_rst_lifetime = 1; V_dyn_udp_lifetime = 10; V_dyn_short_lifetime = 5; V_dyn_keepalive_interval = 20; V_dyn_keepalive_period = 5; V_dyn_keepalive = 1; /* do send keepalives */ V_dyn_max = 4096; /* max # of dynamic rules */ callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE); callout_reset_on(&V_ipfw_timeout, hz, ipfw_tick, curvnet, 0); } void ipfw_dyn_uninit(int pass) { if (pass == 0) callout_drain(&V_ipfw_timeout); else { if (V_ipfw_dyn_v != NULL) free(V_ipfw_dyn_v, M_IPFW); } } int ipfw_dyn_len(void) { return (V_ipfw_dyn_v == NULL) ? 0 : (V_dyn_count * sizeof(ipfw_dyn_rule)); } void ipfw_get_dynamic(char **pbp, const char *ep) { ipfw_dyn_rule *p, *last = NULL; char *bp; int i; if (V_ipfw_dyn_v == NULL) return; bp = *pbp; IPFW_DYN_LOCK(); for (i = 0 ; i < V_curr_dyn_buckets; i++) for (p = V_ipfw_dyn_v[i] ; p != NULL; p = p->next) { if (bp + sizeof *p <= ep) { ipfw_dyn_rule *dst = (ipfw_dyn_rule *)bp; bcopy(p, dst, sizeof *p); bcopy(&(p->rule->rulenum), &(dst->rule), sizeof(p->rule->rulenum)); /* * store set number into high word of * dst->rule pointer. */ bcopy(&(p->rule->set), (char *)&dst->rule + sizeof(p->rule->rulenum), sizeof(p->rule->set)); /* * store a non-null value in "next". * The userland code will interpret a * NULL here as a marker * for the last dynamic rule. */ bcopy(&dst, &dst->next, sizeof(dst)); last = dst; dst->expire = TIME_LEQ(dst->expire, time_uptime) ? 0 : dst->expire - time_uptime ; bp += sizeof(ipfw_dyn_rule); } } IPFW_DYN_UNLOCK(); if (last != NULL) /* mark last dynamic rule */ bzero(&last->next, sizeof(last)); *pbp = bp; } /* end of file */ ================================================ FILE: sys/netinet/ipfw/ip_fw_log.c ================================================ /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_log.c 209845 2010-07-09 11:27:33Z glebius $"); /* * Logging support for ipfw */ #if !defined(KLD_MODULE) #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_ipdn.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #endif #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include #include #include /* for IFT_ETHER */ #include /* for BPF */ #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include /* ip6_sprintf() */ #endif #ifdef MAC #include #endif /* * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 #define SNP(buf) buf, sizeof(buf) #ifdef WITHOUT_BPF void ipfw_log_bpf(int onoff) { } #else /* !WITHOUT_BPF */ static struct ifnet *log_if; /* hook to attach to bpf */ /* we use this dummy function for all ifnet callbacks */ static int log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr) { return EINVAL; } static int ipfw_log_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct route *ro) { if (m != NULL) m_freem(m); return EINVAL; } static void ipfw_log_start(struct ifnet* ifp) { panic("ipfw_log_start() must not be called"); } static const u_char ipfwbroadcastaddr[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; void ipfw_log_bpf(int onoff) { struct ifnet *ifp; if (onoff) { if (log_if) return; ifp = if_alloc(IFT_ETHER); if (ifp == NULL) return; if_initname(ifp, "ipfw", 0); ifp->if_mtu = 65536; ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_init = (void *)log_dummy; ifp->if_ioctl = log_dummy; ifp->if_start = ipfw_log_start; ifp->if_output = ipfw_log_output; ifp->if_addrlen = 6; ifp->if_hdrlen = 14; if_attach(ifp); ifp->if_broadcastaddr = ipfwbroadcastaddr; ifp->if_baudrate = IF_Mbps(10); bpfattach(ifp, DLT_EN10MB, 14); log_if = ifp; } else { if (log_if) { ether_ifdetach(log_if); if_free(log_if); } log_if = NULL; } } #endif /* !WITHOUT_BPF */ /* * We enter here when we have a rule with O_LOG. * XXX this function alone takes about 2Kbytes of code! */ void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, struct ip *ip) { char *action; int limit_reached = 0; char action2[40], proto[128], fragment[32]; if (V_fw_verbose == 0) { #ifndef WITHOUT_BPF if (log_if == NULL || log_if->if_bpf == NULL) return; if (args->eh) /* layer2, use orig hdr */ BPF_MTAP2(log_if, args->eh, ETHER_HDR_LEN, m); else /* Add fake header. Later we will store * more info in the header. */ BPF_MTAP2(log_if, "DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m); #endif /* !WITHOUT_BPF */ return; } /* the old 'log' function */ fragment[0] = '\0'; proto[0] = '\0'; if (f == NULL) { /* bogus pkt */ if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit) return; V_norule_counter++; if (V_norule_counter == V_verbose_limit) limit_reached = V_verbose_limit; action = "Refuse"; } else { /* O_LOG is the first action, find the real one */ ipfw_insn *cmd = ACTION_PTR(f); ipfw_insn_log *l = (ipfw_insn_log *)cmd; if (l->max_log != 0 && l->log_left == 0) return; l->log_left--; if (l->log_left == 0) limit_reached = l->max_log; cmd += F_LEN(cmd); /* point to first action */ if (cmd->opcode == O_ALTQ) { ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; snprintf(SNPARGS(action2, 0), "Altq %d", altq->qid); cmd += F_LEN(cmd); } if (cmd->opcode == O_PROB) cmd += F_LEN(cmd); if (cmd->opcode == O_TAG) cmd += F_LEN(cmd); action = action2; switch (cmd->opcode) { case O_DENY: action = "Deny"; break; case O_REJECT: if (cmd->arg1==ICMP_REJECT_RST) action = "Reset"; else if (cmd->arg1==ICMP_UNREACH_HOST) action = "Reject"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_UNREACH6: if (cmd->arg1==ICMP6_UNREACH_RST) action = "Reset"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_ACCEPT: action = "Accept"; break; case O_COUNT: action = "Count"; break; case O_DIVERT: snprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1); break; case O_TEE: snprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1); break; case O_SETFIB: snprintf(SNPARGS(action2, 0), "SetFib %d", cmd->arg1); break; case O_SKIPTO: snprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1); break; case O_PIPE: snprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1); break; case O_QUEUE: snprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1); break; case O_FORWARD_IP: { ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; int len; struct in_addr dummyaddr; if (sa->sa.sin_addr.s_addr == INADDR_ANY) dummyaddr.s_addr = htonl(tablearg); else dummyaddr.s_addr = sa->sa.sin_addr.s_addr; len = snprintf(SNPARGS(action2, 0), "Forward to %s", inet_ntoa(dummyaddr)); if (sa->sa.sin_port) snprintf(SNPARGS(action2, len), ":%d", sa->sa.sin_port); } break; case O_NETGRAPH: snprintf(SNPARGS(action2, 0), "Netgraph %d", cmd->arg1); break; case O_NGTEE: snprintf(SNPARGS(action2, 0), "Ngtee %d", cmd->arg1); break; case O_NAT: action = "Nat"; break; case O_REASS: action = "Reass"; break; default: action = "UNKNOWN"; break; } } if (hlen == 0) { /* non-ip */ snprintf(SNPARGS(proto, 0), "MAC"); } else { int len; #ifdef INET6 char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; #else char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; #endif struct icmphdr *icmp; struct tcphdr *tcp; struct udphdr *udp; #ifdef INET6 struct ip6_hdr *ip6 = NULL; struct icmp6_hdr *icmp6; #endif src[0] = '\0'; dst[0] = '\0'; #ifdef INET6 if (IS_IP6_FLOW_ID(&(args->f_id))) { char ip6buf[INET6_ADDRSTRLEN]; snprintf(src, sizeof(src), "[%s]", ip6_sprintf(ip6buf, &args->f_id.src_ip6)); snprintf(dst, sizeof(dst), "[%s]", ip6_sprintf(ip6buf, &args->f_id.dst_ip6)); ip6 = (struct ip6_hdr *)ip; tcp = (struct tcphdr *)(((char *)ip) + hlen); udp = (struct udphdr *)(((char *)ip) + hlen); } else #endif { tcp = L3HDR(struct tcphdr, ip); udp = L3HDR(struct udphdr, ip); inet_ntoa_r(ip->ip_src, src); inet_ntoa_r(ip->ip_dst, dst); } switch (args->f_id.proto) { case IPPROTO_TCP: len = snprintf(SNPARGS(proto, 0), "TCP %s", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(tcp->th_sport), dst, ntohs(tcp->th_dport)); else snprintf(SNPARGS(proto, len), " %s", dst); break; case IPPROTO_UDP: len = snprintf(SNPARGS(proto, 0), "UDP %s", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(udp->uh_sport), dst, ntohs(udp->uh_dport)); else snprintf(SNPARGS(proto, len), " %s", dst); break; case IPPROTO_ICMP: icmp = L3HDR(struct icmphdr, ip); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMP:%u.%u ", icmp->icmp_type, icmp->icmp_code); else len = snprintf(SNPARGS(proto, 0), "ICMP "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #ifdef INET6 case IPPROTO_ICMPV6: icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMPv6:%u.%u ", icmp6->icmp6_type, icmp6->icmp6_code); else len = snprintf(SNPARGS(proto, 0), "ICMPv6 "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #endif default: len = snprintf(SNPARGS(proto, 0), "P:%d %s", args->f_id.proto, src); snprintf(SNPARGS(proto, len), " %s", dst); break; } #ifdef INET6 if (IS_IP6_FLOW_ID(&(args->f_id))) { if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) snprintf(SNPARGS(fragment, 0), " (frag %08x:%d@%d%s)", args->f_id.extra, ntohs(ip6->ip6_plen) - hlen, ntohs(offset & IP6F_OFF_MASK) << 3, (offset & IP6F_MORE_FRAG) ? "+" : ""); } else #endif { int ipoff, iplen; ipoff = ntohs(ip->ip_off); iplen = ntohs(ip->ip_len); if (ipoff & (IP_MF | IP_OFFMASK)) snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)", ntohs(ip->ip_id), iplen - (ip->ip_hl << 2), offset << 3, (ipoff & IP_MF) ? "+" : ""); } } #ifdef __FreeBSD__ if (oif || m->m_pkthdr.rcvif) log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s %s via %s%s\n", f ? f->rulenum : -1, action, proto, oif ? "out" : "in", oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname, fragment); else #endif log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s [no if info]%s\n", f ? f->rulenum : -1, action, proto, fragment); if (limit_reached) log(LOG_SECURITY | LOG_NOTICE, "ipfw: limit %d reached on entry %d\n", limit_reached, f ? f->rulenum : -1); } /* end of file */ ================================================ FILE: sys/netinet/ipfw/ip_fw_lookup.c ================================================ /*- * Copyright (c) 2009 Luigi Rizzo Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_fw_table.c 200601 2009-12-16 10:48:40Z luigi $"); /* * Rule and pipe lookup support for ipfw. * ipfw and dummynet need to quickly find objects (rules, pipes) that may be dynamically created or destroyed. To address the problem, we label each new object with a unique 32-bit identifier whose low K bits are the index in a lookup table. All existing objects are referred by the lookup table, and identifiers are chosen so that for each slot there is at most one active object (whose identifier points to the slot). This is almost a hash table, except that we can pick the identifiers after looking at the table's occupation so we have a trivial hash function and are collision free. With this structure, operations are very fast and simple: - the table has N entries s[i] with two fields, 'id' and 'ptr', with N <= M = 2^k (M is an upper bound to the size of the table); - initially, all slots have s[i].id = i, and the pointers are used to build a freelist (tailq). - a slot is considered empty if ptr == NULL or s[0] <= ptr < s[N]. This is easy to detect and we can use ptr to build the freelist. - when a new object is created, we put it in the empty slot i at the head of the freelist, and set the id to s[i].id; - when an object is destroyed, we append its slot i to the end of the freelist, and set s[i].id += M (note M, not N). - on a lookup for id = X, we look at slot i = X & (M-1), and consider the lookup successful only if the slot is not empty and s[i].id == X; - wraps occur at most every F * 2^32/M operations, where F is the number of free slots. Because F is usually a reasonable fraction of M, we should not worry too much. - if the table fills up, we can extend it by increasing N - shrinking the table is more difficult as we might create collisions during the rehashing. * */ #include #ifdef _KERNEL #include #include #include #include #include #include MALLOC_DEFINE(M_IPFW_LUT, "ipfw_lookup", "IpFw lookup"); #define Malloc(n) malloc(n, M_IPFW_LUT, M_WAITOK) #define Calloc(n) calloc(n, M_IPFW_LUT, M_WAITOK | M_ZERO) #define Free(p) free(p, M_IPFW_LUT) #define log(x, arg...) #else /* !_KERNEL */ #include #include #include #include #define Malloc(n) malloc(n) #define Calloc(n) calloc(1, n) #define Free(p) free(p) #define log(x, arg...) fprintf(stderr, "%s: " x "\n", __FUNCTION__, ##arg) #endif /* !_KERNEL */ struct entry { uint32_t id; struct entry *ptr; }; struct lookup_table { int _size; int used; int mask; /* 2^k -1, used for hashing */ struct entry *f_head, *f_tail; /* freelist */ struct entry * s; /* slots, array of N entries */ }; static __inline int empty(struct lookup_table *head, const void *p) { const struct entry *ep = p; return (ep == NULL || (ep >= head->s && ep < &head->s[head->_size])); } /* * init or reinit a table */ struct lookup_table * ipfw_lut_init(struct lookup_table *head, int new_size, int mask) { int i; struct entry *s; /* the new slots */ struct entry *fh, *ft; /* the freelist */ if (head != NULL) { mask = head->mask; if (new_size <= head->_size) return head; if (new_size >= mask+1) { log("size larger than mask"); return NULL; } } else { log("old is null, initialize"); head = Calloc(sizeof(*head)); if (head == NULL) return NULL; if (new_size >= mask) mask = new_size; if (mask & (mask -1)) { for (i = 1; i < mask; i += i) ; log("mask %d not 2^k, round up to %d", mask, i); mask = i; } mask = head->mask = mask - 1; } s = Calloc(new_size * sizeof(*s)); if (s == NULL) return NULL; if (!head->s) { head->s = s; head->_size = 1; } fh = ft = NULL; /* remap the entries, adjust the freelist */ for (i = 0; i < new_size; i++) { s[i].id = (i >= head->_size) ? i : head->s[i].id; if (i < head->_size && !empty(head, head->s[i].ptr)) { s[i].ptr = head->s[i].ptr; continue; } if (fh == NULL) fh = &s[i]; else ft->ptr = &s[i]; ft = &s[i]; } head->f_head = fh; head->f_tail = ft; /* write lock on the structure, to protect the readers */ fh = head->s; head->s = s; head->_size = new_size; /* release write lock */ if (fh != s) Free(fh); log("done"); return head; } /* insert returns the id */ int ipfw_lut_insert(struct lookup_table *head, void *d) { struct entry *e; e = head->f_head; if (e == NULL) return -1; head->f_head = e->ptr; e->ptr = d; head->used++; return e->id; } /* delete, returns the original entry */ void * ipfw_lut_delete(struct lookup_table *head, int id) { int i = id & head->mask; void *result; struct entry *e; if (i >= head->_size) return NULL; e = &head->s[i]; if (e->id != id) return NULL; result = e->ptr; /* write lock to invalidate the entry to readers */ e->id += head->mask + 1; /* prepare for next insert */ e->ptr = NULL; /* release write lock */ if (head->f_head == NULL) head->f_head = e; else head->f_tail->ptr = e; head->f_tail = e; head->used--; return result; } void * ipfw_lut_lookup(struct lookup_table *head, int id) { int i = id & head->mask; struct entry *e; if (i >= head->_size) return NULL; e = &head->s[i]; return (e->id == id) ? e->ptr : NULL; } void ipfw_lut_dump(struct lookup_table *head) { int i; log("head %p size %d used %d freelist %d", head, head->_size, head->used, head->f_head ? head->f_head - head->s : -1); for (i = 0; i < head->_size; i++) { struct entry *e = &head->s[i]; char ee = empty(head, e->ptr) ? 'E' : ' '; log("%5d %5d %c %p", i, e->id, ee, ee == 'E' && e->ptr != NULL ? (void *)((struct entry *)e->ptr - head->s) : e->ptr); } } #ifndef _KERNEL void dump_p(struct lookup_table *p, int *map) { int i; for (i = 0; i < p->_size; i++) { int id = (int)ipfw_lut_lookup(p, map[i]); log("%3d: %3d: %c", map[i] % 64, i, id); } } int main(int argc, char *argv[]) { int i, j, l; #define S 1000 int map[S]; struct lookup_table *p; struct lookup_table *p1; const char *m = "nel mezzo del cammin di nostra vita mi ritrovai" " in una selva oscura e la diritta via era smarrita!"; fprintf(stderr, "testing lookup\n"); l = strlen(m); p = ipfw_lut_init(NULL, 120, 33); ipfw_lut_dump(p); for (i = 0; i < l; i++) { int x = m[i]; int id = ipfw_lut_insert(p, (void *)x); //ipfw_lut_dump(p); map[i] = id; for (j=0; j < 10; j++) { id = ipfw_lut_insert(p, (void *)'a'); // ipfw_lut_dump(p); ipfw_lut_delete(p, id); // ipfw_lut_dump(p); } // ipfw_lut_dump(p); } dump_p(p, map); p1 = ipfw_lut_init(p, 23, 0); if (!p1) return 1; dump_p(p1, map); p1 = ipfw_lut_init(p1, 120, 0); if (!p1) return 1; dump_p(p1, map); return 0; } #endif /* end of file */ ================================================ FILE: sys/netinet/ipfw/ip_fw_nat.c ================================================ /*- * Copyright (c) 2008 Paolo Pisati * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_fw_nat.c 200975 2009-12-25 01:15:39Z luigi $"); #include #include #include #include #include #include #include #include #define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */ #include #include #include #include #include #include #include #include #include #include #include /* XXX for in_cksum */ static VNET_DEFINE(eventhandler_tag, ifaddr_event_tag); #define V_ifaddr_event_tag VNET(ifaddr_event_tag) static void ifaddr_change(void *arg, struct ifnet *ifp) { struct cfg_nat *ptr; struct ifaddr *ifa; struct ip_fw_chain *chain; (void)arg; chain = &V_layer3_chain; IPFW_WLOCK(chain); /* Check every nat entry... */ LIST_FOREACH(ptr, &chain->nat, _next) { /* ...using nic 'ifp->if_xname' as dynamic alias address. */ if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0) continue; if_addr_rlock(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL) continue; if (ifa->ifa_addr->sa_family != AF_INET) continue; ptr->ip = ((struct sockaddr_in *) (ifa->ifa_addr))->sin_addr; LibAliasSetAddress(ptr->lib, ptr->ip); } if_addr_runlock(ifp); } IPFW_WUNLOCK(chain); } /* * delete the pointers for nat entry ix, or all of them if ix < 0 */ static void flush_nat_ptrs(struct ip_fw_chain *chain, const int ix) { int i; ipfw_insn_nat *cmd; IPFW_WLOCK_ASSERT(chain); for (i = 0; i < chain->n_rules; i++) { cmd = (ipfw_insn_nat *)ACTION_PTR(chain->map[i]); /* XXX skip log and the like ? */ if (cmd->o.opcode == O_NAT && cmd->nat != NULL && (ix < 0 || cmd->nat->id == ix)) cmd->nat = NULL; } } static void del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head) { struct cfg_redir *r, *tmp_r; struct cfg_spool *s, *tmp_s; int i, num; LIST_FOREACH_SAFE(r, head, _next, tmp_r) { num = 1; /* Number of alias_link to delete. */ switch (r->mode) { case REDIR_PORT: num = r->pport_cnt; /* FALLTHROUGH */ case REDIR_ADDR: case REDIR_PROTO: /* Delete all libalias redirect entry. */ for (i = 0; i < num; i++) LibAliasRedirectDelete(n->lib, r->alink[i]); /* Del spool cfg if any. */ LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) { LIST_REMOVE(s, _next); free(s, M_IPFW); } free(r->alink, M_IPFW); LIST_REMOVE(r, _next); free(r, M_IPFW); break; default: printf("unknown redirect mode: %u\n", r->mode); /* XXX - panic?!?!? */ break; } } } static int add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) { struct cfg_redir *r, *ser_r; struct cfg_spool *s, *ser_s; int cnt, off, i; for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) { ser_r = (struct cfg_redir *)&buf[off]; r = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO); memcpy(r, ser_r, SOF_REDIR); LIST_INIT(&r->spool_chain); off += SOF_REDIR; r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt, M_IPFW, M_WAITOK | M_ZERO); switch (r->mode) { case REDIR_ADDR: r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr, r->paddr); break; case REDIR_PORT: for (i = 0 ; i < r->pport_cnt; i++) { /* If remotePort is all ports, set it to 0. */ u_short remotePortCopy = r->rport + i; if (r->rport_cnt == 1 && r->rport == 0) remotePortCopy = 0; r->alink[i] = LibAliasRedirectPort(ptr->lib, r->laddr, htons(r->lport + i), r->raddr, htons(remotePortCopy), r->paddr, htons(r->pport + i), r->proto); if (r->alink[i] == NULL) { r->alink[0] = NULL; break; } } break; case REDIR_PROTO: r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr, r->raddr, r->paddr, r->proto); break; default: printf("unknown redirect mode: %u\n", r->mode); break; } /* XXX perhaps return an error instead of panic ? */ if (r->alink[0] == NULL) panic("LibAliasRedirect* returned NULL"); /* LSNAT handling. */ for (i = 0; i < r->spool_cnt; i++) { ser_s = (struct cfg_spool *)&buf[off]; s = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO); memcpy(s, ser_s, SOF_SPOOL); LibAliasAddServer(ptr->lib, r->alink[0], s->addr, htons(s->port)); off += SOF_SPOOL; /* Hook spool entry. */ LIST_INSERT_HEAD(&r->spool_chain, s, _next); } /* And finally hook this redir entry. */ LIST_INSERT_HEAD(&ptr->redir_chain, r, _next); } return (1); } static int ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) { struct mbuf *mcl; struct ip *ip; /* XXX - libalias duct tape */ int ldt, retval; char *c; ldt = 0; retval = 0; mcl = m_megapullup(m, m->m_pkthdr.len); if (mcl == NULL) { args->m = NULL; return (IP_FW_DENY); } ip = mtod(mcl, struct ip *); /* * XXX - Libalias checksum offload 'duct tape': * * locally generated packets have only pseudo-header checksum * calculated and libalias will break it[1], so mark them for * later fix. Moreover there are cases when libalias modifies * tcp packet data[2], mark them for later fix too. * * [1] libalias was never meant to run in kernel, so it does * not have any knowledge about checksum offloading, and * expects a packet with a full internet checksum. * Unfortunately, packets generated locally will have just the * pseudo header calculated, and when libalias tries to adjust * the checksum it will actually compute a wrong value. * * [2] when libalias modifies tcp's data content, full TCP * checksum has to be recomputed: the problem is that * libalias does not have any idea about checksum offloading. * To work around this, we do not do checksumming in LibAlias, * but only mark the packets in th_x2 field. If we receive a * marked packet, we calculate correct checksum for it * aware of offloading. Why such a terrible hack instead of * recalculating checksum for each packet? * Because the previous checksum was not checked! * Recalculating checksums for EVERY packet will hide ALL * transmission errors. Yes, marked packets still suffer from * this problem. But, sigh, natd(8) has this problem, too. * * TODO: -make libalias mbuf aware (so * it can handle delayed checksum and tso) */ if (mcl->m_pkthdr.rcvif == NULL && mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) ldt = 1; c = mtod(mcl, char *); if (args->oif == NULL) retval = LibAliasIn(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl)); else retval = LibAliasOut(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl)); if (retval == PKT_ALIAS_RESPOND) { m->m_flags |= M_SKIP_FIREWALL; retval = PKT_ALIAS_OK; } if (retval != PKT_ALIAS_OK && retval != PKT_ALIAS_FOUND_HEADER_FRAGMENT) { /* XXX - should i add some logging? */ m_free(mcl); args->m = NULL; return (IP_FW_DENY); } mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len); /* * XXX - libalias checksum offload * 'duct tape' (see above) */ if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && ip->ip_p == IPPROTO_TCP) { struct tcphdr *th; th = (struct tcphdr *)(ip + 1); if (th->th_x2) ldt = 1; } if (ldt) { struct tcphdr *th; struct udphdr *uh; u_short cksum; ip->ip_len = ntohs(ip->ip_len); cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ip->ip_p + ip->ip_len - (ip->ip_hl << 2))); switch (ip->ip_p) { case IPPROTO_TCP: th = (struct tcphdr *)(ip + 1); /* * Maybe it was set in * libalias... */ th->th_x2 = 0; th->th_sum = cksum; mcl->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); break; case IPPROTO_UDP: uh = (struct udphdr *)(ip + 1); uh->uh_sum = cksum; mcl->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); break; } /* No hw checksum offloading: do it ourselves */ if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) { in_delayed_cksum(mcl); mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } ip->ip_len = htons(ip->ip_len); } args->m = mcl; return (IP_FW_NAT); } static struct cfg_nat * lookup_nat(struct nat_list *l, int nat_id) { struct cfg_nat *res; LIST_FOREACH(res, l, _next) { if (res->id == nat_id) break; } return res; } static int ipfw_nat_cfg(struct sockopt *sopt) { struct cfg_nat *ptr, *ser_n; char *buf; struct ip_fw_chain *chain = &V_layer3_chain; buf = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO); sooptcopyin(sopt, buf, NAT_BUF_LEN, sizeof(struct cfg_nat)); ser_n = (struct cfg_nat *)buf; /* check valid parameter ser_n->id > 0 ? */ /* * Find/create nat rule. */ IPFW_WLOCK(chain); ptr = lookup_nat(&chain->nat, ser_n->id); if (ptr == NULL) { /* New rule: allocate and init new instance. */ ptr = malloc(sizeof(struct cfg_nat), M_IPFW, M_NOWAIT | M_ZERO); if (ptr == NULL) { IPFW_WUNLOCK(chain); free(buf, M_IPFW); return (ENOSPC); } ptr->lib = LibAliasInit(NULL); if (ptr->lib == NULL) { IPFW_WUNLOCK(chain); free(ptr, M_IPFW); free(buf, M_IPFW); return (EINVAL); } LIST_INIT(&ptr->redir_chain); } else { /* Entry already present: temporarly unhook it. */ LIST_REMOVE(ptr, _next); flush_nat_ptrs(chain, ser_n->id); } IPFW_WUNLOCK(chain); /* * Basic nat configuration. */ ptr->id = ser_n->id; /* * XXX - what if this rule doesn't nat any ip and just * redirect? * do we set aliasaddress to 0.0.0.0? */ ptr->ip = ser_n->ip; ptr->redir_cnt = ser_n->redir_cnt; ptr->mode = ser_n->mode; LibAliasSetMode(ptr->lib, ser_n->mode, ser_n->mode); LibAliasSetAddress(ptr->lib, ptr->ip); memcpy(ptr->if_name, ser_n->if_name, IF_NAMESIZE); /* * Redir and LSNAT configuration. */ /* Delete old cfgs. */ del_redir_spool_cfg(ptr, &ptr->redir_chain); /* Add new entries. */ add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr); free(buf, M_IPFW); IPFW_WLOCK(chain); LIST_INSERT_HEAD(&chain->nat, ptr, _next); IPFW_WUNLOCK(chain); return (0); } static int ipfw_nat_del(struct sockopt *sopt) { struct cfg_nat *ptr; struct ip_fw_chain *chain = &V_layer3_chain; int i; sooptcopyin(sopt, &i, sizeof i, sizeof i); /* XXX validate i */ IPFW_WLOCK(chain); ptr = lookup_nat(&chain->nat, i); if (ptr == NULL) { IPFW_WUNLOCK(chain); return (EINVAL); } LIST_REMOVE(ptr, _next); flush_nat_ptrs(chain, i); IPFW_WUNLOCK(chain); del_redir_spool_cfg(ptr, &ptr->redir_chain); LibAliasUninit(ptr->lib); free(ptr, M_IPFW); return (0); } static int ipfw_nat_get_cfg(struct sockopt *sopt) { uint8_t *data; struct cfg_nat *n; struct cfg_redir *r; struct cfg_spool *s; int nat_cnt, off; struct ip_fw_chain *chain; int err = ENOSPC; chain = &V_layer3_chain; nat_cnt = 0; off = sizeof(nat_cnt); data = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO); IPFW_RLOCK(chain); /* Serialize all the data. */ LIST_FOREACH(n, &chain->nat, _next) { nat_cnt++; if (off + SOF_NAT >= NAT_BUF_LEN) goto nospace; bcopy(n, &data[off], SOF_NAT); off += SOF_NAT; LIST_FOREACH(r, &n->redir_chain, _next) { if (off + SOF_REDIR >= NAT_BUF_LEN) goto nospace; bcopy(r, &data[off], SOF_REDIR); off += SOF_REDIR; LIST_FOREACH(s, &r->spool_chain, _next) { if (off + SOF_SPOOL >= NAT_BUF_LEN) goto nospace; bcopy(s, &data[off], SOF_SPOOL); off += SOF_SPOOL; } } } err = 0; /* all good */ nospace: IPFW_RUNLOCK(chain); if (err == 0) { bcopy(&nat_cnt, data, sizeof(nat_cnt)); sooptcopyout(sopt, data, NAT_BUF_LEN); } else { printf("serialized data buffer not big enough:" "please increase NAT_BUF_LEN\n"); } free(data, M_IPFW); return (err); } static int ipfw_nat_get_log(struct sockopt *sopt) { uint8_t *data; struct cfg_nat *ptr; int i, size; struct ip_fw_chain *chain; chain = &V_layer3_chain; IPFW_RLOCK(chain); /* one pass to count, one to copy the data */ i = 0; LIST_FOREACH(ptr, &chain->nat, _next) { if (ptr->lib->logDesc == NULL) continue; i++; } size = i * (LIBALIAS_BUF_SIZE + sizeof(int)); data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO); if (data == NULL) { IPFW_RUNLOCK(chain); return (ENOSPC); } i = 0; LIST_FOREACH(ptr, &chain->nat, _next) { if (ptr->lib->logDesc == NULL) continue; bcopy(&ptr->id, &data[i], sizeof(int)); i += sizeof(int); bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE); i += LIBALIAS_BUF_SIZE; } IPFW_RUNLOCK(chain); sooptcopyout(sopt, data, size); free(data, M_IPFW); return(0); } static void ipfw_nat_init(void) { IPFW_WLOCK(&V_layer3_chain); /* init ipfw hooks */ ipfw_nat_ptr = ipfw_nat; lookup_nat_ptr = lookup_nat; ipfw_nat_cfg_ptr = ipfw_nat_cfg; ipfw_nat_del_ptr = ipfw_nat_del; ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg; ipfw_nat_get_log_ptr = ipfw_nat_get_log; IPFW_WUNLOCK(&V_layer3_chain); V_ifaddr_event_tag = EVENTHANDLER_REGISTER( ifaddr_event, ifaddr_change, NULL, EVENTHANDLER_PRI_ANY); } static void ipfw_nat_destroy(void) { struct cfg_nat *ptr, *ptr_temp; struct ip_fw_chain *chain; chain = &V_layer3_chain; IPFW_WLOCK(chain); LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) { LIST_REMOVE(ptr, _next); del_redir_spool_cfg(ptr, &ptr->redir_chain); LibAliasUninit(ptr->lib); free(ptr, M_IPFW); } EVENTHANDLER_DEREGISTER(ifaddr_event, V_ifaddr_event_tag); flush_nat_ptrs(chain, -1 /* flush all */); /* deregister ipfw_nat */ ipfw_nat_ptr = NULL; lookup_nat_ptr = NULL; ipfw_nat_cfg_ptr = NULL; ipfw_nat_del_ptr = NULL; ipfw_nat_get_cfg_ptr = NULL; ipfw_nat_get_log_ptr = NULL; IPFW_WUNLOCK(chain); } static int ipfw_nat_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: ipfw_nat_init(); break; case MOD_UNLOAD: ipfw_nat_destroy(); break; default: return EOPNOTSUPP; break; } return err; } static moduledata_t ipfw_nat_mod = { "ipfw_nat", ipfw_nat_modevent, 0 }; DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1); MODULE_DEPEND(ipfw_nat, ipfw, 2, 2, 2); MODULE_VERSION(ipfw_nat, 1); /* end of file */ ================================================ FILE: sys/netinet/ipfw/ip_fw_pfil.c ================================================ /*- * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_pfil.c 200601 2009-12-16 10:48:40Z luigi $"); #if !defined(KLD_MODULE) #include "opt_ipfw.h" #include "opt_ipdn.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #endif /* KLD_MODULE */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static VNET_DEFINE(int, fw_enable) = 1; #define V_fw_enable VNET(fw_enable) #ifdef INET6 static VNET_DEFINE(int, fw6_enable) = 1; #define V_fw6_enable VNET(fw6_enable) #endif int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); /* Forward declarations. */ static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int); #ifdef SYSCTL_NODE SYSBEGIN(f1) SYSCTL_DECL(_net_inet_ip_fw); SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0, ipfw_chg_hook, "I", "Enable ipfw"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6_fw); SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0, ipfw_chg_hook, "I", "Enable ipfw+6"); #endif /* INET6 */ SYSEND #endif /* SYSCTL_NODE */ /* * The pfilter hook to pass packets to ipfw_chk and then to * dummynet, divert, netgraph or other modules. * The packet may be consumed. */ int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp) { struct ip_fw_args args; struct m_tag *tag; int ipfw; int ret; /* all the processing now uses ip_len in net format */ if (mtod(*m0, struct ip *)->ip_v == 4) SET_NET_IPLEN(mtod(*m0, struct ip *)); /* convert dir to IPFW values */ dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT; bzero(&args, sizeof(args)); again: /* * extract and remove the tag if present. If we are left * with onepass, optimize the outgoing path. */ tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); if (tag != NULL) { args.rule = *((struct ipfw_rule_ref *)(tag+1)); m_tag_delete(*m0, tag); if (args.rule.info & IPFW_ONEPASS) { SET_HOST_IPLEN(mtod(*m0, struct ip *)); return 0; } } args.m = *m0; args.oif = dir == DIR_OUT ? ifp : NULL; args.inp = inp; ipfw = ipfw_chk(&args); *m0 = args.m; KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL", __func__)); /* breaking out of the switch means drop */ ret = 0; /* default return value for pass */ switch (ipfw) { case IP_FW_PASS: /* next_hop may be set by ipfw_chk */ if (args.next_hop == NULL) break; /* pass */ #ifndef IPFIREWALL_FORWARD ret = EACCES; #else { struct m_tag *fwd_tag; /* Incoming packets should not be tagged so we do not * m_tag_find. Outgoing packets may be tagged, so we * reuse the tag if present. */ fwd_tag = (dir == DIR_IN) ? NULL : m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag != NULL) { m_tag_unlink(*m0, fwd_tag); } else { fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD, sizeof(struct sockaddr_in), M_NOWAIT); if (fwd_tag == NULL) { ret = EACCES; break; /* i.e. drop */ } } bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in)); m_tag_prepend(*m0, fwd_tag); if (in_localip(args.next_hop->sin_addr)) (*m0)->m_flags |= M_FASTFWD_OURS; } #endif break; case IP_FW_DENY: ret = EACCES; break; /* i.e. drop */ case IP_FW_DUMMYNET: ret = EACCES; if (ip_dn_io_ptr == NULL) break; /* i.e. drop */ if (mtod(*m0, struct ip *)->ip_v == 4) ret = ip_dn_io_ptr(m0, dir, &args); else if (mtod(*m0, struct ip *)->ip_v == 6) ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args); else break; /* drop it */ /* * XXX should read the return value. * dummynet normally eats the packet and sets *m0=NULL * unless the packet can be sent immediately. In this * case args is updated and we should re-run the * check without clearing args. */ if (*m0 != NULL) goto again; break; case IP_FW_TEE: case IP_FW_DIVERT: if (ip_divert_ptr == NULL) { ret = EACCES; break; /* i.e. drop */ } ret = ipfw_divert(m0, dir, &args.rule, (ipfw == IP_FW_TEE) ? 1 : 0); /* continue processing for the original packet (tee). */ if (*m0) goto again; break; case IP_FW_NGTEE: case IP_FW_NETGRAPH: if (ng_ipfw_input_p == NULL) { ret = EACCES; break; /* i.e. drop */ } ret = ng_ipfw_input_p(m0, dir, &args, (ipfw == IP_FW_NGTEE) ? 1 : 0); if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */ goto again; /* continue with packet */ break; case IP_FW_NAT: /* honor one-pass in case of successful nat */ if (V_fw_one_pass) break; /* ret is already 0 */ goto again; case IP_FW_REASS: goto again; /* continue with packet */ default: KASSERT(0, ("%s: unknown retval", __func__)); } if (ret != 0) { if (*m0) FREE_PKT(*m0); *m0 = NULL; } if (*m0 && mtod(*m0, struct ip *)->ip_v == 4) SET_HOST_IPLEN(mtod(*m0, struct ip *)); return ret; } /* do the divert, return 1 on error 0 on success */ static int ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule, int tee) { /* * ipfw_chk() has already tagged the packet with the divert tag. * If tee is set, copy packet and return original. * If not tee, consume packet and send it to divert socket. */ struct mbuf *clone; struct ip *ip; struct m_tag *tag; /* Cloning needed for tee? */ if (tee == 0) { clone = *m0; /* use the original mbuf */ *m0 = NULL; } else { clone = m_dup(*m0, M_DONTWAIT); /* If we cannot duplicate the mbuf, we sacrifice the divert * chain and continue with the tee-ed packet. */ if (clone == NULL) return 1; } /* * Divert listeners can normally handle non-fragmented packets, * but we can only reass in the non-tee case. * This means that listeners on a tee rule may get fragments, * and have to live with that. * Note that we now have the 'reass' ipfw option so if we care * we can do it before a 'tee'. */ ip = mtod(clone, struct ip *); if (!tee && ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) { int hlen; struct mbuf *reass; SET_HOST_IPLEN(ip); /* ip_reass wants host order */ reass = ip_reass(clone); /* Reassemble packet. */ if (reass == NULL) return 0; /* not an error */ /* if reass = NULL then it was consumed by ip_reass */ /* * IP header checksum fixup after reassembly and leave header * in network byte order. */ ip = mtod(reass, struct ip *); hlen = ip->ip_hl << 2; SET_NET_IPLEN(ip); ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(reass, hlen); clone = reass; } /* attach a tag to the packet with the reinject info */ tag = m_tag_alloc(MTAG_IPFW_RULE, 0, sizeof(struct ipfw_rule_ref), M_NOWAIT); if (tag == NULL) { FREE_PKT(clone); return 1; } *((struct ipfw_rule_ref *)(tag+1)) = *rule; m_tag_prepend(clone, tag); /* Do the dirty job... */ ip_divert_ptr(clone, incoming); return 0; } /* * attach or detach hooks for a given protocol family */ static int ipfw_hook(int onoff, int pf) { struct pfil_head *pfh; pfh = pfil_head_get(PFIL_TYPE_AF, pf); if (pfh == NULL) return ENOENT; (void) (onoff ? pfil_add_hook : pfil_remove_hook) (ipfw_check_hook, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh); return 0; } int ipfw_attach_hooks(int arg) { int error = 0; if (arg == 0) /* detach */ ipfw_hook(0, AF_INET); else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) { error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */ printf("ipfw_hook() error\n"); } #ifdef INET6 if (arg == 0) /* detach */ ipfw_hook(0, AF_INET6); else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) { error = ENOENT; printf("ipfw6_hook() error\n"); } #endif return error; } int ipfw_chg_hook(SYSCTL_HANDLER_ARGS) { int enable; int oldenable; int error; int af; if (arg1 == &VNET_NAME(fw_enable)) { enable = V_fw_enable; af = AF_INET; } #ifdef INET6 else if (arg1 == &VNET_NAME(fw6_enable)) { enable = V_fw6_enable; af = AF_INET6; } #endif else return (EINVAL); oldenable = enable; error = sysctl_handle_int(oidp, &enable, 0, req); if (error) return (error); enable = (enable) ? 1 : 0; if (enable == oldenable) return (0); error = ipfw_hook(enable, af); if (error) return (error); if (af == AF_INET) V_fw_enable = enable; #ifdef INET6 else if (af == AF_INET6) V_fw6_enable = enable; #endif return (0); } /* end of file */ ================================================ FILE: sys/netinet/ipfw/ip_fw_private.h ================================================ /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD: head/sys/netinet/ipfw/ip_fw_private.h 200601 2009-12-16 10:48:40Z luigi $ */ #ifndef _IPFW2_PRIVATE_H #define _IPFW2_PRIVATE_H /* * Internal constants and data structures used by ipfw components * and not meant to be exported outside the kernel. */ #ifdef _KERNEL /* * For platforms that do not have SYSCTL support, we wrap the * SYSCTL_* into a function (one per file) to collect the values * into an array at module initialization. The wrapping macros, * SYSBEGIN() and SYSEND, are empty in the default case. */ #ifndef SYSBEGIN #define SYSBEGIN(x) #endif #ifndef SYSEND #define SYSEND #endif /* Return values from ipfw_chk() */ enum { IP_FW_PASS = 0, IP_FW_DENY, IP_FW_DIVERT, IP_FW_TEE, IP_FW_DUMMYNET, IP_FW_NETGRAPH, IP_FW_NGTEE, IP_FW_NAT, IP_FW_REASS, }; /* * Structure for collecting parameters to dummynet for ip6_output forwarding */ struct _ip6dn_args { struct ip6_pktopts *opt_or; struct route_in6 ro_or; int flags_or; struct ip6_moptions *im6o_or; struct ifnet *origifp_or; struct ifnet *ifp_or; struct sockaddr_in6 dst_or; u_long mtu_or; struct route_in6 ro_pmtu_or; }; /* * Arguments for calling ipfw_chk() and dummynet_io(). We put them * all into a structure because this way it is easier and more * efficient to pass variables around and extend the interface. */ struct ip_fw_args { struct mbuf *m; /* the mbuf chain */ struct ifnet *oif; /* output interface */ struct sockaddr_in *next_hop; /* forward address */ /* * On return, it points to the matching rule. * On entry, rule.slot > 0 means the info is valid and * contains the the starting rule for an ipfw search. * If chain_id == chain->id && slot >0 then jump to that slot. * Otherwise, we locate the first rule >= rulenum:rule_id */ struct ipfw_rule_ref rule; /* match/restart info */ struct ether_header *eh; /* for bridged packets */ struct ipfw_flow_id f_id; /* grabbed from IP header */ //uint32_t cookie; /* a cookie depending on rule action */ struct inpcb *inp; struct _ip6dn_args dummypar; /* dummynet->ip6_output */ struct sockaddr_in hopstore; /* store here if cannot use a pointer */ }; MALLOC_DECLARE(M_IPFW); /* * Hooks sometime need to know the direction of the packet * (divert, dummynet, netgraph, ...) * We use a generic definition here, with bit0-1 indicating the * direction, bit 2 indicating layer2 or 3, bit 3-4 indicating the * specific protocol * indicating the protocol (if necessary) */ enum { DIR_MASK = 0x3, DIR_OUT = 0, DIR_IN = 1, DIR_FWD = 2, DIR_DROP = 3, PROTO_LAYER2 = 0x4, /* set for layer 2 */ /* PROTO_DEFAULT = 0, */ PROTO_IPV4 = 0x08, PROTO_IPV6 = 0x10, PROTO_IFB = 0x0c, /* layer2 + ifbridge */ /* PROTO_OLDBDG = 0x14, unused, old bridge */ }; /* wrapper for freeing a packet, in case we need to do more work */ #ifndef FREE_PKT #if defined(__linux__) || defined(_WIN32) #define FREE_PKT(m) netisr_dispatch(-1, m) #else #define FREE_PKT(m) m_freem(m) #endif #endif /* !FREE_PKT */ /* * Function definitions. */ /* attach (arg = 1) or detach (arg = 0) hooks */ int ipfw_attach_hooks(int); #ifdef NOTYET void ipfw_nat_destroy(void); #endif /* In ip_fw_log.c */ struct ip; void ipfw_log_bpf(int); void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, struct ip *ip); VNET_DECLARE(u_int64_t, norule_counter); #define V_norule_counter VNET(norule_counter) VNET_DECLARE(int, verbose_limit); #define V_verbose_limit VNET(verbose_limit) /* In ip_fw_dynamic.c */ enum { /* result for matching dynamic rules */ MATCH_REVERSE = 0, MATCH_FORWARD, MATCH_NONE, MATCH_UNKNOWN, }; /* * The lock for dynamic rules is only used once outside the file, * and only to release the result of lookup_dyn_rule(). * Eventually we may implement it with a callback on the function. */ void ipfw_dyn_unlock(void); struct tcphdr; struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *, u_int32_t, u_int32_t, int); int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg); ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp); void ipfw_remove_dyn_children(struct ip_fw *rule); void ipfw_get_dynamic(char **bp, const char *ep); void ipfw_dyn_attach(void); /* uma_zcreate .... */ void ipfw_dyn_detach(void); /* uma_zdestroy ... */ void ipfw_dyn_init(void); /* per-vnet initialization */ void ipfw_dyn_uninit(int); /* per-vnet deinitialization */ int ipfw_dyn_len(void); /* common variables */ VNET_DECLARE(int, fw_one_pass); #define V_fw_one_pass VNET(fw_one_pass) VNET_DECLARE(int, fw_verbose); #define V_fw_verbose VNET(fw_verbose) VNET_DECLARE(struct ip_fw_chain, layer3_chain); #define V_layer3_chain VNET(layer3_chain) VNET_DECLARE(u_int32_t, set_disable); #define V_set_disable VNET(set_disable) VNET_DECLARE(int, autoinc_step); #define V_autoinc_step VNET(autoinc_step) struct ip_fw_chain { struct ip_fw *rules; /* list of rules */ struct ip_fw *reap; /* list of rules to reap */ struct ip_fw *default_rule; int n_rules; /* number of static rules */ int static_len; /* total len of static rules */ struct ip_fw **map; /* array of rule ptrs to ease lookup */ LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */ struct radix_node_head *tables[IPFW_TABLES_MAX]; #if defined( __linux__ ) || defined( _WIN32 ) spinlock_t rwmtx; spinlock_t uh_lock; #else struct rwlock rwmtx; struct rwlock uh_lock; /* lock for upper half */ #endif uint32_t id; /* ruleset id */ }; struct sockopt; /* used by tcp_var.h */ /* * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c * so the variable and the macros must be here. */ #define IPFW_LOCK_INIT(_chain) do { \ rw_init(&(_chain)->rwmtx, "IPFW static rules"); \ rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ } while (0) #define IPFW_LOCK_DESTROY(_chain) do { \ rw_destroy(&(_chain)->rwmtx); \ rw_destroy(&(_chain)->uh_lock); \ } while (0) #define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED) #define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx) #define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx) #define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx) #define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx) #define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock) #define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock) #define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock) #define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock) /* In ip_fw_sockopt.c */ int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id); int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule); int ipfw_ctl(struct sockopt *sopt); int ipfw_chk(struct ip_fw_args *args); void ipfw_reap_rules(struct ip_fw *head); /* In ip_fw_pfil */ int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp); /* In ip_fw_table.c */ struct radix_node; int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint32_t *val); int ipfw_init_tables(struct ip_fw_chain *ch); void ipfw_destroy_tables(struct ip_fw_chain *ch); int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl); int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint8_t mlen, uint32_t value); int ipfw_dump_table_entry(struct radix_node *rn, void *arg); int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint8_t mlen); int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt); int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl); /* In ip_fw_nat.c -- XXX to be moved to ip_var.h */ extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *); typedef int ipfw_nat_cfg_t(struct sockopt *); extern ipfw_nat_t *ipfw_nat_ptr; #define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL) extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; extern ipfw_nat_cfg_t *ipfw_nat_del_ptr; extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; #endif /* _KERNEL */ #endif /* _IPFW2_PRIVATE_H */ ================================================ FILE: sys/netinet/ipfw/ip_fw_sockopt.c ================================================ /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Supported by: Valeria Paoli * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_sockopt.c 206339 2010-04-07 08:23:58Z luigi $"); /* * Sockopt support for ipfw. The routines here implement * the upper half of the ipfw code. */ #if !defined(KLD_MODULE) #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_ipdn.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #endif #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include /* struct m_tag used by nested headers */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* hooks */ #include #include #ifdef MAC #include #endif MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); /* * static variables followed by global ones (none in this file) */ /* * Find the smallest rule >= key, id. * We could use bsearch but it is so simple that we code it directly */ int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id) { int i, lo, hi; struct ip_fw *r; for (lo = 0, hi = chain->n_rules - 1; lo < hi;) { i = (lo + hi) / 2; r = chain->map[i]; if (r->rulenum < key) lo = i + 1; /* continue from the next one */ else if (r->rulenum > key) hi = i; /* this might be good */ else if (r->id < id) lo = i + 1; /* continue from the next one */ else /* r->id >= id */ hi = i; /* this might be good */ }; return hi; } /* * allocate a new map, returns the chain locked. extra is the number * of entries to add or delete. */ static struct ip_fw ** get_map(struct ip_fw_chain *chain, int extra, int locked) { for (;;) { struct ip_fw **map; int i; i = chain->n_rules + extra; map = malloc(i * sizeof(struct ip_fw *), M_IPFW, locked ? M_NOWAIT : M_WAITOK); if (map == NULL) { printf("%s: cannot allocate map\n", __FUNCTION__); return NULL; } if (!locked) IPFW_UH_WLOCK(chain); if (i >= chain->n_rules + extra) /* good */ return map; /* otherwise we lost the race, free and retry */ if (!locked) IPFW_UH_WUNLOCK(chain); free(map, M_IPFW); } } /* * swap the maps. It is supposed to be called with IPFW_UH_WLOCK */ static struct ip_fw ** swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len) { struct ip_fw **old_map; IPFW_WLOCK(chain); chain->id++; chain->n_rules = new_len; old_map = chain->map; chain->map = new_map; IPFW_WUNLOCK(chain); return old_map; } /* * Add a new rule to the list. Copy the rule into a malloc'ed area, then * possibly create a rule number and add the rule to the list. * Update the rule_number in the input struct so the caller knows it as well. * XXX DO NOT USE FOR THE DEFAULT RULE. * Must be called without IPFW_UH held */ int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule) { struct ip_fw *rule; int i, l, insert_before; struct ip_fw **map; /* the new array of pointers */ if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1) return (EINVAL); l = RULESIZE(input_rule); rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO); if (rule == NULL) return (ENOSPC); /* get_map returns with IPFW_UH_WLOCK if successful */ map = get_map(chain, 1, 0 /* not locked */); if (map == NULL) { free(rule, M_IPFW); return ENOSPC; } bcopy(input_rule, rule, l); /* clear fields not settable from userland */ rule->x_next = NULL; rule->next_rule = NULL; rule->pcnt = 0; rule->bcnt = 0; rule->timestamp = 0; if (V_autoinc_step < 1) V_autoinc_step = 1; else if (V_autoinc_step > 1000) V_autoinc_step = 1000; /* find the insertion point, we will insert before */ insert_before = rule->rulenum ? rule->rulenum + 1 : IPFW_DEFAULT_RULE; i = ipfw_find_rule(chain, insert_before, 0); /* duplicate first part */ if (i > 0) bcopy(chain->map, map, i * sizeof(struct ip_fw *)); map[i] = rule; /* duplicate remaining part, we always have the default rule */ bcopy(chain->map + i, map + i + 1, sizeof(struct ip_fw *) *(chain->n_rules - i)); if (rule->rulenum == 0) { /* write back the number */ rule->rulenum = i > 0 ? map[i-1]->rulenum : 0; if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) rule->rulenum += V_autoinc_step; input_rule->rulenum = rule->rulenum; } rule->id = chain->id + 1; map = swap_map(chain, map, chain->n_rules + 1); chain->static_len += l; IPFW_UH_WUNLOCK(chain); if (map) free(map, M_IPFW); return (0); } /* * Reclaim storage associated with a list of rules. This is * typically the list created using remove_rule. * A NULL pointer on input is handled correctly. */ void ipfw_reap_rules(struct ip_fw *head) { struct ip_fw *rule; while ((rule = head) != NULL) { head = head->x_next; free(rule, M_IPFW); } } /* * Used by del_entry() to check if a rule should be kept. * Returns 1 if the rule must be kept, 0 otherwise. * * Called with cmd = {0,1,5}. * cmd == 0 matches on rule numbers, excludes rules in RESVD_SET if n == 0 ; * cmd == 1 matches on set numbers only, rule numbers are ignored; * cmd == 5 matches on rule and set numbers. * * n == 0 is a wildcard for rule numbers, there is no wildcard for sets. * * Rules to keep are * (default || reserved || !match_set || !match_number) * where * default ::= (rule->rulenum == IPFW_DEFAULT_RULE) * // the default rule is always protected * * reserved ::= (cmd == 0 && n == 0 && rule->set == RESVD_SET) * // RESVD_SET is protected only if cmd == 0 and n == 0 ("ipfw flush") * * match_set ::= (cmd == 0 || rule->set == set) * // set number is ignored for cmd == 0 * * match_number ::= (cmd == 1 || n == 0 || n == rule->rulenum) * // number is ignored for cmd == 1 or n == 0 * */ static int keep_rule(struct ip_fw *rule, uint8_t cmd, uint8_t set, uint32_t n) { return (rule->rulenum == IPFW_DEFAULT_RULE) || (cmd == 0 && n == 0 && rule->set == RESVD_SET) || !(cmd == 0 || rule->set == set) || !(cmd == 1 || n == 0 || n == rule->rulenum); } /** * Remove all rules with given number, or do set manipulation. * Assumes chain != NULL && *chain != NULL. * * The argument is an uint32_t. The low 16 bit are the rule or set number; * the next 8 bits are the new set; the top 8 bits indicate the command: * * 0 delete rules numbered "rulenum" * 1 delete rules in set "rulenum" * 2 move rules "rulenum" to set "new_set" * 3 move rules from set "rulenum" to set "new_set" * 4 swap sets "rulenum" and "new_set" * 5 delete rules "rulenum" and set "new_set" */ static int del_entry(struct ip_fw_chain *chain, uint32_t arg) { struct ip_fw *rule; uint32_t num; /* rule number or old_set */ uint8_t cmd, new_set; int start, end, i, ofs, n; struct ip_fw **map = NULL; int error = 0; num = arg & 0xffff; cmd = (arg >> 24) & 0xff; new_set = (arg >> 16) & 0xff; if (cmd > 5 || new_set > RESVD_SET) return EINVAL; if (cmd == 0 || cmd == 2 || cmd == 5) { if (num >= IPFW_DEFAULT_RULE) return EINVAL; } else { if (num > RESVD_SET) /* old_set */ return EINVAL; } IPFW_UH_WLOCK(chain); /* arbitrate writers */ chain->reap = NULL; /* prepare for deletions */ switch (cmd) { case 0: /* delete rules "num" (num == 0 matches all) */ case 1: /* delete all rules in set N */ case 5: /* delete rules with number N and set "new_set". */ /* * Locate first rule to delete (start), the rule after * the last one to delete (end), and count how many * rules to delete (n). Always use keep_rule() to * determine which rules to keep. */ n = 0; if (cmd == 1) { /* look for a specific set including RESVD_SET. * Must scan the entire range, ignore num. */ new_set = num; for (start = -1, end = i = 0; i < chain->n_rules; i++) { if (keep_rule(chain->map[i], cmd, new_set, 0)) continue; if (start < 0) start = i; end = i; n++; } end++; /* first non-matching */ } else { /* Optimized search on rule numbers */ start = ipfw_find_rule(chain, num, 0); for (end = start; end < chain->n_rules; end++) { rule = chain->map[end]; if (num > 0 && rule->rulenum != num) break; if (!keep_rule(rule, cmd, new_set, num)) n++; } } if (n == 0) { /* A flush request (arg == 0) on empty ruleset * returns with no error. On the contrary, * if there is no match on a specific request, * we return EINVAL. */ error = (arg == 0) ? 0 : EINVAL; break; } /* We have something to delete. Allocate the new map */ map = get_map(chain, -n, 1 /* locked */); if (map == NULL) { error = EINVAL; break; } /* 1. bcopy the initial part of the map */ if (start > 0) bcopy(chain->map, map, start * sizeof(struct ip_fw *)); /* 2. copy active rules between start and end */ for (i = ofs = start; i < end; i++) { rule = chain->map[i]; if (keep_rule(rule, cmd, new_set, num)) map[ofs++] = rule; } /* 3. copy the final part of the map */ bcopy(chain->map + end, map + ofs, (chain->n_rules - end) * sizeof(struct ip_fw *)); /* 4. swap the maps (under BH_LOCK) */ map = swap_map(chain, map, chain->n_rules - n); /* 5. now remove the rules deleted from the old map */ for (i = start; i < end; i++) { int l; rule = map[i]; if (keep_rule(rule, cmd, new_set, num)) continue; l = RULESIZE(rule); chain->static_len -= l; ipfw_remove_dyn_children(rule); rule->x_next = chain->reap; chain->reap = rule; } break; /* * In the next 3 cases the loop stops at (n_rules - 1) * because the default rule is never eligible.. */ case 2: /* move rules with given RULE number to new set */ for (i = 0; i < chain->n_rules - 1; i++) { rule = chain->map[i]; if (rule->rulenum == num) rule->set = new_set; } break; case 3: /* move rules with given SET number to new set */ for (i = 0; i < chain->n_rules - 1; i++) { rule = chain->map[i]; if (rule->set == num) rule->set = new_set; } break; case 4: /* swap two sets */ for (i = 0; i < chain->n_rules - 1; i++) { rule = chain->map[i]; if (rule->set == num) rule->set = new_set; else if (rule->set == new_set) rule->set = num; } break; } rule = chain->reap; chain->reap = NULL; IPFW_UH_WUNLOCK(chain); ipfw_reap_rules(rule); if (map) free(map, M_IPFW); return error; } /* * Clear counters for a specific rule. * Normally run under IPFW_UH_RLOCK, but these are idempotent ops * so we only care that rules do not disappear. */ static void clear_counters(struct ip_fw *rule, int log_only) { ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); if (log_only == 0) { rule->bcnt = rule->pcnt = 0; rule->timestamp = 0; } if (l->o.opcode == O_LOG) l->log_left = l->max_log; } /** * Reset some or all counters on firewall rules. * The argument `arg' is an u_int32_t. The low 16 bit are the rule number, * the next 8 bits are the set number, the top 8 bits are the command: * 0 work with rules from all set's; * 1 work with rules only from specified set. * Specified rule number is zero if we want to clear all entries. * log_only is 1 if we only want to reset logs, zero otherwise. */ static int zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) { struct ip_fw *rule; char *msg; int i; uint16_t rulenum = arg & 0xffff; uint8_t set = (arg >> 16) & 0xff; uint8_t cmd = (arg >> 24) & 0xff; if (cmd > 1) return (EINVAL); if (cmd == 1 && set > RESVD_SET) return (EINVAL); IPFW_UH_RLOCK(chain); if (rulenum == 0) { V_norule_counter = 0; for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; /* Skip rules not in our set. */ if (cmd == 1 && rule->set != set) continue; clear_counters(rule, log_only); } msg = log_only ? "All logging counts reset" : "Accounting cleared"; } else { int cleared = 0; for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; if (rule->rulenum == rulenum) { if (cmd == 0 || rule->set == set) clear_counters(rule, log_only); cleared = 1; } if (rule->rulenum > rulenum) break; } if (!cleared) { /* we did not find any matching rules */ IPFW_UH_RUNLOCK(chain); return (EINVAL); } msg = log_only ? "logging count reset" : "cleared"; } IPFW_UH_RUNLOCK(chain); if (V_fw_verbose) { int lev = LOG_SECURITY | LOG_NOTICE; if (rulenum) log(lev, "ipfw: Entry %d %s.\n", rulenum, msg); else log(lev, "ipfw: %s.\n", msg); } return (0); } /* * Check validity of the structure before insert. * Rules are simple, so this mostly need to check rule sizes. */ static int check_ipfw_struct(struct ip_fw *rule, int size) { int l, cmdlen = 0; int have_action=0; ipfw_insn *cmd; if (size < sizeof(*rule)) { printf("ipfw: rule too short\n"); return (EINVAL); } /* first, check for valid size */ l = RULESIZE(rule); if (l != size) { printf("ipfw: size mismatch (have %d want %d)\n", size, l); return (EINVAL); } if (rule->act_ofs >= rule->cmd_len) { printf("ipfw: bogus action offset (%u > %u)\n", rule->act_ofs, rule->cmd_len - 1); return (EINVAL); } /* * Now go for the individual checks. Very simple ones, basically only * instruction sizes. */ for (l = rule->cmd_len, cmd = rule->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (cmdlen > l) { printf("ipfw: opcode %d size truncated\n", cmd->opcode); return EINVAL; } switch (cmd->opcode) { case O_PROBE_STATE: case O_KEEP_STATE: case O_PROTO: case O_IP_SRC_ME: case O_IP_DST_ME: case O_LAYER2: case O_IN: case O_FRAG: case O_DIVERTED: case O_IPOPT: case O_IPTOS: case O_IPPRECEDENCE: case O_IPVER: case O_TCPWIN: case O_TCPFLAGS: case O_TCPOPTS: case O_ESTAB: case O_VERREVPATH: case O_VERSRCREACH: case O_ANTISPOOF: case O_IPSEC: #ifdef INET6 case O_IP6_SRC_ME: case O_IP6_DST_ME: case O_EXT_HDR: case O_IP6: #endif case O_IP4: case O_TAG: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_FIB: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; if (cmd->arg1 >= rt_numfibs) { printf("ipfw: invalid fib number %d\n", cmd->arg1); return EINVAL; } break; case O_SETFIB: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; if (cmd->arg1 >= rt_numfibs) { printf("ipfw: invalid fib number %d\n", cmd->arg1); return EINVAL; } goto check_action; case O_UID: case O_GID: case O_JAIL: case O_IP_SRC: case O_IP_DST: case O_TCPSEQ: case O_TCPACK: case O_PROB: case O_ICMPTYPE: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; break; case O_LIMIT: if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) goto bad_size; break; case O_LOG: if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) goto bad_size; ((ipfw_insn_log *)cmd)->log_left = ((ipfw_insn_log *)cmd)->max_log; break; case O_IP_SRC_MASK: case O_IP_DST_MASK: /* only odd command lengths */ if ( !(cmdlen & 1) || cmdlen > 31) goto bad_size; break; case O_IP_SRC_SET: case O_IP_DST_SET: if (cmd->arg1 == 0 || cmd->arg1 > 256) { printf("ipfw: invalid set size %d\n", cmd->arg1); return EINVAL; } if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + (cmd->arg1+31)/32 ) goto bad_size; break; case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: if (cmd->arg1 >= IPFW_TABLES_MAX) { printf("ipfw: invalid table number %d\n", cmd->arg1); return (EINVAL); } if (cmdlen != F_INSN_SIZE(ipfw_insn) && cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 && cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; break; case O_MACADDR2: if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) goto bad_size; break; case O_NOP: case O_IPID: case O_IPTTL: case O_IPLEN: case O_TCPDATALEN: case O_TAGGED: if (cmdlen < 1 || cmdlen > 31) goto bad_size; break; case O_MAC_TYPE: case O_IP_SRCPORT: case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ if (cmdlen < 2 || cmdlen > 31) goto bad_size; break; case O_RECV: case O_XMIT: case O_VIA: if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) goto bad_size; break; case O_ALTQ: if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) goto bad_size; break; case O_PIPE: case O_QUEUE: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; goto check_action; case O_FORWARD_IP: #ifdef IPFIREWALL_FORWARD if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) goto bad_size; goto check_action; #else return EINVAL; #endif case O_DIVERT: case O_TEE: if (ip_divert_ptr == NULL) return EINVAL; else goto check_size; case O_NETGRAPH: case O_NGTEE: if (ng_ipfw_input_p == NULL) return EINVAL; else goto check_size; case O_NAT: if (!IPFW_NAT_LOADED) return EINVAL; if (cmdlen != F_INSN_SIZE(ipfw_insn_nat)) goto bad_size; goto check_action; case O_FORWARD_MAC: /* XXX not implemented yet */ case O_CHECK_STATE: case O_COUNT: case O_ACCEPT: case O_DENY: case O_REJECT: #ifdef INET6 case O_UNREACH6: #endif case O_SKIPTO: case O_REASS: check_size: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; check_action: if (have_action) { printf("ipfw: opcode %d, multiple actions" " not allowed\n", cmd->opcode); return EINVAL; } have_action = 1; if (l != cmdlen) { printf("ipfw: opcode %d, action must be" " last opcode\n", cmd->opcode); return EINVAL; } break; #ifdef INET6 case O_IP6_SRC: case O_IP6_DST: if (cmdlen != F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_FLOW6ID: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + ((ipfw_insn_u32 *)cmd)->o.arg1) goto bad_size; break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if ( !(cmdlen & 1) || cmdlen > 127) goto bad_size; break; case O_ICMP6TYPE: if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) goto bad_size; break; #endif default: switch (cmd->opcode) { #ifndef INET6 case O_IP6_SRC_ME: case O_IP6_DST_ME: case O_EXT_HDR: case O_IP6: case O_UNREACH6: case O_IP6_SRC: case O_IP6_DST: case O_FLOW6ID: case O_IP6_SRC_MASK: case O_IP6_DST_MASK: case O_ICMP6TYPE: printf("ipfw: no IPv6 support in kernel\n"); return EPROTONOSUPPORT; #endif default: printf("ipfw: opcode %d, unknown opcode\n", cmd->opcode); return EINVAL; } } } if (have_action == 0) { printf("ipfw: missing action\n"); return EINVAL; } return 0; bad_size: printf("ipfw: opcode %d size %d wrong\n", cmd->opcode, cmdlen); return EINVAL; } /* * Translation of requests for compatibility with FreeBSD 7.2/8. * a static variable tells us if we have an old client from userland, * and if necessary we translate requests and responses between the * two formats. */ static int is7 = 0; struct ip_fw7 { struct ip_fw7 *next; /* linked list of rules */ struct ip_fw7 *next_rule; /* ptr to next [skipto] rule */ /* 'next_rule' is used to pass up 'set_disable' status */ uint16_t act_ofs; /* offset of action in 32-bit units */ uint16_t cmd_len; /* # of 32-bit words in cmd */ uint16_t rulenum; /* rule number */ uint8_t set; /* rule set (0..31) */ // #define RESVD_SET 31 /* set for default and persistent rules */ uint8_t _pad; /* padding */ // uint32_t id; /* rule id, only in v.8 */ /* These fields are present in all rules. */ uint64_t pcnt; /* Packet counter */ uint64_t bcnt; /* Byte counter */ uint32_t timestamp; /* tv_sec of last match */ ipfw_insn cmd[1]; /* storage for commands */ }; int convert_rule_to_7(struct ip_fw *rule); int convert_rule_to_8(struct ip_fw *rule); #ifndef RULESIZE7 #define RULESIZE7(rule) (sizeof(struct ip_fw7) + \ ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4) #endif /* * Copy the static and dynamic rules to the supplied buffer * and return the amount of space actually used. * Must be run under IPFW_UH_RLOCK */ static size_t ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) { char *bp = buf; char *ep = bp + space; struct ip_fw *rule, *dst; int l, i; time_t boot_seconds; boot_seconds = boottime.tv_sec; for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; if (is7) { /* Convert rule to FreeBSd 7.2 format */ l = RULESIZE7(rule); if (bp + l + sizeof(uint32_t) <= ep) { int error; bcopy(rule, bp, l + sizeof(uint32_t)); error = convert_rule_to_7((struct ip_fw *) bp); if (error) return 0; /*XXX correct? */ /* * XXX HACK. Store the disable mask in the "next" * pointer in a wild attempt to keep the ABI the same. * Why do we do this on EVERY rule? */ bcopy(&V_set_disable, &(((struct ip_fw7 *)bp)->next_rule), sizeof(V_set_disable)); if (((struct ip_fw7 *)bp)->timestamp) ((struct ip_fw7 *)bp)->timestamp += boot_seconds; bp += l; } continue; /* go to next rule */ } /* normal mode, don't touch rules */ l = RULESIZE(rule); if (bp + l > ep) { /* should not happen */ printf("overflow dumping static rules\n"); break; } dst = (struct ip_fw *)bp; bcopy(rule, dst, l); /* * XXX HACK. Store the disable mask in the "next" * pointer in a wild attempt to keep the ABI the same. * Why do we do this on EVERY rule? */ bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable)); if (dst->timestamp) dst->timestamp += boot_seconds; bp += l; } ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */ return (bp - (char *)buf); } /** * {set|get}sockopt parser. */ int ipfw_ctl(struct sockopt *sopt) { #define RULE_MAXSIZE (256*sizeof(u_int32_t)) int error; size_t size; struct ip_fw *buf, *rule; struct ip_fw_chain *chain; u_int32_t rulenum[2]; error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW); if (error) return (error); /* * Disallow modifications in really-really secure mode, but still allow * the logging counters to be reset. */ if (sopt->sopt_name == IP_FW_ADD || (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) { error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error) return (error); } chain = &V_layer3_chain; error = 0; switch (sopt->sopt_name) { case IP_FW_GET: /* * pass up a copy of the current rules. Static rules * come first (the last of which has number IPFW_DEFAULT_RULE), * followed by a possibly empty list of dynamic rule. * The last dynamic rule has NULL in the "next" field. * * Note that the calculated size is used to bound the * amount of data returned to the user. The rule set may * change between calculating the size and returning the * data in which case we'll just return what fits. */ for (;;) { int len = 0, want; size = chain->static_len; size += ipfw_dyn_len(); if (size >= sopt->sopt_valsize) break; buf = malloc(size, M_TEMP, M_WAITOK); if (buf == NULL) break; IPFW_UH_RLOCK(chain); /* check again how much space we need */ want = chain->static_len + ipfw_dyn_len(); if (size >= want) len = ipfw_getrules(chain, buf, size); IPFW_UH_RUNLOCK(chain); if (size >= want) error = sooptcopyout(sopt, buf, len); free(buf, M_TEMP); if (size >= want) break; } break; case IP_FW_FLUSH: /* locking is done within del_entry() */ error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */ break; case IP_FW_ADD: rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, rule, RULE_MAXSIZE, sizeof(struct ip_fw7) ); /* * If the size of commands equals RULESIZE7 then we assume * a FreeBSD7.2 binary is talking to us (set is7=1). * is7 is persistent so the next 'ipfw list' command * will use this format. * NOTE: If wrong version is guessed (this can happen if * the first ipfw command is 'ipfw [pipe] list') * the ipfw binary may crash or loop infinitly... */ if (sopt->sopt_valsize == RULESIZE7(rule)) { is7 = 1; error = convert_rule_to_8(rule); if (error) return error; if (error == 0) error = check_ipfw_struct(rule, RULESIZE(rule)); } else { is7 = 0; if (error == 0) error = check_ipfw_struct(rule, sopt->sopt_valsize); } if (error == 0) { /* locking is done within ipfw_add_rule() */ error = ipfw_add_rule(chain, rule); size = RULESIZE(rule); if (!error && sopt->sopt_dir == SOPT_GET) { if (is7) { error = convert_rule_to_7(rule); size = RULESIZE7(rule); if (error) return error; } error = sooptcopyout(sopt, rule, size); } } free(rule, M_TEMP); break; case IP_FW_DEL: /* * IP_FW_DEL is used for deleting single rules or sets, * and (ab)used to atomically manipulate sets. Argument size * is used to distinguish between the two: * sizeof(u_int32_t) * delete single rule or set of rules, * or reassign rules (or sets) to a different set. * 2*sizeof(u_int32_t) * atomic disable/enable sets. * first u_int32_t contains sets to be disabled, * second u_int32_t contains sets to be enabled. */ error = sooptcopyin(sopt, rulenum, 2*sizeof(u_int32_t), sizeof(u_int32_t)); if (error) break; size = sopt->sopt_valsize; if (size == sizeof(u_int32_t) && rulenum[0] != 0) { /* delete or reassign, locking done in del_entry() */ error = del_entry(chain, rulenum[0]); } else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */ IPFW_UH_WLOCK(chain); V_set_disable = (V_set_disable | rulenum[0]) & ~rulenum[1] & ~(1<sopt_val != 0) { error = sooptcopyin(sopt, rulenum, sizeof(u_int32_t), sizeof(u_int32_t)); if (error) break; } error = zero_entry(chain, rulenum[0], sopt->sopt_name == IP_FW_RESETLOG); break; /*--- TABLE manipulations are protected by the IPFW_LOCK ---*/ case IP_FW_TABLE_ADD: { ipfw_table_entry ent; error = sooptcopyin(sopt, &ent, sizeof(ent), sizeof(ent)); if (error) break; error = ipfw_add_table_entry(chain, ent.tbl, ent.addr, ent.masklen, ent.value); } break; case IP_FW_TABLE_DEL: { ipfw_table_entry ent; error = sooptcopyin(sopt, &ent, sizeof(ent), sizeof(ent)); if (error) break; error = ipfw_del_table_entry(chain, ent.tbl, ent.addr, ent.masklen); } break; case IP_FW_TABLE_FLUSH: { u_int16_t tbl; error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)); if (error) break; IPFW_WLOCK(chain); error = ipfw_flush_table(chain, tbl); IPFW_WUNLOCK(chain); } break; case IP_FW_TABLE_GETSIZE: { u_int32_t tbl, cnt; if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)))) break; IPFW_RLOCK(chain); error = ipfw_count_table(chain, tbl, &cnt); IPFW_RUNLOCK(chain); if (error) break; error = sooptcopyout(sopt, &cnt, sizeof(cnt)); } break; case IP_FW_TABLE_LIST: { ipfw_table *tbl; if (sopt->sopt_valsize < sizeof(*tbl)) { error = EINVAL; break; } size = sopt->sopt_valsize; tbl = malloc(size, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, tbl, size, sizeof(*tbl)); if (error) { free(tbl, M_TEMP); break; } tbl->size = (size - sizeof(*tbl)) / sizeof(ipfw_table_entry); IPFW_RLOCK(chain); error = ipfw_dump_table(chain, tbl); IPFW_RUNLOCK(chain); if (error) { free(tbl, M_TEMP); break; } error = sooptcopyout(sopt, tbl, size); free(tbl, M_TEMP); } break; /*--- NAT operations are protected by the IPFW_LOCK ---*/ case IP_FW_NAT_CFG: if (IPFW_NAT_LOADED) error = ipfw_nat_cfg_ptr(sopt); else { printf("IP_FW_NAT_CFG: %s\n", "ipfw_nat not present, please load it"); error = EINVAL; } break; case IP_FW_NAT_DEL: if (IPFW_NAT_LOADED) error = ipfw_nat_del_ptr(sopt); else { printf("IP_FW_NAT_DEL: %s\n", "ipfw_nat not present, please load it"); error = EINVAL; } break; case IP_FW_NAT_GET_CONFIG: if (IPFW_NAT_LOADED) error = ipfw_nat_get_cfg_ptr(sopt); else { printf("IP_FW_NAT_GET_CFG: %s\n", "ipfw_nat not present, please load it"); error = EINVAL; } break; case IP_FW_NAT_GET_LOG: if (IPFW_NAT_LOADED) error = ipfw_nat_get_log_ptr(sopt); else { printf("IP_FW_NAT_GET_LOG: %s\n", "ipfw_nat not present, please load it"); error = EINVAL; } break; default: printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name); error = EINVAL; } return (error); #undef RULE_MAXSIZE } #define RULE_MAXSIZE (256*sizeof(u_int32_t)) /* Functions to convert rules 7.2 <==> 8.0 */ int convert_rule_to_7(struct ip_fw *rule) { /* Used to modify original rule */ struct ip_fw7 *rule7 = (struct ip_fw7 *)rule; /* copy of original rule, version 8 */ struct ip_fw *tmp; /* Used to copy commands */ ipfw_insn *ccmd, *dst; int ll = 0, ccmdlen = 0; tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); if (tmp == NULL) { return 1; //XXX error } bcopy(rule, tmp, RULE_MAXSIZE); /* Copy fields */ rule7->_pad = tmp->_pad; rule7->set = tmp->set; rule7->rulenum = tmp->rulenum; rule7->cmd_len = tmp->cmd_len; rule7->act_ofs = tmp->act_ofs; rule7->next_rule = (struct ip_fw7 *)tmp->next_rule; rule7->next = (struct ip_fw7 *)tmp->x_next; rule7->cmd_len = tmp->cmd_len; rule7->pcnt = tmp->pcnt; rule7->bcnt = tmp->bcnt; rule7->timestamp = tmp->timestamp; /* Copy commands */ for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ; ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { ccmdlen = F_LEN(ccmd); bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); if (dst->opcode > O_NAT) /* O_REASS doesn't exists in 7.2 version, so * decrement opcode if it is after O_REASS */ dst->opcode--; if (ccmdlen > ll) { printf("ipfw: opcode %d size truncated\n", ccmd->opcode); return EINVAL; } } free(tmp, M_TEMP); return 0; } int convert_rule_to_8(struct ip_fw *rule) { /* Used to modify original rule */ struct ip_fw7 *rule7 = (struct ip_fw7 *) rule; /* Used to copy commands */ ipfw_insn *ccmd, *dst; int ll = 0, ccmdlen = 0; /* Copy of original rule */ struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); if (tmp == NULL) { return 1; //XXX error } bcopy(rule7, tmp, RULE_MAXSIZE); for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ; ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { ccmdlen = F_LEN(ccmd); bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); if (dst->opcode > O_NAT) /* O_REASS doesn't exists in 7.2 version, so * increment opcode if it is after O_REASS */ dst->opcode++; if (ccmdlen > ll) { printf("ipfw: opcode %d size truncated\n", ccmd->opcode); return EINVAL; } } rule->_pad = tmp->_pad; rule->set = tmp->set; rule->rulenum = tmp->rulenum; rule->cmd_len = tmp->cmd_len; rule->act_ofs = tmp->act_ofs; rule->next_rule = (struct ip_fw *)tmp->next_rule; rule->x_next = (struct ip_fw *)tmp->next; rule->cmd_len = tmp->cmd_len; rule->id = 0; /* XXX see if is ok = 0 */ rule->pcnt = tmp->pcnt; rule->bcnt = tmp->bcnt; rule->timestamp = tmp->timestamp; free (tmp, M_TEMP); return 0; } /* end of file */ ================================================ FILE: sys/netinet/ipfw/ip_fw_table.c ================================================ /*- * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_table.c 200601 2009-12-16 10:48:40Z luigi $"); /* * Lookup table support for ipfw * * Lookup tables are implemented (at the moment) using the radix * tree used for routing tables. Tables store key-value entries, where * keys are network prefixes (addr/masklen), and values are integers. * As a degenerate case we can interpret keys as 32-bit integers * (with a /32 mask). * * The table is protected by the IPFW lock even for manipulation coming * from userland, because operations are typically fast. */ #if !defined(KLD_MODULE) #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_ipdn.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #endif #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include /* ip_fw.h requires IFNAMSIZ */ #include #include #include #include #include /* struct ipfw_rule_ref */ #include #include /* LIST_HEAD */ #include #ifdef MAC #include #endif MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); struct table_entry { struct radix_node rn[2]; struct sockaddr_in addr, mask; u_int32_t value; }; /* * The radix code expects addr and mask to be array of bytes, * with the first byte being the length of the array. rn_inithead * is called with the offset in bits of the lookup key within the * array. If we use a sockaddr_in as the underlying type, * sin_len is conveniently located at offset 0, sin_addr is at * offset 4 and normally aligned. * But for portability, let's avoid assumption and make the code explicit */ #define KEY_LEN(v) *((uint8_t *)&(v)) #define KEY_OFS (8*offsetof(struct sockaddr_in, sin_addr)) int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint8_t mlen, uint32_t value) { struct radix_node_head *rnh; struct table_entry *ent; struct radix_node *rn; if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl]; ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO); if (ent == NULL) return (ENOMEM); ent->value = value; KEY_LEN(ent->addr) = KEY_LEN(ent->mask) = 8; ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr; IPFW_WLOCK(ch); rn = rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent); if (rn == NULL) { IPFW_WUNLOCK(ch); free(ent, M_IPFW_TBL); return (EEXIST); } IPFW_WUNLOCK(ch); return (0); } int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint8_t mlen) { struct radix_node_head *rnh; struct table_entry *ent; struct sockaddr_in sa, mask; if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl]; KEY_LEN(sa) = KEY_LEN(mask) = 8; mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr; IPFW_WLOCK(ch); ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh); if (ent == NULL) { IPFW_WUNLOCK(ch); return (ESRCH); } IPFW_WUNLOCK(ch); free(ent, M_IPFW_TBL); return (0); } static int flush_table_entry(struct radix_node *rn, void *arg) { struct radix_node_head * const rnh = arg; struct table_entry *ent; ent = (struct table_entry *) rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh); if (ent != NULL) free(ent, M_IPFW_TBL); return (0); } int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl) { struct radix_node_head *rnh; IPFW_WLOCK_ASSERT(ch); if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl]; KASSERT(rnh != NULL, ("NULL IPFW table")); rnh->rnh_walktree(rnh, flush_table_entry, rnh); return (0); } void ipfw_destroy_tables(struct ip_fw_chain *ch) { uint16_t tbl; struct radix_node_head *rnh; IPFW_WLOCK_ASSERT(ch); for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++) { ipfw_flush_table(ch, tbl); rnh = ch->tables[tbl]; rn_detachhead((void **)&rnh); } } int ipfw_init_tables(struct ip_fw_chain *ch) { int i; uint16_t j; for (i = 0; i < IPFW_TABLES_MAX; i++) { if (!rn_inithead((void **)&ch->tables[i], KEY_OFS)) { for (j = 0; j < i; j++) { (void) ipfw_flush_table(ch, j); } return (ENOMEM); } } return (0); } int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint32_t *val) { struct radix_node_head *rnh; struct table_entry *ent; struct sockaddr_in sa; if (tbl >= IPFW_TABLES_MAX) return (0); rnh = ch->tables[tbl]; KEY_LEN(sa) = 8; sa.sin_addr.s_addr = addr; ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh)); if (ent != NULL) { *val = ent->value; return (1); } return (0); } static int count_table_entry(struct radix_node *rn, void *arg) { u_int32_t * const cnt = arg; (*cnt)++; return (0); } int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt) { struct radix_node_head *rnh; if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl]; *cnt = 0; rnh->rnh_walktree(rnh, count_table_entry, cnt); return (0); } static int dump_table_entry(struct radix_node *rn, void *arg) { struct table_entry * const n = (struct table_entry *)rn; ipfw_table * const tbl = arg; ipfw_table_entry *ent; if (tbl->cnt == tbl->size) return (1); ent = &tbl->ent[tbl->cnt]; ent->tbl = tbl->tbl; if (in_nullhost(n->mask.sin_addr)) ent->masklen = 0; else ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr)); ent->addr = n->addr.sin_addr.s_addr; ent->value = n->value; tbl->cnt++; return (0); } int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl) { struct radix_node_head *rnh; if (tbl->tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl->tbl]; tbl->cnt = 0; rnh->rnh_walktree(rnh, dump_table_entry, tbl); return (0); } /* end of file */ ================================================ FILE: sys/netinet/tcp.h ================================================ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp.h 8.1 (Berkeley) 6/10/93 * $FreeBSD: src/sys/netinet/tcp.h,v 1.40.2.2 2008/07/31 06:10:25 kmacy Exp $ */ #ifndef _NETINET_TCP_H_ #define _NETINET_TCP_H_ #include #define __BSD_VISIBLE 1 #if __BSD_VISIBLE typedef u_int32_t tcp_seq; #define tcp6_seq tcp_seq /* for KAME src sync over BSD*'s */ #define tcp6hdr tcphdr /* for KAME src sync over BSD*'s */ /* * TCP header. * Per RFC 793, September, 1981. */ struct tcphdr { u_short th_sport; /* source port */ u_short th_dport; /* destination port */ tcp_seq th_seq; /* sequence number */ tcp_seq th_ack; /* acknowledgement number */ #if BYTE_ORDER == LITTLE_ENDIAN u_char th_x2:4, /* (unused) */ th_off:4; /* data offset */ #endif #if BYTE_ORDER == BIG_ENDIAN u_char th_off:4, /* data offset */ th_x2:4; /* (unused) */ #endif u_char th_flags; #define TH_FIN 0x01 #define TH_SYN 0x02 #define TH_RST 0x04 #define TH_PUSH 0x08 #define TH_ACK 0x10 #define TH_URG 0x20 #define TH_ECE 0x40 #define TH_CWR 0x80 #define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|TH_ECE|TH_CWR) #define PRINT_TH_FLAGS "\20\1FIN\2SYN\3RST\4PUSH\5ACK\6URG\7ECE\10CWR" u_short th_win; /* window */ u_short th_sum; /* checksum */ u_short th_urp; /* urgent pointer */ }; #define TCPOPT_EOL 0 #define TCPOLEN_EOL 1 #define TCPOPT_PAD 0 /* padding after EOL */ #define TCPOLEN_PAD 1 #define TCPOPT_NOP 1 #define TCPOLEN_NOP 1 #define TCPOPT_MAXSEG 2 #define TCPOLEN_MAXSEG 4 #define TCPOPT_WINDOW 3 #define TCPOLEN_WINDOW 3 #define TCPOPT_SACK_PERMITTED 4 #define TCPOLEN_SACK_PERMITTED 2 #define TCPOPT_SACK 5 #define TCPOLEN_SACKHDR 2 #define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */ #define TCPOPT_TIMESTAMP 8 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ #define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */ #define TCPOLEN_SIGNATURE 18 /* Miscellaneous constants */ #define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at receiver side */ #define TCP_MAX_SACK 4 /* MAX # SACKs sent in any segment */ /* * Default maximum segment size for TCP. * With an IP MTU of 576, this is 536, * but 512 is probably more convenient. * This should be defined as MIN(512, IP_MSS - sizeof (struct tcpiphdr)). */ #define TCP_MSS 512 /* * TCP_MINMSS is defined to be 216 which is fine for the smallest * link MTU (256 bytes, AX.25 packet radio) in the Internet. * However it is very unlikely to come across such low MTU interfaces * these days (anno dato 2003). * See tcp_subr.c tcp_minmss SYSCTL declaration for more comments. * Setting this to "0" disables the minmss check. */ #define TCP_MINMSS 216 /* * Default maximum segment size for TCP6. * With an IP6 MSS of 1280, this is 1220, * but 1024 is probably more convenient. (xxx kazu in doubt) * This should be defined as MIN(1024, IP6_MSS - sizeof (struct tcpip6hdr)) */ #define TCP6_MSS 1024 #define TCP_MAXWIN 65535 /* largest value for (unscaled) window */ #define TTCP_CLIENT_SND_WND 4096 /* dflt send window for T/TCP client */ #define TCP_MAX_WINSHIFT 14 /* maximum window shift */ #define TCP_MAXBURST 4 /* maximum segments in a burst */ #define TCP_MAXHLEN (0xf<<2) /* max length of header in bytes */ #define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr)) /* max space left for options */ #endif /* __BSD_VISIBLE */ /* * User-settable options (used with setsockopt). */ #define TCP_NODELAY 0x01 /* don't delay send to coalesce packets */ #if __BSD_VISIBLE #define TCP_MAXSEG 0x02 /* set maximum segment size */ #define TCP_NOPUSH 0x04 /* don't push last block of write */ #define TCP_NOOPT 0x08 /* don't use TCP options */ #define TCP_MD5SIG 0x10 /* use MD5 digests (RFC2385) */ #define TCP_INFO 0x20 /* retrieve tcp_info structure */ #define TCP_CONGESTION 0x40 /* get/set congestion control algorithm */ #define TCP_CA_NAME_MAX 16 /* max congestion control name length */ #define TCPI_OPT_TIMESTAMPS 0x01 #define TCPI_OPT_SACK 0x02 #define TCPI_OPT_WSCALE 0x04 #define TCPI_OPT_ECN 0x08 #define TCPI_OPT_TOE 0x10 /* * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits * the caller to query certain information about the state of a TCP * connection. We provide an overlapping set of fields with the Linux * implementation, but since this is a fixed size structure, room has been * left for growth. In order to maximize potential future compatibility with * the Linux API, the same variable names and order have been adopted, and * padding left to make room for omitted fields in case they are added later. * * XXX: This is currently an unstable ABI/API, in that it is expected to * change. */ struct tcp_info { u_int8_t tcpi_state; /* TCP FSM state. */ u_int8_t __tcpi_ca_state; u_int8_t __tcpi_retransmits; u_int8_t __tcpi_probes; u_int8_t __tcpi_backoff; u_int8_t tcpi_options; /* Options enabled on conn. */ u_int8_t tcpi_snd_wscale:4, /* RFC1323 send shift value. */ tcpi_rcv_wscale:4; /* RFC1323 recv shift value. */ u_int32_t __tcpi_rto; u_int32_t __tcpi_ato; u_int32_t __tcpi_snd_mss; u_int32_t __tcpi_rcv_mss; u_int32_t __tcpi_unacked; u_int32_t __tcpi_sacked; u_int32_t __tcpi_lost; u_int32_t __tcpi_retrans; u_int32_t __tcpi_fackets; /* Times; measurements in usecs. */ u_int32_t __tcpi_last_data_sent; u_int32_t __tcpi_last_ack_sent; /* Also unimpl. on Linux? */ u_int32_t __tcpi_last_data_recv; u_int32_t __tcpi_last_ack_recv; /* Metrics; variable units. */ u_int32_t __tcpi_pmtu; u_int32_t __tcpi_rcv_ssthresh; u_int32_t tcpi_rtt; /* Smoothed RTT in usecs. */ u_int32_t tcpi_rttvar; /* RTT variance in usecs. */ u_int32_t tcpi_snd_ssthresh; /* Slow start threshold. */ u_int32_t tcpi_snd_cwnd; /* Send congestion window. */ u_int32_t __tcpi_advmss; u_int32_t __tcpi_reordering; u_int32_t __tcpi_rcv_rtt; u_int32_t tcpi_rcv_space; /* Advertised recv window. */ /* FreeBSD extensions to tcp_info. */ u_int32_t tcpi_snd_wnd; /* Advertised send window. */ u_int32_t tcpi_snd_bwnd; /* Bandwidth send window. */ u_int32_t tcpi_snd_nxt; /* Next egress seqno */ u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */ u_int32_t tcpi_toe_tid; /* HWTID for TOE endpoints */ /* Padding to grow without breaking ABI. */ u_int32_t __tcpi_pad[29]; /* Padding. */ }; #endif #endif /* !_NETINET_TCP_H_ */ ================================================ FILE: sys/netinet/tcp_var.h ================================================ #ifndef _NETINET_TCP_VAR_H_ #define _NETINET_TCP_VAR_H_ #include #endif /* !_NETINET_TCP_VAR_H_ */ ================================================ FILE: sys/netinet/udp.h ================================================ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp.h 8.1 (Berkeley) 6/10/93 * $FreeBSD: src/sys/netinet/udp.h,v 1.10 2007/02/20 10:13:11 rwatson Exp $ */ #ifndef _NETINET_UDP_H_ #define _NETINET_UDP_H_ /* * UDP protocol header. * Per RFC 768, September, 1981. */ struct udphdr { u_short uh_sport; /* source port */ u_short uh_dport; /* destination port */ u_short uh_ulen; /* udp length */ u_short uh_sum; /* udp checksum */ }; /* * User-settable options (used with setsockopt). */ #define UDP_ENCAP 0x01 /* * UDP Encapsulation of IPsec Packets options. */ /* Encapsulation types. */ #define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */ #define UDP_ENCAP_ESPINUDP 2 /* draft-ietf-ipsec-udp-encaps-02+ */ /* Default ESP in UDP encapsulation port. */ #define UDP_ENCAP_ESPINUDP_PORT 500 /* Maximum UDP fragment size for ESP over UDP. */ #define UDP_ENCAP_ESPINUDP_MAXFRAGLEN 552 #endif ================================================ FILE: sys/sys/cdefs.h ================================================ #ifndef _CDEFS_H_ #define _CDEFS_H_ /* * various compiler macros and common functions */ #ifndef __packed #define __packed __attribute__ ((__packed__)) #endif #ifndef __aligned #define __aligned(x) __attribute__((__aligned__(x))) #endif /* defined as assert */ void panic(const char *fmt, ...); #define KASSERT(exp,msg) do { \ if (__predict_false(!(exp))) \ panic msg; \ } while (0) /* don't bother to optimize */ #ifndef __predict_false #define __predict_false(x) (x) /* __builtin_expect((exp), 0) */ #endif #endif /* !_CDEFS_H_ */ ================================================ FILE: sys/sys/kernel.h ================================================ /* * from freebsd's kernel.h */ #ifndef _SYS_KERNEL_H_ #define _SYS_KERNEL_H_ #define SYSINIT(a, b, c, d, e) \ void *sysinit_ ## d = d #define VNET_SYSINIT(a, b, c, d, e) \ void *sysinit_ ## d = d #define SYSUNINIT(a, b, c, d, e) \ void *sysuninit_ ## d = d #define VNET_SYSUNINIT(a, b, c, d, e) \ void *sysuninit_ ## d = d /* * Some enumerated orders; "ANY" sorts last. */ enum sysinit_elem_order { SI_ORDER_FIRST = 0x0000000, /* first*/ SI_ORDER_SECOND = 0x0000001, /* second*/ SI_ORDER_THIRD = 0x0000002, /* third*/ SI_ORDER_MIDDLE = 0x1000000, /* somewhere in the middle */ SI_ORDER_ANY = 0xfffffff /* last*/ }; #endif ================================================ FILE: sys/sys/malloc.h ================================================ #ifndef _SYS_MALLOC_H_ #define _SYS_MALLOC_H_ /* * No matter what, try to get clear memory and be non-blocking. * XXX check if 2.4 has a native way to zero memory, * XXX obey to the flags (M_NOWAIT <-> GPF_ATOMIC, M_WAIT <-> GPF_KERNEL) */ #ifndef _WIN32 /* this is the linux version */ /* * XXX On zeroshell (2.6.25.17) we get a load error * __you_cannot_kmalloc_that_much * which is triggered when kmalloc() is called with a large * compile-time constant argument (include/linux/slab_def.h) * * I think it may be a compiler (or source) bug because there is no * evidence that such a large request is made. * Making the _size argument to kmalloc volatile prevents the compiler * from making the mistake, though it is clearly not ideal. */ #if !defined (LINUX_24) && LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22) #define malloc(_size, type, flags) \ ({ volatile int _v = _size; kmalloc(_v, GFP_ATOMIC | __GFP_ZERO); }) #else /* LINUX <= 2.6.22 and LINUX_24 */ /* linux 2.6.22 does not zero allocated memory */ #define malloc(_size, type, flags) \ ({ int _s = _size; \ void *_ret = kmalloc(_s, GFP_ATOMIC); \ if (_ret) memset(_ret, 0, _s); \ (_ret); \ }) #endif /* LINUX <= 2.6.22 */ #define calloc(_n, _s) malloc((_n * _s), NULL, GFP_ATOMIC | __GFP_ZERO) #define free(_var, type) kfree(_var) #else /* _WIN32, the windows version */ /* * ntddk.h uses win_malloc() and MmFreeContiguousMemory(). * wipfw uses * ExAllocatePoolWithTag(, pool, len, tag) * ExFreePoolWithTag(ptr, tag) */ #define malloc(_size, _type, _flags) my_alloc(_size) #define calloc(_size, _type, _flags) my_alloc(_size) void *my_alloc(int _size); /* the 'tag' version does not work without -Gz in the linker */ #define free(_var, type) ExFreePool(_var) //#define free(_var, type) ExFreePoolWithTag(_var, 'wfpi') #endif /* _WIN32 */ #define M_NOWAIT 0x0001 /* do not block */ #define M_ZERO 0x0100 /* bzero the allocation */ #endif /* _SYS_MALLOC_H_ */ ================================================ FILE: sys/sys/mbuf.h ================================================ /* * Copyright (C) 2009 Luigi Rizzo, Universita` di Pisa * * BSD copyright. * * A simple compatibility interface to map mbufs onto sk_buff */ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ #include /* we use free() */ /* hopefully queue.h is already included by someone else */ #include #ifdef _KERNEL /* bzero not present on linux, but this should go in glue.h */ // #define bzero(s, n) memset(s, 0, n) /* * We implement a very simplified UMA allocator where the backend * is simply malloc, and uma_zone only stores the length of the components. */ typedef int uma_zone_t; /* the zone size */ #define uma_zcreate(name, len, _3, _4, _5, _6, _7, _8) (len) #define uma_zfree(zone, item) free(item, M_IPFW) #define uma_zalloc(zone, flags) malloc(zone, M_IPFW, flags) #define uma_zdestroy(zone) do {} while (0) /*- * Macros for type conversion: * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type. */ #define mtod(m, t) ((t)((m)->m_data)) #endif /* _KERNEL */ /* * Packet tag structure (see below for details). */ struct m_tag { SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ u_int16_t m_tag_id; /* Tag ID */ u_int16_t m_tag_len; /* Length of data */ u_int32_t m_tag_cookie; /* ABI/Module ID */ void (*m_tag_free)(struct m_tag *); }; #if defined(__linux__) || defined( _WIN32 ) /* * Auxiliary structure to store values from the sk_buf. * Note that we should not alter the sk_buff, and if we do * so make sure to keep the values in sync between the mbuf * and the sk_buff (especially m_len and m_pkthdr.len). */ struct mbuf { struct mbuf *m_next; struct mbuf *m_nextpkt; char *m_data; // XXX was void * int m_len; /* length in this mbuf */ int m_flags; #ifdef __linux__ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) struct nf_info *queue_entry; #else struct nf_queue_entry *queue_entry; #endif #else /* _WIN32 */ int direction; /* could go in rcvif */ NDIS_HANDLE context; /* replaces queue_entry or skb ?*/ PNDIS_PACKET pkt; #endif struct sk_buff *m_skb; struct { #ifdef __linux__ struct net_device *rcvif; #else struct ifnet *rcvif; #endif int len; /* total packet len */ SLIST_HEAD (packet_tags, m_tag) tags; } m_pkthdr; }; #define M_SKIP_FIREWALL 0x01 /* skip firewall processing */ #define M_BCAST 0x02 /* send/received as link-level broadcast */ #define M_MCAST 0x04 /* send/received as link-level multicast */ #define M_DONTWAIT M_NOWAIT /* should not be here... */ /* * m_dup() is used in the TEE case, currently unsupported so we * just return. */ static __inline struct mbuf *m_dup(struct mbuf *m, int n) { (void)m; (void)n; return NULL; }; #define MTAG_ABI_COMPAT 0 /* compatibility ABI */ static __inline struct m_tag * m_tag_find(struct mbuf *m, int type, struct m_tag *start) { (void)m; (void)type; (void)start; return NULL; }; static __inline void m_tag_prepend(struct mbuf *m, struct m_tag *t) { SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); } /* * Return the next tag in the list of tags associated with an mbuf. */ static __inline struct m_tag * m_tag_next(struct mbuf *m, struct m_tag *t) { return (SLIST_NEXT(t, m_tag_link)); } /* * Create an mtag of the given type */ static __inline struct m_tag * m_tag_alloc(uint32_t cookie, int type, int length, int wait) { int l = length + sizeof(struct m_tag); struct m_tag *m = malloc(l, 0, M_NOWAIT); if (m) { memset(m, 0, l); m->m_tag_id = type; m->m_tag_len = length; m->m_tag_cookie = cookie; } return m; }; static __inline struct m_tag * m_tag_get(int type, int length, int wait) { return m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait); } static __inline struct m_tag * m_tag_first(struct mbuf *m) { return SLIST_FIRST(&m->m_pkthdr.tags); }; static __inline void m_tag_delete(struct mbuf *m, struct m_tag *t) { }; static __inline struct m_tag * m_tag_locate(struct mbuf *m, u_int32_t n, int x, struct m_tag *t) { struct m_tag *tag; tag = m_tag_first(m); if (tag == NULL) return NULL; if (tag->m_tag_cookie != n || tag->m_tag_id != x) return NULL; else return tag; }; #define M_SETFIB(_m, _fib) /* nothing on linux */ static __inline void m_freem(struct mbuf *m) { struct m_tag *t; /* free the m_tag chain */ while ( (t = SLIST_FIRST(&m->m_pkthdr.tags) ) ) { SLIST_REMOVE_HEAD(&m->m_pkthdr.tags, m_tag_link); free(t, 0); } /* free the mbuf */ free(m, M_IPFW); }; /* m_pullup is not supported, there is a macro in missing.h */ #define M_GETFIB(_m) 0 /* macro used to create a new mbuf */ #define MT_DATA 1 /* dynamic (data) allocation */ #define MSIZE 256 /* size of an mbuf */ #define MGETHDR(_m, _how, _type) ((_m) = m_gethdr((_how), (_type))) /* allocate and init a new mbuf using the same structure of FreeBSD */ static __inline struct mbuf * m_gethdr(int how, short type) { struct mbuf *m; m = malloc(MSIZE, M_IPFW, M_NOWAIT); if (m == NULL) { return m; } /* here we have MSIZE - sizeof(struct mbuf) available */ m->m_data = (char *)(m + 1); return m; } #endif /* __linux__ || _WIN32 */ /* * Persistent tags stay with an mbuf until the mbuf is reclaimed. Otherwise * tags are expected to ``vanish'' when they pass through a network * interface. For most interfaces this happens normally as the tags are * reclaimed when the mbuf is free'd. However in some special cases * reclaiming must be done manually. An example is packets that pass through * the loopback interface. Also, one must be careful to do this when * ``turning around'' packets (e.g., icmp_reflect). * * To mark a tag persistent bit-or this flag in when defining the tag id. * The tag will then be treated as described above. */ #define MTAG_PERSISTENT 0x800 #define PACKET_TAG_NONE 0 /* Nadda */ /* Packet tags for use with PACKET_ABI_COMPAT. */ #define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ #define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ #define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ #define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ #define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ #define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ #define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ #define PACKET_TAG_GIF 8 /* GIF processing done */ #define PACKET_TAG_GRE 9 /* GRE processing done */ #define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ #define PACKET_TAG_ENCAP 11 /* Encap. processing */ #define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ #define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ #define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ #define PACKET_TAG_DUMMYNET 15 /* dummynet info */ #define PACKET_TAG_DIVERT 17 /* divert info */ #define PACKET_TAG_IPFORWARD 18 /* ipforward info */ #define PACKET_TAG_MACLABEL (19 | MTAG_PERSISTENT) /* MAC label */ #define PACKET_TAG_PF 21 /* PF + ALTQ information */ #define PACKET_TAG_RTSOCKFAM 25 /* rtsock sa family */ #define PACKET_TAG_IPOPTIONS 27 /* Saved IP options */ #define PACKET_TAG_CARP 28 /* CARP info */ #endif /* !_SYS_MBUF_H_ */ ================================================ FILE: sys/sys/module.h ================================================ /* * trivial module support */ #ifndef _SYS_MODULE_H_ #define _SYS_MODULE_H_ typedef struct module *module_t; typedef int (*modeventhand_t)(module_t, int /* modeventtype_t */, void *); typedef enum modeventtype { MOD_LOAD, MOD_UNLOAD, MOD_SHUTDOWN, MOD_QUIESCE } modeventtype_t; typedef struct moduledata { const char *name; /* module name */ modeventhand_t evhand; /* event handler */ void *priv; /* extra data */ } moduledata_t; /* * Hook the module descriptor, md, into our list of things to do. * We should in principle respect the order of loading. * * XXX use the gcc .init functions */ #define DECLARE_MODULE(a, md, c,d) \ moduledata_t *moddesc_##a = &md; /* * XXX MODULE_VERSION is define in linux too */ #define MODULE_DEPEND(a,b,c,d,e) #if defined( __linux__ ) || defined( _WIN32 ) #undef MODULE_VERSION #define MODULE_VERSION(a,b) #endif #endif /* _SYS_MODULE_H_ */ ================================================ FILE: sys/sys/param.h ================================================ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ /* * number of additional groups */ #ifndef LINUX_24 #define NGROUPS 16 #endif #endif /* _SYS_PARAM_H_ */ ================================================ FILE: sys/sys/queue.h ================================================ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)queue.h 8.5 (Berkeley) 8/20/94 * $FreeBSD: src/sys/sys/queue.h,v 1.68 2006/10/24 11:20:29 ru Exp $ */ #ifndef _SYS_QUEUE_H_ #define _SYS_QUEUE_H_ //#include /* * This file defines four types of data structures: singly-linked lists, * singly-linked tail queues, lists and tail queues. * * A singly-linked list is headed by a single forward pointer. The elements * are singly linked for minimum space and pointer manipulation overhead at * the expense of O(n) removal for arbitrary elements. New elements can be * added to the list after an existing element or at the head of the list. * Elements being removed from the head of the list should use the explicit * macro for this purpose for optimum efficiency. A singly-linked list may * only be traversed in the forward direction. Singly-linked lists are ideal * for applications with large datasets and few or no removals or for * implementing a LIFO queue. * * A singly-linked tail queue is headed by a pair of pointers, one to the * head of the list and the other to the tail of the list. The elements are * singly linked for minimum space and pointer manipulation overhead at the * expense of O(n) removal for arbitrary elements. New elements can be added * to the list after an existing element, at the head of the list, or at the * end of the list. Elements being removed from the head of the tail queue * should use the explicit macro for this purpose for optimum efficiency. * A singly-linked tail queue may only be traversed in the forward direction. * Singly-linked tail queues are ideal for applications with large datasets * and few or no removals or for implementing a FIFO queue. * * A list is headed by a single forward pointer (or an array of forward * pointers for a hash table header). The elements are doubly linked * so that an arbitrary element can be removed without a need to * traverse the list. New elements can be added to the list before * or after an existing element or at the head of the list. A list * may only be traversed in the forward direction. * * A tail queue is headed by a pair of pointers, one to the head of the * list and the other to the tail of the list. The elements are doubly * linked so that an arbitrary element can be removed without a need to * traverse the list. New elements can be added to the list before or * after an existing element, at the head of the list, or at the end of * the list. A tail queue may be traversed in either direction. * * For details on the use of these macros, see the queue(3) manual page. * * * SLIST LIST STAILQ TAILQ * _HEAD + + + + * _HEAD_INITIALIZER + + + + * _ENTRY + + + + * _INIT + + + + * _EMPTY + + + + * _FIRST + + + + * _NEXT + + + + * _PREV - - - + * _LAST - - + + * _FOREACH + + + + * _FOREACH_SAFE + + + + * _FOREACH_REVERSE - - - + * _FOREACH_REVERSE_SAFE - - - + * _INSERT_HEAD + + + + * _INSERT_BEFORE - + - + * _INSERT_AFTER + + + + * _INSERT_TAIL - - + + * _CONCAT - - + + * _REMOVE_HEAD + - + - * _REMOVE + + + + * */ #ifdef QUEUE_MACRO_DEBUG /* Store the last 2 places the queue element or head was altered */ struct qm_trace { char * lastfile; int lastline; char * prevfile; int prevline; }; #define TRACEBUF struct qm_trace trace; #define TRASHIT(x) do {(x) = (void *)-1;} while (0) #define QMD_TRACE_HEAD(head) do { \ (head)->trace.prevline = (head)->trace.lastline; \ (head)->trace.prevfile = (head)->trace.lastfile; \ (head)->trace.lastline = __LINE__; \ (head)->trace.lastfile = __FILE__; \ } while (0) #define QMD_TRACE_ELEM(elem) do { \ (elem)->trace.prevline = (elem)->trace.lastline; \ (elem)->trace.prevfile = (elem)->trace.lastfile; \ (elem)->trace.lastline = __LINE__; \ (elem)->trace.lastfile = __FILE__; \ } while (0) #else #define QMD_TRACE_ELEM(elem) #define QMD_TRACE_HEAD(head) #define TRACEBUF #define TRASHIT(x) #endif /* QUEUE_MACRO_DEBUG */ /* * Singly-linked List declarations. */ #define SLIST_HEAD(name, type) \ struct name { \ struct type *slh_first; /* first element */ \ } #define SLIST_HEAD_INITIALIZER(head) \ { NULL } #if defined( _WIN32 ) && defined(SLIST_ENTRY) #undef SLIST_ENTRY #endif #define SLIST_ENTRY(type) \ struct { \ struct type *sle_next; /* next element */ \ } /* * Singly-linked List functions. */ #define SLIST_EMPTY(head) ((head)->slh_first == NULL) #define SLIST_FIRST(head) ((head)->slh_first) #define SLIST_FOREACH(var, head, field) \ for ((var) = SLIST_FIRST((head)); \ (var); \ (var) = SLIST_NEXT((var), field)) #define SLIST_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = SLIST_FIRST((head)); \ (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ (var) = (tvar)) #define SLIST_FOREACH_PREVPTR(var, varp, head, field) \ for ((varp) = &SLIST_FIRST((head)); \ ((var) = *(varp)) != NULL; \ (varp) = &SLIST_NEXT((var), field)) #define SLIST_INIT(head) do { \ SLIST_FIRST((head)) = NULL; \ } while (0) #define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ SLIST_NEXT((slistelm), field) = (elm); \ } while (0) #define SLIST_INSERT_HEAD(head, elm, field) do { \ SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ SLIST_FIRST((head)) = (elm); \ } while (0) #define SLIST_NEXT(elm, field) ((elm)->field.sle_next) #define SLIST_REMOVE(head, elm, type, field) do { \ if (SLIST_FIRST((head)) == (elm)) { \ SLIST_REMOVE_HEAD((head), field); \ } \ else { \ struct type *curelm = SLIST_FIRST((head)); \ while (SLIST_NEXT(curelm, field) != (elm)) \ curelm = SLIST_NEXT(curelm, field); \ SLIST_NEXT(curelm, field) = \ SLIST_NEXT(SLIST_NEXT(curelm, field), field); \ } \ TRASHIT((elm)->field.sle_next); \ } while (0) #define SLIST_REMOVE_HEAD(head, field) do { \ SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ } while (0) /* * Singly-linked Tail queue declarations. */ #define STAILQ_HEAD(name, type) \ struct name { \ struct type *stqh_first;/* first element */ \ struct type **stqh_last;/* addr of last next element */ \ } #define STAILQ_HEAD_INITIALIZER(head) \ { NULL, &(head).stqh_first } #define STAILQ_ENTRY(type) \ struct { \ struct type *stqe_next; /* next element */ \ } /* * Singly-linked Tail queue functions. */ #define STAILQ_CONCAT(head1, head2) do { \ if (!STAILQ_EMPTY((head2))) { \ *(head1)->stqh_last = (head2)->stqh_first; \ (head1)->stqh_last = (head2)->stqh_last; \ STAILQ_INIT((head2)); \ } \ } while (0) #define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) #define STAILQ_FIRST(head) ((head)->stqh_first) #define STAILQ_FOREACH(var, head, field) \ for((var) = STAILQ_FIRST((head)); \ (var); \ (var) = STAILQ_NEXT((var), field)) #define STAILQ_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = STAILQ_FIRST((head)); \ (var) && ((tvar) = STAILQ_NEXT((var), field), 1); \ (var) = (tvar)) #define STAILQ_INIT(head) do { \ STAILQ_FIRST((head)) = NULL; \ (head)->stqh_last = &STAILQ_FIRST((head)); \ } while (0) #define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ (head)->stqh_last = &STAILQ_NEXT((elm), field); \ STAILQ_NEXT((tqelm), field) = (elm); \ } while (0) #define STAILQ_INSERT_HEAD(head, elm, field) do { \ if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ (head)->stqh_last = &STAILQ_NEXT((elm), field); \ STAILQ_FIRST((head)) = (elm); \ } while (0) #define STAILQ_INSERT_TAIL(head, elm, field) do { \ STAILQ_NEXT((elm), field) = NULL; \ *(head)->stqh_last = (elm); \ (head)->stqh_last = &STAILQ_NEXT((elm), field); \ } while (0) #define STAILQ_LAST(head, type, field) \ (STAILQ_EMPTY((head)) ? \ NULL : \ ((struct type *)(void *) \ ((char *)((head)->stqh_last) - __offsetof(struct type, field)))) #define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) #define STAILQ_REMOVE(head, elm, type, field) do { \ if (STAILQ_FIRST((head)) == (elm)) { \ STAILQ_REMOVE_HEAD((head), field); \ } \ else { \ struct type *curelm = STAILQ_FIRST((head)); \ while (STAILQ_NEXT(curelm, field) != (elm)) \ curelm = STAILQ_NEXT(curelm, field); \ if ((STAILQ_NEXT(curelm, field) = \ STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\ (head)->stqh_last = &STAILQ_NEXT((curelm), field);\ } \ TRASHIT((elm)->field.stqe_next); \ } while (0) #define STAILQ_REMOVE_HEAD(head, field) do { \ if ((STAILQ_FIRST((head)) = \ STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \ (head)->stqh_last = &STAILQ_FIRST((head)); \ } while (0) #ifndef LIST_HEAD /* * List declarations. */ #define LIST_HEAD(name, type) \ struct name { \ struct type *lh_first; /* first element */ \ } #define LIST_HEAD_INITIALIZER(head) \ { NULL } #define LIST_ENTRY(type) \ struct { \ struct type *le_next; /* next element */ \ struct type **le_prev; /* address of previous next element */ \ } /* * List functions. */ #if (defined(_KERNEL) && defined(INVARIANTS)) #define QMD_LIST_CHECK_HEAD(head, field) do { \ if (LIST_FIRST((head)) != NULL && \ LIST_FIRST((head))->field.le_prev != \ &LIST_FIRST((head))) \ panic("Bad list head %p first->prev != head", (head)); \ } while (0) #define QMD_LIST_CHECK_NEXT(elm, field) do { \ if (LIST_NEXT((elm), field) != NULL && \ LIST_NEXT((elm), field)->field.le_prev != \ &((elm)->field.le_next)) \ panic("Bad link elm %p next->prev != elm", (elm)); \ } while (0) #define QMD_LIST_CHECK_PREV(elm, field) do { \ if (*(elm)->field.le_prev != (elm)) \ panic("Bad link elm %p prev->next != elm", (elm)); \ } while (0) #else #define QMD_LIST_CHECK_HEAD(head, field) #define QMD_LIST_CHECK_NEXT(elm, field) #define QMD_LIST_CHECK_PREV(elm, field) #endif /* (_KERNEL && INVARIANTS) */ #define LIST_EMPTY(head) ((head)->lh_first == NULL) #define LIST_FIRST(head) ((head)->lh_first) #define LIST_FOREACH(var, head, field) \ for ((var) = LIST_FIRST((head)); \ (var); \ (var) = LIST_NEXT((var), field)) #define LIST_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = LIST_FIRST((head)); \ (var) && ((tvar) = LIST_NEXT((var), field), 1); \ (var) = (tvar)) #define LIST_INIT(head) do { \ LIST_FIRST((head)) = NULL; \ } while (0) #define LIST_INSERT_AFTER(listelm, elm, field) do { \ QMD_LIST_CHECK_NEXT(listelm, field); \ if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ LIST_NEXT((listelm), field)->field.le_prev = \ &LIST_NEXT((elm), field); \ LIST_NEXT((listelm), field) = (elm); \ (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ } while (0) #define LIST_INSERT_BEFORE(listelm, elm, field) do { \ QMD_LIST_CHECK_PREV(listelm, field); \ (elm)->field.le_prev = (listelm)->field.le_prev; \ LIST_NEXT((elm), field) = (listelm); \ *(listelm)->field.le_prev = (elm); \ (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ } while (0) #define LIST_INSERT_HEAD(head, elm, field) do { \ QMD_LIST_CHECK_HEAD((head), field); \ if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ LIST_FIRST((head)) = (elm); \ (elm)->field.le_prev = &LIST_FIRST((head)); \ } while (0) #define LIST_NEXT(elm, field) ((elm)->field.le_next) #define LIST_REMOVE(elm, field) do { \ QMD_LIST_CHECK_NEXT(elm, field); \ QMD_LIST_CHECK_PREV(elm, field); \ if (LIST_NEXT((elm), field) != NULL) \ LIST_NEXT((elm), field)->field.le_prev = \ (elm)->field.le_prev; \ *(elm)->field.le_prev = LIST_NEXT((elm), field); \ TRASHIT((elm)->field.le_next); \ TRASHIT((elm)->field.le_prev); \ } while (0) #endif /* LIST_HEAD */ /* * Tail queue declarations. */ #define TAILQ_HEAD(name, type) \ struct name { \ struct type *tqh_first; /* first element */ \ struct type **tqh_last; /* addr of last next element */ \ TRACEBUF \ } #define TAILQ_HEAD_INITIALIZER(head) \ { NULL, &(head).tqh_first } #define TAILQ_ENTRY(type) \ struct { \ struct type *tqe_next; /* next element */ \ struct type **tqe_prev; /* address of previous next element */ \ TRACEBUF \ } /* * Tail queue functions. */ #if (defined(_KERNEL) && defined(INVARIANTS)) #define QMD_TAILQ_CHECK_HEAD(head, field) do { \ if (!TAILQ_EMPTY(head) && \ TAILQ_FIRST((head))->field.tqe_prev != \ &TAILQ_FIRST((head))) \ panic("Bad tailq head %p first->prev != head", (head)); \ } while (0) #define QMD_TAILQ_CHECK_TAIL(head, field) do { \ if (*(head)->tqh_last != NULL) \ panic("Bad tailq NEXT(%p->tqh_last) != NULL", (head)); \ } while (0) #define QMD_TAILQ_CHECK_NEXT(elm, field) do { \ if (TAILQ_NEXT((elm), field) != NULL && \ TAILQ_NEXT((elm), field)->field.tqe_prev != \ &((elm)->field.tqe_next)) \ panic("Bad link elm %p next->prev != elm", (elm)); \ } while (0) #define QMD_TAILQ_CHECK_PREV(elm, field) do { \ if (*(elm)->field.tqe_prev != (elm)) \ panic("Bad link elm %p prev->next != elm", (elm)); \ } while (0) #else #define QMD_TAILQ_CHECK_HEAD(head, field) #define QMD_TAILQ_CHECK_TAIL(head, headname) #define QMD_TAILQ_CHECK_NEXT(elm, field) #define QMD_TAILQ_CHECK_PREV(elm, field) #endif /* (_KERNEL && INVARIANTS) */ #define TAILQ_CONCAT(head1, head2, field) do { \ if (!TAILQ_EMPTY(head2)) { \ *(head1)->tqh_last = (head2)->tqh_first; \ (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \ (head1)->tqh_last = (head2)->tqh_last; \ TAILQ_INIT((head2)); \ QMD_TRACE_HEAD(head1); \ QMD_TRACE_HEAD(head2); \ } \ } while (0) #define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) #define TAILQ_FIRST(head) ((head)->tqh_first) #define TAILQ_FOREACH(var, head, field) \ for ((var) = TAILQ_FIRST((head)); \ (var); \ (var) = TAILQ_NEXT((var), field)) #define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = TAILQ_FIRST((head)); \ (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ (var) = (tvar)) #define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ for ((var) = TAILQ_LAST((head), headname); \ (var); \ (var) = TAILQ_PREV((var), headname, field)) #define TAILQ_FOREACH_REVERSE_SAFE(var, head, headname, field, tvar) \ for ((var) = TAILQ_LAST((head), headname); \ (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ (var) = (tvar)) #define TAILQ_INIT(head) do { \ TAILQ_FIRST((head)) = NULL; \ (head)->tqh_last = &TAILQ_FIRST((head)); \ QMD_TRACE_HEAD(head); \ } while (0) #define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ QMD_TAILQ_CHECK_NEXT(listelm, field); \ if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ TAILQ_NEXT((elm), field)->field.tqe_prev = \ &TAILQ_NEXT((elm), field); \ else { \ (head)->tqh_last = &TAILQ_NEXT((elm), field); \ QMD_TRACE_HEAD(head); \ } \ TAILQ_NEXT((listelm), field) = (elm); \ (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ QMD_TRACE_ELEM(&(elm)->field); \ QMD_TRACE_ELEM(&listelm->field); \ } while (0) #define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ QMD_TAILQ_CHECK_PREV(listelm, field); \ (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ TAILQ_NEXT((elm), field) = (listelm); \ *(listelm)->field.tqe_prev = (elm); \ (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ QMD_TRACE_ELEM(&(elm)->field); \ QMD_TRACE_ELEM(&listelm->field); \ } while (0) #define TAILQ_INSERT_HEAD(head, elm, field) do { \ QMD_TAILQ_CHECK_HEAD(head, field); \ if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ TAILQ_FIRST((head))->field.tqe_prev = \ &TAILQ_NEXT((elm), field); \ else \ (head)->tqh_last = &TAILQ_NEXT((elm), field); \ TAILQ_FIRST((head)) = (elm); \ (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ QMD_TRACE_HEAD(head); \ QMD_TRACE_ELEM(&(elm)->field); \ } while (0) #define TAILQ_INSERT_TAIL(head, elm, field) do { \ QMD_TAILQ_CHECK_TAIL(head, field); \ TAILQ_NEXT((elm), field) = NULL; \ (elm)->field.tqe_prev = (head)->tqh_last; \ *(head)->tqh_last = (elm); \ (head)->tqh_last = &TAILQ_NEXT((elm), field); \ QMD_TRACE_HEAD(head); \ QMD_TRACE_ELEM(&(elm)->field); \ } while (0) #define TAILQ_LAST(head, headname) \ (*(((struct headname *)((head)->tqh_last))->tqh_last)) #define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) #define TAILQ_PREV(elm, headname, field) \ (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) #define TAILQ_REMOVE(head, elm, field) do { \ QMD_TAILQ_CHECK_NEXT(elm, field); \ QMD_TAILQ_CHECK_PREV(elm, field); \ if ((TAILQ_NEXT((elm), field)) != NULL) \ TAILQ_NEXT((elm), field)->field.tqe_prev = \ (elm)->field.tqe_prev; \ else { \ (head)->tqh_last = (elm)->field.tqe_prev; \ QMD_TRACE_HEAD(head); \ } \ *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ TRASHIT((elm)->field.tqe_next); \ TRASHIT((elm)->field.tqe_prev); \ QMD_TRACE_ELEM(&(elm)->field); \ } while (0) #ifdef _KERNEL /* * XXX insque() and remque() are an old way of handling certain queues. * They bogusly assumes that all queue heads look alike. */ struct quehead { struct quehead *qh_link; struct quehead *qh_rlink; }; #ifdef __CC_SUPPORTS___INLINE static __inline void insque(void *a, void *b) { struct quehead *element = (struct quehead *)a, *head = (struct quehead *)b; element->qh_link = head->qh_link; element->qh_rlink = head; head->qh_link = element; element->qh_link->qh_rlink = element; } static __inline void remque(void *a) { struct quehead *element = (struct quehead *)a; element->qh_link->qh_rlink = element->qh_rlink; element->qh_rlink->qh_link = element->qh_link; element->qh_rlink = 0; } #else /* !__CC_SUPPORTS___INLINE */ void insque(void *a, void *b); void remque(void *a); #endif /* __CC_SUPPORTS___INLINE */ #endif /* _KERNEL */ #endif /* !_SYS_QUEUE_H_ */ ================================================ FILE: sys/sys/syslog.h ================================================ #ifndef _SYS_SYSLOG_H_ #define _SYS_SYSLOG_H_ /* XXX find linux equivalent */ #define LOG_SECURITY 0 #define LOG_NOTICE 0 #define LOG_DEBUG 0 #endif /* _SYS_SYSLOG_H_ */ ================================================ FILE: sys/sys/systm.h ================================================ #ifndef _SYS_SYSTM_H_ #define _SYS_SYSTM_H_ #define CALLOUT_ACTIVE 0x0002 /* callout is currently active */ #define CALLOUT_MPSAFE 0x0008 /* callout handler is mp safe */ #ifndef _WIN32 /* this is the linux version */ /* callout support, in on FreeBSD */ /* * callout support on linux module is done using timers */ #include #ifdef LINUX_24 #include /* jiffies definition is here in 2.4 */ #endif #define callout timer_list static __inline int callout_reset_on(struct callout *co, int ticks, void (*fn)(void *), void *arg, int cpu) { co->expires = jiffies + ticks; co->function = (void (*)(unsigned long))fn; co->data = (unsigned long)arg; /* * Linux 2.6.31 and above has add_timer_on(co, cpu), * otherwise add_timer() always schedules a callout on the same * CPU used the first time, so we don't need more. */ add_timer(co); return 0; } #define callout_init(co, safe) init_timer(co) #define callout_drain(co) del_timer(co) #define callout_stop(co) del_timer(co) #else /* _WIN32 */ #include /* This is the windows part for callout support */ struct callout { KTIMER thetimer; KDPC timerdpc; int dpcinitialized; LARGE_INTEGER duetime; }; void dummynet (void*); VOID dummynet_dpc( __in struct _KDPC *Dpc, __in_opt PVOID DeferredContext, __in_opt PVOID SystemArgument1, __in_opt PVOID SystemArgument2 ); VOID ipfw_dpc( __in struct _KDPC *Dpc, __in_opt PVOID DeferredContext, __in_opt PVOID SystemArgument1, __in_opt PVOID SystemArgument2 ); /* callout_reset must handle two problems: * - dummynet() scheduler must be run always on the same processor * because do_gettimeofday() is based on cpu performance counter, and * _occasionally_ can leap backward in time if we query another cpu. * typically this won't happen that much, and the cpu will almost always * be the same even without the affinity restriction, but better to be sure. * - ipfw_tick() does not have the granularity requirements of dummynet() * but we need to pass a pointer as argument. * * for these reasons, if we are called for dummynet() timer, * KeInitializeDpc is called only once as it should be, and the thread * is forced on cpu0 (which is always present), while if we're called * for ipfw_tick(), we re-initialize the DPC each time, using * parameter DeferredContext to pass the needed pointer. since this * timer is called only once a sec, this won't hurt that much. */ static __inline int callout_reset_on(struct callout *co, int ticks, void (*fn)(void *), void *arg, int cpu) { if(fn == &dummynet) { if(co->dpcinitialized == 0) { KeInitializeDpc(&co->timerdpc, dummynet_dpc, NULL); KeSetTargetProcessorDpc(&co->timerdpc, cpu); co->dpcinitialized = 1; } } else { KeInitializeDpc(&co->timerdpc, ipfw_dpc, arg); } co->duetime.QuadPart = (-ticks)*10000; KeSetTimer(&co->thetimer, co->duetime, &co->timerdpc); return 0; } static __inline void callout_init(struct callout* co, int safe) { printf("%s: initializing timer at %p\n",__FUNCTION__,co); KeInitializeTimer(&co->thetimer); } static __inline int callout_drain(struct callout* co) { BOOLEAN canceled = KeCancelTimer(&co->thetimer); while (canceled != TRUE) { canceled = KeCancelTimer(&co->thetimer); } printf("%s: stopping timer at %p\n",__FUNCTION__,co); return 0; } static __inline int callout_stop(struct callout* co) { return callout_drain(co); } #endif /* _WIN32 */ #endif /* _SYS_SYSTM_H_ */ ================================================ FILE: sys/sys/taskqueue.h ================================================ #ifndef _SYS_TASKQUEUE_H_ #define _SYS_TASKQUEUE_H_ /* * Remap taskqueue to direct calls */ #ifdef _WIN32 struct task { void (*func)(void*, int); }; #define taskqueue_enqueue(tq, ta) (ta)->func(NULL,1) #define TASK_INIT(a,b,c,d) do { \ (a)->func = (c); } while (0) #else struct task { void (*func)(void); }; #define taskqueue_enqueue(tq, ta) (ta)->func() #define TASK_INIT(a,b,c,d) do { \ (a)->func = (void (*)(void))c; } while (0) #endif #define taskqueue_create_fast(_a, _b, _c, _d) NULL #define taskqueue_start_threads(_a, _b, _c, _d) #define taskqueue_drain(_a, _b) /* XXX to be completed */ #define taskqueue_free(_a) /* XXX to be completed */ #define PRI_MIN (0) /* Highest priority. */ #define PRI_MIN_ITHD (PRI_MIN) #define PI_NET (PRI_MIN_ITHD + 16) #endif /* !_SYS_TASKQUEUE_H_ */ ================================================ FILE: tcc_glue.h ================================================ /* * Copyright (c) 2010 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * headers to build userland ipfw under tcc. */ #ifndef _TCC_GLUE_H #define _TCC_GLUE_H //#define __restrict #define NULL ((void *)0) typedef int size_t; typedef unsigned char u_char; typedef unsigned char uint8_t; typedef unsigned char u_int8_t; typedef unsigned short u_short; typedef unsigned short uint16_t; typedef unsigned short u_int16_t; typedef int __int32_t; typedef int int32_t; typedef int socklen_t; typedef int pid_t; typedef unsigned int time_t; typedef unsigned int uint; typedef unsigned int u_int; typedef unsigned int uint32_t; typedef unsigned int u_int32_t; typedef unsigned int gid_t; typedef unsigned int uid_t; typedef unsigned long u_long; typedef unsigned long uintptr_t; typedef long long int int64_t; typedef unsigned long long int uint64_t; typedef unsigned long long int u_int64_t; typedef uint32_t in_addr_t; struct in_addr { uint32_t s_addr; }; struct sockaddr_in { uint8_t _sin_len; uint8_t sin_family; uint16_t sin_port; struct in_addr sin_addr; char sin_zero[8]; }; #define IFNAMSIZ 16 #define INET6_ADDRSTRLEN 64 struct in6_addr { union { uint8_t __s6_addr8[16]; uint16_t __s6_addr16[8]; uint32_t __s6_addr32[4]; } __u6; // _addr; /* 128-bit IP6 address */ }; #define LITTLE_ENDIAN 1234 #define BYTE_ORDER LITTLE_ENDIAN /* to be revised */ #define EX_OK 0 #define EX_DATAERR 1 #define EX_OSERR 2 #define EX_UNAVAILABLE 3 #define EX_USAGE 4 #define EX_NOHOST 5 #define EEXIST 1 #define EINVAL 2 #define ERANGE 3 #define ESRCH 4 #define IPPROTO_IP 1 #define IPPROTO_IPV6 2 #define IPPROTO_RAW 100 #define IPTOS_LOWDELAY 100 #define IPTOS_MINCOST 101 #define IPTOS_RELIABILITY 102 #define IPTOS_THROUGHPUT 103 #define SOCK_RAW 12 #define AF_INET 2 #define AF_INET6 28 #define INADDR_ANY 0 #define bcmp(src, dst, len) memcmp(src, dst, len) #define bcopy(src, dst, len) memcpy(dst, src, len) #define bzero(p, len) memset(p, 0, len) #define index(s, c) strchr(s, c) char *strsep(char **stringp, const char *delim); void warn(const char *, ...); //void warnx(const char *, ...); #define warnx warn void err(int, const char *, ...); #define errx err uint16_t htons(uint16_t)__attribute__ ((stdcall)); uint16_t ntohs(uint16_t)__attribute__ ((stdcall)); uint32_t htonl(uint32_t)__attribute__ ((stdcall)); uint32_t ntohl(uint32_t)__attribute__ ((stdcall)); int inet_aton(const char *cp, struct in_addr *pin)__attribute__ ((stdcall));; char * inet_ntoa(struct in_addr)__attribute__ ((stdcall));; const char * inet_ntop(int af, const void * src, char * dst, socklen_t size)__attribute__ ((stdcall));; int inet_pton(int af, const char * src, void * dst)__attribute__ ((stdcall));; struct group { gid_t gr_gid; char gr_name[16]; }; struct passwd { uid_t pw_uid; char pw_name[16]; }; #define getpwnam(s) (NULL) #define getpwuid(s) (NULL) #define getgrnam(x) (NULL) #define getgrgid(x) (NULL) int getopt(int argc, char * const argv[], const char *optstring); int getsockopt(int s, int level, int optname, void * optval, socklen_t * optlen); int setsockopt(int s, int level, int optname, const void *optval, socklen_t optlen); struct protoent { char *p_name; /* official protocol name */ char **p_aliases; /* alias list */ short p_proto; /* protocol # */ }; struct servent { char *s_name; /* official service name */ char **s_aliases; /* alias list */ short s_port; /* port # */ char *s_proto; /* protocol to use */ }; struct hostent { char *h_name; /* official name of host */ char **h_aliases; /* alias list */ short h_addrtype; /* host address type */ short h_length; /* length of address */ char **h_addr_list; /* list of addresses */ #define h_addr h_addr_list[0] /* address, for backward compat */ }; struct hostent* gethostbyaddr(const char* addr, int len, int type)__attribute__ ((stdcall)); struct hostent* gethostbyname(const char *name)__attribute__ ((stdcall)); struct protoent* getprotobynumber(int number)__attribute__ ((stdcall)); struct protoent* getprotobyname(const char* name)__attribute__ ((stdcall)); struct servent* getservbyport(int port, const char* proto)__attribute__ ((stdcall)); struct servent* getservbyname(const char* name, const char* proto) __attribute__ ((stdcall)); extern int optind; extern char *optarg; #include #define WSADESCRIPTION_LEN 256 #define WSASYS_STATUS_LEN 128 typedef struct WSAData { WORD wVersion; WORD wHighVersion; char szDescription[WSADESCRIPTION_LEN+1]; char szSystemStatus[WSASYS_STATUS_LEN+1]; unsigned short iMaxSockets; unsigned short iMaxUdpDg; char FAR * lpVendorInfo; } WSADATA, * LPWSADATA; int WSAStartup( WORD wVersionRequested, LPWSADATA lpWSAData ); int WSACleanup(void); int WSAGetLastError(); /* return error on process handling */ #define pipe(f) (-1) #define kill(p, s) (-1) #define waitpid(w,s,o) (-1) #define fork(x) (-1) #define execvp(f, a) (-1) #define _W_INT(i) (i) #define _WSTATUS(x) (_W_INT(x) & 0177) #define WIFEXITED(x) (_WSTATUS(x) == 0) #define WEXITSTATUS(x) (_W_INT(x) >> 8) #define _WSTOPPED 0177 /* _WSTATUS if process is stopped */ #define WIFSIGNALED(x) (_WSTATUS(x) != _WSTOPPED && _WSTATUS(x) != 0) #define WTERMSIG(x) (_WSTATUS(x)) #endif /* _TCC_GLUE_H */ ================================================ FILE: test/Makefile ================================================ # # $Id: Makefile 5626 2010-03-04 21:55:22Z luigi $ # # Makefile for building userland tests # this is written in a form compatible with gmake SCHED_SRCS = test_dn_sched.c SCHED_SRCS += dn_sched_fifo.c SCHED_SRCS += dn_sched_wf2q.c SCHED_SRCS += dn_sched_qfq.c SCHED_SRCS += dn_sched_rr.c SCHED_SRCS += dn_heap.c SCHED_SRCS += main.c SCHED_OBJS=$(SCHED_SRCS:.c=.o) HEAP_SRCS = dn_heap.c test_dn_heap.c HEAP_OBJS=$(HEAP_SRCS:.c=.o) VPATH= .:../dummynet2 #CFLAGS = -I../dummynet2/include -I. -Wall -Werror -O3 -DIPFW CFLAGS = -I. -I../dummynet2/include/netinet/ipfw -DIPFW CFLAGS += -Wall -Werror CFLAGS += -g -O3 TARGETS= test_sched # no test_heap by default all: $(TARGETS) test_heap : $(HEAP_OBJS) $(CC) -o $@ $(HEAP_OBJS) test_sched : $(SCHED_OBJS) $(CC) -o $@ $(SCHED_OBJS) $(SCHED_OBJS): dn_test.h main.o: mylist.h clean: - rm *.o $(TARGETS) *.core ALLSRCS = $(SCHED_SRCS) dn_test.h mylist.h \ dn_sched.h dn_heap.h ip_dn_private.h Makefile TMPBASE = /tmp/testXYZ TMPDIR = $(TMPBASE)/test tgz: -rm -rf $(TMPDIR) mkdir -p $(TMPDIR) -cp -p $(ALLSRCS) $(TMPDIR) -(cd ..; cp -p $(ALLSRCS) $(TMPDIR)) ls -la $(TMPDIR) (cd $(TMPBASE); tar cvzf /tmp/test.tgz test) ================================================ FILE: test/basic_ipfw.sh ================================================ #!/bin/sh IPFW=./ipfw/ipfw PING=/bin/ping RH=127.0.0.1 # remote host R=10 # test rule number P=1 # test pipe number abort() { echo $* } #insmod dummynet2/ipfw_mod.ko #$IPFW show > /dev/null #$IPFW pipe show echo "Flushing rules, do you agree ?" $IPFW flush # test_msg rule counter clean() { $IPFW delete $R 2> /dev/null $IPFW pipe $P delete 2> /dev/null } # simple counter/allow test echo -n "counter/allow test..." clean $IPFW add $R allow icmp from any to 127.0.0.1 > /dev/null $PING -f -c100 $RH > /dev/null counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f3` [ ! $counter -eq 400 ] && abort "Wrong counter $counter 400" echo "...OK" # simple drop test echo -n "deny test..." clean $IPFW add $R deny icmp from any to 127.0.0.1 > /dev/null $PING -f -c10 -W 1 $RH > /dev/null counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4` [ ! $counter -eq 10 ] && abort "Wrong counter $counter 10" echo "...OK" # pipe delay test echo -n "pipe delay test..." clean $IPFW pipe $P config delay 2000ms >/dev/null $IPFW add $R pipe $P icmp from any to $RH >/dev/null $PING -f -c10 -W 1 $RH > /dev/null counter1=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4` sleep 2 counter2=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4` [ ! $counter1 -eq 10 ] && abort "Wrong counter $counter 10" [ ! $counter2 -eq 20 ] && abort "Wrong counter $counter 20" echo "...OK" # pipe bw test echo -n "pipe bw test..." clean $IPFW pipe $P config bw 2Kbit/s >/dev/null $IPFW add $R pipe $P icmp from any to $RH >/dev/null $PING -i 0.1 -c10 -W 1 $RH > /dev/null counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4` [ $counter -gt 30 ] && abort "Wrong counter $counter should be < 30" sleep 1 counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4` [ $counter -gt 30 ] && abort "Wrong counter $counter should be < 30" echo "...OK" # Final clean clean ================================================ FILE: test/dn_test.h ================================================ /* * $Id: dn_test.h 5626 2010-03-04 21:55:22Z luigi $ * * userspace compatibility code for dummynet schedulers */ #ifndef _DN_TEST_H #define _DN_TEST_H #include #include #include #include /* bzero, ffs, ... */ #include /* strcmp */ #include #include #include extern int debug; #define ND(fmt, args...) do {} while (0) #define D1(fmt, args...) do {} while (0) #define D(fmt, args...) fprintf(stderr, "%-8s " fmt "\n", \ __FUNCTION__, ## args) #define DX(lev, fmt, args...) do { \ if (debug > lev) D(fmt, ## args); } while (0) #define offsetof(t,m) (int)((&((t *)0L)->m)) #include /* prevent include of other system headers */ #define _NETINET_IP_VAR_H_ /* ip_fw_args */ #define _IPFW2_H #define _SYS_MBUF_H_ enum { DN_QUEUE, }; enum { DN_SCHED_FIFO, DN_SCHED_WF2QP, }; struct dn_id { int type, subtype, len, id; }; struct dn_fs { int par[4]; /* flowset parameters */ /* simulation entries. * 'index' is not strictly necessary * y is used for the inverse mapping , */ int index; int y; /* inverse mapping */ int base_y; /* inverse mapping */ int next_y; /* inverse mapping */ int n_flows; int first_flow; int next_flow; /* first_flow + n_flows */ /* * when generating, let 'cur' go from 0 to n_flows-1, * then point to flow first_flow + cur */ int cur; }; struct dn_sch { }; struct dn_flow { struct dn_id oid; int length; int len_bytes; int drops; uint64_t tot_bytes; uint32_t flow_id; struct list_head h; /* used by the generator */ }; struct dn_link { }; struct ip_fw_args { }; struct mbuf { struct { int len; } m_pkthdr; struct mbuf *m_nextpkt; int flow_id; /* for testing, index of a flow */ //int flowset_id; /* for testing, index of a flowset */ void *cfg; /* config args */ }; #define MALLOC_DECLARE(x) #define KASSERT(x, y) do { if (!(x)) printf y ; exit(0); } while (0) struct ipfw_flow_id { }; typedef void * module_t; struct _md_t { const char *name; int (*f)(module_t, int, void *); void *p; }; typedef struct _md_t moduledata_t; #define DECLARE_MODULE(name, b, c, d) \ moduledata_t *_g_##name = & b #define MODULE_DEPEND(a, b, c, d, e) #ifdef IPFW #include #include #include #else struct dn_queue { struct dn_fsk *fs; /* parent flowset. */ struct dn_sch_inst *_si; /* parent sched instance. */ }; struct dn_schk { }; struct dn_fsk { struct dn_fs fs; struct dn_schk *sched; }; struct dn_sch_inst { struct dn_schk *sched; }; struct dn_alg { int type; const char *name; void *enqueue, *dequeue; int q_datalen, si_datalen, schk_datalen; int (*config)(struct dn_schk *); int (*new_sched)(struct dn_sch_inst *); int (*new_fsk)(struct dn_fsk *); int (*new_queue)(struct dn_queue *q); }; #endif #ifndef __FreeBSD__ int fls(int); #endif static inline void mq_append(struct mq *q, struct mbuf *m) { if (q->head == NULL) q->head = m; else q->tail->m_nextpkt = m; q->tail = m; m->m_nextpkt = NULL; } #endif /* _DN_TEST_H */ ================================================ FILE: test/dynrules.sh ================================================ #!/bin/sh # # 20100507 marta, quick test for dyn rules # ./ipfw/ipfw -d show |grep \ 80 IPFW_MOD=dummynet2/ipfw_mod.ko IPFW=ipfw/ipfw # main # remove any previous loaded module /sbin/rmmod ipfw_mod /sbin/insmod ${IPFW_MOD} echo "25" > /sys/module/ipfw_mod/parameters/dyn_ack_lifetime ${IPFW} add 1 check-state ${IPFW} add 9 allow all from any to any keep-state ${IPFW} add 10 allow all from any to onelab1.iet.unipi.it keep-state telnet 72.14.234.104 80 ================================================ FILE: test/interpolation.c ================================================ #include #include #include /* gcc interpolation.c -o interpolation */ void err(int eval, const char *fmt, ...) { } void errx(int eval, const char *fmt, ...) { } #define ED_MAX_SAMPLES_NO 1000 #define ED_MAX_LINE_LEN 128 #define EX_DATAERR 1 #define EX_UNAVAILABLE 3 #define ED_TOK_DELAY "delay" #define ED_TOK_PROB "prob" #define ED_SEPARATORS " \t\n" #define ED_TOK_PROFILE_NO "profile_no" struct point { double prob; /* y */ double delay; /* x */ }; struct profile { char filename[128]; /* profile filename */ int samples[ED_MAX_SAMPLES_NO+1]; /* may be shorter */ int samples_no; /* actual len of samples[] */ }; /* * returns 1 if s is a non-negative number, with at least one '.' */ static int is_valid_number(const char *s) { #if 0 int i, dots_found = 0; int len = strlen(s); for (i = 0; i 1)) return 0; #endif return 1; } static int compare_points(const void *vp1, const void *vp2) { const struct point *p1 = vp1; const struct point *p2 = vp2; double res = 0; res = p1->prob - p2->prob; if (res == 0) res = p1->delay - p2->delay; if (res < 0) return -1; else if (res > 0) return 1; else return 0; } #define ED_EFMT(s) 1,"error in %s at line %d: "#s,filename,lineno /* * The points defined by the user are stored in the ponts structure. * The number of user defined points is stored in points_no. * We assume that The last point for the '1' value of the * probability should be defined. (XXX add checks for this) * The user defined sampling value is stored in samples_no. * The resulting samples are in the "samples" pointer. */ static void interpolate_samples(struct point *p, int points_no, int *samples, int samples_no, const char *filename) { double dy; /* delta on the y axis */ double y; /* current value of y */ double x; /* current value of x */ double m; /* the y slope */ int i; /* samples index */ int curr; /* points current index */ dy = 1.0/samples_no; y = 0; for (i=0, curr = 0; i < samples_no; i++, y+=dy) { /* This statment move the curr pointer to the next point * skipping the points with the same x value. We are * guaranteed to exit from the loop because the * last possible value of y is stricly less than 1 * and the last possible value of the y points is 1 */ while ( y >= p[curr+1].prob ) curr++; /* compute the slope of the curve */ m = (p[curr+1].delay - p[curr].delay) / (p[curr+1].prob - p[curr].prob); /* compute the x value starting from the current point */ x = p[curr].delay + (y - p[curr].prob) * m; samples[i] = x; } /* add the last sample */ samples[i] = p[curr+1].delay; } #if 0 static void interpolate_samples_old(struct point *points, int points_no, int *samples, int samples_no, const char *filename) { int i; /* pointer to the sampled array */ int j = 0; /* pointer to user defined samples */ double dy; /* delta y */ double y; /* current value of y */ int x; /* computed value of x */ double m; /* slope of the line */ double y1, x1, y2, x2; /* two points of the current line */ /* make sure that there are enough points. */ /* XXX Duplicated shoule be removed */ if (points_no < 3) errx(EX_DATAERR, "%s too few samples, need at least %d", filename, 3); qsort(points, points_no, sizeof(struct point), compare_points); samples_no--; dy = 1.0/samples_no; printf("\nsamples no is %d dy is %f ", samples_no, dy); /* start with the first two points */ y1 = points[j].prob * samples_no; x1 = points[j].delay; j++; y2 = points[j].prob * samples_no; x2 = points[j].delay; m = (y2-y1)/(x2-x1); printf("\nStart"); printf("\n\tCurrent points x1 y1 %f %f next point x2y2 %f %f m %f\n", x1, y1, x2, y2, m); y = 0; x = x1; for(i=0; i < samples_no+1; i++, y+=dy) { printf("\ni:%d j:%d y:%f real y:%f", i, j, y, y*samples_no); if ( (y*samples_no) >= y2 ) { /* move to the next point */ j++; if ( j >= points_no ) { printf("\n\tNo more points, exit with j: %d i: %d and y:%f %f\n", j, i, y, (y*samples_no)); break; /* no more user defined points */ } /* load a new point */ y1 = y2; x1 = x2; y2 = points[j].prob * samples_no; x2 = points[j].delay; m = (y2-y1)/(x2-x1); if (x1==x2) { /* m = infinito */ m = -1; x = x2; } /* very small m problem */ printf ("\ndelta %f\n", (y1 - y2)); if (abs(y1 - y2) < 0.00001) { /* m = 0 XXX Should this magic number depend on samples_no ? */ m = 0; x = x2; } printf("\n\tCurrent points x1 y1 %f %f next point x2y2 %f %f (%f/%f)=m \n", x1, y1, x2, y2, (y2-y1), (x2-x1), m); } printf("\n\tcompute step y %f x[%d]=%d ", y, i, x); if ((m != -1) && ( m != 0 )) { x = x + (dy * samples_no)/m; } samples[i] = x; printf(" dy %f x new %d\n", dy*samples_no, x); printf(" m %f (dy * samples_no)/m %f \n", m, (dy * samples_no)/m); } x = samples[i-1]; printf("Finish i is %d samples_no is %d\n", i, samples_no); /* The last point has a probability less than 1 */ for (; i <= samples_no; i++) samples[i] = x; } #endif static void load_profile(struct profile *p) { FILE *f; /* file handler */ char line[ED_MAX_LINE_LEN]; int lineno = 0; int do_points = 0; int delay_first = -1; int i; struct point points[1000]; /* MAX_POINTS_NO */ int points_no = 0; char *filename = p->filename; f = fopen(filename, "r"); if (f == NULL) { err(EX_UNAVAILABLE, "fopen: %s", filename); } while (fgets(line, ED_MAX_LINE_LEN, f)) { /* read commands */ char *s, *cur = line, *name = NULL, *arg = NULL; ++lineno; /* parse the line */ while (cur) { s = strsep(&cur, ED_SEPARATORS); if (s == NULL || *s == '#') break; if (*s == '\0') continue; if (arg) errx(ED_EFMT("too many arguments")); if (name == NULL) name = s; else arg = s; } if (name == NULL) continue; if (!strcasecmp(name, ED_TOK_DELAY)) { if (do_points) errx(ED_EFMT("duplicated token: %s"), name); delay_first = 1; do_points = 1; continue; } else if (!strcasecmp(name, ED_TOK_PROB)) { if (do_points) errx(ED_EFMT("duplicated token: %s"), name); delay_first = 0; do_points = 1; continue; } if (!strcasecmp(name, ED_TOK_PROFILE_NO)) { int p_no = atof(arg); if (p_no <= 0) { p_no = 100; printf("invalid interpolation samples, using %d\n", p_no); } if (p_no > ED_MAX_SAMPLES_NO) { p_no = ED_MAX_SAMPLES_NO; printf("invalid interpolation samples, using %d\n", p_no); } p->samples_no = p_no; continue; } else if (do_points) { if (!is_valid_number(name) || !is_valid_number(arg)) errx(ED_EFMT("invalid point found")); if (delay_first) { points[points_no].delay = atof(name); points[points_no].prob = atof(arg); } else { points[points_no].delay = atof(arg); points[points_no].prob = atof(name); } if (points[points_no].prob > 1.0) errx(ED_EFMT("probability greater than 1.0")); ++points_no; /* XXX no more that 1000 */ continue; } else { errx(ED_EFMT("unrecognised command '%s'"), name); } } for(i=0; i < p->samples_no; i++) { p->samples[i] = 666; } /* This code assume the user define a value of X for the sampling value, * and that: * - the value stored in the emulator structure is X; * - the allocated structure for the samples is X+1; */ interpolate_samples(points, points_no, p->samples, p->samples_no, filename); // User defined samples printf("\nLoaded %d points:\n", points_no); for(i=0; i < points_no; i++) { printf("%f %f\n", points[i].prob, points[i].delay); } printf("\n"); printf("The sample value is %d \n", p->samples_no); } int main(int argc, char **argv) { if (argc < 2) { printf("Usage: ./interpolation \n"); return -1; } char *filename; filename = argv[1]; struct profile p; int i; strncpy(p.filename, filename, 128); load_profile(&p); printf("-----------\n"); for (i=0; i<=p.samples_no; i++) printf("%d %d\n", i, p.samples[i]); printf("-----------\n"); return 0; } ================================================ FILE: test/main.c ================================================ /* * $Id: main.c 5626 2010-03-04 21:55:22Z luigi $ * * Testing program for schedulers * * The framework include a simple controller which, at each * iteration, decides whether we can enqueue and/or dequeue. * Then the mainloop runs the required number of tests, * keeping track of statistics. */ #include "dn_test.h" struct q_list { struct list_head h; }; struct cfg_s { int ac; char * const *av; const char *name; int loops; struct timeval time; /* running counters */ uint32_t _enqueue; uint32_t drop; uint32_t pending; uint32_t dequeue; /* generator parameters */ int th_min, th_max; int maxburst; int lmin, lmax; /* packet len */ int flows; /* number of flows */ int flowsets; /* number of flowsets */ int wsum; /* sum of weights of all flows */ int max_y; /* max random number in the generation */ int cur_y, cur_fs; /* used in generation, between 0 and max_y - 1 */ const char *fs_config; /* flowset config */ int can_dequeue; int burst; /* count of packets sent in a burst */ struct mbuf *tosend; /* packet to send -- also flag to enqueue */ struct mbuf *freelist; struct mbuf *head, *tail; /* a simple tailq */ /* scheduler hooks */ int (*enq)(struct dn_sch_inst *, struct dn_queue *, struct mbuf *); struct mbuf * (*deq)(struct dn_sch_inst *); /* size of the three fields including sched-specific areas */ int schk_len; int q_len; /* size of a queue including sched-fields */ int si_len; /* size of a sch_inst including sched-fields */ char *q; /* array of flow queues */ /* use a char* because size is variable */ struct dn_fsk *fs; /* array of flowsets */ struct dn_sch_inst *si; struct dn_schk *sched; /* generator state */ int state; /* 0 = going up, 1: going down */ /* * We keep lists for each backlog level, and always serve * the one with shortest backlog. llmask contains a bitmap * of lists, and ll are the heads of the lists. The last * entry (BACKLOG) contains all entries considered 'full' * XXX to optimize things, entry i could contain queues with * 2^{i-1}+1 .. 2^i entries. */ #define BACKLOG 30 uint32_t llmask; struct list_head ll[BACKLOG + 10]; }; /* FI2Q and Q2FI converts from flow_id to dn_queue and back. * We cannot easily use pointer arithmetic because it is variable size. */ #define FI2Q(c, i) ((struct dn_queue *)((c)->q + (c)->q_len * (i))) #define Q2FI(c, q) (((char *)(q) - (c)->q)/(c)->q_len) int debug = 0; struct dn_parms dn_cfg; static void controller(struct cfg_s *c); /* release a packet: put the mbuf in the freelist, and the queue in * the bucket. */ int drop(struct cfg_s *c, struct mbuf *m) { struct dn_queue *q; int i; c->drop++; q = FI2Q(c, m->flow_id); i = q->ni.length; // XXX or ffs... ND("q %p id %d current length %d", q, m->flow_id, i); if (i < BACKLOG) { struct list_head *h = &q->ni.h; c->llmask &= ~(1<<(i+1)); c->llmask |= (1<<(i)); list_del(h); list_add_tail(h, &c->ll[i]); } m->m_nextpkt = c->freelist; c->freelist = m; return 0; } /* dequeue returns NON-NULL when a packet is dropped */ static int enqueue(struct cfg_s *c, void *_m) { struct mbuf *m = _m; if (c->enq) return c->enq(c->si, FI2Q(c, m->flow_id), m); if (c->head == NULL) c->head = m; else c->tail->m_nextpkt = m; c->tail = m; return 0; /* default - success */ } /* dequeue returns NON-NULL when a packet is available */ static void * dequeue(struct cfg_s *c) { struct mbuf *m; if (c->deq) return c->deq(c->si); if ((m = c->head)) { m = c->head; c->head = m->m_nextpkt; m->m_nextpkt = NULL; } return m; } static int mainloop(struct cfg_s *c) { int i; struct mbuf *m; for (i=0; i < c->loops; i++) { /* implement histeresis */ controller(c); DX(3, "loop %d enq %d send %p rx %d", i, c->_enqueue, c->tosend, c->can_dequeue); if ( (m = c->tosend) ) { c->_enqueue++; if (enqueue(c, m)) { drop(c, m); ND("loop %d enqueue fail", i ); } else { ND("enqueue ok"); c->pending++; } } if (c->can_dequeue) { c->dequeue++; if ((m = dequeue(c))) { c->pending--; drop(c, m); c->drop--; /* compensate */ } } } DX(1, "mainloop ends %d", i); return 0; } int dump(struct cfg_s *c) { int i; struct dn_queue *q; for (i=0; i < c->flows; i++) { q = FI2Q(c, i); DX(1, "queue %4d tot %10lld", i, q->ni.tot_bytes); } DX(1, "done %d loops\n", c->loops); return 0; } /* interpret a number in human form */ static long getnum(const char *s, char **next, const char *key) { char *end = NULL; long l; if (next) /* default */ *next = NULL; if (s && *s) { DX(3, "token is <%s> %s", s, key ? key : "-"); l = strtol(s, &end, 0); } else { DX(3, "empty string"); l = -1; } if (l < 0) { DX(2, "invalid %s for %s", s ? s : "NULL", (key ? key : "") ); return 0; // invalid } if (!end || !*end) return l; if (*end == 'n') l = -l; /* multiply by n */ else if (*end == 'K') l = l*1000; else if (*end == 'M') l = l*1000000; else if (*end == 'k') l = l*1024; else if (*end == 'm') l = l*1024*1024; else if (*end == 'w') ; else {/* not recognized */ D("suffix %s for %s, next %p", end, key, next); end--; } end++; DX(3, "suffix now %s for %s, next %p", end, key, next); if (next && *end) { DX(3, "setting next to %s for %s", end, key); *next = end; } return l; } /* * flowsets are a comma-separated list of * weight:maxlen:flows * indicating how many flows are hooked to that fs. * Both weight and range can be min-max-steps. * In a first pass we just count the number of flowsets and flows, * in a second pass we complete the setup. */ static void parse_flowsets(struct cfg_s *c, const char *fs, int pass) { char *s, *cur, *next; int n_flows = 0, n_fs = 0, wsum = 0; int i, j; struct dn_fs *prev = NULL; DX(3, "--- pass %d flows %d flowsets %d", pass, c->flows, c->flowsets); if (pass == 0) c->fs_config = fs; s = c->fs_config ? strdup(c->fs_config) : NULL; if (s == NULL) { if (pass == 0) D("no fsconfig"); return; } for (next = s; (cur = strsep(&next, ","));) { char *p = NULL; int w, w_h, w_steps, wi; int len, len_h, l_steps, li; int flows; w = getnum(strsep(&cur, ":"), &p, "weight"); if (w <= 0) w = 1; w_h = p ? getnum(p+1, &p, "weight_max") : w; w_steps = p ? getnum(p+1, &p, "w_steps") : (w_h == w ?1:2); len = getnum(strsep(&cur, ":"), &p, "len"); if (len <= 0) len = 1000; len_h = p ? getnum(p+1, &p, "len_max") : len; l_steps = p ? getnum(p+1, &p, "l_steps") : (len_h == len ? 1 : 2); flows = getnum(strsep(&cur, ":"), NULL, "flows"); if (flows == 0) flows = 1; DX(4, "weight %d..%d (%d) len %d..%d (%d) flows %d", w, w_h, w_steps, len, len_h, l_steps, flows); if (w == 0 || w_h < w || len == 0 || len_h < len || flows == 0) { DX(4,"wrong parameters %s", fs); return; } n_flows += flows * w_steps * l_steps; for (i = 0; i < w_steps; i++) { wi = w + ((w_h - w)* i)/(w_steps == 1 ? 1 : (w_steps-1)); for (j = 0; j < l_steps; j++, n_fs++) { struct dn_fs *fs = &c->fs[n_fs].fs; // tentative int x; li = len + ((len_h - len)* j)/(l_steps == 1 ? 1 : (l_steps-1)); x = (wi*2048)/li; DX(3, "----- fs %4d weight %4d lmax %4d X %4d flows %d", n_fs, wi, li, x, flows); if (pass == 0) continue; if (c->fs == NULL || c->flowsets <= n_fs) { D("error in number of flowsets"); return; } wsum += wi * flows; fs->par[0] = wi; fs->par[1] = li; fs->index = n_fs; fs->n_flows = flows; fs->cur = fs->first_flow = prev==NULL ? 0 : prev->next_flow; fs->next_flow = fs->first_flow + fs->n_flows; fs->y = x * flows; fs->base_y = (prev == NULL) ? 0 : prev->next_y; fs->next_y = fs->base_y + fs->y; prev = fs; } } } c->max_y = prev ? prev->base_y + prev->y : 0; c->flows = n_flows; c->flowsets = n_fs; c->wsum = wsum; if (pass == 0) return; /* now link all flows to their parent flowsets */ DX(1,"%d flows on %d flowsets max_y %d", c->flows, c->flowsets, c->max_y); for (i=0; i < c->flowsets; i++) { struct dn_fs *fs = &c->fs[i].fs; DX(1, "fs %3d w %5d l %4d flow %5d .. %5d y %6d .. %6d", i, fs->par[0], fs->par[1], fs->first_flow, fs->next_flow, fs->base_y, fs->next_y); for (j = fs->first_flow; j < fs->next_flow; j++) { struct dn_queue *q = FI2Q(c, j); q->fs = &c->fs[i]; } } } static int init(struct cfg_s *c) { int i; int ac = c->ac; char * const *av = c->av; c->si_len = sizeof(struct dn_sch_inst); c->q_len = sizeof(struct dn_queue); moduledata_t *mod = NULL; struct dn_alg *p = NULL; c->th_min = 0; c->th_max = -20;/* 20 packets per flow */ c->lmin = c->lmax = 1280; /* packet len */ c->flows = 1; c->flowsets = 1; c->name = "null"; ac--; av++; while (ac > 1) { if (!strcmp(*av, "-n")) { c->loops = getnum(av[1], NULL, av[0]); } else if (!strcmp(*av, "-d")) { debug = atoi(av[1]); } else if (!strcmp(*av, "-alg")) { extern moduledata_t *_g_dn_fifo; extern moduledata_t *_g_dn_wf2qp; extern moduledata_t *_g_dn_rr; extern moduledata_t *_g_dn_qfq; #ifdef WITH_KPS extern moduledata_t *_g_dn_kps; #endif if (!strcmp(av[1], "rr")) mod = _g_dn_rr; else if (!strcmp(av[1], "wf2qp")) mod = _g_dn_wf2qp; else if (!strcmp(av[1], "fifo")) mod = _g_dn_fifo; else if (!strcmp(av[1], "qfq")) mod = _g_dn_qfq; #ifdef WITH_KPS else if (!strcmp(av[1], "kps")) mod = _g_dn_kps; #endif else mod = NULL; c->name = mod ? mod->name : "NULL"; DX(3, "using scheduler %s", c->name); } else if (!strcmp(*av, "-len")) { c->lmin = getnum(av[1], NULL, av[0]); c->lmax = c->lmin; DX(3, "setting max to %d", c->th_max); } else if (!strcmp(*av, "-burst")) { c->maxburst = getnum(av[1], NULL, av[0]); DX(3, "setting max to %d", c->th_max); } else if (!strcmp(*av, "-qmax")) { c->th_max = getnum(av[1], NULL, av[0]); DX(3, "setting max to %d", c->th_max); } else if (!strcmp(*av, "-qmin")) { c->th_min = getnum(av[1], NULL, av[0]); DX(3, "setting min to %d", c->th_min); } else if (!strcmp(*av, "-flows")) { c->flows = getnum(av[1], NULL, av[0]); DX(3, "setting flows to %d", c->flows); } else if (!strcmp(*av, "-flowsets")) { parse_flowsets(c, av[1], 0); DX(3, "setting flowsets to %d", c->flowsets); } else { D("option %s not recognised, ignore", *av); } ac -= 2; av += 2; } if (c->maxburst <= 0) c->maxburst = 1; if (c->loops <= 0) c->loops = 1; if (c->flows <= 0) c->flows = 1; if (c->flowsets <= 0) c->flowsets = 1; if (c->lmin <= 0) c->lmin = 1; if (c->lmax <= 0) c->lmax = 1; /* multiply by N */ if (c->th_min < 0) c->th_min = c->flows * -c->th_min; if (c->th_max < 0) c->th_max = c->flows * -c->th_max; if (c->th_max <= c->th_min) c->th_max = c->th_min + 1; if (mod) { p = mod->p; DX(3, "using module %s f %p p %p", mod->name, mod->f, mod->p); DX(3, "modname %s ty %d", p->name, p->type); c->enq = p->enqueue; c->deq = p->dequeue; c->si_len += p->si_datalen; c->q_len += p->q_datalen; c->schk_len += p->schk_datalen; } /* allocate queues, flowsets and one scheduler */ c->q = calloc(c->flows, c->q_len); c->fs = calloc(c->flowsets, sizeof(struct dn_fsk)); c->si = calloc(1, c->si_len); c->sched = calloc(c->flows, c->schk_len); if (c->q == NULL || c->fs == NULL) { D("error allocating memory for flows"); exit(1); } c->si->sched = c->sched; if (p) { if (p->config) p->config(c->sched); if (p->new_sched) p->new_sched(c->si); } /* parse_flowsets links queues to their flowsets */ parse_flowsets(c, av[1], 1); /* complete the work calling new_fsk */ for (i = 0; i < c->flowsets; i++) { if (c->fs[i].fs.par[1] == 0) c->fs[i].fs.par[1] = 1000; /* default pkt len */ c->fs[i].sched = c->sched; if (p && p->new_fsk) p->new_fsk(&c->fs[i]); } /* initialize the lists for the generator, and put * all flows in the list for backlog = 0 */ for (i=0; i <= BACKLOG+5; i++) INIT_LIST_HEAD(&c->ll[i]); for (i = 0; i < c->flows; i++) { struct dn_queue *q = FI2Q(c, i); if (q->fs == NULL) q->fs = &c->fs[0]; /* XXX */ q->_si = c->si; if (p && p->new_queue) p->new_queue(q); INIT_LIST_HEAD(&q->ni.h); list_add_tail(&q->ni.h, &c->ll[0]); } c->llmask = 1; return 0; } int main(int ac, char *av[]) { struct cfg_s c; struct timeval end; double ll; int i; char msg[40]; bzero(&c, sizeof(c)); c.ac = ac; c.av = av; init(&c); gettimeofday(&c.time, NULL); mainloop(&c); gettimeofday(&end, NULL); end.tv_sec -= c.time.tv_sec; end.tv_usec -= c.time.tv_usec; if (end.tv_usec < 0) { end.tv_usec += 1000000; end.tv_sec--; } c.time = end; ll = end.tv_sec*1000000 + end.tv_usec; ll *= 1000; /* convert to nanoseconds */ ll /= c._enqueue; sprintf(msg, "1::%d", c.flows); D("%-8s n %d %d time %d.%06d %8.3f qlen %d %d flows %s drops %d", c.name, c._enqueue, c.loops, (int)c.time.tv_sec, (int)c.time.tv_usec, ll, c.th_min, c.th_max, c.fs_config ? c.fs_config : msg, c.drop); dump(&c); DX(1, "done ac %d av %p", ac, av); for (i=0; i < ac; i++) DX(1, "arg %d %s", i, av[i]); return 0; } /* * The controller decides whether in this iteration we should send * (the packet is in c->tosend) and/or receive (flag c->can_dequeue) */ static void controller(struct cfg_s *c) { struct mbuf *m; struct dn_fs *fs; int flow_id; /* histeresis between max and min */ if (c->state == 0 && c->pending >= c->th_max) c->state = 1; else if (c->state == 1 && c->pending <= c->th_min) c->state = 0; ND(1, "state %d pending %2d", c->state, c->pending); c->can_dequeue = c->state; c->tosend = NULL; if (c->state) return; if (1) { int i; struct dn_queue *q; struct list_head *h; i = ffs(c->llmask) - 1; if (i < 0) { DX(2, "no candidate"); c->can_dequeue = 1; return; } h = &c->ll[i]; ND(1, "backlog %d p %p prev %p next %p", i, h, h->prev, h->next); q = list_first_entry(h, struct dn_queue, ni.h); list_del(&q->ni.h); flow_id = Q2FI(c, q); DX(2, "extracted flow %p %d backlog %d", q, flow_id, i); if (list_empty(h)) { ND(2, "backlog %d empty", i); c->llmask &= ~(1<ni.h, h+1); ND(1, " after %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next); if (i < BACKLOG) { ND(2, "backlog %d full", i+1); c->llmask |= 1<<(1+i); } fs = &q->fs->fs; c->cur_fs = q->fs - c->fs; fs->cur = flow_id; } else { /* XXX this does not work ? */ /* now decide whom to send the packet, and the length */ /* lookup in the flow table */ if (c->cur_y >= c->max_y) { /* handle wraparound */ c->cur_y = 0; c->cur_fs = 0; } fs = &c->fs[c->cur_fs].fs; flow_id = fs->cur++; if (fs->cur >= fs->next_flow) fs->cur = fs->first_flow; c->cur_y++; if (c->cur_y >= fs->next_y) c->cur_fs++; } /* construct a packet */ if (c->freelist) { m = c->tosend = c->freelist; c->freelist = c->freelist->m_nextpkt; } else { m = c->tosend = calloc(1, sizeof(struct mbuf)); } if (m == NULL) return; m->cfg = c; m->m_nextpkt = NULL; m->m_pkthdr.len = fs->par[1]; // XXX maxlen m->flow_id = flow_id; ND(2,"y %6d flow %5d fs %3d weight %4d len %4d", c->cur_y, m->flow_id, c->cur_fs, fs->par[0], m->m_pkthdr.len); } /* Packet allocation: to achieve a distribution that matches weights, for each X=w/lmax class we should generate a number of packets proportional to Y = X times the number of flows in the class. So we construct an array with the cumulative distribution of Y's, and use it to identify the flow via inverse mapping (if the Y's are not too many we can use an array for the lookup). In practice, each flow will have X entries [virtually] pointing to it. */ ================================================ FILE: test/memory_leak.sh ================================================ #!/bin/sh # this script execute N times the command CMD # collecting the memory usage on a file. # The value of the Dirty memory should not increase # between tests. BASE_NAME=ipfw_r5808_ N=10000 CMD1="/sbin/insmod ../dummynet2/ipfw_mod.ko" CMD2="/sbin/rmmod ipfw_mod" # main # remove any previous loaded module /sbin/rmmod ipfw_mod # pre for n in `seq $N`; do $CMD1 $CMD2 [ $n = 10 ] && cat /proc/meminfo > /tmp/${BASE_NAME}_${n} [ $n = 100 ] && cat /proc/meminfo > /tmp/${BASE_NAME}_${n} [ $n = 1000 ] && cat /proc/meminfo > /tmp/${BASE_NAME}_${n} done; # post ================================================ FILE: test/mylist.h ================================================ /* * $Id: mylist.h 5626 2010-03-04 21:55:22Z luigi $ * * linux-like bidirectional lists */ #ifndef _MYLIST_H #define _MYLIST_H struct list_head { struct list_head *prev, *next; }; #define INIT_LIST_HEAD(l) do { (l)->prev = (l)->next = (l); } while (0) #define list_empty(l) ( (l)->next == l ) static inline void __list_add(struct list_head *o, struct list_head *prev, struct list_head *next) { next->prev = o; o->next = next; o->prev = prev; prev->next = o; } static inline void list_add_tail(struct list_head *o, struct list_head *head) { __list_add(o, head->prev, head); } #define list_first_entry(pL, ty, member) \ (ty *)((char *)((pL)->next) - offsetof(ty, member)) static inline void __list_del(struct list_head *prev, struct list_head *next) { next->prev = prev; prev->next = next; } static inline void list_del(struct list_head *entry) { ND("called on %p", entry); __list_del(entry->prev, entry->next); entry->next = entry->prev = NULL; } #endif /* _MYLIST_H */ ================================================ FILE: test/profile_bench1 ================================================ profile_no 100 delay prob 207 0.000264 255 0.034117 270 0.072280 279 0.106749 288 0.148604 298 0.184304 302 0.202194 353 0.384541 423 0.588842 510 0.782126 516 0.800970 545 0.845706 553 0.861411 573 0.889430 586 0.912117 620 0.920003 661 0.938308 695 0.944191 740 0.949112 765 0.952598 848 0.957109 1379 0.983768 1555 0.983778 1649 1 ================================================ FILE: test/profile_bench2 ================================================ samples 10 delay prob 0 0 250 0 250 0.5 500 0.5 500 1 ================================================ FILE: test/profile_bench3 ================================================ profile_no 100 delay prob 0 0 50 0.5 100 1 ================================================ FILE: test/test_dn_heap.c ================================================ /*- * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Userland code for testing binary heaps and hash tables * * $Id: test_dn_heap.c 6131 2010-04-22 15:37:36Z svn_panicucci $ */ #include #include #include #include #include #include "dn_test.h" #include "dn_heap.h" #define log(x, arg...) fprintf(stderr, ## arg) #define panic(x...) fprintf(stderr, ## x), exit(1) #include struct x { struct x *ht_link; char buf[0]; }; uint32_t hf(uintptr_t key, int flags, void *arg) { return (flags & DNHT_KEY_IS_OBJ) ? ((struct x *)key)->buf[0] : *(char *)key; } int matchf(void *obj, uintptr_t key, int flags, void *arg) { char *s = (flags & DNHT_KEY_IS_OBJ) ? ((struct x *)key)->buf : (char *)key; return (strcmp(((struct x *)obj)->buf, s) == 0); } void *newfn(uintptr_t key, int flags, void *arg) { char *s = (char *)key; struct x *p = malloc(sizeof(*p) + 1 + strlen(s)); if (p) strcpy(p->buf, s); return p; } char *strings[] = { "undici", "unico", "doppio", "devoto", "uno", "due", "tre", "quattro", "cinque", "sei", "uno", "due", "tre", "quattro", "cinque", "sei", NULL, }; int doprint(void *_x, void *arg) { struct x *x = _x; printf("found element <%s>\n", x->buf); return (int)arg; } static void test_hash() { char **p; struct dn_ht *h; uintptr_t x = 0; uintptr_t x1 = 0; /* first, find and allocate */ h = dn_ht_init(NULL, 10, 0, hf, matchf, newfn); for (p = strings; *p; p++) { dn_ht_find(h, (uintptr_t)*p, DNHT_INSERT, NULL); } dn_ht_scan(h, doprint, 0); printf("/* second -- find without allocate */\n"); h = dn_ht_init(NULL, 10, 0, hf, matchf, NULL); for (p = strings; *p; p++) { void **y = newfn((uintptr_t)*p, 0, NULL); if (x == 0) x = (uintptr_t)y; else { if (x1 == 0) x1 = (uintptr_t)*p; } dn_ht_find(h, (uintptr_t)y, DNHT_INSERT | DNHT_KEY_IS_OBJ, NULL); } dn_ht_scan(h, doprint, 0); printf("remove %p gives %p\n", (void *)x, dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL)); printf("remove %p gives %p\n", (void *)x, dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL)); printf("remove %p gives %p\n", (void *)x, dn_ht_find(h, x1, DNHT_REMOVE, NULL)); printf("remove %p gives %p\n", (void *)x, dn_ht_find(h, x1, DNHT_REMOVE, NULL)); dn_ht_scan(h, doprint, 0); } int main(int argc, char *argv[]) { struct dn_heap h; int i, n, n2, n3; test_hash(); return 0; /* n = elements, n2 = cycles */ n = (argc > 1) ? atoi(argv[1]) : 0; if (n <= 0 || n > 1000000) n = 100; n2 = (argc > 2) ? atoi(argv[2]) : 0; if (n2 <= 0) n = 1000000; n3 = (argc > 3) ? atoi(argv[3]) : 0; bzero(&h, sizeof(h)); heap_init(&h, n, -1); while (n2-- > 0) { uint64_t prevk = 0; for (i=0; i < n; i++) heap_insert(&h, n3 ? n-i: random(), (void *)(100+i)); for (i=0; h.elements > 0; i++) { uint64_t k = h.p[0].key; if (k < prevk) panic("wrong sequence\n"); prevk = k; if (0) printf("%d key %llu, val %p\n", i, h.p[0].key, h.p[0].object); heap_extract(&h, NULL); } } return 0; } ================================================ FILE: test/test_dn_sched.c ================================================ /* * $Id: test_dn_sched.c 5626 2010-03-04 21:55:22Z luigi $ * * library functions for userland testing of dummynet schedulers */ #include "dn_test.h" void m_freem(struct mbuf *m) { printf("free %p\n", m); } int dn_sched_modevent(module_t mod, int cmd, void *arg) { return 0; } void dn_free_pkts(struct mbuf *m) { struct mbuf *x; while ( (x = m) ) { m = m->m_nextpkt; m_freem(x); } } int dn_delete_queue(void *_q, void *do_free) { struct dn_queue *q = _q; if (q->mq.head) dn_free_pkts(q->mq.head); free(q); return 0; } /* * This is a simplified function for testing purposes, which does * not implement statistics or random loss. * Enqueue a packet in q, subject to space and queue management policy * (whose parameters are in q->fs). * Update stats for the queue and the scheduler. * Return 0 on success, 1 on drop. The packet is consumed anyways. */ int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) { if (drop) goto drop; if (q->ni.length >= 200) goto drop; mq_append(&q->mq, m); q->ni.length++; q->ni.tot_bytes += m->m_pkthdr.len; return 0; drop: q->ni.drops++; return 1; } int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg) { if (*v < lo) { *v = dflt; } else if (*v > hi) { *v = hi; } return *v; } #ifndef __FreeBSD__ int fls(int mask) { int bit; if (mask == 0) return (0); for (bit = 1; mask != 1; bit++) mask = (unsigned int)mask >> 1; return (bit); } #endif