Repository: rjhogan/Adept-2
Branch: master
Commit: d0a7751a0871
Files: 136
Total size: 1.5 MB

Directory structure:
gitextract_yk4ax793/

├── .gitignore
├── .travis.yml
├── AUTHORS
├── COPYING
├── ChangeLog
├── INSTALL
├── Makefile.am
├── NEWS
├── README.md
├── TODO
├── adept/
│   ├── Array.cpp
│   ├── Makefile.am
│   ├── Minimizer.cpp
│   ├── Stack.cpp
│   ├── StackStorageOrig.cpp
│   ├── Storage.cpp
│   ├── cppblas.cpp
│   ├── cpplapack.h
│   ├── index.cpp
│   ├── inv.cpp
│   ├── jacobian.cpp
│   ├── line_search.cpp
│   ├── minimize_conjugate_gradient.cpp
│   ├── minimize_levenberg_marquardt.cpp
│   ├── minimize_limited_memory_bfgs.cpp
│   ├── settings.cpp
│   ├── solve.cpp
│   └── vector_utilities.cpp
├── benchmark/
│   ├── Makefile.am
│   ├── advection_schemes.h
│   ├── advection_schemes_AD.h
│   ├── advection_schemes_K.h
│   ├── animate.cpp
│   ├── autodiff_benchmark.cpp
│   ├── differentiator.h
│   ├── math_benchmark.cpp
│   ├── matrix_benchmark.cpp
│   └── nx.h
├── config_platform_independent.h.in
├── configure.ac
├── doc/
│   ├── COPYING
│   ├── Makefile
│   ├── README
│   ├── adept_documentation.tex
│   └── adept_reference.tex
├── include/
│   ├── Makefile.am
│   ├── Timer.h
│   ├── adept/
│   │   ├── Active.h
│   │   ├── ActiveConstReference.h
│   │   ├── ActiveReference.h
│   │   ├── Allocator.h
│   │   ├── Array.h
│   │   ├── ArrayWrapper.h
│   │   ├── BinaryOperation.h
│   │   ├── Expression.h
│   │   ├── ExpressionSize.h
│   │   ├── FixedArray.h
│   │   ├── GradientIndex.h
│   │   ├── IndexedArray.h
│   │   ├── Minimizer.h
│   │   ├── Optimizable.h
│   │   ├── Packet.h
│   │   ├── RangeIndex.h
│   │   ├── ScratchVector.h
│   │   ├── SpecialMatrix.h
│   │   ├── Stack.h
│   │   ├── StackStorage.h
│   │   ├── StackStorageOrig.h
│   │   ├── StackStorageOrigStl.h
│   │   ├── Statement.h
│   │   ├── Storage.h
│   │   ├── UnaryOperation.h
│   │   ├── array_shortcuts.h
│   │   ├── base.h
│   │   ├── contiguous_matrix.h
│   │   ├── cppblas.h
│   │   ├── eval.h
│   │   ├── exception.h
│   │   ├── interp.h
│   │   ├── inv.h
│   │   ├── matmul.h
│   │   ├── noalias.h
│   │   ├── outer_product.h
│   │   ├── quick_e.h
│   │   ├── reduce.h
│   │   ├── scalar_shortcuts.h
│   │   ├── settings.h
│   │   ├── solve.h
│   │   ├── spread.h
│   │   ├── store_transpose.h
│   │   ├── traits.h
│   │   ├── vector_utilities.h
│   │   └── where.h
│   ├── adept.h
│   ├── adept_arrays.h
│   ├── adept_fortran.h
│   ├── adept_optimize.h
│   └── create_adept_source_header
├── m4/
│   ├── adept.m4
│   ├── ax_blas.m4
│   ├── ax_lapack.m4
│   ├── ltsugar.m4
│   └── lt~obsolete.m4
├── makefile_include.in
└── test/
    ├── Makefile
    ├── README
    ├── algorithm.cpp
    ├── algorithm.h
    ├── algorithm_with_and_without_ad.h
    ├── rosenbrock_banana_function.cpp
    ├── run_tests.sh
    ├── simulate_radiances.cpp
    ├── simulate_radiances.h
    ├── state.cpp
    ├── state.h
    ├── test_adept.cpp
    ├── test_adept_with_and_without_ad.cpp
    ├── test_array_derivatives.cpp
    ├── test_array_speed.cpp
    ├── test_arrays.cpp
    ├── test_checkpoint.cpp
    ├── test_constructors.cpp
    ├── test_derivatives.cpp
    ├── test_fastexp.cpp
    ├── test_fixed_arrays.cpp
    ├── test_gsl_interface.cpp
    ├── test_interp.cpp
    ├── test_minimizer.cpp
    ├── test_misc.cpp
    ├── test_no_lib.cpp
    ├── test_packet_operations.cpp
    ├── test_radiances.cpp
    ├── test_radiances_array.cpp
    ├── test_reduce_active.cpp
    ├── test_thread_safe.cpp
    └── test_thread_safe_arrays.cpp

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
Makefile.in
/aclocal.m4
/config.guess
/config.h.in
/config.log
/config.sub
/config.status
/configure
/depcomp
/install-sh
/ltmain.sh
/missing
/ar-lib
/autom4te.cache
/compile
/libtool
/stamp-*
*.o
*.a
*.so
*.la
*.tar*
doc/adept_*.log
doc/adept_*.toc
doc/adept_*.aux
doc/adept_*.out
.deps
*~
Makefile
!test/Makefile
!doc/Makefile
include/adept_source.h


================================================
FILE: .travis.yml
================================================
language: cpp
os: linux
sudo: required
dist: trusty
compiler:
  - gcc
before_install:
  - sudo apt-get install gfortran -y
  - type gfortran
install: autoreconf -i && ./configure && make -j8 
script: 
  - make check -j8
  - cat test/test_results.txt


================================================
FILE: AUTHORS
================================================
Robin Hogan <r.j.hogan@ecmwf.int>

================================================
FILE: COPYING
================================================

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: ChangeLog
================================================
version 2.1.4 (in progress)
	- Added support for the copysign function
	- Added aArray::set_gradient(Array) function

version 2.1.3 (22 Feb 2024)
	- Added interp2d and interp3d interpolation functions
	- Added option of nearest-neighbour interpolation

version 2.1.2 (3 Oct 2023)
	- Further bug fixes to reduction of active arrays which did not
	have adequate space allocated by check_space, including "product"
	which requires an additional differential operation per element
	- Fixed out-of-bounds access in test_thread_safe_arrays
	- Slight change to reduce_dimension to avoid incorrect warning
	about ExpressionSize array subscript of -1
	- Fixed broken benchmark/autodiff_benchmark to work with ADOL-C
	- Changed COMPILE_FLAGS argument order in test/Makefile in case
	CPPFLAGS contains Timer.h or other conflicting header file
	- Added benchmark/math_benchmark program

version 2.1.1 (10 April 2022)
	- interp function can perform 1D interpolation of higher
	dimensional Y arrays
	- Bug fix in reduction of an "n" dimensional active array to an
	"n-1" dimensional array: check_space had been forgotten
	- Added Newton-Levenberg[-Marquardt] options to test_minimizer,
	which use the exact Hessian of the Rosenbrock banana function

version 2.1 (5 February 2021)
	- Removed README in favour of README.md

version 2.0.9 (28 January 2021)
	- Fix bug in Array::alignment_offset causing occasional
	crashes in reduce and assign operations due to unaligned AVX access,
	now tested in test_packet_operations
	- Added Conjugate-Gradient and L-BFGS minimization methods, both
	bounded and unbounded methods
	- Disabled vectorization on 32-bit ARM NEON targets as there are
	insufficient floating-point intrinsics
	- Fixed interp(x,y,xi) function in case x and y have 0 or 1
	elements

version 2.0.8 (22 August 2020)
	- Added adept_optimize.h header file providing minimization
	capability, initially with the constrained and unconstrained
	Levenberg-Marquardt minimization algorithm
	- Test program test_minimizer tests with the N-dimensional
	Rosenbrock function
	- The Stack member function "jacobian" can now operate on or
	return Adept matrices, rather than solely on raw pointers which
	had to point to data in column-major order
	- Removed "using namespace internal" from several header files so
	that adept namespace is clean
	- Fixed C++98 compatibility

version 2.0.7 (23 June 2020)
	- Added fast, vectorizable exponential function "fastexp", or can
	use as adept::exp if the ADEPT_FAST_EXPONENTIAL preprocessor
	variable is defined
	- Moved all the vector intrinsic stuff to quick_e.h
	- Added ARM-NEON support to quick_e.h
	- Adept is now thread safe on Mac OS versions that support the
	thread_local keyword
	- Fixed bug that caused incorrect differentiation of
	Active<double>/int
	- Preprocessor option ADEPT_INIT_REAL_SNAN and
	ADEPT_INIT_REAL_ZERO initialize real numbers (and complex numbers)
	to signaling NaN or zero, useful for debugging
	- Fixed bug that caused incorrect result of maxval and minval
	applied to active arrays
	- Fixed bug that caused incorrect differentiation of "product"
	function
	- Fixed bug that caused incorrect norm2 for passive vector large
	enough to use vectorization

version 2.0.6 (20 February 2020)
	- Fixed bug in hand-coded adjoint of Toon advection scheme
	(benchmark/advection_schemes_AD.h), as well as other bugs that
	would have prevented the Adjoint and hand-coded adjoints from
	being correct compared to each other
	- Fixed memory leak in Packet.h by ensuring memory is freed in the
	case that neither _POSIX_VERSION nor _MSC_VER are defined
	- Fixed bug in FixedArray.h that prevented active fixed arrays
	from registering themselves with the stack when initialized using
	an initializer list
	- Fixed missing "template" directives in UnaryOperation.h that
	prevented isfinite, isnan and isinf from working correctly on
	arrays
	- Added Array::resize_contiguous functions
	- minval and maxval now work correctly with negative and +/-Inf
	arguments; previously minval gave incorrect results even for
	negative arguments
	- Added adept_fortran.h to provide the ability to exchange arrays
	between C++/Adept and Fortran, for those Fortran compilers that
	support the 2018 standard
	- Added support for AVX512 vectorization: operations on 16 floats
	and 8 doubles at a time
	- Added test_packet_operations to check Intel vector intrinsics
	are correctly implemented

version 2.0.5 (6 February 2018)
	- Use set_array_print_style(x) to set behaviour of <<Array;
	available are x=PRINT_STYLE_[PLAIN|CSV|CURLY|MATLAB]
	- Fix use of _mm_undefined_ps intrinsic: only use on GCC>=4.9.1
	and Clang if appropriate built-in is present; can't guarantee its
	presence with other compilers
	- Fix writing of active scalar expressions to a stream
	- Added missing fmin/fmax(Expr,Scalar)

version 2.0.4 (8 January 2018)
	- Packet.h copes with undefined _mm_undefined_ps in GCC<4.9.1
	- Fix Packet.h in case SSE2 not enabled
	- ADEPT_FAST preprocessor variable enables
	ADEPT_NO_DIMENSION_CHECKING, ADEPT_NO_ALIAS_CHECKING and
	ADEPT_STACK_THREAD_UNSAFE
	- Divide by scalar now only converts to multiply by (1.0/scalar)
	if scalar is of floating-point type; this fixes indexing with
	"end/2"
	- Fix bug in Packet.h (found by valgrind) to ensure new[] followed
	by delete[] and posix_memalign followed by free
	- Increase initial stack size from 1000 to 1024^2
	- Fixed two bugs in IndexedArray.h that broke indexing a matrix
	with Matrix(int,intVector)
	- Allocated memory in non-OpenMP jacobian_forward is now freed

version 2.0.3 (28 October 2017)
	- Replaced template class "cast" with "expr_cast" to avoid clash
	with Expression's non-template member function; this enables
	compilation with Visual C++.
	- Added adept::have_matrix_multiplication() and
	adept::have_linear_algebra() to test for BLAS and LAPACK
	(respectively) at run-time

version 2.0.2 (21 October 2017)
	- Fixed standards-compliance problem with use of Expression in
	Curiously Recurring Template Pattern, by removing any "static
	const" members that referred to the derived class.  This enabled
	the same code to work with g++, clang++ and the Intel compiler icc.

version 2.0.1 (18 October 2017)
	- Basic passive complex arrays work, tested with
	test/test_complex_arrays
	- Added ADEPT_NO_DIMENSION_CHECKING option
	- Vectorized sqrt, unary-, unary+, max and min
	- Removed the option to vectorize with Packet representing a
	*pair* of SSE2/AVX packed vectors; now a Packet can only represent
	a single packed vector. This simplifies maintenance of Packet.h,
	and the pair option offered no performance advantage anyway.
	- Vectorized reduce operations sum, product etc.
	- Many fixes to enable compilation with clang++
	- Fixed FixedArray::operator[] for rank>1

version 2.0 (September 2017)
	- Finalized version for release
	- PDF documentation is no longer installed, so that Git users are
	not obliged to have pdflatex

version 1.9.11 (30 September 2017)
	- Fixed get_gradient member function of Array and FixedArray
	- Added test_array_derivatives test program
	- Fixed indexing of FixedArrays of rank>1
	- Fixed IndexedArray applied to FixedArrays (previously it held a
	reference to a temporary dimension object)
	- Test and benchmarking programs now work with single precision
	- Stack functions accept Index passed by value rather than
	reference, so that "static const int" passed from FixedArray does
	not need to be explicitly instantiated
	- Active::add_derivative_dependence and
	append_derivative_dependence no longer only accept arguments of
	type "Real"
	- ADEPT_STORAGE_THREAD_SAFE option to protect Storage reference
	counter in multi-threaded environment (C++11 only)
	- Added Array::soft_link() as another means to get thread safety
	- Added test program test_thread_safe_arrays
	- Added adept_reference latex file to doc directory
	- Added "dimensions" function for creating ExpressionSize objects

version 1.9.10 (25 September 2017)
	- Added link syntax A >>= B
	- Added assignment and initialization from initializer_lists for
	Array and FixedArray classes
	- Implemented Fortran-like "count" reduction function
	- Bug fix sending active expression to a stream with "<<"
	- Added "spread<dim>(array,n)" to match Fortran spread(array,dim,n)
	- Added outer_product(x,y)
	- Fixed adept_source.h for non-Unix systems
	- Moved mathematical functions from global to adept namespace
	- Fixed pausable recording and added test_adept_active_pausable
	- Removed unsafe ADEPT_COPY_CONSTRUCTOR_ONLY_ON_RETURN_FROM_FUNCTION
	- C++98 and C++11 correctly take cmath functions from :: and std::
	respectively
	- "make check" now runs test script test/run_tests.sh
	- inv and solve now take general expression arguments
	- Enabled indexed arrays to be assigned to an initializer list
	- BLAS now optional (without it matrix multiplication causes
	run-time exception)
	- Added test_derivatives to test quality of derivatives for all
	mathematical functions
	- Enabled SpecialMatrix and IndexedArray to be assigned to an
	active scalar expression
	- Added fmax and fmin functions (even if C++11 not used)
	- Added atan2 support
	- C++11 on non-Mac platforms uses thread_local keyword instead of
	C++98 compiler extensions
	- Matrix multiplication on active special matrices implemented by
	copying them to a dense Array<2,Real,true>. Very inefficient, but
	it works.
	- Matrix multiplication on inactive triangular and "square"
	matrices now works by converting them to a dense Array<2,Real,false>.
	- Added alias detection in IndexedArray
	- Alias detection in IndexedArray and SpecialMatrix can be
	deactivated with ADEPT_NO_ALIAS_CHECKING
	- Added "eval" function to evaluate an expression that might be
	subject to aliasing

version 1.9.9 (August 2017)
	- Put on GitHub as rjhogan/Adept-2
	- Added Expression::next_value_contiguous for faster inner loops
	in the case that all expressions have a contiguous and increasing
	inner dimension
	- Preliminary vectorization via Packet class and
	Expression::next_packet
	- Vectorized forward Jacobian calculation using packets
	- Split Expression.h into Expression.h, UnaryOperation.h and
	BinaryOperation.h
	- Fixed bug in matmul.h that causes failure if matrix in
	matrix-vector multiplication is strided in both dimensions
	- Added move semantics if C++11 enabled

version 1.9.8 (April 2016):
	- Completed FixedArray.h and tested for active arguments
	- Added array_shortcuts for FixedArrays: (a)VectorX, (a)MatrixXX
	- Added array_shortcuts for Arrays: (a)ArrayXD (for X = 3 to 7)
	- interp permits general Expression arguments

version 1.9.7 (April 2016):
	- Nearly completed FixedArray.h

version 1.9.6 (March 2016):
	- Started FixedArray.h

version 1.9.5 (March 2016):
	- Fixed add_derivative_dependence and append_derivative_dependence
	when applied to elements of arrays
	- Added ADEPT_BOUNDS_CHECKING capability, and fixed IndexedArray
	to work with this
	- Now call BLAS and LAPACK (Fortran) routines, rather than C-BLAS
	and LAPACKE functions
	- Added matrix multiplication benchmark program
	- Added IndexedArray for dimensions up to 7
	- Added Array::data() and Array::const_data() for direct access
	- Added Array::subset(); slightly more concise than using "range"

version 1.9.4 (January 2016):
	- Completed changes to documentation in doc directory
	- Added control/inquiry of settings, e.g. set_max_blas_threads()
	and configuration()

version 1.9.3 (December 2015):
	- Added "max" and "min" as binary operators (note that "maxval"
	and "minval" are reduction operators as in Fortran)

version 1.9.2 (December 2015):
	- Added ActiveConstReference type for active constant references

version 1.9.1 (November 2015):
	- New matmul.h/matmul.cpp - not yet complete

version 1.9.0 (November 2015):
	- SUBSTANTIAL REWRITE TO INCORPORATE ARRAY FUNCTIONALITY

version 1.1 (June 2015):
	- Added ./configure script using autotools
	- Added support for additional mathematical functions: asinh,
	acosh, atanh, expm1, log1p, cbrt, erf, erfc, exp2, log2
	- Changed license from GNU General Public License to Apache
	License, Version 2.0
	- Jacobian calculation uses OpenMP parallelization
	- Removed multiscatter example code
	- New benchmarking program in benchmark/ that compares to other
	automatic differentiation tools if available
	- Fixed bug so that gaps in the gradient list now merge properly
	- Provided capability to compile code without an external library,
	to facilitate porting to Windows
	- Added programs in test/ demonstrating checkpointing,
	thread-safety and compiling without an external library

version 1.0 (September 2013):
	- Very many internal changes and added features
	- Detailed documentation in the doc/ directory
	- Removed the LIFO requirement on the order in which aReal
	objects ought to be created and destroyed
	- For users of version 0.9, the main change to the interface is
	that the Stack::start() member function is no longer supported;
	rather you should call the Stack::new_recording() member function
	*after* the independent variables have been initialized but
	*before* any mathematical operations are performed using them

version 0.9:
	- First public release


================================================
FILE: INSTALL
================================================
Installation Instructions
*************************

Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005,
2006, 2007 Free Software Foundation, Inc.

This file is free documentation; the Free Software Foundation gives
unlimited permission to copy, distribute and modify it.

Basic Installation
==================

Briefly, the shell commands `./configure; make; make install' should
configure, build, and install this package.  The following
more-detailed instructions are generic; see the `README' file for
instructions specific to this package.

   The `configure' shell script attempts to guess correct values for
various system-dependent variables used during compilation.  It uses
those values to create a `Makefile' in each directory of the package.
It may also create one or more `.h' files containing system-dependent
definitions.  Finally, it creates a shell script `config.status' that
you can run in the future to recreate the current configuration, and a
file `config.log' containing compiler output (useful mainly for
debugging `configure').

   It can also use an optional file (typically called `config.cache'
and enabled with `--cache-file=config.cache' or simply `-C') that saves
the results of its tests to speed up reconfiguring.  Caching is
disabled by default to prevent problems with accidental use of stale
cache files.

   If you need to do unusual things to compile the package, please try
to figure out how `configure' could check whether to do them, and mail
diffs or instructions to the address given in the `README' so they can
be considered for the next release.  If you are using the cache, and at
some point `config.cache' contains results you don't want to keep, you
may remove or edit it.

   The file `configure.ac' (or `configure.in') is used to create
`configure' by a program called `autoconf'.  You need `configure.ac' if
you want to change it or regenerate `configure' using a newer version
of `autoconf'.

The simplest way to compile this package is:

  1. `cd' to the directory containing the package's source code and type
     `./configure' to configure the package for your system.

     Running `configure' might take a while.  While running, it prints
     some messages telling which features it is checking for.

  2. Type `make' to compile the package.

  3. Optionally, type `make check' to run any self-tests that come with
     the package.

  4. Type `make install' to install the programs and any data files and
     documentation.

  5. You can remove the program binaries and object files from the
     source code directory by typing `make clean'.  To also remove the
     files that `configure' created (so you can compile the package for
     a different kind of computer), type `make distclean'.  There is
     also a `make maintainer-clean' target, but that is intended mainly
     for the package's developers.  If you use it, you may have to get
     all sorts of other programs in order to regenerate files that came
     with the distribution.

  6. Often, you can also type `make uninstall' to remove the installed
     files again.

Compilers and Options
=====================

Some systems require unusual options for compilation or linking that the
`configure' script does not know about.  Run `./configure --help' for
details on some of the pertinent environment variables.

   You can give `configure' initial values for configuration parameters
by setting variables in the command line or in the environment.  Here
is an example:

     ./configure CC=c99 CFLAGS=-g LIBS=-lposix

   *Note Defining Variables::, for more details.

Compiling For Multiple Architectures
====================================

You can compile the package for more than one kind of computer at the
same time, by placing the object files for each architecture in their
own directory.  To do this, you can use GNU `make'.  `cd' to the
directory where you want the object files and executables to go and run
the `configure' script.  `configure' automatically checks for the
source code in the directory that `configure' is in and in `..'.

   With a non-GNU `make', it is safer to compile the package for one
architecture at a time in the source code directory.  After you have
installed the package for one architecture, use `make distclean' before
reconfiguring for another architecture.

Installation Names
==================

By default, `make install' installs the package's commands under
`/usr/local/bin', include files under `/usr/local/include', etc.  You
can specify an installation prefix other than `/usr/local' by giving
`configure' the option `--prefix=PREFIX'.

   You can specify separate installation prefixes for
architecture-specific files and architecture-independent files.  If you
pass the option `--exec-prefix=PREFIX' to `configure', the package uses
PREFIX as the prefix for installing programs and libraries.
Documentation and other data files still use the regular prefix.

   In addition, if you use an unusual directory layout you can give
options like `--bindir=DIR' to specify different values for particular
kinds of files.  Run `configure --help' for a list of the directories
you can set and what kinds of files go in them.

   If the package supports it, you can cause programs to be installed
with an extra prefix or suffix on their names by giving `configure' the
option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.

Optional Features
=================

Some packages pay attention to `--enable-FEATURE' options to
`configure', where FEATURE indicates an optional part of the package.
They may also pay attention to `--with-PACKAGE' options, where PACKAGE
is something like `gnu-as' or `x' (for the X Window System).  The
`README' should mention any `--enable-' and `--with-' options that the
package recognizes.

   For packages that use the X Window System, `configure' can usually
find the X include and library files automatically, but if it doesn't,
you can use the `configure' options `--x-includes=DIR' and
`--x-libraries=DIR' to specify their locations.

Specifying the System Type
==========================

There may be some features `configure' cannot figure out automatically,
but needs to determine by the type of machine the package will run on.
Usually, assuming the package is built to be run on the _same_
architectures, `configure' can figure that out, but if it prints a
message saying it cannot guess the machine type, give it the
`--build=TYPE' option.  TYPE can either be a short name for the system
type, such as `sun4', or a canonical name which has the form:

     CPU-COMPANY-SYSTEM

where SYSTEM can have one of these forms:

     OS KERNEL-OS

   See the file `config.sub' for the possible values of each field.  If
`config.sub' isn't included in this package, then this package doesn't
need to know the machine type.

   If you are _building_ compiler tools for cross-compiling, you should
use the option `--target=TYPE' to select the type of system they will
produce code for.

   If you want to _use_ a cross compiler, that generates code for a
platform different from the build platform, you should specify the
"host" platform (i.e., that on which the generated programs will
eventually be run) with `--host=TYPE'.

Sharing Defaults
================

If you want to set default values for `configure' scripts to share, you
can create a site shell script called `config.site' that gives default
values for variables like `CC', `cache_file', and `prefix'.
`configure' looks for `PREFIX/share/config.site' if it exists, then
`PREFIX/etc/config.site' if it exists.  Or, you can set the
`CONFIG_SITE' environment variable to the location of the site script.
A warning: not all `configure' scripts look for a site script.

Defining Variables
==================

Variables not defined in a site shell script can be set in the
environment passed to `configure'.  However, some packages may run
configure again during the build, and the customized values of these
variables may be lost.  In order to avoid this problem, you should set
them in the `configure' command line, using `VAR=value'.  For example:

     ./configure CC=/usr/local2/bin/gcc

causes the specified `gcc' to be used as the C compiler (unless it is
overridden in the site shell script).

Unfortunately, this technique does not work for `CONFIG_SHELL' due to
an Autoconf bug.  Until the bug is fixed you can use this workaround:

     CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash

`configure' Invocation
======================

`configure' recognizes the following options to control how it operates.

`--help'
`-h'
     Print a summary of the options to `configure', and exit.

`--version'
`-V'
     Print the version of Autoconf used to generate the `configure'
     script, and exit.

`--cache-file=FILE'
     Enable the cache: use and save the results of the tests in FILE,
     traditionally `config.cache'.  FILE defaults to `/dev/null' to
     disable caching.

`--config-cache'
`-C'
     Alias for `--cache-file=config.cache'.

`--quiet'
`--silent'
`-q'
     Do not print messages saying which checks are being made.  To
     suppress all normal output, redirect it to `/dev/null' (any error
     messages will still be shown).

`--srcdir=DIR'
     Look for the package's source code in directory DIR.  Usually
     `configure' can determine that directory automatically.

`configure' also accepts some other, not widely useful, options.  Run
`configure --help' for more details.


================================================
FILE: Makefile.am
================================================
# Data files handled by automake: README.md carries the dist_ prefix,
# while the pkgdata_DATA files rely on automake's default handling
dist_pkgdata_DATA = README.md
pkgdata_DATA = COPYING ChangeLog NEWS AUTHORS
# Subdirectories that "make" descends into, in this order
SUBDIRS = adept include benchmark test
# The test/ directory does not use automake so we need to specify the
# files that will be included in the distribution; the doc/ files are
# listed here as well since doc is not one of the SUBDIRS above
EXTRA_DIST = test/Makefile test/README test/*.cpp test/*.h test/run_tests.sh \
	doc/Makefile doc/README doc/COPYING doc/*.tex 
# Extra search path for aclocal macros
ACLOCAL_AMFLAGS = -I m4


================================================
FILE: NEWS
================================================
version 2.0
	- Fixed pausable recording and library-free compilation to provide full backwards compatibility with version 1.1
	- C++11 features such as initializer lists
	- Automatic vectorization of passive array statements if possible
	- Additional mathematical functions: round, trunc, rint, nearbyint, atan2, fmin, fmax
	- Additional array operations: spread, outer_product, count, maxval, minval, reshape
	- Many more test programs

version 1.9.8 (April 2016)
	- First beta release of version 2.0 incorporating array capability up to 7 dimensions
	- Matrix multiplication and basic linear algebra from BLAS and LAPACK
	- Options for thread-safe accessing of arrays

version 1.1 (June 2015)
	- Added ./configure script
	- Added support for additional mathematical functions: asinh, acosh, atanh, expm1, log1p, cbrt, erf, erfc, exp2, log2
	- License changed to Apache License, Version 2.0

================================================
FILE: README.md
================================================
# Adept 2: Combined array and automatic differentiation library in C++

## Introduction

The Adept version 2.1 software library provides three different
functionalities:

* Its automatic differentiation capability enables algorithms written
  in C++ to be differentiated with little code modification, very
  useful for a wide range of applications that involve mathematical
  optimization. It is backwards compatible with and as fast as Adept
  1.1. The name "Adept" refers to "Automatic Differentiation using
  Expression Templates".

* Its array capability provides support for vectors, matrices, arrays
  of up to 7 dimensions and linear algebra. Adept 2 uses a single
  expression-template framework under the hood to enable array
  operations to be differentiated with very good computational
  performance.

* Its optimization capability provides various minimization
  algorithms (Levenberg, Levenberg-Marquardt, Conjugate Gradient and
  Limited Memory BFGS), each of which can be used with or without box
  constraints on the state variables. The interface to the
  optimization functionality is in terms of Adept vectors and matrices.

If you are not interested in the array or optimization capabilities of
Adept 2 then Adept 1.1 may be more to your liking as a very
lightweight library that has virtually all the
automatic-differentiation capabilities of version 2.


## Documentation and links

* The [Adept web site](http://www.met.reading.ac.uk/clouds/adept/) for formal Adept releases
* The [Adept-2 GitHub page](https://github.com/rjhogan/Adept-2) for the latest snapshot
* The [Adept-1.1 GitHub page](https://github.com/rjhogan/Adept) for the older (scalar) library
* A detailed [User Guide](http://www.met.reading.ac.uk/clouds/adept/adept_documentation.pdf)
* A paper describing the automatic differentiation capability: [Hogan, R. J., 2014: Fast reverse-mode automatic differentiation using expression templates in C++. *ACM Trans. Math. Softw.* **40,** 26:1-26:16](http://www.met.reading.ac.uk/~swrhgnrj/publications/adept.pdf)
* The [Adept Wikipedia page](https://en.wikipedia.org/wiki/Adept_(C++_library))
* Bug fixes, and queries not answered by the documentation, should be addressed to Robin Hogan (r.j.hogan at ecmwf.int)

## Installation

To build Adept from a GitHub snapshot, first do the following to
recreate the configure script (requiring the autotools package):

    autoreconf -i

Formal release packages already contain a configure script. The normal
build sequence is then:

    ./configure
    make
    make check
    make install

Please consult the User Guide for further installation options; in
particular, if you plan to make serious use of matrix multiplication
and linear algebra then you should compile Adept to use an optimized
BLAS library such as OpenBLAS.


## License and copyright

The code in this package has a mix of copyright owners:

Copyright (C) 2012-2015 University of Reading

Copyright (C) 2015-     European Centre for Medium-Range Weather Forecasts

Two licenses are used for the code in this package:

* The files that form the Adept library are distributed under the
  conditions of the Apache License, Version 2 - see the COPYING file
  for details.  This is a permissive free-software license but one
  that does impose a few conditions if you intend to distribute
  derivative works.  The files this license applies to are those in
  the include/ and adept/ directories, and the subdirectories below
  them.

* All code in the test/ and benchmark/ directories is subject to the
  terms of the GNU all-permissive license, given at the top of those
  files - basically you can do what you like with the code from these
  files.

If you use Adept in published scientific work then it is requested
that you cite the Hogan (2014) paper above, but this is not a
condition of the license.


================================================
FILE: TODO
================================================
BUGS
spread<DIM> function does not use the right DIM

DESIRABLE BUT NEEDS NEW STACK
Differentiated BLAS operations on symmetric matrices etc
Implement general OpenMP for forward pass

OPTIMIZATION
Vectorize active expressions
Fix vectorization of spread and outer_product by storing pointer to start of row and not using index
Communicate band diagonals statically to optimize Array = band expression (e.g. 2*TridiagMatrix)
Implement active scalar precomputation
Optimize reciprocal to use 1.0 or 1.0f; vectorize
Optimize storage of data range
SquareMatrix::is_vectorizable = true

FEATURES
long double calls double matmul functions?
std::string configuration function returning options for this compilation unit
Mathematical functions copysign, fdim, hypot, remainder?
Implement user elemental function
Implement user choice of Jacobian array ordering
Clean-up benchmark and test_arrays/test_array_speed code
Check can do Array<*,Active<Real>,false>
Rename ExpressionSize
Enable functions taking ExpressionSize arguments (e.g. resize and array constructor) to take equivalent arguments, e.g. std::vector, initializer lists etc
Fall-back if BLAS not available
Implement pow<int> and sqr
Implement non-member functions merge?, reshape, shape?, size, [un]pack(?), minloc, maxloc
Implement matlab-like tile (generic repmat) plus zeros and ones
Implement iterators
Triangular/symmetric views
Const link does not increment reference counter
Cannot link non-const to const either by construction or explicit link
Should reduce functions take dimensions as template arguments?
reduce operations have a template version with the reduce dimension provided statically
differentiate complex number operations
matmul and solve on complex numbers
complex functions arg, abs, real, imag etc

CHECK
Check Square matmul
All vectorization combinations work, e.g. double/int, aligned/unaligned LHS
Set whole arrays as independent/dependent
Reduce RMS difference in Toon case

CLEAN
References to OpenMP for array operations - remove?

DOCUMENTATION
Document diag_vector non-member function (in reduce.h) and test in test_arrays

OLDER IDEAS
Clarify vector orientation when in matrix multiplication
Vector orientation changed with row(), col()?
Implement move semantics and make copy constructors do deep copy ADEPT_***
Implement OpenMP passive array operations
Implement OpenMP active array operations
Link can only be performed on empty object


If new Expression types are to be added, they should provide the
following interface:

      static const int  rank_      = 0;
      static const int  n_scratch_ = 0;
      static const int  n_active_ = 0;
      static const int  n_arrays_ = 0;
      static const bool is_active_ = false;
      static const bool is_vectorizable_ = true;

      bool get_dimensions_(ExpressionSize<0>& dim) const;

      std::string expression_string_() const;

      bool is_aliased_(const Type* mem1, const Type* mem2) const;

      Type value_with_len_(const Index& j, const Index& len) const;

      template <int MyArrayNum, int NArrays>
      void advance_location_(ExpressionSize<NArrays>& loc) const;

      template <int MyArrayNum, int NArrays>
      Type value_at_location_(const ExpressionSize<NArrays>& loc) const;

      template <int MyArrayNum, int NArrays>
      Packet<Type>
      packet_at_location_(const ExpressionSize<NArrays>& loc) const;

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_at_location_store_(const ExpressionSize<NArrays>& loc,
				    ScratchVector<NScratch>& scratch) const;

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_stored_(const ExpressionSize<NArrays>& loc,
			 const ScratchVector<NScratch>& scratch) const;

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      void calc_gradient_(Stack& stack, 
			  const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch) const;

      template <int MyArrayNum, int MyScratchNum, 
		int NArrays, int NScratch, typename MyType>
      void calc_gradient_(Stack& stack, 
			  const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch,
			  const MyType& multiplier) const;

      template <int MyArrayNum, int Rank, int NArrays>
      void set_location_(const ExpressionSize<Rank>& i, 
			 ExpressionSize<NArrays>& index) const;


================================================
FILE: adept/Array.cpp
================================================
/* Array.cpp -- Functions and global variables controlling array behaviour

    Copyright (C) 2015-2016 European Centre for Medium-Range Weather Forecasts

    Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/


#include <adept/Array.h>

namespace adept {
  // Global settings controlling array behaviour; they are defined
  // here with their start-up values
  namespace internal {
    // Array storage-order flag, true at start-up.
    // NOTE(review): presumably selects row-major rather than
    // column-major layout for newly created arrays - confirm against
    // Array.h
    bool array_row_major_order = true;
    //    bool array_print_curly_brackets = true;

    // Variables describing how arrays are written to a stream. The
    // initial values below are the same as those assigned by
    // set_array_print_style(PRINT_STYLE_CURLY), so the two places
    // need to be kept in step.
    ArrayPrintStyle array_print_style = PRINT_STYLE_CURLY;
    std::string vector_separator = ", ";
    std::string vector_print_before = "{";
    std::string vector_print_after = "}";
    std::string array_opening_bracket = "{";
    std::string array_closing_bracket = "}";
    std::string array_contiguous_separator = ", ";
    std::string array_non_contiguous_separator = ",\n";
    std::string array_print_before = "\n{";
    std::string array_print_after = "}";
    std::string array_print_empty_before = "(empty rank-";
    std::string array_print_empty_after = " array)";
    bool array_print_indent = true;
    bool array_print_empty_rank = true;
  }

  // Select one of the predefined styles used when arrays are written
  // to a stream, by overwriting every print-format variable in the
  // "internal" namespace with the values belonging to that style. If
  // "ps" is not one of the known styles then invalid_operation is
  // thrown and none of the settings are touched.
  void set_array_print_style(ArrayPrintStyle ps) {
    using namespace internal;
    if (ps == PRINT_STYLE_PLAIN) {
      // Separators: spaces within a row, newline between rows
      vector_separator = " ";
      array_contiguous_separator = " ";
      array_non_contiguous_separator = "\n";
      // No brackets of any kind
      vector_print_before = "";
      vector_print_after = "";
      array_opening_bracket = "";
      array_closing_bracket = "";
      array_print_before = "";
      array_print_after = "";
      // Empty arrays
      array_print_empty_before = "(empty rank-";
      array_print_empty_after = " array)";
      array_print_empty_rank = true;
      array_print_indent = false;
    }
    else if (ps == PRINT_STYLE_CSV) {
      // Separators: commas within a row, newline between rows
      vector_separator = ", ";
      array_contiguous_separator = ", ";
      array_non_contiguous_separator = "\n";
      // No brackets of any kind
      vector_print_before = "";
      vector_print_after = "";
      array_opening_bracket = "";
      array_closing_bracket = "";
      array_print_before = "";
      array_print_after = "";
      // Empty arrays
      array_print_empty_before = "empty";
      array_print_empty_after = "";
      array_print_empty_rank = false;
      array_print_indent = false;
    }
    else if (ps == PRINT_STYLE_MATLAB) {
      // Separators: spaces within a row, semicolon between rows
      vector_separator = " ";
      array_contiguous_separator = " ";
      array_non_contiguous_separator = ";\n";
      // Square brackets throughout
      vector_print_before = "[";
      vector_print_after = "]";
      array_opening_bracket = "[";
      array_closing_bracket = "]";
      array_print_before = "[";
      array_print_after = "]";
      // Empty arrays
      array_print_empty_before = "[";
      array_print_empty_after = "]";
      array_print_empty_rank = false;
      array_print_indent = true;
    }
    else if (ps == PRINT_STYLE_CURLY) {
      // Separators: commas within a row, comma and newline between rows
      vector_separator = ", ";
      array_contiguous_separator = ", ";
      array_non_contiguous_separator = ",\n";
      // Curly brackets throughout
      vector_print_before = "{";
      vector_print_after = "}";
      array_opening_bracket = "{";
      array_closing_bracket = "}";
      array_print_before = "\n{";
      array_print_after = "}";
      // Empty arrays
      array_print_empty_before = "(empty rank-";
      array_print_empty_after = " array)";
      array_print_empty_rank = true;
      array_print_indent = true;
    }
    else {
      throw invalid_operation("Array print style not understood");
    }
    // Only record the style once it is known to be valid
    array_print_style = ps;
  }

}


================================================
FILE: adept/Makefile.am
================================================
# The Adept library is built as a libtool library called libadept
lib_LTLIBRARIES = libadept.la
# Files that make up libadept
libadept_la_SOURCES = Array.cpp Stack.cpp StackStorageOrig.cpp \
	jacobian.cpp Storage.cpp index.cpp settings.cpp \
	cppblas.cpp cpplapack.h solve.cpp inv.cpp \
	vector_utilities.cpp Minimizer.cpp \
	minimize_limited_memory_bfgs.cpp minimize_levenberg_marquardt.cpp \
	minimize_conjugate_gradient.cpp line_search.cpp

# Put the top-level include directory on the preprocessor search path
# when compiling the library sources
libadept_la_CPPFLAGS = -I@top_srcdir@/include


================================================
FILE: adept/Minimizer.cpp
================================================
/* Minimizer.cpp -- class for minimizing the cost function of an optimizable object

    Copyright (C) 2020 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#include <cctype>

#include <adept/Minimizer.h>
#include <adept/exception.h>

namespace adept {

  // List of the names of available minimizer algorithms. The table
  // is indexed by the integer value of a MinimizerAlgorithm (see
  // Minimizer::algorithm_name), so the order of the entries must
  // match the order of the enumerators.
  static const char* minimizer_algorithm_names_[]
    = {"L-BFGS",
       "Conjugate-Gradient",
       "Conjugate-Gradient-FR",
       "Levenberg",
       "Levenberg-Marquardt"};

  // Lower-case versions of the list above, in the same order; the
  // index of the matching entry is cast to a MinimizerAlgorithm when
  // an algorithm is selected by name, so the two lists must be kept
  // parallel
  static const char* minimizer_algorithm_lower_names_[]
    = {"l-bfgs",
       "conjugate-gradient",
       "conjugate-gradient-fr",
       "levenberg",
       "levenberg-marquardt"};

  // Convert to lower case, and convert spaces and underscores to
  // hyphens. This function is used to do a case-insensitive
  // string-based selection of the minimizer algorithm to use. The
  // string is modified in place.
  static void to_lower_in_place(std::string& str) {
    for (std::string::size_type istr = 0; istr < str.size(); ++istr) {
      // std::tolower has undefined behaviour if its argument is
      // negative (other than EOF), which is what a non-ASCII
      // character becomes on platforms where "char" is signed, so
      // the character is passed as an unsigned char
      const unsigned char ch = static_cast<unsigned char>(str[istr]);
      char lower = static_cast<char>(std::tolower(ch));
      if (lower == ' ' || lower == '_') {
	lower = '-';
      }
      str[istr] = lower;
    }
  }

  // Translate a minimizer status code into a short human-readable
  // description. The returned pointer refers to a string literal;
  // codes that are not listed yield "Status unrecognized".
  const char*
  minimizer_status_string(MinimizerStatus status)
  {
    const char* description = "Status unrecognized";
    switch (status) {
    case MINIMIZER_STATUS_SUCCESS:
      description = "Converged";
      break;
    case MINIMIZER_STATUS_EMPTY_STATE:
      description = "Empty state vector, no minimization performed";
      break;
    case MINIMIZER_STATUS_MAX_ITERATIONS_REACHED:
      description = "Maximum iterations reached";
      break;
    case MINIMIZER_STATUS_FAILED_TO_CONVERGE:
      description = "Failed to converge";
      break;
    case MINIMIZER_STATUS_DIRECTION_UPHILL:
      description = "Search direction points uphill";
      break;
    case MINIMIZER_STATUS_BOUND_REACHED:
      // Should not be returned from a minimize function
      description = "Bound reached";
      break;
    case MINIMIZER_STATUS_INVALID_COST_FUNCTION:
      description = "Non-finite cost function";
      break;
    case MINIMIZER_STATUS_INVALID_GRADIENT:
      description = "Non-finite gradient";
      break;
    case MINIMIZER_STATUS_INVALID_BOUNDS:
      description = "Invalid bounds for bounded minimization";
      break;
    case MINIMIZER_STATUS_NOT_YET_CONVERGED:
      description = "Minimization still in progress";
      break;
    default:
      // Keep the "unrecognized" text assigned above
      break;
    }
    return description;
  }

  // Case-insensitive setting of the minimization algorithm given its
  // name. The name is first normalized with to_lower_in_place (lower
  // case, with spaces and underscores turned into hyphens) and then
  // compared against each entry of minimizer_algorithm_lower_names_;
  // the index of the matching entry is used as the
  // MinimizerAlgorithm. An optimization_exception is thrown if no
  // entry matches. Nothing is written to standard output: a library
  // function should not print diagnostics on every call.
  void
  Minimizer::set_algorithm(const std::string& algo) {
    std::string algo_lower = algo;
    to_lower_in_place(algo_lower);

    for (int ialgo = 0;
	 ialgo < static_cast<int>(MINIMIZER_ALGORITHM_NUMBER_AVAILABLE);
	 ++ialgo) {
      if (algo_lower == minimizer_algorithm_lower_names_[ialgo]) {
	set_algorithm(static_cast<MinimizerAlgorithm>(ialgo));
	return;
      }
    }
    throw optimization_exception("Algorithm name not understood");
  }

  // Return the name of the currently selected minimization
  // algorithm, looked up in minimizer_algorithm_names_, or "Unknown"
  // if the stored value lies outside the range of that table
  std::string
  Minimizer::algorithm_name() {
    const int ialgo = static_cast<int>(algorithm_);
    if (ialgo < 0 || ialgo >= MINIMIZER_ALGORITHM_NUMBER_AVAILABLE) {
      return "Unknown";
    }
    return minimizer_algorithm_names_[ialgo];
  }

  // Unconstrained minimization: dispatch to the member function
  // implementing the currently selected algorithm and return its
  // status. An optimization_exception is thrown if the algorithm is
  // of order greater than one but "optimizable" reports that it
  // cannot provide second derivatives, or if the algorithm is not
  // one of those handled below.
  MinimizerStatus
  Minimizer::minimize(Optimizable& optimizable, Vector x)
  {
    // The check on the optimizable is only made for algorithms of
    // order two and above
    if (minimizer_algorithm_order(algorithm_) > 1) {
      if (!optimizable.provides_derivative(2)) {
	throw optimization_exception("2nd-order minimization algorithm requires optimizable that can provide 2nd derivatives");
      }
    }

    switch (algorithm_) {
    case MINIMIZER_ALGORITHM_LIMITED_MEMORY_BFGS:
      return minimize_limited_memory_bfgs(optimizable, x);
    case MINIMIZER_ALGORITHM_CONJUGATE_GRADIENT:
      return minimize_conjugate_gradient(optimizable, x);
    case MINIMIZER_ALGORITHM_CONJUGATE_GRADIENT_FR:
      return minimize_conjugate_gradient(optimizable, x, true);
    case MINIMIZER_ALGORITHM_LEVENBERG:
      return minimize_levenberg_marquardt(optimizable, x, true);
    case MINIMIZER_ALGORITHM_LEVENBERG_MARQUARDT:
      return minimize_levenberg_marquardt(optimizable, x, false);
    default:
      throw optimization_exception("Minimization algorithm not recognized");
    }
  }

  // Constrained minimization: as the unconstrained version, but the
  // lower and upper bounds on the state variables are forwarded to
  // the "_bounded" implementation of the currently selected
  // algorithm. An optimization_exception is thrown if the algorithm
  // is of order greater than one but "optimizable" reports that it
  // cannot provide second derivatives, or if the algorithm is not
  // one of those handled below.
  MinimizerStatus
  Minimizer::minimize(Optimizable& optimizable, Vector x,
		      const Vector& x_lower, const Vector& x_upper)
  {
    // The check on the optimizable is only made for algorithms of
    // order two and above
    if (minimizer_algorithm_order(algorithm_) > 1) {
      if (!optimizable.provides_derivative(2)) {
	throw optimization_exception("2nd-order minimization algorithm requires optimizable that can provide 2nd derivatives");
      }
    }

    switch (algorithm_) {
    case MINIMIZER_ALGORITHM_LIMITED_MEMORY_BFGS:
      return minimize_limited_memory_bfgs_bounded(optimizable, x,
						  x_lower, x_upper);
    case MINIMIZER_ALGORITHM_CONJUGATE_GRADIENT:
      return minimize_conjugate_gradient_bounded(optimizable, x,
						 x_lower, x_upper);
    case MINIMIZER_ALGORITHM_CONJUGATE_GRADIENT_FR:
      return minimize_conjugate_gradient_bounded(optimizable, x,
						 x_lower, x_upper, true);
    case MINIMIZER_ALGORITHM_LEVENBERG:
      return minimize_levenberg_marquardt_bounded(optimizable, x,
						  x_lower, x_upper, true);
    case MINIMIZER_ALGORITHM_LEVENBERG_MARQUARDT:
      return minimize_levenberg_marquardt_bounded(optimizable, x,
						  x_lower, x_upper, false);
    default:
      throw optimization_exception("Constrained minimization algorithm not recognized");
    }
  }

};


================================================
FILE: adept/Stack.cpp
================================================
/* Stack.cpp -- Stack for storing automatic differentiation information

     Copyright (C) 2012-2014 University of Reading
    Copyright (C) 2015 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/


#include <iostream>
#include <cstring> // For memcpy


#ifdef _OPENMP
#include <omp.h>
#endif

#include <adept/Stack.h>


namespace adept {

  using namespace internal;

  // Global pointers to the currently active stack, both initially
  // null. The first is declared with ADEPT_THREAD_LOCAL; the second
  // is an ordinary global, and is the one read and written by the
  // Stack destructor and Stack::activate when the stack's
  // is_thread_unsafe_ member is set.
  // NOTE(review): presumably is_thread_unsafe_ reflects whether
  // ADEPT_STACK_THREAD_UNSAFE was defined - confirm in Stack.h
  ADEPT_THREAD_LOCAL Stack* _stack_current_thread = 0;
  Stack* _stack_current_thread_unsafe = 0;

  // MEMBER FUNCTIONS OF THE STACK CLASS

  // Destructor: deregisters this stack if it is the active one and
  // frees the gradient array (unless ADEPT_STACK_STORAGE_STL is
  // defined, in which case nothing is freed here)
  Stack::~Stack() {
    // Pick the global pointer that this stack would have been
    // registered in; if it refers to this object then clear it, as
    // "this" is shortly to become invalid
    Stack*& active_stack = is_thread_unsafe_
      ? _stack_current_thread_unsafe : _stack_current_thread;
    if (active_stack == this) {
      active_stack = 0;
    }
#ifndef ADEPT_STACK_STORAGE_STL
    // Deleting a null pointer is harmless so no check is needed
    delete[] gradient_;
#endif
  }
  
  // Make this stack the "active" one by storing its address in the
  // global pointer appropriate to its thread-safety setting, so that
  // aReal objects subsequently interact with it when being created
  // and participating in mathematical expressions. If that pointer
  // already refers to a different stack then stack_already_active
  // is thrown; activating the stack that is already active is
  // permitted.
  void
  Stack::activate()
  {
    if (is_thread_unsafe_) {
      if (_stack_current_thread_unsafe
	  && _stack_current_thread_unsafe != this) {
	throw(stack_already_active());
      }
      _stack_current_thread_unsafe = this;
    }
    else {
      if (_stack_current_thread
	  && _stack_current_thread != this) {
	throw(stack_already_active());
      }
      _stack_current_thread = this;
    }
  }

  
  // Request the maximum number of threads to be used in Jacobian
  // calculations and return the maximum that will actually be used.
  //   n == 1: OpenMP is flagged as manually disabled for this stack
  //           and 1 is returned
  //   n <  1: the number of threads is set to the number of
  //           available processors
  //   n >  1: the number of threads is set to n
  // If the library was compiled without OpenMP support, or
  // have_openmp_ is false, the request is ignored and 1 is
  // returned. Note that if in your program you use OpenMP with each
  // thread performing automatic differentiation with its own
  // independent Adept stack, then typically only one OpenMP thread
  // is available for each Jacobian calculation, regardless of
  // whether you call this function.
  int
  Stack::set_max_jacobian_threads(int n)
  {
#ifdef _OPENMP
    if (have_openmp_) {
      const bool disable_openmp = (n == 1);
      openmp_manually_disabled_ = disable_openmp;
      if (disable_openmp) {
	return 1;
      }
      // A non-positive request means "use all available processors"
      omp_set_num_threads(n < 1 ? omp_get_num_procs() : n);
      return omp_get_max_threads();
    }
#endif
    return 1;
  }


  // Return the maximum number of OpenMP threads to be used in a
  // Jacobian calculation: the value reported by OpenMP if it is
  // available and has not been manually disabled for this stack,
  // otherwise 1
  int 
  Stack::max_jacobian_threads() const
  {
#ifdef _OPENMP
    if (have_openmp_ && !openmp_manually_disabled_) {
      return omp_get_max_threads();
    }
#endif
    return 1;
  }


  // Perform to adjoint computation (reverse mode). It is assumed that
  // some gradients have been assigned already, otherwise the function
  // returns with an error.
  void
  Stack::compute_adjoint()
  {
    if (gradients_are_initialized()) {
      // Loop backwards through the derivative statements
      for (uIndex ist = n_statements_-1; ist > 0; ist--) {
	const Statement& statement = statement_[ist];
	// We copy the RHS gradient (LHS in the original derivative
	// statement but swapped in the adjoint equivalent) to "a" in
	// case it appears on the LHS in any of the following statements
	Real a = gradient_[statement.index];
	gradient_[statement.index] = 0.0;
	// By only looping if a is non-zero we gain a significant speed-up
	if (a != 0.0) {
	  // Loop over operations
	  for (uIndex i = statement_[ist-1].end_plus_one;
	       i < statement.end_plus_one; i++) {
	    gradient_[index_[i]] += multiplier_[i]*a;
	  }
	}
      }
    }  
    else {
      throw(gradients_not_initialized());
    }  
  }


  // Perform tangent linear computation (forward mode). It is assumed
  // that some gradients have been assigned already, otherwise the
  // function returns with an error.
  void
  Stack::compute_tangent_linear()
  {
    if (gradients_are_initialized()) {
      // Loop forward through the statements
      for (uIndex ist = 1; ist < n_statements_; ist++) {
	const Statement& statement = statement_[ist];
	// We copy the LHS to "a" in case it appears on the RHS in any
	// of the following statements
	Real a = 0.0;
	for (uIndex i = statement_[ist-1].end_plus_one;
	     i < statement.end_plus_one; i++) {
	  a += multiplier_[i]*gradient_[index_[i]];
	}
	gradient_[statement.index] = a;
      }
    }
    else {
      throw(gradients_not_initialized());
    }
  }


  // Register n consecutive gradients and return the index of the
  // first one. The first gap in the gap list that is at least n long
  // is reused if there is one; otherwise the indices are taken from
  // the top of the gradient list.
  uIndex
  Stack::do_register_gradients(const uIndex& n) {
    n_gradients_registered_ += n;
    for (GapListIterator gap = gap_list_.begin();
         gap != gap_list_.end(); gap++) {
      const uIndex gap_length = gap->end + 1 - gap->start;
      if (gap_length < n) {
        // Too small: try the next gap
        continue;
      }
      const uIndex first_index = gap->start;
      if (gap_length > n) {
        // Gap larger than needed: consume its first n entries
        gap->start += n;
      }
      else {
        // Gap exactly filled: remove it, taking care not to leave
        // most_recent_gap_ pointing at the erased element
        const bool was_most_recent = (most_recent_gap_ == gap);
        gap_list_.erase(gap);
        if (was_most_recent) {
          most_recent_gap_ = gap_list_.end();
        }
      }
      return first_index;
    }
    // No suitable gap: extend the gradient list and keep track of
    // the largest extent it has reached
    i_gradient_ += n;
    if (max_gradient_ < i_gradient_) {
      max_gradient_ = i_gradient_;
    }
    return i_gradient_ - n;
  }
  

  // If an aReal object is deleted, its gradient_index is
  // unregistered from the stack.  If this is at the top of the stack
  // then this is easy and is done inline; this is the usual case
  // since C++ tries to deallocate automatic objects in the reverse
  // order to that in which they were allocated.  If it is not at the
  // top of the stack then this non-inline function is called to
  // ensure that the gap list is adjusted correctly.
  //
  // The single index "gradient_index" is recorded in gap_list_ by
  // extending an adjacent gap downwards or upwards, or failing that
  // by inserting a new gap of width 1; two gaps that become adjacent
  // as a result are merged. most_recent_gap_ is left referring to the
  // gap that now contains gradient_index.
  //
  // NOTE(review): unlike unregister_gradients, this function does not
  // decrement n_gradients_registered_; presumably the inline caller
  // does so - confirm against the header.
  void
  Stack::unregister_gradient_not_top(const uIndex& gradient_index)
  {
    // Records how gradient_index was added to the gap list, which
    // determines the merge check needed at the end
    enum {
      ADDED_AT_BASE,
      ADDED_AT_TOP,
      NEW_GAP,
      NOT_FOUND
    } status = NOT_FOUND;
    // First try to find if the unregistered element is at the
    // start or end of an existing gap
    if (!gap_list_.empty() && most_recent_gap_ != gap_list_.end()) {
      // We have a "most recent" gap - check whether the gradient
      // to be unregistered is here
      Gap& current_gap = *most_recent_gap_;
      if (gradient_index == current_gap.start - 1) {
	current_gap.start--;
	status = ADDED_AT_BASE;
      }
      else if (gradient_index == current_gap.end + 1) {
	current_gap.end++;
	status = ADDED_AT_TOP;
      }
      // Should we check for erroneous removal from middle of gap?
    }
    if (status == NOT_FOUND) {
      // Search other gaps, stopping at the first gap whose end is not
      // below gradient_index
      for (GapListIterator it = gap_list_.begin();
	   it != gap_list_.end(); it++) {
	if (gradient_index <= it->end + 1) {
	  // Gradient to unregister is either within the gap
	  // referenced by iterator "it", or it is between "it"
	  // and the previous gap in the list
	  if (gradient_index == it->start - 1) {
	    status = ADDED_AT_BASE;
	    it->start--;
	    most_recent_gap_ = it;
	  }
	  else if (gradient_index == it->end + 1) {
	    status = ADDED_AT_TOP;
	    it->end++;
	    most_recent_gap_ = it;
	  }
	  else {
	    // Insert a new gap of width 1; note that list::insert
	    // inserts *before* the specified location
	    most_recent_gap_
	      = gap_list_.insert(it, Gap(gradient_index));
	    status = NEW_GAP;
	  }
	  break;
	}
      }
      if (status == NOT_FOUND) {
	// gradient_index lies beyond every existing gap: append a new
	// gap and make it the most recent one
	gap_list_.push_back(Gap(gradient_index));
	most_recent_gap_ = gap_list_.end();
	most_recent_gap_--;
      }
    }
    // Finally check if gaps have merged
    if (status == ADDED_AT_BASE
	&& most_recent_gap_ != gap_list_.begin()) {
      // The gap grew downwards, so check whether it now touches the
      // *previous* gap in the list
      GapListIterator it = most_recent_gap_;
      it--;
      if (it->end == most_recent_gap_->start - 1) {
	// Merge two gaps: absorb the previous gap into the most recent
	// one and erase the previous gap
	most_recent_gap_->start = it->start;
	gap_list_.erase(it);
      }
    }
    else if (status == ADDED_AT_TOP) {
      // The gap grew upwards, so check whether it now touches the
      // next gap in the list
      GapListIterator it = most_recent_gap_;
      it++;
      if (it != gap_list_.end()
	  && it->start == most_recent_gap_->end + 1) {
	// Merge two gaps: absorb the next gap into the most recent one
	// and erase the next gap
	most_recent_gap_->end = it->end;
	gap_list_.erase(it);
      }
    }
  }	


  // Unregister n gradients starting at gradient_index, i.e. the
  // indices gradient_index to gradient_index+n-1.
  //
  // If the range ends at the top of the gradient list then i_gradient_
  // is simply lowered, and a gap that now reaches the top is removed
  // as well. Otherwise the range is recorded in gap_list_ by extending
  // an adjacent gap, or by creating a new gap, and gaps that become
  // adjacent are merged. n_gradients_registered_ is reduced by n in
  // either case.
  void
  Stack::unregister_gradients(const uIndex& gradient_index,
			      const uIndex& n)
  {
    n_gradients_registered_ -= n;
    if (gradient_index+n == i_gradient_) {
      // Gradient to be unregistered is at the top of the stack
      i_gradient_ -= n;
      if (!gap_list_.empty()) {
	Gap& last_gap = gap_list_.back();
	if (i_gradient_ == last_gap.end+1) {
	  // We have unregistered the elements between the last gap of
	  // unregistered elements and the top of the stack, so the
	  // gap itself is now at the top: lower i_gradient_ to the
	  // start of the gap and remove the gap from the list
	  i_gradient_ = last_gap.start;
	  GapListIterator it = gap_list_.end();
	  it--;
	  // Do not leave most_recent_gap_ referring to the gap that is
	  // about to be removed
	  if (most_recent_gap_ == it) {
	    most_recent_gap_ = gap_list_.end();
	  }
	  gap_list_.pop_back();
	}
      }
    }
    else { // Gradients to be unregistered not at top of stack.
      // Records how the range was added to the gap list, which
      // determines the merge check needed at the end
      enum {
	ADDED_AT_BASE,
	ADDED_AT_TOP,
	NEW_GAP,
	NOT_FOUND
      } status = NOT_FOUND;
      // First try to find if the unregistered element is at the start
      // or end of an existing gap
      if (!gap_list_.empty() && most_recent_gap_ != gap_list_.end()) {
	// We have a "most recent" gap - check whether the gradient
	// to be unregistered is here
	Gap& current_gap = *most_recent_gap_;
	if (gradient_index == current_gap.start - n) {
	  current_gap.start -= n;
	  status = ADDED_AT_BASE;
	}
	else if (gradient_index == current_gap.end + 1) {
	  current_gap.end += n;
	  status = ADDED_AT_TOP;
	}
	/*
	else if (gradient_index > current_gap.start - n
		 && gradient_index < current_gap.end + 1) {
	  std::cout << "** Attempt to find " << gradient_index << " in gaps ";
	  print_gaps();
	  std::cout << "\n";
	  throw invalid_operation("Gap list corruption");
	}
	*/
	// Should we check for erroneous removal from middle of gap?
      }
      if (status == NOT_FOUND) {
	// Search other gaps, stopping at the first gap whose end is
	// not below gradient_index
	for (GapListIterator it = gap_list_.begin();
	     it != gap_list_.end(); it++) {
	  if (gradient_index <= it->end + 1) {
	    // Gradient to unregister is either within the gap
	    // referenced by iterator "it", or it is between "it" and
	    // the previous gap in the list
	    if (gradient_index == it->start - n) {
	      status = ADDED_AT_BASE;
	      it->start -= n;
	      most_recent_gap_ = it;
	    }
	    else if (gradient_index == it->end + 1) {
	      status = ADDED_AT_TOP;
	      it->end += n;
	      most_recent_gap_ = it;
	    }
	    /*
	    else if (gradient_index > it->start - n) {
	      std::cout << "*** Attempt to find " << gradient_index << " in gaps ";
	      print_gaps();
	      std::cout << "\n";
	      throw invalid_operation("Gap list corruption");
	    }
	    */
	    else {
	      // Insert a new gap; note that list::insert inserts
	      // *before* the specified location
	      most_recent_gap_
		= gap_list_.insert(it, Gap(gradient_index,
					   gradient_index+n-1));
	      status = NEW_GAP;
	    }
	    break;
	  }
	}
	if (status == NOT_FOUND) {
	  // The range lies beyond every existing gap: append a new
	  // gap and make it the most recent one
	  gap_list_.push_back(Gap(gradient_index,
				  gradient_index+n-1));
	  most_recent_gap_ = gap_list_.end();
	  most_recent_gap_--;
	}
      }
      // Finally check if gaps have merged
      if (status == ADDED_AT_BASE
	  && most_recent_gap_ != gap_list_.begin()) {
	// The gap grew downwards, so check whether it now touches the
	// *previous* gap in the list
	GapListIterator it = most_recent_gap_;
	it--;
	if (it->end == most_recent_gap_->start - 1) {
	  // Merge two gaps: absorb the previous gap into the most
	  // recent one and erase the previous gap
	  most_recent_gap_->start = it->start;
	  gap_list_.erase(it);
	}
      }
      else if (status == ADDED_AT_TOP) {
	// The gap grew upwards, so check whether it now touches the
	// next gap in the list
	GapListIterator it = most_recent_gap_;

	it++;
	if (it != gap_list_.end()
	    && it->start == most_recent_gap_->end + 1) {
	  // Merge two gaps: absorb the next gap into the most recent
	  // one and erase the next gap
	  most_recent_gap_->end = it->end;
	  gap_list_.erase(it);
	}
      }
    }
  }
  
  
  // Print each derivative statement to the specified stream (standard
  // output if omitted)
  void
  Stack::print_statements(std::ostream& os) const
  {
    for (uIndex ist = 1; ist < n_statements_; ist++) {
      const Statement& statement = statement_[ist];
      os << ist
		<< ": d[" << statement.index
		<< "] = ";
      
      if (statement_[ist-1].end_plus_one == statement_[ist].end_plus_one) {
	os << "0\n";
      }
      else {    
	for (uIndex i = statement_[ist-1].end_plus_one;
	     i < statement.end_plus_one; i++) {
	  os << " + " << multiplier_[i] << "*d[" << index_[i] << "]";
	}
	os << "\n";
      }
    }
  }
  
  // Write the current gradient list to the specified stream (standard
  // output if omitted), ten values per line with each line prefixed
  // by the index of its first value. Returns false, after writing a
  // short message, if the gradients have not been initialized.
  bool
  Stack::print_gradients(std::ostream& os) const
  {
    if (!gradients_are_initialized()) {
      os << "No gradients initialized\n";
      return false;
    }
    for (uIndex i = 0; i < max_gradient_; i++) {
      const bool starts_row = (i%10 == 0);
      if (starts_row && i != 0) {
        // Terminate the previous row
        os << "\n";
      }
      if (starts_row) {
        os << i << ":";
      }
      os << " " << gradient_[i];
    }
    os << "\n";
    return true;
  }

  // Print the list of gaps in the gradient list to the specified
  // stream (standard output if omitted)
  void
  Stack::print_gaps(std::ostream& os) const
  {
    for (std::list<Gap>::const_iterator it = gap_list_.begin();
	 it != gap_list_.end(); it++) {
      os << it->start << "-" << it->end << " ";
    }
  }


#ifndef ADEPT_STACK_STORAGE_STL
  // Initialize the vector of gradients ready for the adjoint
  // calculation: make sure there is room for max_gradient_ values,
  // set them all to zero and flag the gradients as initialized.
  void
  Stack::initialize_gradients()
  {
    if (max_gradient_ > 0) {
      if (n_allocated_gradients_ < max_gradient_) {
        // Release the old buffer and reset the bookkeeping *before*
        // allocating, so that if "new" throws the object does not
        // keep a dangling pointer and a stale allocation count
        delete[] gradient_;
        gradient_ = 0;
        n_allocated_gradients_ = 0;
        gradient_ = new Real[max_gradient_];
        n_allocated_gradients_ = max_gradient_;
      }
      for (uIndex i = 0; i < max_gradient_; i++) {
        gradient_[i] = 0.0;
      }
    }
    gradients_initialized_ = true;
  }
#else
  // As above, but for the storage engine in which gradient_ is an STL
  // container. assign() is used rather than resize() because resize()
  // only value-fills elements that it appends: any gradients left in
  // the container by an earlier calculation would otherwise survive,
  // whereas the array version above always zeroes every entry.
  // NOTE(review): the reason for the 10 spare elements is not evident
  // from this file - confirm before changing.
  void
  Stack::initialize_gradients()
  {
    gradient_.assign(max_gradient_+10, 0.0);
    gradients_initialized_ = true;
  }
#endif

  // Write a human-readable summary of the state of the stack to the
  // specified stream (standard output if omitted): attachment,
  // recording status, sizes of the statement, operation and gradient
  // lists, gaps, Jacobian dimensions and OpenMP availability. Note
  // that this is synonymous with sending the Stack object to a stream
  // using the "<<" operator.
  void
  Stack::print_status(std::ostream& os) const
  {
    os << "Automatic Differentiation Stack (address " << this << "):\n";

    // Is this the currently active stack of the kind it was created
    // as (thread safe or thread unsafe)?
    if ((!is_thread_unsafe_) && _stack_current_thread == this) {
      os << "   Currently attached - thread safe\n";
    }
    else if (is_thread_unsafe_ && _stack_current_thread_unsafe == this) {
      os << "   Currently attached - thread unsafe\n";
    }
    else {
      os << "   Currently detached\n";
    }

    os << "   Recording status:\n"
       << (is_recording_ ? "      Recording is ON\n"
                         : "      Recording is PAUSED\n");

    // Account for the null statement at the start by subtracting one
    os << "      " << n_statements()-1 << " statements ("
       << n_allocated_statements() << " allocated) and "
       << n_operations() << " operations ("
       << n_allocated_operations() << " allocated)\n";

    os << "      " << n_gradients_registered()
       << " gradients currently registered and a total of "
       << max_gradients() << " needed (current index "
       << i_gradient() << ")\n";

    if (!gap_list_.empty()) {
      os << "      Gradient list has " << gap_list_.size() << " gaps (";
      print_gaps(os);
      os << ")\n";
    }
    else {
      os << "      Gradient list has no gaps\n";
    }

    os << "   Computation status:\n";
    os << "      ";
    if (gradients_are_initialized()) {
      os << max_gradients();
    }
    else {
      os << "0";
    }
    os << " gradients assigned (" << n_allocated_gradients()
       << " allocated)\n";

    os << "      Jacobian size: " << n_dependents() << "x"
       << n_independents() << "\n";
    // Only list the individual indices when the Jacobian is small
    if (n_dependents() <= 10 && n_independents() <= 10) {
      os << "      Independent indices:";
      for (std::size_t i = 0; i < independent_index_.size(); ++i) {
        os << " " << independent_index_[i];
      }
      os << "\n      Dependent indices:  ";
      for (std::size_t i = 0; i < dependent_index_.size(); ++i) {
        os << " " << dependent_index_[i];
      }
      os << "\n";
    }

#ifdef _OPENMP
    if (!have_openmp_) {
      os << "      Parallel Jacobian calculation not available\n";
    }
    else if (openmp_manually_disabled_) {
      os << "      Parallel Jacobian calculation manually disabled\n";
    }
    else {
      os << "      Parallel Jacobian calculation can use up to "
         << omp_get_max_threads() << " threads\n";
      os << "      Each thread treats " << ADEPT_MULTIPASS_SIZE
         << " (in)dependent variables\n";
    }
#else
    os << "      Parallel Jacobian calculation not available\n";
#endif
  }
} // End namespace adept


================================================
FILE: adept/StackStorageOrig.cpp
================================================
/* StackStorageOrig.cpp -- Original storage of stacks using STL containers

    Copyright (C) 2014-2015 University of Reading

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

   The Stack class inherits from a class providing the storage (and
   interface to the storage) for the derivative statements that are
   accumulated during the execution of an algorithm.  The derivative
   statements are held in two stacks described by Hogan (2014): the
   "statement stack" and the "operation stack".

   This file provides one of the original storage engines, which used
   std::vector to hold the two stacks. Note that these stacks are
   contiguous in memory, which is not ideal for very large algorithms.

*/

#include <cstring>

#include <adept/StackStorageOrig.h>

namespace adept {
  namespace internal {

    // Release the arrays holding the statement stack and the two
    // halves (multipliers and indices) of the operation stack.
    // Applying delete[] to a null pointer has no effect, so pointers
    // that were never allocated need no special treatment.
    StackStorageOrig::~StackStorageOrig() {
      delete[] statement_;
      delete[] multiplier_;
      delete[] index_;
    }


    // Double the size of the operation stack, or grow it even more if
    // the requested minimum number of extra entries (min) is greater
    // than doubling would allow. The first n_operations_ entries of
    // the multiplier and index arrays are preserved. If either
    // allocation fails the exception propagates and the existing
    // stack is left untouched.
    void
    StackStorageOrig::grow_operation_stack(uIndex min)
    {
      uIndex new_size = 2*n_allocated_operations_;
      if (min > 0 && new_size < n_allocated_operations_+min) {
        new_size += min;
      }
      if (new_size == 0) {
        // Nothing allocated yet and no minimum requested: doubling
        // zero would not grow the stack at all
        new_size = 1;
      }

      Real* new_multiplier = new Real[new_size];
      uIndex* new_index = 0;
      try {
        new_index = new uIndex[new_size];
      }
      catch (...) {
        // Do not leak the first array if the second cannot be
        // allocated
        delete[] new_multiplier;
        throw;
      }

      // memcpy must not be given a null source pointer, which is what
      // multiplier_ and index_ may be when the stack is empty
      if (n_operations_ > 0) {
        std::memcpy(new_multiplier, multiplier_,
                    n_operations_*sizeof(Real));
        std::memcpy(new_index, index_, n_operations_*sizeof(uIndex));
      }

      delete[] multiplier_;
      delete[] index_;

      multiplier_ = new_multiplier;
      index_ = new_index;

      n_allocated_operations_ = new_size;
    }
    
    // Double the size of the statement stack, or grow it even more if
    // the requested minimum number of extra entries (min) is greater
    // than doubling would allow. The first n_statements_ entries are
    // preserved. If the allocation fails the exception propagates and
    // the existing stack is left untouched.
    void
    StackStorageOrig::grow_statement_stack(uIndex min)
    {
      uIndex new_size = 2*n_allocated_statements_;
      if (min > 0 && new_size < n_allocated_statements_+min) {
        new_size += min;
      }
      if (new_size == 0) {
        // Nothing allocated yet and no minimum requested: doubling
        // zero would not grow the stack at all
        new_size = 1;
      }

      Statement* new_statement = new Statement[new_size];
      // memcpy must not be given a null source pointer, which is what
      // statement_ may be when the stack is empty
      if (n_statements_ > 0) {
        std::memcpy(new_statement, statement_,
                    n_statements_*sizeof(Statement));
      }
      delete[] statement_;

      statement_ = new_statement;

      n_allocated_statements_ = new_size;
    }

  }
}


================================================
FILE: adept/Storage.cpp
================================================
/* Storage.cpp -- Global variables recording use of Storage objects

    Copyright (C) 2015 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#include <adept/Storage.h>

namespace adept {
  namespace internal {
    // Global counters recording the use of Storage objects; judging
    // by their names they hold the numbers created and deleted so far
    // (they are updated elsewhere - confirm against Storage.h). Both
    // start at zero; the initializers simply make explicit the zero
    // initialization that namespace-scope variables receive anyway.
    Index n_storage_objects_created_ = 0;
    Index n_storage_objects_deleted_ = 0;
  }
}


================================================
FILE: adept/cppblas.cpp
================================================
/* cppblas.cpp -- C++ interface to BLAS functions

    Copyright (C) 2015-2016 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

   This file provides a C++ interface to selected Level-2 and -3 BLAS
   functions in which the precision of the arguments (float versus
   double) is inferred via overloading

*/

#include <adept/exception.h>
#include <adept/cppblas.h>

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef HAVE_BLAS

// Declarations of the Fortran BLAS routines wrapped in this file. In
// the Fortran calling convention every argument is passed by address.
// Arguments that BLAS only reads are declared const; the result
// arrays (C for the matrix-matrix routines, Y for the matrix-vector
// routines) are written by BLAS and are therefore non-const, in the
// same way for every routine.
extern "C" {
  // General matrix-matrix multiplication
  void sgemm_(const char* TransA, const char* TransB, const int* M,
              const int* N, const int* K, const float* alpha,
              const float* A, const int* lda, const float* B, const int* ldb,
              const float* beta, float* C, const int* ldc);
  void dgemm_(const char* TransA, const char* TransB, const int* M,
              const int* N, const int* K, const double* alpha,
              const double* A, const int* lda, const double* B, const int* ldb,
              const double* beta, double* C, const int* ldc);
  // General matrix-vector multiplication
  void sgemv_(const char* TransA, const int* M, const int* N, const float* alpha,
              const float* A, const int* lda, const float* X, const int* incX,
              const float* beta, float* Y, const int* incY);
  void dgemv_(const char* TransA, const int* M, const int* N, const double* alpha,
              const double* A, const int* lda, const double* X, const int* incX,
              const double* beta, double* Y, const int* incY);
  // Symmetric matrix-matrix multiplication
  void ssymm_(const char* side, const char* uplo, const int* M, const int* N,
              const float* alpha, const float* A, const int* lda, const float* B,
              const int* ldb, const float* beta, float* C, const int* ldc);
  void dsymm_(const char* side, const char* uplo, const int* M, const int* N,
              const double* alpha, const double* A, const int* lda, const double* B,
              const int* ldb, const double* beta, double* C, const int* ldc);
  // Symmetric matrix-vector multiplication
  void ssymv_(const char* uplo, const int* N, const float* alpha, const float* A,
              const int* lda, const float* X, const int* incX, const float* beta,
              float* Y, const int* incY);
  void dsymv_(const char* uplo, const int* N, const double* alpha, const double* A,
              const int* lda, const double* X, const int* incX, const double* beta,
              double* Y, const int* incY);
  // Band matrix-vector multiplication
  void sgbmv_(const char* TransA, const int* M, const int* N, const int* kl,
              const int* ku, const float* alpha, const float* A, const int* lda,
              const float* X, const int* incX, const float* beta,
              float* Y, const int* incY);
  void dgbmv_(const char* TransA, const int* M, const int* N, const int* kl,
              const int* ku, const double* alpha, const double* A, const int* lda,
              const double* X, const int* incX, const double* beta,
              double* Y, const int* incY);
}

namespace adept {

  namespace internal {
    
    // Matrix-matrix multiplication for general dense matrices,
    // defined for double and float. The Fortran routine works on
    // column-major storage, so a row-major request is passed on with
    // the roles of A and B (and of M and N) exchanged. The third
    // macro argument is not used.
#define ADEPT_DEFINE_GEMM(TYPE, BLAS_FUNC, BLAS_FUNC_COMPLEX) \
    void cppblas_gemm(BLAS_ORDER Order, \
                      BLAS_TRANSPOSE TransA, BLAS_TRANSPOSE TransB, \
                      int M, int N, int K, \
                      TYPE alpha, const TYPE *A, int lda, \
                      const TYPE *B, int ldb, \
                      TYPE beta, TYPE *C, int ldc) { \
      if (Order != BlasColMajor) { \
        BLAS_FUNC(&TransB, &TransA, &N, &M, &K, &alpha, B, &ldb, \
                  A, &lda, &beta, C, &ldc); \
        return; \
      } \
      BLAS_FUNC(&TransA, &TransB, &M, &N, &K, &alpha, A, &lda, \
                B, &ldb, &beta, C, &ldc); \
    }
    ADEPT_DEFINE_GEMM(double, dgemm_, zgemm_)
    ADEPT_DEFINE_GEMM(float,  sgemm_, cgemm_)
#undef ADEPT_DEFINE_GEMM
    
    // Matrix-vector multiplication for a general dense matrix
#define ADEPT_DEFINE_GEMV(T, FUNC, FUNC_COMPLEX)		\
    void cppblas_gemv(const BLAS_ORDER Order,			\
		      const BLAS_TRANSPOSE TransA,		\
		      const int M, const int N,			\
		      const T alpha, const T *A, const int lda,	\
		      const T *X, const int incX, const T beta,	\
		      T *Y, const int incY) {			\
      if (Order == BlasColMajor) {				\
        FUNC(&TransA, &M, &N, &alpha, A, &lda, X, &incX, 	\
	     &beta, Y, &incY);					\
      }								\
      else {							\
        BLAS_TRANSPOSE TransNew					\
	  = TransA == BlasTrans ? BlasNoTrans : BlasTrans;	\
        FUNC(&TransNew, &N, &M, &alpha, A, &lda, X, &incX, 	\
	     &beta, Y, &incY);					\
      }								\
    }
    ADEPT_DEFINE_GEMV(double, dgemv_, zgemv_)
    ADEPT_DEFINE_GEMV(float,  sgemv_, cgemv_)
#undef ADEPT_DEFINE_GEMV
    
    // Matrix-matrix multiplication where matrix A is symmetric
    // FIX! CHECK ROW MAJOR VERSION IS RIGHT			
#define ADEPT_DEFINE_SYMM(T, FUNC, FUNC_COMPLEX)			\
    void cppblas_symm(const BLAS_ORDER Order,				\
		      const BLAS_SIDE Side,				\
		      const BLAS_UPLO Uplo,				\
		      const int M, const int N,				\
		      const T alpha, const T *A, const int lda,		\
		      const T *B, const int ldb, const T beta,		\
		      T *C, const int ldc) {				\
      if (Order == BlasColMajor) {					\
        FUNC(&Side, &Uplo, &M, &N, &alpha, A, &lda,			\
	     B, &ldb, &beta, C, &ldc);					\
      }									\
      else {								\
	BLAS_SIDE SideNew = Side == BlasLeft  ? BlasRight : BlasLeft;	\
	BLAS_UPLO UploNew = Uplo == BlasUpper ? BlasLower : BlasUpper;  \
        FUNC(&SideNew, &UploNew, &N, &M, &alpha, A, &lda,		\
	     B, &ldb, &beta, C, &ldc);					\
      }									\
    }
    ADEPT_DEFINE_SYMM(double, dsymm_, zsymm_)
    ADEPT_DEFINE_SYMM(float,  ssymm_, csymm_)
#undef ADEPT_DEFINE_SYMM
    
    // Matrix-vector multiplication where the matrix is symmetric
#define ADEPT_DEFINE_SYMV(T, FUNC, FUNC_COMPLEX)			\
    void cppblas_symv(const BLAS_ORDER Order,				\
		      const BLAS_UPLO Uplo,				\
		      const int N, const T alpha, const T *A,		\
		      const int lda, const T *X, const int incX,	\
		      const T beta, T *Y, const int incY) {		\
      if (Order == BlasColMajor) {					\
        FUNC(&Uplo, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);	\
      }									\
      else {								\
        BLAS_UPLO UploNew = Uplo == BlasUpper ? BlasLower : BlasUpper;  \
        FUNC(&UploNew, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);	\
      }									\
    }
    ADEPT_DEFINE_SYMV(double, dsymv_, zsymv_)
    ADEPT_DEFINE_SYMV(float,  ssymv_, csymv_)
#undef ADEPT_DEFINE_SYMV
    
    // Matrix-vector multiplication for a general band matrix
#define ADEPT_DEFINE_GBMV(T, FUNC, FUNC_COMPLEX)		\
    void cppblas_gbmv(const BLAS_ORDER Order,			\
		      const BLAS_TRANSPOSE TransA,		\
		      const int M, const int N,			\
		      const int KL, const int KU, const T alpha,\
		      const T *A, const int lda, const T *X,	\
		      const int incX, const T beta, T *Y,	\
		      const int incY) {				\
      if (Order == BlasColMajor) {				\
        FUNC(&TransA, &M, &N, &KL, &KU, &alpha, A, &lda,	\
	     X, &incX, &beta, Y, &incY);			\
      }								\
      else {							\
	BLAS_TRANSPOSE TransNew					\
	  = TransA == BlasTrans ? BlasNoTrans : BlasTrans;	\
	FUNC(&TransNew, &N, &M, &KU, &KL, &alpha, A, &lda,	\
	     X, &incX, &beta, Y, &incY);			\
      }								\
    }
    ADEPT_DEFINE_GBMV(double, dgbmv_, zgbmv_)
    ADEPT_DEFINE_GBMV(float,  sgbmv_, cgbmv_)
#undef ADEPT_DEFINE_GBMV
  
  } // End namespace internal
  
} // End namespace adept
  

#else // Don't have BLAS


namespace adept {

  namespace internal {

    // Fallback definitions used when the library is compiled without
    // BLAS: each function has the same signature as its BLAS-backed
    // counterpart, is defined for double and float, and does nothing
    // but throw feature_not_available.

    // Matrix-matrix multiplication for general dense matrices
#define ADEPT_DEFINE_GEMM_STUB(T) \
    void cppblas_gemm(BLAS_ORDER Order, \
                      BLAS_TRANSPOSE TransA, BLAS_TRANSPOSE TransB, \
                      int M, int N, int K, \
                      T alpha, const T *A, int lda, \
                      const T *B, int ldb, \
                      T beta, T *C, int ldc) { \
      throw feature_not_available("Cannot perform matrix-matrix multiplication because compiled without BLAS"); \
    }
    ADEPT_DEFINE_GEMM_STUB(double)
    ADEPT_DEFINE_GEMM_STUB(float)
#undef ADEPT_DEFINE_GEMM_STUB

    // Matrix-vector multiplication for a general dense matrix
#define ADEPT_DEFINE_GEMV_STUB(T) \
    void cppblas_gemv(const BLAS_ORDER Order, \
                      const BLAS_TRANSPOSE TransA, \
                      const int M, const int N, \
                      const T alpha, const T *A, const int lda, \
                      const T *X, const int incX, const T beta, \
                      T *Y, const int incY) { \
      throw feature_not_available("Cannot perform matrix-vector multiplication because compiled without BLAS"); \
    }
    ADEPT_DEFINE_GEMV_STUB(double)
    ADEPT_DEFINE_GEMV_STUB(float)
#undef ADEPT_DEFINE_GEMV_STUB

    // Matrix-matrix multiplication where matrix A is symmetric
#define ADEPT_DEFINE_SYMM_STUB(T) \
    void cppblas_symm(const BLAS_ORDER Order, \
                      const BLAS_SIDE Side, \
                      const BLAS_UPLO Uplo, \
                      const int M, const int N, \
                      const T alpha, const T *A, const int lda, \
                      const T *B, const int ldb, const T beta, \
                      T *C, const int ldc) { \
      throw feature_not_available("Cannot perform symmetric matrix-matrix multiplication because compiled without BLAS"); \
    }
    ADEPT_DEFINE_SYMM_STUB(double)
    ADEPT_DEFINE_SYMM_STUB(float)
#undef ADEPT_DEFINE_SYMM_STUB

    // Matrix-vector multiplication where the matrix is symmetric
#define ADEPT_DEFINE_SYMV_STUB(T) \
    void cppblas_symv(const BLAS_ORDER Order, \
                      const BLAS_UPLO Uplo, \
                      const int N, const T alpha, const T *A, \
                      const int lda, const T *X, const int incX, \
                      const T beta, T *Y, const int incY) { \
      throw feature_not_available("Cannot perform symmetric matrix-vector multiplication because compiled without BLAS"); \
    }
    ADEPT_DEFINE_SYMV_STUB(double)
    ADEPT_DEFINE_SYMV_STUB(float)
#undef ADEPT_DEFINE_SYMV_STUB

    // Matrix-vector multiplication for a general band matrix
#define ADEPT_DEFINE_GBMV_STUB(T) \
    void cppblas_gbmv(const BLAS_ORDER Order, \
                      const BLAS_TRANSPOSE TransA, \
                      const int M, const int N, \
                      const int KL, const int KU, const T alpha, \
                      const T *A, const int lda, const T *X, \
                      const int incX, const T beta, T *Y, \
                      const int incY) { \
      throw feature_not_available("Cannot perform band matrix-vector multiplication because compiled without BLAS"); \
    }
    ADEPT_DEFINE_GBMV_STUB(double)
    ADEPT_DEFINE_GBMV_STUB(float)
#undef ADEPT_DEFINE_GBMV_STUB

  }
}

#endif


================================================
FILE: adept/cpplapack.h
================================================
/* cpplapack.h -- C++ interface to LAPACK

    Copyright (C) 2015-2016 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/

#ifndef AdeptCppLapack_H
#define AdeptCppLapack_H 1                       

#include <vector>
#include <cstddef>

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef HAVE_LAPACK

// External LAPACK Fortran functions used by the wrappers in this
// header. In the Fortran calling convention every argument is passed
// by address; "info" receives the status code of the routine.
extern "C" {
  // Factorization of a general matrix
  void sgetrf_(const int* m, const int* n, float* a, const int* lda,
               int* ipiv, int* info);
  void dgetrf_(const int* m, const int* n, double* a, const int* lda,
               int* ipiv, int* info);

  // Inversion of a general matrix from its factorization
  void sgetri_(const int* n, float* a, const int* lda, const int* ipiv,
               float* work, const int* lwork, int* info);
  void dgetri_(const int* n, double* a, const int* lda, const int* ipiv,
               double* work, const int* lwork, int* info);

  // Factorization of a symmetric matrix
  void ssytrf_(const char* uplo, const int* n, float* a, const int* lda,
               int* ipiv, float* work, const int* lwork, int* info);
  void dsytrf_(const char* uplo, const int* n, double* a, const int* lda,
               int* ipiv, double* work, const int* lwork, int* info);

  // Inversion of a symmetric matrix from its factorization
  void ssytri_(const char* uplo, const int* n, float* a, const int* lda,
               const int* ipiv, float* work, int* info);
  void dsytri_(const char* uplo, const int* n, double* a, const int* lda,
               const int* ipiv, double* work, int* info);

  // Solution of a symmetric system of linear equations
  void ssysv_(const char* uplo, const int* n, const int* nrhs,
              float* a, const int* lda, int* ipiv,
              float* b, const int* ldb,
              float* work, const int* lwork, int* info);
  void dsysv_(const char* uplo, const int* n, const int* nrhs,
              double* a, const int* lda, int* ipiv,
              double* b, const int* ldb,
              double* work, const int* lwork, int* info);

  // Solution of a general system of linear equations
  void sgesv_(const int* n, const int* nrhs, float* a, const int* lda,
              int* ipiv, float* b, const int* ldb, int* info);
  void dgesv_(const int* n, const int* nrhs, double* a, const int* lda,
              int* ipiv, double* b, const int* ldb, int* info);
}

namespace adept {

  // Overloaded functions provide both single- and double-precision
  // versions, and prevent the huge lapacke.h from having to be
  // included in all user code
  namespace internal {
    typedef int lapack_int;

    // Factorize the n-by-n general matrix "a" (leading dimension lda)
    // by forwarding to LAPACK ?getrf with ipiv as the pivot array.
    // The "info" value reported by LAPACK is returned unchanged.
    inline
    int cpplapack_getrf(int n, float* a,  int lda, int* ipiv) {
      int status;
      // The matrix is square, so n is passed as both row and column count
      sgetrf_(&n, &n, a, &lda, ipiv, &status);
      return status;
    }
    // Double-precision overload of the above
    inline
    int cpplapack_getrf(int n, double* a, int lda, int* ipiv) {
      int status;
      dgetrf_(&n, &n, a, &lda, ipiv, &status);
      return status;
    }

    // Invert a general matrix by forwarding to LAPACK ?getri. "a"
    // (n-by-n, leading dimension lda) and "ipiv" are the arrays that
    // were passed to cpplapack_getrf. Returns the LAPACK "info" code;
    // if the initial workspace query reports an error then that code
    // is returned straight away without attempting the inversion.
    inline
    int cpplapack_getri(int n, float* a,  int lda, const int* ipiv) {
      int info = 0;
      float work_query = 0.0f;
      int lwork = -1;
      // Find out how much work memory required (lwork == -1 requests
      // a workspace query only)
      sgetri_(&n, a, &lda, ipiv, &work_query, &lwork, &info);
      if (info != 0) {
        // The arguments were rejected, so work_query cannot be trusted
        return info;
      }
      lwork = static_cast<int>(work_query);
      if (lwork < 1) {
        // Never hand LAPACK an empty workspace: taking &work[0] of an
        // empty vector would be undefined behaviour
        lwork = 1;
      }
      std::vector<float> work(static_cast<std::size_t>(lwork));
      // Do full calculation
      sgetri_(&n, a, &lda, ipiv, &work[0], &lwork, &info);
      return info;
    }
    // Double-precision overload of the above
    inline
    int cpplapack_getri(int n, double* a,  int lda, const int* ipiv) {
      int info = 0;
      double work_query = 0.0;
      int lwork = -1;
      // Find out how much work memory required (lwork == -1 requests
      // a workspace query only)
      dgetri_(&n, a, &lda, ipiv, &work_query, &lwork, &info);
      if (info != 0) {
        // The arguments were rejected, so work_query cannot be trusted
        return info;
      }
      lwork = static_cast<int>(work_query);
      if (lwork < 1) {
        // Never hand LAPACK an empty workspace: taking &work[0] of an
        // empty vector would be undefined behaviour
        lwork = 1;
      }
      std::vector<double> work(static_cast<std::size_t>(lwork));
      // Do full calculation
      dgetri_(&n, a, &lda, ipiv, &work[0], &lwork, &info);
      return info;
    }

    // Factorize a symmetric matrix by forwarding to LAPACK ?sytrf.
    // "uplo" selects which triangle of "a" (n-by-n, leading dimension
    // lda) is used and "ipiv" receives the pivot information. Returns
    // the LAPACK "info" code; if the initial workspace query reports
    // an error then that code is returned straight away without
    // attempting the factorization.
    inline
    int cpplapack_sytrf(char uplo, int n, float* a, int lda, int* ipiv) {
      int info = 0;
      float work_query = 0.0f;
      int lwork = -1;
      // Find out how much work memory required (lwork == -1 requests
      // a workspace query only)
      ssytrf_(&uplo, &n, a, &lda, ipiv, &work_query, &lwork, &info);
      if (info != 0) {
        // The arguments were rejected, so work_query cannot be trusted
        return info;
      }
      lwork = static_cast<int>(work_query);
      if (lwork < 1) {
        // Never hand LAPACK an empty workspace: taking &work[0] of an
        // empty vector would be undefined behaviour
        lwork = 1;
      }
      std::vector<float> work(static_cast<std::size_t>(lwork));
      // Do full calculation
      ssytrf_(&uplo, &n, a, &lda, ipiv, &work[0], &lwork, &info);
      return info;
    }
    // Double-precision overload of the above
    inline
    int cpplapack_sytrf(char uplo, int n, double* a, int lda, int* ipiv) {
      int info = 0;
      double work_query = 0.0;
      int lwork = -1;
      // Find out how much work memory required (lwork == -1 requests
      // a workspace query only)
      dsytrf_(&uplo, &n, a, &lda, ipiv, &work_query, &lwork, &info);
      if (info != 0) {
        // The arguments were rejected, so work_query cannot be trusted
        return info;
      }
      lwork = static_cast<int>(work_query);
      if (lwork < 1) {
        // Never hand LAPACK an empty workspace: taking &work[0] of an
        // empty vector would be undefined behaviour
        lwork = 1;
      }
      std::vector<double> work(static_cast<std::size_t>(lwork));
      // Do full calculation
      dsytrf_(&uplo, &n, a, &lda, ipiv, &work[0], &lwork, &info);
      return info;
    }

    // Invert a symmetric matrix by forwarding to LAPACK ?sytri. "a"
    // (n-by-n, leading dimension lda), "uplo" and "ipiv" are the
    // values that were passed to cpplapack_sytrf. Returns the LAPACK
    // "info" code.
    inline
    int cpplapack_sytri(char uplo, int n, float* a, int lda, const int* ipiv) {
      int info = 0;
      // The workspace holds n elements, but at least one so that
      // &work[0] is always valid (n == 0) and so that a negative n is
      // reported by LAPACK rather than being converted to a huge
      // unsigned allocation size
      std::vector<float> work(static_cast<std::size_t>(n > 0 ? n : 1));
      ssytri_(&uplo, &n, a, &lda, ipiv, &work[0], &info);
      return info;
    }
    // Double-precision overload of the above
    inline
    int cpplapack_sytri(char uplo, int n, double* a, int lda, const int* ipiv) {
      int info = 0;
      // See the single-precision overload for the sizing rationale
      std::vector<double> work(static_cast<std::size_t>(n > 0 ? n : 1));
      dsytri_(&uplo, &n, a, &lda, ipiv, &work[0], &info);
      return info;
    }

    // Solve system of linear equations with general matrix by
    // forwarding to LAPACK ?gesv. "a" is the n-by-n matrix with
    // leading dimension lda, "b" holds the nrhs right-hand sides with
    // leading dimension ldb, and "ipiv" is the pivot array. Returns
    // the LAPACK "info" code.
    inline
    int cpplapack_gesv(int n, int nrhs, float* a, int lda,
                       int* ipiv, float* b, int ldb) {
      int info = 0;
      // The leading dimension of b is ldb; it must not be confused
      // with lda, which describes the layout of a
      sgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, &info);
      return info;
    }
    // Double-precision overload of the above
    inline
    int cpplapack_gesv(int n, int nrhs, double* a, int lda,
                       int* ipiv, double* b, int ldb) {
      int info = 0;
      // The leading dimension of b is ldb; it must not be confused
      // with lda, which describes the layout of a
      dgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, &info);
      return info;
    }

    // Solve system of linear equations with symmetric matrix by
    // forwarding to LAPACK ?sysv. "uplo" selects which triangle of
    // "a" (n-by-n, leading dimension lda) is used, "b" holds the nrhs
    // right-hand sides with leading dimension ldb, and "ipiv" is the
    // pivot array. Returns the LAPACK "info" code; if the initial
    // workspace query reports an error then that code is returned
    // straight away without attempting the solve.
    inline
    int cpplapack_sysv(char uplo, int n, int nrhs, float* a, int lda, int* ipiv,
                       float* b, int ldb) {
      int info = 0;
      float work_query = 0.0f;
      int lwork = -1;
      // Find out how much work memory required (lwork == -1 requests
      // a workspace query only)
      ssysv_(&uplo, &n, &nrhs, a, &lda, ipiv, b, &ldb, &work_query, &lwork, &info);
      if (info != 0) {
        // The arguments were rejected, so work_query cannot be trusted
        return info;
      }
      lwork = static_cast<int>(work_query);
      if (lwork < 1) {
        // Never hand LAPACK an empty workspace: taking &work[0] of an
        // empty vector would be undefined behaviour
        lwork = 1;
      }
      std::vector<float> work(static_cast<std::size_t>(lwork));
      // Do full calculation
      ssysv_(&uplo, &n, &nrhs, a, &lda, ipiv, b, &ldb, &work[0], &lwork, &info);
      return info;
    }
    // Double-precision overload of the above
    inline
    int cpplapack_sysv(char uplo, int n, int nrhs, double* a, int lda, int* ipiv,
                       double* b, int ldb) {
      int info = 0;
      double work_query = 0.0;
      int lwork = -1;
      // Find out how much work memory required (lwork == -1 requests
      // a workspace query only)
      dsysv_(&uplo, &n, &nrhs, a, &lda, ipiv, b, &ldb, &work_query, &lwork, &info);
      if (info != 0) {
        // The arguments were rejected, so work_query cannot be trusted
        return info;
      }
      lwork = static_cast<int>(work_query);
      if (lwork < 1) {
        // Never hand LAPACK an empty workspace: taking &work[0] of an
        // empty vector would be undefined behaviour
        lwork = 1;
      }
      std::vector<double> work(static_cast<std::size_t>(lwork));
      // Do full calculation
      dsysv_(&uplo, &n, &nrhs, a, &lda, ipiv, b, &ldb, &work[0], &lwork, &info);
      return info;
    }

  }
}

#endif

#endif


================================================
FILE: adept/index.cpp
================================================
/* index.cpp -- Definitions of "end" and "__" for array indexing

    Copyright (C) 2015 European Centre for Medium-Range Weather Forecasts

    Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/

#include <adept/RangeIndex.h>

namespace adept {

  // Definitions of the two global objects "end" and "__" used for
  // array indexing; their types are provided by adept/RangeIndex.h.
  // NOTE(review): presumably "end" stands for the last element of a
  // dimension and "__" for the whole of a dimension in index
  // expressions -- confirm against RangeIndex.h.
  ::adept::internal::EndIndex end;
  ::adept::internal::AllIndex __;

}


================================================
FILE: adept/inv.cpp
================================================
/* inv.cpp -- Invert matrices

    Copyright (C) 2015-2016 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/
                             
#include <vector>

#include <adept/Array.h>
#include <adept/SpecialMatrix.h>

#ifndef AdeptSource_H
#include "cpplapack.h"
#endif

#ifdef HAVE_LAPACK

namespace adept {

  using namespace internal;
  
  // -------------------------------------------------------------------
  // Invert general square matrix A
  // -------------------------------------------------------------------
  // The inverse is computed in a separately allocated column-major
  // matrix that is filled from A and then passed to LAPACK ?getrf
  // followed by ?getri.  Throws invalid_operation if A is not square,
  // and matrix_ill_conditioned if either LAPACK call returns a
  // non-zero status code.
  template <typename Type>
  Array<2,Type,false> 
  inv(const Array<2,Type,false>& A) {
    using internal::cpplapack_getrf;
    using internal::cpplapack_getri;

    const bool is_square = (A.dimension(0) == A.dimension(1));
    if (!is_square) {
      throw invalid_operation("Only square matrices can be inverted"
                              ADEPT_EXCEPTION_LOCATION);
    }

    // Column-major storage is used for the matrix handed to LAPACK
    Array<2,Type,false> inverse;
    inverse.resize_column_major(A.dimensions());
    inverse = A;

    std::vector<lapack_int> pivots(inverse.dimension(0));

    // Step 1: factorization
    const lapack_int factorize_status
      = cpplapack_getrf(inverse.dimension(0), inverse.data(),
                        inverse.offset(1), &pivots[0]);
    if (factorize_status != 0) {
      std::stringstream message;
      message << "Failed to factorize matrix: LAPACK ?getrf returned code "
              << factorize_status;
      throw(matrix_ill_conditioned(message.str() ADEPT_EXCEPTION_LOCATION));
    }

    // Step 2: inversion from the factors, reusing the same pivots
    const lapack_int invert_status
      = cpplapack_getri(inverse.dimension(0), inverse.data(),
                        inverse.offset(1), &pivots[0]);
    if (invert_status != 0) {
      std::stringstream message;
      message << "Failed to invert matrix: LAPACK ?getri returned code "
              << invert_status;
      throw(matrix_ill_conditioned(message.str() ADEPT_EXCEPTION_LOCATION));
    }

    return inverse;
  }


  // -------------------------------------------------------------------
  // Invert symmetric matrix A
  // -------------------------------------------------------------------
  // The inverse is computed in a separately allocated symmetric
  // matrix that is filled from A and then passed to LAPACK ?sytrf
  // followed by ?sytri.  Throws matrix_ill_conditioned if either
  // LAPACK call returns a non-zero status code.
  template <typename Type, SymmMatrixOrientation Orient>
  SpecialMatrix<Type,SymmEngine<Orient>,false> 
  inv(const SpecialMatrix<Type,SymmEngine<Orient>,false>& A) {
    using internal::cpplapack_sytrf;
    using internal::cpplapack_sytri;

    SpecialMatrix<Type,SymmEngine<Orient>,false> inverse;
    inverse.resize(A.dimension());
    inverse = A;

    // The symmetric matrix is treated as column-major, so the
    // orientation decides which triangle LAPACK is told to use
    const char uplo = (Orient == ROW_LOWER_COL_UPPER) ? 'U' : 'L';

    std::vector<lapack_int> pivots(inverse.dimension(0));

    // Step 1: factorization
    const lapack_int factorize_status
      = cpplapack_sytrf(uplo, inverse.dimension(), inverse.data(),
                        inverse.offset(), &pivots[0]);
    if (factorize_status != 0) {
      std::stringstream message;
      message << "Failed to factorize symmetric matrix: LAPACK ?sytrf returned code "
              << factorize_status;
      throw(matrix_ill_conditioned(message.str() ADEPT_EXCEPTION_LOCATION));
    }

    // Step 2: inversion from the factors, reusing the same pivots
    const lapack_int invert_status
      = cpplapack_sytri(uplo, inverse.dimension(), inverse.data(),
                        inverse.offset(), &pivots[0]);
    if (invert_status != 0) {
      std::stringstream message;
      message << "Failed to invert symmetric matrix: LAPACK ?sytri returned code "
              << invert_status;
      throw(matrix_ill_conditioned(message.str() ADEPT_EXCEPTION_LOCATION));
    }

    return inverse;
  }

}

#else // LAPACK not available
    
namespace adept {

  using namespace internal;

  // -------------------------------------------------------------------
  // Invert general square matrix A
  // -------------------------------------------------------------------
  // Placeholder used when the library is built without LAPACK: it
  // performs no computation and always throws feature_not_available.
  template <typename Type>
  Array<2,Type,false> 
  inv(const Array<2,Type,false>& A) {
    feature_not_available
      no_lapack("Cannot invert matrix because compiled without LAPACK");
    throw no_lapack;
  }

  // -------------------------------------------------------------------
  // Invert symmetric matrix A
  // -------------------------------------------------------------------
  // Placeholder used when the library is built without LAPACK: it
  // performs no computation and always throws feature_not_available.
  template <typename Type, SymmMatrixOrientation Orient>
  SpecialMatrix<Type,SymmEngine<Orient>,false> 
  inv(const SpecialMatrix<Type,SymmEngine<Orient>,false>& A) {
    feature_not_available
      no_lapack("Cannot invert matrix because compiled without LAPACK");
    throw no_lapack;
  }
  
}

#endif

namespace adept {
  // -------------------------------------------------------------------
  // Explicit instantiations
  // -------------------------------------------------------------------
  // For one element type, ADEPT_EXPLICIT_INV instantiates the general
  // inv() together with the symmetric inv() for both orientations
  // (ROW_LOWER_COL_UPPER and ROW_UPPER_COL_LOWER).  It is invoked for
  // float and double only and then undefined again so that the macro
  // does not leak out of this file.  The instantiations are emitted
  // in both the LAPACK and the no-LAPACK configuration since this
  // namespace block lies outside the #ifdef HAVE_LAPACK section.
#define ADEPT_EXPLICIT_INV(TYPE)					\
  template Array<2,TYPE,false>						\
  inv(const Array<2,TYPE,false>& A);					\
  template SpecialMatrix<TYPE,SymmEngine<ROW_LOWER_COL_UPPER>,false>	\
  inv(const SpecialMatrix<TYPE,SymmEngine<ROW_LOWER_COL_UPPER>,false>&); \
  template SpecialMatrix<TYPE,SymmEngine<ROW_UPPER_COL_LOWER>,false>	\
  inv(const SpecialMatrix<TYPE,SymmEngine<ROW_UPPER_COL_LOWER>,false>&)

  ADEPT_EXPLICIT_INV(float);
  ADEPT_EXPLICIT_INV(double);

#undef ADEPT_EXPLICIT_INV
  
}


================================================
FILE: adept/jacobian.cpp
================================================
/* jacobian.cpp -- Computation of Jacobian matrix

    Copyright (C) 2012-2014 University of Reading
    Copyright (C) 2015-2020 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#ifdef _OPENMP
#include <omp.h>
#endif

#include <adept_arrays.h>

namespace adept {

  namespace internal {
    // Number of gradient vectors propagated together in one pass
    // through the statements by the Jacobian functions below: equal
    // to ADEPT_MULTIPASS_SIZE when ADEPT_REAL_PACKET_SIZE is 1,
    // otherwise equal to ADEPT_REAL_PACKET_SIZE.
    // NOTE(review): this is a C++ constant rather than a preprocessor
    // macro, so "#if" directives that mention MULTIPASS_SIZE do not
    // see this value.
    static const int MULTIPASS_SIZE = ADEPT_REAL_PACKET_SIZE == 1 ? ADEPT_MULTIPASS_SIZE : ADEPT_REAL_PACKET_SIZE;
  }

  using namespace internal;

  // Compile-time guard on the size of Real; the function body only
  // contains a static assertion and returns the value 1 converted to
  // T.
  // NOTE(review): no call to this template is visible in this part of
  // the file -- confirm where (or whether) it is instantiated.
  template <typename T>
  T _check_long_double() {
    // The user may have requested Real to be of type "long double" by
    // specifying ADEPT_REAL_TYPE_SIZE=16. If the present system can
    // only support double then sizeof(long double) will be 8, but
    // Adept will not be emitting the best code for this, so it is
    // probably better to fail forcing the user to specify
    // ADEPT_REAL_TYPE_SIZE=8.
    // The assertion passes when ADEPT_REAL_TYPE_SIZE is not 16, or
    // when it is 16 and sizeof(Real) is 16 as well.
    ADEPT_STATIC_ASSERT(ADEPT_REAL_TYPE_SIZE != 16 || ADEPT_REAL_TYPE_SIZE == sizeof(Real),
			COMPILER_DOES_NOT_SUPPORT_16_BYTE_LONG_DOUBLE);
    return 1;
  }

#if ADEPT_REAL_PACKET_SIZE > 1
  // Packet version of the forward kernel (used when
  // ADEPT_REAL_PACKET_SIZE > 1).  gradient_multipass_b holds
  // MULTIPASS_SIZE consecutive values per gradient index; for each
  // statement in turn, the entry of its left-hand side is overwritten
  // with the sum over its operations of multiplier times the entry of
  // the corresponding right-hand-side index.
  void
  Stack::jacobian_forward_kernel(Real* __restrict gradient_multipass_b) const
  {
    // Start from statement 1: the operations of statement ist begin
    // where those of statement ist-1 end
    for (uIndex ist = 1; ist < n_statements_; ist++) {
      const Statement& current = statement_[ist];
      const uIndex op_begin = statement_[ist-1].end_plus_one;
      const uIndex op_end   = current.end_plus_one;

      // Accumulate into a temporary rather than directly into the
      // left-hand-side entry, in case the left-hand side also appears
      // on the right-hand side
      Packet<Real> lhs_gradient; // Zeroed automatically
      for (uIndex iop = op_begin; iop < op_end; iop++) {
        Packet<Real> rhs_gradient(gradient_multipass_b+index_[iop]*MULTIPASS_SIZE);
        Packet<Real> weight(multiplier_[iop]);
        lhs_gradient += weight * rhs_gradient;
      }

      // Store the result for the left-hand side of this statement
      lhs_gradient.put(gradient_multipass_b+current.index*MULTIPASS_SIZE);
    }
  }    
#else
  // Scalar version of the forward kernel (used when
  // ADEPT_REAL_PACKET_SIZE is not greater than 1).
  // gradient_multipass_b holds MULTIPASS_SIZE consecutive values per
  // gradient index; for each statement in turn, the entry of its
  // left-hand side is overwritten with the sum over its operations of
  // multiplier times the entry of the corresponding right-hand-side
  // index.
  void
  Stack::jacobian_forward_kernel(Real* __restrict gradient_multipass_b) const
  {
    // Start from statement 1: the operations of statement ist begin
    // where those of statement ist-1 end
    for (uIndex ist = 1; ist < n_statements_; ist++) {
      const Statement& current = statement_[ist];
      const uIndex op_begin = statement_[ist-1].end_plus_one;
      const uIndex op_end   = current.end_plus_one;

      // Accumulate into a temporary rather than directly into the
      // left-hand-side entry, in case the left-hand side also appears
      // on the right-hand side
      Block<MULTIPASS_SIZE,Real> lhs_gradient; // Zeroed automatically
      for (uIndex iop = op_begin; iop < op_end; iop++) {
        const Real weight = multiplier_[iop];
        const Real* rhs_gradient
          = gradient_multipass_b + index_[iop]*MULTIPASS_SIZE;
        for (uIndex k = 0; k < MULTIPASS_SIZE; k++) {
          lhs_gradient[k] += weight*rhs_gradient[k];
        }
      }

      // Store the result for the left-hand side of this statement
      Real* destination = gradient_multipass_b + current.index*MULTIPASS_SIZE;
      for (uIndex k = 0; k < MULTIPASS_SIZE; k++) {
        destination[k] = lhs_gradient[k];
      }
    }
  }    
#endif

  // Variant of the forward kernel for a final, partially filled
  // block: the storage layout is unchanged (MULTIPASS_SIZE values
  // per gradient index) but only the first n_extra values of each
  // entry are read and written.
  void
  Stack::jacobian_forward_kernel_extra(Real* __restrict gradient_multipass_b,
                                       uIndex n_extra) const
  {
    // Start from statement 1: the operations of statement ist begin
    // where those of statement ist-1 end
    for (uIndex ist = 1; ist < n_statements_; ist++) {
      const Statement& current = statement_[ist];
      const uIndex op_begin = statement_[ist-1].end_plus_one;
      const uIndex op_end   = current.end_plus_one;

      // Accumulate into a temporary rather than directly into the
      // left-hand-side entry, in case the left-hand side also appears
      // on the right-hand side
      Block<MULTIPASS_SIZE,Real> lhs_gradient; // Zeroed automatically
      for (uIndex iop = op_begin; iop < op_end; iop++) {
        const Real weight = multiplier_[iop];
        const Real* rhs_gradient
          = gradient_multipass_b + index_[iop]*MULTIPASS_SIZE;
        for (uIndex k = 0; k < n_extra; k++) {
          lhs_gradient[k] += weight*rhs_gradient[k];
        }
      }

      // Store the result for the left-hand side of this statement
      Real* destination = gradient_multipass_b + current.index*MULTIPASS_SIZE;
      for (uIndex k = 0; k < n_extra; k++) {
        destination[k] = lhs_gradient[k];
      }
    }
  }    


  // Compute the Jacobian matrix, parallelized using OpenMP. Normally
  // the user would call the jacobian or jacobian_forward functions,
  // and the OpenMP version would only be called if OpenMP is
  // available and the Jacobian matrix is large enough for
  // parallelization to be worthwhile.  Note that jacobian_out must be
  // allocated to be at least of size m*n, where m is the number of
  // dependent variables and n is the number of independents. The
  // offsets in memory of the two dimensions are provided by
  // dep_offset and indep_offset: the derivative of dependent idep
  // with respect to independent iindep is written to
  // jacobian_out[idep*dep_offset + iindep*indep_offset]. This is
  // implemented using a forward pass, appropriate for m>=n.
  //
  // This function does not itself check that the independents and
  // dependents have been identified, nor does it replace
  // non-positive offsets; jacobian_forward does both before calling
  // it.
  void
  Stack::jacobian_forward_openmp(Real* jacobian_out,
				 Index dep_offset, Index indep_offset) const
  {

    // Number of blocks to cycle through, including a possible last
    // block containing fewer than MULTIPASS_SIZE variables
    int n_block = (n_independent() + MULTIPASS_SIZE - 1)
      / MULTIPASS_SIZE;
    // Number of independents in that last, partially filled block
    // (zero if n_independent() is a multiple of MULTIPASS_SIZE)
    uIndex n_extra = n_independent() % MULTIPASS_SIZE;
    
#pragma omp parallel
    {
      // Declared inside the parallel region, so each thread allocates
      // and later frees its own working buffer holding
      // MULTIPASS_SIZE values per gradient index
      //      std::vector<Block<MULTIPASS_SIZE,Real> > 
      //	gradient_multipass_b(max_gradient_);
      uIndex gradient_multipass_size = max_gradient_*MULTIPASS_SIZE;
      Real* __restrict gradient_multipass_b 
	= alloc_aligned<Real>(gradient_multipass_size);
      
#pragma omp for schedule(static)
      for (int iblock = 0; iblock < n_block; iblock++) {
	// Set the index of the first independent variable in this block
	uIndex i_independent =  MULTIPASS_SIZE * iblock;
	
	uIndex block_size = MULTIPASS_SIZE;
	// If this is the last iteration and the number of extra
	// elements is non-zero, then set the block size to the number
	// of extra elements. If the number of extra elements is zero,
	// then the number of independent variables is exactly divisible
	// by MULTIPASS_SIZE, so the last iteration will be the
	// same as all the rest.
	if (iblock == n_block-1 && n_extra > 0) {
	  block_size = n_extra;
	}
	
	// Set the initial gradients all to zero
	for (uIndex i = 0; i < gradient_multipass_size; i++) {
	  gradient_multipass_b[i] = 0.0;
	}
	// Each seed vector has one non-zero entry of 1.0
	for (uIndex i = 0; i < block_size; i++) {
	  gradient_multipass_b[independent_index_[i_independent+i]*MULTIPASS_SIZE+i] = 1.0;
	}

	// The full-width kernel is used even for a partially filled
	// last block; only the first block_size values of each entry
	// are copied out below
	jacobian_forward_kernel(gradient_multipass_b);

	// Copy the gradients corresponding to the dependent variables
	// into the Jacobian matrix
	if (indep_offset == 1) {
	  // Same destination index as in the general branch below,
	  // written without the multiplication by indep_offset
	  for (uIndex idep = 0; idep < n_dependent(); idep++) {
	    for (uIndex i = 0; i < block_size; i++) {
	      jacobian_out[idep*dep_offset+i_independent+i]
		= gradient_multipass_b[dependent_index_[idep]*MULTIPASS_SIZE+i];
	    }
	  }
	}
	else {
	  for (uIndex idep = 0; idep < n_dependent(); idep++) {
	    for (uIndex i = 0; i < block_size; i++) {
	      jacobian_out[(i_independent+i)*indep_offset+idep*dep_offset]
		= gradient_multipass_b[dependent_index_[idep]*MULTIPASS_SIZE+i];
	    }
	  }
	}
      } // End of loop over blocks
      free_aligned(gradient_multipass_b);
    } // End of parallel section
  } // End of jacobian function


  // Compute the Jacobian matrix using forward passes, which is the
  // appropriate choice for m>=n, where m is the number of dependent
  // variables and n the number of independents.  jacobian_out must
  // already be allocated to be of size m*n; the derivative of
  // dependent idep with respect to independent iindep is written to
  // jacobian_out[idep*dep_offset + iindep*indep_offset].  The
  // independents and dependents must have been identified beforehand
  // with the functions "independent" and "dependent", otherwise
  // dependents_or_independents_not_identified is thrown.
  void
  Stack::jacobian_forward(Real* jacobian_out,
                          Index dep_offset, Index indep_offset) const
  {
    if (independent_index_.empty() || dependent_index_.empty()) {
      throw(dependents_or_independents_not_identified());
    }

    // An offset that is zero (or negative) is replaced by the size of
    // the other dimension, which assumes that the full Jacobian
    // matrix is contiguous in memory.
    if (dep_offset <= 0) {
      dep_offset = n_independent();
    }
    if (indep_offset <= 0) {
      indep_offset = n_dependent();
    }

#ifdef _OPENMP
    // Delegate to the parallel version if OpenMP is usable, there is
    // more than one block of work and more than one thread
    const bool use_parallel_version = have_openmp_ 
      && !openmp_manually_disabled_
      && n_independent() > MULTIPASS_SIZE
      && omp_get_max_threads() > 1;
    if (use_parallel_version) {
      jacobian_forward_openmp(jacobian_out, dep_offset, indep_offset);
      return;
    }
#endif

    // Columns of the Jacobian are processed MULTIPASS_SIZE at a time
    // for optimization reasons: work out the number of complete
    // blocks and the number of columns left over at the end
    const uIndex n_full_blocks = n_independent() / MULTIPASS_SIZE;
    const uIndex n_leftover    = n_independent() % MULTIPASS_SIZE;

    // Working buffer with MULTIPASS_SIZE values per gradient index
    const uIndex buffer_length = max_gradient_*MULTIPASS_SIZE;
    Real* __restrict gradient_multipass_b 
      = alloc_aligned<Real>(buffer_length);

    // Index of the first independent variable of the current block
    uIndex first_column = 0;

    // Complete blocks of MULTIPASS_SIZE columns
    for (uIndex iblock = 0; iblock < n_full_blocks;
         iblock++, first_column += MULTIPASS_SIZE) {

      // Clear all gradients
      for (uIndex j = 0; j < buffer_length; j++) {
        gradient_multipass_b[j] = 0.0;
      }

      // Seed: independent number first_column+k gets 1.0 in slot k
      for (uIndex k = 0; k < MULTIPASS_SIZE; k++) {
        gradient_multipass_b[independent_index_[first_column+k]*MULTIPASS_SIZE+k] = 1.0;
      }

      jacobian_forward_kernel(gradient_multipass_b);

      // Transfer the gradients of the dependent variables to the
      // Jacobian matrix; the first branch is the special case of unit
      // stride along the independent dimension
      if (indep_offset == 1) {
        for (uIndex idep = 0; idep < n_dependent(); idep++) {
          const Real* source
            = gradient_multipass_b + dependent_index_[idep]*MULTIPASS_SIZE;
          for (uIndex k = 0; k < MULTIPASS_SIZE; k++) {
            jacobian_out[idep*dep_offset+first_column+k] = source[k];
          }
        }
      }
      else {
        for (uIndex idep = 0; idep < n_dependent(); idep++) {
          const Real* source
            = gradient_multipass_b + dependent_index_[idep]*MULTIPASS_SIZE;
          for (uIndex k = 0; k < MULTIPASS_SIZE; k++) {
            jacobian_out[(first_column+k)*indep_offset+idep*dep_offset]
              = source[k];
          }
        }
      }
    }
    
    // Remaining columns, if any: same procedure restricted to the
    // first n_leftover slots of each entry
    if (n_leftover > 0) {
      for (uIndex j = 0; j < buffer_length; j++) {
        gradient_multipass_b[j] = 0.0;
      }

      for (uIndex k = 0; k < n_leftover; k++) {
        gradient_multipass_b[independent_index_[first_column+k]*MULTIPASS_SIZE+k] = 1.0;
      }

      jacobian_forward_kernel_extra(gradient_multipass_b, n_leftover);

      if (indep_offset == 1) {
        for (uIndex idep = 0; idep < n_dependent(); idep++) {
          const Real* source
            = gradient_multipass_b + dependent_index_[idep]*MULTIPASS_SIZE;
          for (uIndex k = 0; k < n_leftover; k++) {
            jacobian_out[idep*dep_offset+first_column+k] = source[k];
          }
        }
      }
      else {
        for (uIndex idep = 0; idep < n_dependent(); idep++) {
          const Real* source
            = gradient_multipass_b + dependent_index_[idep]*MULTIPASS_SIZE;
          for (uIndex k = 0; k < n_leftover; k++) {
            jacobian_out[(first_column+k)*indep_offset+idep*dep_offset]
              = source[k];
          }
        }
      }
    }

    free_aligned(gradient_multipass_b);
  }


  // Compute the Jacobian matrix, parallelized using OpenMP.  Normally
  // the user would call the jacobian or jacobian_reverse functions,
  // and the OpenMP version would only be called if OpenMP is
  // available and the Jacobian matrix is large enough for
  // parallelization to be worthwhile.  Note that jacobian_out must be
  // allocated to be at least of size m*n, where m is the number of
  // dependent variables and n is the number of independents. The
  // offsets in memory of the two dimensions are provided by
  // dep_offset and indep_offset: the derivative of dependent idep
  // with respect to independent iindep is written to
  // jacobian_out[idep*dep_offset + iindep*indep_offset].  This is
  // implemented using a reverse pass, appropriate for m<n.
  //
  // This function does not itself check that the independents and
  // dependents have been identified, nor does it replace
  // non-positive offsets; jacobian_reverse does both before calling
  // it.
  void
  Stack::jacobian_reverse_openmp(Real* jacobian_out,
				 Index dep_offset, Index indep_offset) const
  {

    // Number of blocks to cycle through, including a possible last
    // block containing fewer than MULTIPASS_SIZE variables
    int n_block = (n_dependent() + MULTIPASS_SIZE - 1)
      / MULTIPASS_SIZE;
    // Number of dependents in that last, partially filled block (zero
    // if n_dependent() is a multiple of MULTIPASS_SIZE)
    uIndex n_extra = n_dependent() % MULTIPASS_SIZE;
    
    // Inside the OpenMP loop, the "this" pointer may be NULL if the
    // adept::Stack pointer is declared as thread-local and if the
    // OpenMP memory model uses thread-local storage for private
    // data. If this is the case then local pointers to or copies of
    // the following members of the adept::Stack object may need to be
    // made: dependent_index_ n_statements_ statement_ multiplier_
    // index_ independent_index_ n_dependent() n_independent().
    // Limited testing implies this is OK though.

#pragma omp parallel
    {
      // Declared inside the parallel region, so each thread has its
      // own working array with one Block of MULTIPASS_SIZE values per
      // gradient index
      std::vector<Block<MULTIPASS_SIZE,Real> > 
	gradient_multipass_b(max_gradient_);
      
#pragma omp for schedule(static)
      for (int iblock = 0; iblock < n_block; iblock++) {
	// Set the index to the dependent variables for this block
	uIndex i_dependent =  MULTIPASS_SIZE * iblock;
	
	uIndex block_size = MULTIPASS_SIZE;
	// If this is the last iteration and the number of extra
	// elements is non-zero, then set the block size to the number
	// of extra elements. If the number of extra elements is zero,
	// then the number of dependent variables is exactly divisible
	// by MULTIPASS_SIZE, so the last iteration will be the
	// same as all the rest.
	if (iblock == n_block-1 && n_extra > 0) {
	  block_size = n_extra;
	}

	// Set the initial gradients all to zero
	for (std::size_t i = 0; i < gradient_multipass_b.size(); i++) {
	  gradient_multipass_b[i].zero();
	}
	// Each seed vector has one non-zero entry of 1.0
	for (uIndex i = 0; i < block_size; i++) {
	  gradient_multipass_b[dependent_index_[i_dependent+i]][i] = 1.0;
	}

	// Loop backward through the derivative statements
	for (uIndex ist = n_statements_-1; ist > 0; ist--) {
	  const Statement& statement = statement_[ist];
	  // We copy the gradient of the LHS to "a" and zero it before
	  // distributing it to the RHS, in case the LHS also appears
	  // on the RHS of this statement
	  Real a[MULTIPASS_SIZE];
	  // NOTE(review): MULTIPASS_SIZE is defined in this file as a
	  // C++ constant, not a macro, so the preprocessor evaluates
	  // it as 0 in the "#if" tests below; unless suitable macros
	  // are defined elsewhere, the "large block" branches are
	  // never compiled -- confirm whether ADEPT_-prefixed macros
	  // were intended.
#if MULTIPASS_SIZE > MULTIPASS_SIZE_ZERO_CHECK
	  // For large blocks, we only process the ones where a[i] is
	  // non-zero
	  uIndex i_non_zero[MULTIPASS_SIZE];
#endif
	  uIndex n_non_zero = 0;
	  for (uIndex i = 0; i < block_size; i++) {
	    a[i] = gradient_multipass_b[statement.index][i];
	    gradient_multipass_b[statement.index][i] = 0.0;
	    if (a[i] != 0.0) {
#if MULTIPASS_SIZE > MULTIPASS_SIZE_ZERO_CHECK
	      i_non_zero[n_non_zero++] = i;
#else
	      // Here n_non_zero merely flags that some a[i] is non-zero
	      n_non_zero = 1;
#endif
	    }
	  }

	  // Only do anything for this statement if any of the a values
	  // are non-zero
	  if (n_non_zero) {
	    // Loop through the operations
	    for (uIndex iop = statement_[ist-1].end_plus_one;
		 iop < statement.end_plus_one; iop++) {
	      // Try to minimize pointer dereferencing by making local
	      // copies
	      Real multiplier = multiplier_[iop];
	      Real* __restrict gradient_multipass 
		= &(gradient_multipass_b[index_[iop]][0]);
#if MULTIPASS_SIZE > MULTIPASS_SIZE_ZERO_CHECK
	      // For large blocks, loop over only the indices
	      // corresponding to non-zero a
	      for (uIndex i = 0; i < n_non_zero; i++) {
		gradient_multipass[i_non_zero[i]] += multiplier*a[i_non_zero[i]];
	      }
#else
	      // For small blocks, do all indices (up to block_size,
	      // since only that many elements of a were set)
	      for (uIndex i = 0; i < block_size; i++) {
	      //	      for (uIndex i = 0; i < MULTIPASS_SIZE; i++) {
		gradient_multipass[i] += multiplier*a[i];
	      }
#endif
	    }
	  }
	} // End of loop over statement
	// Copy the gradients corresponding to the independent
	// variables into the Jacobian matrix
	if (dep_offset == 1) {
	  // Same destination index as in the general branch below,
	  // written without the multiplication by dep_offset
	  for (uIndex iindep = 0; iindep < n_independent(); iindep++) {
	    for (uIndex i = 0; i < block_size; i++) {
	      jacobian_out[iindep*indep_offset+i_dependent+i] 
		= gradient_multipass_b[independent_index_[iindep]][i];
	    }
	  }
	}
	else {
	  for (uIndex iindep = 0; iindep < n_independent(); iindep++) {
	    for (uIndex i = 0; i < block_size; i++) {
	      jacobian_out[iindep*indep_offset+(i_dependent+i)*dep_offset] 
		= gradient_multipass_b[independent_index_[iindep]][i];
	    }
	  }
	}
      } // End of loop over blocks
    } // end #pragma omp parallel
  } // end jacobian_reverse_openmp


  // Compute the Jacobian matrix; note that jacobian_out must be
  // allocated to be of size m*n, where m is the number of dependent
  // variables and n is the number of independents. The independents
  // and dependents must have already been identified with the
  // functions "independent" and "dependent", otherwise this function
  // throws dependents_or_independents_not_identified.  The derivative
  // of dependent idep with respect to independent iindep is written
  // to jacobian_out[idep*dep_offset + iindep*indep_offset].  This is
  // implemented using a reverse pass, appropriate for m<n.
  void
  Stack::jacobian_reverse(Real* jacobian_out,
			  Index dep_offset, Index indep_offset) const
  {
    if (independent_index_.empty() || dependent_index_.empty()) {
      throw(dependents_or_independents_not_identified());
    }

    // If either of the offsets are zero (or negative), set them to
    // the size of the other dimension, which assumes that the full
    // Jacobian matrix is contiguous in memory.
    // NOTE(review): if both offsets are non-positive then both are
    // replaced by a dimension size and neither stride is 1 -- confirm
    // that callers always supply at least one valid offset.
    if (dep_offset <= 0) {
      dep_offset = n_independent();
    }
    if (indep_offset <= 0) {
      indep_offset = n_dependent();
    }

#ifdef _OPENMP
    // Use the parallel version if OpenMP is usable, there is more
    // than one block of rows and more than one thread is available
    if (have_openmp_ 
	&& !openmp_manually_disabled_
	&& n_dependent() > MULTIPASS_SIZE
	&& omp_get_max_threads() > 1) {
      // Call the parallel version
      jacobian_reverse_openmp(jacobian_out,
			      dep_offset, indep_offset);
      return;
    }
#endif

    // Working array with one Block of MULTIPASS_SIZE values per
    // gradient index
    //    gradient_multipass_.resize(max_gradient_);
    std::vector<Block<MULTIPASS_SIZE,Real> > 
      gradient_multipass_b(max_gradient_);

    // For optimization reasons, we process a block of
    // MULTIPASS_SIZE rows of the Jacobian at once; calculate
    // how many blocks are needed and how many extras will remain
    uIndex n_block = n_dependent() / MULTIPASS_SIZE;
    uIndex n_extra = n_dependent() % MULTIPASS_SIZE;
    uIndex i_dependent = 0; // Index of first row in the block we are
			    // currently computing
    // Loop over the blocks of MULTIPASS_SIZE rows
    for (uIndex iblock = 0; iblock < n_block; iblock++) {
      // Set the initial gradients all to zero
      //      zero_gradient_multipass();
      for (std::size_t i = 0; i < gradient_multipass_b.size(); i++) {
	gradient_multipass_b[i].zero();
      }

      // Each seed vector has one non-zero entry of 1.0
      for (uIndex i = 0; i < MULTIPASS_SIZE; i++) {
	gradient_multipass_b[dependent_index_[i_dependent+i]][i] = 1.0;
      }
      // Loop backward through the derivative statements
      for (uIndex ist = n_statements_-1; ist > 0; ist--) {
	const Statement& statement = statement_[ist];
	// We copy the gradient of the LHS to "a" and zero it before
	// distributing it to the RHS, in case the LHS also appears on
	// the RHS of this statement
	Real a[MULTIPASS_SIZE];
	// NOTE(review): MULTIPASS_SIZE is defined in this file as a
	// C++ constant, not a macro, so the preprocessor evaluates it
	// as 0 in the "#if" tests below; unless suitable macros are
	// defined elsewhere, the "large block" branches are never
	// compiled -- confirm whether ADEPT_-prefixed macros were
	// intended.
#if MULTIPASS_SIZE > MULTIPASS_SIZE_ZERO_CHECK
	// For large blocks, we only process the ones where a[i] is
	// non-zero
	uIndex i_non_zero[MULTIPASS_SIZE];
#endif
	uIndex n_non_zero = 0;
	for (uIndex i = 0; i < MULTIPASS_SIZE; i++) {
	  a[i] = gradient_multipass_b[statement.index][i];
	  gradient_multipass_b[statement.index][i] = 0.0;
	  if (a[i] != 0.0) {
#if MULTIPASS_SIZE > MULTIPASS_SIZE_ZERO_CHECK
	    i_non_zero[n_non_zero++] = i;
#else
	    // Here n_non_zero merely flags that some a[i] is non-zero
	    n_non_zero = 1;
#endif
	  }
	}
	// Only do anything for this statement if any of the a values
	// are non-zero
	if (n_non_zero) {
	  // Loop through the operations
	  for (uIndex iop = statement_[ist-1].end_plus_one;
	       iop < statement.end_plus_one; iop++) {
	    // Try to minimize pointer dereferencing by making local
	    // copies
	    Real multiplier = multiplier_[iop];
	    Real* __restrict gradient_multipass 
	      = &(gradient_multipass_b[index_[iop]][0]);
#if MULTIPASS_SIZE > MULTIPASS_SIZE_ZERO_CHECK
	    // For large blocks, loop over only the indices
	    // corresponding to non-zero a
	    for (uIndex i = 0; i < n_non_zero; i++) {
	      gradient_multipass[i_non_zero[i]] += multiplier*a[i_non_zero[i]];
	    }
#else
	    // For small blocks, do all indices
	    for (uIndex i = 0; i < MULTIPASS_SIZE; i++) {
	      gradient_multipass[i] += multiplier*a[i];
	    }
#endif
	  }
	}
      } // End of loop over statement
      // Copy the gradients corresponding to the independent variables
      // into the Jacobian matrix
      if (dep_offset == 1) {
	// Same destination index as in the general branch below,
	// written without the multiplication by dep_offset
	for (uIndex iindep = 0; iindep < n_independent(); iindep++) {
	  for (uIndex i = 0; i < MULTIPASS_SIZE; i++) {
	    jacobian_out[iindep*indep_offset+i_dependent+i] 
	      = gradient_multipass_b[independent_index_[iindep]][i];
	  }
	}
      }
      else {
	for (uIndex iindep = 0; iindep < n_independent(); iindep++) {
	  for (uIndex i = 0; i < MULTIPASS_SIZE; i++) {
	    jacobian_out[iindep*indep_offset+(i_dependent+i)*dep_offset] 
	      = gradient_multipass_b[independent_index_[iindep]][i];
	  }
	}
      }
      i_dependent += MULTIPASS_SIZE;
    } // End of loop over blocks
    
    // Now do the same but for the remaining few rows in the matrix;
    // only the first n_extra values of each Block are used
    if (n_extra > 0) {
      // Set the initial gradients all to zero
      for (std::size_t i = 0; i < gradient_multipass_b.size(); i++) {
	gradient_multipass_b[i].zero();
      }
      //      zero_gradient_multipass();
      // Each seed vector has one non-zero entry of 1.0
      for (uIndex i = 0; i < n_extra; i++) {
	gradient_multipass_b[dependent_index_[i_dependent+i]][i] = 1.0;
      }
      // Loop backward through the derivative statements
      for (uIndex ist = n_statements_-1; ist > 0; ist--) {
	const Statement& statement = statement_[ist];
	// Only the first n_extra elements of "a" are set and used
	Real a[MULTIPASS_SIZE];
#if MULTIPASS_SIZE > MULTIPASS_SIZE_ZERO_CHECK
	uIndex i_non_zero[MULTIPASS_SIZE];
#endif
	uIndex n_non_zero = 0;
	for (uIndex i = 0; i < n_extra; i++) {
	  a[i] = gradient_multipass_b[statement.index][i];
	  gradient_multipass_b[statement.index][i] = 0.0;
	  if (a[i] != 0.0) {
#if MULTIPASS_SIZE > MULTIPASS_SIZE_ZERO_CHECK
	    i_non_zero[n_non_zero++] = i;
#else
	    n_non_zero = 1;
#endif
	  }
	}
	// Skip statements whose LHS gradient is entirely zero
	if (n_non_zero) {
	  for (uIndex iop = statement_[ist-1].end_plus_one;
	       iop < statement.end_plus_one; iop++) {
	    Real multiplier = multiplier_[iop];
	    Real* __restrict gradient_multipass 
	      = &(gradient_multipass_b[index_[iop]][0]);
#if MULTIPASS_SIZE > MULTIPASS_SIZE_ZERO_CHECK
	    for (uIndex i = 0; i < n_non_zero; i++) {
	      gradient_multipass[i_non_zero[i]] += multiplier*a[i_non_zero[i]];
	    }
#else
	    for (uIndex i = 0; i < n_extra; i++) {
	      gradient_multipass[i] += multiplier*a[i];
	    }
#endif
	  }
	}
      }
      // Copy the gradients corresponding to the independent variables
      // into the remaining rows of the Jacobian matrix
      if (dep_offset == 1) {
	for (uIndex iindep = 0; iindep < n_independent(); iindep++) {
	  for (uIndex i = 0; i < n_extra; i++) {
	    jacobian_out[iindep*indep_offset+i_dependent+i] 
	      = gradient_multipass_b[independent_index_[iindep]][i];
	  }
	}
      }
      else {
	for (uIndex iindep = 0; iindep < n_independent(); iindep++) {
	  for (uIndex i = 0; i < n_extra; i++) {
	    jacobian_out[iindep*indep_offset+(i_dependent+i)*dep_offset] 
	      = gradient_multipass_b[independent_index_[iindep]][i];
	  }
	}
      }
    }
  }
  
  // Return the Jacobian matrix in the matrix "jac", using the forward
  // or reverse method depending which would be faster
  void Stack::jacobian(Array<2,Real,false> jac) const {
    // Dimension 0 of "jac" must match the number of dependent
    // variables and dimension 1 the number of independent variables
    const bool shape_ok = (jac.dimension(0) == n_dependent()
			   && jac.dimension(1) == n_independent());
    if (!shape_ok) {
      throw size_mismatch("Jacobian matrix has wrong size");
    }
    // Reverse mode is selected only when there are strictly more
    // independent than dependent variables; otherwise forward mode
    if (n_independent() > n_dependent()) {
      jacobian_reverse(jac.data(), jac.offset(0), jac.offset(1));
    }
    else {
      jacobian_forward(jac.data(), jac.offset(0), jac.offset(1));
    }
  }

  // Return the Jacobian matrix in the matrix "jac", explicitly
  // specifying whether to use the forward or reverse method
  void Stack::jacobian_forward(Array<2,Real,false> jac) const {
    // Reject an output matrix whose shape is not
    // (n_dependent, n_independent)
    const bool shape_ok = (jac.dimension(0) == n_dependent()
			   && jac.dimension(1) == n_independent());
    if (!shape_ok) {
      throw size_mismatch("Jacobian matrix has wrong size");
    }
    // Hand the raw pointer and the offsets of the two dimensions to
    // the pointer-based forward-mode routine
    jacobian_forward(jac.data(), jac.offset(0), jac.offset(1));
  }

  void Stack::jacobian_reverse(Array<2,Real,false> jac) const {
    // Reject an output matrix whose shape is not
    // (n_dependent, n_independent)
    const bool shape_ok = (jac.dimension(0) == n_dependent()
			   && jac.dimension(1) == n_independent());
    if (!shape_ok) {
      throw size_mismatch("Jacobian matrix has wrong size");
    }
    // Hand the raw pointer and the offsets of the two dimensions to
    // the pointer-based reverse-mode routine
    jacobian_reverse(jac.data(), jac.offset(0), jac.offset(1));
  }

  // Return the Jacobian matrix using the forward or reverse method
  // depending which would be faster
  Array<2,Real,false> Stack::jacobian() const {
    // Create the result with shape (n_dependent, n_independent)
    Array<2,Real,false> result(n_dependent(), n_independent());
    // Reverse mode is selected only when there are strictly more
    // independent than dependent variables; otherwise forward mode
    if (n_independent() > n_dependent()) {
      jacobian_reverse(result.data(), result.offset(0), result.offset(1));
    }
    else {
      jacobian_forward(result.data(), result.offset(0), result.offset(1));
    }
    return result;
  }

  // Return the Jacobian matrix, explicitly specifying whether to use
  // the forward or reverse method
  Array<2,Real,false> Stack::jacobian_forward() const {
    // Create the result with shape (n_dependent, n_independent) and
    // fill it using the pointer-based forward-mode routine
    Array<2,Real,false> result(n_dependent(), n_independent());
    jacobian_forward(result.data(), result.offset(0), result.offset(1));
    return result;
  }

  Array<2,Real,false> Stack::jacobian_reverse() const {
    // Create the result with shape (n_dependent, n_independent) and
    // fill it using the pointer-based reverse-mode routine
    Array<2,Real,false> result(n_dependent(), n_independent());
    jacobian_reverse(result.data(), result.offset(0), result.offset(1));
    return result;
  }

} // End namespace adept


================================================
FILE: adept/line_search.cpp
================================================
/* line_search.cpp -- Approximate minimization of function along a line

    Copyright (C) 2020 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#include <limits>
#include <cmath>
#include <adept/Minimizer.h>

namespace adept {

  // Compute the cost function "cf" and gradient vector "gradient",
  // along with the scalar gradient "grad" in the search direction
  // "direction" (normalized with "dir_scaling"), from the state
  // vector "x" plus a step "step_size" in the search direction. If
  // the resulting cost function and gradient satisfy the Wolfe
  // conditions for sufficient convergence, copy the new state vector
  // to "x" and the step size to "final_step_size", and return
  // MINIMIZER_STATUS_SUCCESS.  Otherwise, return
  // MINIMIZER_STATUS_NOT_YET_CONVERGED.  Error conditions
  // MINIMIZER_STATUS_INVALID_COST_FUNCTION and
  // MINIMIZER_STATUS_INVALID_GRADIENT are also possible.
  MinimizerStatus
  Minimizer::line_search_gradient_check(
	Optimizable& optimizable, // Supplies the cost function and gradient
	Vector x, // State at start of line; overwritten only on success
	const Vector& direction, // Un-normalized search direction
	Vector test_x, // Receives the candidate state (working memory)
	Real& final_step_size, // Set to step_size only on success
	Vector gradient, // Receives the gradient at the candidate state
	int& state_up_to_date, // Set to 1 on success, -1 otherwise
	Real step_size, // Candidate step size
	Real grad0, // Directional gradient at start of line search
	Real dir_scaling, // Factor applied to the direction vector
	Real& cf, // Receives cost function at the candidate state
	Real& grad, // Receives directional gradient at candidate state
	Real curvature_coeff) // Factor by which gradient should reduce (0-1)
  {
    // Evaluate the cost function and gradient at the candidate point.
    // From here on the last evaluation no longer corresponds to "x",
    // so the state is flagged as stale until proven otherwise.
    test_x = x + (step_size * dir_scaling) * direction;
    cf = optimizable.calc_cost_function_gradient(test_x, gradient);
    ++n_samples_;
    state_up_to_date = -1;

    // Non-finite results are reported straight away
    if (!std::isfinite(cf)) {
      return MINIMIZER_STATUS_INVALID_COST_FUNCTION;
    }
    if (any(!isfinite(gradient))) {
      return MINIMIZER_STATUS_INVALID_GRADIENT;
    }

    // Project the gradient onto the scaled search direction
    grad = dot_product(direction, gradient) * dir_scaling;

    // Wolfe conditions: sufficient decrease (Armijo) relative to the
    // stored cost function, and a sufficiently reduced gradient
    // magnitude (curvature)
    const bool armijo_ok
      = (cf <= cost_function_ + armijo_coeff_*step_size*grad0);
    const bool curvature_ok
      = (std::fabs(grad) <= -curvature_coeff*grad0);
    if (!(armijo_ok && curvature_ok)) {
      return MINIMIZER_STATUS_NOT_YET_CONVERGED;
    }

    // Accept the candidate: it becomes the current state
    x = test_x;
    final_step_size = step_size;
    cost_function_ = cf;
    state_up_to_date = 1;
    return MINIMIZER_STATUS_SUCCESS;
  }

  // Perform line search starting at state vector "x" with gradient
  // vector "gradient", and initial step "step_size" in un-normalized
  // direction "direction". Successful minimization of the function
  // (according to Wolfe conditions) will lead to
  // MINIMIZER_STATUS_SUCCESS being returned, the new state stored in
  // "x", and if state_up_to_date >= 1 then the gradient stored in
  // "gradient". Other possible return values are
  // MINIMIZER_STATUS_FAILED_TO_CONVERGE and
  // MINIMIZER_STATUS_DIRECTION_UPHILL if the initial direction points
  // uphill, or MINIMIZER_STATUS_INVALID_COST_FUNCTION,
  // MINIMIZER_STATUS_INVALID_GRADIENT or
  // MINIMIZER_STATUS_BOUND_REACHED. First the minimum is bracketed,
  // then a cubic polynomial is fitted to the values and gradients of
  // the function at the two points in order to select the next test
  // point.
  MinimizerStatus
  Minimizer::line_search(
	 Optimizable& optimizable,  // Object defining function to be minimized
	 Vector x, // Initial and returned state vector
	 const Vector& direction, // Un-normalized search direction
	 Vector test_x, // Test state vector (working memory)
	 Real& step_size, // Initial and final step size
	 Vector gradient, // Initial and possibly final gradient
	 int& state_up_to_date, // 1 if gradient up-to-date, -1 otherwise
	 Real curvature_coeff, // Factor by which gradient should reduce (0-1)
	 Real bound_step_size) // Maximum step until bound is reached (-1 for no bound)
  {
    // Step sizes are expressed in units of the normalized direction
    Real dir_scaling = 1.0 / norm2(direction);

    // Numerical suffixes to variables indicate different locations
    // along the line:
    // 0 = initial point of line search, constant within this function
    // 1 = point at which gradient has been calculated (initially the same as 0)
    // 2 = test point
    // 3 = test point

    // Step sizes
    const Real ss0 = 0.0;
    Real ss1 = ss0;
    Real ss2 = step_size;
    Real ss3 = ss2;

    // Gradients in search direction
    Real grad0 = dot_product(direction, gradient) * dir_scaling;
    Real grad1 = grad0;
    Real grad2 = grad0;
    Real grad3 = grad0;

    // Cost function values. Points 2 and 3 start out equal to the
    // initial cost so that, if no iteration is ever performed, the
    // final comparison below reads defined values and reports that no
    // decrease was achieved.
    Real cf0 = cost_function_;
    Real cf1 = cf0;
    Real cf2 = cf0;
    Real cf3 = cf0;

    // Iteration budget shared by the bracketing and refinement phases
    int iterations_remaining = max_line_search_iterations_;

    bool is_bound_step = (bound_step_size > 0.0);
    bool at_bound = false;

    if (grad0 >= 0.0) {
      return MINIMIZER_STATUS_DIRECTION_UPHILL;
    }

    // Check initial step size is within bounds
    if (max_step_size_ > 0.0 && ss2 > max_step_size_) {
      ss2 = max_step_size_;
    }
    if (is_bound_step && ss2 >= bound_step_size) {
      ss2 = bound_step_size;
      at_bound = true;
    }

    // First step: bound the minimum
    while (iterations_remaining > 0) {

      MinimizerStatus status
	= line_search_gradient_check(optimizable, x, direction, test_x,
				     step_size, gradient, state_up_to_date,
				     ss2, grad0, dir_scaling,
				     cf2, grad2, curvature_coeff);
      if (status == MINIMIZER_STATUS_SUCCESS) {
	if (at_bound) {
	  status = MINIMIZER_STATUS_BOUND_REACHED;
	}
	return status;
      }
      else if (status != MINIMIZER_STATUS_NOT_YET_CONVERGED) {
	// Cost function or its gradient not finite: revert to
	// previous step, i.e. point 1.  The step size reported back
	// is the step size of point 1 (not its cost function), and x
	// only needs moving if point 1 is beyond the starting point.
	step_size = ss1;
	if (ss1 > 0.0) {
	  x += (ss1 * dir_scaling) * direction;
	}
	state_up_to_date = 0;
	return status;
      }
     
      if (grad2 > 0.0 || cf2 >= cf1) {
	// Positive gradient or cost function increase -> bounded
	// between points 1 and 2
	break;
      }
      else if (at_bound) {
	// The cost function has been reduced but we are already at
	// the maximum step size and the gradient points towards it:
	// make this point the solution
	x += (ss2 * dir_scaling) * direction;
	step_size = ss2;
	cost_function_ = cf2;
	state_up_to_date = 1;
	return MINIMIZER_STATUS_BOUND_REACHED;
      }
      else {
	// Reduced cost function but not yet bounded -> look further
	// ahead
	Real new_step;
	if (cf1 > cf2+grad2*(ss1-ss2)) {
	  // Positive curvature: fit a quadratic
	  Real curvature = 2.0*(cf1-cf2-grad2*(ss1-ss2))/((ss1-ss2)*(ss1-ss2));
	  new_step = ss2-grad2/curvature; // Newton's method
	  // Bounds on actual step size
	  new_step = std::max(ss1+1.1*(ss2-ss1), std::min(new_step, ss1+10.0*(ss2-ss1)));
	  if (max_step_size_ > 0.0 && new_step-ss2 > max_step_size_) {
	    new_step = ss2 + max_step_size_;
	  }
	}
	else {
	  // Cliff gets steeper... simply jump ahead a lot more
	  new_step = ss2 + 5.0*(ss2-ss1);
	  if (max_step_size_ > 0.0 && new_step-ss2 > max_step_size_) {
	    new_step = ss2 + max_step_size_;
	  }
	}
	ss1 = ss2;
	cf1 = cf2;
	grad1 = grad2;
	ss2 = new_step;

	if (is_bound_step && ss2 >= bound_step_size) {
	  ss2 = bound_step_size;
	  at_bound = true;
	}
      }

      // Each function evaluation consumes one iteration, so that the
      // bracketing phase cannot exceed the iteration limit
      --iterations_remaining;
    }

    // Second step: reduce the bounds until we get sufficiently close
    // to the minimum
    while (iterations_remaining > 0) {

      if (ss2 <= ss1) {
	// Two points are identical!
	if (cf1 < cf0) {
	  // Return value at point 1
	  x += (ss1 * dir_scaling) * direction;
	  step_size = ss1;
	  cost_function_ = cf1;
	  return MINIMIZER_STATUS_SUCCESS;
	}
	else {
	  // Cost function did not decrease at all
	  return MINIMIZER_STATUS_FAILED_TO_CONVERGE;
	}
      }

      // Minimizer of cubic function fitted to the values and
      // gradients at points 1 and 2; quantities are scaled by the
      // largest magnitude before squaring
      Real step_diff = ss2-ss1;
      Real theta = (cf1-cf2) * 3.0 / step_diff + grad1 + grad2;
      Real max_grad = std::max(std::fabs(theta),
			       std::max(std::fabs(grad1), std::fabs(grad2)));
      Real scaled_theta = theta / max_grad;
      Real gamma = max_grad * std::sqrt(scaled_theta*scaled_theta
					- (grad1/max_grad) * (grad2/max_grad));
      ss3 = ss1 + ((gamma - grad1 + theta) / (2.0*gamma + grad2 - grad1)) * step_diff;


      // Bound the step size to be at least 5% away from each end
      ss3 = std::max(0.95*ss1+0.05*ss2,
		     std::min(0.05*ss1+0.95*ss2, ss3));

      MinimizerStatus status
	= line_search_gradient_check(optimizable, x, direction, test_x,
				     step_size, gradient, state_up_to_date,
				     ss3, grad0, dir_scaling,
				     cf3, grad3, curvature_coeff);
      if (status == MINIMIZER_STATUS_SUCCESS) {
	return status;
      }
      else if (status != MINIMIZER_STATUS_NOT_YET_CONVERGED) {
	// Cost function or its gradient not finite: revert to
	// previous step, i.e. point 1.  The step size reported back
	// is the step size of point 1 (not its cost function), and x
	// only needs moving if point 1 is beyond the starting point.
	step_size = ss1;
	if (ss1 > 0.0) {
	  x += (ss1 * dir_scaling) * direction;
	}
	state_up_to_date = 0;
	return status;
      }
     
      if (grad3 > 0.0) {
	// Positive gradient -> bounded between points 1 and 3
	ss2 = ss3;
	cf2 = cf3;
	grad2 = grad3;
      }
      else if (cf3 < cf1) {
	// Reduced cost function, negative gradient
	ss1 = ss3;
	cf1 = cf3;
	grad1 = grad3;
      }
      else {
	// Increased cost function, negative gradient
	ss2 = ss3;
	cf2 = cf3;
	grad2 = grad3;
      }	

      --iterations_remaining;
    }

    // Maximum iterations reached: check if cost function has been
    // reduced at all
    state_up_to_date = -1;
    if (cf2 < cf1) {
      // Return value at point 2
      x += (ss2 * dir_scaling) * direction;
      step_size = ss2;
      cost_function_ = cf2;  
    }
    else if (cf1 < cf0) {
      // Return value at point 1
      x += (ss1 * dir_scaling) * direction;
      step_size = ss1;
      cost_function_ = cf1;  
    }
    else {
      // Cost function did not decrease at all
      return MINIMIZER_STATUS_FAILED_TO_CONVERGE;
    }

    // Cost function decreased
    return MINIMIZER_STATUS_SUCCESS;

  }

}


================================================
FILE: adept/minimize_conjugate_gradient.cpp
================================================
/* minimize_conjugate_gradient.cpp -- Minimize function using Conjugate Gradient algorithm

    Copyright (C) 2020 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#include <limits>
#include <cmath>
#include <adept/Minimizer.h>

namespace adept {

  // Minimize the cost function embodied in "optimizable" using the
  // Conjugate-Gradient algorithm, where "x" is the initial state
  // vector and also where the solution is stored. By default the
  // Polak-Ribiere method is used to compute the new search direction,
  // but Fletcher-Reeves is also available.
  MinimizerStatus
  Minimizer::minimize_conjugate_gradient(Optimizable& optimizable, Vector x,
					 bool use_fletcher_reeves)
  {
    int n_state = x.size();

    // Reset the counters, status and cost function held by this
    // object
    n_iterations_ = 0;
    n_samples_ = 0;
    status_ = MINIMIZER_STATUS_NOT_YET_CONVERGED;
    cost_function_ = std::numeric_limits<Real>::infinity();

    // Working memory: four vectors, each the length of the state
    // vector
    Vector gradient(n_state);
    Vector previous_gradient(n_state);
    Vector direction(n_state);
    Vector test_x(n_state); // Scratch space for the line search

    // Whether the most recent evaluation by "optimizable" corresponds
    // to the current x: -1=no, 0=yes, 1=yes including the gradient,
    // 2=yes including gradient and Hessian
    int state_up_to_date = -1;

    // First trial step: the maximum step size if one has been set,
    // otherwise 1
    Real step_size = 1.0;
    if (max_step_size_ > 0.0) {
      step_size = max_step_size_;
    }

    // The first iteration is treated as a restart (steepest descent)
    bool restart_pending = true;
    int last_restart_iteration = n_iterations_;

    while (status_ == MINIMIZER_STATUS_NOT_YET_CONVERGED) {

      // Re-evaluate the cost function and gradient unless the
      // previous line search left them consistent with x
      if (state_up_to_date < 1) {
	cost_function_ = optimizable.calc_cost_function_gradient(x, gradient);
	state_up_to_date = 1;
	++n_samples_;
      }

      if (n_iterations_ == 0) {
	start_cost_function_ = cost_function_;
      }

      // Stop immediately on a non-finite cost function or gradient
      if (!std::isfinite(cost_function_)) {
	status_ = MINIMIZER_STATUS_INVALID_COST_FUNCTION;
	break;
      }
      if (any(!isfinite(gradient))) {
	status_ = MINIMIZER_STATUS_INVALID_GRADIENT;
	break;
      }

      // The L2 norm of the gradient is the convergence measure
      gradient_norm_ = norm2(gradient);

      // Let the user-defined function report progress
      optimizable.report_progress(n_iterations_, x, cost_function_, gradient_norm_);

      // Converged once the gradient norm is at or below the threshold
      if (gradient_norm_ <= converged_gradient_norm_) {
	status_ = MINIMIZER_STATUS_SUCCESS;
	break;
      }

      // Force a restart once more than n_state iterations have passed
      // since the last one
      if (n_iterations_ - last_restart_iteration > n_state) {
	restart_pending = true;
      }

      // Choose the search direction
      if (!restart_pending) {
	// Conjugate-gradient update of the direction
	Real beta;
	if (!use_fletcher_reeves) {
	  // Default: Polak-Ribiere method, limited to be non-negative
	  beta = std::max(sum(gradient * (gradient - previous_gradient))
			  / dot_product(previous_gradient, previous_gradient),
			  0.0);
	}
	else {
	  // Fletcher-Reeves method
	  beta = dot_product(gradient, gradient) 
	    / dot_product(previous_gradient, previous_gradient);
	}
	// A non-positive beta amounts to steepest descent, so it is
	// recorded as a restart
	if (beta <= 0) {
	  last_restart_iteration = n_iterations_;
	}
	direction = beta*direction - gradient;
      }
      else {
	// Restart: plain steepest descent
	direction = -gradient;
	restart_pending = false;
	last_restart_iteration = n_iterations_;
      }

      // Keep this gradient for the next computation of beta
      previous_gradient = gradient;

      // Minimize along the search direction; on success x holds the
      // new state
      MinimizerStatus search_outcome
	= line_search(optimizable, x, direction,
		      test_x, step_size, gradient, state_up_to_date,
		      cg_curvature_coeff_);

      const bool search_ok = (search_outcome == MINIMIZER_STATUS_SUCCESS);
      const bool was_restart = (last_restart_iteration == n_iterations_);
      if (!search_ok && was_restart) {
	// The line search failed even along the steepest-descent
	// direction: pass its status back to the caller
	status_ = search_outcome;
      }
      else {
	// Either the line search succeeded, or it failed on a
	// non-restart iteration, in which case a restart is tried
	if (!search_ok) {
	  restart_pending = true;
	}
	status_ = MINIMIZER_STATUS_NOT_YET_CONVERGED;
      }

      // Begin the next line search with twice the step just taken
      step_size *= 2.0;

      ++n_iterations_;
      if (n_iterations_ >= max_iterations_
	  && status_ == MINIMIZER_STATUS_NOT_YET_CONVERGED) {
	status_ = MINIMIZER_STATUS_MAX_ITERATIONS_REACHED;
      }

      // The loop condition ends the iteration as soon as status_ is
      // anything other than MINIMIZER_STATUS_NOT_YET_CONVERGED
    }
     
    // Optionally bring the state of "optimizable" in line with the x
    // being returned
    if (state_up_to_date < ensure_updated_state_) {
      if (ensure_updated_state_ <= 0) {
	// Cost function alone is sufficient
	cost_function_ = optimizable.calc_cost_function(x);
      }
      else {
	// At least the first derivative is wanted
	cost_function_ = optimizable.calc_cost_function_gradient(x, gradient);
      }
    }

    return status_;
  }

  // Minimize the cost function embodied in "optimizable" using the
  // Conjugate-Gradient algorithm, where "x" is the initial state
  // vector and also where the solution is stored, subject to the
  // constraint that x lies between min_x and max_x. By default the
  // Polak-Ribiere method is used to compute the new search direction,
  // but Fletcher-Reeves is also available.
  MinimizerStatus
  Minimizer::minimize_conjugate_gradient_bounded(Optimizable& optimizable, Vector x,
					 const Vector& min_x,
					 const Vector& max_x,
					 bool use_fletcher_reeves)
  {
    // Validate the bounds.  The sizes are checked first so that the
    // element-wise comparison of min_x with max_x is only evaluated
    // when both have the same length as x.
    if (min_x.size() != x.size()
	|| max_x.size() != x.size()
	|| any(min_x >= max_x)) {
      return MINIMIZER_STATUS_INVALID_BOUNDS;
    }

    int nx = x.size();

    // Initial values
    n_iterations_ = 0;
    n_samples_ = 0;
    status_ = MINIMIZER_STATUS_NOT_YET_CONVERGED;
    cost_function_ = std::numeric_limits<Real>::infinity();

    // The Conjugate-Gradient method is the most efficient
    // gradient-based method in terms of memory usage, requiring a
    // working memory of just 4*nx, making it suitable for large state
    // vectors.
    Vector gradient(nx);
    Vector previous_gradient(nx);
    Vector direction(nx);
    Vector test_x(nx); // Used by the line search only

    // Which state variables are at the minimum bound (-1), maximum
    // bound (1) or free (0)?
    intVector bound_status(nx);
    bound_status = 0;

    // Ensure that initial x lies within the specified bounds
    bound_status.where(x >= max_x) =  1;
    bound_status.where(x <= min_x) = -1;
    x = max(min_x, min(x, max_x));

    int nbound = count(bound_status != 0);
    int nfree  = nx - nbound;

    // Does the last calculation of the cost function in "optimizable"
    // match the current contents of the state vector x? -1=no, 0=yes,
    // 1=yes and the last calculation included the gradient, 2=yes and
    // the last calculation included gradient and Hessian.
    int state_up_to_date = -1;

    // Initial step size
    Real step_size = 1.0;
    if (max_step_size_ > 0.0) {
      step_size = max_step_size_;
    }

    // A restart is performed every nx+1 iterations
    bool do_restart = true;
    int iteration_at_last_restart = n_iterations_;

    // Main loop
    while (status_ == MINIMIZER_STATUS_NOT_YET_CONVERGED) {

      // If the last line search found a minimum along the lines
      // satisfying the Wolfe conditions, then the current cost
      // function and gradient will be consistent with the current
      // state vector.  Otherwise we need to compute them.
      if (state_up_to_date < 1) {
	cost_function_ = optimizable.calc_cost_function_gradient(x, gradient);
	state_up_to_date = 1;
	++n_samples_;

	if (n_iterations_ == 0) {
	  start_cost_function_ = cost_function_;
	}

	// Check cost function and gradient are finite
	if (!std::isfinite(cost_function_)) {
	  status_ = MINIMIZER_STATUS_INVALID_COST_FUNCTION;
	  break;
	}
	else if (any(!isfinite(gradient))) {
	  status_ = MINIMIZER_STATUS_INVALID_GRADIENT;
	  break;
	}

      }

      // Check whether the bound status of each state variable is
      // consistent with the gradient if a steepest descent were to be
      // taken, and if not release the variable and flag a restart
      if (any(bound_status == -1 && gradient < 0.0)
	  || any(bound_status == 1 && gradient > 0.0)) {
	bound_status.where(bound_status == -1 && gradient < 0.0) = 0;
	bound_status.where(bound_status ==  1 && gradient > 0.0) = 0;
	do_restart = true;
      }
      nbound = count(bound_status != 0);
      nfree = nx - nbound;

      // Set gradient at bound points to zero
      gradient.where(bound_status != 0) = 0.0;

      // Compute L2 norm of gradient to see how "flat" the environment
      // is
      if (nfree > 0) {
	gradient_norm_ = norm2(gradient);
      }
      else {
	// If no dimensions are in play we are at a corner of the
	// bounds and the gradient is pointing into the corner: we
	// have reached a minimum in the cost function subject to the
	// bounds so have converged
	gradient_norm_ = 0.0;
      }

      // Report progress using user-defined function
      optimizable.report_progress(n_iterations_, x, cost_function_, gradient_norm_);

      // Convergence has been achieved if the L2 norm has been reduced
      // to a user-specified threshold
      if (gradient_norm_ <= converged_gradient_norm_) {
	status_ = MINIMIZER_STATUS_SUCCESS;
	break;
      }

      // Restart every nx+1 iterations
      if (n_iterations_ - iteration_at_last_restart > nx) {
	do_restart = true;
      }

      // Find search direction
      if (do_restart) {
	// Simple gradient descent after a restart
	direction = -gradient;
	do_restart = false;
	iteration_at_last_restart = n_iterations_;
      }
      else {
	// The brains of the Conjugate-Gradient method - note that
	// generally the Polak-Ribiere method is believed to be
	// superior to Fletcher-Reeves
	Real beta;
	if (use_fletcher_reeves) {
	  // Fletcher-Reeves method
	  beta = dot_product(gradient, gradient) 
	    / dot_product(previous_gradient, previous_gradient);
	}
	else {
	  // Default: Polak-Ribiere method
	  beta = std::max(sum(gradient * (gradient - previous_gradient))
			  / dot_product(previous_gradient, previous_gradient),
			  0.0);
	}
	// beta==0 is equivalent to gradient descent (i.e. a restart)
	if (beta <= 0) {
	  iteration_at_last_restart = n_iterations_;
	}
	// Compute new direction
	direction = beta*direction - gradient;
      }

      // Store gradient for computing beta in next iteration
      previous_gradient = gradient;

      // Distance to the nearest bound.  The line search applies a
      // step as step_size*direction/norm2(direction), so multiplying
      // by norm2(direction) here expresses the distance in the same
      // units as the line-search step size.
      Real dir_scaling = norm2(direction);
      Real bound_step_size = std::numeric_limits<Real>::max();
      int i_nearest_bound = -1;
      int i_bound_type = 0;
      // Work out the maximum step size along "direction" before a
      // bound is met... there must be a faster way to do this
      for (int ix = 0; ix < nx; ++ix) {
	if (direction(ix) > 0.0 && max_x(ix) < std::numeric_limits<Real>::max()) {
	  Real local_bound_step_size = dir_scaling*(max_x(ix)-x(ix))/direction(ix);
	  if (bound_step_size >= local_bound_step_size) {
	    bound_step_size = local_bound_step_size;
	    i_nearest_bound = ix;
	    i_bound_type = 1;
	  }				   
	}
	else if (direction(ix) < 0.0 && min_x(ix) > -std::numeric_limits<Real>::max()) {
	  Real local_bound_step_size = dir_scaling*(min_x(ix)-x(ix))/direction(ix);
	  if (bound_step_size >= local_bound_step_size) {
	    bound_step_size = local_bound_step_size;
	    i_nearest_bound = ix;
	    i_bound_type = -1;
	  }
	}
      }

      MinimizerStatus ls_status; // line-search outcome
      if (i_nearest_bound >= 0) {
	// Perform line search, storing new state vector in x
	ls_status = line_search(optimizable, x, direction,
			       test_x, step_size, gradient, state_up_to_date,
			       cg_curvature_coeff_, bound_step_size);
	if (ls_status == MINIMIZER_STATUS_BOUND_REACHED) {
	  // The nearest bound was hit: pin that variable to the bound
	  // and restart with steepest descent
	  bound_status(i_nearest_bound) = i_bound_type;
	  do_restart = true;
	  ls_status = MINIMIZER_STATUS_SUCCESS;
	}
      }
      else {
	// Perform line search, storing new state vector in x
	ls_status = line_search(optimizable, x, direction,
				test_x, step_size, gradient, state_up_to_date,
				cg_curvature_coeff_);
      }

      if (ls_status == MINIMIZER_STATUS_SUCCESS) {
	// Successfully minimized along search direction: continue to
	// next iteration
	status_ = MINIMIZER_STATUS_NOT_YET_CONVERGED;
      }
      else if (iteration_at_last_restart != n_iterations_) {
	// Line search either made no progress or encountered a
	// non-finite cost function or gradient, and this was not a
	// restart; try restarting once
	do_restart = true;
	status_ = MINIMIZER_STATUS_NOT_YET_CONVERGED;
      }
      else {
	// Unrecoverable failure in line-search: return status to
	// calling function
	status_ = ls_status;
      }

      // Better convergence if first step size on next line search is
      // larger than the actual step size on the last line search
      step_size *= 2.0;

      ++n_iterations_;
      if (status_ == MINIMIZER_STATUS_NOT_YET_CONVERGED
	  && n_iterations_ >= max_iterations_) {
	status_ = MINIMIZER_STATUS_MAX_ITERATIONS_REACHED;
      }

      // End of main loop: if status_ is anything other than
      // MINIMIZER_STATUS_NOT_YET_CONVERGED then no more iterations
      // are performed
    }
     
    if (state_up_to_date < ensure_updated_state_) {
      // The last call to calc_cost_function* was not with the state
      // vector returned to the user, and they want it to be.
      if (ensure_updated_state_ > 0) {
	// User wants at least the first derivative
	cost_function_ = optimizable.calc_cost_function_gradient(x, gradient);
      }
      else {
	// User does not need derivatives to have been computed
	cost_function_ = optimizable.calc_cost_function(x);
      }
    }

    return status_;
  }

};


================================================
FILE: adept/minimize_levenberg_marquardt.cpp
================================================
/* minimize_levenberg_marquardt.cpp -- Minimize function using Levenberg-Marquardt algorithm

    Copyright (C) 2020 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#include <limits>
#include <cmath>
#include <adept/Minimizer.h>

namespace adept {

  // Unconstrained minimization of the cost function embodied in
  // "optimizable" using the Levenberg-Marquardt algorithm.  "x"
  // supplies the initial state vector and is overwritten every time
  // a step succeeds in reducing the cost function.  If
  // "use_additive_damping" is true then the damping is added to the
  // diagonal of the Hessian (the older Levenberg approach) rather
  // than being used to scale it.  The final value of status_ is
  // returned.
  MinimizerStatus
  Minimizer::minimize_levenberg_marquardt(Optimizable& optimizable, Vector x,
                                          bool use_additive_damping)
  {
    const int nx = x.size();

    // Reset the members recording the progress of the minimization
    n_iterations_ = 0;
    n_samples_ = 0;
    status_ = MINIMIZER_STATUS_NOT_YET_CONVERGED;
    cost_function_ = std::numeric_limits<Real>::infinity();

    // Work space: trial state vector, gradient and proposed step...
    Vector trial_x(nx);
    Vector gradient(nx);
    Vector step(nx);

    // ...plus the explicitly stored Hessian matrix
    SymmMatrix hessian(nx);
    hessian = 0.0;

    Real damping = levenberg_damping_start_;
    gradient_norm_ = -1.0;

    // How well does the most recent calculation by "optimizable"
    // match the current contents of x?  -1: it does not; 0: cost
    // function matches; 1: cost function and gradient match; 2: cost
    // function, gradient and Hessian match.
    int state_up_to_date = -1;

    // status_ was set to "not yet converged" above, so the body of
    // this loop is always executed at least once
    while (status_ == MINIMIZER_STATUS_NOT_YET_CONVERGED) {
      // Either this is the first pass or the cost function has just
      // been reduced: evaluate everything at the current state
      cost_function_ = optimizable.calc_cost_function_gradient_hessian(x, gradient, hessian);
      // The additive (Levenberg) form scales the damping by the mean
      // of the Hessian diagonal so that the behaviour does not depend
      // on an arbitrary scaling of the cost function
      const Real diag_scaling = mean(hessian.diag_vector());
      state_up_to_date = 2;
      ++n_samples_;
      if (n_iterations_ == 0) {
        start_cost_function_ = cost_function_;
      }

      // Give up immediately on a non-finite cost function or gradient
      if (!std::isfinite(cost_function_)) {
        status_ = MINIMIZER_STATUS_INVALID_COST_FUNCTION;
        break;
      }
      if (any(!isfinite(gradient))) {
        status_ = MINIMIZER_STATUS_INVALID_GRADIENT;
        break;
      }

      // The L2 norm of the gradient measures how "flat" the cost
      // function is here; pass it to the user's progress function
      // and then test it against the convergence threshold
      gradient_norm_ = norm2(gradient);
      optimizable.report_progress(n_iterations_, x, cost_function_, gradient_norm_);
      if (gradient_norm_ <= converged_gradient_norm_) {
        status_ = MINIMIZER_STATUS_SUCCESS;
        break;
      }

      // The Hessian diagonal is modified in place on each attempt, so
      // remember the modification most recently applied in order that
      // it can be undone when the damping changes
      Real previous_diag_scaling  = 1.0; // Levenberg-Marquardt version
      Real previous_diag_modifier = 0.0; // Levenberg version

      // Keep trying steps with increasing damping until the cost
      // function is reduced or the damping limit is hit
      for (;;) {
        if (use_additive_damping) {
          // Older Levenberg approach: add to the diagonal
          hessian.diag_vector() += damping*diag_scaling - previous_diag_modifier;
          previous_diag_modifier = damping*diag_scaling;
        }
        else {
          // Levenberg-Marquardt: scale the diagonal; the larger the
          // damping the closer the step is to steepest descent
          hessian.diag_vector() *= (1.0 + damping)/previous_diag_scaling;
          previous_diag_scaling = 1.0 + damping;
        }
        step = -adept::solve(hessian, gradient);

        // Shrink the step if its largest element exceeds the
        // user-specified maximum (only applied if that is positive)
        if (max_step_size_ > 0.0) {
          Real largest_element = maxval(abs(step));
          if (largest_element > max_step_size_) {
            step *= (max_step_size_/largest_element);
          }
        }

        // For efficiency only the cost function is computed at the
        // trial state, not the gradient or Hessian
        trial_x = x+step;
        const Real trial_cost = optimizable.calc_cost_function(trial_x);
        state_up_to_date = -1;
        ++n_samples_;

        // A non-finite cost function might be recoverable by taking
        // a smaller step, so it is treated like a failure to reduce
        // the cost function
        const bool cost_invalid = !std::isfinite(trial_cost);
        const bool step_rejected = (trial_cost >= cost_function_ || cost_invalid);

        if (!step_rejected) {
          // Cost function reduced: accept the trial state
          x = trial_x;
          n_iterations_++;
          // Less damping for the next iteration
          if (damping > levenberg_damping_min_) {
            damping /= levenberg_damping_divider_;
          }
          else {
            damping = 0.0;
          }
          if (n_iterations_ >= max_iterations_) {
            status_ = MINIMIZER_STATUS_MAX_ITERATIONS_REACHED;
          }
          break;
        }

        // Step rejected: raise the damping to take a smaller step
        if (damping <= 0.0) {
          damping = levenberg_damping_restart_;
        }
        else if (damping < levenberg_damping_max_) {
          damping *= levenberg_damping_multiplier_;
        }
        else {
          // Damping has reached its maximum so no further progress
          // is possible
          status_ = cost_invalid ? MINIMIZER_STATUS_INVALID_COST_FUNCTION
                                 : MINIMIZER_STATUS_FAILED_TO_CONVERGE;
          break;
        }
      } // Inner loop
    }

    if (state_up_to_date >= ensure_updated_state_) {
      // The most recent calculation is already as consistent with x
      // as the user requires
      return status_;
    }

    // Otherwise repeat the calculation at the returned state vector
    if (ensure_updated_state_ > 0) {
      // At least the first derivative is wanted, but
      // calc_cost_function_gradient() is not guaranteed to be present
      // so the Hessian function is called
      cost_function_ = optimizable.calc_cost_function_gradient_hessian(x, gradient,
                                                                         hessian);
    }
    else {
      // No derivatives are needed
      cost_function_ = optimizable.calc_cost_function(x);
    }

    return status_;
  }


  // Minimize the cost function embodied in "optimizable" using the
  // Levenberg-Marquardt algorithm, where "x" is the initial state
  // vector and also where the solution is stored, subject to the
  // constraint that x lies between min_x and max_x.  If
  // "use_additive_damping" is true then the damping is added to the
  // diagonal of the Hessian (the older Levenberg approach) rather
  // than being used to scale it.  MINIMIZER_STATUS_INVALID_BOUNDS is
  // returned, before anything else is done, if min_x or max_x differ
  // in length from x or if any element of min_x is not less than the
  // corresponding element of max_x; otherwise the final value of
  // status_ is returned.
  MinimizerStatus
  Minimizer::minimize_levenberg_marquardt_bounded(Optimizable& optimizable,
                                                  Vector x,
                                                  const Vector& min_x,
                                                  const Vector& max_x,
                                                  bool use_additive_damping)
  {
    // The sizes are checked first so that the element-wise comparison
    // of the bounds is only evaluated on vectors of the same length
    if (min_x.size() != x.size()
        || max_x.size() != x.size()
        || any(min_x >= max_x)) {
      return MINIMIZER_STATUS_INVALID_BOUNDS;
    }

    int nx = x.size();

    // Initial values
    n_iterations_ = 0;
    n_samples_ = 0;
    status_ = MINIMIZER_STATUS_NOT_YET_CONVERGED;
    cost_function_ = std::numeric_limits<Real>::infinity();

    Real new_cost;

    // The main memory storage for the Levenberg family of methods
    // consists of the following three vectors...
    Vector new_x(nx);
    Vector gradient(nx);
    Vector dx(nx);

    // ...and the Hessian matrix, which is stored explicitly, plus
    // versions of the gradient, Hessian and step restricted to the
    // state variables that are not currently held at a bound
    SymmMatrix hessian(nx);
    SymmMatrix modified_hessian(nx);
    SymmMatrix sub_hessian;
    Vector sub_gradient;
    Vector sub_dx;
    hessian = 0.0;
    Real damping = levenberg_damping_start_;

    // Which state variables are at the minimum bound (-1), maximum
    // bound (1) or free (0)?
    intVector bound_status(nx);
    bound_status = 0;

    // Ensure that initial x lies within the specified bounds
    bound_status.where(x >= max_x) =  1;
    bound_status.where(x <= min_x) = -1;
    x = max(min_x, min(x, max_x));

    int nbound = count(bound_status != 0);
    int nfree  = nx - nbound;
    gradient_norm_ = -1.0;

    // Original Levenberg is additive to the diagonal of the Hessian
    // so to make the performance insensitive to an arbitrary scaling
    // of the cost function, we scale the damping factor by the mean
    // of the diagonal of the Hessian
    Real diag_scaling;

    // Does the last calculation of the cost function in "optimizable"
    // match the current contents of the state vector x? -1=no, 0=yes,
    // 1=yes and the last calculation included the gradient, 2=yes and
    // the last calculation included gradient and Hessian.
    int state_up_to_date = -1;

    do {
      // At this point we have either just started or have just
      // reduced the cost function
      cost_function_ = optimizable.calc_cost_function_gradient_hessian(x, gradient, hessian);
      diag_scaling = mean(hessian.diag_vector());
      state_up_to_date = 2;
      ++n_samples_;
      if (n_iterations_ == 0) {
        start_cost_function_ = cost_function_;
      }

      // Check cost function and gradient are finite
      if (!std::isfinite(cost_function_)) {
        status_ = MINIMIZER_STATUS_INVALID_COST_FUNCTION;
        break;
      }
      else if (any(!isfinite(gradient))) {
        status_ = MINIMIZER_STATUS_INVALID_GRADIENT;
        break;
      }

      // Find which dimensions are in play
      if (nbound > 0) {
        // We release any dimensions from being at a minimum or
        // maximum bound if two conditions are met: (1) the gradient
        // in that dimension slopes away from the bound, and (2) the
        // Levenberg-Marquardt formula to compute dx using the current
        // value of "damping" leads to a point on the valid side of the
        // bound
        modified_hessian = hessian;
        if (!use_additive_damping) {
          modified_hessian.diag_vector() *= (1.0 + damping);
        }
        else {
          modified_hessian.diag_vector() += damping*diag_scaling;
        }
        dx = -adept::solve(modified_hessian, gradient);
        // Release points at the minimum bound
        bound_status.where(bound_status == -1
                           && gradient < 0.0
                           && dx > 0.0) = 0;
        // Release points at the maximum bound
        bound_status.where(bound_status == 1
                           && gradient > 0.0
                           && dx < 0.0) = 0;
      }

      nbound = count(bound_status != 0);
      nfree  = nx - nbound;

      // List of indices of free state variables
      intVector ifree(nfree);
      if (nbound > 0) {
        ifree = find(bound_status == 0);
      }
      else {
        ifree = range(0, nx-1);
      }

      // Compute L2 norm of gradient to see how "flat" the environment
      // is, restricting ourselves to the dimensions currently in play
      if (nfree > 0) {
        gradient_norm_ = norm2(gradient(ifree));
      }
      else {
        // If no dimensions are in play we are at a corner of the
        // bounds and the gradient is pointing into the corner: we
        // have reached a minimum in the cost function subject to the
        // bounds so have converged
        gradient_norm_ = 0.0;
      }
      // Report progress using user-defined function
      optimizable.report_progress(n_iterations_, x, cost_function_, gradient_norm_);
      // Convergence has been achieved if the L2 norm has been reduced
      // to a user-specified threshold
      if (gradient_norm_ <= converged_gradient_norm_) {
        status_ = MINIMIZER_STATUS_SUCCESS;
        break;
      }

      // Extract the gradient and Hessian for the free dimensions: a
      // copy if some dimensions are bound, otherwise a link to the
      // full arrays
      sub_gradient.clear();
      sub_hessian.clear();
      if (nbound > 0) {
        sub_gradient = gradient(ifree);
        sub_hessian  = SymmMatrix(Matrix(hessian)(ifree,ifree));
      }
      else {
        sub_gradient >>= gradient;
        sub_hessian  >>= hessian;
      }

      // FIX reuse dx if possible below...

      // Try to minimize cost function; the diagonal of sub_hessian is
      // modified in place on each attempt, so the modification most
      // recently applied is stored in order that it can be undone
      Real previous_diag_scaling  = 1.0; // Used in Levenberg-Marquardt version
      Real previous_diag_modifier = 0.0; // Used in Levenberg version
      while(true) {
        sub_dx.resize(nfree);
        if (!use_additive_damping) {
          // Levenberg-Marquardt formula: scale the diagonal of the
          // Hessian, where the larger the value of "damping", the
          // closer the resulting behaviour is to steepest descent
          sub_hessian.diag_vector() *= (1.0 + damping)/previous_diag_scaling;
          previous_diag_scaling = 1.0 + damping;
        }
        else {
          // Older Levenberg approach: add to the diagonal instead
          sub_hessian.diag_vector() += damping*diag_scaling - previous_diag_modifier;
          previous_diag_modifier = damping*diag_scaling;
        }
        sub_dx = -adept::solve(sub_hessian, sub_gradient);

        // Limit the maximum step size, if required
        if (max_step_size_ > 0.0) {
          Real max_dx = maxval(abs(sub_dx));
          if (max_dx > max_step_size_) {
            sub_dx *= (max_step_size_/max_dx);
          }
        }

        // Check for collision with new bounds: for each free variable
        // that would reach or cross a bound, work out the fraction of
        // the step at which the bound is met.  The smallest fraction
        // identifies the bound encountered first, and the index of
        // that variable (within the list of free variables) is
        // stored.  A value of 2.0 indicates that no bound is met.
        intVector new_min_bounds = find(x(ifree)+sub_dx <= min_x(ifree));
        intVector new_max_bounds = find(x(ifree)+sub_dx >= max_x(ifree));
        Real mmin_frac = 2.0;
        Real mmax_frac = 2.0;
        int imin = 0, imax = 0;
        if (!new_min_bounds.empty()) {
          Vector min_frac = -(x(ifree(new_min_bounds)) - min_x(ifree(new_min_bounds)))
            / sub_dx(new_min_bounds);
          mmin_frac = minval(min_frac);
          imin = new_min_bounds(minloc(min_frac));
        }
        if (!new_max_bounds.empty()) {
          Vector max_frac = (max_x(ifree(new_max_bounds)) - x(ifree(new_max_bounds)))
            / sub_dx(new_max_bounds);
          mmax_frac = minval(max_frac);
          // The location must correspond to the minimum fraction
          // taken on the previous line, i.e. the maximum bound that
          // is met first
          imax = new_max_bounds(minloc(max_frac));
        }

        // Truncate the step so that it stops at the first bound met
        Real frac = 1.0;
        int bound_type = 0;
        int ibound = 0;
        if (mmin_frac <= 1.0 || mmax_frac <= 1.0) {
          if (mmin_frac < mmax_frac) {
            frac = mmin_frac;
            ibound = imin;
            bound_type = -1;
          }
          else {
            frac = mmax_frac;
            ibound = imax;
            bound_type = 1;
          }
          sub_dx *= frac;
        }

        // Compute new state vector and cost function, but not
        // gradient or Hessian for efficiency
        new_x = x;
        new_x(ifree) += sub_dx;
        new_cost = optimizable.calc_cost_function(new_x);
        state_up_to_date = -1;
        ++n_samples_;

        // If cost function is not finite it may be possible to
        // recover by trying smaller step sizes
        bool cost_invalid = !std::isfinite(new_cost);

        if (new_cost >= cost_function_ || cost_invalid) {
          // We haven't managed to reduce the cost function: increase
          // damping value to take smaller steps
          if (damping <= 0.0) {
            damping = levenberg_damping_restart_;
          }
          else if (damping < levenberg_damping_max_) {
            damping *= levenberg_damping_multiplier_;
          }
          else {
            // The damping value is now larger than the maximum so we
            // can get no further
            if (cost_invalid) {
              status_ = MINIMIZER_STATUS_INVALID_COST_FUNCTION;
            }
            else {
              status_ = MINIMIZER_STATUS_FAILED_TO_CONVERGE;
            }
            break;
          }
        }
        else {
          // Managed to reduce cost function
          x = new_x;
          n_iterations_++;
          if (frac < 1.0) {
            // Found a new bound
            bound_status(ifree(ibound)) = bound_type;
          }
          // Reduce damping for next iteration
          if (damping > levenberg_damping_min_) {
            damping /= levenberg_damping_divider_;
          }
          else {
            damping = 0.0;
          }
          if (n_iterations_ >= max_iterations_) {
            status_ = MINIMIZER_STATUS_MAX_ITERATIONS_REACHED;
          }
          break;
        }
      } // Inner loop
    }
    while (status_ == MINIMIZER_STATUS_NOT_YET_CONVERGED);

    if (state_up_to_date < ensure_updated_state_) {
      // The last call to calc_cost_function* was not with the state
      // vector returned to the user, and they want it to be.  Note
      // that the cost function and gradient norm ought to be
      // up-to-date already at this point.
      if (ensure_updated_state_ > 0) {
        // User wants at least the first derivative, but
        // calc_cost_function_gradient() is not guaranteed to be
        // present so we call the Hessian function
        cost_function_ = optimizable.calc_cost_function_gradient_hessian(x, gradient,
                                                                         hessian);
      }
      else {
        // User does not need derivatives to have been computed
        cost_function_ = optimizable.calc_cost_function(x);
      }
    }

    return status_;
  }

};


================================================
FILE: adept/minimize_limited_memory_bfgs.cpp
================================================
/* minimize_limited_memory_bfgs.cpp -- Minimize function using Limited-Memory BFGS algorithm

    Copyright (C) 2020 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#include <limits>

#include <adept/Minimizer.h>

namespace adept {

  // Structure for storing data from previous iterations used by
  // L-BFGS minimization algorithm
  class LbfgsData {

  public:
    LbfgsData(int nx, int ni)
      : nx_(nx), ni_(ni), iteration_(0) {
      x_diff_.resize(ni,nx);
      gradient_diff_.resize(ni,nx);
      rho_.resize(ni);
      alpha_.resize(ni);
      gamma_.resize(ni);
    }

    // Return false if the dot product of x_diff and gradient_diff is
    // zero, true otherwise
    void store(int iter, const Vector& x_diff, const Vector& gradient_diff) {
      int index = (iter-1) % ni_;
      x_diff_[index] = x_diff;
      gradient_diff_[index] = gradient_diff;
      Real dp = dot_product(x_diff, gradient_diff);
      if (std::fabs(dp) > 10.0*std::numeric_limits<Real>::min()) {
	rho_[index] = 1.0 / dp;
      }
      else if (dp >= 0.0) {
	rho_[index] = 1.0 / std::max(dp, 10.0*std::numeric_limits<Real>::min());
      }
      else {
	rho_[index] = 1.0 / std::min(dp, -10.0*std::numeric_limits<Real>::min());
      }
    }

    // Return read-only vectors containing the differences between
    // state vectors and gradients at sequential iterations, by
    // slicing off the appropriate row of the matrix
    Vector x_diff(int iter) {
      return x_diff_[iter % ni_];
    };
    Vector gradient_diff(int iter) {
      return gradient_diff_[iter % ni_];
    };

    Real& alpha(int iter) { return alpha_[iter % ni_]; }
    Real rho(int iter) const { return rho_[iter % ni_]; }
    Real gamma(int iter) const { return gamma_[iter % ni_]; }

  private:
    // Data
    int nx_; // Number of state variables
    int ni_; // Number of iterations to store
    int iteration_; // Current iteration
    Matrix x_diff_;
    Matrix gradient_diff_;
    Vector rho_;
    Vector alpha_;
    Vector gamma_;
  };


  // Unconstrained minimization of the cost function embodied in
  // "optimizable" using the Limited-Memory
  // Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) algorithm.  "x"
  // supplies the initial state vector and is where the line search
  // stores each new state.  The final value of status_ is returned.
  MinimizerStatus
  Minimizer::minimize_limited_memory_bfgs(Optimizable& optimizable, Vector x)
  {
    const int nx = x.size();

    // Reset the members recording the progress of the minimization
    n_iterations_ = 0;
    n_samples_ = 0;
    status_ = MINIMIZER_STATUS_NOT_YET_CONVERGED;
    cost_function_ = std::numeric_limits<Real>::infinity();

    // Work space
    Vector previous_x(nx);
    Vector gradient(nx);
    Vector previous_gradient(nx);
    Vector direction(nx);
    Vector test_x(nx); // Used by the line search only

    // Storage for the previous states used by the L-BFGS update; no
    // more than one per state variable
    const int n_states = std::min(nx, lbfgs_n_states_);
    LbfgsData data(nx, n_states);

    // How well does the most recent calculation by "optimizable"
    // match the current contents of x?  -1: it does not; 0: cost
    // function matches; 1: cost function and gradient match; 2: cost
    // function, gradient and Hessian match.
    int state_up_to_date = -1;

    // First step size is the user-specified maximum if positive,
    // otherwise unity
    Real step_size = 1.0;
    if (max_step_size_ > 0.0) {
      step_size = max_step_size_;
    }

    // Main loop
    while (status_ == MINIMIZER_STATUS_NOT_YET_CONVERGED) {

      // The cost function and gradient only need to be computed here
      // if the line search did not leave them consistent with the
      // current state vector
      if (state_up_to_date < 1) {
        cost_function_ = optimizable.calc_cost_function_gradient(x, gradient);
        state_up_to_date = 1;
        ++n_samples_;

        if (n_iterations_ == 0) {
          start_cost_function_ = cost_function_;
        }
      }

      // Whether just computed or inherited from the line search, the
      // cost function and gradient must be finite to continue
      if (!std::isfinite(cost_function_)) {
        status_ = MINIMIZER_STATUS_INVALID_COST_FUNCTION;
        break;
      }
      if (any(!isfinite(gradient))) {
        status_ = MINIMIZER_STATUS_INVALID_GRADIENT;
        break;
      }

      // The L2 norm of the gradient measures how "flat" the cost
      // function is here; pass it to the user's progress function
      // and then test it against the convergence threshold
      gradient_norm_ = norm2(gradient);
      optimizable.report_progress(n_iterations_, x, cost_function_, gradient_norm_);
      if (gradient_norm_ <= converged_gradient_norm_) {
        status_ = MINIMIZER_STATUS_SUCCESS;
        break;
      }

      const bool have_previous_state = (n_iterations_ > 0);

      // Add the latest state and gradient differences to the store
      if (have_previous_state) {
        data.store(n_iterations_, x-previous_x, gradient-previous_gradient);
      }

      // Find search direction: see page 779 of Nocedal (1980):
      // Updating quasi-Newton matrices with limited
      // storage. Mathematics of Computation, 35, 773-782.
      direction = gradient;
      if (!have_previous_state) {
        // Nothing stored yet: steepest descent with a length of
        // step_size
        direction = -gradient * (step_size / norm2(gradient));
      }
      else {
        // Oldest iteration still held in the store
        const int oldest = std::max(0,n_iterations_-n_states);

        // First pass: newest to oldest
        for (int k = n_iterations_-1; k >= oldest; --k) {
          data.alpha(k) = data.rho(k)
            * dot_product(data.x_diff(k), direction);
          direction -= data.alpha(k) * data.gradient_diff(k);
        }

        // Scale using the most recent differences
        Real gamma = dot_product(x-previous_x, gradient-previous_gradient)
          / std::max(10.0*std::numeric_limits<Real>::min(),
                     dot_product(gradient-previous_gradient, gradient-previous_gradient));
        direction *= gamma;

        // Second pass: oldest to newest
        for (int k = oldest; k < n_iterations_; ++k) {
          Real beta = data.rho(k) * dot_product(data.gradient_diff(k), direction);
          direction += data.x_diff(k) * (data.alpha(k)-beta);
        }

        direction = -direction;
      }

      // Remember the state and gradient for the next iteration
      previous_x = x;
      previous_gradient = gradient;

      // In the early iterations the line search is required to be
      // more accurate, since the L-BFGS update has fewer states from
      // which to estimate the minimum: interpolate between the
      // Conjugate Gradient and L-BFGS curvature coefficients
      Real curvature_coeff = lbfgs_curvature_coeff_;
      if (n_iterations_ < n_states) {
        curvature_coeff = (cg_curvature_coeff_ * (n_states-n_iterations_)
                           + lbfgs_curvature_coeff_ * n_iterations_)
          / n_states;
      }

      // Direction points to the best estimate of the actual location
      // of the minimum, so the step size is the norm of the direction
      // vector
      step_size = norm2(direction);

      // Perform line search, storing new state vector in x
      const MinimizerStatus ls_status
        = line_search(optimizable, x, direction,
                      test_x, step_size, gradient, state_up_to_date,
                      curvature_coeff);

      // A successful line search means carry on to the next
      // iteration; any other outcome is an unrecoverable failure
      // whose status is returned to the calling function
      status_ = (ls_status == MINIMIZER_STATUS_SUCCESS)
        ? MINIMIZER_STATUS_NOT_YET_CONVERGED : ls_status;

      ++n_iterations_;
      if (status_ == MINIMIZER_STATUS_NOT_YET_CONVERGED
          && n_iterations_ >= max_iterations_) {
        status_ = MINIMIZER_STATUS_MAX_ITERATIONS_REACHED;
      }

      // End of main loop: if status_ is anything other than
      // MINIMIZER_STATUS_NOT_YET_CONVERGED then no more iterations
      // are performed
    }

    if (state_up_to_date >= ensure_updated_state_) {
      // The most recent calculation is already as consistent with x
      // as the user requires
      return status_;
    }

    // Otherwise repeat the calculation at the returned state vector
    if (ensure_updated_state_ > 0) {
      // User wants at least the first derivative
      cost_function_ = optimizable.calc_cost_function_gradient(x, gradient);
    }
    else {
      // User does not need derivatives to have been computed
      cost_function_ = optimizable.calc_cost_function(x);
    }

    return status_;
  }

  // Minimize the cost function embodied in "optimizable" using the
  // Limited-Memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS)
  // algorithm, where "x" is the initial state vector and also where
  // the solution is stored, subject to the constraint that x lies
  // between min_x and max_x.  MINIMIZER_STATUS_INVALID_BOUNDS is
  // returned, before anything else is done, if min_x or max_x differ
  // in length from x or if any element of min_x is not less than the
  // corresponding element of max_x; otherwise the final value of
  // status_ is returned.
  MinimizerStatus
  Minimizer::minimize_limited_memory_bfgs_bounded(Optimizable& optimizable, Vector x,
                                                  const Vector& min_x,
                                                  const Vector& max_x)
  {
    // The sizes are checked first so that the element-wise comparison
    // of the bounds is only evaluated on vectors of the same length
    if (min_x.size() != x.size()
        || max_x.size() != x.size()
        || any(min_x >= max_x)) {
      return MINIMIZER_STATUS_INVALID_BOUNDS;
    }

    int nx = x.size();

    // Initial values
    n_iterations_ = 0;
    n_samples_ = 0;
    status_ = MINIMIZER_STATUS_NOT_YET_CONVERGED;
    cost_function_ = std::numeric_limits<Real>::infinity();

    Vector previous_x(nx);
    Vector gradient(nx);
    Vector previous_gradient(nx);
    Vector direction(nx);
    Vector test_x(nx); // Used by the line search only

    // Previous states needed by the L-BFGS algorithm
    int n_states = std::min(nx, lbfgs_n_states_);
    LbfgsData data(nx, n_states);

    // Which state variables are at the minimum bound (-1), maximum
    // bound (1) or free (0)?
    intVector bound_status(nx);
    bound_status = 0;

    // Ensure that initial x lies within the specified bounds
    bound_status.where(x >= max_x) =  1;
    bound_status.where(x <= min_x) = -1;
    x = max(min_x, min(x, max_x));

    int nbound = count(bound_status != 0);
    int nfree  = nx - nbound;

    // If we reach a bound we need to restart the L-BFGS storage, so
    // store the iteration at the last restart
    int iteration_last_restart = 0;

    // Does the last calculation of the cost function in "optimizable"
    // match the current contents of the state vector x? -1=no, 0=yes,
    // 1=yes and the last calculation included the gradient, 2=yes and
    // the last calculation included gradient and Hessian.
    int state_up_to_date = -1;

    // Initial step size
    Real step_size = 1.0;
    if (max_step_size_ > 0.0) {
      step_size = max_step_size_;
    }

    // Main loop
    while (status_ == MINIMIZER_STATUS_NOT_YET_CONVERGED) {

      // If the last line search found a minimum along the lines
      // satisfying the Wolfe conditions, then the current cost
      // function and gradient will be consistent with the current
      // state vector.  Otherwise we need to compute them.
      if (state_up_to_date < 1) {
        cost_function_ = optimizable.calc_cost_function_gradient(x, gradient);
        state_up_to_date = 1;
        ++n_samples_;

        if (n_iterations_ == 0) {
          start_cost_function_ = cost_function_;
        }

        // Check cost function and gradient are finite
        if (!std::isfinite(cost_function_)) {
          status_ = MINIMIZER_STATUS_INVALID_COST_FUNCTION;
          break;
        }
        else if (any(!isfinite(gradient))) {
          status_ = MINIMIZER_STATUS_INVALID_GRADIENT;
          break;
        }
      }

      // Check whether the bound status of each state variable is
      // consistent with the gradient if a steepest descent were to be
      // taken, and if not release the variable and flag a restart of
      // the L-BFGS storage
      if (any(bound_status == -1 && gradient < 0.0)
          || any(bound_status == 1 && gradient > 0.0)) {
        bound_status.where(bound_status == -1 && gradient < 0.0) = 0;
        bound_status.where(bound_status ==  1 && gradient > 0.0) = 0;
        iteration_last_restart = n_iterations_;
      }
      nbound = count(bound_status != 0);
      nfree = nx - nbound;

      // Set gradient at bound points to zero
      gradient.where(bound_status != 0) = 0.0;

      // Compute L2 norm of gradient to see how "flat" the environment
      // is
      if (nfree > 0) {
        gradient_norm_ = norm2(gradient);
      }
      else {
        // If no dimensions are in play we are at a corner of the
        // bounds and the gradient is pointing into the corner: we
        // have reached a minimum in the cost function subject to the
        // bounds so have converged
        gradient_norm_ = 0.0;
      }

      // Report progress using user-defined function
      optimizable.report_progress(n_iterations_, x, cost_function_, gradient_norm_);

      // Convergence has been achieved if the L2 norm has been reduced
      // to a user-specified threshold
      if (gradient_norm_ <= converged_gradient_norm_) {
        status_ = MINIMIZER_STATUS_SUCCESS;
        break;
      }

      // Store state and gradient differences
      if (n_iterations_ > iteration_last_restart) {
        data.store(n_iterations_, x-previous_x, gradient-previous_gradient);
      }

      // Find search direction: see page 779 of Nocedal (1980):
      // Updating quasi-Newton matrices with limited
      // storage. Mathematics of Computation, 35, 773-782.  Only
      // iterations since the last restart are used.
      direction = gradient;
      if (n_iterations_ > iteration_last_restart) {

        for (int ii = n_iterations_-1;
             ii >= std::max(iteration_last_restart,n_iterations_-n_states);
             --ii) {
          data.alpha(ii) = data.rho(ii)
            * dot_product(data.x_diff(ii), direction);
          direction -= data.alpha(ii) * data.gradient_diff(ii);
        }

        Real gamma = dot_product(x-previous_x, gradient-previous_gradient)
          / std::max(10.0*std::numeric_limits<Real>::min(),
                     dot_product(gradient-previous_gradient, gradient-previous_gradient));
        direction *= gamma;

        for (int ii = std::max(iteration_last_restart,n_iterations_-n_states);
             ii < n_iterations_;
             ++ii) {
          Real beta = data.rho(ii) * dot_product(data.gradient_diff(ii), direction);
          direction += data.x_diff(ii) * (data.alpha(ii)-beta);
        }

        direction = -direction;
      }
      else {
        // We are either at the first iteration or have restarted
        // having changed the bound dimensions: use steepest descent
        direction = -gradient * (step_size / norm2(gradient));
      }

      // Store state and gradient
      previous_x = x;
      previous_gradient = gradient;

      // Select the curvature coefficient for the line search
      Real curvature_coeff = lbfgs_curvature_coeff_;
      int n_stored_iterations = n_iterations_ - iteration_last_restart;
      if (n_stored_iterations < n_states) {
        // In the early iterations we require the line search to be
        // more accurate since the L-BFGS update will have fewer
        // states to make a good estimate of the minimum; interpolate
        // between the Conjugate Gradient and L-BFGS curvature
        // coefficients
        curvature_coeff = (cg_curvature_coeff_ * (n_states-n_stored_iterations)
                           + lbfgs_curvature_coeff_ * n_stored_iterations)
          / n_states;
      }

      // Direction points to the best estimate of the actual location
      // of the minimum, so the step size is the norm of the direction
      // vector
      step_size = norm2(direction);

      // Distance to the nearest bound
      Real dir_scaling = step_size;
      Real bound_step_size = std::numeric_limits<Real>::max();
      int i_nearest_bound = -1;
      int i_bound_type = 0;
      // Work out the maximum step size along "direction" before a
      // bound is met... there must be a faster way to do this.  A
      // bound equal in magnitude to the largest representable number
      // is treated as no bound at all.
      for (int ix = 0; ix < nx; ++ix) {
        if (direction(ix) > 0.0 && max_x(ix) < std::numeric_limits<Real>::max()) {
          Real local_bound_step_size = dir_scaling*(max_x(ix)-x(ix))/direction(ix);
          if (bound_step_size >= local_bound_step_size) {
            bound_step_size = local_bound_step_size;
            i_nearest_bound = ix;
            i_bound_type = 1;
          }
        }
        else if (direction(ix) < 0.0 && min_x(ix) > -std::numeric_limits<Real>::max()) {
          Real local_bound_step_size = dir_scaling*(min_x(ix)-x(ix))/direction(ix);
          if (bound_step_size >= local_bound_step_size) {
            bound_step_size = local_bound_step_size;
            i_nearest_bound = ix;
            i_bound_type = -1;
          }
        }
      }

      MinimizerStatus ls_status; // line-search outcome
      if (i_nearest_bound >= 0) {
        // Perform line search, storing new state vector in x, and
        // passing in the step size at which the nearest bound is met
        ls_status = line_search(optimizable, x, direction,
                                test_x, step_size, gradient, state_up_to_date,
                                curvature_coeff, bound_step_size);
        if (ls_status == MINIMIZER_STATUS_BOUND_REACHED) {
          bound_status(i_nearest_bound) = i_bound_type;
          // Restart the L-BFGS storage
          iteration_last_restart = n_iterations_+1;
          ls_status = MINIMIZER_STATUS_SUCCESS;
        }
      }
      else {
        // Perform line search, storing new state vector in x
        ls_status = line_search(optimizable, x, direction,
                                test_x, step_size, gradient, state_up_to_date,
                                curvature_coeff);
      }

      if (ls_status == MINIMIZER_STATUS_SUCCESS) {
        // Successfully minimized along search direction: continue to
        // next iteration
        status_ = MINIMIZER_STATUS_NOT_YET_CONVERGED;
      }
      else {
        // Unrecoverable failure in line-search: return status to
        // calling function
        status_ = ls_status;
      }

      ++n_iterations_;
      if (status_ == MINIMIZER_STATUS_NOT_YET_CONVERGED
          && n_iterations_ >= max_iterations_) {
        status_ = MINIMIZER_STATUS_MAX_ITERATIONS_REACHED;
      }

      // End of main loop: if status_ is anything other than
      // MINIMIZER_STATUS_NOT_YET_CONVERGED then no more iterations
      // are performed
    }

    if (state_up_to_date < ensure_updated_state_) {
      // The last call to calc_cost_function* was not with the state
      // vector returned to the user, and they want it to be.
      if (ensure_updated_state_ > 0) {
        // User wants at least the first derivative
        cost_function_ = optimizable.calc_cost_function_gradient(x, gradient);
      }
      else {
        // User does not need derivatives to have been computed
        cost_function_ = optimizable.calc_cost_function(x);
      }
    }

    return status_;
  }

};


================================================
FILE: adept/settings.cpp
================================================
/* settings.cpp -- View/change the overall Adept settings

    Copyright (C) 2016 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#include <sstream>
#include <cstring>

#include <adept/base.h>
#include <adept/settings.h>

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef HAVE_OPENBLAS_CBLAS_HEADER
#include <cblas.h>
#endif

namespace adept {

  // -------------------------------------------------------------------
  // Get compile-time settings
  // -------------------------------------------------------------------

  // Report the Adept version string that was fixed when the library
  // was compiled
  std::string
  version()
  {
    return std::string(ADEPT_VERSION_STR);
  }

  // Return the compiler used to compile the Adept library (e.g. "g++
  // [4.3.2]", "clang++ [14.0.6]" or "Microsoft Visual C++ [1800]").
  // The name is taken from the CXX macro (normally defined in
  // config.h) when it is available; the bracketed version number is
  // taken from the compiler's own predefined macros.
  std::string
  compiler_version()
  {
#ifdef CXX
    std::string cv = CXX; // Defined in config.h
#elif defined(_MSC_VER)
    std::string cv = "Microsoft Visual C++";
#else
    std::string cv = "unknown";
#endif

    // Two-level stringification so that the version macros are
    // expanded before being turned into string literals.  The helper
    // names are prefixed so that defining and undefining them here
    // cannot clobber a macro belonging to user code when this source
    // is compiled as part of a generated header.
#define ADEPT_SETTINGS_STRINGIFY_RAW(A) #A
#define ADEPT_SETTINGS_STRINGIFY(A) ADEPT_SETTINGS_STRINGIFY_RAW(A)
#define ADEPT_SETTINGS_STRINGIFY3(A,B,C)                                \
    ADEPT_SETTINGS_STRINGIFY(A) "." ADEPT_SETTINGS_STRINGIFY(B) "."     \
    ADEPT_SETTINGS_STRINGIFY(C)

#if defined(__clang__)
    // Clang must be tested before __GNUC__: it defines the GNU
    // version macros too (as a GCC-compatibility value), so using
    // them would report a version unrelated to the real compiler.
    cv += " [" ADEPT_SETTINGS_STRINGIFY3(__clang_major__,__clang_minor__,__clang_patchlevel__) "]";
#elif defined(__GNUC__)
    cv += " [" ADEPT_SETTINGS_STRINGIFY3(__GNUC__,__GNUC_MINOR__,__GNUC_PATCHLEVEL__) "]";
#elif defined(_MSC_VER)
    cv += " [" ADEPT_SETTINGS_STRINGIFY(_MSC_VER) "]";
#endif

#undef ADEPT_SETTINGS_STRINGIFY3
#undef ADEPT_SETTINGS_STRINGIFY
#undef ADEPT_SETTINGS_STRINGIFY_RAW

    return cv;
  }

  // Report the flags that were passed to the compiler when the Adept
  // library was built (e.g. "-Wall -g -O3"), or "unknown" if they
  // were not recorded at configure time
  std::string
  compiler_flags()
  {
#ifdef CXXFLAGS
    std::string flags = CXXFLAGS; // Defined in config.h
#else
    std::string flags = "unknown";
#endif
    return flags;
  }

  // Build a human-readable, multi-line summary of how this copy of
  // Adept was configured: version, compiler and flags, the BLAS
  // library (if BLAS_LIBS was recorded), the BLAS thread limit (if
  // the OpenBLAS header was available) and the Jacobian block size.
  std::string
  configuration()
  {
    std::ostringstream report;
    report << "Adept version " << adept::version() << ":\n"
           << "  Compiled with " << adept::compiler_version() << "\n"
           << "  Compiler flags \"" << adept::compiler_flags() << "\"\n";
#ifdef BLAS_LIBS
    // Anything after the first two characters of BLAS_LIBS is
    // reported as the library name; a string of two characters or
    // fewer is reported as the built-in library
    const char* const blas_option = BLAS_LIBS;
    if (std::strlen(blas_option) <= 2) {
      report << "  BLAS support from built-in library\n";
    }
    else {
      report << "  BLAS support from " << (blas_option + 2) << " library\n";
    }
#endif
#ifdef HAVE_OPENBLAS_CBLAS_HEADER
    report << "  Number of BLAS threads may be specified up to maximum of "
           << max_blas_threads() << "\n";
#endif
    report << "  Jacobians processed in blocks of size "
           << ADEPT_MULTIPASS_SIZE << "\n";
    return report.str();
  }


  // -------------------------------------------------------------------
  // Get/set number of threads for array operations
  // -------------------------------------------------------------------

  // Get the maximum number of threads available for BLAS operations:
  // the value reported by OpenBLAS if its header was available at
  // compile time, otherwise 1
  int
  max_blas_threads()
  {
    int n_threads = 1;
#ifdef HAVE_OPENBLAS_CBLAS_HEADER
    n_threads = openblas_get_num_threads();
#endif
    return n_threads;
  }

  // Set the maximum number of threads available for BLAS operations
  // (zero means use the maximum sensible number on the current
  // system), and return the number actually set. Note that OpenBLAS
  // uses pthreads and the Jacobian calculation uses OpenMP - this can
  // lead to inefficient behaviour so if you are computing Jacobians
  // then you may get better performance by setting the number of
  // array threads to one.
  int
  set_max_blas_threads(int n)
  {
#ifndef HAVE_OPENBLAS_CBLAS_HEADER
    // No OpenBLAS header at compile time: the request is ignored and
    // a single thread is reported
    static_cast<void>(n);
    return 1;
#else
    openblas_set_num_threads(n);
    return openblas_get_num_threads();
#endif
  }

  // Was the library compiled with matrix multiplication support (from
  // BLAS)?  Decided at compile time by whether HAVE_BLAS is defined.
  bool
  have_matrix_multiplication() {
#ifdef HAVE_BLAS
    const bool blas_available = true;
#else
    const bool blas_available = false;
#endif
    return blas_available;
  }

  // Was the library compiled with linear algebra support (e.g. inv
  // and solve from LAPACK)?  Decided at compile time by whether
  // HAVE_LAPACK is defined.
  bool
  have_linear_algebra() {
#ifdef HAVE_LAPACK
    const bool lapack_available = true;
#else
    const bool lapack_available = false;
#endif
    return lapack_available;
  }

} // End namespace adept


================================================
FILE: adept/solve.cpp
================================================
/* solve.cpp -- Solve systems of linear equations using LAPACK

    Copyright (C) 2015-2016 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/
                             

#include <vector>

#include <adept/solve.h>
#include <adept/Array.h>
#include <adept/SpecialMatrix.h>

// If AdeptSource_H is defined then we are in a header file generated
// from all the source files, so cpplapack.h will already have been
// included
#ifndef AdeptSource_H
#include "cpplapack.h"
#endif

#ifdef HAVE_LAPACK

namespace adept {

  using namespace internal;
  
  // -------------------------------------------------------------------
  // Solve Ax = b for general square matrix A
  // -------------------------------------------------------------------
  // The inputs are copied into local arrays that are handed to
  // cpplapack_gesv, and the local right-hand side is then returned as
  // the solution.  Throws matrix_ill_conditioned if the status
  // returned by cpplapack_gesv is non-zero.
  template <typename T>
  Array<1,T,false> 
  solve(const Array<2,T,false>& A, const Array<1,T,false>& b) {
    // Column-major working copy of the matrix (LAPACK is more
    // efficient with column-major input)
    Array<2,T,false> lhs;
    lhs.resize_column_major(A.dimensions());
    lhs = A;

    // Working copy of the right-hand side; default-constructed and
    // then assigned, in the same way as the matrix above
    Array<1,T,false> rhs;
    rhs = b;

    std::vector<lapack_int> pivots(lhs.dimension(0));

    const lapack_int info = cpplapack_gesv(lhs.dimension(0), 1,
                                           lhs.data(), lhs.offset(1),
                                           &pivots[0],
                                           rhs.data(), rhs.dimension(0));
    if (info == 0) {
      return rhs;
    }

    std::stringstream s;
    s << "Failed to solve general system of equations: LAPACK ?gesv returned code " << info;
    throw(matrix_ill_conditioned(s.str() ADEPT_EXCEPTION_LOCATION));
  }

  // -------------------------------------------------------------------
  // Solve AX = B for general square matrix A and rectangular matrix B
  // -------------------------------------------------------------------
  // The inputs are copied into local column-major arrays that are
  // handed to cpplapack_gesv, and the local right-hand side is then
  // returned as the solution.  Throws matrix_ill_conditioned if the
  // status returned by cpplapack_gesv is non-zero.
  template <typename T>
  Array<2,T,false> 
  solve(const Array<2,T,false>& A, const Array<2,T,false>& B) {
    // Column-major working copy of the matrix (LAPACK is more
    // efficient with column-major input)
    Array<2,T,false> lhs;
    lhs.resize_column_major(A.dimensions());
    lhs = A;

    // Column-major working copy of the right-hand sides
    Array<2,T,false> rhs;
    rhs.resize_column_major(B.dimensions());
    rhs = B;

    std::vector<lapack_int> pivots(lhs.dimension(0));

    const lapack_int info = cpplapack_gesv(lhs.dimension(0), B.dimension(1),
                                           lhs.data(), lhs.offset(1),
                                           &pivots[0],
                                           rhs.data(), rhs.offset(1));
    if (info == 0) {
      return rhs;
    }

    std::stringstream s;
    s << "Failed to solve general system of equations for matrix RHS: LAPACK ?gesv returned code " << info;
    throw(matrix_ill_conditioned(s.str() ADEPT_EXCEPTION_LOCATION));
  }


  // -------------------------------------------------------------------
  // Solve Ax = b for symmetric square matrix A
  // -------------------------------------------------------------------
  // The inputs are copied into local arrays that are handed to
  // cpplapack_sysv, and the local right-hand side is returned as the
  // solution.  If the symmetric solver reports a non-zero status, a
  // warning is written to std::cerr and the system is solved again
  // with the general-matrix solver, which throws
  // matrix_ill_conditioned if it also fails.
  template <typename T, SymmMatrixOrientation Orient>
  Array<1,T,false>
  solve(const SpecialMatrix<T,SymmEngine<Orient>,false>& A,
	const Array<1,T,false>& b) {
    SpecialMatrix<T,SymmEngine<Orient>,false> A_;
    Array<1,T,false> b_;

    // Work on copies: the arrays passed to LAPACK ?sysv are
    // overwritten (the matrix by its factorization, the right-hand
    // side by the solution), so the caller's A and b must not be
    // handed over directly
    A_.resize(A.dimension());
    A_ = A;
    b_ = b;

    // Treat symmetric matrix as column-major
    const char uplo = (Orient == ROW_LOWER_COL_UPPER) ? 'U' : 'L';

    std::vector<lapack_int> ipiv(A_.dimension());

    lapack_int status = cpplapack_sysv(uplo, A_.dimension(0), 1,
				       A_.data(), A_.offset(), &ipiv[0],
				       b_.data(), b_.dimension(0));

    if (status != 0) {
      std::cerr << "Warning: LAPACK solve symmetric system failed (?sysv): trying general (?gesv)\n";
      // Fall back using the caller's untouched inputs: by this point
      // A_ has been handed to ?sysv and may hold a factorization
      // rather than the original matrix, so solving with it would
      // silently solve a different system
      return solve(Array<2,T,false>(A), b);
    }
    return b_;    
  }


  // -------------------------------------------------------------------
  // Solve AX = B for symmetric square matrix A
  // -------------------------------------------------------------------
  // The inputs are copied into local arrays that are handed to
  // cpplapack_sysv, and the local right-hand side is then returned as
  // the solution.  Throws matrix_ill_conditioned if the status
  // returned by cpplapack_sysv is non-zero.
  template <typename T, SymmMatrixOrientation Orient>
  Array<2,T,false>
  solve(const SpecialMatrix<T,SymmEngine<Orient>,false>& A,
        const Array<2,T,false>& B) {
    // Working copy of the symmetric matrix
    SpecialMatrix<T,SymmEngine<Orient>,false> lhs;
    lhs.resize(A.dimension());
    lhs = A;

    // Column-major working copy of the right-hand sides
    Array<2,T,false> rhs;
    rhs.resize_column_major(B.dimensions());
    rhs = B;

    // Treat symmetric matrix as column-major
    char uplo = 'L';
    if (Orient == ROW_LOWER_COL_UPPER) {
      uplo = 'U';
    }

    std::vector<lapack_int> pivots(lhs.dimension());

    const lapack_int info = cpplapack_sysv(uplo, lhs.dimension(0),
                                           B.dimension(1),
                                           lhs.data(), lhs.offset(),
                                           &pivots[0],
                                           rhs.data(), rhs.offset(1));
    if (info == 0) {
      return rhs;
    }

    std::stringstream s;
    s << "Failed to solve symmetric system of equations with matrix RHS: LAPACK ?sysv returned code " << info;
    throw(matrix_ill_conditioned(s.str() ADEPT_EXCEPTION_LOCATION));
  }

}

#else

namespace adept {

  using namespace internal;

  // The library was compiled without LAPACK, so every overload of
  // solve below simply throws feature_not_available; the arguments
  // are never inspected.

  // -------------------------------------------------------------------
  // Solve Ax = b for general square matrix A
  // -------------------------------------------------------------------
  template <typename T>
  Array<1,T,false>
  solve(const Array<2,T,false>& /* A */, const Array<1,T,false>& /* b */)
  {
    throw feature_not_available("Cannot solve linear equations because compiled without LAPACK");
  }

  // -------------------------------------------------------------------
  // Solve AX = B for general square matrix A and rectangular matrix B
  // -------------------------------------------------------------------
  template <typename T>
  Array<2,T,false>
  solve(const Array<2,T,false>& /* A */, const Array<2,T,false>& /* B */)
  {
    throw feature_not_available("Cannot solve linear equations because compiled without LAPACK");
  }

  // -------------------------------------------------------------------
  // Solve Ax = b for symmetric square matrix A
  // -------------------------------------------------------------------
  template <typename T, SymmMatrixOrientation Orient>
  Array<1,T,false>
  solve(const SpecialMatrix<T,SymmEngine<Orient>,false>& /* A */,
        const Array<1,T,false>& /* b */)
  {
    throw feature_not_available("Cannot solve linear equations because compiled without LAPACK");
  }

  // -------------------------------------------------------------------
  // Solve AX = B for symmetric square matrix A
  // -------------------------------------------------------------------
  template <typename T, SymmMatrixOrientation Orient>
  Array<2,T,false>
  solve(const SpecialMatrix<T,SymmEngine<Orient>,false>& /* A */,
        const Array<2,T,false>& /* B */)
  {
    throw feature_not_available("Cannot solve linear equations because compiled without LAPACK");
  }

}

#endif


namespace adept {

  // -------------------------------------------------------------------
  // Explicit instantiations
  // -------------------------------------------------------------------
  // The solve templates are defined in this source file, so each
  // supported combination is instantiated explicitly here.
  // ADEPT_EXPLICIT_SOLVE(TYPE,RRANK) instantiates, for element type
  // TYPE and right-hand-side rank RRANK, the general-matrix solve
  // plus the symmetric-matrix solve for both symmetric orientations
  // (ROW_LOWER_COL_UPPER and ROW_UPPER_COL_LOWER).  It is applied
  // below to float and double with vector (rank 1) and matrix (rank
  // 2) right-hand sides, and undefined again afterwards.
#define ADEPT_EXPLICIT_SOLVE(TYPE,RRANK)				\
  template Array<RRANK,TYPE,false>					\
  solve(const Array<2,TYPE,false>& A, const Array<RRANK,TYPE,false>& b); \
  template Array<RRANK,TYPE,false>					\
  solve(const SpecialMatrix<TYPE,SymmEngine<ROW_LOWER_COL_UPPER>,false>& A, \
	const Array<RRANK,TYPE,false>& b);					\
  template Array<RRANK,TYPE,false>					\
  solve(const SpecialMatrix<TYPE,SymmEngine<ROW_UPPER_COL_LOWER>,false>& A, \
	const Array<RRANK,TYPE,false>& b);

  ADEPT_EXPLICIT_SOLVE(float,1)
  ADEPT_EXPLICIT_SOLVE(float,2)
  ADEPT_EXPLICIT_SOLVE(double,1)
  ADEPT_EXPLICIT_SOLVE(double,2)
#undef ADEPT_EXPLICIT_SOLVE

}


================================================
FILE: adept/vector_utilities.cpp
================================================
/* vector_utilities.cpp -- Vector utility functions

    Copyright (C) 2016 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#include <adept/vector_utilities.h>

namespace adept {

  // Return a vector of n values evenly spaced from x1 to x2
  // inclusive.  For n=1 the single value x1 is returned, which is
  // only permitted when x1 equals x2 (otherwise invalid_operation is
  // thrown).  For n<1 the array is returned exactly as constructed,
  // with no elements assigned.
  Array<1,Real,false>
  linspace(Real x1, Real x2, Index n) {
    Array<1,Real,false> ans(n);
    if (n == 1) {
      if (x1 != x2) {
        throw(invalid_operation("linspace(x1,x2,n) with n=1 only valid if x1=x2"));
      }
      ans(0) = x1;
    }
    else if (n > 1) {
      for (Index i = 0; i < n; ++i) {
        ans(i) = x1 + (x2-x1)*i / static_cast<Real>(n-1);
      }
    }
    return ans;
  }

}


================================================
FILE: benchmark/Makefile.am
================================================
# Benchmark programs.  They are listed in check_PROGRAMS, so they are
# built by "make check" rather than by a plain "make".
check_PROGRAMS = autodiff_benchmark animate matrix_benchmark math_benchmark
autodiff_benchmark_SOURCES = autodiff_benchmark.cpp \
	differentiator.h advection_schemes.h \
	advection_schemes_AD.h advection_schemes_K.h nx.h

# Headers come from the source tree, but the freshly built Adept
# library lives in the build tree, so the library search path must use
# top_builddir (the two are the same directory for an in-tree build,
# but differ for an out-of-tree build).
autodiff_benchmark_CPPFLAGS = -I@top_srcdir@/include
autodiff_benchmark_LDFLAGS = -static -no-install -L@top_builddir@/adept/.libs
autodiff_benchmark_LDADD = -ladept

animate_SOURCES = animate.cpp
animate_CPPFLAGS = -I@top_srcdir@/include

matrix_benchmark_SOURCES = matrix_benchmark.cpp
matrix_benchmark_CPPFLAGS = -I@top_srcdir@/include
matrix_benchmark_LDFLAGS = -static -no-install -L@top_builddir@/adept/.libs
matrix_benchmark_LDADD = -ladept

math_benchmark_SOURCES = math_benchmark.cpp
math_benchmark_CPPFLAGS = -I@top_srcdir@/include
math_benchmark_LDFLAGS = -static -no-install -L@top_builddir@/adept/.libs
math_benchmark_LDADD = -ladept


================================================
FILE: benchmark/advection_schemes.h
================================================
/* advection_schemes.h - Two test advection algorithms from the Adept paper

  Copyright (C) 2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

// Use templates so that these functions can be easily compiled with
// different automatic differentiation tools in order that the
// performance of these tools can be compared.

#ifndef ADVECTION_SCHEMES_H
#define ADVECTION_SCHEMES_H 1

#include <cmath>

// Use a fixed problem size
#include "nx.h"

// Lax-Wendroff scheme applied to linear advection: starting from
// q_init, advance the NX-point field by nt timesteps using the
// coefficient c, leaving the result in q
template <class aReal, typename Real>
void lax_wendroff(int nt, Real c, const aReal q_init[NX], aReal q[NX]) {
  aReal flux[NX-1];                          // Fluxes between boxes

  // Initialize q
  for (int ix = 0; ix < NX; ++ix) {
    q[ix] = q_init[ix];
  }

  // Main loop in time
  for (int it = 0; it < nt; ++it) {
    // Flux across each interface between neighbouring boxes
    for (int ix = 0; ix < NX-1; ++ix) {
      flux[ix] = 0.5*c*(q[ix]+q[ix+1]+c*(q[ix]-q[ix+1]));
    }
    // Update the interior boxes from the flux difference
    for (int ix = 1; ix < NX-1; ++ix) {
      q[ix] += flux[ix-1]-flux[ix];
    }
    // Treat boundary conditions: each end box copies the interior
    // box next to the opposite end
    q[0] = q[NX-2];
    q[NX-1] = q[1];
  }
}

// Toon advection scheme applied to linear advection: starting from
// q_init, advance the NX-point field by nt timesteps using the
// coefficient c, leaving the result in q
template <class aReal, typename Real>
void toon(int nt, Real c, const aReal q_init[NX], aReal q[NX]) {
  aReal flux[NX-1];                          // Fluxes between boxes

  // Initialize q
  for (int ix = 0; ix < NX; ++ix) {
    q[ix] = q_init[ix];
  }

  // Main loop in time
  for (int it = 0; it < nt; ++it) {
    // Note that there is no guard here against adjacent points
    // being nearly equal, in which case the expression below tends
    // to 0/0.  A guarded version that fell back to the upwind flux
    // c*q[i] when the difference was small has been disabled; it
    // needed an absolute-value test and the "fabs" function is not
    // always available in CppAD.
    for (int ix = 0; ix < NX-1; ++ix) {
      flux[ix] = (exp(c*log(q[ix]/q[ix+1]))-1.0)
        * q[ix]*q[ix+1] / (q[ix]-q[ix+1]);
    }
    // Update the interior boxes from the flux difference
    for (int ix = 1; ix < NX-1; ++ix) {
      q[ix] += flux[ix-1]-flux[ix];
    }
    // Treat boundary conditions: each end box copies the interior
    // box next to the opposite end
    q[0] = q[NX-2];
    q[NX-1] = q[1];
  }
}

#include "adept_arrays.h"


// Compile-time trait used by the vector schemes below:
// is_active<T>::value is true for adept::aReal and false for every
// other type
template <typename T>
struct is_active {
  static const bool value = false;
};

template <>
struct is_active<adept::aReal> {
  static const bool value = true;
};

// Lax-Wendroff scheme applied to linear advection, written with Adept
// array expressions rather than explicit loops over the grid.
// Starting from q_init, the NX-point field is advanced by nt
// timesteps using the coefficient c, and the result is copied to q.
// The working arrays are active only if aReal is adept::aReal (see
// is_active).
template <typename aReal, typename Real>
void lax_wendroff_vector(int nt, Real c, const aReal q_init[NX], 
			 aReal q[NX]) {
  using namespace adept;
  typedef adept::Array<1,Real,::is_active<aReal>::value> my_vector;
  //  typedef adept::Array<1,Real,true> my_vector;
  my_vector Q(NX);    // The field being advected
  my_vector F(NX-1);  // Fluxes between boxes
  // Sub-ranges of Q and F, set up once before the time loop.  The
  // loop below reads and updates the field only through these, so the
  // algorithm relies on them referring to the storage of Q and F
  // rather than holding independent copies.
  // NOTE(review): presumably constructing an Array from a range
  // expression yields such a view - confirm against Adept's Array
  // documentation.
  my_vector Qleft = Q(range(0,end-1));
  my_vector Qright = Q(range(1,end));
  my_vector Qcentre = Q(range(1,end-1));
  my_vector Fleft = F(range(0,end-1));
  my_vector Fright = F(range(1,end));
  for (int i=0; i<NX; i++) Q(i) = q_init[i]; // Initialize q 
  for (int j=0; j<nt; j++) {                 // Main loop in time
    // Flux across each interface, then update of the interior boxes
    F = 0.5*c*(Qleft+Qright+c*(Qleft-Qright));
    Qcentre += Fleft-Fright;
    // Treat boundary conditions
    Q(0) = Q(NX-2);
    Q(NX-1) = Q(1);
  }
  // Copy the final field to the output array
  for (int i=0; i<NX; i++) q[i] = Q(i);
}

// Toon advection scheme applied to linear advection, written with
// Adept array expressions rather than explicit loops over the grid.
// Starting from q_init, the NX-point field is advanced by nt
// timesteps using the coefficient c, and the result is copied to q.
// The working arrays are active only if aReal is adept::aReal (see
// is_active).
template <class aReal, typename Real>
void toon_vector(int nt, Real c, const aReal q_init[NX], aReal q[NX]) {
  using namespace adept;
  typedef adept::Array<1,Real,::is_active<aReal>::value> my_vector;
  my_vector Q(NX);    // The field being advected
  my_vector F(NX-1);  // Fluxes between boxes
  // Sub-ranges of Q and F, set up once before the time loop.  The
  // loop below reads and updates the field only through these, so the
  // algorithm relies on them referring to the storage of Q and F
  // rather than holding independent copies.
  // NOTE(review): presumably constructing an Array from a range
  // expression yields such a view - confirm against Adept's Array
  // documentation.
  my_vector Qleft = Q(range(0,end-1));
  my_vector Qright = Q(range(1,end));
  my_vector Qcentre = Q(range(1,end-1));
  my_vector Fleft = F(range(0,end-1));
  my_vector Fright = F(range(1,end));
  for (int i=0; i<NX; i++) Q(i) = q_init[i]; // Initialize q
  for (int j=0; j<nt; j++) {                 // Main loop in time
    // Flux across each interface (no guard against adjacent points
    // being equal), then update of the interior boxes
    F = (exp(c*log(Qleft/Qright))-1.0)
      * Qleft*Qright / (Qleft-Qright);
    Qcentre += Fleft-Fright;
    // Treat boundary conditions
    Q(0) = Q(NX-2);
    Q(NX-1) = Q(1);
  }
  // Copy the final field to the output array
  for (int i=0; i<NX; i++) q[i] = Q(i);
}
#endif


================================================
FILE: benchmark/advection_schemes_AD.h
================================================
/* advection_schemes_AD.h - Header for the hand-coded adjoints

  Copyright (C) 2014 The University of Reading
  Copyright (C) 2018 European Centre for Medium-Range Weather Forecasts

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

#ifndef ADVECTION_SCHEMES_AD_H
#define ADVECTION_SCHEMES_AD_H

#include "nx.h"

// Hand-coded adjoint of Lax-Wendroff advection scheme.
//
//   nt          Number of timesteps
//   c           Coefficient used in the flux calculation
//   q_init      Initial field (input)
//   q           Field after nt timesteps (output of the forward pass)
//   q_AD_const  Adjoint of the final field q (input)
//   q_init_AD   Adjoint of the initial field q_init (output)
//
// The scheme is linear in q, so the reverse pass needs none of the
// intermediate states from the forward pass.
template <typename real>
void lax_wendroff_AD(int nt, real c, const real q_init[NX], real q[NX],
		     const real q_AD_const[NX], real q_init_AD[NX]) {
  real flux[NX-1];

  for (int i = 0; i < NX; i++)   q[i] = q_init[i];

  // Forward pass
  for (int j = 0; j < nt; j++) {
    for (int i = 0; i < NX-1; i++)  flux[i] = 0.5*c*(q[i]+q[i+1]+c*(q[i]-q[i+1]));
    for (int i = 1; i < NX-1; i++)  q[i] += flux[i-1]-flux[i];
    q[0] = q[NX-2]; q[NX-1] = q[1];  // Treat boundary conditions
  }

  // Working adjoints: q_AD starts as the adjoint of the output, and
  // flux_AD starts at zero
  real q_AD[NX];
  real flux_AD[NX-1];
  for (int i = 0; i < NX; i++) q_AD[i] = q_AD_const[i];
  for (int i = 0; i < NX-1; i++) flux_AD[i] = 0.0;
  
  // Reverse pass: the statements of each forward timestep are
  // differentiated in reverse order
  for (int j = nt-1; j >= 0; j--) {
    // Adjoint of the boundary conditions: the end boxes were
    // overwritten, so their adjoints are passed to the boxes they
    // were copied from and then zeroed
    q_AD[NX-2] += q_AD[0];
    q_AD[0] = 0.0;
    q_AD[1] += q_AD[NX-1];
    q_AD[NX-1] = 0.0;

    // Adjoint of "q[i] += flux[i-1]-flux[i]"
    for(int i = 1; i < NX-1; i++) {
      flux_AD[i-1] += q_AD[i];
      flux_AD[i] -= q_AD[i];
      // q_AD[i] is not zeroed here because the forward statement
      // incremented q[i] rather than overwriting it
      //      q_AD[i] = 0.0;
    }
    // Adjoint of the flux calculation: factor1 and factor2 are the
    // derivatives of flux[i] with respect to q[i] and q[i+1]
    real factor1 = 0.5*c*(1.0+c);
    real factor2 = 0.5*c*(1.0-c);
    for (int i = 0; i < NX-1; i++) {
      q_AD[i] += factor1*flux_AD[i];
      q_AD[i+1] += factor2*flux_AD[i];
      flux_AD[i] = 0.0;
    }
  }
  // Return the adjoint of the initial field
  for (int i = 0; i < NX; i++) {
    q_init_AD[i] = q_AD[i];
    q_AD[i] = 0.0;
  }
}

// Hand-coded adjoint of Toon advection scheme.
//
//   nt          Number of timesteps
//   c           Coefficient used in the flux calculation
//   q_init      Initial field (input)
//   q_out       Field after nt timesteps (output of the forward pass)
//   q_AD_const  Adjoint of the final field (input)
//   q_init_AD   Adjoint of the initial field q_init (output)
//
// The flux is nonlinear in q, so the forward pass stores the field at
// every time level (NX*(nt+1) values allocated on the heap) for use
// by the reverse pass.
template <typename real>
void toon_AD(int nt, real c, const real q_init[NX], real q_out[NX],
	     const real q_AD_const[NX], real q_init_AD[NX]) {
  real flux[NX-1];

  // Storage for the field at all nt+1 time levels; q points to the
  // current time level within it
  real* q_save = new real[NX*(nt+1)];
  //  real q_save[NX*(nt+1)];
  real* q = &(q_save[0]);

  for (int i = 0; i < NX; i++)   q[i] = q_init[i];

  // Forward pass
  for (int j = 0; j < nt; j++) {
    for (int i=0; i<NX-1; i++) flux[i] = (exp(c*log(q[i]/q[i+1]))-1.0) 
                                         * q[i]*q[i+1] / (q[i]-q[i+1]);
    // Move on to the next time level; q[i-NX] is then the same box
    // at the previous time level
    q += NX;
    for (int i = 1; i < NX-1; i++)  q[i] = q[i-NX]+flux[i-1]-flux[i];
    q[0] = q[NX-2]; q[NX-1] = q[1];  // Treat boundary conditions
  }

  for (int i = 0; i < NX; i++) q_out[i] = q[i];

  // Working adjoints: q_AD starts as the adjoint of the output, and
  // flux_AD starts at zero
  real q_AD[NX];
  real flux_AD[NX-1];
  for (int i = 0; i < NX; i++) q_AD[i] = q_AD_const[i];
  for (int i = 0; i < NX-1; i++) flux_AD[i] = 0.0;
  
  // Reverse pass: the statements of each forward timestep are
  // differentiated in reverse order
  for (int j = nt-1; j >= 0; j--) {
    // Adjoint of the boundary conditions
    q_AD[NX-2] += q_AD[0];
    q_AD[0] = 0.0;
    q_AD[1] += q_AD[NX-1];
    q_AD[NX-1] = 0.0;

    // Adjoint of the update of the interior boxes
    for(int i = 1; i < NX-1; i++) {
      flux_AD[i-1] += q_AD[i];
      flux_AD[i] -= q_AD[i];
      //      q_AD[i] = 0.0;
    }
    // Step back to the time level from which the fluxes of this
    // timestep were computed in the forward pass
    q -= NX;
    for (int i = 0; i < NX-1; i++) {
      real factor = exp(c*log(q[i]/q[i+1]));
      real one_over_q_i = 1.0/q[i];
      real one_over_q_i_plus_one = 1.0/q[i+1];

      // Up to and including Adept 2.0.5 this was the incorrect line:
      //      real one_over_denominator = 1.0/(one_over_q_i+one_over_q_i_plus_one);
      // This is the corrected line:
      real one_over_denominator = 1.0/(one_over_q_i_plus_one-one_over_q_i);

      // Adjoint of the flux calculation: flux_AD[i] multiplied by the
      // derivative of flux[i] with respect to q[i] and q[i+1]
      q_AD[i] += one_over_denominator*one_over_q_i
	* (c*factor - (factor-1.0)*one_over_denominator*one_over_q_i)
	* flux_AD[i];
      q_AD[i+1] += one_over_denominator*one_over_q_i_plus_one
	* (- c*factor + (factor-1.0)*one_over_denominator*one_over_q_i_plus_one)
	* flux_AD[i];
      flux_AD[i] = 0.0;
    }
  }
  // Return the adjoint of the initial field
  for (int i = 0; i < NX; i++) {
    q_init_AD[i] = q_AD[i];
    q_AD[i] = 0.0;
  }

  delete[] q_save;
}

#endif


================================================
FILE: benchmark/advection_schemes_K.h
================================================
/* advection_schemes_K.h - Header for hand-coded Jacobians

  Copyright (C) 2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

#ifndef ADVECTION_SCHEMES_K_H
#define ADVECTION_SCHEMES_K_H

#include <cmath>
#include <iostream>

#include "nx.h"

// Lax-Wendroff scheme applied to linear advection, with a hand-coded
// Jacobian that is propagated forward alongside the field.
//
//   nt        Number of timesteps
//   c         Coefficient used in the flux calculation
//   q_init    Initial field (input)
//   q         Field after nt timesteps (output)
//   jacobian  Output of NX*NX values: element [i*NX+j] receives the
//             derivative of q[j] with respect to q_init[i]
template <typename real>
void lax_wendroff_K(int nt, real c, const real q_init[NX],
		    real q[NX], real jacobian[NX*NX]) {
  real flux[NX-1];                         // Fluxes between boxes
  real flux_K[NX-1][NX];                   // Flux Jacobian (dflux/dq_init)
  //  real (&q_K)[NX][NX] = *reinterpret_cast<real(*)[NX][NX]>(jacobian);
  real q_K[NX][NX];                        // Field Jacobian (dq/dq_init)
  // Derivatives of flux[i] with respect to q[i] and q[i+1]; constant
  // because the scheme is linear in q
  real coeff1 = 0.5*c*(1.0+c);
  real coeff2 = 0.5*c*(1.0-c);

  for (int i=0; i<NX; i++) {
    q[i] = q_init[i];                        // Initialize q 
    for (int k=0; k<NX; k++) {
      q_K[i][k] = 0.0;                       // Initialize Jacobian
    }
    q_K[i][i] = 1.0;                         // ...to the identity matrix
  }
  for (int j=0; j<nt; j++) {                 // Main loop in time
    // Fluxes and their Jacobian
    for (int i=0; i<NX-1; i++) {
      flux[i] = 0.5*c*(q[i]+q[i+1]+c*(q[i]-q[i+1]));
      for (int k=0; k<NX; k++) {
	flux_K[i][k] = coeff1*q_K[i][k] + coeff2*q_K[i+1][k];
      }
    }
    // Update the interior boxes and their Jacobian
    for (int i=1; i<NX-1; i++) {
      q[i] += flux[i-1]-flux[i];
      for (int k=0; k<NX; k++) {
	q_K[i][k] += flux_K[i-1][k]-flux_K[i][k];
      }
    }
    q[0] = q[NX-2]; q[NX-1] = q[1];          // Treat boundary conditions
    for (int k=0; k<NX; k++) {
      q_K[0][k] = q_K[NX-2][k];
      q_K[NX-1][k] = q_K[1][k];
    }
  }

  // Transpose the result
  for (int i = 0, index = 0; i < NX; i++) {
    for (int j = 0; j < NX; j++, index++) {
      jacobian[index] = q_K[j][i];
    }
  }

}


// Toon advection scheme applied to linear advection, with a
// hand-coded Jacobian that is propagated forward alongside the field.
//
//   nt        Number of timesteps
//   c         Coefficient used in the flux calculation
//   q_init    Initial field (input)
//   q         Field after nt timesteps (output)
//   jacobian  Output of NX*NX values: element [i*NX+j] receives the
//             derivative of q[j] with respect to q_init[i]
template <typename real>
void toon_K(int nt, real c, const real q_init[NX], real q[NX],
	    real jacobian[NX*NX]) {
  real flux[NX-1];                        // Fluxes between boxes
  real flux_K[NX-1][NX];                  // Flux Jacobian (dflux/dq_init)
  real q_K[NX][NX];                       // Field Jacobian (dq/dq_init)

  for (int i=0; i<NX; i++) {
    q[i] = q_init[i]; // Initialize q
    for (int k=0; k<NX; k++) {
      q_K[i][k] = 0.0;                       // Initialize Jacobian
    }
    q_K[i][i] = 1.0;                         // ...to the identity matrix
  }
  for (int j=0; j<nt; j++) {                 // Main loop in time
    for (int i=0; i<NX-1; i++) {
      // Derivatives of flux[i] with respect to q[i] and q[i+1]
      real coeff1, coeff2;
      // Ought to check if the difference between adjacent points is
      // not too small or we end up with close to 0/0, but this leads
      // to different results from the automatic differentiation
      //      if (fabs(q[i]-q[i+1]) > q[i]*1.0e-6) {
	real factor = exp(c*log(q[i]/q[i+1]));
	real one_over_denominator = 1.0/(q[i]-q[i+1]);
	coeff1 = one_over_denominator*q[i+1]
	  * (c*factor + (factor-1.0)*(1.0-q[i]*one_over_denominator));
	coeff2 = one_over_denominator*q[i]
	  * (- c*factor + (factor-1.0)*(1.0+q[i+1]*one_over_denominator));
	flux[i] = (factor-1.0) * q[i]*q[i+1]*one_over_denominator;
	/*
      }
      else {
	flux[i] = c*q[i]; // Upwind scheme
	coeff1 = c;
	coeff2 = 0.0;
      }
	*/
      // Chain rule: Jacobian of the flux with respect to q_init
      for (int k=0; k<NX; k++) {
	flux_K[i][k] = coeff1*q_K[i][k] + coeff2*q_K[i+1][k];
      }
    }

    // Update the interior boxes and their Jacobian
    for (int i=1; i<NX-1; i++) {
      q[i] += flux[i-1]-flux[i];
      for (int k=0; k<NX; k++) {
	q_K[i][k] += flux_K[i-1][k]-flux_K[i][k];
      }
    }
    q[0] = q[NX-2]; q[NX-1] = q[1];          // Treat boundary conditions
    for (int k=0; k<NX; k++) {
      q_K[0][k] = q_K[NX-2][k];
      q_K[NX-1][k] = q_K[1][k];
    }
  }

  // Transpose the result
  for (int i = 0, index = 0; i < NX; i++) {
    for (int j = 0; j < NX; j++, index++) {
      jacobian[index] = q_K[j][i];
    }
  }
}

#endif


================================================
FILE: benchmark/animate.cpp
================================================
/* animate.cpp - Visualize the advection

  Copyright (C) 2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

#include <string>
#include <iostream>
#include <time.h>

#include "advection_schemes.h"

// Animate the linear advection of a sine wave as ASCII art written to
// standard output.  Each frame draws the current field (one column
// per grid point, '#' above the axis and '$' below it), pauses
// briefly, and then advances the field by nt timesteps of the
// Lax-Wendroff scheme.  The command-line arguments are not used.
int
main(int argc, char** argv)
{
  // Two buffers: q1 always points to the field being displayed and q2
  // to the buffer that receives the next state
  double q1_save[NX];
  double q2_save[NX];
  double* q1 = q1_save;
  double* q2 = q2_save;
  double pi = 4.0*atan(1.0);

  // Vertical extent and resolution of the plot
  double min_q = -0.2;
  double max_q = 1.2;
  double dq = 0.05;

  // Coefficient and number of timesteps passed to the advection
  // scheme for each frame, and the factor that scales the total
  // number of frames
  double dt = 0.125;
  int nt = 8;
  int cycles = 5;

  // Row indices of the bottom and top of the plot
  int j_min = min_q/dq;
  int j_max = max_q/dq;

  std::string line;
  line.resize(NX);

  // Pause between frames: 20 ms
  timespec t;
  t.tv_sec = 0;
  t.tv_nsec = 20000000;

  // Initial condition: a sine wave offset to be slightly positive
  for (int i = 0; i < NX; i++) q1[i] = (0.5+0.5*sin((i*2.0*pi)/(NX-1.5)))+0.0001;
  for (int k = 0; k < cycles*NX/(nt*dt); k++) {
    std::cout << "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n";

    // Rows above the axis, from the top downwards
    for (int j = j_max; j > 0; j--) {
      double q_thresh = j*dq;
      for (int i = 0; i < NX; i++) {
	if (q1[i] > q_thresh) {
	  line[i] = '#';
	}
	else {
	  line[i] = ' ';
	}
      }
      std::cout << line << "\n";
    }
    // The axis itself
    for (int i = 0; i < NX; i++) {
      line[i] = '-';
    }
    std::cout << line << "\n";
    // Rows below the axis, for any negative values
    for (int j = -1; j > j_min; j--) {
      double q_thresh = j*dq;
      for (int i = 0; i < NX; i++) {
	if (q1[i] <= q_thresh) {
	  line[i] = '$';
	}
	else {
	  line[i] = ' ';
	}
      }
      std::cout << line << "\n";
      std::cout.flush();
    }
    nanosleep(&t, 0);
    //toon(nt, dt, q1, q2);
    lax_wendroff(nt, dt, q1, q2);
    // Swap the buffers so that q1 refers to the state just computed
    // and q2 to the buffer that may now be overwritten.  (Both
    // pointers must be exchanged: if they ended up referring to the
    // same buffer, the first computed step would be discarded and
    // every later call would be given the same array as both input
    // and output.)
    double* tmp = q1;
    q1 = q2;
    q2 = tmp;
  }
  return 0;
}


================================================
FILE: benchmark/autodiff_benchmark.cpp
================================================
/* autodiff_benchmark.cpp - Program to benchmark different automatic differentiation tools

  Copyright (C) 2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

#include <sstream>
#include <iostream>
#include <vector>
#include <cmath>
#include <valarray>

#include "differentiator.h"

#include <adept.h>
using adept::Real;

// Return the root-mean-square difference between the elements of two
// vectors; a differentiator_exception is thrown if their lengths
// differ
static
Real
rms(const std::vector<Real>& a, const std::vector<Real>&b)
{
  const size_t n = a.size();
  if (n != b.size()) {
    throw differentiator_exception("Attempt to compute RMS difference between vectors of different size");
  }
  Real sum_of_squares = 0.0;
  for (size_t k = 0; k < n; ++k) {
    const Real diff = a[k]-b[k];
    sum_of_squares += diff*diff;
  }
  return sqrt(sum_of_squares/n);
}

// Print the command-line help to standard output. argv0 is the name
// by which the program was invoked. Every option recognised by
// main() should be listed here.
static
void
usage(const char* argv0)
{
  std::cout << "Usage: " << argv0 << " [OPTIONS] where OPTIONS can be\n";
  std::cout << "  -h|--help          Print this message\n";
  std::cout << "  -v|--verbose       Report tools that were not compiled in\n";
  std::cout << "  -a|--algorithm  s  Use test algorithms specified by string s which may be\n";
  std::cout << "                     \"all\" or a comma separated list with possible entries\n";
  std::cout << "                     " << test_algorithms() << "\n";
  std::cout << "  -t|--tool       s  Use automatic differentiation tools specified by string\n";
  std::cout << "                     s which may be \"all\" or a comma separated list with\n";
  std::cout << "                     possible entries " << autodiff_tools() << "\n";    
  std::cout << "  -r|--repeat     n  Benchmark repeats the simulation n times\n";
  std::cout << "  -j|--jrepeat    n  Repeat the Jacobian simulation n times\n";
  std::cout << "  -n|--timesteps  n  Simulation uses n timesteps\n";
  std::cout << "  --print-result     Print the final output from the simulation(s)\n";
  std::cout << "  --print-adjoint    Print the hand-coded adjoint\n";
  std::cout << "  --print-jacobian   Print the hand-coded Jacobian matrix\n";
  std::cout << "  --no-openmp        Don't use OpenMP to speed up Adept\n";
  std::cout << "  --jacobian-forward Force use of forward-mode Jacobian\n";
  std::cout << "  --jacobian-reverse Force use of reverse-mode Jacobian\n";
  std::cout << "  --tolerance     x  Agreement with hand-coded requires RMS difference < x\n";
  std::cout << "  --verify-only      No benchmark: only verify correctness of results\n";
  std::cout << "Return code: 0 if all automatic differentiation tools produce adjoints and\n"
    "  Jacobians whose RMS difference with the values from hand-coded\n"
    "  differentiation is less than the required tolerance; 1 otherwise.\n";
}

int
main(int argc, char** argv)
{
  int nt = 2000;
  int nr = 100;
  int nr_jacobian = nr/10;
  Real dt = 0.125;
  Real tolerance = 1.0e-5;
  int force_jacobian = 0;

  bool verbose = false;
  bool print_result = false;
  bool print_adjoint = false;
  bool print_jacobian = false;
  bool no_openmp = false;
  bool verify_only = false;

  std::valarray<bool> use_tool(N_AUTODIFF_TOOLS);
  std::valarray<bool> use_algorithm(N_TEST_ALGORITHMS);

  use_tool = true;
  use_algorithm = true;

  int iarg = 1;

  while (iarg < argc) {
    if (std::string("-h") == argv[iarg]
	|| std::string("--help") == argv[iarg]) {
      usage(argv[0]);
      return 0;
    }
    if (std::string("-v") == argv[iarg]
	|| std::string("--verbose") == argv[iarg]) {
      verbose = true;
    }
    else if (std::string("--print-result") == argv[iarg]) {
      print_result = true;
    }
    else if (std::string("--print-adjoint") == argv[iarg]) {
      print_adjoint = true;
    }
    else if (std::string("--print-jacobian") == argv[iarg]) {
      print_jacobian = true;
    }
    else if (std::string("--jacobian-forward") == argv[iarg]) {
      force_jacobian = +1;
    }
    else if (std::string("--jacobian-reverse") == argv[iarg]) {
      force_jacobian = -1;
    }
    else if (std::string("--no-openmp") == argv[iarg]) {
      no_openmp = true;
    }
    else if (std::string("--verify-only") == argv[iarg]) {
      verify_only = true;
    }
    else if (std::string("-a") == argv[iarg]
	     || std::string("--algorithm") == argv[iarg]) {
      if (++iarg < argc) {
	if (std::string(argv[iarg]) != "all") {
	  use_algorithm = false;
	  std::istringstream ss(argv[iarg]);
	  std::string alg;
	  while (std::getline(ss, alg, ',')) {
	    bool found = false;
	    for (int i = 0; i < N_TEST_ALGORITHMS; i++) {
	      if (alg == test_algorithm_string[i]) {
		use_algorithm[i] = true;
		found = true;
		break;
	      }
	    }
	    if (!found) {
	      std::cout << "Test algorithm \""
			<< alg << "\" not available; available algorithms are "
			<< test_algorithms() << "\n";
	    }
	  }
	}
      }
      else {
	std::cout << "Arguments \"-a\" or \"--algorithm\" need to be followed by a string containing a comma-separated list of algorithms\n";
	return 1;
      }
    }
    else if (std::string("-t") == argv[iarg]
	     || std::string("--tool") == argv[iarg]) {
      if (++iarg < argc) {
	if (std::string(argv[iarg]) != "all") {
	  use_tool = false;
	  std::istringstream ss(argv[iarg]);
	  std::string tool;
	  while (std::getline(ss, tool, ',')) {
	    bool found = false;
	    for (int i = 0; i < N_AUTODIFF_TOOLS; i++) {
	      if (tool == autodiff_tool_string[i]) {
		use_tool[i] = true;
		found = true;
		break;
	      }
	    }
	    if (!found) {
	      std::cout << "Automatic differentiation tool \""
			<< tool << "\" not available; available tools are "
			<< autodiff_tools() << "\n";
	    }
	  }
	}
      }
      else {
	std::cout << "Arguments \"-a\" or \"--algorithm\" need to be followed by a string containing a comma-separated list of algorithms\n";
	return 1;
      }
    }
    else if (std::string("-r") == argv[iarg]
	     || std::string("--repeat") == argv[iarg]) {
      if (++iarg < argc) {
	std::stringstream ss(argv[iarg]);
	if (ss >> nr) {
	  if (nr <= 0) { 
	    std::cout << "Number of repeats must be greater than zero\n";
	    return 1;
	  }
	}
	else {
	  std::cout << "Failed to read \""
		    << argv[iarg]
		    << "\"as an integer\n";
	  return 1;
	}
      }
      else {
	throw differentiator_exception("Arguments \"-r\" or \"--repeat\" need to be followed by a number");
      }
    }
    else if (std::string("-j") == argv[iarg]
	     || std::string("--jrepeat") == argv[iarg]) {
      if (++iarg < argc) {
	std::stringstream ss(argv[iarg]);
	if (ss >> nr_jacobian) {
	  if (nr <= 0) { 
	    throw differentiator_exception("Number of repeats must be greater than zero");
	  }
	}
	else {
	  std::string msg = "Failed to read \"";
	  msg += argv[iarg];
	  msg += "\"as an integer";
	  throw differentiator_exception(msg.c_str());
	}
      }
      else {
	throw differentiator_exception("Arguments \"-j\" or \"--jrepeat\" need to be followed by a number");
      }
    }
    else if (std::string("-n") == argv[iarg]
	     || std::string("--timesteps") == argv[iarg]) {
      if (++iarg < argc) {
	std::stringstream ss(argv[iarg]);
	if (ss >> nt) {
	  if (nt < 0) { 
	    throw differentiator_exception("Number of timesteps must be greater than or equal to zero");
	  }
	}
	else {
	  std::string msg = "Failed to read \"";
	  msg += argv[iarg];
	  msg += "\"as an integer";
	  throw differentiator_exception(msg.c_str());
	}
      }
      else {
	throw differentiator_exception("Arguments \"-n\" or \"--timesteps\" need to be followed by a number");
      }
    }
    else if (std::string("--tolerance") == argv[iarg]) {
      if (++iarg < argc) {
	std::stringstream ss(argv[iarg]);
	if (ss >> tolerance) {
	  if (tolerance < 0) { 
	    throw differentiator_exception("Tolerance must be greater than or equal to zero");
	  }
	}
	else {
	  std::string msg = "Failed to read \"";
	  msg += argv[iarg];
	  msg += "\"as a Real";
	  throw differentiator_exception(msg.c_str());
	}
      }
      else {
	throw differentiator_exception("Arguments \"-j\" or \"--jrepeat\" need to be followed by a number");
      }
    }
    else {
      std::string msg = "Argument \"";
      msg += argv[iarg];
      msg += "\" not understood\n";
      std::cout << msg;
      usage(argv[0]);
      return 1;
    }
    iarg++;
  }

  Real pi = 4.0*atan(1.0);
  std::vector<Real> q_init(NX);
  std::vector<Real> q(NX);
  std::vector<Real> q_AD(NX);
  std::vector<Real> q_init_AD(NX);
  std::vector<Real> q_init_AD_reference(NX);
  std::vector<Real> jac(NX*NX);
  std::vector<Real> jac_reference(NX*NX);

  int nr_warm_up = nr/10;
  int nr_jacobian_warm_up = nr_jacobian/10;
  if (nr_warm_up < 1) {
    nr_warm_up = 1;
  }
  if (nr_jacobian_warm_up < 1) {
    nr_jacobian_warm_up = 1;
  }

  if (verify_only) {
    nr = 0;
    nr_jacobian = 0;
    nr_warm_up = 1;
    nr_jacobian_warm_up = 1;
  }

  for (int i = 0; i < NX; i++) q_init[i] = (0.5+0.5*sin((i*2.0*pi)/(NX-1.5)))+1;
  for (int i = 0; i < NX; i++) q_AD[i] = 0.1;

  bool verify_error = false;

  Timer timer;

  std::cout << "Automatic differentiation benchmark and verification\n";
  std::cout << "   Automatic differentiation tools = ";
  bool is_first = true;
  for (int i = 0; i < N_AUTODIFF_TOOLS; i++) {
    if (use_tool[i]) {
      if (!is_first) {
	std::cout << ", ";
      }
      else {
	is_first = false;
      }
      std::cout << autodiff_tool_long_string[i];
    }
  }
  std::cout << "\n";

  std::cout << "   Test algorithms = ";
  is_first = true;
  for (int i = 0; i < N_TEST_ALGORITHMS; i++) {
    if (use_algorithm[i]) {
      if (!is_first) {
	std::cout << ", ";
      }
      else {
	is_first = false;
      }
      std::cout << test_algorithm_long_string[i];
    }
  }
  std::cout << "\n";

  std::cout << "   Number of x points = " << NX << "\n";
  std::cout << "   Number of timesteps = " << nt << ", Courant number = " << dt << "\n";
  if (!verify_only) {
    std::cout << "   Algorithm repeats = " << nr << ", warm-up repeats = " << nr_warm_up << "\n";
    std::cout << "   Jacobian repeats = " << nr_jacobian << ", warm-up repeats = " << nr_jacobian_warm_up << "\n";
  }
  else {
    std::cout << "   Verifying results only: no repeats\n";
  }

  std::cout << adept::configuration();

  // Loop through test algorithms
  for (int ialg = 0; ialg < N_TEST_ALGORITHMS; ialg++) {
    if (use_algorithm[ialg]) {

      std::string algorithm_string = test_algorithm_long_string[ialg];
      std::cout << "\nRunning test algorithm \"" << algorithm_string << "\":\n";
      
      TestAlgorithm ta = static_cast<TestAlgorithm>(ialg);
      
      std::cout << "   Hand coded (forward-mode Jacobian only)\n";
      
      HandCodedDifferentiator hand_coded_differentiator(timer, algorithm_string);
      hand_coded_differentiator.initialize(nt, dt);
      for (int i = 0; i < nr_warm_up; i++) {
	hand_coded_differentiator.func(ta, q_init, q);
	hand_coded_differentiator.adjoint(ta, q_init, q, q_AD, q_init_AD_reference);
	hand_coded_differentiator.jacobian(ta, q_init, q, jac_reference);
      }
      hand_coded_differentiator.reset_timings();
      for (int i = 0; i < nr; i++) {
	hand_coded_differentiator.func(ta, q_init, q);
	hand_coded_differentiator.adjoint(ta, q_init, q, q_AD, q_init_AD_reference);
	hand_coded_differentiator.jacobian(ta, q_init, q, jac_reference);
      }
      
      if (print_result) {
	std::cout << "      result = [" << q[0];
	for (int i = 1; i < NX; i++) {
	  std::cout << ", " << q[i];
	}
	std::cout << "]\n";
      }
      
      if (print_adjoint) {
	std::cout << "adjoint = [" << q_init_AD_reference[0];
	for (int i = 1; i < NX; i++) {
	  std::cout << ", " << q_init_AD_reference[i];
	}
	std::cout << "]\n";
      }
      if (print_jacobian) {
	Real (&q_K)[NX][NX]
	  = *reinterpret_cast<Real(*)[NX][NX]>(&jac_reference[0]);
	std::cout << "jacobian = [\n";
	for (int i = 0; i < NX; i++) {
	  std::cout << q_K[i][0];
	  for (int j = 1; j < NX; j++) {
	    std::cout << ", " << q_K[i][j];
	}
	  std::cout << "\n";
	}
	std::cout << "]\n";
      }
      
      Real base_time = timer.timing(hand_coded_differentiator.base_timer_id());
      
      if (!verify_only) {
	std::cout << "      Time of original algorithm: " << base_time << " seconds\n";
	std::cout << "      Absolute time of adjoint: " 
		  << timer.timing(hand_coded_differentiator.adjoint_compute_timer_id())
		  << " s\n";
	std::cout << "      Relative time of adjoint: " 
		  << timer.timing(hand_coded_differentiator.adjoint_compute_timer_id())
	  / base_time << "\n";
	std::cout << "      Absolute time of Jacobian: " 
		  << timer.timing(hand_coded_differentiator.jacobian_timer_id())
		  << " s\n";
	std::cout << "      Relative time of Jacobian: " 
		  << timer.timing(hand_coded_differentiator.jacobian_timer_id())
	  / base_time << "\n";
      }
      
      for (int itool = 0; itool < N_AUTODIFF_TOOLS; itool++) {
	if (use_tool[itool]) {
	  Differentiator* differentiator
	    = new_differentiator(static_cast<AutoDiffTool>(itool),
				 timer, algorithm_string);
	  if (!differentiator) {
	    if (verbose) std::cout << "Automatic differentiation tool with code " << itool << " not available\n";
	    continue;
	  }
	  
	  differentiator->initialize(nt, dt);
	  if (no_openmp) {
	    differentiator->no_openmp();
	  }
	  
	  std::cout << "   " << differentiator->name() << "\n";
	  
	  if (test_algorithm_is_vector[ialg] && !differentiator->supports_vector_calls()) {
	    std::cout << "     ...vector calls not supported\n";
	    delete differentiator;
	    continue;
	  }

	  for (int i = 0; i < nr_warm_up; i++) {
	    differentiator->adjoint(ta, q_init, q, q_AD, q_init_AD);
	  }
	  Real rms_verify = rms(q_init_AD, q_init_AD_reference);
	  if (rms_verify > tolerance) {
	    std::cout << "      *** Adjoint RMS difference with hand-coded of " << rms_verify << " is greater than tolerance of " << tolerance << " ***\n";
	    if (print_adjoint) {
	      std::cout << "adjoint_auto = [" << q_init_AD[0];
	      for (int i = 1; i < NX; i++) {
		std::cout << ", " << q_init_AD[i];
	      }
	      std::cout << "]\n";
	    }

	    verify_error = true;
	  }
	  else {
	    std::cout << "      Adjoint RMS difference with hand-coded of " << rms_verify << " is within tolerance of " << tolerance << "\n";
	  }

	  for (int i = 0; i < nr_jacobian_warm_up; i++) {
	    differentiator->jacobian(ta, q_init, q, jac, force_jacobian);
	  }
	  rms_verify = rms(jac, jac_reference);
	  if (rms_verify > tolerance) {
	    std::cout << "      *** Jacobian RMS difference with hand-coded of " << rms_verify << " is greater than tolerance of " << tolerance << " ***\n";
	    verify_error = true;
	  }
	  else {
	    std::cout << "      Jacobian RMS difference with hand-coded of " << rms_verify << " is within tolerance of " << tolerance << "\n";
	  }
	  
	  
	  if (!verify_only) {
	    differentiator->reset_timings();
	    for (int i = 0; i < nr; i++) {
	      differentiator->adjoint(ta, q_init, q, q_AD, q_init_AD);
	    }

	    Real relative_record_time = timer.timing(differentiator->base_timer_id())
	      / base_time;
	    Real relative_adjoint_time
	      = timer.timing(differentiator->adjoint_compute_timer_id())
	      / base_time;
	    Real relative_adjoint_prep_time
	      = timer.timing(differentiator->adjoint_prep_timer_id())
	      / base_time;

	    std::cout << "      Absolute time of adjoint: "
		      << timer.timing(differentiator->base_timer_id())
	      + timer.timing(differentiator->adjoint_compute_timer_id())
	      + timer.timing(differentiator->adjoint_prep_timer_id())
		      << " s (" 
		      << timer.timing(differentiator->base_timer_id())
		      << " s + ";
	    if (relative_adjoint_prep_time > 0.0) {
	      std::cout << timer.timing(differentiator->adjoint_prep_timer_id()) 
			<< " s + ";
	    }
	    std::cout <<  timer.timing(differentiator->adjoint_compute_timer_id())
		      << " s)\n";
	    std::cout << "      Relative time of adjoint: "
		      << relative_record_time + relative_adjoint_prep_time
	      + relative_adjoint_time
		      << " (" << relative_record_time << " + ";
	    if (relative_adjoint_prep_time > 0.0) {
	      std::cout << relative_adjoint_prep_time << " + ";
	    }
	    std::cout << relative_adjoint_time << ")\n";
	    differentiator->reset_timings();
	  }
	  
	  for (int i = 0; i < nr_jacobian; i++) {
	    differentiator->jacobian(ta, q_init, q, jac, force_jacobian);
	  }
	  
	  if (print_jacobian) {
	    Real (&q_K)[NX][NX]
	      = *reinterpret_cast<Real(*)[NX][NX]>(&jac[0]);
	    std::cout << "jacobian_auto = [\n";
	    for (int i = 0; i < NX; i++) {
	      std::cout << q_K[i][0];
	      for (int j = 1; j < NX; j++) {
		std::cout << ", " << q_K[i][j];
	      }
	      std::cout << "\n";
	    }
	    std::cout << "]\n";
	  }
	  
	  if (!verify_only) {
	    Real relative_record_time = (nr*timer.timing(differentiator->base_timer_id()))
	      /(nr_jacobian*base_time);
	    Real relative_jacobian_time = (nr*timer.timing(differentiator->jacobian_timer_id()))
	      /(nr_jacobian*base_time);
	    Real relative_adjoint_prep_time = (nr*timer.timing(differentiator->adjoint_prep_timer_id()))
	      /(nr_jacobian*base_time);
	    std::cout << "      Absolute time of Jacobian: "
		      << timer.timing(differentiator->base_timer_id())
	      + timer.timing(differentiator->adjoint_prep_timer_id())
	      + timer.timing(differentiator->jacobian_timer_id())
		      << " s ("
		      << timer.timing(differentiator->base_timer_id()) 
		      << " s + ";
	    if (relative_adjoint_prep_time > 0.0) {
	      std::cout << timer.timing(differentiator->adjoint_prep_timer_id())
			<< " s + ";
	    }
	    std::cout << timer.timing(differentiator->jacobian_timer_id())
		      << " s)\n";
	    std::cout << "      Relative time of Jacobian: "
		      << relative_record_time + relative_adjoint_prep_time + relative_jacobian_time
		      << " (" << relative_record_time << " + ";
	    if (relative_adjoint_prep_time > 0.0) {
	      std::cout << relative_adjoint_prep_time << " + ";
	    }
	    std::cout << relative_jacobian_time << ")\n";
	  }
	  differentiator->print();
	  delete differentiator;
	}
      }
    }
  }
  if (verify_error) {
    std::cout << "\nEXITING WITH ERROR CODE 1: ONE OR MORE OF THE AUTOMATIC DIFFERENTIATION\n"
	      << "TOOLS DID NOT REPRODUCE THE HAND-CODING RESULT\n";
    return 1;
  }
  else {
    std::cout << "\nAll tests were passed within tolerance\n";
    return 1;
  }
}


================================================
FILE: benchmark/differentiator.h
================================================
/* differentiator.h

  Copyright (C) 2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <iostream>
#include <vector>
#include <exception>
#include <cmath>
#include <string>

#include "Timer.h"

#include "adept.h"

using adept::Real;

#ifdef HAVE_ADOLC
// Note that ADOL-C places the "adouble" type in the global namespace
#include "adolc/adolc.h"
#endif

#ifdef HAVE_CPPAD
#include "cppad/cppad.hpp"
#endif

#ifdef HAVE_SACADO
#include "Sacado.hpp"
#endif

#include "advection_schemes.h"
#include "advection_schemes_AD.h"
#include "advection_schemes_K.h"


// Identifiers for the advection algorithms that can be benchmarked.
// The numeric values are used as indices into the test_algorithm_*
// arrays below, so the enumerators and the array entries must be kept
// in the same order. N_TEST_ALGORITHMS is not an algorithm: it is the
// number of enumerators preceding it.
enum TestAlgorithm {
  TEST_ALGORITHM_LAX_WENDROFF = 0,
  TEST_ALGORITHM_TOON = 1,
  TEST_ALGORITHM_LAX_WENDROFF_VECTOR = 2,
  TEST_ALGORITHM_TOON_VECTOR = 3,
  N_TEST_ALGORITHMS
};

// Human-readable name of each algorithm, indexed by TestAlgorithm
const char* test_algorithm_long_string[] = {"Lax-Wendroff", "Toon et al.",
					    "Lax-Wendroff vector", "Toon et al. vector"};
// Short name of each algorithm, indexed by TestAlgorithm
const char* test_algorithm_string[] = {"lw","toon","lw_vector", "toon_vector"};

// Whether each algorithm, indexed by TestAlgorithm, is a "vector"
// variant
const bool test_algorithm_is_vector[] = {false, false, true, true};

// Return the short names of all the test algorithms joined into a
// single comma-separated string (no spaces)
inline
std::string
test_algorithms()
{
  std::string joined;
  for (int ialg = 0; ialg < N_TEST_ALGORITHMS; ++ialg) {
    if (ialg > 0) {
      joined += ",";
    }
    joined += test_algorithm_string[ialg];
  }
  return joined;
}


// Exception type thrown by the differentiator classes and by the
// benchmark driver. The message is copied into the exception object,
// so it is safe to construct one from a pointer to short-lived
// storage (for example the result of c_str() on a local
// std::string). A null message is replaced by the default text.
class differentiator_exception : public std::exception {
public:
  differentiator_exception(const char* message = "An error occurred in differentiator.h")
    : message_(message ? message : "An error occurred in differentiator.h") { }
  // Declared explicitly so that the exception specification matches
  // that of std::exception's destructor
  virtual ~differentiator_exception() throw() { }
  // The returned pointer remains valid for the lifetime of this
  // exception object
  virtual const char* what() const throw()
  { return message_.c_str(); }
protected:
  std::string message_;
};

// Base class from which specialist differentiators (hand-coded,
// Adept, ADOL-C etc) inherit
class Differentiator {
public:
  Differentiator(Timer& timer) 
    : timer_(timer) {
    initialize(2000, 0.125); 
  }

  virtual ~Differentiator() { }

  virtual void print() { }

  void initialize(int nt, Real c) {
    nt_ = nt;
    c_ = c;
  }

  virtual bool supports_vector_calls() { return false; }
  
  // Call the function to be differentiated, with the active type
  // provided as a template argument
  template <class ActiveRealType>
  void func(TestAlgorithm test_algorithm,
	    const std::vector<ActiveRealType>& x,
	    std::vector<ActiveRealType>& y) {
    timer_.start(base_timer_id_);
    if (test_algorithm == TEST_ALGORITHM_LAX_WENDROFF) {
      lax_wendroff(nt_, c_, &x[0], &y[0]);
    }
    else if (test_algorithm == TEST_ALGORITHM_TOON) {
      toon(nt_, c_, &x[0], &y[0]);
    }
    timer_.stop();
  }

  virtual bool adjoint(TestAlgorithm test_algorithm,
		       const std::vector<Real>& x,
		       std::vector<Real>& y,
		       const std::vector<Real>& y_AD,
		       std::vector<Real>& x_AD) {
    return false;
  }

  virtual bool jacobian(TestAlgorithm test_algorithm,
			const std::vector<Real>& x,
			std::vector<Real>& y,
			std::vector<Real>& jac,
			int force_jacobian = 0) {
    return false;
  }

  void reset_timings() {
    timer_.reset(base_timer_id_);
    timer_.reset(adjoint_prep_timer_id_);
    timer_.reset(adjoint_compute_timer_id_);
    timer_.reset(jacobian_timer_id_);
  }

  virtual std::string name() const = 0; //{ return "GENERIC"; }

  virtual void no_openmp() { }

  int base_timer_id() const { return base_timer_id_; }
  int adjoint_prep_timer_id() const { return adjoint_prep_timer_id_; }
  int adjoint_compute_timer_id() const { return adjoint_compute_timer_id_; }
  int jacobian_timer_id() const { return jacobian_timer_id_; }

protected:
  void init_timer(const std::string name_) {
    base_timer_id_ = timer_.new_activity(name() + " | " + name_ + " | record");
    adjoint_prep_timer_id_ = timer_.new_activity(name() + " | " + name_ + " | adjoint prep");
    adjoint_compute_timer_id_ = timer_.new_activity(name() + " | " + name_ + " | adjoint compute");
    jacobian_timer_id_ = timer_.new_activity(name() + " | " + name_ + " | Jacobian");
  }

protected:
  Timer& timer_;
  int nt_; // Number of timesteps to run
  Real c_;  // Courant number
  int base_timer_id_;
  int adjoint_prep_timer_id_;
  int adjoint_compute_timer_id_;
  int jacobian_timer_id_;
};

// ================= HAND CODED ===========================
#include "advection_schemes_AD.h"

class HandCodedDifferentiator
  : public Differentiator {
public:
  HandCodedDifferentiator(Timer& timer, const std::string& name_)
    : Differentiator(timer) {
    init_timer(name_);
  }

  virtual bool supports_vector_calls() { return true; }
  
  virtual bool adjoint(TestAlgorithm test_algorithm,
		       const std::vector<Real>& x,
		       std::vector<Real>& y,
		       const std::vector<Real>& y_AD,
		       std::vector<Real>& x_AD) {
    if (test_algorithm == TEST_ALGORITHM_LAX_WENDROFF) {
      timer_.start(adjoint_compute_timer_id_);
      lax_wendroff_AD(nt_, c_, &x[0], &y[0], &y_AD[0], &x_AD[0]);
      timer_.stop();
    }
    else if (test_algorithm == TEST_ALGORITHM_TOON) {
      timer_.start(adjoint_compute_timer_id_);
      toon_AD(nt_, c_, &x[0], &y[0], &y_AD[0], &x_AD[0]);
      timer_.stop();
    }
    else if (test_algorithm == TEST_ALGORITHM_LAX_WENDROFF_VECTOR) {
      timer_.start(adjoint_compute_timer_id_);
      lax_wendroff_AD(nt_, c_, &x[0], &y[0], &y_AD[0], &x_AD[0]);
      timer_.stop();
    }
    else if (test_algorithm == TEST_ALGORITHM_TOON_VECTOR) {
      timer_.start(adjoint_compute_timer_id_);
      toon_AD(nt_, c_, &x[0], &y[0], &y_AD[0], &x_AD[0]);
      timer_.stop();
    }
    else {
      std::cerr << "Algorithm not found: " << test_algorithm << "\n";
      return false;
    }
    return true;
  }

  virtual bool jacobian(TestAlgorithm test_algorithm,
			const std::vector<Real>& x,
			std::vector<Real>& y,
			std::vector<Real>& jac,
			int force_jacobian = 0) {
    jac.resize(NX*NX);
    if (test_algorithm == TEST_ALGORITHM_LAX_WENDROFF) {
      timer_.start(jacobian_timer_id_);
      lax_wendroff_K(nt_, c_, &x[0], &y[0], &jac[0]);
      timer_.stop();
    }
    else if (test_algorithm == TEST_ALGORITHM_TOON) {
      timer_.start(jacobian_timer_id_);
      toon_K(nt_, c_, &x[0], &y[0], &jac[0]);
      timer_.stop();
    }
    else if (test_algorithm == TEST_ALGORITHM_LAX_WENDROFF_VECTOR) {
      timer_.start(jacobian_timer_id_);
      lax_wendroff_K(nt_, c_, &x[0], &y[0], &jac[0]);
      timer_.stop();
    }
    else if (test_algorithm == TEST_ALGORITHM_TOON_VECTOR) {
      timer_.start(jacobian_timer_id_);
      toon_K(nt_, c_, &x[0], &y[0], &jac[0]);
      timer_.stop();
    }
    else {
      std::cerr << "Algorithm not found: " << test_algorithm << "\n";
      return false;
    }
    return true;
  }

  virtual std::string name() const { return "Hand coded"; }
};


// ================= ADEPT ================================ 

class AdeptDifferentiator
  : public Differentiator {
public:
  AdeptDifferentiator(Timer& timer, const std::string& name_)
    : Differentiator(timer) { init_timer(name_); }

  virtual ~AdeptDifferentiator() { }

  virtual bool supports_vector_calls() { return true; }
  
  // Need to overload the function in the base class, because only
  // Adept supports the _VECTOR versions of the algorithms
  template <class ActiveRealType>
  void func(TestAlgorithm test_algorithm,
	    const std::vector<ActiveRealType>& x,
	    std::vector<ActiveRealType>& y) {
    timer_.start(base_timer_id_);
    if (test_algorithm == TEST_ALGORITHM_LAX_WENDROFF) {
      lax_wendroff(nt_, c_, &x[0], &y[0]);
    }
    else if (test_algorithm == TEST_ALGORITHM_TOON) {
      toon(nt_, c_, &x[0], &y[0]);
    }
    else if (test_algorithm == TEST_ALGORITHM_LAX_WENDROFF_VECTOR) {
      lax_wendroff_vector(nt_, c_, &x[0], &y[0]);
    }
    else if (test_algorithm == TEST_ALGORITHM_TOON_VECTOR) {
      toon_vector(nt_, c_, &x[0], &y[0]);
    }
    timer_.stop();
  }

  virtual bool adjoint(TestAlgorithm test_algorithm,
		       const std::vector<Real>& x,
		       std::vector<Real>& y,
		       const std::vector<Real>& y_AD,
		       std::vector<Real>& x_AD) {
    if (x.size() != NX || y_AD.size() != NX) {
      throw differentiator_exception("One of input vectors not of size NX in call to AdeptDifferentiator::adjoint");
    }
    y.resize(NX);
    x_AD.resize(NX);

    std::vector<adept::aReal> q_init(NX);
    std::vector<adept::aReal> q(NX);

    adept::set_values(&q_init[0], NX, &x[0]);

    stack_.new_recording();
    func(test_algorithm, q_init, q);

    timer_.start(adjoint_compute_timer_id_);

    adept::set_gradients(&q[0], NX, &y_AD[0]);
    stack_.compute_adjoint();
    adept::get_gradients(&q_init[0], NX, &x_AD[0]);

    timer_.stop();

    return true;
  }


  virtual bool jacobian(TestAlgorithm test_algorithm,
			const std::vector<Real>& x,
			std::vector<Real>& y,
			std::vector<Real>& jac,
			int force_jacobian = 0) {
    if (x.size() != NX) {
      throw differentiator_exception("Input vector x not of size NX in call to AdeptDifferentiator::jacobian");
    }
    y.resize(NX);
    jac.resize(NX*NX);

    std::vector<adept::aReal> q_init(NX);
    std::vector<adept::aReal> q(NX);

    adept::set_values(&q_init[0], NX, &x[0]);

    stack_.new_recording();
    func(test_algorithm, q_init, q);

    stack_.independent(&q_init[0], NX);
    stack_.dependent(&q[0], NX);

    timer_.start(jacobian_timer_id_);
    if (force_jacobian > 0) {
      stack_.jacobian_forward(&jac[0]);
    }
    else if (force_jacobian < 0) {
      stack_.jacobian_reverse(&jac[0]);
    }
    else {
      stack_.jacobian(&jac[0]);
    }
    timer_.stop();
    return true;
  }

  virtual std::string name() const {
    std::stringstream name_;
    name_ << "Adept";
    int nthread = stack_.max_jacobian_threads();
    if (nthread > 1) {
      name_ << " (Jacobian using up to " << nthread << " OpenMP threads)";
    }
    else {
      name_ << " (single threaded)";
    }
    return name_.str(); 
  }

  virtual void no_openmp() { 
    stack_.set_max_jacobian_threads(1);
  }

  virtual void print() {
    std::cout << "========== ADEPT STACK BEGIN ==========\n";
    std::cout << stack_;
    std::cout << "========== ADEPT STACK END ============\n";
  }

private:
  adept::Stack stack_;
};

 
#ifdef HAVE_ADOLC

// ================= ADOLC ================================ 

// Differentiator that records the algorithm with ADOL-C "adouble"
// variables and uses ADOL-C drivers to compute adjoints and
// Jacobians. Work arrays for the Jacobian are allocated on the first
// call to jacobian() and freed in the destructor.
// NOTE(review): the class owns raw pointers but declares no copy
// constructor or assignment operator, so copying an instance would
// lead to the arrays being freed twice.
class AdolcDifferentiator
  : public Differentiator {
public:
  AdolcDifferentiator(Timer& timer, const std::string& name_)
    : Differentiator(timer), jac(0), I(0), result(0) { init_timer(name_); }

  // Note that ADOL-C places the "adouble" type in the global namespace
  typedef adouble aReal;
  
  // Free whichever of the work arrays were allocated by jacobian()
  virtual ~AdolcDifferentiator() {
    if (I) {
      myfreeI2(NX, I);
    }
    if (jac) {
      myfree2(jac);
    }
    if (result) {
      myfree1(result);
    }
  }

  // Record the algorithm between trace_on() and trace_off(), then
  // call ADOL-C's reverse() with y_AD to obtain x_AD; only the
  // reverse() call is timed as "adjoint compute". Throws
  // differentiator_exception if x or y_AD does not have NX elements.
  virtual bool adjoint(TestAlgorithm test_algorithm,
		       const std::vector<Real>& x,
		       std::vector<Real>& y,
		       const std::vector<Real>& y_AD,
		       std::vector<Real>& x_AD) {
    if (x.size() != NX || y_AD.size() != NX) {
      throw differentiator_exception("One of input vectors not of size NX in call to AdolcDifferentiator::adjoint");
    }
    y.resize(NX);
    x_AD.resize(NX);

    std::vector<aReal> q_init(NX);
    std::vector<aReal> q(NX);

    // Record on tape 1; the second argument is presumably ADOL-C's
    // "keep" flag preparing for an immediate reverse sweep (confirm
    // against the ADOL-C manual)
    trace_on(1,1);

    // "<<=" is ADOL-C's way of assigning a value to an independent
    // variable
    for (int i = 0; i < NX; i++) {
      q_init[i] <<= x[i];
    }

    func(test_algorithm, q_init, q);

    // ">>=" marks a dependent variable and extracts its value
    for (int i = 0; i < NX; i++) {
      q[i] >>= y[i];
    }

    trace_off();

    timer_.start(adjoint_compute_timer_id_);

    // reverse() takes a non-const pointer, hence the const_cast;
    // presumably y_AD is only read (TODO confirm)
    reverse(1, NX, NX, 0, const_cast<Real*>(&y_AD[0]), &x_AD[0]);                                                

    timer_.stop();
    return true;
  }


  // Record the algorithm as in adjoint(), then compute the Jacobian
  // with ADOL-C into the internal array "jac" and copy it into the
  // flat vector "jac_" (resized to NX*NX elements). force_jacobian
  // < 0 selects zos_forward() followed by fov_reverse(), > 0 selects
  // fov_forward(), and 0 selects ADOL-C's jacobian() driver. Only
  // the driver calls are timed as the "Jacobian" activity. Throws
  // differentiator_exception if x does not have NX elements or if a
  // forward driver returns a negative code.
  virtual bool jacobian(TestAlgorithm test_algorithm,
			const std::vector<Real>& x,
			std::vector<Real>& y,
			std::vector<Real>& jac_,
			int force_jacobian = 0) {
    if (x.size() != NX) {
      throw differentiator_exception("Input vector x not of size NX in call to AdolcDifferentiator::jacobian");
    }
    y.resize(NX);
    jac_.resize(NX*NX);

    std::vector<aReal> q_init(NX);
    std::vector<aReal> q(NX);

    trace_on(1,1);

    for (int i = 0; i < NX; i++) {
      q_init[i] <<= x[i];
    }

    func(test_algorithm, q_init, q);

    for (int i = 0; i < NX; i++) {
      q[i] >>= y[i];
    }

    trace_off();

    // Allocate the work arrays once and reuse them in later calls;
    // I is presumably an NX-by-NX identity matrix used as the seed
    // (confirm against ADOL-C's myallocI2)
    if (!jac) {
      jac = myalloc2(NX,NX);
      I = myallocI2(NX);
      result = myalloc1(NX);
    }

    timer_.start(jacobian_timer_id_);

    if (force_jacobian < 0) {
      // Reverse mode: a forward sweep followed by a vector reverse
      // sweep
      int rc = zos_forward(1, NX, NX, 1, &x[0], result);
      if (rc < 0) {
	throw differentiator_exception("Error occurred ADOL-C's zos_forward()");
      }
      // NOTE(review): MINDEC is an ADOL-C macro that presumably folds
      // the return code of fov_reverse() into rc; rc is not checked
      // afterwards
      MINDEC(rc,fov_reverse(1, NX, NX, NX, I, jac));
    }
    else if (force_jacobian > 0) {
      // Forward mode: a vector forward sweep
      int rc = fov_forward(1, NX, NX, NX, &x[0], I, result, jac);
      if (rc < 0) {
	throw differentiator_exception("Error occurred ADOL-C's fov_forward()");
      }
    }
    else {
      // "::" selects ADOL-C's global jacobian() driver rather than
      // this member function
      ::jacobian(1, NX, NX, &x[0], jac);
    }

    timer_.stop();

    // Copy jac into the flat output so that element jac[i][j] lands
    // at position j*NX+i
    for (int j=0, index=0; j < NX; j++) {
      for (int i=0; i < NX; i++, index++) {
	jac_[index] = jac[i][j];
      }
    }
    return true;
  }

  virtual std::string name() const { return "ADOL-C"; }

private:
  Real** jac;   // NX-by-NX array receiving the Jacobian from ADOL-C
  Real** I;     // Seed matrix passed to fov_forward()/fov_reverse()
  Real* result; // Function values returned by the forward drivers
};

#endif // HAVE_ADOLC


#ifdef HAVE_CPPAD

// ================= CPPAD ================================ 

// Differentiator that uses the CppAD library: the algorithm is run on
// CppAD::AD<Real> variables after a call to CppAD::Independent, a
// CppAD::ADFun object is constructed from the inputs and outputs, and
// adjoints or Jacobians are then requested from that object.
class CppadDifferentiator
  : public Differentiator {
public:
  typedef CppAD::AD<Real> aReal;

  // Set up the timer activities via init_timer(name_) and call
  // CppAD::thread_alloc::hold_memory(true)
  CppadDifferentiator(Timer& timer, const std::string& name_)
    : Differentiator(timer) {
    init_timer(name_); 
    CppAD::thread_alloc::hold_memory(true);
  }
    
  virtual ~CppadDifferentiator() { }
  
  // Run the algorithm and compute the adjoint.
  //
  //   test_algorithm  algorithm selector passed through to func()
  //   x               input vector; must contain exactly NX elements
  //   y               resized to NX and filled with the output values
  //   y_AD            weights passed to ADFun::Reverse; must contain
  //                   exactly NX elements
  //   x_AD            receives the result of f.Reverse(1, y_AD)
  //
  // Returns true; throws differentiator_exception if x or y_AD does
  // not have NX elements.
  virtual bool adjoint(TestAlgorithm test_algorithm,
                       const std::vector<Real>& x,
                       std::vector<Real>& y,
                       const std::vector<Real>& y_AD,
                       std::vector<Real>& x_AD) {
    if (x.size() != NX || y_AD.size() != NX) {
      throw differentiator_exception("One of input vectors not of size NX in call to CppadDifferentiator::adjoint");
    }
    y.resize(NX);
    x_AD.resize(NX);

    std::vector<aReal> q_init(NX);
    std::vector<aReal> q(NX);

    for (int i = 0; i < NX; i++) {
      q_init[i] = x[i];
    }

    CppAD::Independent(q_init);

    func(test_algorithm, q_init, q);

    for (int i = 0; i < NX; i++) {
      y[i] = CppAD::Value(q[i]);
    }

    // Construction of the ADFun object is timed separately from the
    // reverse sweep
    timer_.start(adjoint_prep_timer_id_);
    CppAD::ADFun<Real> f(q_init, q);

    timer_.start(adjoint_compute_timer_id_);
    x_AD = f.Reverse(1, y_AD);
    timer_.stop();

    return true;
  }

  // Run the algorithm and compute its NX-by-NX Jacobian matrix.
  //
  //   test_algorithm  algorithm selector passed through to func()
  //   x               input vector; must contain exactly NX elements
  //   y               resized to NX and filled with the output values
  //   jac             resized to NX*NX and filled with the transpose
  //                   of the matrix returned by CppAD
  //   force_jacobian  <0: CppAD::JacobianRev; >0: CppAD::JacobianFor;
  //                    0: ADFun::Jacobian
  //
  // Returns true; throws differentiator_exception if x does not have
  // NX elements.
  virtual bool jacobian(TestAlgorithm test_algorithm,
                        const std::vector<Real>& x,
                        std::vector<Real>& y,
                        std::vector<Real>& jac,
                        int force_jacobian = 0) {
    if (x.size() != NX) {
      throw differentiator_exception("Input vector x not of size NX in call to CppadDifferentiator::jacobian");
    }
    y.resize(NX);
    jac.resize(NX*NX);
    jac_transpose_.resize(NX*NX);

    std::vector<aReal> q_init(NX);
    std::vector<aReal> q(NX);

    for (int i = 0; i < NX; i++) {
      q_init[i] = x[i];
    }

    CppAD::Independent(q_init);

    func(test_algorithm, q_init, q);

    for (int i = 0; i < NX; i++) {
      y[i] = CppAD::Value(q[i]);
    }

    timer_.start(adjoint_prep_timer_id_);
    CppAD::ADFun<Real> f(q_init, q);

    timer_.start(jacobian_timer_id_);

    if (force_jacobian < 0) {
      CppAD::JacobianRev(f, x, jac_transpose_);
    }
    else if (force_jacobian > 0) {
      CppAD::JacobianFor(f, x, jac_transpose_);
    } 
    else {
      jac_transpose_ = f.Jacobian(x);
    }

    // Stop the clock as soon as CppAD has finished, as adjoint() and
    // the ADOL-C differentiator do, so that the transposition below
    // and any work done by the caller are not attributed to the
    // Jacobian computation
    timer_.stop();

    // Transpose Jacobian because CppAD uses the opposite convention
    // to the other tools; plain index arithmetic is used on the flat
    // vector rather than casting its storage to a 2D array type
    for (int i = 0, index = 0; i < NX; i++) {
      for (int j = 0; j < NX; j++, index++) {
        jac[index] = jac_transpose_[j*NX + i];
      }
    }

    return true;
  }

  // Name of the automatic differentiation tool wrapped by this class
  virtual std::string name() const { return "CppAD"; }

private:
  // Receives the Jacobian as laid out by CppAD, before transposition
  // into the caller's vector; kept as a member so that its storage
  // persists between calls to jacobian()
  std::vector<Real> jac_transpose_;
};

#endif // HAVE_CPPAD


#ifdef HAVE_SACADO

// ================= SACADO ================================ 

template<> int Sacado::Rad::ADmemblock<Real>::n_blocks = 0;

class SacadoDifferentiator
  : public Differentiator {
public:
  typedef Sacado::Rad::ADvar<Real> aReal;
  typedef Sacado::ELRFad::DFad<Real> aReal_fad;

  SacadoDifferentiator(Timer& timer, const std::string& name_)
    : Differentiator(timer) { init_timer(name_); }
    
  virtual ~SacadoDifferentiator() { }
  
  virtual bool adjoint(TestAlgorithm test_algorithm,
		       const std::vector<Real>& x,
		       std::vector<Real>& y,
		       const std::vector<Real>& y_AD,
		       std::vector<Real>& x_AD) {
    if (x.size() != NX || y_AD.size() != NX) {
      throw differentiator_exception("One of input vectors not of size NX in call to SacadoDifferentiator::adjoint");
    }
    y.resize(NX);
    x_AD.resize(NX);

    std::vector<aReal> q_init(NX);
    std::vector<aReal> q(NX);

    for (int i = 0; i < NX; i++) {
      q_init[i] = x[i];
    }

    func(test_algorithm, q_init, q);

    for (int i = 0; i < NX; i++) {
      y[i] = q[i].val();
    }

    timer_.start(base_timer_id_);
    aReal objective_func = 0.0;
    for (int i = 0; i < NX; i++) {
      objective_func += q[i] * y_AD[i];
    }

    timer_.start(adjoint_compute_timer_id_);
    Sacado::Rad::ADvar<Real>::Gradcomp();
    for (int i = 0; i < NX; i++) { 
      x_AD[i] = q_init[i].adj();
    }
    timer_.stop();

    return true;
  }  


  virtual bool jacobian(TestAlgorithm test_algorithm,
			const std::vector<Real>& x,
			std::vector<Real>& y,
			std::vector<Real>& jac,
			int force_jacobian = 0) {
    if (x.size() != NX) {
      throw differentiator_exception("Input vector x not of size NX in call to SacadoDifferentiator::jacobian");
    }
    y.resize(NX);
    jac.resize(NX*NX);

    std::vector<aReal_fad> q_init(NX);
    std::vector<aReal_fad> q(NX);

    for (int i = 0; i < NX; i++) {
      q_init[i] = x[i];
      q_init[i].resize(NX);
      q[i].resize(NX);
      q_init[i].fastAccessDx(i) = 1.0;
    }

    func(test_algorithm, q_init, q);

    for (int i = 0; i < NX; i++) {
      y[i] = q[i].val();
    }
            
    int index = 0;
    for (int i = 0; i < NX; i++) { 
      for (int k = 0; k < NX; k++, index++) {
	jac[index] = q[k].dx(i);
      }
    }
    return true;
  }

  virtual std::string name() const { return "Sacado (::Rad for adjoint, forward-mode only ::ELRFad for Jacobian)"; }
};

#endif // HAVE_SACADO


// The following enum is designed to be used in a "for" loop to loop
// through the available automatic differentiation tools. Adept is
// always present; each other tool is present only if the
// corresponding HAVE_* preprocessor symbol is defined.
// N_AUTODIFF_TOOLS comes last and so equals the number of tools.
enum AutoDiffTool {
  AUTODIFF_TOOL_ADEPT = 0
#ifdef HAVE_ADOLC
  , AUTODIFF_TOOL_ADOLC
#endif
#ifdef HAVE_CPPAD
  , AUTODIFF_TOOL_CPPAD
#endif
#ifdef HAVE_SACADO
  , AUTODIFF_TOOL_SACADO
#endif
  , N_AUTODIFF_TOOLS
};

// Short lower-case name of each tool, in the same order and guarded
// by the same preprocessor symbols as the entries of the
// AutoDiffTool enum, so that an AutoDiffTool value can be used as
// an index
const char* autodiff_tool_string[] = {
  "adept"
#ifdef HAVE_ADOLC
  , "adolc"
#endif
#ifdef HAVE_CPPAD
  , "cppad"
#endif
#ifdef HAVE_SACADO
  , "sacado"
#endif
};

// Properly capitalized name of each tool, in the same order and
// guarded by the same preprocessor symbols as the entries of the
// AutoDiffTool enum, so that an AutoDiffTool value can be used as
// an index
const char* autodiff_tool_long_string[] = {
  "Adept"
#ifdef HAVE_ADOLC
  , "ADOL-C"
#endif
#ifdef HAVE_CPPAD
  , "CppAD"
#endif
#ifdef HAVE_SACADO
  , "Sacado"
#endif
};

// Return the short names of all available automatic differentiation
// tools as a single comma-separated string with no spaces
inline
std::string
autodiff_tools()
{
  std::string name_list;
  for (int itool = 0; itool < N_AUTODIFF_TOOLS; ++itool) {
    // A separator precedes every name except the first
    if (itool > 0) {
      name_list += ",";
    }
    name_list += autodiff_tool_string[itool];
  }
  return name_list;
}


// Allocate with "new" the differentiator corresponding to
// auto_diff_tool, passing timer and name_ to its constructor, and
// return it as a pointer to the Differentiator base class.  A null
// pointer is returned if auto_diff_tool does not correspond to a
// tool that was compiled in.
inline
Differentiator* 
new_differentiator(AutoDiffTool auto_diff_tool, Timer& timer, const std::string& name_)
{
  switch (auto_diff_tool) {
  case AUTODIFF_TOOL_ADEPT:
    return new AdeptDifferentiator(timer, name_);
#ifdef HAVE_ADOLC
  case AUTODIFF_TOOL_ADOLC:
    return new AdolcDifferentiator(timer, name_);
#endif
#ifdef HAVE_CPPAD
  case AUTODIFF_TOOL_CPPAD:
    return new CppadDifferentiator(timer, name_);
#endif
#ifdef HAVE_SACADO
  case AUTODIFF_TOOL_SACADO:
    return new SacadoDifferentiator(timer, name_);
#endif
  default:
    return 0;
  }
}


================================================
FILE: benchmark/math_benchmark.cpp
================================================
/* math_benchmark.cpp - Benchmark mathematical functions

  Copyright (C) 2023 ECMWF

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.

*/

#include <iostream>
#include <adept_arrays.h>
#include "Timer.h"

// Time element-wise arithmetic operations and mathematical functions
// applied repeatedly to Adept vectors of length N, then print the
// cost of division, exp, fastexp, log and sin relative to
// multiplication.  The command-line arguments are not used.
int main(int argc, const char** argv)
{
  using namespace adept;
  static const int N = 1024;
  int nrepeat = 1024*16;
  Vector x(N), y(N);

  Timer timer;
  timer.print_on_exit(true);
  int add_id = timer.new_activity("addition");
  int sub_id = timer.new_activity("subtraction");
  int mul_id = timer.new_activity("multiplication");
  int div_id = timer.new_activity("division");
  int exp_id = timer.new_activity("exp");
  int fastexp_id = timer.new_activity("fastexp");
  int log_id = timer.new_activity("log");
  int sin_id = timer.new_activity("sin");

  // Untimed operation on both vectors before the first timed loop
  x = 1.001;
  y = x*x;
  y = 0.0;
  
  timer.start(add_id);
  for (int irepeat = 0; irepeat < nrepeat; ++irepeat) {
    y += x;
  }
  timer.stop();

  y = 0.0;
  
  timer.start(sub_id);
  for (int irepeat = 0; irepeat < nrepeat; ++irepeat) {
    y -= x;
  }
  timer.stop();

  y = 1.0;
  
  timer.start(mul_id);
  for (int irepeat = 0; irepeat < nrepeat; ++irepeat) {
    y *= x;
  }
  timer.stop();

  std::cout << "y=" << y(0) << "\n";
  
  // Division starts from the result of the multiplication loop
  timer.start(div_id);
  for (int irepeat = 0; irepeat < nrepeat; ++irepeat) {
    y /= x;
  }
  timer.stop();

  x = 0.001;
  
  // In the function loops the result is fed back into the argument
  // so that each evaluation depends on the previous one
  timer.start(exp_id);
  for (int irepeat = 0; irepeat < nrepeat; ++irepeat) {
    y = exp(x);
    x = y-1.001;
  }
  timer.stop();

  std::cout << "y=" << y(0) << "\n";
  
  x = 0.001;

 
  timer.start(fastexp_id);
  for (int irepeat = 0; irepeat < nrepeat; ++irepeat) {
    y = fastexp(x);
    x = y-1.001;
  }
  timer.stop();

  std::cout << "y=" << y(0) << "\n";
  
  x = 1.001;
  
  timer.start(log_id);
  for (int irepeat = 0; irepeat < nrepeat; ++irepeat) {
    y = log(x);
    x = y+1.0;
  }
  timer.stop();

  std::cout << "y=" << y(0) << "\n";
  
  x = 1.001;
  
  timer.start(sin_id);
  for (int irepeat = 0; irepeat < nrepeat; ++irepeat) {
    y = sin(x);
    // Feed the result back as in the other loops; this previously
    // read "y = x", which overwrote the result of sin, leaving it
    // unused and making the value printed below independent of sin
    x = y;
  }
  timer.stop();

  std::cout << "y=" << y(0) << "\n";

  std::cout << "RELATIVE COSTS\n";
  std::cout << "div/mul = " << timer.timing(div_id)/timer.timing(mul_id) << "\n";
  std::cout << "exp/mul = " << timer.timing(exp_id)/timer.timing(mul_id) << "\n";
  std::cout << "fastexp/mul = " << timer.timing(fastexp_id)/timer.timing(mul_id) << "\n";
  std::cout << "log/mul = " << timer.timing(log_id)/timer.timing(mul_id) << "\n";
  std::cout << "sin/mul = " << timer.timing(sin_id)/timer.timing(mul_id) << "\n";
  
}


================================================
FILE: benchmark/matrix_benchmark.cpp
================================================
#include <iostream>

#include <adept_arrays.h>

#include "Timer.h"

template<bool IsActive>
double
time_matmul(int n, int nrepeat, bool is_col_major)
{
  adept::Array<2,double,IsActive> A, B, C;
  Timer timer;
  int matmul_timer_id = timer.new_activity("matmul");
  if (is_col_major) {
    A.resize_column_major(adept::expression_size(n,n));
    B.resize_column_major(adept::expression_size(n,n));
    C.resize_column_major(adept::expression_size(n,n));
  }
  else {
    A.resize(n,n);
    B.resize(n,n);
    C.resize(n,n);
  }
  for (int irepeat = -nrepeat/10; irepeat < nrepeat; ++irepeat) {
    A = 1.1;
    B = 2.2;
    A.diag_vector() = 3.3;
    B.diag_vector() = 5.5;
    if (IsActive) {
      adept::active_stack()->new_recording();
    }
    if (irepeat >= 0) {
      timer.start(matmul_timer_id);
    }
    C = A ** B;
    if (irepeat >= 0) {
      timer.stop();
    }
  }
  /*
  if (IsActive && n < 8) {
    std::cout << "C=" << C;
    std::cout << *adept::active_stack();
    adept::active_stack()->print_statements();
  }
  */
  return timer.timing(matmul_timer_id) / nrepeat;
}

// Return the time per repeat, as reported by Timer::timing divided
// by nrepeat, of C = adept::solve(A, B) for n-by-n matrices.
// is_col_major selects column-major rather than the default storage.
// nrepeat/10 additional passes are executed first without being
// timed.
double
time_solve(int n, int nrepeat, bool is_col_major)
{
  adept::Matrix A, B, C;
  Timer timer;
  const int timer_id = timer.new_activity("solve");
  if (!is_col_major) {
    A.resize(n,n);
    B.resize(n,n);
    C.resize(n,n);
  }
  else {
    A.resize_column_major(adept::expression_size(n,n));
    B.resize_column_major(adept::expression_size(n,n));
    C.resize_column_major(adept::expression_size(n,n));
  }
  // Passes with a negative counter do the same work but are not timed
  const int nuntimed = nrepeat/10;
  for (int ipass = -nuntimed; ipass < nrepeat; ++ipass) {
    const bool is_timed = (ipass >= 0);
    A = 1.1;
    B = 2.2;
    A.diag_vector() = 3.3;
    B.diag_vector() = 5.5;
    if (is_timed) {
      timer.start(timer_id);
    }
    C = adept::solve(A, B);
    if (is_timed) {
      timer.stop();
    }
  }

  return timer.timing(timer_id) / nrepeat;
}


// Print the average time in microseconds of a dense N-by-N matrix
// multiplication, for both inactive and active matrices, and of a
// dense N-by-N matrix solve, for N = 2, 4, ..., 256.  The
// command-line arguments are not used.
int
main(int argc, char* argv[])
{
  int ibegin = 1;
  int iend = 8;
  int nrepeat = 20;
  bool is_col_major = false;

  // Declared before time_matmul<true> is called, which accesses
  // adept::active_stack()
  adept::Stack stack;
  int n = 2;
  std::cout << "Average cost per operation (" << nrepeat << " repeats)\n";
  std::cout << "Dense N-by-N matrix-matrix multiplication\n";
  //std::cout << " N        inactive time (us)   inactive flops    active time (us)    active flops\n";
  std::cout << "N \tinactive time (us) \tactive time (us)\n";
  for (int i = ibegin; i <= iend; ++i) {
    std::cout << n << " \t";

    // time_matmul and time_solve already return the time per repeat
    // in seconds, so only the conversion to microseconds is applied
    // here; dividing by nrepeat again, as was done previously, made
    // the reported times too small by a factor of nrepeat
    double t = time_matmul<false>(n, nrepeat, is_col_major);
    //    std::cout << t*1.0e6 << "  " << (n*n*n) / t << "  ";
    std::cout << t*1.0e6 << " \t\t\t";

    t = time_matmul<true>(n, nrepeat, is_col_major);
    //    std::cout << t*1.0e6 << "  " << (n*n*n) / t;
    std::cout << t*1.0e6;

    std::cout << "\n";

    n *= 2;
  }
  
  n = 2;
  std::cout << "Dense N-by-N matrix-matrix solve\n";
  std::cout << "N \tinactive time (us)\n";
  for (int i = ibegin; i <= iend; ++i) {
    std::cout << n << " \t";

    double t = time_solve(n, nrepeat, is_col_major);
    //    std::cout << t*1.0e6 << "  " << (n*n*n) / t << "  ";
    std::cout << t*1.0e6 << "\n";

    n *= 2;
  }
  return 0;
}


================================================
FILE: benchmark/nx.h
================================================
// Default value of NX, applied only if NX has not already been
// defined before this header is included.  The benchmark
// differentiators require their input vectors to have exactly NX
// elements.
#ifndef NX
#define NX 100
#endif


================================================
FILE: config_platform_independent.h.in
================================================
/* config_platform_independent.h.in. */

/* Name of package */
#undef PACKAGE

/* Define to the address where bug reports for this package should be sent. */
#undef PACKAGE_BUGREPORT

/* Define to the full name of this package. */
#undef PACKAGE_NAME

/* Define to the full name and version of this package. */
#undef PACKAGE_STRING

/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME

/* Define to the home page for this package. */
#undef PACKAGE_URL

/* Define to the version of this package. */
#undef PACKAGE_VERSION

/* Version number of package */
#undef VERSION


================================================
FILE: configure.ac
================================================
# Configure autoconf for the Adept library

### GENERAL CONFIGURATION ###

AC_PREREQ([2.61])
AC_INIT([adept], [2.1.3], [r.j.hogan@ecmwf.int], [adept], [http://www.met.reading.ac.uk/clouds/adept/])
AC_LANG([C++])
AC_CONFIG_SRCDIR([adept/Stack.cpp])
AC_CONFIG_HEADERS([config.h config_platform_independent.h])
AM_INIT_AUTOMAKE([foreign -Wall -Werror])
AC_CONFIG_MACRO_DIR([m4])

# Checks for programs
AC_PROG_CXX
AC_PROG_F77
AC_PROG_MAKE_SET
m4_ifdef([AM_PROG_AR],[AM_PROG_AR])
AC_PROG_LIBTOOL

# Check for system features
AC_CHECK_HEADERS([sys/time.h])
AC_CHECK_FUNCS([gettimeofday pow sqrt])

# Check for OpenMP
AC_OPENMP
AC_SUBST(AM_CXXFLAGS,"$OPENMP_CXXFLAGS")

#### LIBRARIES NEEDED BY ADEPT ###

if test "x$F77" = x
then
	AC_MSG_NOTICE([Not checking for BLAS and LAPACK because no Fortran compiler found])
else
	# Check for BLAS and LAPACK
	# First we need this since the libraries are Fortran called from C++
	AC_F77_LIBRARY_LDFLAGS
	# The following tests for both BLAS and LAPACK
	AX_LAPACK
fi

# Dependencies dictate the following order of libraries
LIBS="$LAPACK_LIBS $BLAS_LIBS $LIBS"
# FLIBS should be included in LDADD or LIBADD in the relevant
# Makefile.am

# If the BLAS library is OpenBLAS then we need to give the user the
# option to change the number of threads, since OpenBLAS's pthreads
# can clash with Adept's use of OpenMP, leading to suboptimal
# performance.  The check below is only attempted when BLAS was found
# and BLAS_LIBS is exactly -lopenblas; it succeeds if a program that
# includes cblas.h and calls openblas_set_num_threads can be linked,
# in which case HAVE_OPENBLAS_CBLAS_HEADER is defined.
ac_have_openblas_cblas_header=no

if test "$ax_blas_ok" = yes
then
	if test "x$BLAS_LIBS" = "x-lopenblas"
	then	
		AC_MSG_CHECKING([whether cblas.h is from OpenBLAS])
		AC_TRY_LINK([#include <cblas.h>],
		[openblas_set_num_threads(1)],
		[ac_have_openblas_cblas_header=yes
		AC_MSG_RESULT(yes)
		AC_DEFINE([HAVE_OPENBLAS_CBLAS_HEADER],1,[Is the cblas.h header file from OpenBLAS?])],
		AC_MSG_RESULT(no))
	fi
fi

### LIBRARIES THAT MAY BE USED BY TEST PROGRAMS ###

# Checks for GNU Scientific Library
AC_CHECK_LIB([gsl],[gsl_multimin_fdfminimizer_alloc],[AC_MSG_NOTICE([Note that GSL is not used by Adept, just by one of the test programs])])
AC_SUBST(USE_GSL, ["$ac_cv_lib_gsl_gsl_multimin_fdfminimizer_alloc"])

# Check for ADOL-C automatic differentiation library
AC_CHECK_HEADERS([adolc/adolc.h])
AC_CHECK_LIB([adolc],[tapestats])

# Check for SACADO automatic differentiation library
ac_have_sacado=no
save_LIBS=$LIBS
LIBS="$LIBS -lsacado -lteuchos"
AC_MSG_CHECKING([whether Sacado is installed])
AC_TRY_LINK([#include <Sacado.hpp>],
[Sacado::ELRFad::DFad<double> v = 1.0],
[ac_have_sacado=yes
AC_MSG_RESULT(yes)
AC_DEFINE([HAVE_SACADO],1,[Is the Sacado library working?])],
[LIBS=$save_LIBS
AC_MSG_RESULT(no)])

# Check for CppAD automatic differentiation library
AC_CHECK_HEADERS([cppad/cppad.hpp])
if test "$ac_cv_header_cppad_cppad_hpp" = yes
then
   AC_DEFINE([NDEBUG],1,[If CppAD is being used by the benchmarking program then it is much faster with debugging disabled])
fi


### CREATE MAKEFILES AND CONFIG HEADER ###

AC_CONFIG_FILES([Makefile makefile_include adept/Makefile include/Makefile benchmark/Makefile])

AC_DEFINE_UNQUOTED([CXX],["$CXX"],[C++ compiler])
AC_DEFINE_UNQUOTED([CXXFLAGS],["$CXXFLAGS"],[Flags passed to C++ compiler])
AC_DEFINE_UNQUOTED([BLAS_LIBS],["$BLAS_LIBS"],[BLAS library option])

AH_BOTTOM([/* Use ADOLC only if both the library and the header files are available */
#if defined( HAVE_LIBADOLC ) && defined( HAVE_ADOLC_ADOLC_H )
#define HAVE_ADOLC 1
#endif])
AH_BOTTOM([/* Use CPPAD if the header files are available */
#if defined( HAVE_CPPAD_CPPAD_HPP )
#define HAVE_CPPAD 1
#endif])

AC_OUTPUT


### REPORT CONFIGURATION TO THE USER ###

AC_MSG_NOTICE([********************* Summary **************************************])
AC_MSG_NOTICE([  CXX = $CXX ])
AC_MSG_NOTICE([  CPPFLAGS = $CPPFLAGS])
AC_MSG_NOTICE([  CXXFLAGS = $CXXFLAGS $OPENMP_CXXFLAGS])
AC_MSG_NOTICE([  LDFLAGS =  $LDFLAGS])
AC_MSG_NOTICE([  LIBS = $LIBS])
AC_MSG_NOTICE([Typing "make; make install" will install Adept header files in $includedir])
AC_MSG_NOTICE([and the static and shared libraries as $libdir/libadept.*, where])
AC_MSG_NOTICE([prefix=$prefix])
AC_MSG_NOTICE([********************* Libraries used by Adept **********************])
ac_warn_given=no
if test "$ax_blas_ok" = yes
then
	AC_MSG_NOTICE([BLAS (Basic Linear Algebra Subprograms) will be used: BLAS_LIBS = $BLAS_LIBS])
	if test "$ac_have_openblas_cblas_header" = yes
	then
	   AC_MSG_NOTICE([  Number of BLAS threads may be controlled at run time])
	fi
else
	AC_MSG_NOTICE([BLAS (Basic Linear Algebra Subprograms) will not be used: MATRIX MULTIPLICATION IS UNAVAILABLE])
	ac_warn_given=yes
fi
if test "$ax_lapack_ok" = yes
then
	AC_MSG_NOTICE([LAPACK (Linear Algebra Package) will be used: LAPACK_LIBS = $LAPACK_LIBS])
else
	AC_MSG_NOTICE([LAPACK (Linear Algebra Package) will not be used: LINEAR ALGEBRA ROUTINES ARE UNAVAILABLE])
	ac_warn_given=yes
fi

AC_MSG_NOTICE([********************* Libraries used by test programs **************])

if test "$ac_cv_lib_gsl_gsl_multimin_fdfminimizer_alloc" = no
then
	AC_MSG_NOTICE([GNU Scientific Library (GSL) not found; Adept will compile all the])
	AC_MSG_NOTICE([example programs except test/test_gsl_interface.])
	ac_warn_given=yes
else
	AC_MSG_NOTICE([GNU Scientific Library (GSL) found; Adept will compile all the])
	AC_MSG_NOTICE([example programs.])
fi

AC_MSG_NOTICE([********************* Benchmark program ****************************])
AC_MSG_NOTICE([The benchmarking program, "benchmark/advection_benchmark", will be])
AC_MSG_NOTICE([compiled with support for these automatic differentiation libraries:])
AC_MSG_NOTICE([   Adept: yes])

if test "$ac_cv_lib_adolc_tapestats" = yes -a "$ac_cv_header_adolc_adolc_h" = yes
then
   	AC_MSG_NOTICE([   ADOLC: yes])
else
	AC_MSG_NOTICE([   ADOLC: no])
	ac_warn_given=yes
fi

if test "$ac_cv_header_cppad_cppad_hpp" = yes
then
   	AC_MSG_NOTICE([   CppAD: yes])
else
	AC_MSG_NOTICE([   CppAD: no])
	ac_warn_given=yes
fi

if test "$ac_have_sacado" = no
then
	AC_MSG_NOTICE([   Sacado: no])
	ac_warn_given=yes
else
	AC_MSG_NOTICE([   Sacado: yes])
fi

AC_MSG_NOTICE([********************* Top tips *************************************])
AC_MSG_NOTICE([To use a higher than default optimization level, call this configure])
AC_MSG_NOTICE([script with something like: ./configure "CXXFLAGS=-g -O3"])
AC_MSG_NOTICE([If you have libraries in non-standard locations, specify their location])
AC_MSG_NOTICE([by calling this script with something like:])
AC_MSG_NOTICE([  ./configure CPPFLAGS=-I/local/include LDFLAGS="-L/local/lib -Wl,-rpath,/local/lib"])
AC_MSG_NOTICE([The rpath argument is especially useful for locating the BLAS and LAPACK])
AC_MSG_NOTICE([libraries if they are in non-standard locations, so that executables])
AC_MSG_NOTICE([built with Adept do not need to use the LD_LIBRARY_PATH environment])
AC_MSG_NOTICE([variable to specify their locations at run-time.])
AC_MSG_NOTICE([********************************************************************])


================================================
FILE: doc/COPYING
================================================

                GNU Free Documentation License
                 Version 1.3, 3 November 2008


 Copyright (C) 2000, 2001, 2002, 2007, 2008 Free Software Foundation, Inc.
     <http://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

0. PREAMBLE

The purpose of this License is to make a manual, textbook, or other
functional and useful document "free" in the sense of freedom: to
assure everyone the effective freedom to copy and redistribute it,
with or without modifying it, either commercially or noncommercially.
Secondarily, this License preserves for the author and publisher a way
to get credit for their work, while not being considered responsible
for modifications made by others.

This License is a kind of "copyleft", which means that derivative
works of the document must themselves be free in the same sense.  It
complements the GNU General Public License, which is a copyleft
license designed for free software.

We have designed this License in order to use it for manuals for free
software, because free software needs free documentation: a free
program should come with manuals providing the same freedoms that the
software does.  But this License is not limited to software manuals;
it can be used for any textual work, regardless of subject matter or
whether it is published as a printed book.  We recommend this License
principally for works whose purpose is instruction or reference.


1. APPLICABILITY AND DEFINITIONS

This License applies to any manual or other work, in any medium, that
contains a notice placed by the copyright holder saying it can be
distributed under the terms of this License.  Such a notice grants a
world-wide, royalty-free license, unlimited in duration, to use that
work under the conditions stated herein.  The "Document", below,
refers to any such manual or work.  Any member of the public is a
licensee, and is addressed as "you".  You accept the license if you
copy, modify or distribute the work in a way requiring permission
under copyright law.

A "Modified Version" of the Document means any work containing the
Document or a portion of it, either copied verbatim, or with
modifications and/or translated into another language.

A "Secondary Section" is a named appendix or a front-matter section of
the Document that deals exclusively with the relationship of the
publishers or authors of the Document to the Document's overall
subject (or to related matters) and contains nothing that could fall
directly within that overall subject.  (Thus, if the Document is in
part a textbook of mathematics, a Secondary Section may not explain
any mathematics.)  The relationship could be a matter of historical
connection with the subject or with related matters, or of legal,
commercial, philosophical, ethical or political position regarding
them.

The "Invariant Sections" are certain Secondary Sections whose titles
are designated, as being those of Invariant Sections, in the notice
that says that the Document is released under this License.  If a
section does not fit the above definition of Secondary then it is not
allowed to be designated as Invariant.  The Document may contain zero
Invariant Sections.  If the Document does not identify any Invariant
Sections then there are none.

The "Cover Texts" are certain short passages of text that are listed,
as Front-Cover Texts or Back-Cover Texts, in the notice that says that
the Document is released under this License.  A Front-Cover Text may
be at most 5 words, and a Back-Cover Text may be at most 25 words.

A "Transparent" copy of the Document means a machine-readable copy,
represented in a format whose specification is available to the
general public, that is suitable for revising the document
straightforwardly with generic text editors or (for images composed of
pixels) generic paint programs or (for drawings) some widely available
drawing editor, and that is suitable for input to text formatters or
for automatic translation to a variety of formats suitable for input
to text formatters.  A copy made in an otherwise Transparent file
format whose markup, or absence of markup, has been arranged to thwart
or discourage subsequent modification by readers is not Transparent.
An image format is not Transparent if used for any substantial amount
of text.  A copy that is not "Transparent" is called "Opaque".

Examples of suitable formats for Transparent copies include plain
ASCII without markup, Texinfo input format, LaTeX input format, SGML
or XML using a publicly available DTD, and standard-conforming simple
HTML, PostScript or PDF designed for human modification.  Examples of
transparent image formats include PNG, XCF and JPG.  Opaque formats
include proprietary formats that can be read and edited only by
proprietary word processors, SGML or XML for which the DTD and/or
processing tools are not generally available, and the
machine-generated HTML, PostScript or PDF produced by some word
processors for output purposes only.

The "Title Page" means, for a printed book, the title page itself,
plus such following pages as are needed to hold, legibly, the material
this License requires to appear in the title page.  For works in
formats which do not have any title page as such, "Title Page" means
the text near the most prominent appearance of the work's title,
preceding the beginning of the body of the text.

The "publisher" means any person or entity that distributes copies of
the Document to the public.

A section "Entitled XYZ" means a named subunit of the Document whose
title either is precisely XYZ or contains XYZ in parentheses following
text that translates XYZ in another language.  (Here XYZ stands for a
specific section name mentioned below, such as "Acknowledgements",
"Dedications", "Endorsements", or "History".)  To "Preserve the Title"
of such a section when you modify the Document means that it remains a
section "Entitled XYZ" according to this definition.

The Document may include Warranty Disclaimers next to the notice which
states that this License applies to the Document.  These Warranty
Disclaimers are considered to be included by reference in this
License, but only as regards disclaiming warranties: any other
implication that these Warranty Disclaimers may have is void and has
no effect on the meaning of this License.

2. VERBATIM COPYING

You may copy and distribute the Document in any medium, either
commercially or noncommercially, provided that this License, the
copyright notices, and the license notice saying this License applies
to the Document are reproduced in all copies, and that you add no
other conditions whatsoever to those of this License.  You may not use
technical measures to obstruct or control the reading or further
copying of the copies you make or distribute.  However, you may accept
compensation in exchange for copies.  If you distribute a large enough
number of copies you must also follow the conditions in section 3.

You may also lend copies, under the same conditions stated above, and
you may publicly display copies.


3. COPYING IN QUANTITY

If you publish printed copies (or copies in media that commonly have
printed covers) of the Document, numbering more than 100, and the
Document's license notice requires Cover Texts, you must enclose the
copies in covers that carry, clearly and legibly, all these Cover
Texts: Front-Cover Texts on the front cover, and Back-Cover Texts on
the back cover.  Both covers must also clearly and legibly identify
you as the publisher of these copies.  The front cover must present
the full title with all words of the title equally prominent and
visible.  You may add other material on the covers in addition.
Copying with changes limited to the covers, as long as they preserve
the title of the Document and satisfy these conditions, can be treated
as verbatim copying in other respects.

If the required texts for either cover are too voluminous to fit
legibly, you should put the first ones listed (as many as fit
reasonably) on the actual cover, and continue the rest onto adjacent
pages.

If you publish or distribute Opaque copies of the Document numbering
more than 100, you must either include a machine-readable Transparent
copy along with each Opaque copy, or state in or with each Opaque copy
a computer-network location from which the general network-using
public has access to download using public-standard network protocols
a complete Transparent copy of the Document, free of added material.
If you use the latter option, you must take reasonably prudent steps,
when you begin distribution of Opaque copies in quantity, to ensure
that this Transparent copy will remain thus accessible at the stated
location until at least one year after the last time you distribute an
Opaque copy (directly or through your agents or retailers) of that
edition to the public.

It is requested, but not required, that you contact the authors of the
Document well before redistributing any large number of copies, to
give them a chance to provide you with an updated version of the
Document.


4. MODIFICATIONS

You may copy and distribute a Modified Version of the Document under
the conditions of sections 2 and 3 above, provided that you release
the Modified Version under precisely this License, with the Modified
Version filling the role of the Document, thus licensing distribution
and modification of the Modified Version to whoever possesses a copy
of it.  In addition, you must do these things in the Modified Version:

A. Use in the Title Page (and on the covers, if any) a title distinct
   from that of the Document, and from those of previous versions
   (which should, if there were any, be listed in the History section
   of the Document).  You may use the same title as a previous version
   if the original publisher of that version gives permission.
B. List on the Title Page, as authors, one or more persons or entities
   responsible for authorship of the modifications in the Modified
   Version, together with at least five of the principal authors of the
   Document (all of its principal authors, if it has fewer than five),
   unless they release you from this requirement.
C. State on the Title page the name of the publisher of the
   Modified Version, as the publisher.
D. Preserve all the copyright notices of the Document.
E. Add an appropriate copyright notice for your modifications
   adjacent to the other copyright notices.
F. Include, immediately after the copyright notices, a license notice
   giving the public permission to use the Modified Version under the
   terms of this License, in the form shown in the Addendum below.
G. Preserve in that license notice the full lists of Invariant Sections
   and required Cover Texts given in the Document's license notice.
H. Include an unaltered copy of this License.
I. Preserve the section Entitled "History", Preserve its Title, and add
   to it an item stating at least the title, year, new authors, and
   publisher of the Modified Version as given on the Title Page.  If
   there is no section Entitled "History" in the Document, create one
   stating the title, year, authors, and publisher of the Document as
   given on its Title Page, then add an item describing the Modified
   Version as stated in the previous sentence.
J. Preserve the network location, if any, given in the Document for
   public access to a Transparent copy of the Document, and likewise
   the network locations given in the Document for previous versions
   it was based on.  These may be placed in the "History" section.
   You may omit a network location for a work that was published at
   least four years before the Document itself, or if the original
   publisher of the version it refers to gives permission.
K. For any section Entitled "Acknowledgements" or "Dedications",
   Preserve the Title of the section, and preserve in the section all
   the substance and tone of each of the contributor acknowledgements
   and/or dedications given therein.
L. Preserve all the Invariant Sections of the Document,
   unaltered in their text and in their titles.  Section numbers
   or the equivalent are not considered part of the section titles.
M. Delete any section Entitled "Endorsements".  Such a section
   may not be included in the Modified Version.
N. Do not retitle any existing section to be Entitled "Endorsements"
   or to conflict in title with any Invariant Section.
O. Preserve any Warranty Disclaimers.

If the Modified Version includes new front-matter sections or
appendices that qualify as Secondary Sections and contain no material
copied from the Document, you may at your option designate some or all
of these sections as invariant.  To do this, add their titles to the
list of Invariant Sections in the Modified Version's license notice.
These titles must be distinct from any other section titles.

You may add a section Entitled "Endorsements", provided it contains
nothing but endorsements of your Modified Version by various
parties--for example, statements of peer review or that the text has
been approved by an organization as the authoritative definition of a
standard.

You may add a passage of up to five words as a Front-Cover Text, and a
passage of up to 25 words as a Back-Cover Text, to the end of the list
of Cover Texts in the Modified Version.  Only one passage of
Front-Cover Text and one of Back-Cover Text may be added by (or
through arrangements made by) any one entity.  If the Document already
includes a cover text for the same cover, previously added by you or
by arrangement made by the same entity you are acting on behalf of,
you may not add another; but you may replace the old one, on explicit
permission from the previous publisher that added the old one.

The author(s) and publisher(s) of the Document do not by this License
give permission to use their names for publicity for or to assert or
imply endorsement of any Modified Version.


5. COMBINING DOCUMENTS

You may combine the Document with other documents released under this
License, under the terms defined in section 4 above for modified
versions, provided that you include in the combination all of the
Invariant Sections of all of the original documents, unmodified, and
list them all as Invariant Sections of your combined work in its
license notice, and that you preserve all their Warranty Disclaimers.

The combined work need only contain one copy of this License, and
multiple identical Invariant Sections may be replaced with a single
copy.  If there are multiple Invariant Sections with the same name but
different contents, make the title of each such section unique by
adding at the end of it, in parentheses, the name of the original
author or publisher of that section if known, or else a unique number.
Make the same adjustment to the section titles in the list of
Invariant Sections in the license notice of the combined work.

In the combination, you must combine any sections Entitled "History"
in the various original documents, forming one section Entitled
"History"; likewise combine any sections Entitled "Acknowledgements",
and any sections Entitled "Dedications".  You must delete all sections
Entitled "Endorsements".


6. COLLECTIONS OF DOCUMENTS

You may make a collection consisting of the Document and other
documents released under this License, and replace the individual
copies of this License in the various documents with a single copy
that is included in the collection, provided that you follow the rules
of this License for verbatim copying of each of the documents in all
other respects.

You may extract a single document from such a collection, and
distribute it individually under this License, provided you insert a
copy of this License into the extracted document, and follow this
License in all other respects regarding verbatim copying of that
document.


7. AGGREGATION WITH INDEPENDENT WORKS

A compilation of the Document or its derivatives with other separate
and independent documents or works, in or on a volume of a storage or
distribution medium, is called an "aggregate" if the copyright
resulting from the compilation is not used to limit the legal rights
of the compilation's users beyond what the individual works permit.
When the Document is included in an aggregate, this License does not
apply to the other works in the aggregate which are not themselves
derivative works of the Document.

If the Cover Text requirement of section 3 is applicable to these
copies of the Document, then if the Document is less than one half of
the entire aggregate, the Document's Cover Texts may be placed on
covers that bracket the Document within the aggregate, or the
electronic equivalent of covers if the Document is in electronic form.
Otherwise they must appear on printed covers that bracket the whole
aggregate.


8. TRANSLATION

Translation is considered a kind of modification, so you may
distribute translations of the Document under the terms of section 4.
Replacing Invariant Sections with translations requires special
permission from their copyright holders, but you may include
translations of some or all Invariant Sections in addition to the
original versions of these Invariant Sections.  You may include a
translation of this License, and all the license notices in the
Document, and any Warranty Disclaimers, provided that you also include
the original English version of this License and the original versions
of those notices and disclaimers.  In case of a disagreement between
the translation and the original version of this License or a notice
or disclaimer, the original version will prevail.

If a section in the Document is Entitled "Acknowledgements",
"Dedications", or "History", the requirement (section 4) to Preserve
its Title (section 1) will typically require changing the actual
title.


9. TERMINATION

You may not copy, modify, sublicense, or distribute the Document
except as expressly provided under this License.  Any attempt
otherwise to copy, modify, sublicense, or distribute it is void, and
will automatically terminate your rights under this License.

However, if you cease all violation of this License, then your license
from a particular copyright holder is reinstated (a) provisionally,
unless and until the copyright holder explicitly and finally
terminates your license, and (b) permanently, if the copyright holder
fails to notify you of the violation by some reasonable means prior to
60 days after the cessation.

Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.

Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License.  If your rights have been terminated and not permanently
reinstated, receipt of a copy of some or all of the same material does
not give you any rights to use it.


10. FUTURE REVISIONS OF THIS LICENSE

The Free Software Foundation may publish new, revised versions of the
GNU Free Documentation License from time to time.  Such new versions
will be similar in spirit to the present version, but may differ in
detail to address new problems or concerns.  See
http://www.gnu.org/copyleft/.

Each version of the License is given a distinguishing version number.
If the Document specifies that a particular numbered version of this
License "or any later version" applies to it, you have the option of
following the terms and conditions either of that specified version or
of any later version that has been published (not as a draft) by the
Free Software Foundation.  If the Document does not specify a version
number of this License, you may choose any version ever published (not
as a draft) by the Free Software Foundation.  If the Document
specifies that a proxy can decide which future versions of this
License can be used, that proxy's public statement of acceptance of a
version permanently authorizes you to choose that version for the
Document.

11. RELICENSING

"Massive Multiauthor Collaboration Site" (or "MMC Site") means any
World Wide Web server that publishes copyrightable works and also
provides prominent facilities for anybody to edit those works.  A
public wiki that anybody can edit is an example of such a server.  A
"Massive Multiauthor Collaboration" (or "MMC") contained in the site
means any set of copyrightable works thus published on the MMC site.

"CC-BY-SA" means the Creative Commons Attribution-Share Alike 3.0 
license published by Creative Commons Corporation, a not-for-profit 
corporation with a principal place of business in San Francisco, 
California, as well as future copyleft versions of that license 
published by that same organization.

"Incorporate" means to publish or republish a Document, in whole or in 
part, as part of another Document.

An MMC is "eligible for relicensing" if it is licensed under this 
License, and if all works that were first published under this License 
somewhere other than this MMC, and subsequently incorporated in whole or 
in part into the MMC, (1) had no cover texts or invariant sections, and 
(2) were thus incorporated prior to November 1, 2008.

The operator of an MMC Site may republish an MMC contained in the site
under CC-BY-SA on the same site at any time before August 1, 2009,
provided the MMC is eligible for relicensing.


ADDENDUM: How to use this License for your documents

To use this License in a document you have written, include a copy of
the License in the document and put the following copyright and
license notices just after the title page:

    Copyright (c)  YEAR  YOUR NAME.
    Permission is granted to copy, distribute and/or modify this document
    under the terms of the GNU Free Documentation License, Version 1.3
    or any later version published by the Free Software Foundation;
    with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
    A copy of the license is included in the section entitled "GNU
    Free Documentation License".

If you have Invariant Sections, Front-Cover Texts and Back-Cover Texts,
replace the "with...Texts." line with this:

    with the Invariant Sections being LIST THEIR TITLES, with the
    Front-Cover Texts being LIST, and with the Back-Cover Texts being LIST.

If you have Invariant Sections without Cover Texts, or some other
combination of the three, merge those two alternatives to suit the
situation.

If your document contains nontrivial examples of program code, we
recommend releasing these examples in parallel under your choice of
free software license, such as the GNU General Public License,
to permit their use in free software.


================================================
FILE: doc/Makefile
================================================
# Builds the Adept documentation PDFs from their LaTeX sources.
# Requires pdflatex: run "make" to generate the PDFs and "make clean"
# to remove them again.

# Base names (without extension) of the two documents
USER_GUIDE = adept_documentation
REFERENCE = adept_reference

# Default goal: both PDF documents
documentation: $(USER_GUIDE).pdf $(REFERENCE).pdf

# The user guide is passed through pdflatex three times in a row
# (presumably so that the table of contents and cross-references
# settle)
$(USER_GUIDE).pdf: $(USER_GUIDE).tex
	pdflatex $(USER_GUIDE).tex
	pdflatex $(USER_GUIDE).tex
	pdflatex $(USER_GUIDE).tex

# The reference sheet is built with a single pdflatex pass
$(REFERENCE).pdf: $(REFERENCE).tex
	pdflatex $(REFERENCE).tex

# Remove the generated PDFs only
clean:
	rm -f $(USER_GUIDE).pdf $(REFERENCE).pdf

.PHONY: documentation clean


================================================
FILE: doc/README
================================================
This directory contains the LaTeX source files for the Adept User
Guide and Adept Reference Sheet.

Type "make" to create the corresponding PDF files (using pdflatex),
and "make clean" to delete them.

Permission is granted to copy, distribute and/or modify the Adept User
Guide and Adept Reference Sheet under the terms of the GNU Free
Documentation License, Version 1.3 or any later version published by
the Free Software Foundation. This license may be found at
http://www.gnu.org/copyleft/fdl.html, and in this directory in the
"COPYING" file. As an exception, no copyright is asserted for the code
fragments in the document (indicated in the text with a light-grey
background); these code fragments are in the Public Domain and may be
copied, modified and distributed without restriction.


================================================
FILE: doc/adept_documentation.tex
================================================
% 
% Adept automatic differentiation library for C++: User guide
%
% Type "pdflatex adept_documentation.tex" twice to recreate the PDF
% file (or simply type "make" in this directory if you have pdflatex
% installed).
%
% Permission is granted to copy, distribute and/or modify this
% document under the terms of the GNU Free Documentation License,
% Version 1.3 or any later version published by the Free Software
% Foundation. This license may be found at
% http://www.gnu.org/copyleft/fdl.html, and in this directory in the
% "COPYING" file. As an exception, no copyright is asserted for the
% code fragments in this document (indicated in the text with a
% light-grey background); these code fragments are in the Public
% Domain and may be copied, modified and distributed without
% restriction.

\documentclass[a4paper,oneside]{book}
\usepackage[colorlinks=true,linkcolor=blue,citecolor=blue]{hyperref}
\usepackage{natbib}
\usepackage{times}
\usepackage{listings}
\usepackage{xcolor}
\usepackage{color}
\usepackage{marginnote}
\usepackage{rotating}

\usepackage{mdframed,lipsum}
\newmdenv[
  leftmargin = 0pt,
  innerleftmargin = 1em,
  innertopmargin = 0pt,
  innerbottommargin = 0pt,
  innerrightmargin = 0pt,
  rightmargin = 0pt,
  linewidth = 1pt,
  topline = false,
  rightline = false,
  bottomline = false
  ]{leftbar}

% Set math in Times Roman
\DeclareSymbolFont{letters}{OML}{ptmcm}{m}{it}
\DeclareSymbolFont{operators}{OT1}{ptmcm}{m}{n}

% Page set up
\setlength{\oddsidemargin}{0cm} %{0.5cm}
\setlength{\evensidemargin}{0cm} %{0.5cm}
\setlength{\topmargin}{-2cm}
\setlength{\textheight}{24cm}
\setlength{\textwidth}{16cm}
\setlength{\marginparsep}{0.5cm}
\setlength{\marginparwidth}{0cm}
\setlength{\parindent}{1em}
\setlength{\parskip}{0cm}
\renewcommand{\baselinestretch}{1.1}
\sloppy

% Configure appearance of code listings
\definecolor{light-gray}{gray}{0.92}
\def\codesize{\small}
\lstset{language=C++,
  backgroundcolor=\color{light-gray},
  numbersep=5pt,
  xleftmargin=0cm,
  xrightmargin=0cm,
  basicstyle=\footnotesize\ttfamily,
  emph={adouble,xdouble,Stack,adept,Array,FixedArray,Vector,aVector,aReal,Optimizable,Real,Minimizer,MinimizerStatus,Matrix,aMatrix,Array3D,aArray3D,intVector,boolVector,floatVector,floatMatrix,intMatrix,FortranArray,SpecialMatrix,SquareMatrix,aSquareMatrix,SymmMatrix,aSymmMatrix,UpperMatrix,LowerMatrix,IndexVector,adept_arrays,adept_optimize,adept_fortran},
  emphstyle=\bfseries\color{red}}
\lstset{showstringspaces=false}

% Table-of-contents configuration
\usepackage{tocloft}
\setlength\cftparskip{-2pt}
\setlength\cftbeforesecskip{1pt}
\setlength\cftaftertoctitleskip{2pt}
\renewcommand\cftsecfont{\normalfont}
\renewcommand\cftsecpagefont{\normalfont}
\renewcommand{\cftsecleader}{\cftdotfill{\cftsecdotsep}}
\renewcommand\cftsecdotsep{\cftdot}
\renewcommand\cftsubsecdotsep{\cftdot}

% Page headers
\usepackage{fancyhdr}
\pagestyle{fancy}
\renewcommand{\headrulewidth}{0.5pt}
\renewcommand{\sectionmark}[1]{\markright{\thesection.\ #1}}
\renewcommand{\subsectionmark}[1]{}
\fancyhead[RO,RE]{\thepage}
\fancyfoot[C]{}

% Symbols and macros
\def\x{\ensuremath{{\bf x}}}
\def\y{\ensuremath{{\bf y}}}
\def\H{\ensuremath{{\bf H}}}
\def\T{\ensuremath{^\mathrm{T}}}
\def\Adept{\emph{Adept}}
\def\code#1{{\codesize\texttt{#1}}}
\def\codebf#1{{\codesize\texttt{\textbf{#1}}}}
\def\citem#1{\item[{\codesize\texttt{#1}}]}
\def\codestyle#1{\texttt{#1}}
\def\Offset{size\_t}
\renewcommand\thefootnote{\relax}
\def\cxx11{\marginpar{\rotatebox[origin=rb]{90}{\textbf{C++11 only~~~}}}}
\reversemarginpar

% Title material
\title{\Adept\ C++ Software Library: User Guide}

\author{Robin J. Hogan\\ \emph{European Centre for Medium-Range
    Weather Forecasts, Reading, UK}\\ \emph{and School of
    Mathematical, Physical and Computational Sciences, University of
    Reading, UK}}

\date{Document version 2.1.3 (February 2024) applicable to \Adept\ version
  2.1.3 \thanks{This document is copyright \copyright\ Robin J. Hogan
    2013--2024.  Permission is granted to copy, distribute and/or
    modify this document under the terms of the GNU Free Documentation
    License, Version 1.3 or any later version published by the Free
    Software Foundation. This license may be found at
    \url{http://www.gnu.org/copyleft/fdl.html}.  As an exception, no
    copyright is asserted for the code fragments in this document
    (indicated in the text with a light-grey background); these code
    fragments are hereby placed in the Public Domain, and accordingly
    may be copied, modified and distributed without restriction.}
  \thanks{If you have any queries about \Adept\ that are not answered
    by this document or by the information on the \Adept\ web site
    (\url{http://www.met.reading.ac.uk/clouds/adept/}) then please
    email me at
    \href{mailto:r.j.hogan@ecmwf.int}{\texttt{r.j.hogan@ecmwf.int}}.}}
\begin{document}
\maketitle

\tableofcontents
\def\thefootnote{\fnsymbol{footnote}}
\chapter{Introduction}
\section{What is Adept?}
\Adept\ (Automatic Differentiation using Expression Templates) is a
C++ software library that enables algorithms to be automatically
differentiated. Since version 2.0\footnote{Note that the version 1.9.x
  series served as beta releases for version 2.0 of \Adept.} it also
provides array classes that can be used in array expressions.  These
two capabilities are fully integrated such that array expressions can
be differentiated efficiently, but the array capability may also be
used on its own.

The automatic-differentiation capability uses an operator overloading
approach, so very little code modification is
required. Differentiation can be performed in forward mode (the
``tangent-linear'' computation), reverse mode (the ``adjoint''
computation), or the full Jacobian matrix can be computed. This
behaviour is common to several other libraries, namely ADOL-C
\citep{Griewank+1996}, CppAD \citep{Bell2007} and Sacado
\citep{Gay2005}, but the use of expression templates, an efficient way
to store the differential information and several other optimizations
mean that reverse-mode differentiation tends to be significantly
faster and use less memory. In fact, \Adept\ is also usually only a
little slower than an adjoint code you might write by hand, but
immeasurably faster in terms of user time; adjoint coding is very time
consuming and error-prone. For technical details of how it works,
benchmark results and further discussion of the factors affecting its
speed when applied to a particular code, see \cite{Hogan2014}.

Expression templates also underpin a number of libraries that provide
the capability to perform mathematical operations on entire arrays
\citep{Veldhuizen1995}. Unfortunately, if \Adept\ version 1.x and such
an array library are used together, then the speed advantages of
expression templates are lost, if indeed the libraries work together
at all. Since version 2.0, \Adept\ provides array classes that
overcome this problem: its automatic differentiation and array
capabilities are underpinned by a single unified expression template
framework so that array expressions may be differentiated very
efficiently.  However, it should be stressed that \Adept\ is useful as
a fully functional array library even if you don't wish to use its
automatic differentiation capability. \Adept\ uses BLAS and LAPACK for
matrix operations.

This user guide describes how to apply the \Adept\ software library to
your code, and many of the examples map on to those in the \code{test}
directory of the \Adept\ software package.  Section
\ref{sec:installing} outlines how to install \Adept\ on your system
and how to compile your own code to use it. Chapter \ref{chap:ad}
describes how to use the automatic differentiation capability of the
library, chapter \ref{chap:arrays} its array capability and chapter
\ref{chap:optimize} its optimization capability. Chapter
\ref{chap:gen} then describes general aspects such as exception
handling, configuration options and license terms.

\section{Installing \Adept\ and compiling your code to use it}
\label{sec:installing}
\Adept\ should work with any C++98 compliant compiler, but uses some
C++11 features if compiled with support for this later standard. Most
of the testing has been on Linux with the GNU C++ compiler, but it
also compiles on Linux with the Clang and Intel compilers and on
Windows with the Microsoft compiler. The code is built with the help
of a \code{configure} shell script generated by GNU autotools.  If you
are on a non-Unix system (e.g.\ Windows) and cannot use shell scripts,
see section \ref{sec:non-unix}.
\subsection{Unix-like platforms}
\label{sec:unix}
On a Unix-like system, do the following:
\begin{enumerate}
\item Install the BLAS library to enable matrix multiplication.  For
  the best performance in matrix operations it is recommended that you
  install an optimized package such as OpenBLAS\footnote{OpenBLAS is
    available from \url{http://www.openblas.net/}.} or
  ATLAS\footnote{ATLAS is available from
    \url{http://math-atlas.sourceforge.net/}.}.  If you have multiple
  BLAS libraries available on your system you can specify the one you
  want by calling the \code{configure} script below with
  \code{--with-blas=openblas} or similar.  If \Adept\ is compiled
  without BLAS support then matrix multiplication will fail at run
  time.
\item Optionally install the LAPACK library, necessary for matrix
  inversion and solving linear systems of equations. If you do not
  install this then \Adept\ will still compile but the functions
  \code{inv} and \code{solve} will fail at run time. Note that LAPACK
  relies on the underlying BLAS library for its speed.
\item The test and benchmarking programs can make use of additional
  libraries if available. If you also install any of the automatic
  differentiation tools ADOL-C, CppAD and/or Sacado then the
  benchmarking test program can compare them to \Adept. One of the
  test programs uses the minimization algorithm from the GNU
  Scientific Library, if available, so you may wish to install that
  too.
\item Unpack the package (\code{tar xvfz adept-2.x.tar.gz} on Linux)
  and \code{cd} to the directory \code{adept-2.x}.
\item Configure the build using the \code{configure} script. The most
  basic method is to just run
\begin{lstlisting}
 ./configure
\end{lstlisting}
More likely you will wish to compile with a higher level of
optimization than the default (which is \code{-O2}), achieved by
setting the environment variable \code{CXXFLAGS}. You may also wish to
specify the root directory of the installation, say to
\code{/foo}. These may be done by running instead
\begin{lstlisting}
 ./configure CXXFLAGS="-g -O3" --prefix=/foo
\end{lstlisting}
The \code{-g} option to \code{CXXFLAGS} ensures debugging information
is stored. If you use the GNU compiler then consider the \code{-g1}
option instead to reduce the amount of debugging information
stored. The GNU \code{-march=native} option will also enable the
fastest instruction set for the machine on which the code is being
compiled.  \Adept\ can vectorize certain floating-point array
expressions making use of the SSE2, AVX and AVX512 instruction sets on
Intel hardware and the NEON instruction set on 64-bit ARM. If a library you
wish to use is installed in a non-system directory, say under
\code{/foo}, then specify the locations as follows:
\begin{lstlisting}
 ./configure CPPFLAGS="-I/foo/include" LDFLAGS="-L/foo/lib -Wl,-rpath,/foo/lib"
\end{lstlisting}
where the \code{-rpath} business is needed in order that the
\Adept\ shared library knows where to look for the libraries it is
dependent on.  If you have them then for the benchmarking program you
can also add the non-system location of ADOL-C, CppAD and Sacado
libraries with additional \code{-I} and \code{-L} arguments, but note
that the \code{-rpath} argument is not needed in that case.  You can
see the more general options available by running \code{./configure
  --help}; for example, you can turn off OpenMP parallelization in the
computation of Jacobian matrices using \code{--disable-openmp}.  See
also section \ref{sec:configuring} for ways to make more fundamental
changes to the configuration of \Adept.  The output from the
\code{configure} script provides information on aspects of how
\Adept\ and the test programs will be built.
\item Build \Adept\ by running
\begin{lstlisting}
 make
\end{lstlisting}
This will create the static and shared libraries in \code{adept/.libs}.
\item Install the header files and the static and shared libraries by
  running
\begin{lstlisting}
 make install
\end{lstlisting}
If this is to be installed to a system directory, you will need to log
in as the super-user first, or run \code{sudo make install},
depending on your system.
\item Build and run the test programs by running
\begin{lstlisting}
 make check
\end{lstlisting}
Note that this may be done without first installing the
\Adept\ library to a system directory.  This compiles a number of test
programs in the \code{test} directory and runs them one by one; if any
fail due to an incorrect result then \code{make check} will fail.
%
The \code{make check} operation also compiles
\code{autodiff\_benchmark} in the \code{benchmark} directory for
comparing the speed of the differentiation of two advection algorithms
using \Adept, ADOL-C, CppAD and Sacado (or whichever subset of these
tools you have on your system).  It also compiles \code{animate} for
visualizing at a terminal what the algorithms are doing.  Further
information on running these programs can be found in the
\code{README} files in the relevant directories.
\end{enumerate}
%
The test programs in the \code{test} directory are as follows:
%
\begin{enumerate}
\item\code{test\_adept}: compares the results of numerical and
  automatic differentiation.
\item\code{test\_with\_without\_ad}: does the same but compiling the
  same source code both with and without automatic differentiation
  (see \code{test/Makefile} for how this is done).
\item\code{test\_radiances}: demonstrates the interfacing of
  \Adept\ with code that provides its own Jacobian.
\item\code{test\_gsl\_interface}: implementation of a simple minimization
  problem using the L-BFGS minimizer in the GSL library.
\item\code{test\_misc}: the trivial example from \cite{Hogan2014}.
\item\code{test\_checkpoint}: demonstration of checkpointing, a useful
  technique for large codes.
\item\code{test\_thread\_safe}: demonstration of the use of multiple
  OpenMP threads, each with its own instance of an \Adept\ stack.
\item\code{test\_no\_lib}: demonstrates the use of the
  \code{adept\_source.h} header file that means there is no need to
  link to the \Adept\ library in order to create an executable.
\item\code{test\_arrays}, \code{test\_arrays\_active},
  \code{test\_arrays\_active\_pausable}, \code{test\_complex\_arrays}:
  test many of the array capabilities described in chapter
  \ref{chap:arrays}. Each of these four executables is compiled from
  the same source file but with different compiler options in order to
  test the same array operations but with (a) passive arrays, (b)
  active arrays, (c) active arrays but with stack recording ``paused''
  (see section \ref{sec:pausable}), and (d) complex arrays.
\item\code{test\_array\_speed}: compares the speed of array operations
  versus the equivalent C-style \code{for} loop.
\item\code{test\_radiances\_array}: as \code{test\_radiances} but
  demonstrates the use of \code{add\_derivative\_dependence} with
  array arguments.
\item\code{test\_fixed\_arrays}, \code{test\_fixed\_arrays\_active}:
  tests the functionality of arrays with fixed dimensions, i.e.\ those
  known at compile time. The two executables are compiled from the
  same source file, testing (a) passive arrays and (b) active arrays.
\item\code{test\_constructors}: test the different ways of
  constructing, assigning and linking arrays, and passing them to and
  from functions.
\item\code{test\_derivatives}: tests that all mathematical functions
  supported by \Adept\ differentiate correctly.
\item\code{test\_array\_derivatives}: tests that selected array
  operations differentiate correctly.
\item\code{test\_thread\_safe\_arrays}: tests two ways to ensure
  arrays may be accessed and subsetted safely in a multi-threaded
  environment.
\item\code{test\_packet\_operations}: tests that Adept's use of Intel
  or ARM intrinsics to accelerate vector operations leads to identical
  output to the equivalent scalar code.
\item\code{test\_fastexp}: tests the correctness of Adept's fast
  exponential function.
\item\code{test\_reduce\_active}: tests the correctness of the
  differentiation of reduction operations (\code{sum}, \code{product},
  \code{maxval} etc).
\item\code{test\_minimizer}: tests Adept's minimization capabilities
  on the N-dimensional Rosenbrock banana function. Different
  dimensionality and minimization algorithms can be used, but by
  default the Levenberg-Marquardt minimizer is used with the
  2-dimensional Rosenbrock function.
\end{enumerate}

To compile source files that use the \Adept\ library, you need to make
sure that \code{adept.h} and \code{adept\_arrays.h} are in your
include path. If they are located in a directory that is not in the
default include path, add something like \code{-I/home/fred/include}
to the compiler command line. At the linking stage, add \code{-ladept}
to the command line to tell the linker to look for the
\code{libadept.a} static library, or equivalent shared library. If
this file is in a non-standard location, also add something like
\code{-L/home/fred/lib -Wl,-rpath,/home/fred/lib} before the
\code{-ladept} argument to specify its location. Section
\ref{sec:multipleobjects} provides an example Makefile for compiling
code that uses the \Adept\ library. Read on to see how you can compile
an \Adept\ application \emph{without} needing to link to a library.

\subsection{Non-Unix platforms, and compiling \Adept\ applications
  without linking to an external library}
\label{sec:non-unix}

Most of the difficulty in maintaining software that can compile on
multiple platforms arises from the different ways of compiling
software libraries, and the need to test on compilers that may be
proprietary.  Unfortunately I don't have the time to maintain versions
of \Adept\ that build specifically on Microsoft Windows or other
non-Unix platforms.  However, \Adept\ is not a large library, so I
have provided a very simple way to build an \Adept\ application
\emph{without} the need to link to a pre-compiled \Adept\ library. In
one of your source files and one only, add this near the top:
\begin{lstlisting}
 #include <adept_source.h>
\end{lstlisting}
Typically you would include this in the source file containing the
\code{main} function.  This header file is simply a concatenation of
the \Adept\ library source files, so when you compile a file that
includes it, you compile in all the functionality of the
\Adept\ library. All other source files in your application should
include only the \code{adept.h} or \code{adept\_arrays.h} header file
as normal.  When you link all your object files together to make an
executable, the \Adept\ functionality will be built in, even though
you did not link to an external \Adept\ library.

By default, \code{adept\_arrays.h} does not enable BLAS (needed for
matrix multiplication) or LAPACK (needed for matrix inversion and
solving linear systems of equations); to enable either BLAS alone, or
both BLAS and LAPACK, uncomment the lines near the top of
\code{adept\_source.h} defining \code{HAVE\_BLAS} and
\code{HAVE\_LAPACK}, and link against functioning BLAS and LAPACK
libraries. A demonstration of the use of \code{adept\_source.h} is in
the \code{test/test\_no\_lib.cpp} source file, which needs to be
compiled together with \code{test/algorithm.cpp} to make an
executable.
%
It is hoped that this feature will make it easy to use \Adept\ on
non-Unix platforms, although of course it works just as well on
Unix-like platforms.
%  If you want to use OpenBLAS on such
%platforms then you will still need to install that library in the
%normal way.%

A further point to note is that, under the terms of the license, it is
permitted to copy all the \Adept\ include files, including
\code{adept\_source.h}, into an include directory in your software
package and use it from there in both binary and source-code releases
of your software. This means that users do not need to install
\Adept\ separately before they use your software.  However, if you do
this then remember that your use of these files must comply with the
terms of the Apache License, Version 2.0; see section
\ref{sec:license} for details.
%
\chapter{Using \Adept\ for automatic differentiation}
\label{chap:ad}
%
\section{Introduction}
\label{sec:ad_functionality}
This chapter describes how to use \Adept\ to differentiate your code.
For simplicity, none of the examples use array functionality described
in the next chapter. \Adept\ provides the following
automatic-differentiation functionality:
%
\begin{description}
\item[Full Jacobian matrix] Given the non-linear function $\y=f(\x)$
  relating vector $\y$ to vector $\x$ coded in C or C++, after a
  little code modification \Adept\ can compute the Jacobian matrix
  $\H=\partial\y/\partial\x$, where the element at row $i$ and column $j$ of
  $\H$ is $H_{i,j}=\partial y_i/\partial x_j$. This matrix will be
  computed much more rapidly and accurately than if you simply
  recompute the function multiple times, each time perturbing a
  different element of $\x$ by a small amount. The Jacobian matrix is
  used in the Gauss-Newton and Levenberg-Marquardt minimization
  algorithms.
\item[Reverse-mode differentiation] This is a key component in
  optimization problems where a non-linear function needs to be
  minimized but the state vector $\x$ is too large for it to make
  sense to compute the full Jacobian matrix. Atmospheric data
  assimilation is the canonical example in the field of
  meteorology. Given a non-linear function $J(\x)$ relating the
  scalar to be minimized $J$ to vector $\x$, \Adept\ will compute the
  vector of adjoints $\partial J/\partial\x$. Moreover, for a
  component of the code that may be expressed as a multi-dimensional
  non-linear function $\y=f(\x)$, \Adept\ can compute $\partial
  J/\partial\x$ if it is provided with the vector of input adjoints
  $\partial J/\partial\y$.  In this case, $\partial J/\partial\x$ is
  equal to the matrix-vector product $\H\T\partial J/\partial\y$, but
  it is computed here without computing the full Jacobian matrix
  $\H$. The vector $\partial J/\partial\x$ may then be used in a
  quasi-Newton minimization scheme \cite[e.g.,][]{Liu+1989}.
\item[Forward-mode differentiation] Given the non-linear function
  $\y=f(\x)$ and a vector of perturbations $\delta\x$, \Adept\ will
  compute the corresponding vector $\delta\y$ arising from a
  linearization of the function $f$. Formally, $\delta\y$ is equal
  to the matrix-vector product $\H\delta\x$, but it is computed here
  without computing the full Jacobian matrix $\H$. Note that
  \Adept\ is designed for the reverse case, so might not be as fast
  or economical in memory in the forward mode as libraries written
  especially for that purpose (although Hogan, 2014, showed that it
  was competitive).
\end{description}%
%
\Adept\ can automatically differentiate the following
operators and functions:
\begin{itemize}
\item The standard binary mathematical operators \code{+}, \code{-},
  \code{*} and \code{/}.
\item The assignment versions of these operators:
  \code{+=}, \code{-=}, \code{*=} and \code{/=}.
\item The unary mathematical functions \code{sqrt}, \code{exp},
  \code{log}, \code{log10}, \code{sin}, \code{cos}, \code{tan},
  \code{asin}, \code{acos}, \code{atan}, \code{sinh}, \code{cosh},
  \code{tanh}, \code{abs}, \code{asinh}, \code{acosh}, \code{atanh},
  \code{expm1}, \code{log1p}, \code{cbrt}, \code{erf}, \code{erfc},
  \code{exp2}, \code{log2}, \code{round}, \code{trunc}, \code{rint}
  and \code{nearbyint}.
\item The binary functions \code{pow}, \code{atan2}, \code{min},
  \code{max}, \code{fmin} and \code{fmax}.
\end{itemize}
Variables to take part in expressions to be differentiated have a
special ``active'' type; such variables can take part in comparison
operations \code{==}, \code{!=}, \code{>}, \code{<}, \code{>=} and
\code{<=}, as well as the diagnostic functions \code{isfinite},
\code{isinf} and \code{isnan}.

Note that at present \Adept\ is missing some functionality that you may
require:

\begin{itemize}
\item Differentiation is first-order only: it cannot directly compute
  higher-order derivatives such as the Hessian matrix, although
  section \ref{sec:optimize} describes how \Adept\ can help compute
  the approximate Hessian if the cost function (also known as the
  penalty function or objective function) is in a particular commonly
  used form.
\item It has limited support for complex numbers; no support for
  mathematical functions of complex numbers, and expressions involving
  operations (addition, subtraction, multiplication and division) on
  complex numbers are not optimized.
\item It can be applied to C and C++ only; \Adept\ could not be
  written in Fortran since the language provides no template
  capability.
\end{itemize}%
%
It is hoped that future versions will remedy these limitations (and
maybe even a future version of Fortran will support templates).

Section \ref{sec:preparation} describes how to prepare your code for
automatic differentiation, and section \ref{sec:adjoint} describes how
to perform forward- and reverse-mode automatic differentiation on this
code. Section \ref{sec:jacobian} describes how to compute Jacobian
matrices. Section \ref{sec:realworld} provides a detailed description
of how to interface an algorithm implemented using \Adept\ with a
third-party minimization library.  Section \ref{sec:withwithout}
describes how to call a function both with and without automatic
differentiation from within the same program. Section
\ref{sec:interfacehandcoded} describes how to interface to software
modules that compute their own Jacobians.  Section \ref{sec:stack}
describes the user-oriented member functions of the \code{Stack} class
that contains the differential information and section
\ref{sec:adouble} describes the member functions of the ``active''
double-precision type \code{adouble}.


\section{Code preparation}
\label{sec:preparation}
If you have used ADOL-C, CppAD or Sacado then you will already be
familiar with what is involved in applying an operator-overloading
automatic differentiation package to your code. The user interface to
\Adept\ differs from these only in the detail. It is assumed that you
have an algorithm written in C or C++ that you wish to
differentiate. This section deals with the modifications needed to
your code, while section \ref{sec:adjoint} describes the small
additional amount of code you need to write to differentiate it.

In all source files containing code to be differentiated, you need to
include the \code{adept.h} header file and import the \code{adouble}
type from the \code{adept} namespace. Assuming your code uses double
precision, you then search and replace \code{double} with the
``active'' equivalent \code{adouble}, but doing this only for those
variables whose values depend on the independent input variables.
Under the hood this type is an alias for \code{Active<double>}.  The
single-precision equivalent is \code{afloat}, an alias for
\code{Active<float>}.  Active and passive variables of single and
double precision may be used together in the same expressions, but
note that by default all differentiation is done in double precision.

If you wish to enable your code to be easily recompiled to use
different precisions, then you may alternatively use the generic
\code{Real} type from the \code{adept} namespace with its active
equivalent \code{aReal} (an alias for \code{Active<Real>}). Section
\ref{sec:configuring} describes how to redefine \code{Real} to
represent single, double or quadruple precision.  Automatic
differentiation will be performed using the same precision as
\code{Real}, but be aware that if this is defined to be the same
as a single-precision \code{float}, accumulation of round-off error
can make the accuracy of derivatives insufficient for minimization
algorithms. The examples in the remainder of this chapter use only
double precision.

Consider the following contrived algorithm from \cite{Hogan2014} that
takes two inputs and returns one output:

\begin{lstlisting}
 double algorithm(const double x[2]) {
   double y = 4.0;
   double s = 2.0*x[0] + 3.0*x[1]*x[1];
   y *= sin(s);
   return y;
 }
\end{lstlisting}

\noindent The modified code would look like this:

\begin{lstlisting}
 #include <adept.h>
 using adept::adouble;

 adouble algorithm(const adouble x[2]) {
   adouble y = 4.0;
   adouble s = 2.0*x[0] + 3.0*x[1]*x[1];
   y *= sin(s);
   return y;
 }
\end{lstlisting}

\noindent Changes like this need to be done in all source files that
form part of an algorithm to be differentiated. 

If you need to access the real number underlying an \code{adouble}
variable \code{a}, for example in order to use it as an argument to
the \code{fprintf} function, then use \code{a.value()} or
\code{adept::value(a)}. Any mathematical operations performed on
this real number will not be differentiated.

You may use \code{adouble} as the template argument of a Standard
Template Library (STL) vector type (i.e.  \code{std::vector\textless
  adouble\textgreater}), or indeed any container where you access
individual elements one by one. For types allowing mathematical
operations on the whole object, such as the STL \code{complex} and
\code{valarray} types, you will find that although you can multiply
one \code{std::complex\textless adouble\textgreater} or
\code{std::valarray\textless adouble\textgreater} object by another,
mathematical functions (\code{exp}, \code{sin} etc.) will not work
when applied to whole objects, and neither will some simple operations
such as multiplying these types by an ordinary (non-active)
\code{double} variable.  Moreover, the performance is not great
because expressions cannot be fully optimized when in these
containers.  Therefore, if you need array functionality then you should
use the features described in chapter \ref{chap:arrays}.  It is hoped
that a future version of \Adept\ will include its own complex type.

\section{Applying reverse-mode differentiation}
\label{sec:adjoint}

Suppose you wanted to create a version of \code{algorithm} that
returned not only the result but also the gradient of the result with
respect to its inputs, you would do this:

\begin{lstlisting}
 #include <adept.h>
 double algorithm_and_gradient(
                     const double x_val[2], // Input values
                     double dy_dx[2]) {     // Output gradients
   adept::Stack stack;                      // Where the derivative information is stored
   using adept::adouble;                    // Import adouble from adept
   adouble x[2] = {x_val[0], x_val[1]};     // Initialize active input variables
   stack.new_recording();                   // Start recording
   adouble y = algorithm(x);                // Call version overloaded for adouble args
   y.set_gradient(1.0);                     // Defines y as the cost function 
   stack.compute_adjoint();                 // Run the adjoint algorithm
   dy_dx[0] = x[0].get_gradient();          // Store the first gradient
   dy_dx[1] = x[1].get_gradient();          // Store the second gradient
   return y.value();                        // Return the result of the simple computation
 }
\end{lstlisting}
%
The component parts of this function are in a specific order, and if
this order is violated then the code will not run correctly. The steps
are now described.
%
\subsection{Set-up stack to record derivative information}
\label{sec:stack_setup}
\begin{lstlisting}
 adept::Stack stack;
\end{lstlisting}
The \code{Stack} object is where the differential version of the
algorithm will be stored. When initialized, it makes itself accessible
to subsequent statements via a global variable, but using thread-local
storage to ensure thread safety. \emph{It must be initialized before
  the first \code{adouble} object is instantiated and it must not go
  out of scope until the last \code{adouble} object is destructed.}
This is because \code{adouble} objects register themselves with the
currently active stack, and deregister themselves when they are
destroyed; if the same stack is not active throughout the lifetime of
such \code{adouble} objects then the code will crash with a
segmentation fault.

In the example here, the \code{Stack} object is local to the scope of
the function. If another \code{Stack} object had been initialized by
the calling function and so was active at the point of entry to the
function, then the local \code{Stack} object would throw an
\code{adept::stack\_already\_active} exception. See Test 3 described
at \code{test/README} in the \Adept\ package if you want to use
multiple \code{Stack} objects in the same program: the relevant source
code is in \code{test/simulate\_radiances.cpp}, which temporarily
deactivates the existing \code{Stack} objects in order that the local
one can run.  A disadvantage of local \code{Stack} objects is that the
memory they use must be reallocated each time the function is called.
This can be overcome in several ways:
\begin{itemize}
\item Declare the \code{Stack} object to be \code{static}, which means
  that it will persist between function calls. This has the
  disadvantage that you won't be able to use other \code{Stack}
  objects in the program without deactivating this one first (see \code{test\_radiances} in the \Adept\ package, referred to above, for how to do this).
\item Initialize \code{Stack} at a higher level in the program. If you
  need access to the stack, you may either pass a reference to it to
  functions such as \code{algorithm\_and\_gradient}, or alternatively
  you can use the \code{adept::active\_stack()} function to return a
  pointer to the currently active stack object.
\item Put it in a class so that it is accessible to member functions;
  this approach is demonstrated in section \ref{sec:realworld}.
\end{itemize}
%
\subsection{Initialize independent variables and start recording}
\begin{lstlisting}
 adouble x[2] = {x_val[0], x_val[1]};
 stack.new_recording();
\end{lstlisting}
The first line here simply copies the input values to the algorithm
into \code{adouble} variables. These are the \emph{independent
  variables}, but note that there is no obligation for these to be
stored as one array (as in CppAD), and for forward- and reverse-mode
automatic differentiation you do not need to tell \Adept\ explicitly
via a function call which variables are the independent ones. The next
line clears all differential statements from the stack so that it is
ready for a new recording of differential information.
%
Note that the first line here actually stores two differential
statements, $\delta$\code{x[0]=0} and $\delta$\code{x[1]=0}, which are
immediately cleared by the \code{new\_recording} function call.  To
avoid the small overhead of storing redundant information on the
stack, we could replace the first line with 
\begin{lstlisting}
 x[0].set_value(x_val[0]);
 x[1].set_value(x_val[1]);
\end{lstlisting}
or
\begin{lstlisting}
 adept::set_values(x, 2, x_val);
\end{lstlisting}
which have the effect of setting the values of \code{x} without storing
the equivalent differential statements.

Previous users of \Adept\ version 0.9 should note that since version
1.0, the \code{new\_recording} function replaces the \code{start}
function call, which had to be put \emph{before} the independent
variables were initialized.  The problem with this was that the
independent variables had to be initialized with the \code{set\_value}
or \code{set\_values} functions, otherwise the gradients coming out of
the automatic differentiation would all be zero.  Since it was easy to
forget this, \code{new\_recording} was introduced to allow the
independent variables to be assigned in the normal way using the
assignment operator (\code{=}).  But don't just replace \code{start}
in your version-0.9-compatible code with \code{new\_recording}; the
latter must appear \emph{after} the independent variables have been
initialized.

\subsection{Perform calculations to be differentiated}
\begin{lstlisting}
 adouble y = algorithm(x);
\end{lstlisting}
The algorithm is called, and behind the scenes the equivalent
differential statement for every mathematical statement is stored in the
stack. The result of the forward calculation is stored in \code{y},
known as a dependent variable. This example has one dependent
variable, but any number is allowed, and they could be returned in
another way, e.g. by passing a non-constant array to algorithm that is
filled with the final values when the function returns.
%
\subsection{Perform reverse-mode differentiation}

\begin{lstlisting}
 y.set_gradient(1.0);
 stack.compute_adjoint();
\end{lstlisting}
The first line sets the initial gradient (or adjoint) of \code{y}. In
this example, we want the output gradients to be the derivatives of
\code{y} with respect to each of the independent variables; to achieve
this, the initial gradient of \code{y} must be unity.

More generally, if \code{y} was only an intermediate value in the
computation of cost function $J$, then for the outputs of the
function to be the derivatives of $J$ with respect to each of the
independent variables, we would need to set the gradient of
\code{y} to $\partial J/\partial$\code{y}. In the case of multiple
intermediate values, a separate call to \code{set\_gradient} is needed
for each intermediate value.  If \code{y} was an array of length
\code{n} then the gradient of each element could be set to the values in a \code{double} array \code{y\_ad} using
\begin{lstlisting}
 adept::set_gradients(y, n, y_ad);
\end{lstlisting}

The \code{compute\_adjoint()} member function of stack performs the
adjoint calculation, sweeping in reverse through the differential
statements stored on the stack. Note that this must be preceded by at
least one \code{set\_gradient} or \code{set\_gradients} call, since
the first such call initializes the list of gradients for
\code{compute\_adjoint()} to act on. Otherwise,
\code{compute\_adjoint()} will throw a
\code{gradients\_not\_initialized} exception. 

\subsection{Extract the final gradients}

\begin{lstlisting}
 dy_dx[0] = x[0].get_gradient();
 dy_dx[1] = x[1].get_gradient();
\end{lstlisting}
These lines simply extract the gradients of the cost function
with respect to the two independent variables. Alternatively we could
have extracted them simultaneously using
\begin{lstlisting}
 adept::get_gradients(x, 2, dy_dx);
\end{lstlisting}

To do forward-mode differentiation in this example would involve
setting the initial gradients of \code{x} instead of \code{y}, calling
the member function \code{compute\_tangent\_linear()} instead of
\code{compute\_adjoint()}, and extracting the final gradients from
\code{y} instead of \code{x}.

\section{Computing Jacobian matrices}
\label{sec:jacobian}
Until now we have considered a function with two inputs and one
output.  Consider the following more general function whose declaration
is
\begin{lstlisting}
 void algorithm2(int n, const adouble* x, int m, adouble* y);
\end{lstlisting}
where \code{x} points to the \code{n} independent (input) variables
and \code{y} points to the \code{m} dependent (output) variables. The
following function would return the full Jacobian matrix:
%
\begin{lstlisting}
 #include <vector>
 #include <adept.h>
 void algorithm2_jacobian(
                     int n,                 // Number of input values
                     const double* x_val,   // Input values
                     int m,                 // Number of output values
                     double* y_val,         // Output values
                     double* jac) {         // Output Jacobian matrix
   using adept::adouble;                    // Import adouble from adept
   adept::Stack stack;                      // Where the derivative information is stored
   std::vector<adouble> x(n);               // Vector of active input variables
   adept::set_values(&x[0], n, x_val);      // Initialize adouble inputs
   stack.new_recording();                   // Start recording
   std::vector<adouble> y(m);               // Create vector of active output variables
   algorithm2(n, &x[0], m, &y[0]);          // Run algorithm
   stack.independent(&x[0], n);             // Identify independent variables
   stack.dependent(&y[0], m);               // Identify dependent variables
   stack.jacobian(jac);                     // Compute & store Jacobian in jac
   for (int iy = 0; iy < m; ++iy) 
     y_val[iy] = y[iy].value();             // Extract value from active object 
 }
\end{lstlisting}
%
Note that:
\begin{itemize}
\item The \code{independent} member function of stack is used to
  identify the independent variables, i.e.\ the variables that the
  derivatives in the Jacobian matrix will be with respect to. In this
  example there are \code{n} independent variables located together in
  memory and so can be identified all at once. Multiple calls are
  possible to identify further independent variables.  To identify a
  single independent variable, call \code{independent} with just one
  argument, the independent variable (not as a pointer). 
\item The \code{dependent} member function of stack identifies the
  dependent variables, and its usage is identical to
  \code{independent}.
\item The memory provided to store the Jacobian matrix (pointed to by
  \code{jac}) must be a one-dimensional array of size
  \code{m}$\times$\code{n}, where \code{m} is the number of dependent
  variables and \code{n} is the number of independent variables.
\item The resulting matrix is stored in the sense of the index
  representing the dependent variables varying fastest (column-major
  order).
% To get row-major order, call the \code{jacobian} function
%  with a second argument of \code{true} (see section \ref{sec:stack}).
\item Internally, the Jacobian calculation is performed by multiple
  forward or reverse passes, whichever would be faster (dependent on
  the numbers of independent and dependent variables).
\item The use of \code{std::vector<adouble>} rather than \code{new
  adouble[n]} ensures no memory leaks in the case of an exception being
  thrown, since the memory associated with \code{x} and \code{y} will
  be automatically deallocated when they go out of scope.
\end{itemize}%

As described in chapter \ref{chap:arrays}, \Adept\ version 2.0
introduced built-in multi-dimensional arrays of both active
(e.g.\ \code{aVector}) and passive (e.g.\ \code{Vector}) variables. It
therefore seems more natural to express the algorithm above in terms
of these objects, which could be done as follows:

\begin{lstlisting}
 #include <adept_arrays.h>

 // Adept vectors know their own length, so lengths do not need to be
 // passed in as well
 adept::aVector algorithm2(const adept::aVector& x);

 void algorithm2_jacobian(
                const adept::Vector& x_val, // Input values
                adept::Vector& y_val,       // Output values (correctly sized or empty)
                adept::Matrix& jac) {       // Output Jacobian matrix (correctly sized)
   adept::Stack stack;                      // Where the derivative information is stored
   adept::aVector x = x_val;                // Active vector of inputs
   stack.new_recording();                   // Start recording
   adept::aVector y = algorithm2(x);        // Run algorithm and store outputs
   stack.independent(x);                    // Identify independent variables
   stack.dependent(y);                      // Identify dependent variables
   stack.jacobian(jac);                     // Compute & store Jacobian (since Adept 2.0.8)
   // If jac is empty we can automatically resize it using this instead (since Adept 2.0.8):
   //jac = stack.jacobian();
   y_val = value(y);                        // Extract the values from the active array
 }
\end{lstlisting}

\section{Real-world usage: interfacing \Adept\ to a third-party minimization library}
\label{sec:realworld}
Suppose we want to find the vector $\x$ that minimizes a cost
function $J(\x)$ that consists of a large algorithm coded using the
\Adept\ library and encapsulated within a C++ class.  In this section
we illustrate how it may be interfaced to a third-party minimization
algorithm with a C-style interface, specifically the free one in the
GNU Scientific Library.  Note that since version 2.0.8,
\Adept\ provides its own minimization functionality, as described in
chapter \ref{chap:optimize}.

The full working version of this example, using the N-dimensional
Rosenbrock banana function as the function to be minimized, is in
\code{test/test\_gsl\_interface.cpp} of the \Adept\ software package
(see the description of Test 4 in \code{test/README}). The interface
to the algorithm is as follows:
%
\begin{lstlisting}
 #include <vector>
 #include <adept.h>
 using adept::adouble;
 class State {
  public:
    // Construct a state with n state variables
    State(int n) { active_x_.resize(n); x_.resize(n); }
    // Minimize the function, returning true if minimization successful, false otherwise
    bool minimize();
    // Get copy of state variables after minimization
    void x(std::vector<double>& x_out) const;
    // For input state variables x, compute the function J(x) and return it
    double calc_function_value(const double* x);
    // For input state variables x, compute function and put its gradient in dJ_dx
    double calc_function_value_and_gradient(const double* x, double* dJ_dx);
    // Return the size of the state vector
    unsigned int nx() const { return active_x_.size(); }
  private:
    // Active version: the algorithm is contained in the definition of this function
    adouble calc_function_value(const adouble* x);
    // DATA
    adept::Stack stack_;             // Adept stack object (must be before active state
                                     // variables, e.g. adouble, in class definition)
    std::vector<adouble> active_x_;  // Active state variables (must be after Stack)
 };
\end{lstlisting}
%
The algorithm itself is contained in the definition of
\code{calc\_function\_value(const adouble*)}, which is implemented using
\code{adouble} variables (following the rules in section
\ref{sec:preparation}). However, the public interface to the class
uses only standard \code{double} types, so the use of \Adept\ is
hidden to users of the class.  Of course, a complicated algorithm may
be implemented in terms of multiple classes that do exchange data via
\code{adouble} objects. We will be using a quasi-Newton minimization
algorithm that calls the algorithm many times with trial vectors $\x$,
and for each call may request not only the value of the function, but
also its gradient with respect to $\x$. Thus the public interface
provides \code{calc\_function\_value(const double*)} and
\code{calc\_function\_value\_and\_gradient}, which could be implemented as
follows:
%
\begin{lstlisting}
 double State::calc_function_value(const double* x) {
   for (unsigned int i = 0; i < nx(); ++i) active_x_[i] = x[i];
   stack_.new_recording();
   return value(calc_function_value(&active_x_[0]));
 }

 double State::calc_function_value_and_gradient(const double* x, double* dJ_dx) {
   for (unsigned int i = 0; i < nx(); ++i) active_x_[i] = x[i];
   stack_.new_recording();
   adouble J = calc_function_value(&active_x_[0]);
   J.set_gradient(1.0);
   stack_.compute_adjoint();
   adept::get_gradients(&active_x_[0], nx(), dJ_dx);
   return value(J);
 }
\end{lstlisting}
%
The first function simply copies the \code{double} inputs into an
\code{adouble} vector and runs the version of
\code{calc\_function\_value} for \code{adouble} arguments. Obviously
there is an inefficiency here in that gradients are recorded that are
then not used, and this function would be typically 2.5--3 times
slower than an implementation of the algorithm that did not store
gradients.  Section \ref{sec:withwithout} describes three ways to
overcome this problem.  The second function above implements
reverse-mode automatic differentiation as described in section
\ref{sec:adjoint}.

The \code{minimize} member function could be implemented using GSL as
follows:
%
\begin{lstlisting}
 #include <iostream>
 #include <gsl/gsl_multimin.h>

 bool State::minimize() {
   // Minimizer settings
   const double initial_step_size = 0.01;
   const double line_search_tolerance = 1.0e-4;
   const double converged_gradient_norm = 1.0e-3;
   // Use the "limited-memory BFGS" quasi-Newton minimizer
   const gsl_multimin_fdfminimizer_type* minimizer_type
     = gsl_multimin_fdfminimizer_vector_bfgs2;

   // Declare and populate structure containing function pointers
   gsl_multimin_function_fdf my_function;
   my_function.n = nx();
   my_function.f = my_function_value;
   my_function.df = my_function_gradient;
   my_function.fdf = my_function_value_and_gradient;
   my_function.params = reinterpret_cast<void*>(this);
   
   // Set initial state variables using GSL's vector type
   gsl_vector *x;
   x = gsl_vector_alloc(nx());
   for (unsigned int i = 0; i < nx(); ++i) gsl_vector_set(x, i, 1.0);

   // Configure the minimizer
   gsl_multimin_fdfminimizer* minimizer
     = gsl_multimin_fdfminimizer_alloc(minimizer_type, nx());
   gsl_multimin_fdfminimizer_set(minimizer, &my_function, x,
                                 initial_step_size, line_search_tolerance);
   // Begin loop
   size_t iter = 0;
   int status;
   do {
     ++iter;
     // Perform one iteration
     status = gsl_multimin_fdfminimizer_iterate(minimizer);

     // Quit loop if iteration failed
     if (status != GSL_SUCCESS) break;
    
     // Test for convergence
     status = gsl_multimin_test_gradient(minimizer->gradient, converged_gradient_norm);
   }
   while (status == GSL_CONTINUE && iter < 100);

   // Free memory
   gsl_multimin_fdfminimizer_free(minimizer);
   gsl_vector_free(x);

   // Return true if successfully minimized function, false otherwise
   if (status == GSL_SUCCESS) {
     std::cout << "Minimum found after " << iter << " iterations\n";
     return true;
   }
   else {
     std::cout << "Minimizer failed after " << iter << " iterations: "
               << gsl_strerror(status) << "\n";
     return false;
   }
 }
\end{lstlisting}
%
The GSL interface requires three functions to be defined, each of
which takes a vector of state variables $\x$ as input:
\code{my\_function\_value}, which returns the value of the function;
\code{my\_function\_gradient}, which returns the gradient of the
function with respect to $\x$; and
\code{my\_function\_value\_and\_gradient}, which returns the value and
the gradient of the function. These functions are provided to GSL as
function pointers (see above), but since GSL is a C library, we need
to use the `\code{extern "C"}' specifier in their definition. Thus the
function definitions would be:
%
\begin{lstlisting}
 extern "C" 
 double my_function_value(const gsl_vector* x, void* params) {
   State* state = reinterpret_cast<State*>(params);
   return state->calc_function_value(x->data);
 }

 extern "C"
 void my_function_gradient(const gsl_vector* x, void* params, gsl_vector* gradJ) { 
   State* state = reinterpret_cast<State*>(params);
   state->calc_function_value_and_gradient(x->data, gradJ->data);
 }

 extern "C"
 void my_function_value_and_gradient(const gsl_vector* x, void* params,
                                     double* J, gsl_vector* gradJ) { 
   State* state = reinterpret_cast<State*>(params);
   *J = state->calc_function_value_and_gradient(x->data, gradJ->data);
 }
\end{lstlisting}
%
When the \code{gsl\_multimin\_fdfminimizer\_iterate} function is
called, it chooses a search direction and performs several calls of
these functions to approximately minimize the function along this
search direction. The \code{this} pointer (i.e.\ the pointer to the
\code{State} object), which was provided to the \code{my\_function}
structure in the definition of the \code{minimize} function above, is
provided as the second argument to each of the three functions
above. Unlike in C, in C++ this pointer needs to be cast back to a
pointer to a \code{State} type, hence the use of
\code{reinterpret\_cast}.

That's it! A call to \code{minimize} should successfully minimize well
behaved differentiable multi-dimensional functions.  It should be
straightforward to adapt the above to work with other minimization
libraries.

\section{Calling an algorithm with and without automatic differentiation from the same program}
\label{sec:withwithout}
The \code{calc\_function\_value(const double*)} member function
defined in section \ref{sec:realworld} is sub-optimal in that it
simply calls the \code{calc\_function\_value(const adouble*)} member
function, which not only computes the value of the function, it also
records the derivative information of all the operations involved.
This information is then ignored. This overhead makes the function
typically 2.5--3 times slower than it needs to be, although sometimes
(specifically for loops containing no transcendental functions) the
difference between an algorithm coded in terms of \code{double}s and
the same algorithm coded in terms of \code{adouble}s can exceed a
factor of 10 \citep{Hogan2014}.  The impact on the computational speed
of the entire minimization process depends on how many requests are
made for the function value only as opposed to the gradient of the
function, and can be significant.  We require a way to avoid the
overhead of \Adept\ computing the derivative information for calls to
\code{calc\_function\_value(const double*)}, without having to
maintain two versions of the algorithm, one coded in terms of
\code{double}s and the other in terms of \code{adouble}s. The three
ways to achieve this are now described.
%
\subsection{Function templates}
\label{sec:func_templates}
The simplest approach is to use a function template for those
functions that take active arguments, as demonstrated in the following
example:
%
\begin{lstlisting}
 #include <adept.h>
 class State {
  public:
    ...
    template <typename xdouble>
    xdouble calc_function_value(const xdouble* x);
    ...
 };

 // Example function definition that must be in a header file included
 // by any source file that calls calc_function_value
 template <typename xdouble>
 inline
 xdouble State::calc_function_value(const xdouble* x) {
   xdouble y = 4.0;
   xdouble s = 2.0*x[0] + 3.0*x[1]*x[1];
   y *= sin(s);
   return y;
 }
\end{lstlisting}
%
This takes the example from section \ref{sec:preparation} and replaces
\code{adouble} by the template type \code{xdouble}. Thus,
\code{calc\_function\_value} can be called with either \code{double}
or \code{adouble} arguments, and the compiler will compile inline the
inactive or active version accordingly.  Note that the function
template need not be a member function of a class.  

This technique is good if only a small amount of code needs to be
differentiated, but for large models the use of inlining is likely to
lead to duplication of compiled code leading to large executables and
long compile times.  The following two approaches do not have this
drawback and are suitable for large codes.

\subsection{Pausable recording}
\label{sec:pausable}
The second method involves compiling the entire code with the
\code{ADEPT\_RECORDING\_PAUSABLE} preprocessor variable defined, which
can be done by adding an argument \code{-DADEPT\_RECORDING\_PAUSABLE}
to the compiler command line. This modifies the behaviour of
mathematical operations performed on \code{adouble} variables: instead
of performing the operation and then storing the derivative
information, it performs the operation and then only stores the
derivative information if the \Adept\ stack is not in the ``paused''
state. We then use the following member function definition instead of
the one in section \ref{sec:realworld}:
%
\begin{lstlisting}
 double State::calc_function_value(const double* x) {
   stack_.pause_recording();
   for (unsigned int i = 0; i < nx(); ++i) active_x_[i] = x[i];
   double J = value(calc_function_value(&active_x_[0]));
   stack_.continue_recording();
   return J;
 }
\end{lstlisting}
%
By pausing the recording for all operations on \code{adouble} objects,
most of the overhead of storing derivative information is removed. The
extra run-time check to see whether the stack is in the paused state,
which is carried out by mathematical operations involving
\code{adouble} objects, generally adds a small overhead.  However, in
algorithms where most of the number crunching occurs in loops
containing no transcendental functions, even if the stack is in the
paused state, the presence of the check can prevent the compiler from
aggressively optimizing the loop.  In that instance the third method
may be preferable.
%
\subsection{Multiple object files per source file}
\label{sec:multipleobjects}
The third method involves compiling each source file containing
functions with \code{adouble} arguments twice.  The first time, the
code is compiled normally to produce an object file containing
compiled functions including automatic differentiation. The second
time, the code is compiled with the
\code{-DADEPT\_NO\_AUTOMATIC\_DIFFERENTIATION} flag on the compiler
command line. This instructs the \code{adept.h} header file to turn
off automatic differentiation by defining the \code{adouble} type to
be an alias of the \code{double} type. This way, a second set of
object files is created containing overloaded versions of the same
functions as the first set but this time without automatic
differentiation. These object files can be compiled together to form
one executable.  In the example presented in section
\ref{sec:realworld}, the \code{calc\_function\_value} function would
be one that would be compiled twice in this way, once to provide the
\code{calc\_function\_value(const adouble*)} version and the other to
provide the \code{calc\_function\_value(const double*)} version. Note
that any functions that do not include \code{adouble} arguments must
be compiled only once, because otherwise the linker will complain
about multiple versions of the same function.

The following shows a Makefile from a hypothetical project that
compiles two source files (\code{algorithm1.cpp} and
\code{algorithm2.cpp}) twice and a third (\code{main.cpp}) once:
%
\begin{lstlisting}[language=make]
 # Specify compiler and flags
 CXX = g++
 CXXFLAGS = -Wall -O3 -g
 # Normal object files to be created
 OBJECTS = algorithm1.o algorithm2.o main.o
 # Object files created with no automatic differentiation
 NO_AD_OBJECTS = algorithm1_noad.o algorithm2_noad.o
 # Program name
 PROGRAM = my_program
 # Include-file location
 INCLUDES = -I/usr/local/include
 # Library location and name, plus the math library
 LIBS = -L/usr/local/lib -lm -ladept

 # Rule to build the program (typing "make" will use this rule)
 $(PROGRAM): $(OBJECTS) $(NO_AD_OBJECTS)
         $(CXX) $(CXXFLAGS) $(OBJECTS) $(NO_AD_OBJECTS) $(LIBS) -o $(PROGRAM)
 # Rule to build a normal object file (used to compile all objects in OBJECTS)
 %.o: %.cpp
         $(CXX) $(CXXFLAGS) $(INCLUDES) -c $<
 # Rule to build a no-automatic-differentiation object (used to compile ones in NO_AD_OBJECTS)
 %_noad.o: %.cpp
         $(CXX) $(CXXFLAGS) $(INCLUDES) -DADEPT_NO_AUTOMATIC_DIFFERENTIATION -c $< -o $@
\end{lstlisting}
%

There is a further modification required with this approach, which
arises because if a header file declares both the \code{double} and
\code{adouble} versions of a function, then when compiled with
\code{-DADEPT\_NO\_AUTOMATIC\_DIFFERENTIATION} it appears to the
compiler that the same function is declared twice, leading to a
compile-time error.  This can be overcome by using the preprocessor to
hide the \code{adouble} version if the code is compiled with this
flag, as follows (using the example from section \ref{sec:realworld}):
%
\begin{lstlisting}
 #include <adept.h>
 class State {
  public:
    ...
    double calc_function_value(const double* x);
  private:
 #ifndef ADEPT_NO_AUTOMATIC_DIFFERENTIATION
    adouble calc_function_value(const adouble* x);
 #endif
    ...
 };
\end{lstlisting}

A final nuance is that if the code contains an \code{adouble} object
\code{x}, then \code{x.value()} will work fine in the compilation when
\code{x} is indeed of type \code{adouble}, but in the compilation when
it is set to a simple \code{double} variable, the \code{value()}
member function will not be found.  Hence it is better to use
\code{adept::value(x)}, which returns a \code{double} regardless of
the type of \code{x}, and works regardless of whether the code was
compiled with or without the
\code{-DADEPT\_NO\_AUTOMATIC\_DIFFERENTIATION} flag.

\section{Interfacing with software containing hand-coded Jacobians}
\label{sec:interfacehandcoded}
Often a complicated algorithm will include multiple components.
Components of the code written in C or C++ for which the source is
available are straightforward to convert to using \Adept, following
the rules in section \ref{sec:preparation}.  For components written in
Fortran, this is not possible, but if such components have their own
hand-coded Jacobian then it is possible to interface \Adept\ to them.
More generally, in certain situations automatic differentiation is
much slower than hand-coding \cite[see the Lax-Wendroff example
  in][]{Hogan2014} and we may wish to hand-code certain critical
parts.  In general the Jacobian matrix is quite expensive to compute,
so this interfacing strategy makes most sense if the component of the
algorithm has a small number of inputs or a small number of outputs. A
full working version of the following example is given as ``Test 3''
in the \code{test} directory of the \Adept\ package (see specifically
\code{test/README} and \code{test/test\_radiances.cpp}).

Consider the example of a radiative transfer model for simulating
satellite microwave radiances at two wavelengths, $I$ and $J$, which
takes as input the surface temperature $T_s$ and the vertical profile
of atmospheric temperature $T$ from a numerical weather forecast
model. Such a model would be used in a data assimilation system to
assimilate the temperature information from the satellite observations
into the weather forecast model. In addition to returning the
radiances, the model returns the gradient $\partial I/\partial T_s$
and the gradients $\partial I/\partial T_i$ for all height layers $i$
between 1 and $n$, and likewise for radiance $J$. The interface to the
radiative transfer model is the following:
%
\begin{lstlisting}
 void simulate_radiances(int n, // Size of temperature array
                         // Input variables:
                         double surface_temperature, 
                         const double* temperature,
                         // Output variables:
                         double radiance[2],
                         // Output Jacobians:
                         double dradiance_dsurface_temperature[2],
                         double* dradiance_dtemperature);
\end{lstlisting}
%
The calling function needs to allocate \code{2*n} elements for the
temperature Jacobian \code{dradiance\_dtemperature} to be stored, and
the stored Jacobian will be oriented such that the radiance index
varies fastest.

\Adept\ needs to be told how to relate the radiance perturbations
$\delta I$ and $\delta J$, to perturbations in the input
variables, $\delta T_s$ and $\delta T_i$ (for all layers
$i$). Mathematically, we wish the following relationship to be stored
within the \Adept\ stack:
%
\begin{equation}
\delta I = \frac{\partial I}{\partial T_s}\delta
T_s+\sum_{i=1}^n\frac{\partial I}{\partial T_i}\delta T_i.\nonumber
\end{equation}
%
This is achieved with the following wrapper function, which has
\code{adouble} inputs and outputs and therefore can be called from
within other parts of the algorithm that are coded in terms of
\code{adouble} objects:
%
\begin{lstlisting}
 void simulate_radiances_wrapper(int n,
                                 const adouble& surface_temperature,
                                 const adouble* temperature,
                                 adouble radiance[2]) {
   // Create inactive (double) versions of the active (adouble) inputs
   double st = value(surface_temperature);
   std::vector<double> t(n);
   for (int i = 0; i < n; ++i) t[i] = value(temperature[i]);

   // Declare variables to hold the inactive outputs and their Jacobians
   double r[2];
   double dr_dst[2];
   std::vector<double> dr_dt(2*n);

   // Call the non-Adept function
   simulate_radiances(n, st, &t[0], &r[0], dr_dst, &dr_dt[0]);

   // Copy the results into the active variables, but use set_value in order
   // not to write any equivalent differential statement to the Adept stack
   radiance[0].set_value(r[0]);
   radiance[1].set_value(r[1]);

   // Loop over the two radiances and add the differential statements to the Adept stack
   for (int i = 0; i < 2; ++i) {
     // Add the first term on the right-hand-side of Equation 1 in the text
     radiance[i].add_derivative_dependence(surface_temperature, dr_dst[i]);
     // Now append the second term on the right-hand-side of Equation 1. The third argument
     // "n" of the following function says that there are n terms to be summed, and the fourth 
     // argument "2" says to take only every second element of the Jacobian dr_dt, since the 
     // derivatives with respect to the two radiances have been interlaced.  If the fourth 
     // argument is omitted then relevant Jacobian elements will be assumed to be contiguous
     // in memory.
     radiance[i].append_derivative_dependence(temperature, &dr_dt[i], n, 2);
   }
 }
\end{lstlisting}
%
In this example, the form of \code{add\_derivative\_dependence} for
one variable on the right-hand-side of the derivative expression has
been used, and the form of \code{append\_derivative\_dependence} for
an array of variables on the right-hand-side has been used. As
described in section \ref{sec:adouble}, both functions have forms that
take single variables and arrays as arguments. Note also that the use
of \code{std::vector<double>} rather than \code{new double[n]} ensures
that if \code{simulate\_radiances} throws an exception, the memory
allocated to hold \code{dr\_dt} will be freed correctly.

\section{Member functions of the \codestyle{Stack} class}
\label{sec:stack}
This section describes the user-oriented member functions of the
\code{Stack} class. Some functions have arguments with default values;
if these arguments are omitted then the default values will be used.
Some of these functions throw \Adept\ exceptions, defined in section
\ref{sec:exceptions}.

\begin{description}
\citem{Stack(bool activate\_immediately = true)} The constructor for the
\codebf{Stack} class.  Normally \codebf{Stack} objects are constructed
with no arguments, which means that the object will attempt to make
itself the currently active stack by placing a pointer to itself into
a global variable.  If another \codebf{Stack} object is currently
active, then the present one will be fully constructed, left in the
unactivated state, and a \code{stack\_already\_active} exception
will be thrown.  If a \codebf{Stack} object is constructed with an
argument ``\codebf{false}'', it will be started in an unactivated
state, and a subsequent call to its member function \codebf{activate}
will be needed to use it.
%
\citem{void new\_recording()} Clears all the information on the stack
in order that a new recording can be started. Specifically this
function clears all the differential statements, the list of
independent and dependent variables (used in computing Jacobian
matrices) and the list of gradients used by the
\codebf{compute\_tangent\_linear} and \codebf{compute\_adjoint} functions.
Note that this function leaves the memory allocated to reduce the
overhead of reallocation in the new recordings.
%
\citem{bool pause\_recording()} Stops recording differential
  information every time an \code{adouble} statement is
  executed. This is useful if within a single program an algorithm
  needs to be run both with and without automatic
  differentiation. This option is only effective within compilation
  units compiled with \code{ADEPT\_RECORDING\_PAUSABLE} defined; if it is,
  the function returns \code{true}, otherwise it returns
  \code{false}. Further information on using this and the following
  function is provided in section \ref{sec:pausable}.
%
\citem{bool continue\_recording()} Instruct a stack that may have
previously been put in a paused state to now continue recording
differential information as normal.  This option is only effective within
compilation units compiled with \code{ADEPT\_RECORDING\_PAUSABLE}
defined; if it is, the function returns \code{true}, otherwise it
returns \code{false}.
%
\citem{bool is\_recording()} Returns \code{false} if recording has
  been paused with \code{pause\_recording()} and the code has been
  compiled with \code{ADEPT\_RECORDING\_PAUSABLE} defined.
  Otherwise returns \code{true}.
%
\citem{void compute\_tangent\_linear()} Perform a tangent-linear
calculation (forward-mode differentiation) using the stored
differential statements.  Before calling this function you need to call
the \code{adouble::set\_gradient} or \code{set\_gradients} function (see
section \ref{sec:adouble}) on the independent variables to set the
initial gradients, otherwise the function will throw a
\code{gradients\_not\_initialized} exception. This function is
synonymous with \codebf{forward()}.
%
\citem{void compute\_adjoint()} Perform an adjoint calculation
(reverse-mode differentiation) using the stored differential
statements.  Before calling this function you need to call the
\code{adouble::set\_gradient} or \code{set\_gradients} function on the
dependent variables to set the initial gradients, otherwise the
function will throw a \code{gradients\_not\_initialized}
exception. This function is synonymous with \codebf{reverse()}.
%
\citem{void independent(const adouble\&\ x)} Before computing Jacobian
  matrices, you need to identify the independent and dependent
  variables, which correspond to the columns and rows of the Jacobian,
  respectively. This function adds \codebf{x} to the list of
  independent variables. If it is the $n$th variable identified in
  this way, the $n$th column of the Jacobian will correspond to
  derivatives with respect to \codebf{x}.
\citem{void dependent(const adouble\&\ y)} Add \codebf{y} to the
  list of dependent variables.  If it is the $m$th variable identified
  in this way, the $m$th row of the Jacobian will correspond to
  derivatives of \codebf{y} with respect to each of the independent
  variables.
\citem{void independent(const adouble* x\_ptr, \Offset\ n)} Add
  \codebf{n} independent variables to the list, which must be
  stored consecutively in memory starting at the memory pointed to by
  \codebf{x\_ptr}.
\citem{void dependent(const adouble* y\_ptr, \Offset\ n)} Add
\codebf{n} dependent variables to the list, which must be stored
consecutively in memory starting at the memory pointed to by
\codebf{y\_ptr}.
%
\citem{void jacobian(double* jacobian\_out)} Compute the Jacobian matrix, i.e., the gradient of the $m$
dependent variables (identified with the \codebf{dependent(...)}
function) with respect to the $n$ independent variables (identified
with \codebf{independent(...)}). The result is returned in the memory
pointed to by \codebf{jacobian\_out}, which must have been allocated
to hold $m\times n$ values. The result is stored in
column-major order, i.e., the $m$ dimension of the matrix varies
fastest. If no dependents or independents have been identified,
then the function will throw a
\code{dependents\_or\_independents\_not\_identified} exception. In
practice, this function calls \codebf{jacobian\_forward} if $n\le
m$ and \codebf{jacobian\_reverse} if $n>m$.
%
\citem{void jacobian(Matrix jac)} Compute Jacobian matrix and store in
a correctly sized \Adept\ \code{Matrix} object \codebf{jac}, which may
be a subset of a larger matrix. See chapter \ref{chap:arrays} for a
full description of \Adept\ array objects.
%
\citem{Matrix jacobian()} As above but the Jacobian matrix is returned
from the function.
%
\citem{void jacobian\_forward(double* jacobian\_out)} Compute the
Jacobian matrix by executing $n$ forward passes through the stored
list of differential statements; this is typically faster than
\codebf{jacobian\_reverse} for $n\le m$.
%
\citem{void jacobian\_forward(Matrix jac)} As above but store in a
correctly sized \Adept\ \code{Matrix} object \codebf{jac}.
%
\citem{Matrix jacobian\_forward()} As above but the Jacobian matrix is
returned from the function.
%
\citem{void jacobian\_reverse(double* jacobian\_out)} Compute the
Jacobian matrix by executing $m$ reverse passes through the stored
list of differential statements; this is typically faster than
\codebf{jacobian\_forward} for $n>m$.
%
\citem{void jacobian\_reverse(Matrix jac)} As above but store in a
correctly sized \Adept\ \code{Matrix} object \codebf{jac}.
%
\citem{Matrix jacobian\_reverse()} As above but the Jacobian matrix is
returned from the function.
%
\citem{void clear\_gradients()} Clear the gradients set with the
\code{set\_gradient} member function of the \code{adouble} class. This
enables multiple adjoint and/or tangent-linear calculations to be
performed with the same recording.
%
\citem{void clear\_independents()} Clear the list of independent
variables, enabling a new Jacobian matrix to be computed from the same
recording but for a different set of independent variables.
%
\citem{void clear\_dependents()} Clear the list of dependent
variables, enabling a new Jacobian matrix to be computed from the same
recording but for a different set of dependent variables.
%
\citem{\Offset\ n\_independents()} Return the number of independent
variables that have been identified.
%
\citem{\Offset\ n\_dependents()} Return the number of dependent
variables that have been identified.
%
\citem{\Offset\ n\_statements()} Return the number of differential
statements in the recording.
%
\citem{\Offset\ n\_operations()} Return the total number of operations
in the recording, i.e.\ the total number of terms on the right-hand-side
of all the differential statements.
%
\citem{\Offset\ max\_gradients()} Return the number of working gradients
that need to be stored in order to perform a forward or reverse pass.
%
\citem{size\_t memory()} Return the number of bytes currently
used to store the differential statements and the working
gradients. Note that this does not include memory allocated but not
currently used.
%
\citem{\Offset\ n\_gradients\_registered()} Each time an
\code{adouble} object is created, it is allocated a unique index that
is used to identify its gradient in the recorded differential
statements. When the object is destructed, its index is freed for
reuse. This function returns the number of gradients currently
registered, equal to the number of \code{adouble} objects currently
created.
%
\citem{void print\_status(std::ostream\&\ os = std::cout)} Print the
current status of the \codebf{Stack} object, such as number of
statements and operations stored and allocated, to the stream
specified by \codebf{os}, or standard output if this function is
called with no arguments.  Sending the \codebf{Stack} object to the
stream using the ``\code{<<}'' operator results in the same behaviour.
%
\citem{void print\_statements(std::ostream\&\ os = std::cout)} Print
the list of differential statements to the specified stream (or
standard output if not specified). Each line corresponds to a separate
statement, for example ``\code{d[3] = 1.2*d[1] + 3.4*d[2]}''.
%
\citem{bool print\_gradients(std::ostream\&\ os = std::cout)} Print
the vector of gradients to the specified stream (or standard output if
not specified). This function returns
\code{false} if no \code{set\_gradient}
function has been called to set the first gradient and initialize the
vector, and \code{true} otherwise. To diagnose what
\codebf{compute\_tangent\_linear} and 
\codebf{compute\_adjoint} are doing, it can be useful to call
\codebf{print\_gradients} immediately before and after.
%
\citem{void activate()} Activate the \codebf{Stack} object by copying
its \code{this} pointer to a global variable that will be accessed by
subsequent operations involving \code{adouble} objects.  If another
\codebf{Stack} is already active, a \code{stack\_already\_active}
exception will be thrown. To check whether this is the case before
calling \codebf{activate()}, check that the \code{active\_stack()}
function (described below) returns \code{0}.
%
\citem{void deactivate()} Deactivate the \codebf{Stack} object by
checking whether the global variable holding the pointer to the
currently active \codebf{Stack} is equal to \code{this}, and if it is,
setting it to \code{0}.
%
\citem{bool is\_active()} Returns \code{true} if the \codebf{Stack}
object is the currently active one, \code{false} otherwise.
%
\citem{void start()} This function was present in version 0.9 to
activate a \codebf{Stack} object, since in that version they were not
constructed in an activated state.  This function has now been
deprecated and will always throw a \code{feature\_not\_available}
exception.
\citem{int max\_jacobian\_threads()} Return the maximum number of
OpenMP threads available for Jacobian calculations.  The number will
be 1 if either the library was or the current source code is compiled
without OpenMP support (i.e.\ without the \code{-fopenmp} compiler and
linker flag). (Introduced in \Adept\ version 1.1.) 
\citem{int set\_max\_jacobian\_threads(int n)} Set the maximum number of
threads to be used in Jacobian calculations to \code{n}, if
possible. A value of 1 indicates that OpenMP will not be used, while a
value of 0 indicates that the maximum available will be used. Returns
the maximum that will be used, which may be fewer than requested,
e.g.\ 1 if the \Adept\ library was compiled without OpenMP
support. (Introduced in \Adept\ version 1.1.) 
\citem{void preallocate\_statements(int n)} If you know in advance
roughly how many differential statements will be stored by an
algorithm then you may be able to speed-up the first use of the stack
by preallocating the memory needed to store them.  More memory will
still be allocated if needed, but this should reduce the number of
allocations and copies.
\citem{void preallocate\_operations(int n)} Likewise, if you know in
advance roughly how many operations will be stored then you can
speed-up the first use of the stack with this member function.
\end{description}

\noindent The following non-member functions are provided in the
\code{adept} namespace:
\begin{description}
\citem{adept::Stack* active\_stack()} Returns a pointer to the
currently active \codebf{Stack} object, or \code{0} if there is none.
\citem{bool is\_thread\_unsafe()} Returns \code{true} if your code has
been compiled with \code{ADEPT\_STACK\_THREAD\_UNSAFE}, \code{false}
otherwise.
%
\end{description}


\section{Member functions of the \codestyle{adouble} object}
\label{sec:adouble}
This section describes the user-oriented member functions of the
\code{adouble} class. Some functions have arguments with default
values; if these arguments are omitted then the default values will be
used. Some of these functions throw \Adept\ exceptions, defined in
section \ref{sec:exceptions}.
\begin{description}
\citem{double value()} Return the underlying \code{double} value.
%
\citem{void set\_value(double x)} Set the value of the \codebf{adouble}
object to \codebf{x}, without storing the equivalent differential
statement in the currently active stack.
%
\citem{void set\_gradient(const double\&\ gradient)} Set the
gradient corresponding to this \codebf{adouble} variable. The first call
of this function (for any \codebf{adouble} variable) after a new
recording is made also initializes the vector of working gradients.
This function should be called for one or more \codebf{adouble} objects
after a recording has been made but before a call to
\code{Stack::compute\_tangent\_linear()} or
\code{Stack::compute\_adjoint()}.
%
\citem{void get\_gradient(double\&\ gradient)} Set \codebf{gradient}
to the value of the gradient corresponding to this \codebf{adouble}
object. This function is used to extract the result after a call to
\code{Stack::compute\_tangent\_linear()} or
\code{Stack::compute\_adjoint()}. If the \codebf{set\_gradient} function
was not called since the last recording was made, this function will
throw a \code{gradients\_not\_initialized} exception.  The function
can also throw a \code{gradient\_out\_of\_range} exception if new
\codebf{adouble} objects were created since the first
\codebf{set\_gradient} function was called.
%
\citem{void add\_derivative\_dependence(const adouble\&\ r, const
  double\&\ g)} Add a differential statement to the currently active
stack of the form $\delta \codebf{l}=\codebf{g}\times\delta
\codebf{r}$, where \codebf{l} is the \codebf{adouble} object from which
this function is called.  This function is needed to interface to
software containing hand-coded Jacobians, as described in section
\ref{sec:interfacehandcoded}; in this case \codebf{g} is the gradient
$\partial\codebf{l}/\partial\codebf{r}$ obtained from such software.
%
\citem{void append\_derivative\_dependence(const adouble\&\ r, const
  double\&\ g)} Assuming that the same \codebf{adouble} object has just
had its \codebf{add\_derivative\_dependence} member function called,
this function appends ${}+\codebf{g}\times\delta\codebf{r}$ to the
most recent differential statement on the stack.  If the calling
\codebf{adouble} object is different, then a \code{wrong\_gradient}
exception will be thrown. Note that multiple
\codebf{append\_derivative\_dependence} calls can be made in succession.
%
\item[\begin{minipage}{\textwidth}\codesize\texttt{void 
add\_derivative\_dependence(const adouble* r, const double* g,}\\ 
\mbox{ }\texttt{\hspace{18em}\Offset\ n = 1, \Offset\
      g\_stride = 1)}\end{minipage}]
%
Add a differential statement to the currently active stack of the form
$\delta\codebf{l}=\sum_{i=0}^{\codebf{n}-1}\codebf{g[}i\codebf{]}
\times\delta\codebf{r[}i\codebf{]}$, where \codebf{l} is the \codebf{adouble}
object from which this function is called. If the \codebf{g\_stride}
argument is provided, then the index to the \codebf{g} array will be
$i\times\codebf{g\_stride}$ rather than $i$.  This is useful if the
Jacobian provided is oriented such that the relevant gradients for
\codebf{l} are not spaced consecutively.
%
\item[\begin{minipage}{\textwidth}\codesize\texttt{void 
append\_derivative\_dependence(const adouble* r, const double* g,}\\ 
\mbox{ }\texttt{\hspace{20em}\Offset\ n = 1, \Offset\
      g\_stride = 1)}\end{minipage}]
%
Assuming that the same \codebf{adouble} object has just called the
\codebf{add\_derivative\_dependence} function, this function appends
${}+\sum_{i=0}^{\codebf{n}-1}\codebf{g[}i\codebf{]}
\times\delta\codebf{r[}i\codebf{]}$ to the most recent differential
statement on the stack. If the calling \codebf{adouble} object is
different, then a \code{wrong\_gradient} exception will be
thrown. The \codebf{g\_stride} argument behaves the same way as in the
previous function described.
\end{description}

\noindent The following non-member functions are provided in the
\code{adept} namespace:
\begin{description}
\citem{double value(const adouble\& x)} Returns the underlying
value of \codebf{x} as a \codebf{double}. This is useful to enable
\codebf{x} to be used in \code{fprintf} function calls. It is
generally better to use \codebf{adept::value(x)} rather than
\codebf{x.value()}, because the former also works if you compile the
code with the \code{ADEPT\_NO\_AUTOMATIC\_DIFFERENTIATION} flag set,
as discussed in section \ref{sec:multipleobjects}.
%
\citem{void set\_values(adouble* x, \Offset\ n, const double* x\_val)}
Set the value of the \codebf{n} \codebf{adouble} objects starting at
\codebf{x} to the values in \codebf{x\_val}, without storing the
equivalent differential statement in the currently active stack.
%
\citem{void set\_gradients(adouble* x, size\_t n, const double*
  gradients)} Set the gradients corresponding to the \codebf{n}
\codebf{adouble} objects starting at \codebf{x} to the \codebf{n}
\code{double}s starting at \codebf{gradients}.  This has the same
effect as calling the \codebf{set\_gradient} member function of each
\codebf{adouble} object in turn, but is more concise.
%
\citem{void get\_gradients(const adouble* y, size\_t n, double*
  gradients)} Copy the gradient of the \codebf{n} \codebf{adouble}
objects starting at \codebf{y} into the \codebf{n} \code{double}s
starting at \codebf{gradients}. This has the same effect as calling
the \codebf{get\_gradient} member function of each \codebf{adouble} object
in turn, but is more concise.  This function can throw a
\code{gradient\_out\_of\_range} exception if new \codebf{adouble}
objects were created since the first \codebf{set\_gradients} function
or \codebf{set\_gradient} member function was called.
\end{description}

\chapter{Using \Adept's array functionality}
\label{chap:arrays}

\section{Introduction}
\label{sec:array_functionality}
The design of \Adept's array capability and many of the functions is
inspired to a significant extent by the built-in array support in
Fortran 90 (and later), and to a lesser extent by Matlab, although
implemented in the ``C++ way'', e.g.\ default row-major order with all
array indices starting from zero.  Future additions to the array
capability in \Adept\ will attempt to reproduce built-in Fortran array
functions if available\footnote{This decision may puzzle some readers,
  since Fortran is a dirty word to many C++ users due to the
  limitations of the FORTRAN 77 language. Many of these limitations
  were overcome in Fortran 90, whose array functionality in particular
  is rather well designed. Indeed, the pioneering ``Blitz++'' C++
  array library \cite[]{Veldhuizen1995} also reproduces many Fortran
  array functions. All references to Fortran in this document imply
  the 1990 (or later) standard.}. This design makes \Adept\ a good
choice if you have Fortran code that you wish to convert to C++.
\Adept\ provides the following array functionality:
%
\begin{description}
\item[Multi-dimensional arrays.]  Standard dynamically sized arrays
  can have an arbitrary number of dimensions (although indexing and
  slicing is supported only up to 7), and may refer to non-contiguous
  areas of memory. See section \ref{sec:array}.
\item[Mathematical operators and functions.] \Adept\ supports array
  expressions containing the standard mathematical operators \code{+},
  \code{-}, \code{*} and \code{/}, as well as their assignment
  versions \code{+=}, \code{-=}, \code{*=} and \code{/=}. When applied
  to arrays, they work ``element-wise'', applying the same operation
  to every element of the arrays. \Adept\ also supports array
  operations on all the mathematical functions listed in section
  \ref{sec:ad_functionality}. The following operators and functions
  return boolean array expressions: \code{==}, \code{!=}, \code{>},
  \code{<}, \code{>=} and \code{<=}, \code{isfinite}, \code{isinf} and
  \code{isnan}.  See section \ref{sec:operators}.
\item[Array slicing.] There are many ways to produce an array that
  references a subset of another array, and therefore can be used as
  an lvalue in a statement. Arrays can be indexed with scalar
  integers, a contiguous range of integers, a strided range of
  integers or an arbitrary list of integers.  This is facilitated with
  ``\code{\_\_}'' (a double underscore) and ``\code{end}'', such that
  \code{A(\_\_,end-1)} returns a vector pointing to the penultimate
  column of matrix \code{A}. The member function \code{subset}
  produces an array pointing to a contiguous subset of the original
  array, while \code{diag\_vector} and \code{diag\_matrix} produce
  arrays pointing to the diagonal of the original array.  \code{T}
  produces an array pointing to the transpose of the original array.
  See section \ref{sec:slice}.
\item[Passing arrays to and from functions.] \Adept\ uses a
  reference-counting approach to implement the storage of array data,
  enabling multiple array objects to point to the same data, or parts
  of it in the case of array slices. This makes it straightforward to
  pass arrays to and from functions without having to perform a deep
  copy. See section \ref{sec:passing}.
\item[Array reduction operations.] The functions \code{sum},
  \code{mean}, \code{product}, \code{minval}, \code{maxval} and
  \code{norm2} perform reduction operations that return an array of
  lower rank than the expression they are applied to. The functions
  \code{all} and \code{any} do the same but for boolean
  expressions. \code{count} returns the number of \code{true} elements
  in a boolean expression.
% The function
%  \code{find(A)} returns indices to the \code{true} elements of
%  \code{A}. 
  See section \ref{sec:reduce}.
\item[Array expansion operations.] The functions \code{outer\_product}
  and \code{spread} return an expression of a higher rank than the
  expression they are applied to. See section \ref{sec:expand}.
\item[Conditional operations.] Two convenient ways are provided to
  perform an operation on an array depending on the result of a
  boolean expression: \code{where} and \code{find}. The statement
  \code{A.where(B>0)=C} assigns elements of \code{C} to elements of
  \code{A} whenever the corresponding element of \code{B} is greater
  than zero. For vectors only, the same result could be obtained with
  \code{A(find(B>0))=C(find(B>0))}. See section \ref{sec:conditional}.
\item[Fixed-size arrays.] \Adept\ provides a fixed-size array class
  with dimensions (up to seven) that are known at compile time. The
  functionality is very similar to standard dynamic arrays.
\item[Special square matrices.] \Adept\ uses specific classes for
  symmetric, triangular and band-diagonal matrices, the latter of
  which use compressed storage and include diagonal and tridiagonal
  matrices. Certain operations such as matrix multiplication and solving
  linear equations are optimized especially for these objects. See
  section \ref{sec:square}.
\item[Matrix multiplication.] Matrix multiplication can be applied to
  one- and two-dimensional arrays using the \code{matmul} function, or
  for extra syntactic sugar, the ``\code{**}''
  pseudo-operator. \Adept\ uses whatever BLAS (Basic Linear Algebra
  Subroutines) support is available on your system, including
  optimized versions for symmetric and band-diagonal matrices. See
  section \ref{sec:matmul}.
\item[Linear algebra.] \Adept\ uses the LAPACK library to invert
  matrices and solve linear systems of equations. See section
  \ref{sec:la}.
\item[Array bounds and alias checking.] \Adept\ checks at compile time
  that terms in an array expression accord in rank, and at run time
  that they accord in the size of each dimension. Run-time alias
  checking is performed to determine if any objects on the
  right-hand-side of a statement overlap in memory with the
  left-hand-side of the statement, making a temporary copy of the
  right-hand-side if they do. This can be overridden with the
  \code{noalias} function. See section \ref{sec:bounds}.
\item[Interoperability with Fortran arrays.] The Fortran 2018 standard
  enables Fortran's assumed-shape arrays to be passed to and from
  C/C++. Section \ref{sec:fortran} describes how they can be treated
  as \Adept\ arrays within C++.
\end{description}% 
%

\section{The \codestyle{Array} class}
\label{sec:array}
The bread and butter of array operations is provided by the
\code{Array} class template (in the \code{adept} namespace along with
all other public types and classes), which has the following declaration:
\begin{lstlisting}
 namespace adept {
   template <int Rank, typename Type = Real, bool IsActive = false>
   class Array;
 }
\end{lstlisting}
The first template argument provides the number of dimensions of the
array and may be 1 or greater, although indexing and slicing is only
supported up to 7 dimensions. The second argument is the numerical
type being stored and can be any simple integer or real number,
including \code{bool}. The default type is \code{adept::Real}, which
is the default floating-point type the \Adept\ library has been
compiled to use for computing derivatives, and is usually
\code{double}. The final argument states whether the array is
``active'', i.e.\ whether it participates in the differentiation of an
algorithm.

A number of typedefs are provided for the most common types of array:
\code{Vector}, \code{Matrix}, \code{Array3D} and so on up to
\code{Array7D} provide inactive arrays of type \code{Real} and rank
1--7. The corresponding active types are \code{aVector},
\code{aMatrix}, \code{aArray3D} etc. Arrays of other numeric types
have the pattern \code{boolVector}, \code{intVector},
\code{floatVector}, \code{afloatVector}, and similarly for matrices
and higher dimensional arrays. If you wanted shortcuts for
other types you could do the following:
\begin{lstlisting}
 typedef adept::Array<4,unsigned int> uintArray4D;
 typedef adept::Array<2,long double,true> alongdoubleMatrix; // Active
\end{lstlisting}

An \code{Array} with uninitialized elements can be constructed in
numerous ways:
\begin{lstlisting}
 using namespace adept;
 Vector v;                  // Initialize an empty vector
 Array3D A(3,4,5);          // Initialize a 3x4x5 array (up to 7 arguments possible)
 Matrix M(dimensions(3,4)); // The "dimensions" function takes up to 7 arguments
 Matrix N(M.dimensions());  // Make N the same size as M
\end{lstlisting}
In the remaining code examples it will be assumed that
\code{using namespace adept} has already been called.  When new memory
is needed, the \code{Array} object creates a \code{Storage} object
that contains the memory needed, and stores pointers to both the
\code{Storage} object and the start of the data. By default the data
are accessed in C-style row-major order (i.e.\ the final index
corresponds to the array dimension that varies most rapidly in
memory). However, this is flexible since in addition to storing the
length of each of its $n$ dimensions, a rank-$n$ \code{Array} also
stores $n$ ``offsets'' that define the separation of elements in
memory in each dimension. Thus, a 3-by-4 matrix with row-major storage
would store offsets of (4,1). The same size matrix would use
column-major storage simply by storing offsets of (1,3). To make new
arrays use column-major storage, call the following function:
\begin{lstlisting}
 set_array_row_major_order(false);
\end{lstlisting}
Note that this does not change the storage of any existing
objects. Note also that when array expressions are evaluated, the data
are requested in row-major order, so the use of column-major arrays
will incur a performance penalty.

An \code{Array} may also be constructed such that it immediately
contains data:
\begin{lstlisting}
 Vector v = M(__,0); // Link to an existing array, in this case the first column of M
 Vector v(M(__,0));  // Has exactly the same effect as the previous example
 Matrix N = log(M);  // Initialize with the size and values of a mathematical expression
\end{lstlisting}
It can be seen from the constructors involving \code{Vector}s that an
\code{Array} can be configured to ``link'' to part of an existing
\code{Array}, and modifications to the numbers in one will be seen by
the other. This is a very useful feature as it allows slices of an
array to be passed to functions and modified; see section
\ref{sec:slice}. Note that the array or sub-array being linked to must
be of the same rank, type and activeness as the linking array.
Internally, linking is achieved by both the arrays pointing to the
same \code{Storage} object, which itself contains a reference count of
the number of arrays pointing to it. When an \code{Array} is
destructed the reference count is reduced by one and only if it falls
to zero will the data get deallocated. This ensures that if the
\code{Array} being linked to goes out of scope, the linking
\code{Array} will ``steal'' the data.

You can also make an \code{Array} point to data not held in a
\code{Storage} object, for example in a function whose interface is
only in terms of intrinsic C types:
\begin{lstlisting}
 double my_norm2(int n, double* ptr) {
   Vector x(ptr, dimensions(n)); // Create a Vector pointing to existing data
   return norm2(x);              // Use Adept's L2-norm function
 }
\end{lstlisting}
The \code{Vector} in this example can be used in the same way as any
other array, but relies on the existing data not being deallocated for
the lifetime of the \code{Vector}.

After it has been constructed, an \code{Array} can be resized,
relinked or cleared completely as follows:
\begin{lstlisting}
 M.resize(5,2);             // Works up to 7 dimensions
 M.resize(dimensions(5,2)); // As above
 N.resize(M.dimensions());  // Resize N to be the same size as M
 v.link(M(end-1,__));       // Size of v set to that of the argument and link to data
 v >>= M(end-1,__);         // Convenient syntax for linking, similar to Fortran's "=>"
 M.clear();                 // Returns array to original empty state
\end{lstlisting}
The member functions \code{resize} and \code{clear} unlink from any
existing data, which involves deallocation if no other array is
pointing to the same data. If the \code{link} function, or the
alternative ``\code{>>=}'' syntax, is applied with a non-empty array
on the left-hand-side then the existing data will be quietly cleared
before linking to the new data. Note that if you assign one array to
another (e.g.\ \code{N=M}), then they must be of the same size; if
they are not then you should clear the left-hand-side first. By
default, resized arrays are row-major, unless
\code{set\_array\_row\_major\_order(false)} has been called. To explicitly
specify the ordering, you may use the \code{resize\_row\_major} or
\code{resize\_column\_major} member functions in place of
\code{resize}.

The \code{Array} class implements a number of member functions for
inquiring about its properties:
\begin{description}
\citem{size()} Returns the total number of elements, i.e.\ the product
of the lengths of each of the dimensions.
\citem{dimension(i)} Returns the length of dimension \code{i}.
\citem{offset(i)} Returns the separation in memory of elements along
dimension \code{i}.
\citem{gradient\_index()} For active arrays, returns the gradient
index of the first element of the array, which is always positive; for
inactive arrays it returns a negative number.
\citem{empty()} Returns \code{true} if the array is in the empty
state, or \code{false} otherwise.
\citem{dimensions()} Returns an object listing the extent of each
dimension in the \code{Array}, useful for resizing other arrays.  The
object is actually of type \code{ExpressionSize<int Rank>} (where
\code{Rank} is the rank of the array), a thin wrapper for a simple
  \code{int[Rank]} C-array, although it is rare to need to use it
  explicitly.
\citem{offset()} Returns an object (also of type
\code{ExpressionSize<int Rank>}) describing how array indices are
translated into memory offsets.
\end{description}

An \code{Array} may be filled using the \code{<<} operator for the
first element followed by either the \code{<<} or \code{,} operators
for subsequent elements:
\begin{lstlisting}
 Vector v(4);
 v << 1 << 2 << 3 << 4; // Fill the four elements of v
 v << 1, 2, 3, 4;       // Same behaviour but easier on the eye
 v << 1, 2, 3, 4, 5;    // Error: v has been overfilled
 Matrix M(2,4);
 M << 1, 2, 3, 4,       // Filling of multi-dimensional arrays
      5, 6, 7, 8;       // automatically moves on to next dimension
 M << 1, 2, 3, 4,
      v;                // v treated as a row vector here
\end{lstlisting}
For multidimensional arrays, elements are filled such that the final
dimension ticks over fastest (regardless of whether the array uses
row-major storage internally), and new rows are started when a row is
complete. Moreover, other arrays can be part of the list of elements,
provided that they fit in.  In this context, a rank-1 array is treated
as a row vector. An \code{index\_out\_of\_bounds} exception is thrown
if an array is overfilled, while an \code{empty\_array} exception is
thrown if an attempt is made to fill an empty array.

\cxx11 \begin{leftbar} If you compile your code with C++11 features
  enabled then you can use the ``initializer list'' feature to fill
  arrays using the C-like curly bracket syntax:
\begin{lstlisting}
 Vector v;              // Construct an empty vector
 v = {1, 2, 3};         // Resize to length 3 and fill
 Vector w = {1, 2, 3};  // Construct a vector of length 3 and fill
 w = {4.4, 5.5};        // Underfill leads to remaining elements set to zero (as in C)
 w = {6, 7, 8, 9};      // Overfill leads to size_mismatch exception being thrown
 Matrix M = {{1, 2, 3}, // Multi-dimensional arrays use nested curly brackets;
             {4, 5}};   //  ...underfill again leads to remaining elements set to zero
\end{lstlisting}
Another convenient property of this syntax is that temporary arrays
with explicit values can be used in expressions:
\begin{lstlisting}
 v = w * Vector{3.0, 4.2, 5.1};
\end{lstlisting}
\end{leftbar}

When interfacing with other libraries, direct access to the data is
often required. The \code{Array} class provides the following member
functions:
\begin{description}
\citem{data()} Returns a pointer to the first element in the array,
i.e.\ the element found by indexing all the dimensions of the array
with zero. It is up to the caller to understand the layout of the data
in memory and not to stray outside.  Remember that an array may be
strided and the stride may even be negative so that the data returned
from increasing indices are actually from earlier memory
addresses. Note that a double-precision active array is not stored as
an array of \code{adouble} objects, but as an array of \code{double}
data and a single gradient index for the first element. Thus the
pointer returned by \code{data()} will point to the underlying
inactive data.  In contexts where the \code{Array} object is
\code{const}, a \code{const} pointer will be returned. Note that in a
multi-dimensional array, successive array dimensions are not
guaranteed to be contiguous in memory since it is sometimes
advantageous for vectorization for \Adept\ to pad the rows to an alignment
boundary. You can use the output of the \code{offset()} member
function to determine the spacing of the elements in each dimension.
%
\citem{const\_data()} It is sometimes convenient to specify explicitly
that read-only access is required, in which case you can use
\code{const\_data()} to return a \code{const} pointer to the first
element in the array.
\end{description} 

\section{Operators and mathematical functions}
\label{sec:operators}
The operators and mathematical functions listed in section
\ref{sec:ad_functionality} have been overloaded so that they work exactly as you
would expect. Consider this example:
\begin{lstlisting}
 floatVector a(5);      // Inactive single-precision vector
 aVector b(5), c(5);    // Active vectors
 aReal d;               // An active scalar
 // ... other code manipulating a-d ...
 b = 2.0;               // Set all elements of b to a scalar value
 c += 5.0*a + sin(b)/d; // Add the right-hand-side to c
\end{lstlisting}
The penultimate line illustrates that all elements of an \code{Array} can
be set to the same value, although note that this will only work if
the array is not in the empty state. The final line illustrates how
terms with different rank, type and activeness can participate in the
same expression. Scalars and arrays can participate in the same
expression on the right-hand-side of a statement provided that the
arrays have the same size as the array on the left-hand-side. Objects
of different type (in this case single and double precision) can be
combined in a mathematical operation, and the type of that operation
will be the larger (higher precision) of the two types. If active and
inactive objects participate in an expression then the left-hand-side
must also be active. Expression templates ensure that no temporary
arrays need to be created to store the output of intermediate parts of
the expression.  The functions \code{max} and \code{min} behave just
like binary operators (such as \code{+} and \code{*}) in this regard,
as shown by the following:
\begin{lstlisting}
 c = max(a,b);          // Element-wise comparison of a and b 
 c = min(a,3.0);        // Return minimum of each element of a and 3
\end{lstlisting}

The examples so far have floating-point results, but some operators
(e.g.\ ``\code{==}'') and some functions (e.g.\ \code{isinf}) take
floating-point arguments and return a boolean.  The \Adept\ versions
take floating-point array expressions as arguments and return
\code{bool} expressions of the same rank and size. Finally, the
\Adept\ versions of the operators \code{!}, \code{||} and \code{\&\&}
take a \code{bool} expression as arguments and return a \code{bool}
expression of the same size and rank.

\section{Array slicing}
\label{sec:slice}
This section concerns the many ways that sub-parts of an \code{Array}
can be extracted to produce an object that can be used as an lvalue;
that is, if the object is modified then it will modify part of the
original \code{Array}. It should be stressed that none of these
methods results in any rearrangement of data in memory, so they should
be efficient.

The first way this can be done is via the function-call and
member-access operators (i.e.\ \code{operator()} and
\code{operator[]}, respectively) of the \code{Array}. In the case of
the function-call operator, the same number of arguments as the rank
of the array must be provided, where each argument states how its
corresponding dimension should be treated.  The nature of the
resulting object depends on the type of all of the arguments in a way
that is similar to how Fortran arrays behave, although note that
array indices always start at 0. The four different behaviours are as
follows:


\begin{description}
\item[Extract single value.] If every argument is an integer scalar or
  scalar expression, then a reference to a single element of the array
  will be extracted. If an argument is an integer expression
  containing \code{end}, then \code{end} will be interpreted to be
  the index to the final element of that dimension (a feature borrowed
  from Matlab). If the array is active then the returned object will
  be of a special ``active reference'' type that can be used as an
  lvalue and ensures that any expressions making use of this element
  can be differentiated. Now for some examples:
  \begin{lstlisting}
 aMatrix A(4,3);
 aReal x = A(1,1);  // Copy element at second row and second column into x
 A(end-1,1) *= 2.0; // Double the element in the penultimate row and 2nd column of A
 A(3) = 4.0;        // Error: number of indices does not match number of dimensions
  \end{lstlisting}
\item[Extract regular subarray.] If every argument is either (i) an
  integer scalar or scalar expression, or (ii) a regular range of
  indices, and there is at least one of (ii), then an \code{Array}
  object will be returned of the same type and activeness as the
  original. However, for each argument of type (i), the rank of the
  returned array will be one less than that of the original. There are
  three ways to express a regular range of indices: ``\code{\_\_}''
  represents all indices of a particular dimension, \code{range(a,b)}
  represents a contiguous range of indices between \code{a} and
  \code{b} (equivalent to \code{a:b} in Fortran and Matlab), and
  \code{stride(a,b,c)} represents a regular range of indices between
  \code{a} and \code{b} with spacing \code{c} (equivalent to
  \code{a:b:c} in Fortran and \code{a:c:b} in Matlab). Note that
  \code{a}, \code{b} and \code{c} may be scalar expressions containing
  \code{end}, but \code{c} must not be zero although it can be
  negative to indicate a reversed ordering. The rank of the returned
  array is known at compile time; thus if range arguments are found at
  run-time to contain only one element (e.g.\ \code{range(1,1)}) then
  the dimension being referred to will not be removed in the
  returned array but will remain as a singleton dimension. This
  behaviour is the same as indexing an array dimension with \code{1:1}
  in Fortran. Now for some examples:
\begin{lstlisting}
 v(range(1,end-1))           // Subset of vector v that excludes 1st & last points
 A(0,stride(end,0,-1))       // First row of A as a vector treated in reverse order
 A(range(0,0),stride(0,0,1)) // A 1-by-1 matrix containing the first element of A
\end{lstlisting}
\item[Extract irregular subarray.] If an array is indexed as in either
  of the two methods above, except that one or more dimensions is
  instead indexed using a rank-1 \code{Array} of integers, then the
  result is a special ``indexed-array'' type that stores how each
  dimension is indexed.  If it then participates either on the left-
  or right-hand-side of a mathematical expression then when an element
  is requested, the indices will be queried to map the request to
  obtain the correct element from the original array. This is much
  less efficient than using regular ranges of indices as above. It
  also means that if an indexed array is passed to a function
  expecting an object of type \code{Array}, then it will first be
  converted to an \code{Array} and any modifications performed within
  the function will not be passed back to the original array. For
  example:
\begin{lstlisting}
 intVector index(3);
 index << 2, 3, 5;
 Matrix A(4,6);
 A(0,index) = 2.0; // Set irregularly spaced elements of the first row of A
\end{lstlisting}
\item[Slice leading dimension.] In C, an element is extracted from a
  two-dimensional array using \code{A[i][j]}, and \code{A[i]} returns
  a pointer to a single row of \code{A}, where \code{i} and \code{j}
  are integers. To enable similar functionality, if \code{A} is an
  \Adept\ matrix then \code{A[i]} indexes the leading dimension by
  integer \code{i} returning an array of rank one less than the
  original. This is equivalent to \code{A(i,\_\_)}. Furthermore,
  \code{A[i][j]} will return an individual element as in C, but it
  should be stressed that \code{A(i,j)} is more efficient since it
  does not involve the creation of intermediate arrays.
\end{description}
%
There are a few other ways to produce lvalues that consist of a subset
or a reordering of an array. They are implemented as member functions
of the \code{Array} class, in order to distinguish from non-member
functions that produce a copy of the data and therefore cannot be
usefully used as lvalues.  For example, \code{A.T()} and
\code{transpose(A)} both return the transpose of matrix \code{A}, but
the former is faster since it does not make a copy of the original
data, while the latter is more flexible since it can be applied to
array expressions (e.g.\ \code{transpose(A*B)}).  The member functions
available are:
\begin{description}
\citem{subset(int ibegin0, int iend0, ...)} This function returns a
contiguous subset of an array as an array of the same rank that points
to the original data. It takes twice as many arguments as the array
has dimensions, with each pair of arguments representing the indices
to the first and last element to include from a particular
dimension. Exactly the same result can be obtained using \code{range}
but the \code{subset} form is more concise. For example, for a matrix
\code{M}, \code{M.subset(1,5,3,10)} is equivalent to
\code{M(range(1,5),range(3,10))}.
%
\citem{T()} This function returns the transpose of a rank-2 array (a
matrix). The returned array points to the same data but with its
dimensions reversed. A compile-time error occurs if this function is
used on an array with rank other than 2.  Currently \Adept\ doesn't
allow the transpose of a rank-1 array (a vector), since vectors are
not intended to have an intrinsic orientation.  When orientation
matters, such as in matrix multiplication, the intended orientation
may be inferred from the context or specified explicitly.
%
\citem{permute(int i0, int i1, ...)} This function is the
generalization of the transpose for multi-dimensional arrays: it
returns an array of the same rank as the original but with the
dimensions rearranged according to the arguments. There must be the
same number of arguments as there are dimensions, and each dimension
(starting at 0) must be provided once only. The returned array is
linked to the original; the permutation is achieved simply by
rearranging the list of dimensions and the list of ``offsets'' (the
separation in memory of elements along each dimension individually).
%
\citem{diag\_matrix()} When this function is applied to a rank-1
\code{Array} of length $n$, it returns an $n$-by-$n$ diagonal matrix
(specifically a \code{DiagMatrix}; see section \ref{sec:square}) that
points to the data from the rank-1 array along its diagonal.
%
\citem{diag\_vector()} When this function is applied to a rank-2
\code{Array} with equally sized dimensions, it returns a rank-1 array
pointing to the data along its diagonal.  An
\code{invalid\_operation} exception is thrown if applied to a
non-square matrix, and a compile-time error if applied to an array of
rank other than 2.
%
\citem{diag\_vector(int i)} When applied to a square rank-2 $n$-by-$n$
\code{Array}, this returns a rank-1 array of length
$n-\mathrm{abs}(i)$ pointing to the $i$th superdiagonal of the square
matrix, or the $-i$th subdiagonal if $i$ is negative. An
\code{invalid\_operation} exception occurs if applied to a non-square
matrix, and a compile-time error if applied to an array of rank other
than 2.
%
\citem{submatrix\_on\_diagonal(int ibegin,int iend)} When applied to a
square rank-2 array, this function returns a square matrix that shares
part of the diagonal of the original matrix.  Thus
\code{A.submatrix\_on\_diagonal(ibegin,iend)} is equivalent to
\code{A(range(ibegin,iend),range(ibegin,iend))}. Its purpose is to
provide a subsetting facility for symmetric, triangular and
band-diagonal matrices (see section \ref{sec:square}) for which
general array indexing is not available. If applied to a non-square
matrix, an \code{invalid\_operation} exception will be thrown.
%\citem{upper\_matrix()}
%\citem{lower\_matrix()}
%\citem{band\_matrix<LDiag,UDiag>()}
\citem{reshape(int i0, int i1...)} Only applicable to an \code{Array}
of rank 1, this returns a multi-dimensional array whose dimensions are
given by the arguments to the function.  Between 2 and 7 dimensions
are possible. If the arguments are such that the total size of the
returned array would not match the length of the vector, an
\code{invalid\_dimension} exception is thrown.
\end{description}

\section{Passing arrays to and from functions}
\label{sec:passing}
When writing functions taking array arguments, there are three
different ways to do it depending on the extent to which the function
needs to be able to modify the array.  In the case of constant array
arguments, a constant reference should be used; for example:
\begin{lstlisting}
 Real l3norm(const Vector& v) {     // Function returning the L3-norm of a vector
   return cbrt(sum(v*v*v));
 }
 Vector w(3); w << 1.0, 2.0, 3.0;   // Create a test vector
 Real ans1 = l3norm(w);             // Named vector argument
 Real ans2 = l3norm(w(range(0,1))); // Temporary vector argument
 Real ans3 = l3norm(2.0*w);         // Expression implicitly converted to temporary vector
\end{lstlisting}
This function works with all three types of argument.  The last
example illustrates that when an inactive rank-1 expression is passed
to the function, it is evaluated and the result placed in a temporary
vector that is passed to the function.

At the other extreme, we may wish to create a function that modifies
an array argument, including the possibility of changing its size; for
example:
\begin{lstlisting}
 void resize_and_zero(int n, Vector& v) { // A rather pointless function...
   v.resize(n); v = 0.0;
 }
 Vector w(4);
 resize_and_zero(2,w);                    // Results in w={0.0, 0.0}
 resize_and_zero(2,w(range(0,2)));        // Compile error: argument is temporary
 resize_and_zero(2,2.0*w);                // Compile error: argument is not an lvalue
\end{lstlisting}
In this case, due to the C++ rule that a non-constant reference cannot
bind to a temporary object, the function can only take a
\emph{non-temporary} \code{Vector} as an argument.  This is fair
enough; it would not make sense to resize the subset of an array, or
an expression. However, it is very common to want to pass a subset of
an array to a function and for the function to modify the values of
the array, but not to resize it. In \Adept\ this is achieved as
follows:
\begin{lstlisting}
 void square_in_place(Vector v) {
   v *= v;
 }
 Vector w(3); w << 2.0, 3.0, 5.0;
 square_in_place(w);              // Results in w={4.0, 9.0, 25.0}
 square_in_place(w(range(0,1)));  // Results in w={4.0, 9.0, 5.0}
 square_in_place(2.0*w);          // No effect on w
\end{lstlisting}
Even though the \code{Vector} has been passed by value, the
\code{Vector} copy constructor performs a ``shallow copy'', which
means that little more than the array dimensions and a pointer to the
data are copied. Therefore, in the first two examples above the vector
\code{v} inside the function points to data in \code{w}, and can
therefore modify \code{w}.  By contrast, when an expression is passed
to the function, a new \code{Vector} is created to hold the result of
the expression, and when this is modified inside the function it does
not affect the data in the calling routine.

The fact that \code{Array} copy constructors perform shallow copies
also improves the efficiency of functions that return arrays such as
the following:
\begin{lstlisting}
 Matrix square(const Matrix& in) {
   Matrix out = in*in; // Create a matrix containing the result of in*in
   return out;  
 }
 Matrix A(100,100);    // Allocate memory for "A"
 Matrix B = square(A); // Copy constructor: shallow copy of "out" into "B"
\end{lstlisting}
At the \code{return} statement, matrix \code{out} is received by the
copy constructor of matrix \code{B}, so a shallow copy is
performed. This means that the description of matrix \code{out} is
copied to \code{B}, including a pointer to a \code{Storage} object
containing both the data and a count of the number of references to
it; this counter is increased by one. Matrix \code{out} is then
destructed, and the counter is immediately reduced by one. The net
result is that \code{B} has ``stolen'' the data in the matrix from
\code{out} without it having been copied, thus avoiding unnecessary
allocation of memory on the heap followed by copying and deallocation. 

The shallow-copy implementation leads to behaviour that users may not
be expecting. If an array is initialized from another array in either
of the following two ways:
\begin{lstlisting}
 Matrix M(3,4);
 Matrix A(M);   // Call copy constructor
 Matrix B = M;  // Call copy constructor
\end{lstlisting}
then the result is that \code{A}, \code{B} and \code{M} share the same
data, rather than a copy being made.  To make a deep copy, it is
necessary to do the following:
\begin{lstlisting}
 Matrix M(3,4);
 Matrix A;      // Create empty matrix
 A = M;         // Call assignment operator for deep copy
\end{lstlisting}
This is annoying, but the alternative is that there would be no clean
way to pass a subset of an array to a function that then modifies its
values. The same behaviour is implemented in the Blitz++ array class
\cite[]{Veldhuizen1995}.

It should be noted that with the introduction of ``move semantics'' in
the C++11 standard, it is possible to detect when an array
returned from a function is about to be destructed, and therefore
invoke a move constructor that implements a shallow copy. This negates
one of the two reasons for making the copy constructor execute only a
shallow copy.  But it does not help in passing array subsets to
functions, unless two versions of every function were created, one
accepting an lvalue reference (\code{Array\&}) and the other accepting
an rvalue reference (\code{Array\&\&}), which is hardly practical.

\cxx11 \begin{leftbar}If you compile your code with C++11 features
  enabled then move semantics can sometimes make assignment more
  efficient. Consider code calling the \code{square} function above:
\begin{lstlisting}
 Matrix A(10,10), B(10,10);
 B = square(A); // Move assignment operator performs shallow copy
 Matrix C(B);   // B and C now share the same data
 B = square(A); // Move assignment operator performs deep copy
\end{lstlisting}
  Both assignments are from temporary objects about to be destructed, so
  the move assignment operator is called. This operator checks how
  many references there are to the data in \code{B}. In the first case
  there is only one reference, so the data in \code{B} can safely be
  discarded and a shallow copy (a ``move'') of the data in the
  temporary is performed. In the second case there are two references,
  so a deep copy must be performed in order that \code{C} sees the
  change in \code{B}.
\end{leftbar}

\section{Array reduction operations}
\label{sec:reduce}
A family of functions return a result that is reduced in rank compared
to their argument, and operate in the same way as Fortran functions of
the same name.  Consider the \code{sum} function, which can be used
either to sum all the elements in an array expression and return a
scalar, or to sum elements along the dimension specified in the second
argument and return an array whose rank is one less than the first
argument:
\begin{lstlisting}
 Array A(3,4);
 Real x = sum(A);     // Sum all elements of matrix A
 Vector v = sum(A,1); // Sum along the row dimension returning a vector of length 3
\end{lstlisting}
Functions that are used in the same way are \code{mean},
\code{product}, \code{minval}, \code{maxval} and \code{norm2} (the
square-root of the sum of the squares of each element).  Note the
difference between \code{maxval} and \code{max}: the behaviour of
\code{max} is outlined in section \ref{sec:operators}. Three further
functions operate in the same way but on boolean arrays: \code{all}
returns \code{true} only if all elements are \code{true}, \code{any}
returns \code{true} if any element is \code{true} (and \code{false}
otherwise), while \code{count} returns the number of \code{true}
elements.  Each of these can work on an individual dimension as with
\code{sum} and friends.

A further function, \code{dot\_product(a,b)}, takes two arguments that
must be rank-1 arrays of the same length and returns the dot
product. This is essentially the same as \code{sum(a*b)}.

\section{Array expansion operations}
\label{sec:expand}
The function \code{outer\_product(x,y)} returns the outer product of
two rank-1 expressions; if ${\bf x}$ and ${\bf y}$ are interpreted as
column vectors then ${\bf xy}^T$ is returned. If \code{outer\_product}
is used in an expression then an intermediate matrix object is not
created to store it.

The function \code{spread<dim>(A,n)} returns an array that replicates
the \code{A} array \code{n} times along dimension \code{dim}. The
returned array has a rank one larger than \code{A} whose dimension
\code{dim} is \code{n} and the remaining dimensions are the same as
those of \code{A}. It is essentially the same as the Fortran function
of the same name, but \code{dim} is provided as a template argument
since performance is improved if this is known at compile time.  The
following illustrates \code{spread} for an argument of rank 1:
\begin{lstlisting}
 Vector v(3); v << 1, 2, 3;
 Matrix M0 = spread<0>(v,2);
 // M0 contains {{1, 2, 3},
 //              {1, 2, 3}}
 Matrix M1 = spread<1>(v,2);
 // M1 contains {{1, 1},
 //              {2, 2},
 //              {3, 3}}
\end{lstlisting}
Note that \code{spread<1>(x,y.size())*spread<0>(y,x.size())} gives the
same result as \code{outer\_product(x,y)}.

\section{Conditional operations}
\label{sec:conditional}
There are two main ways to perform an operation on an array depending
on the result of a boolean expression. The first is similar to the
Fortran \code{where} construct:
\begin{lstlisting}
 Array A(3,4);
 Array B(3,4);
 A.where(B > 0.0)   = 2.0 * B;            // Only assign to A if B > 0
 A.where(!isnan(B)) = either_or(-B, 0.0); // Read from either one expression or the other
\end{lstlisting}
In the first example, \code{A} is only assigned if a condition is met,
and therefore \code{A} must be of the same size and rank as the
boolean expression. In the second example \code{A} is filled with
elements from the first argument of \code{either\_or} if the boolean
expression is \code{true}, or from the second argument otherwise; if
\code{A} is empty then it will be resized to the size of the boolean
expression. In both cases, the expressions on the right-hand-side may
be scalars or array expressions of the same size as the boolean
expression.  Equivalent expressions are possible replacing the
assignment operator with the \code{+=}, \code{-=}, \code{*=} and
\code{/=} operators, in which case \code{A} must already be the same
size as the boolean expression.

An alternative approach that works with only vectors uses the
\code{find} function. This is similar to the equivalent Matlab
function and returns an \code{IndexVector} (a vector of integers of
sufficient precision to index an array) containing indices to the
\code{true} elements of the vector:
\begin{lstlisting}
 Vector v(10), w(10);
 v(find(v > 5.0)) = 3.0;
 IndexVector index = find(v > 5.0);
 v(index) = 2.0 * w(index);
\end{lstlisting}
This will work if no \code{true} elements are found: \code{find} will
return an empty array, and when \code{v} is indexed by an empty
vector, no action will be taken.  In general, \code{find} is less
efficient than \code{where}.

\section{Fixed-size arrays}
\label{sec:fixed}
The size of the \code{Array} class is dynamic, which is somewhat
sub-optimal for small arrays whose dimensions are known at compile
time. \Adept\ provides an alternative class template for an array
whose size is known at compile time and whose data are stored on the
stack. It has the following declaration:
\begin{lstlisting}
 namespace adept {
   template <typename Type, bool IsActive, int Dim0, int Dim1 = 0, ...>
   class FixedArray;
 }
\end{lstlisting}
The type (e.g.\ \code{double}) and activeness are specified by the
first two template arguments, while the remaining template arguments
provide the size of the dimensions, up to 7.  Only as many sizes need
to be specified as there are dimensions.  A user working with arrays
of a particular size could use \code{typedef} to provide convenient
names; for example:
\begin{lstlisting}
 typedef FixedArray<double,false,4>   Vector4;
 typedef FixedArray<double,false,4,4> Matrix44;
 typedef FixedArray<double,true,4>    aVector4;
 typedef FixedArray<double,true,4,4>  aMatrix44;
\end{lstlisting}
In the \code{adept} namespace, \Adept\ defines \code{Vector2},
\code{Vector3}, \code{Matrix22}, \code{Matrix33} and their active
counterparts.

Fixed arrays have all the same capabilities as dynamic arrays, with a
few exceptions:
\begin{itemize}
\item Since their size is fixed, there are no member functions
  \code{resize}, \code{clear} or \code{in\_place\_transpose}.
\item Since for the lifetime of the object it is associated with data
  on the stack, it cannot link to other data.  This means that there
  is no member function \code{link}, and also if it is passed by value
  to a function then the contents of the array will be copied, rather
  than the behaviour of the \code{Array} class where the receiving
  function links to the original data.
\end{itemize}
All the same slicing operations are available as discussed in section
\ref{sec:slice}, and they return the same types when applied to fixed
arrays as they do when applied to dynamic arrays.  Thus most
operations return an \code{Array} object that links to a subset of the
data within the \code{FixedArray} object.

\section{Special square matrices}
\label{sec:square}
\Adept\ offers several special types of square matrix that can
participate in array expressions.  They are more efficient than
\code{Array}s in certain operations such as matrix multiplication and
assignment, but less efficient in operations such as accessing
individual elements. All use an internal storage scheme compatible
with BLAS (Basic Linear Algebra Subprograms).  All are specializations
of the \code{SpecialMatrix} class template, which has the following
declaration:
\begin{lstlisting}
 namespace adept {
   template <typename Type, class Engine, bool IsActive = false>
   class SpecialMatrix;
 }
\end{lstlisting}
The first template argument is the numerical type, the second provides
the functionality specific to the type of matrix being simulated, and
the third states whether the matrix participates in the
differentiation of an algorithm. The specific types of special matrix
are as follows:
\begin{description}
\item[Square matrices.] \code{SquareMatrix} provides a dense square
  matrix of type \code{Real} with \code{aSquareMatrix} its active
  counterpart. Its functionality is similar to a rank-2 \code{Array},
  except that its dimensions are always equal and the data along its
  fastest varying dimension are always contiguous in memory, which may
  make it faster than \code{Array} in some instances.
\item[Symmetric matrices.] \code{SymmMatrix} provides a symmetric
  matrix of type \code{Real}, and \code{aSymmMatrix} is its active
  equivalent. Internally this type uses row-major unpacked storage
  with the data held in the lower triangle of the array and zeros in
  the upper triangle (equivalent to column-major storage with data in
  the upper triangle). If the opposite configuration is required then
  it is available by specifying different template arguments to the
  \code{SpecialMatrix} class template.  Note that with normal access
  methods, the storage scheme is opaque to the user; for example,
  \code{S(1,2)=2.0} and \code{S(2,1)=2.0} have the same effect.
\item[Triangular matrices.] \code{LowerMatrix} and \code{UpperMatrix}
  (and their active equivalents prefixed by ``\code{a}'') provide
  triangular matrices of type \code{Real}. Internally they use
  row-major unpacked storage, although column-major storage is
  available by specifying different template arguments to the
  \code{SpecialMatrix} class template.
\item[Band diagonal matrices.] \code{DiagMatrix}, \code{TridiagMatrix}
  and \code{PentadiagMatrix} provide diagonal, tridiagonal and
  pentadiagonal \code{Real} matrices, respectively (with their active
  equivalents prefixed by ``\code{a}''). Internally, row-major
  BLAS-type band storage is used such that an $n$-by-$n$ tridiagonal
  matrix stores $3n$ rather than $n^2$ elements. \Adept\ supports
  arbitrary numbers of sub-diagonals and super-diagonals, accessible
  by specifying different template arguments to the
  \code{SpecialMatrix} class template.
\end{description}
A \code{SpecialMatrix} can be constructed and resized as for
\code{Array}s (see section \ref{sec:array}), with the following
additions:
\begin{lstlisting}
 SymmMatrix S(4);  // Initialize a 4-by-4 symmetric matrix
 S.resize(5);      // Resize to a 5-by-5 matrix
\end{lstlisting}
These are applicable to all types of \code{SpecialMatrix}.

In terms of array indexing and slicing, the member functions \code{T},
\code{diag\_vector} and \code{submatrix\_on\_diagonal} described in section
\ref{sec:slice} are all available, but if you index a
\code{SpecialMatrix} with \code{S(a,b)} then \code{a} and \code{b} must
be scalars or scalar expressions. For triangular or band-diagonal
matrices, if the requested element is one of the zero parts of the
matrix then it can only be used as an rvalue in an expression. If you
wish to extract arbitrary subarrays from a \code{SpecialMatrix} then it
must first be converted to a \code{Matrix}:
\begin{lstlisting}
 SymmMatrix S(6);
 intVector index(3);
 index << 2, 3, 5;
 Matrix M = Matrix(S)(index,stride(0,4,2));
\end{lstlisting}


\section{Matrix multiplication}
\label{sec:matmul}
Matrix multiplication may be invoked in two equivalent ways: using the
\code{matmul} function or the ``\code{**}'' pseudo-operator. Following
Fortran, the two arguments may be either rank-1 or rank-2, but at
least one argument must be of rank-2. The orientation of any rank-1
argument is inferred from whether it is the first or second argument,
as shown here:
\begin{lstlisting}
 Matrix A(3,5), B(5,3), C;
 Vector v(5), w;
 C = matmul(A,B); // Matrix-matrix multiplication: return a 3x3 matrix
 w = matmul(v,B); // Interpret v as a row vector: return a vector of length 3
 w = matmul(A,v); // Interpret v as a column vector: return a vector of length 3
\end{lstlisting}
In this way it is never necessary to transpose a vector; the
appropriate orientation to use is inferred from the context.  You may
find it clearer to use ``\code{**}'' for matrix multiplication as
illustrated here:\footnote{A drawback of the \code{**} interface with
  the orientation of vector arguments being inferred is that in an
  expression like \code{A**v**B} (where \code{A} and \code{B} are
  matrices and \code{v} is a vector), \code{v} is interpreted as a
  column vector in \code{A**v}, which returns a column vector result,
  but this result is then implicitly transposed when it is used as the
  left-hand argument of the matrix multiplication with \code{B}.
  Moreover, the order of precedence affects the result, since this
  expression will not give the same answer as \code{A**(v**B)}.
 % I may
 % consider introducing additional constraints and features in future
 % versions to require users to more explicitly state what they mean in
 % such situations, to reduce the chance of accidental mistakes.
}
\begin{lstlisting}
 Matrix A(3,5), B;
 SymmMatrix S(5);                // 5-by-5 symmetric matrix
 Vector c, x(5);
 c = A **  log(S) ** x;          // Returns a vector of length 3
 c = matmul(matmul(A,log(S)),x); // Equivalent to the previous line but using matmul
 c = A ** (log(S) ** x);         // As the previous example but more efficient
 B = 2.0 * S ** A.T();           // Returns a 5-by-3 matrix
 B = 2.0 * S ** A;               // Run-time error: inner dimensions don't match
\end{lstlisting}
The ``\code{**}'' pseudo-operator has been implemented in \Adept\ by
overloading the dereference operator such that ``\code{*A}'' returns a
special type when applied to array expressions, and overloading the
multiply operator to perform matrix multiplication when one of these
types is on the right-hand-side. This means that \code{**} has the
same precedence as ordinary multiplication, and both will be applied
in order of left to right.  Thus, in the first example above,
matrix-matrix multiplication is performed followed by matrix-vector
multiplication. The third example shows how to make this more
efficient with parentheses to specify that the rightmost matrix
multiplication should be applied first, leading to two matrix-vector
multiplications.  The final example shows an expression that would
fail at runtime with an \code{inner\_dimension\_mismatch} exception
due to the matrix multiplication being applied to matrices whose inner
dimensions do not match.

You cannot use \code{matmul} or ``\code{**}'' for vector-vector
multiplication, since it is ambiguous whether you require the inner
product (dot product) or the outer product. Therefore you must
explicitly call the function \code{dot\_product} (section
\ref{sec:reduce}) or \code{outer\_product} (section \ref{sec:expand}).

In order to get the best performance, \Adept\ does not use expression
templates for matrix multiplication but rather calls the appropriate
level-2 BLAS function for matrix-vector multiplication and level-3
BLAS function for matrix-matrix multiplication. For matrix
multiplication involving active vectors and matrices, \Adept\ first
uses BLAS to perform the matrix multiplication and then stores the
equivalent differential statements. There are therefore a few factors
that users should be aware of in order to get the best performance:
\begin{itemize}
\item If an array expression rather than an array is provided as an
  argument to matrix multiplication, it will first be converted to an
  \code{Array} of the same rank. Therefore, if the same expression is
  used more than once in a sequence of matrix multiplications, better
  performance will be obtained by precomputing the array expression
  and storing it in a temporary matrix:
\begin{lstlisting}
 Matrix A(5,5), B(5,5), C(5,5), D(5,5);
 // Slow implementation:
 C = transpose(2.0*A*B) ** (2.0*A*B);
 D = (2.0*A*B) ** C;
 // Faster implementation:
 {
   Matrix tmp = 2.0*A*B;
   C = tmp.T() ** tmp;
   D = tmp ** C;
 } // "tmp" goes out of scope here
\end{lstlisting}
\item If the left-hand argument of a matrix multiplication is a
  symmetric, triangular or band matrix then a specialist BLAS function
  will be used that is faster than the one for general dense
  matrices. \Adept\ may not be able to tell if the result of an array
  expression is symmetric, triangular or has a band structure, and so
  may not call the most efficient BLAS function. The user can help as
  follows:
\begin{lstlisting}
 SymmMatrix S(5);
 Matrix A(5,5), B(5,5);
 B = (2.0*exp(S)) ** A;           // Slower
 B = SymmMatrix(2.0*exp(S)) ** A; // Faster
\end{lstlisting}
\item BLAS requires that the fastest-varying dimension of input
  matrices is contiguous and increasing. This is always the case for
  the special square matrices described in section \ref{sec:square},
  but not necessarily for a \code{Matrix} or an \code{aMatrix}, which are
  particular cases of the general \code{Array} type. If the
  fastest-varying dimension of such a matrix is not contiguous and
  increasing then \Adept\ will copy it to a temporary matrix before
  invoking matrix multiplications, as in the following example:
\begin{lstlisting}
 Matrix A(5,5), B, C(5,5);
 B.link(A(__, stride(end,1,-1))); // Fastest varying dim is contiguous but decreasing
 C = A ** A; // Matrix multiplication applied directly with A
 C = B ** B; // Adept will copy B to a temporary matrix before multiplication
\end{lstlisting}
\end{itemize}

An additional member function to mention in this section is
\code{in\_place\_transpose()}, which is only applicable to
matrices. It transposes the matrix by swapping the dimensions and the
offsets to each dimension, but leaving the actual data untouched.
This means that a matrix with row-major storage will be changed to
column-major, and vice versa.

\Adept\ can differentiate expressions involving matrix multiplication,
but this is far from optimal in \Adept\ version 2.0, for two
reasons. Firstly, only differentiation of dense matrices has been
implemented, so when matrix multiplication is applied to active
``special matrices'' (symmetric, band, upper-triangular and
lower-triangular matrices), they are first copied to a dense
matrix. Secondly, the \Adept\ stack format can currently only store
differential statements for scalar expressions, which for matrix
multiplication leads to lots of repeated values on the stack. A future
version of \Adept\ will redesign the stack to allow matrices to be
stored in it; this will be much faster and much less memory-hungry.

\section{Linear algebra}
\label{sec:la}
\Adept\ provides the functions \code{solve} and \code{inv} to solve
systems of linear equations and to invert a matrix, respectively,
which themselves call the most appropriate function from
LAPACK.
\begin{lstlisting}
 Matrix A(5,5), Ainv(5,5), X(5,5), B(5,5);
 SymmMatrix S(5), Sinv(5);
 Vector x(5), b(5);
 Ainv = inv(A);     // Invert general square matrices using LU decomposition
 Sinv = inv(S);     // Invert symmetric matrices using Cholesky decomposition
 x = solve(A,b);    // Solve general system of linear equations
 X = solve(S,B);    // Solve symmetric system of linear equations with matrix right-hand-side
\end{lstlisting}
\iffalse
As for matrix multiplication described in section \ref{sec:matmul}, if
the arguments to \code{solve} and \code{inv} are not matrices with
fastest-varying dimensions that are contiguous and increasing, then
\Adept\ will first convert them to temporary matrices before
performing the operation.
\fi

Statements involving \code{solve} and \code{inv} cannot yet be
automatically differentiated. When the \Adept\ stack is redesigned to
hold matrices, this capability will be added.

\section{Interpolation}
\Adept\ supports linear and nearest-neighbour interpolation, in
one, two and three dimensions via the \code{interp}, \code{interp2d}
and \code{interp3d} functions. The example below shows how these
functions are called and the size of the arguments, but does not fill
the arguments with actual data (see the test program
\code{test/test\_interp.cpp} for complete usage):
%
\begin{lstlisting}
 // Size of each dimension
 int nx, ny, nz;
 // Coordinate vectors of each dimension (must be monotonic)
 Vector x(nx), y(ny), z(nz);
 // Arrays to be interpolated
 Vector  A1(nx);
 Matrix  A2(ny,nx);
 Array3D A3(nz,ny,nx);
 // Number of points required
 int ni;
 // Locations of these points
 Vector xi(ni), yi(ni), zi(ni);
 // Output vector
 Vector v(ni);
 // Linear interpolation (default)
 v = interp(x,A1,xi);
 v = interp(x,A1,xi,ADEPT_INTERPOLATE_LINEAR); // Specifying scheme explicitly
 v = interp2d(y,x,A2,yi,xi);
 v = interp3d(z,y,x,A3,zi,yi,xi);
 // Nearest-neighbour interpolation
 v = interp(x,A1,xi,ADEPT_INTERPOLATE_NEAREST);
 v = interp2d(y,x,A2,yi,xi,ADEPT_INTERPOLATE_NEAREST);
 v = interp3d(z,y,x,A3,zi,yi,xi,ADEPT_INTERPOLATE_NEAREST);
\end{lstlisting}
%
Each interpolation function takes coordinate vectors describing each
dimension of the interpolation array in the order of the dimensions of
that array. In the two dimensional case, since matrices are indexed
first by row ($y$ axis) then column ($x$ axis), this is the order they
are shown here.

The interpolation arrays (\code{A1}, \code{A2} and \code{A3} here) may
have more dimensions than shown above; for each additional dimension, a
further dimension is added to the output array, and effectively
multiple arrays are interpolated at once. In this case, the coordinate
vectors still refer to the first one, two or three dimensions of this
array and the remaining (more rapidly varying in memory) dimensions
come after.

As can be seen from the listing above, an optional argument after the
array arguments specifies the interpolation scheme to use, but this
argument can also be used to specify the extrapolation policy to apply
for requested points that lie outside of the interpolation array by
using a bitwise-OR with one of the following:
%
\begin{description}
  \citem{ADEPT\_EXTRAPOLATE\_DEFAULT} Use the default extrapolation
  policy associated with the interpolation scheme (see
  below). Obviously this can be omitted.
  \citem{ADEPT\_EXTRAPOLATE\_LINEAR} Linear extrapolation; this is the
  default for linear interpolation, but is not available with
  nearest-neighbour interpolation.  \citem{ADEPT\_EXTRAPOLATE\_CLAMP}
  Clamp the returned value at the nearest valid point in the
  interpolation array; this is the default for nearest-neighbour
  interpolation.  \citem{ADEPT\_EXTRAPOLATE\_CONSTANT} Set outliers to
  a constant value provided by a further optional argument to the
  function, or \code{NaN} if no additional argument is provided.
\end{description}
For example:
\begin{lstlisting}
 // Explicit selection of default behaviour (linear interpolation & extrapolation)
 v = interp(x,A1,xi,ADEPT_INTERPOLATE_LINEAR|ADEPT_EXTRAPOLATE_DEFAULT);
 // Nearest-neighbour interpolation with clamped extrapolation
 v = interp(x,A1,xi,ADEPT_INTERPOLATE_NEAREST|ADEPT_EXTRAPOLATE_CLAMP);
 // Nearest-neighbour interpolation, outliers set to NaN
 v = interp(x,A1,xi,ADEPT_INTERPOLATE_NEAREST|ADEPT_EXTRAPOLATE_CONSTANT);
 // Linear interpolation, outliers set to zero
 v = interp(x,A1,xi,ADEPT_EXTRAPOLATE_CONSTANT, 0.0);
\end{lstlisting}

\section{Bounds and alias checking}
\label{sec:bounds}
When encountering an array or active expression, \Adept\ performs
several checks to test the validity of the expression both at compile
time and at runtime:
\begin{description}
\item[Activeness check.] An expression in which an active expression
  is assigned to an inactive array will fail to compile.
\item[Rank check.] An expression will fail to compile if the rank of
  the array on the left-hand-side of the ``\code{=}'' operator (or the
  operators ``\code{+=}'', ``\code{*=}'', etc.) does not match the
  rank of the array expression on the right-hand-side. However, a
  scalar (rank-0) expression can be assigned to an array of any rank;
  its value will be assigned to all elements of the
  array. Compile-time rank checks are also performed for each binary
  operation (binary operators such as ``\code{+}'' and binary
  functions such as \code{pow}) making up an array expression:
  compilation will fail if the two arguments do not have the same rank
  and neither is of rank 0.
\item[Dimension check.] When a binary operation is applied to two
  array expressions of rank $n$ then \Adept\ checks at run-time that
  each of the $n$ dimensions has the same length. Otherwise, a
  \code{size\_mismatch} exception is thrown.
\item[Alias check.] By default, \Adept\ checks to see whether the memory
  referenced in the array object on the left-hand-side of a statement
  overlaps with the memory referenced by any of the objects on the
  right-hand-side, as in this example of a shift-right operation:
\begin{lstlisting}
 Vector v(6);
 v(range(1,end)) = v(range(0,end-1));
\end{lstlisting}
  In order to prevent the right-hand-side changing during the
  operation, \Adept\ copies the expression on the right-hand-side to a
  temporary array and then assigns the left-hand-side array to this
  temporary, which is equivalent to the following:
\begin{lstlisting}
 {
   Vector tmp;
   tmp = v(range(0,end-1));
   v(range(1,end)) = tmp;
 } // tmp goes out of scope here
\end{lstlisting}
  However, for speed \Adept\ does not check to see whether individual
  memory locations are shared; rather the start and end memory
  locations are checked to see if they overlap. This means that for
  certain strided operations, a copy to a temporary array is made even
  though it is unnecessary.  Nor is it necessary if elements of an array will be
  accessed in exactly the same order on the left-hand-side as the
  right-hand-side. If the user is sure that alias checking is not
  necessary then he or she can override alias checking for part or all
  of an array expression using the \code{noalias} function, as
  follows:
\begin{lstlisting}
 v(stride(1,end,2)) = noalias(v(stride(0,end-1,2))); // No overlap between RHS and LHS
 v = 1.0 + noalias(exp(v));                          // LHS & RHS accessed in same order 
\end{lstlisting}
  Note that for speed, alias checking is not performed if the
  left-hand-side is a \code{FixedArray}, since such arrays can never
  point to another location and therefore aliasing is less likely to
  arise. Aliasing is still possible if one of the terms on the
  right-hand-side points to the data in the \code{FixedArray} on the
  left. In this case, you can use the \code{eval} function, which
  takes a non-scalar expression as an argument, and returns an array
  containing a copy of the data. For example:
\begin{lstlisting}
 FixedArray<Real,false,3> v = {1.0, 2.0, 3.0}; // C++11 initialization of inactive vector
 v = v(stride(end,0,-1));                      // Aliasing leads to v = {3.0, 2.0, 3.0}
 v = eval(v(stride(end,0,-1)));                // Expected result:  v = {3.0, 2.0, 1.0}
\end{lstlisting}
  To avoid the overhead of alias checking, you can define the
  preprocessor variable \code{ADEPT\_NO\_ALIAS\_CHECKING}, but then it
  is up to the user to identify the statements where aliasing will
  occur and use the \code{eval} function to ensure the correct
  behaviour.
\item[Bounds check.] If the preprocessor variable
  \code{ADEPT\_BOUNDS\_CHECKING} is defined then additional run-time
  checks will be performed when an array is indexed or sliced using
  the methods described in section \ref{sec:slice}; if an index is
  out of bounds then an \code{index\_out\_of\_bounds} exception will
  be thrown.  This makes indexing and slicing of arrays slower so
  would normally only be used for debugging.
\end{description}

\section{Automatic differentiation capabilities specific to arrays}
Section \ref{sec:adjoint} described how the \code{get\_gradient()}
member function could be used to extract the gradients from a scalar
\code{adouble} object after applying forward- or reverse-mode
differentiation. In the same way, gradients may be extracted from
active \code{Array} and \code{FixedArray} objects, returning an
inactive \code{Array} of the same rank and size. For example, to
compute the derivative of a \code{norm2} operation, we could do the
following:
\begin{lstlisting}
 Stack stack;                     // Stack to store differential statements
 aVector x = {1.0, 2.0, 3.0};     // C++11 initialization
 stack.new_recording();           // Clear any stored differential statements
 aReal y = norm2(x);              // Perform operation to be differentiated
 y.set_gradient(1.0);             // Seed the dependent variable
 stack.reverse();                 // Reverse-mode differentiation
 Vector dy_dx = x.get_gradient(); // Extract vector of derivatives
\end{lstlisting}

\section{Array thread safety}
\label{sec:thread}
There are numerous ways of obtaining an \code{Array} that links to
data in another \code{Array} object; not only the ``\code{>>=}'' link
operator described in section \ref{sec:array}, but also the various
subsetting member functions described in section \ref{sec:slice}, and
even just passing arrays to and from functions. This avoids deep
copying and so improves efficiency. In addition to the new \code{Array}
pointing to the same data, it also points to the same \code{Storage}
object, and when a new link is created, the counter in this object
indicating the number of objects pointing to it is incremented. This
ensures that the data will remain provided there is at least one
object linking to it.  A downside of this model is that if multiple
threads access an array simultaneously, even if just to read it, then
the reference counter can become corrupted.  There are two solutions
to this problem. 

\cxx11 \begin{leftbar} If you are using C++11 then you can define the
  \code{ADEPT\_STORAGE\_THREAD\_SAFE} preprocessor variable, which
  makes the reference counter in \code{Storage} objects of type
  \code{std::atomic<int>} and thereby makes all operations on them
  atomic. This may degrade the efficiency of your code since the
  atomic operations will be redundant in single-threaded code. \end{leftbar}

Alternatively, we can use the capability of arrays to access data not held
in a \code{Storage} object. The \code{Array} and \code{SpecialMatrix}
classes have a \code{soft\_link()} member function that returns an
object of the same type, size and activeness, which points to the same
data but does not contain a link to the \code{Storage} object:
\begin{lstlisting}
 Matrix M(2,2);
 // ...enter multi-threaded environment
 Matrix N;
 N >>= M.soft_link();            // N links to same data as M but without Storage object
 Vector v = M.soft_link()(__,0); // v links to subset of M but without Storage object
                                 // (recall that the copy constructor is called here) 
\end{lstlisting}
The linked objects may be used in the same way as any other
\code{Array}. This is demonstrated in the
\code{test\_thread\_safe\_arrays} test program.
\section{Writing an array to a stream}
As you would expect, an array can be written to a stream with the
``\code{<<}'' operator:
\begin{lstlisting}
 Vector v = {1, 2};            // Using C++11 initializer lists
 Matrix M = {{3, 4}, {5, 6}};  // for convenience
 std::cout << v << "\n";
 std::cout << M << "\n";
\end{lstlisting}
which by default produces
\begin{lstlisting}
 1 2
 3 4
 5 6
\end{lstlisting}
You can change the output to use curly brackets to indicate
the dimensions of the array as follows:
\begin{lstlisting}
 Vector v = {1, 2};
 Matrix M = {{3, 4}, {5, 6}};
 adept::set_array_print_style(PRINT_STYLE_CURLY);
 std::cout << "v = " << v << ";\n";
 std::cout << "M = " << M << ";\n";
\end{lstlisting}
which produces output that looks like C/C++ code:
\begin{lstlisting}
 v = {1, 2};
 M =
 {{3, 4},
  {5, 6}};
\end{lstlisting}
The available print styles for use by \code{set\_array\_print\_style}
are \code{PRINT\_STYLE\_PLAIN} (default), \code{PRINT\_STYLE\_CURLY},
\code{PRINT\_STYLE\_CSV} (comma-separated values) and
\code{PRINT\_STYLE\_MATLAB} (matrix ordering indicated by Matlab-style
semi-colons and square brackets).

\section{Fortran interoperability}
\label{sec:fortran}
The traditional way to pass arrays between Fortran and C/C++ makes use
of the fact that Fortran passes its ``explicit-shape'' arrays (the
type used since Fortran-77) to and from routines simply as a pointer
to the first element of the array. It is then up to the receiving
routine to declare the size of the array correctly.  \Adept\ arrays
can therefore be passed to Fortran routines using their \code{data()}
and \code{const\_data()} member functions, which return pointers to
the first element of the array.

Since Fortran-90, the language also supports ``assumed-shape'' arrays,
which are very much like \Adept's \code{Array} objects: they contain
within them the extent of each array dimension, and may refer to data
that are strided (non-contiguous) in memory.  Fortran passes an
assumed-shape array to subroutines and functions in the form of a
pointer to its \emph{array descriptor} (sometimes known as a
\emph{dope vector}), which contains a pointer to the first element of
the array and information on the rank, type, and the extent and
stride-in-memory of each dimension.

The Fortran 2018 standard defines an interface to allow assumed-shape
arrays to be passed to and from C or C++ functions.  Fortran compilers
supporting this standard provide a C/C++ header file
\code{ISO\_Fortran\_binding.h} that defines the array descriptor as a
structure \code{CFI\_cdesc\_t}.  The \Adept\ header file
\code{adept\_fortran.h} provides a class \code{adept::FortranArray}, a
thin wrapper to this structure, that enables an \Adept\ \code{Array}
object to share its data with a Fortran array. This is very efficient
as only the array descriptor information is copied, not the actual
data in the array.  At the time of writing, support for this
capability in Fortran compilers is limited.

A crucial point to be aware of in all the examples that follow is
that \Adept\ indexes its arrays in row-major order starting at 0,
while Fortran indexes its arrays in column-major order starting (by
default) at 1. When arrays are passed between the two languages, the
native array convention is adopted. Therefore, matrix element
\code{A(0,10)} in \Adept\ would be indexed as \code{A(11,1)} in
Fortran.

\subsection{Passing arrays from C++/Adept to Fortran}
Suppose we have a Fortran subroutine that takes an integer array and a
single-precision array as arguments:
\begin{lstlisting}[language=Fortran]
 ! Define a routine callable with same name in C/C++
 subroutine fortran_routine(int_array, flt_array) bind(c)
   implicit none
   integer(kind=4), intent(inout) :: int_array(:,:) ! Matrix of 4-byte integers
   real(kind=4),    intent(inout) :: flt_array(:,:) ! Matrix of 4-byte real numbers
   ! --- Body of routine here ---
 end subroutine fortran_routine
\end{lstlisting}
The following C++ program demonstrates how \Adept\ arrays can be
passed to this routine:
\begin{lstlisting}
 #include <adept_fortran.h>
 // Declare interface to the routine, turning off C++ name mangling so that it can be linked
 // to Fortran 
 extern "C" void fortran_routine(adept::FortranArray* int_array,
                                 adept::FortranArray* flt_array);
 int main() {
   // Initialize Adept matrices, using shortcuts to the types Array<2,int> and Array<2,float>
   adept::intMatrix   int_arr = {{2, 3, 5}, {7, 11, 13}};
   adept::floatMatrix flt_arr = {{2.0, 3.0, 5.0}, {7.0, 11.0, 13.0}};
   // Convert Adept arrays to Fortran arrays pointing to the same data, and call the routine;
   // the conversion to FortranArray pointers is done automatically
   fortran_routine(adept::FortranArray(int_arr),
                   adept::FortranArray(flt_arr));
   return 0;
 }
\end{lstlisting}
This will fail to compile if the \code{ISO\_Fortran\_binding.h} file is
not found. To link the two object files into an executable you will
need to use your C++ compiler, but include the relevant Fortran
library on the command line (e.g.\ \code{-lgfortran} if you compiled
\code{fortran\_routine} with the GNU Fortran compiler, or
\code{-lifcore} if you used the Intel Fortran compiler).

\subsection{Passing arrays from Fortran to C++/Adept}
We can also pass arrays the other way. Consider the following Fortran
program:
\begin{lstlisting}[language=Fortran]
 program test_interoperability
   implicit none
   ! Define interface to a function implemented in C++
   interface
     subroutine adept_routine(int_array, flt_array) bind(c)
       integer(kind=4), intent(inout) :: int_array(:,:) ! Matrix of 4-byte integers
       real(kind=4),    intent(inout) :: flt_array(:,:) ! Matrix of 4-byte real numbers
     end subroutine adept_routine 
   end interface
   ! Body of program starts here
   integer(kind=4), allocatable :: imat(:,:)
   real(kind=4),    allocatable :: fmat(:,:)  
   ! --- Code to allocate and populate imat and fmat here ---
   ! Now call the C++ function
   call adept_routine(imat, fmat)
 end program test_interoperability
\end{lstlisting}
The routine could be implemented in C++ as follows:
\begin{lstlisting}
 #include <adept_fortran.h>
 extern "C" void adept_routine(adept::FortranArray* int_array,
                               adept::FortranArray* flt_array) {
   // Declare Adept arrays
   adept::intMatrix   int_arr;
   adept::floatMatrix flt_arr;
   // Associate Adept arrays with Fortran data, or throw a fortran_interoperability_error
   // exception if the rank or type do not match
   int_arr >>= int_array;
   flt_arr >>= flt_array;
   // --- Operations on int_arr and flt_arr now modify the Fortran arrays ---
 }
\end{lstlisting}
Since the executable now contains a Fortran source file with a
\code{program} statement, rather than a C++ source file defining a
\code{main} function, the linking step of the compilation must be
carried out using the Fortran compiler, but passing it the C++
standard library, i.e.\ \code{-lstdc++}.

In the example above, the \Adept\ arrays \code{int\_arr} and
\code{flt\_arr} behave in the same way as ``linked'' arrays described
in section \ref{sec:array}: they know that they do not ``own'' the
original data, so if the user then calls their \code{clear} or
\code{resize} member functions, they will unlink themselves from the
Fortran arrays.  The \code{FortranArray} class provides no array
features itself, so must be linked to an \code{Array} object before
any work can be done on it, but it does provide a handful of member
functions for querying its properties:
\begin{description}
 \citem{int rank()} Return the number of dimensions of the array.
 \citem{int dimension(int i)} Return the extent of dimension \code{i}
 in memory, counting dimensions from 0 but using the Fortran ordering.
 \citem{int offset(int i)} Return the stride in memory of dimension
 \code{i}.
 \citem{bool is\_type<Type>()} Return \code{true} if the element type
 of the array is the same as \code{Type} (which must be a known type at
 compile time).
 \citem{Type* data<Type>()} Return a pointer to the first element of
 the data, cast to the specified type.
\end{description}

\chapter{Using \Adept's optimization functionality}
\label{chap:optimize}

\section{Background}
\label{sec:optimize}
Since version 2.0.8, \Adept\ provides functionality for solving
non-linear optimization problems, specifically finding the state
vector ${\bf x}$ that minimizes the scalar cost function $J({\bf
  x})$ (also known as a penalty function or objective function).
%
A \emph{gradient-free} minimization algorithm (e.g.\ Nelder-Mead)
requires simply a user-supplied function for computing $J$, calling it
multiple times for different ${\bf x}$ to find the minimum $J$.
%
A \emph{first-order} minimization algorithm requires also a
user-supplied function returning the gradient of the cost function
$\partial J/{\partial\bf x}$ (a vector). Examples are the Conjugate Gradient
method and the Limited-Memory Broyden-Fletcher-Goldfarb-Shanno
(L-BFGS) method. Knowing the gradient enables such algorithms to find
the minimum with far fewer function calls, although a function call
returning $\partial J/{\partial\bf x}$ is slower than one returning only
$J$. \Adept's optimization interface is in terms of passive array
types, so the user is not obliged to use \Adept's automatic
differentiation capability to compute these gradients, although the
examples in this chapter assume that they do.
%
A \emph{second-order} minimization algorithm makes use of not only $J$
and $\partial J/{\partial\bf x}$, but also a user-supplied function for the
Hessian ${\bf A}=\partial^2J({\bf x})/\partial{\bf x}^2=\nabla_{\bf
  x}^2J$ (a symmetric matrix), or an approximation of it.  Examples
are the Gauss-Newton and Levenberg-Marquardt methods. Knowing the
second derivative means that even fewer iterations should be required
to find the minimum of $J$, but ${\bf A}$ is more expensive to compute
than $\partial J/{\partial\bf x}$.

\Adept\ does not have the ability to automatically compute Hessian
matrices for an arbitrary cost function, but frequently the cost
function has a specific form that makes it possible to compute the
approximate Hessian from the Jacobian matrix.  Consider the
optimization problem of finding the parameters $\x$ of non-linear
model $\y(\x)$ that provides the closest match to a set of
``observations'' $\y^o$ in a least-squares sense.  For maximum
generality we add constraints that penalize differences between $\x$
and a set of \emph{a~priori} values $\x^a$, as well as a
regularization term.  In this case the cost function could be written
as \def\myspace{~~}
\begin{equation}
J(\x) \myspace =\myspace \frac12\left[\y(\x)-\y^o\right]^\mathrm{T}{\bf
  R}^{-1}\left[\y(\x)-\y^o\right]
\myspace+\myspace\frac12\left[\x-\x^a\right]^\mathrm{T}{\bf
  B}^{-1}\left[\x-\x^a\right]
\myspace+\myspace\frac12\x^\mathrm{T}{\bf T}\x.
\label{eq:objective}
\end{equation}
Here, all vectors are treated as column vectors, ${\bf R}$ is the
error covariance matrix of the observations, ${\bf B}$ is the error
covariance matrix of the \emph{a~priori} values, and ${\bf T}$ is a
Twomey-Tikhonov matrix that penalizes either spatial gradients or
curvature in $\x$.  The approximate Hessian matrix is then given by
\begin{equation}
{\bf A} \myspace\simeq\myspace {\bf H}^\mathrm{T}{\bf
  R}^{-1}{\bf H}
\myspace+\myspace {\bf B}^{-1} \myspace+\myspace {\bf T},
\label{eq:hessian}
\end{equation}
which can be coded up using \Adept\ to compute the Jacobian matrix
${\bf H}=\partial\y/\partial\x$. Each term on the right-hand-side of
(\ref{eq:hessian}) has its corresponding term in (\ref{eq:objective}),
so it is easy to work out what the Hessian would look like if only a
subset of the terms in (\ref{eq:objective}) were present. The first
term of (\ref{eq:hessian}) is the `Gauss-Newton' approximation of the
true Hessian of the first term of (\ref{eq:objective}).  It is exact
if $\y(\x)$ is linear, i.e.\ if each element of $\y$ could be
represented as a linear combination of the elements of $\x$. In many
cases this is a good enough approximation of the Hessian for fast
convergence to be achieved.

\section{\Adept\ interface}
\label{sec:minimizer_interface}
For the purposes of demonstrating how this would be implemented in
\Adept\ we simplify (\ref{eq:objective}) down to the case of
minimizing a quadratic function, in which case $J={\bf
  y}^\mathrm{T}{\bf y}/2$ and ${\bf y}={\bf x}$.  The former of these
two equations means that the Hessian matrix is simply ${\bf A}={\bf
  H}^\mathrm{T}{\bf H}$. The latter we implement using active
variables:
%
\begin{lstlisting}
 adept::aVector calc_y(const adept::aVector& x) { return x; }
\end{lstlisting}
%
The test program \code{test/test\_minimizer.cpp} uses a ${\bf y}({\bf
  x})$ function for the much more interesting case of the
$N$-dimensional Rosenbrock function.  To set up the problem ready for
minimizing, we create a class that derives from \Adept's
\code{Optimizable} class and overrides five of its virtual functions:
%
\begin{lstlisting}
 // Include this header file for the functionality described in this chapter
 #include <adept_optimize.h>

 class SimpleOptimizable : public adept::Optimizable {
 public:
   // Return the cost function for a given state vector x
   virtual adept::Real calc_cost_function(const adept::Vector& x) {
     adept::Vector y = value(calc_y(x)); // "value" throws away the activeness
     return 0.5*sum(y*y);
   }

   // Calculate the cost function and its gradient from x
   virtual adept::Real calc_cost_function_gradient(const adept::Vector& x,
					           adept::Vector gradient) {
     adept::aVector xactive = x;           // Copy x to an active variable
     stack.new_recording();
     adept::aVector y = calc_y(xactive);   // Calculate y from x
     adept::aReal cost = 0.5*sum(y*y);     // Calculate cost function as an active variable
     cost.set_gradient(1.0);               // Use reverse-mode differentiation to
     stack.reverse();                      //   compute the gradient
     gradient = xactive.get_gradient();
     return value(cost);                   // Return cost function as passive variable
   }

   // Calculate the cost function, its gradient and the approximate Hessian matrix
   virtual adept::Real calc_cost_function_gradient_hessian(const adept::Vector& x,
			   adept::Vector gradient, adept::SymmMatrix& hessian) {
     adept::aVector xactive = x;           // Copy x to an active variable
     stack.new_recording();
     adept::aVector y = calc_y(xactive);   // Calculate y from x
     adept::aReal cost = 0.5*sum(y*y);     // Calculate cost function as an active variable
     stack.independent(xactive);           // Define independent variables
     stack.dependent(y);                   // Define dependent variables
     adept::Matrix jac = stack.jacobian(); // Compute Jacobian matrix dy/dx
     hessian  = jac.T() ** jac;            // Hessian is a simple matrix product of Jacobian
     gradient = jac.T() ** value(y);       // Gradient is a matrix-vector product
     return value(cost);                   // Return cost function as passive variable
   }

   // Every iteration this function is called: here simply report progress to standard output
   virtual void report_progress(int niter, const adept::Vector& x,
                                adept::Real cost, adept::Real gnorm) {
     std::cout << "Iteration " << niter << ": cost function = " << cost << "\n";
   }
 
   // Minimization algorithm may want to check what derivatives are available: here we 
   // provide 0th (cost function alone) 1st (gradient) and 2nd (Hessian), so return true
   // for 0, 1 or 2, false otherwise
   virtual bool provides_derivative(int order) { return (order >= 0 && order <= 2); }

 // Keep an instance of the Adept stack within the class: avoids the initialization costs
 // incurred each iteration if it was inside calc_cost_function_gradient and
 // calc_cost_function_gradient_hessian 
 private:
   adept::Stack stack;
 };
\end{lstlisting}
Note that if you plan to use a first-order minimization algorithm, you
do not need to provide a
\code{calc\_cost\_function\_gradient\_hessian} function.

\Adept's \code{Minimizer} class can minimize the cost function
held in an \code{Optimizable} object by calling the user-supplied
virtual functions, as follows:
\begin{lstlisting}
 SimpleOptimizable quadratic_function;
 adept::Minimizer minimizer(MINIMIZER_ALGORITHM_LEVENBERG); // Select minimization algorithm
 int nx = 10;         // Number of state variables
 adept::Vector x(nx); // Declare state vector
 x = 3.0;             // Initialize state vector to first guess values, all 3.0
 // Minimize the cost function:
 adept::MinimizerStatus status = minimizer.minimize(quadratic_function, x);
 // Report the convergence status:
 std::cout << "Convergence status: " << adept::minimizer_status_string(status) << "\n";
\end{lstlisting}
%
After the \code{minimize} member function is called, \code{x} contains
the state vector that minimizes the cost function.

The available minimization algorithms are:
\begin{description}
\citem{MINIMIZER\_ALGORITHM\_CONJUGATE\_GRADIENT} The first-order
Conjugate-Gradient algorithm performs a line search along the
steepest-descent direction, then uses the Polak-Ribi\`ere formula to
compute subsequent search directions that are conjugate to the
previous $N$ directions, where $N$ is the number of state
variables. The Conjugate-Gradient method is the most memory efficient,
so suitable for problems with large $N$. The line search first
brackets the minimum then fits a cubic polynomial to the values and
gradients at the bounding points to find the best estimate of the next
search point. The Wolfe conditions are applied to determine whether
the cost function along the search direction has been sufficiently
minimized.
%
\citem{MINIMIZER\_ALGORITHM\_CONJUGATE\_GRADIENT\_FR} As above but
using the Fletcher-Reeves formula to compute new search directions.
%
\citem{MINIMIZER\_ALGORITHM\_LIMITED\_MEMORY\_BFGS} The first-order
Limited-Memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) algorithm
uses a limited number of previous search directions (default 6) to
build up an approximation to the inverse of the Hessian matrix,
enabling it to make a better estimate of the location of the minimum
of the cost function, but with a slightly higher memory
footprint. Note that the full inverse Hessian is not computed
explicitly so this method is still efficient in memory for large $N$.
%
\citem{MINIMIZER\_ALGORITHM\_LEVENBERG} The second-order Levenberg
algorithm tries to perform a Gauss-Newton step using the approximate
Hessian matrix and assuming that the curvature of $J$ is locally
constant. If $J$ at the new ${\bf x}$ is not reduced by this step then
a damping parameter $\lambda$ is used to scale between Gauss-Newton
and a steepest-descent algorithm.
%
\citem{MINIMIZER\_ALGORITHM\_LEVENBERG\_MARQUARDT} The second-order
Levenberg-Marquardt algorithm is similar to the Levenberg algorithm,
but scales such that the step sizes are changed in each dimension
according to the curvature of the cost function in that dimension
(i.e.\ the diagonal of the Hessian matrix). This tends to result in
faster convergence than the Levenberg algorithm for problems with very
different scaling for each element of the state vector.
\end{description}
The minimizer can be configured in detail by calling its member
functions listed in section \ref{sec:minimizer_options}.  The possible
values for the return status are given in section
\ref{sec:minimizer_status}.

The case above is an example of \emph{unconstrained minimization}: the
minimizer is free to try any values of ${\bf x}$. This can lead to it
trying unphysical values, such as negative values for a quantity that
cannot be negative.  To prevent this, all the minimization algorithms
allow the user to specify simple box constraints on the elements of
the state vector.  Suppose we wanted to constrain element 0 to be
positive and element 1 to lie in the range 10--20, we would add these
lines:
\begin{lstlisting}
 // Declare vectors containing the lower and upper bounds on x
 adept::Vector x_lower, x_upper;
 // Set them to the minimum and maximum possible values for their element type (e.g. double)
 adept::minimizer_initialize_bounds(nx, x_lower, x_upper);
 // Set a lower bound on element 0 and both bounds on element 1
 x_lower(0) = 0.0;
 x_lower(1) = 10.0;
 x_upper(1) = 20.0;
 // Call the minimize function with two extra arguments specifying the bounds
 status = minimizer.minimize(quadratic_function, x, x_lower, x_upper);
\end{lstlisting}

% minimizer.set_max_iterations(100);
% minimizer.set_converged_gradient_norm(0.1);

\section{Other member functions of the \code{Minimizer} class}
\label{sec:minimizer_options}
In addition to the \code{minimize} member function described in
section \ref{sec:minimizer_interface}, the following
\code{adept::Minimizer} member functions may be called to configure
the behaviour of the minimization algorithm:
\begin{description}
\citem{void set\_algorithm(MinimizerAlgorithm algo)} Set the algorithm
to one of the available minimization algorithms,
e.g.\ \code{MINIMIZER\_ALGORITHM\_LEVENBERG}. This is an alternative
to providing it as an argument to the \code{Minimizer} constructor.
%
\citem{void set\_algorithm(const std::string\&\ algo)} Set the
algorithm using a string, which may be one of ``\code{L-BFGS}'',
``\code{Conjugate-Gradient}'', ``\code{Conjugate-Gradient-FR}'',
``\code{Levenberg}'' or ``\code{Levenberg-Marquardt}''.  Note that
this function is case-insensitive, and will also accept spaces or
underscores in place of hyphens.
%
\citem{void set\_max\_iterations(int max\_it)} Set the maximum number
of iterations (default 100).
%
\citem{void set\_converged\_gradient\_norm(Real cgn)} The L2-norm of
the $\partial J/\partial{\bf x}$ vector is computed each iteration,
and convergence is deemed to have been achieved when it falls below
the value specified here (default 0.1).
%
\citem{void set\_max\_step\_size(Real mss)} Set the maximum step size
each iteration (default: no maximum). A negative or zero value
indicates that no maximum step size is to be used.
%
\citem{void ensure\_updated\_state(int order = 2)} Often the user will
require the Hessian matrix to compute errors in the solution, and will
store the Hessian matrix each time the
\code{calc\_cost\_function\_gradient\_hessian} function is
called. However, by default there is no guarantee that when the
minimization has completed this function will have been called with the
final version of the state vector.  Calling the member function here
requests that after minimization is complete, the derivatives of at
least the specified \code{order} are consistent with the final state
vector (e.g.\ 2 for both the Hessian and the gradient vector).
%
\citem{set\_max\_line\_search\_iterations(int mi)} Set the maximum
number of iterations to perform in a line search (default 10). The
same value is used by the Conjugate-Gradient and L-BFGS methods.
%
\citem{set\_armijo\_coeff(Real ac)} The first of the Wolfe conditions
determines how much of a decrease in the cost function is satisfactory
for a line search to complete, controlled by the Armijo coefficient
(default $10^{-4}$). The same value is used by the Conjugate-Gradient
and L-BFGS methods.
%
\citem{set\_lbfgs\_curvature\_coeff(Real lcc)} The second Wolfe
condition is that the magnitude of the gradient in the search
direction is reduced by a certain amount determined by the curvature
coefficient, the optimum value of which is different for the
Conjugate-Gradient and L-BFGS methods. The default for the L-BFGS
method is 0.9.
%
\citem{set\_cg\_curvature\_coeff(Real cgcc)} The curvature coefficient
to use for the Conjugate-Gradient method (default 0.1).
%
\citem{void set\_levenberg\_damping\_limits(Real damp\_min, Real
  damp\_max)} Set the minimum and maximum positive values of the
damping parameter $\lambda$ used by both the Levenberg and
Levenberg-Marquardt algorithms (default $1/128$ and
$10^5$). Internally the algorithm can still use zero when each
iteration is reducing the cost function.
%
\citem{void set\_levenberg\_damping\_start(Real damp\_start)} Set the
initial damping factor for the Levenberg and Levenberg-Marquardt
algorithms (default 0).
%
\citem{void set\_levenberg\_damping\_restart(Real damp\_restart)} Set
the value of the damping factor $\lambda$ in the Levenberg and
Levenberg-Marquardt algorithms that is used when a value of
$\lambda=0$ does not result in the cost function being reduced
(default $1/4$).
%
\citem{void set\_levenberg\_damping\_multiplier(Real damp\_multiply,
  Real damp\_divide)} Set the multiplier and divider that will be used
to scale the damping factor when an iteration does not and does reduce
the cost function, respectively (default 2.0 and 5.0).
\end{description}
The following member functions return the minimizer algorithm that the
\code{Minimizer} is currently configured to use:
\begin{description}
\citem{MinimizerAlgorithm algorithm()} Return the enumeration
representing the minimization algorithm.
\citem{std::string algorithm\_name()} Return a string representing the
minimization algorithm.
\end{description}
The following member functions extract information about the progress
of the minimization after it has completed:
\begin{description}
\citem{int n\_iterations()} Return the number of iterations performed.
Only iterations that successfully reduced the cost function are
counted.
%
\citem{int n\_samples()} Return the number of times the cost
function was computed, including times when this did not reduce the
cost function.
%
\citem{Real cost\_function()} Return the final value of the cost
function.
%
\citem{Real gradient\_norm()} Return the final value of the norm of
the $\partial J/\partial{\bf x}$ vector.
%
\citem{Real start\_cost\_function()} Return the cost function for the
first guess of the state vector provided by the user.
%
\citem{MinimizerStatus status()} Return the convergence status.
\end{description}


\section{Return status for minimization}
\label{sec:minimizer_status}
The following enumerations may be returned by \code{Minimizer}'s
\code{minimize} member function representing the status of the
minimization. The \code{adept::minimizer\_status\_string} function
converts a status to a user-readable string, as demonstrated
in one of the examples in section \ref{sec:minimizer_interface}.
\begin{description}
\citem{MINIMIZER\_STATUS\_SUCCESS} Minimization was successful.
%
\citem{MINIMIZER\_STATUS\_EMPTY\_STATE}  The state vector provided is empty.
%
\citem{MINIMIZER\_STATUS\_MAX\_ITERATIONS\_REACHED} The maximum number
of iterations was reached.
%
\citem{MINIMIZER\_STATUS\_FAILED\_TO\_CONVERGE} Convergence was not
achieved, even though some progress may have been made in minimizing
the cost function. This usually occurs when, in the vicinity of the
minimum, the $J({\bf x})$ terrain is quite flat and numerical errors
mean that the gradient returned from the user-supplied function does
not point uphill as it should.  This means that when the algorithm
uses the gradient to try to go downhill it finds the cost function
increasing.
%
\citem{MINIMIZER\_STATUS\_INVALID\_COST\_FUNCTION} The cost function
returned is NaN or infinity. This is usually solved by using bounded
minimization to ensure that the values of ${\bf x}$ are kept within
physically reasonable bounds.
\citem{MINIMIZER\_STATUS\_INVALID\_GRADIENT} The gradient vector
returned contains NaN or infinity values. Use bounded minimization.
%
\citem{MINIMIZER\_STATUS\_INVALID\_BOUNDS} The bounds requested are
not valid, for instance a maximum bound was requested that is less
than the minimum bound.
%\citem{MINIMIZER\_STATUS\_NUMBER\_AVAILABLE}
%\citem{MINIMIZER\_STATUS\_NOT\_YET\_CONVERGED
\end{description}


\chapter{General considerations}
\label{chap:gen}

\section{Setting and checking the global configuration}
\label{sec:settings}
\noindent The following non-member functions are provided in the
\code{adept} namespace:
\begin{description}
\citem{std::string version()} Returns a string containing the version
number of the \Adept\ library (e.g. ``\code{2.0.8}'').
\citem{std::string compiler\_version()} Returns a string containing
the compiler name and version used to compile the \Adept\ library.
\citem{std::string compiler\_flags()} Returns a string containing the
compiler flags used when compiling the \Adept\ library.
\citem{std::string configuration()} Returns a multi-line string
listing numerous aspects of the way \Adept\ has been configured.
\citem{bool have\_matrix\_multiplication()} Returns \code{true} if the
Adept library has been compiled with BLAS support, \code{false}
otherwise.
\citem{bool have\_linear\_algebra()} Returns \code{true} if the
Adept library has been compiled with LAPACK support, \code{false}
otherwise.
\citem{int set\_max\_blas\_threads(int n)} Set the maximum number of
threads used for matrix operations by the BLAS library, or zero to use
the upper limit on your system. The number returned is the number
actually used.  
\citem{int max\_blas\_threads()} Return the maximum number of
threads available for matrix operations by the BLAS library.
%
\end{description}

The preprocessor can detect the \Adept\ version at compile-time via the
\code{ADEPT\_VERSION} preprocessor variable, which is an integer
variable with the digits $abbcc$ corresponding to \Adept\ version
$a.bb.cc$. This could be used to activate a different compile path
dependent on the version, or even to fail to compile if the version is
not recent enough:
\begin{lstlisting}
 #if ADEPT_VERSION < 10910
 #error "Adept >= 1.9.10 is required by this program"
 #endif
\end{lstlisting}

\section{Parallelizing \Adept\ programs}
\Adept\ currently has limited built-in support for parallelization. If
the algorithms that you wish to differentiate are individually small
enough to be treated by a single processor core, and you wish to
differentiate multiple algorithms independently (or the same algorithm
but with multiple sets of inputs) then parallelization is
straightforward. This is because the global variable containing a
pointer to the \Adept\ stack uses thread-local storage.  This means
that if a process spawns multiple threads (e.g.\ using OpenMP or
Pthreads) then each thread can declare one \code{adept::Stack} object
and all \code{adouble} operations will result in statements being
stored on the stack object specific to that thread.  The
\Adept\ package contains a test program \code{test\_thread\_safe} that
demonstrates this approach in OpenMP.

If your problem is larger and you wish to use parallelism to speed-up
the differentiation of a single large algorithm then the built-in
support is more limited. Provided your program and the \Adept\ library
were compiled with OpenMP enabled (which is the default for the
\Adept\ library if your compiler supports OpenMP), the computation of
Jacobian matrices will be parallelized.  By default, the maximum
number of concurrent threads will be equal to the number of available
cores, but this can be overridden with the
\code{set\_max\_jacobian\_threads} member function of the \code{Stack}
class.  Note that the opportunity for speed-up depends on the size of
your Jacobian matrix: for an $m\times n$ matrix, the number of
independent passes through the stored data is $\mathrm{min}(m,n)$ and
each thread treats \code{ADEPT\_MULTIPASS\_SIZE} of them (see section
\ref{sec:configuring_lib}), so the maximum number of threads that can
be exploited is $\mathrm{min}(m,n)/$\code{ADEPT\_MULTIPASS\_SIZE}.
Again, the \code{test\_thread\_safe} program can demonstrate the
parallelization of Jacobian calculations.  Note, however, that if the
\code{jacobian} function is called from within an OpenMP thread
(e.g.\ if the program already uses OpenMP with each thread containing
its own \code{adept::Stack} object), then the program is likely not to
be able to spawn more threads to assist with the Jacobian calculation.

If you need Jacobian matrices then the ability to parallelize the
calculation of them is useful since this tends to be more
computationally costly than recording the original algorithm.  If you
only require the tangent-linear or adjoint calculations (equivalent to
a Jacobian calculation with $n=1$ or $m=1$, respectively), then
unfortunately you are stuck with single threading. It is intended that
a future version of \Adept\ will enable all aspects of differentiating
an algorithm to be parallelized with either or both of OpenMP and MPI.

If your BLAS library has support for parallelization then be aware
that the performance may be poor if other parts of the program are
parallelized.  This occurs with OpenBLAS, which uses Pthreads, if you
also use parallelized Jacobian calculations, which use OpenMP.  In
this instance you can turn off parallelization of array operations
with the \code{set\_max\_blas\_threads(1)} function in the
\code{adept} namespace.  The number of available threads for array
operations is returned by the \code{max\_blas\_threads()} function.
Alternatively, you can use the \code{OPENBLAS\_NUM\_THREADS}
environment variable to control the number of threads used by
OpenBLAS, and the \code{OMP\_NUM\_THREADS} environment variable to
control the number used in Jacobian calculations.


\section{The fast exponential function}
\label{sec:fastexp}
\Adept\ was originally developed for algorithms that make frequent
calls to the exponential function \code{exp}, but unfortunately most
compilers do not vectorize \code{exp}.  Therefore, \Adept\ provides
the function \code{fastexp} in the \code{adept} namespace, which can
operate on active and passive scalars and array arguments (including
the simple \code{float} and \code{double}) just like \code{exp}.  It
uses an adapted form of an algorithm from Agner Fog's Vector Class
Library (VCL) that is around a third faster for scalar arguments, but
can be vectorized making it as much as 10 times faster when applied to
\Adept\ arrays depending on the instruction set available.  It is
accurate but not bit-reproducible with \code{exp} and produces finite
results for a slightly smaller range of input values: from $-87.3$ to
$+89.0$ for \code{float} arguments and from $-708.39$ to $+709.70$ for
\code{double} arguments.

If you have an existing code that calls \code{exp} with \Adept\ types
as arguments, and wish to use the faster algorithm for all of them,
then simply compile your code with \code{-DADEPT\_FAST\_EXPONENTIAL}.
This will not change the behaviour of \code{exp} for other types of
arguments, which would typically use the version from the C++ standard
library. If you compile your code with
\code{-DADEPT\_FAST\_SCALAR\_EXPONENTIAL} then a fast exponential
function \code{adept::exp} will be defined that works on arguments of
type \code{float} and \code{double}.  However, this can cause a
namespace clash as some C header files import \code{exp} from the
standard library outside of any namespace.

\section{Tips for the best performance}
\label{sec:tips}
\begin{itemize}
\item If you are working with single-threaded code, or in a
  multi-threaded program but with only one thread using a Stack
  object, then you can get slightly faster code by compiling all of
  your code with \code{-DADEPT\_STACK\_THREAD\_UNSAFE}. This uses a
  standard (i.e. non-thread-local) global variable to point to the
  currently active stack object, which is slightly faster to access.
\item If you compile with the \code{-g} option to store debugging
  symbols, your object files and executable will be much larger
  because every mathematical statement in the file will have the name
  of its associated templated type stored in the file, and these names
  can be long. Once you have debugged your code, you may wish to omit
  debugging symbols from production versions of the executable, or
  reduce the level of detail with \code{-g1} (on the GNU C++
  compiler).  There is typically no performance penalty associated
  with including debugging symbols.
\item A high compiler optimization setting is recommended to inline
  the function calls associated with mathematical expressions.  On the
  GNU C++ compiler, the \code{-O3 -march=native} setting is
  recommended.
\item As outlined in the previous section, if you use the \code{exp}
  function then you can replace calls to it with the faster
  \code{fastexp} function, or compile your code with
  \code{-DADEPT\_FAST\_EXPONENTIAL}.
\item On Intel and ARM architectures, \Adept\ will use the SSE2, AVX,
  AVX512 or NEON instruction sets (depending on availability) to
  vectorize array expressions that satisfy a number of requirements:
  (1) they contain only elementary mathematical operators (including
  the functions \code{sqrt}, \code{max}, \code{min} and
  \code{fastexp}), (2) the arrays in the expression are either all of
  type \code{float} or all of type \code{double}, (3) all the arrays
  in the expression must have their final dimension increasing in
  memory with no stride, and (4) none of the arrays are active. On the
  GNU compiler the \code{-march=native} selects the best available
  instruction set, but you can select a specific set with
  \code{-msse2}, \code{-mavx} or \code{-mavx512f}. With the SSE2 and
  NEON instruction sets, 2 \code{double}s or 4 \code{float}s are
  operated on at once, for AVX these rise to 4 and 8 respectively, and
  for AVX512 they rise to 8 and 16 respectively.
\item By default the Jacobian functions are compiled to process a
  strip of rows or columns of the Jacobian matrix at once. The optimum
  width of the strip depends on your platform, and you may wish to
  change it. To make the Jacobian functions process \textit{n} rows or
  columns at once, recompile the \Adept\ library with
  \code{-DADEPT\_MULTIPASS\_SIZE=}\textit{n}.
\item If you suspect memory usage is a problem, you may investigate
  the memory used by \Adept\ by simply sending your \code{Stack} object to a
  stream, e.g. ``\code{std::cout \textless\textless\ stack}''. You may
  also use the \code{memory()} member function, which returns the
  total number of bytes used. Further details of similar functions are
  given in section \ref{sec:stack}.
\end{itemize}

\section{Exceptions thrown by the \Adept\ library}
\label{sec:exceptions}
Some functions in the \Adept\ library can throw exceptions, and the
exceptions that can be thrown are typically derived from either
\code{adept::autodiff\_exception} or
\code{adept::array\_exception}. These classes are derived from
\code{adept::exception}, which is itself derived from
\code{std::exception}. Most indicate an error in the user's code,
usually associated with calling \Adept\ functions in the wrong order.

An overly comprehensive exception-catching implementation that takes
different actions depending on whether a specific \Adept\ exception,
an exception related to automatic differentiation, a general
\Adept\ exception, or a non-\Adept\ exception is thrown, could have
the following form:
%
\begin{lstlisting}
 try {
   adept::Stack stack;
   // ... Code using the Adept library goes here ...
 }
 catch (adept::stack_already_active& e) {
   // Catch a specific Adept exception
   std::cerr << "Error: " << e.what() << std::endl;
   // ... any further actions go here ...
 }
 catch (adept::autodiff_exception& e) {
   // Catch any Adept exception related to automatic differentiation not yet caught
   std::cerr << "Error: " << e.what() << std::endl;
   // ... any further actions go here ...
 }
 catch (adept::exception& e) {
   // Catch any other Adept exception not yet caught
   std::cerr << "Error: " << e.what() << std::endl;
   // ... any further actions go here ...
 }
 catch (...) {
   // Catch any exceptions not yet caught
   std::cerr << "An error occurred" << std::endl;
   // ... any further actions go here ...
 }
\end{lstlisting}
%
All exceptions implement the \code{what()} member function, which
returns a \code{const char*} containing an error message. 

\subsection{General exceptions}
The following exceptions are not specific to arrays or automatic
differentiation and inherit directly from \code{adept::exception}:
\begin{description}
\citem{feature\_not\_available} This exception is thrown by deprecated
functions, such as \code{Stack::start()}. It is also thrown by
functions that are not available because a certain library is not
being used, such as \code{inv} if \Adept\ was compiled without LAPACK
support, or matrix multiplication via the `\code{**}' pseudo-operator
if \Adept\ was compiled without BLAS support.
\end{description}

\subsection{Automatic-differentiation exceptions}
The following exceptions relate to automatic differentiation (the
functionality described in chapter \ref{chap:ad}), and all are in the
\code{adept} namespace:
\begin{description}
\citem{gradient\_out\_of\_range} This exception can be thrown by the
\code{adouble::get\_gradient} member function if the index to its
gradient is larger than the number of gradients stored.  This can
happen if the \code{adouble} object was created after the first
\code{adouble::set\_gradient} call since the last
\code{Stack::new\_recording} call. The first
\code{adouble::set\_gradient} call signals to the \Adept\ stack that
the main algorithm has completed and so memory can be allocated to
store the gradients ready for a forward or reverse pass through the
differential statements. If further \code{adouble} objects are created
then they may have a gradient index that is out of range of the memory
allocated.
%
\citem{gradients\_not\_initialized} This exception can be thrown by
functions that require the list of working gradients to have been
initialized (particularly the functions
\code{Stack::compute\_tangent\_linear} and
\code{Stack::compute\_adjoint}). This initialization occurs when
\code{adouble::set\_gradient} is called.
%
\citem{stack\_already\_active} This exception is thrown when an
attempt is made to make a particular \code{Stack} object ``active'',
but there already is an active stack in this thread. This can be
thrown by the \code{Stack} constructor or the \code{Stack::activate}
member function.
%
\citem{dependents\_or\_independents\_not\_identified} This exception
is thrown when an attempt is made to compute a Jacobian but the
independents and/or dependents have not been identified.
%
\citem{wrong\_gradient} This exception is thrown by the
\code{adouble::append\_derivative\_dependence} member function if the
\code{adouble} object that it is called from is not the same as that
of the most recent \code{adouble::add\_derivative\_dependence} call.
%
\citem{non\_finite\_gradient} This exception is thrown if the user's
code is compiled with the preprocessor variable
\code{ADEPT\_TRACK\_NON\_FINITE\_GRADIENTS} defined, and a
mathematical operation is carried out for which the derivative is not
finite. This is useful to locate the source of non-finite derivatives
coming out of an algorithm.
\end{description}

\subsection{Array exceptions}
\label{sec:array_exceptions}
The following exceptions relate to arrays (the functionality described
in chapter \ref{chap:arrays}), and all are in the \code{adept}
namespace:
\begin{description}
\citem{size\_mismatch} A mathematical operation taking two arguments
has been applied to array expressions that are not of the same
size. The same exception is thrown if an array expression is applied
to an array of a different size.
\citem{inner\_dimension\_mismatch} Matrix multiplication has been
attempted with arrays whose inner dimensions don't agree.
\citem{empty\_array} An empty array has been used in an operation when
a non-empty array is required; for example, if an attempt is made to
link an array to an empty array (see section \ref{sec:array} for more
information on linking).
\citem{invalid\_dimension} Attempt to create an array with a negative
dimension.
\citem{index\_out\_of\_bounds} An element or range of elements has
been requested from an array but one of the indices provided is out of
range; for a dimension of length $n$, the index is not in the range
$0$ to $n-1$. Note that bounds checking is only applied if the
preprocessor variable \code{ADEPT\_BOUNDS\_CHECKING} is defined.
%\citem{invalid\_lvalue}
\citem{invalid\_operation} An invalid operation has been performed
that can only be detected at run-time, for example, calling the
\code{diag\_submatrix} member function of a non-square rank-2
\code{Array}.
\citem{matrix\_ill\_conditioned} An attempt has been made to factorize
an ill-conditioned matrix (either via \code{solve} or \code{inv}).
\citem{fortran\_interoperability\_error} An attempt has been made to
associate an \Adept\ \code{Array} with a \code{FortranArray} of the
wrong rank or type.
\end{description}

\section{Configuring the behaviour of \Adept}
\label{sec:configuring}
The behaviour of the \Adept\ library can be changed by defining one or
more of the \Adept\ preprocessor variables. This can be done either by
editing the \code{adept/base.h} file and uncommenting the relevant
\code{\#define} lines, or by compiling your code with \code{-Dxxx}
compiler options (replacing \code{xxx} by the relevant preprocessor
variable). There are two types of preprocessor variable: the first
type applies only to the compilation of user code, while the second
type requires the \Adept\ library to be recompiled.

\subsection{Modifications not requiring a library recompile}
\label{sec:configuring_no_lib}
The preprocessor variables that apply only to user code and do not
require the \Adept\ library to be recompiled are as follows:
\begin{description}
\citem{ADEPT\_STACK\_THREAD\_UNSAFE} If this variable is defined, the
currently active stack is stored as a global variable but is not
defined to be ``thread-local''. This is slightly faster, but means
that you cannot use multi-threaded code with separate threads holding
their own active \code{Stack} object. Note that although defining this
variable does not require a library recompile, all source files that
make up a single executable must be compiled with this option (or all
not be).
%
\citem{ADEPT\_RECORDING\_PAUSABLE} This option enables an algorithm to
be run both with and without automatic differentiation from within the
same program via the functions \code{Stack::pause\_recording()} and
\code{Stack::continue\_recording()}.  Note that although defining this
variable does not require a library recompile, all source files that
make up a single executable must be compiled with this option (or all
not be). Further details on this option are provided in section
\ref{sec:pausable}.
%
\citem{ADEPT\_NO\_AUTOMATIC\_DIFFERENTIATION} This option turns off
automatic differentiation by treating \code{adouble} objects as
\code{double}. It is useful if you want to compile one source file
twice to produce versions with and without automatic
differentiation. Further details on this option are provided in
section \ref{sec:multipleobjects}.
%
\citem{ADEPT\_TRACK\_NON\_FINITE\_GRADIENTS} Often when an algorithm
is first converted to use an operator-overloading automatic
differentiation library, the gradients come out as Not-a-Number or
Infinity. The reason is often that the algorithm contains operations
for which the derivative is not finite (e.g.\ $\sqrt{a}$ for $a=0$),
or constructions where a non-finite value is produced but subsequently
made finite (e.g.\ $\exp(-1.0/a)$ for $a=0$). Usually the algorithm
can be recoded to avoid these problems, if the location of the
problematic operations can be identified. By defining this
preprocessor variable, a \code{non\_finite\_gradient} exception will
be thrown if any operation results in a non-finite derivative. Running
the program within a debugger (and ensuring that the exception is not
caught within the program) enables the offending line to be
identified.
%
\citem{ADEPT\_INITIAL\_STACK\_LENGTH} This preprocessor variable is
set to an integer, and is used as the default initial amount of memory
allocated for the recording, in terms of the number of statements and
operations.
%
\citem{ADEPT\_REMOVE\_NULL\_STATEMENTS} If many variables in your code
are likely to be zero then redundant operations will be added to the
list of differential statements. For example, the assignment
$a=b\times c$ with active variables $b$ and $c$ both being zero
results in the differential statement $\delta a=0\times\delta
b+0\times\delta c$. This preprocessor variable checks for zeros and
removes terms on the right-hand-side of differential statements if it
finds them. In this case it would put $\delta a=0$ on the stack
instead. This option slows down the recording stage, but speeds up the
subsequent use of the recorded stack for adjoint and Jacobian
calculations. The speed up of the latter is only likely to exceed the
slow down of the former if your code contains many zeros. For most
codes, this option causes a net slow down.
%
\citem{ADEPT\_COPY\_CONSTRUCTOR\_ONLY\_ON\_RETURN\_FROM\_FUNCTION} In
\Adept\ 1.1 this enabled a small but unsafe optimization. It now has
no effect.
%
\citem{ADEPT\_BOUNDS\_CHECKING} If this variable is defined, check
that all array indices are within the bounds of the array throwing an
\code{index\_out\_of\_bounds} exception if necessary.  If this
variable is not defined then these checks are not performed, which is
faster but means that attempts to access arrays out of bounds will
result in either corruption of other memory used by the process, or a
segmentation fault. 
\citem{ADEPT\_NO\_ALIAS\_CHECKING} This variable turns off alias
checking, which results in faster code, but may lead to unexpected
results if the right-hand-side of an array statement shares data with
the left-hand-side of the expression. If this is likely for a
particular statement then use the \code{eval} function, described in
section \ref{sec:bounds}.
\citem{ADEPT\_NO\_DIMENSION\_CHECKING} This variable turns off
checking that the dimensions match when an array expression is
assigned to another array.
\citem{ADEPT\_STORAGE\_THREAD\_SAFE} This variable ensures that
accesses to the reference counter in \code{Storage} objects are
atomic, enabling the \code{Array} and \code{SpecialMatrix} objects
that use them to be accessed safely in a multi-threaded
environment. Note that this may incur a performance penalty, and is
only available in C++11. See section \ref{sec:thread}.
\citem{ADEPT\_INIT\_REAL\_SNAN} To detect errors caused by use of
uninitialized data, initialize floating point arrays and active
scalars with signaling NaNs.  This is typically accompanied by
directing the program to fail with a floating-point exception if a NaN
is used in an expression, achieved by adding the following to one of
the program source files:
\begin{lstlisting}
 #include <fenv.h>
 int _feenableexcept_status = feenableexcept(FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW);
\end{lstlisting}
It should then be possible to use a debugger to identify the array
that was read before being initialized with real numbers.
\end{description}

\subsection{Modifications requiring a library recompile}
\label{sec:configuring_lib}
\noindent The preprocessor variables that require the \Adept\ library
to be recompiled are as follows. Note that if these variables are used
they must be the same when compiling both the library and the user
code. This is safest to implement by editing section 2 of the
\code{adept/base.h} header file.
\begin{description}
\citem{ADEPT\_REAL\_TYPE\_SIZE} If you want to compile \Adept\ to use
a precision other than double for the \code{Real} type, and hence for
automatic differentiation, then define this preprocessor variable to
be \code{4} (for \code{float}), \code{8} (for \code{double}) or
\code{16} (for \code{long double}). This will also change the default
floating-point type for arrays, including shortcuts such as
\code{Vector}, \code{Matrix}, \code{SymmMatrix}. Note that if you
specify \code{16} but your compiler cannot support it
(i.e.\ \code{sizeof(long double)==8}) then \Adept\ would produce
sub-optimal code so will fail to compile.
%
\citem{ADEPT\_STACK\_STORAGE\_STL} Use the C++ standard template
library \code{vector} or \code{valarray} classes for storing the
recording and the list of gradients, rather than dynamically allocated
arrays. In practice, this tends to slow down the code.
%
\citem{ADEPT\_MULTIPASS\_SIZE} This is set to an integer, invariably a
power of two, specifying the number of rows or columns of a Jacobian
that are calculated at once. The optimum value depends on the platform
and the capability of the compiler to optimize loops whose length is
known at compile time.
% 
\citem{ADEPT\_MULTIPASS\_SIZE\_ZERO\_CHECK} This is also set to an
integer; if it is greater than \codebf{ADEPT\_MULTIPASS\_SIZE}, then
the \code{Stack::jacobian\_reverse} function checks gradients are
non-zero before using them in a multiplication.
%
\citem{ADEPT\_THREAD\_LOCAL} This can be used to specify the way that
thread-local storage is declared by your compiler.  Thread-local
storage is used to ensure that the \Adept\ library is thread-safe. By
default this variable is not defined initially, and then later in
\code{adept/base.h} it is set to an appropriate value on your system:
\code{thread\_local} if you compile with the C++11 standard, otherwise
\code{\_\_declspec(thread)} on Microsoft Visual C++, an empty
declaration on Mac (since thread-local storage is not available on
many Mac platforms) and \code{\_\_thread} otherwise (appropriate for
at least the GNU, Intel, Sun and IBM compilers). To override the
default behaviour, define this variable yourself in
\code{adept/base.h}.
\end{description}

\section{Frequently asked questions}
\label{sec:faq}
\begin{description}
\item[Why are all the gradients coming out of the automatic
  differentiation zero?] You have almost certainly omitted or
  misplaced the call of the \code{adept::Stack} member function
  ``\code{new\_recording()}''. It should be placed \emph{after} the
  independent variables in the algorithm have been initialized, but
  before any subsequent calculations are performed on these
  variables. If it is omitted or placed before the point where the
  independent variables are initialized, the differential statements
  corresponding to this initialization (which are all of the form
  $\delta x=0$), will be placed in the list of differential statements
  and will unhelpfully set to zero all your gradients right at the
  start of a forward pass (resulting from a call to \code{forward()})
  or set them to zero right at the end of a reverse pass (resulting
  from a call to \code{reverse()}).
\item[Why are the gradients coming out of the automatic
  differentiation NaN or Inf (even though the value is correct)?] This
  can occur if the algorithm contains operations for which the
  derivative is not finite (e.g.\ $\sqrt{a}$ for $a=0$), or
  constructions where a non-finite value is produced but subsequently
  made finite (e.g.\ $\exp(-1.0/a)$ for $a=0$). Usually the algorithm
  can be recoded to avoid these problems, if the location of the
  problematic operations can be identified. The simplest way to locate
  the offending statement is to recompile your code with the \code{-g}
  option and the \code{ADEPT\_TRACK\_NON\_FINITE\_GRADIENTS}
  preprocessor variable set (see section
  \ref{sec:configuring_no_lib}). Run the program within a debugger and
  a \code{non\_finite\_gradient} exception will be thrown, which if
  not caught within the program will enable you to locate the line in
  your code where the problem originated.  You may need to turn
  optimizations off (compile with \code{-O0}) for the line
  identification to be accurate. You can also turn on trapping of
  floating-point exceptions, as explained in the description of the
  \code{ADEPT\_INIT\_REAL\_SNAN} option in section
  \ref{sec:configuring_no_lib}.
\item[Why are the gradients coming out of the automatic
  differentiation wrong?] Before suspecting a bug in \Adept, note that
  round-off error can lead to incorrect gradients even in hand-coded
  differential code. Consider the following:
\begin{lstlisting}
 int main() {
   Stack stack;
   adouble a = 1.0e-26, b;
   stack.new_recording();
   b = sin(a) / a;
   b.set_gradient(1.0);
   stack.compute_adjoint();
   std::cout << "a=" << a << ", b=" << b << ", db/da=" << a.get_gradient() << "\n";
 }
\end{lstlisting}
  We know that near \code{a=0} we should have \code{b=1} and the
  gradient should be \code{0}.  But running the program above will
  give a gradient of \code{1.71799e+10}. If you hand-code the
  gradient, i.e.
\begin{lstlisting}
 double A = 1.0e-26;
 double dB_dA = cos(A)/A - sin(A) / (A*A);
\end{lstlisting}
  you will also get the wrong gradient.  You can see that the
  answer is the difference of two very large numbers and so subject to
  round-off error.  This example is therefore not a bug of \Adept, but
  a limitation of finite-precision machines.  To check this, try
  compiling your code using either the ADOL-C or CppAD automatic
  differentiation tools; I have always found these tools to give
  exactly the same gradient as \Adept. Unfortunately, round-off error
  can build up over many operations to give the wrong result, so there
  may not be a simple solution in your case.
\item[Can \Adept\ reuse a stored tape for multiple runs of the same
  algorithm but with different inputs?] No. \Adept\ does not store the
  full algorithm in its stack (as ADOL-C does in its tapes, for
  example), only the derivative information.  So from the stack alone
  you cannot rerun the function with different inputs.  However,
  rerunning the algorithm including recomputing the derivative
  information is fast using \Adept, and is still faster than libraries
  that store enough information in their tapes to enable a tape to be
  reused with different inputs.  It should be stressed that for any
  algorithm that includes different paths of execution (``if''
  statements) based on the values of the inputs, such a tape would
  need to be rerecorded anyway. This includes any algorithm containing
  a look-up table.
\item[Why does my code crash with a segmentation fault?] This means it
  is trying to access a memory address not belonging to your program,
  and the first thing to do is to run your program in a debugger to
  find out at what point in your code this occurs. If it is in the
  \code{adept::aReal} constructor (note that \code{aReal} is synonymous with
  \code{adouble}), then it is very likely that you have tried to
  instantiate an \code{adept::adouble} object before instantiating an
  \code{adept::Stack} object. As described in section
  \ref{sec:stack_setup}, there are good reasons why you need to
  initialize the \code{adept::Stack} object first.
\item[How can I interface \Adept\ with a matrix library such as
  Eigen?]  Unfortunately the use of expression templates in
  \Adept\ means that it does not work optimally (if it works at all)
  with third-party matrix libraries that use expression
  templates. This is the reason why Adept 2.0 combines array
  functionality with automatic differentiation in a single
  expression-template framework.
\item[Do you have plans to enable \Adept\ to produce Hessian
  matrices?]  Not in the near future as this is a huge change.
  However, if your objective function $J(\x)$ (also known as a cost
  function or penalty function) has a specific form then the
  approximate Hessian matrix can be computed from the Jacobian matrix,
  as described in chapter \ref{chap:optimize}.
\item[Why doesn't the ternary operator work?] Some compilers will fail
  to compile the following function:
\begin{lstlisting}
 adept::adouble piecewise(adept::adouble x) {
   return x < 1.0 ? x*x : 2.0*x-1.0;
 }
\end{lstlisting}%
The reason is that these compilers require that the two possible
outcomes of the ternary operator have the same type, but due to the
use of expression templates, the types of these mathematical
expressions are actually different.  The ternary operator cannot be
overloaded to allow such arguments. The solution is to explicitly
convert the outcomes to \code{adouble}:
\begin{lstlisting}
 adept::adouble piecewise(adept::adouble x) {
   return x < 1.0 ? adept::adouble(x*x) : adept::adouble(2.0*x-1.0);
 }
\end{lstlisting}
\item[Why is my executable so huge?]  Probably you are including
  debugging symbols by compiling with the \code{-g} option. Expression
  templates need long strings to describe them, so this extra content
  can increase the size of object files and executables by a factor of
  ten.  This does not slow down execution, but for production code you
  may wish to compile without debugging symbols, or if you use the GNU
  compiler use instead the \code{-g1} option which stores a reduced
  amount of debugging information.
\item[Why do I get incorrect behaviour when I use the ``\code{auto}''
  keyword?] Since C++11, many programmers make widespread use of
  \code{auto} as the type of a local object that can be inferred from
  its initializer. This is problematic for most expression-template
  libraries, including \Adept, because expressions are not evaluated
  immediately.  For example, dividing one \code{aReal} object by
  another returns an
  \code{adept::internal::BinaryOperation<Real,aReal,Divide,aReal>}
  object, and the division is only performed (and differentiated) when
  this object is assigned to an \code{aReal} object. The \code{auto}
  keyword will be interpreted as the type of the internal object, but
  this internal object may contain references to temporary objects
  that make up the other parts of the expression, and which go out of
  scope after the semi-colon at the end of the \code{auto} statement.
  In this example the correct behaviour is obtained by replacing
  \code{auto} with \code{aReal}.  Never use the \code{auto} keyword
  when initializing an object from an \Adept\ expression.
\end{description}
\section{Copyright and license for \Adept\ software}
\label{sec:license}
Versions 1.9 of \Adept\ and later are owned and copyrighted jointly by
the University of Reading and the European Centre for Medium Range
Weather Forecasts. The copyright to versions 1.1 and earlier is held
solely by the University of Reading.

Since version 1.1, the \Adept\ library is released under the Apache
License, Version 2.0, which is available at
\url{http://www.apache.org/licenses/LICENSE-2.0}.  In short, this
free-software license permits you to use the library for any purpose,
and to modify it and combine it with other software to form a larger
work.  If you choose, you may release the modified software in either
source code or object code form, so may use \Adept\ in both
open-source software and non-free proprietary software. However,
distributed versions must retain copyright notices and also distribute
both the information in the NOTICES file and a copy of the Apache
License.  Different license terms may be applied to your distributed
software, although they must include the conditions on redistribution
provided in the Apache License.  This is just a short summary; if in
doubt, consult the text of the license.

In addition to the legally binding terms of the license, it is
\emph{requested} that:
\begin{itemize}
\item You cite \cite{Hogan2014} in publications describing algorithms
  and software that make use of the \Adept\ library. While not a
  condition of the license, this is good honest practice in science
  and engineering.
\item If you make modifications to the \Adept\ library that might be
  useful to others, you release your modifications under the terms of
  the Apache License, Version 2.0, so that they are available to
  others and could also be merged into a future official version of
  \Adept. If you do not state the license applied to your
  modifications then by default they will be under the terms of the
  Apache License. You will retain copyright of your modifications, but
  if your modifications are written in the course of employment then
  under almost all circumstances (including employment by a
  University) it is your employer who holds the copyright.  Therefore
  you should obtain permission from them to release your modifications
  under the Apache License.
\end{itemize}

Note that other source files in the \Adept\ package used for
demonstrating and benchmarking \Adept\ are released under the GNU
all-permissive license\footnote{The GNU all-permissive license reads:
  \emph{Copying and distribution of this file, with or without
    modification, are permitted in any medium without royalty provided
    the copyright notice and this notice are preserved.  This file is
    offered as-is, without any warranty.}}, which is specified at the
top of all files it applies to.

\Adept\ version 1.0 was released under the terms of the GNU General
Public License (GPL) and so could not be released as part of a larger
work unless the entire work was released under the conditions of the
GPL.  It is hoped that the switch to the Apache License will
facilitate wider use of \Adept.

\section*{Acknowledgments}
Adept 1.0 was developed by Robin Hogan at the University of Reading
with funding from European Space Agency contract
40001041528/11/NL/CT. Some of the modifications to produce version 1.1
were funded by a National Centre for Earth Observation Mission Support
grant (Natural Environment Research Council grant NE/H003894/1). Dr
Brian Tse is thanked for his work exploring different parallelization
strategies during this period. Subsequent development has been carried
out under employment at the European Centre for Medium Range Weather
Forecasts.

\begin{thebibliography}{00}
\markright{References}
\harvarditem{Bell}{2007}{Bell2007}Bell, B., 2007: CppAD: A package for C++
algorithmic differentiation. \url{http://www.coin-or.org/CppAD}
% 
\harvarditem{Liu and Nocedal}{1989}{Liu+1989}Liu, D. C., and Nocedal,
  J., 1989: On the limited memory BFGS method for large scale
optimization. \emph{Math.\ Programming B,} {\bf 45,} 503--528.
%
\harvarditem{Gay}{2005}{Gay2005}Gay, D. M., 2005: Semiautomatic
differentiation for efficient gradient computations.  In
\emph{Automatic Differentiation: Applications, Theory, and
  Implementations}, H. M. B\"ucker, G. F. Corliss, P.  Hovland,
U. Naumann and B. Norris (eds.), Springer, 147--158.
%
\harvarditem{Griewank et~al.}{1996}{Griewank+1996}Griewank, A.,
  Juedes, D., and Utke, J., 1996:  Algorithm 755: ADOL-C: a package for the
automatic differentiation of algorithms written in C/C++. \textit{ACM
  Trans.\ Math.\ Softw.,} \textbf{22,} 131--167.
\harvarditem{Hogan}{2014}{Hogan2014}Hogan, R. J., 2014: Fast reverse-mode
  automatic differentiation using expression templates in
  C++. \textit{ACM Trans.\ Math.\ Softw.,} \textbf{40,} 26:1--26:16.
\harvarditem{Veldhuizen}{1995}{Veldhuizen1995}Veldhuizen, T., 1995:
Expression templates. {\it C++ Report,} {\bf 7,} 26--31.
\end{thebibliography}

\end{document}


================================================
FILE: doc/adept_reference.tex
================================================
\documentclass[10pt,a4,landscape]{article}
% Page set up
\setlength{\oddsidemargin}{-1cm} %{0.5cm}
\setlength{\evensidemargin}{-1cm} %{0.5cm}
\setlength{\topmargin}{-3cm}
%\setlength{\topmargin}{0cm}
%\setlength{\textheight}{24cm}
%\setlength{\textwidth}{16cm}
\setlength{\textheight}{19cm}
\setlength{\textwidth}{26cm}
\setlength{\marginparsep}{0.5cm}
\setlength{\marginparwidth}{0cm}
%\setlength{\parindent}{1em}
%\setlength{\parskip}{0.5ex}
\def\myvskip{\vskip 1ex}
\def\hangingpar{\parshape 2 0cm \linewidth 1ex \dimexpr\linewidth-1ex\relax}
\renewcommand{\baselinestretch}{1.05}
\sloppy
%\usepackage{multicol}
\usepackage{lmodern}\usepackage[T1]{fontenc}
\usepackage{color}
\usepackage[figuresright]{rotating}
\DeclareFontFamily{T1}{lmttc}{\hyphenchar \font-1 }
\DeclareFontShape{T1}{lmttc}{m}{n}
     {<-> ec-lmtlc10}{}
\DeclareFontShape{T1}{lmttc}{m}{it}
     {<->sub*lmttc/m/sl}{}
\DeclareFontShape{T1}{lmttc}{m}{sl}
     {<-> ec-lmtlco10}{}
%\def\myfont{\fontfamily{cmss}\fontseries{lmtt}\selectfont}
\def\myfont{\fontfamily{cmss}\selectfont}
\def\mysize{\footnotesize}
\def\mysize{\small}
\def\codeindent{\hspace{\tabcolsep}}
\setlength{\parindent}{0pt}
\def\code#1{\texttt{#1}}
\renewcommand{\rmdefault}{cmss}
\begin{document}
\pagestyle{empty}
\twocolumn
\mysize\myfont\section*{\Huge Adept Quick Reference}
%\section*{General}
All functions and types are placed in the \code{adept} namespace.
\subsection*{Header files}
\begin{tabular}{ll}
\code{adept.h} & Include if only scalar automatic differentiation is required\\
\code{adept\_arrays.h} & Include if array capabilities are needed as well\\
\code{adept\_fortran.h} & Interface to Fortran 2018 array descriptors\\
\code{adept\_optimize.h} & Minimization algorithms, e.g.\ Levenberg-Marquardt\\
\code{adept\_source.h} & Include entire Adept library, so linking to library not required \\
\end{tabular}

%\section*{Automatic differentiation functionality}
\subsection*{Scalar types}
\begin{tabular}{ll}
\code{Real} & Passive scalar type used for differentiation (usually
\code{double})\\
\code{aReal} & Active scalar of underlying type \code{Real} \\
\code{adouble}, \code{afloat} & Active scalars of underlying type
\code{double} and \code{float}\\
\end{tabular}
\subsection*{Basic reverse-mode workflow}
\begin{tabular}{ll}
\code{Stack stack;} & Object to store derivative information\\
\code{aVector x = \{1.0, 2.0\};} & Initialize independent (input) variables (C++11)\\
\code{stack.new\_recording();} & Start a new recording\\
\code{aReal J = algorithm(x);} & Any complicated algorithm here\\
\code{J.set\_gradient(1.0);} & Seed adjoint of cost function\\
\code{stack.reverse();} & Perform reverse-mode differentiation\\
\code{Vector dJ\_dx = x.get\_gradient();} & Return gradients of output with respect to inputs\\
\end{tabular}


\subsection*{Basic Jacobian workflow}
\begin{tabular}{ll}
\code{Stack stack;} & Object to store derivative information\\
\code{aVector x = \{1.0, 2.0\};} & Initialize independent (input) variables (C++11)\\
\code{stack.new\_recording();} & Start a new recording\\
\code{aVector y = algorithm(x);} & Algorithm with vector output\\
\code{stack.independent(x);} & Declare independent variables \\
\code{stack.dependent(y);} & Declare dependent variables\\
\code{Matrix dy\_dx = stack.jacobian();} & Compute Jacobian matrix\\
\end{tabular}
\subsection*{\code{aReal} member functions}
The first three functions below also work with active array arguments, where
\code{g} would be of the equivalent passive array type:\\
\begin{tabular}{ll}
\code{.set\_gradient(g)} & Initialize gradient to \code{g} \\
\code{.get\_gradient()} & After forward or reverse pass, return gradient\\
\code{.get\_gradient(g)} & As above, but writing gradient to \code{g}\\
\code{.add\_derivative\_dependence(a,p)} & Add \code{p}$\times\delta$\code{a} to the stack\\
\code{.append\_derivative\_dependence(a,p)} & Append $+$\code{p}$\times\delta$\code{a} to the stack\\
\end{tabular}

\subsection*{\code{Stack} member functions}
Constructors:\\
\begin{tabular}{ll}
\code{Stack stack;} & Construct and activate immediately \\
\code{Stack stack(false);} & Construct in inactive state\\
\end{tabular}

Member functions:\\
\begin{tabular}{ll}
\code{.new\_recording()} & Clear any existing differential statements\\
\code{.pause\_recording()} & Pause recording (\code{ADEPT\_PAUSABLE\_RECORDING} needed)\\
\code{.continue\_recording()} & Continue recording \\
\code{.is\_recording()} & Is Adept currently recording?\\
\code{.forward()} & Perform forward-mode differentiation\\
\code{.compute\_tangent\_linear()} & ...as above\\
\code{.reverse()} & Perform reverse-mode differentiation\\
\code{.compute\_adjoint()} & ...as above\\
\code{.independent(x)} & Declare an independent variable (active scalar or array)\\
\code{.independent(xptr,n)} & Declare \code{n} independent scalar variables starting at \code{xptr} \\
\code{.dependent(y)} & Declare a dependent variable (active scalar or array)\\
\code{.dependent(yptr,n)} & Declare \code{n} dependent scalar variables starting at \code{yptr}\\
\code{.jacobian()} & Return Jacobian matrix\\
\code{.jacobian(jacptr)} & Place Jacobian matrix into \code{jacptr} (column major)\\
\code{.jacobian(jacptr,false)} & Place Jacobian matrix into \code{jacptr} (row major)\\
\code{.clear\_gradients()} & Clear gradients set with \code{set\_gradient} function \\
\code{.clear\_independents()} & Clear independent variables\\
\code{.clear\_dependents()} & Clear dependent variables\\
\code{.n\_independents()} & Number of independent variables declared \\
\code{.n\_dependents()} & Number of dependent variables declared\\
%\end{tabular}
%\begin{tabular}{ll}
\code{.print\_status()} & Print status of \code{Stack} to standard output\\
\code{.print\_statements()} & Print list of differential statements\\
\code{.print\_gradients()} & Print current values of gradients\\
\code{.activate()} & Activate the stack \\
\code{.deactivate()} & Deactivate the stack\\
\code{.is\_active()} & Is the stack currently active?\\
\code{.memory()} & Return number of bytes currently used\\
\code{.preallocate\_statements(n)} & Preallocate space for \code{n} statements\\
\code{.preallocate\_operations(n)} & Preallocate space for \code{n} operations\\
\end{tabular}

\subsection*{Query functions in \code{adept} namespace}
\begin{tabular}{ll}
\code{active\_stack()} & Return pointer to currently active \code{Stack} object\\
\code{version()} & Return \code{std::string} with Adept version number\\
\code{configuration()} & Return \code{std::string} describing Adept configuration\\
\code{have\_matrix\_multiplication()} & Adept compiled with matrix multiplication (BLAS)?\\
\code{have\_linear\_algebra()} & Adept compiled with linear-algebra (LAPACK)?\\
\code{set\_max\_blas\_threads(n)} & Set maximum threads for matrix operations\\
\code{max\_blas\_threads()} & Get maximum threads for matrix operations\\
\code{is\_thread\_unsafe()} & Global \code{Stack} object is \textit{not} thread-local?\\
\end{tabular}
\newpage
%\section*{Array functionality}
\subsection*{Dense dynamic array types}
\begin{tabular}{ll}
\code{Vector}, \code{Matrix}, \code{Array3D}, \code{Array4D}... \code{Array7D} & Arrays of type \code{Real}\\
\code{intVector}, \code{intMatrix}, \code{intArray3D}...  \code{intArray7D}& Arrays of type \code{int}\\
\code{boolVector}, \code{boolMatrix}, \code{boolArray3D}...  \code{boolArray7D}& Arrays of type \code{bool}\\
\code{floatVector}, \code{floatMatrix}, \code{floatArray3D}... \code{floatArray7D} & Arrays of type \code{float}\\
\code{aVector}, \code{aMatrix}, \code{aArray3D}... \code{aArray7D} & Active arrays of type \code{Real}\\
\end{tabular}
\myvskip
Define new dynamic array types as follows:\\
\begin{tabular}{l}
\code{typedef Array<short,2,false> shortMatrix;}\\
\code{typedef Array<float,3,true> afloatArray3D;}
\end{tabular}

\subsection*{Dense fixed-size array types}
\begin{tabular}{ll}
\code{Vector2}, \code{Vector3}, \code{Vector4} & Passive vectors of fixed length 2--4\\ 
\code{Matrix22}, \code{Matrix33}, \code{Matrix44} & Passive matrices of fixed size 2$\times$2, 3$\times$3, 4$\times$4\\
\code{aVector2}, \code{aVector3}, \code{aVector4} & Active vectors of fixed length 2--4\\ 
\code{aMatrix22}, \code{aMatrix33}, \code{aMatrix44} & Active matrices of fixed size 2$\times$2, 3$\times$3, 4$\times$4\\
\end{tabular}
\myvskip
Define new fixed array types as follows:\\
\begin{tabular}{l}
\code{typedef FixedArray<short,false,2,4> shortMatrix24;}\\
\code{typedef FixedArray<Real,true,3,3,3> aArray333;}
\end{tabular}
\subsection*{Special square matrix types}
\begin{tabular}{ll}
\code{SymmMatrix}, \code{aSymmMatrix} & Symmetric matrix\\
\code{DiagMatrix}, \code{aDiagMatrix} & Diagonal matrix\\
\code{TridiagMatrix}, \code{aTridiagMatrix} & Tridiagonal matrix\\
\code{PentadiagMatrix}, \code{aPentadiagMatrix} & Pentadiagonal matrix\\
\code{LowerMatrix}, \code{aLowerMatrix} & Lower-triangular matrix\\
\code{UpperMatrix}, \code{aUpperMatrix} & Upper-triangular matrix\\
\end{tabular}
\subsection*{Dense dynamic array constructors}
\begin{tabular}{ll}
\code{Matrix M;} & Create an empty matrix of type \code{Real}\\
\code{Matrix N(M);} & Create matrix sharing data with existing matrix\\
\code{Matrix N = M;} & ...as above\\
\code{Matrix N(3,4);} & Create matrix with size 3$\times$4\\
\code{Matrix N(dimensions(3,4));} & ...as above\\
\code{Matrix N(M.dimensions());} & Create matrix with the same size as \code{M}\\
\code{Matrix N(ptr,dimensions(3,4));} & Create 3$\times$4 matrix sharing data from pointer \code{ptr}\\
\code{Matrix N = log(M);} & Create matrix containing copy of right-hand-side\\
\code{Matrix N = \{\{1.0,2.0\},\{3.0,4.0\}\};} & Create 2$\times$2 matrix from initializer list (C++11)\\
\end{tabular}
\subsection*{Array resize and link member functions}
\begin{tabular}{ll}
\code{.clear()} & Return array to original empty state\\
\code{.resize(3,4)} & Resize array discarding data\\
\code{.resize(dimensions(3,4))} & ...as above\\
\code{.resize\_row\_major(3,4)} & Resize with row-major storage (default)\\
\code{.resize\_column\_major(3,4)} & Resize with column-major storage\\
\code{.resize(M.dimensions())} & Resize to same as \code{M}\\
\code{.resize\_contiguous(...)} & Resize guaranteeing contiguous storage\\
\code{N >{}>= M;} & Discard existing data and link to array on right-hand-side\\
\end{tabular}
\subsection*{Array query member functions}
\begin{tabular}{ll}
\code{::rank} & Number of array dimensions\\
\code{.empty()} & Return \code{true} if array is empty, \code{false} otherwise\\
\code{.dimensions()} & Return an object that can be used to resize other arrays\\
\code{.dimension(i)} & Return length of dimension \code{i} (0 based)\\
\code{.size()} & Return total number of elements\\
\code{.data()} & Return pointer to underlying passive data\\
\code{.const\_data()} & Return \code{const} pointer to underlying data\\
\end{tabular}
\subsection*{Array filling}
\begin{tabular}{ll}
\code{M = 1.0;} & Fill all elements of array with the same number\\
\code{M <{}< 1.0, 2.0, 3.0, 4.0;} & Fill first four elements of array\\
\code{M = \{\{1.0,2.0\},\{3.0,4.0\}\};} & Fill 2$\times$2 matrix (C++11)\\
\end{tabular}
\subsection*{Array indexing and slicing}
Dense arrays can be indexed/sliced using the function-call operator
with as many arguments as there are dimensions (e.g.\ index a matrix
with \code{M(i,j)}). In all cases a slice can be used as an lvalue or
rvalue. If all arguments are scalars then a single element of the
array is extracted. The following special values are available:\\
\begin{tabular}{ll}
\code{end} & The last element of the dimension being indexed\\
\code{end-1} & Penultimate element of indexed dimension (any integer arithmetic possible)\\
\end{tabular}

If one or more of the arguments is a \textit{regular index range} then the return
type will be an \code{Array} pointing to part of the original
array. For every scalar argument, its rank will be reduced by one
compared to the original array. The available ranges are:\\
\begin{tabular}{ll}
\code{\_\_} & All elements of indexed dimension \\
\code{range(ibeg,iend)} & Contiguous range from \code{ibeg} to \code{iend}\\
\code{stride(ibeg,iend,istride)} & Strided range (\code{istride} can be negative but not zero)\\
\end{tabular}

If any of the arguments is an \textit{irregular index range} (such as
an \code{intVector} containing an arbitrary list of indices) then the
return type will be an \code{IndexedArray}. If used as an lvalue, it
will modify the original array, but if passed into a function
receiving an \code{Array} type then any modifications inside the
function will not affect the original array.
\subsection*{Passing arrays to and from functions}
There are three ways an array can be received as an argument to a function:\\
\begin{tabular}{ll}
\code{Matrix\&} & For an array that might be resized in the function\\
\code{Matrix} & For an array or array slice to be modified inside the function\\
\code{const Matrix\&} & For a read-only array, array slice or array expression\\
\end{tabular}

\subsection*{Member functions returning lvalue}
The functions in this section return an \code{Array} that links to the
original data and can be used on the left- or right-hand-side of an
assignment. The following only work on dynamic or fixed-size dense
arrays:\\
\begin{tabular}{ll}
\code{.subset(ibeg0,iend0,ibeg1,iend1,...)} & Contiguous subset\\
\code{.permute(i0,i1,...)} & Permute dimensions\\
\code{.diag\_matrix()} & For vector, return \code{DiagMatrix}\\
\code{.soft\_link()} \\
\end{tabular}

The following works on any matrix:\\
\begin{tabular}{ll}
\code{.T()} & Transpose of matrix\\
\end{tabular}

The following work only with square matrices, including special square
matrices:\\
\begin{tabular}{ll}
\code{.diag\_vector()} & Return vector linked to its diagonals\\
\code{.diag\_vector(i)} & Return vector linked to offdiagonal \code{i}\\
\code{.submatrix\_on\_diagonal(ibeg,iend)} & Return square matrix lying on diagonal\\
\end{tabular}
\subsection*{Elemental mathematical functions}
Return passive part of active object: \code{value(x)}

\hangingpar
Binary operators: \code{+}, \code{-},
  \code{*} and \code{/}.

\hangingpar
Assignment operators:  \code{+=}, \code{-=}, \code{*=} and \code{/=}.

\hangingpar
Unary functions: \code{sqrt}, \code{exp},
  \code{log}, \code{log10}, \code{sin}, \code{cos}, \code{tan},
  \code{asin}, \code{acos}, \code{atan}, \code{sinh}, \code{cosh},
  \code{tanh}, \code{abs}, \code{asinh}, \code{acosh}, \code{atanh},
  \code{expm1}, \code{log1p}, \code{cbrt}, \code{erf}, \code{erfc},
  \code{exp2}, \code{log2}, \code{round}, \code{trunc}, \code{rint},
  \code{nearbyint} and \code{fastexp}.

\hangingpar
Binary functions: \code{pow}, \code{atan2}, \code{min},
  \code{max}, \code{fmin} and \code{fmax}.

\hangingpar
Unary functions returning \code{bool} expressions: \code{isfinite},
\code{isinf} and \code{isnan}.

\hangingpar
Binary operators returning \code{bool} expressions: \code{==},
\code{!=}, \code{>}, \code{<}, \code{>=} and \code{<=}.

\subsection*{Alias-related functions}
\begin{tabular}{ll}
\code{eval(E)} & Avoid aliasing by evaluating expression \code{E} into an array\\
\code{noalias(E)} & Turn off alias checking for expression \code{E}\\
\end{tabular}
\subsection*{Reduction functions}
\begin{tabular}{ll}
\code{sum(M)} & Return the sum of all elements in \code{M}\\
\code{sum(M,i)} & Return array of rank one less than \code{M} containing sum along \code{i}th dimension (0 based)\\
\end{tabular}

\hangingpar Other reduction functions working in the same way:
\code{mean}, \code{product}, \code{minval}, \code{maxval}, \code{norm2}.

\begin{tabular}{ll}
\code{dot\_product(x,y)} & The same as \code{sum(x*y)} for rank-1
arguments\\
\end{tabular}
\subsection*{Expansion functions}
\begin{tabular}{ll}
\code{spread<d>(M,n)} & Replicate \code{M} array expression \code{n}
times along dimension \code{d}\\
\code{outer\_product(x,y)} & Return rank-2 outer product from two
rank-1 arguments\\
\end{tabular}
\subsection*{Matrix multiplication and linear algebra}
\begin{tabular}{ll}
\code{transpose(M)} & Transpose matrix or 2D matrix expression\\
\code{matmul(M,N)} & Matrix multiply, where at least one argument must
be a matrix, and \\
&orientation of any vector arguments is inferred\\
\code{M ** N} & Shortcut for \code{matmul}; precedence is the same as normal
  multiply\\
\code{inv(M)} & Inverse of square matrix\\
\code{solve(A,x)} & Solve system of linear equations\\ 
\end{tabular}

\subsection*{Preprocessor variables}
The following can be defined to change the behaviour of your code:\\
\begin{tabular}{ll}
\code{ADEPT\_STACK\_THREAD\_UNSAFE} & Thread-unsafe \code{Stack} (faster)\\
\code{ADEPT\_RECORDING\_PAUSABLE} & Recording can be paused (slower)\\
\code{ADEPT\_NO\_AUTOMATIC\_DIFFERENTIATION} & Turn off differentiation\\
\code{ADEPT\_TRACK\_NON\_FINITE\_GRADIENTS} & Exception thrown if derivative non-finite\\
\code{ADEPT\_BOUNDS\_CHECKING} & Check array bounds (slower)\\
\code{ADEPT\_NO\_ALIAS\_CHECKING} & Turn off alias checking (faster)\\
\code{ADEPT\_NO\_DIMENSION\_CHECKING} & Turn off dimension checking (faster)\\
\code{ADEPT\_INIT\_REAL\_SNAN} & Initialize real numbers to signaling NaN\\
\code{ADEPT\_INIT\_REAL\_ZERO} & Initialize real numbers to zero\\
\code{ADEPT\_FAST\_EXPONENTIAL} & Use faster vectorizable exponential\\
\code{ADEPT\_FAST\_SCALAR\_EXPONENTIAL} & Provide faster \code{adept::exp} for scalars\\
\code{ADEPT\_FAST} & Enable bit-reproducible options\\
\code{ADEPT\_STORAGE\_THREAD\_SAFE} & Thread-safe array storage (slower)\\
\code{ADEPT\_SUPPORT\_HUGE\_ARRAYS} & Use \code{std::size\_t} for array dimensions\\
\code{ADEPT\_REAL\_TYPE\_SIZE} & Size of \code{Real}: 4 or 8 (default 8)
\end{tabular}
The \code{ADEPT\_VERSION} variable contains version number as an
integer, e.g.\ \code{20008}, while \code{ADEPT\_VERSION\_STR} contains
it as a string, e.g.\ ``2.0.8''.
\onecolumn

\newpage

\def\Y{\textbf{Y}}
\def\r#1{\rotatebox{90}{#1}}

\setlength{\topmargin}{-3cm}
\begin{table}[tb!]
%\caption{
\begin{center}
%\parbox{0.9\columnwidth}{
\mysize\myfont Comparison of array syntax between
  Fortran 90 (and later), Matlab and the C++ libraries Adept and Eigen
%In these examples, \code{v} and \code{w} are vectors
%  and \code{A} and \code{B} are matrices.
%}

  \footnotesize
  \myfont
\begin{tabular}{lllll}
\hline
{\large\phantom{X}}
& \mysize Fortran 90+ & \mysize Matlab & \mysize C++ Adept (with C++11 features) & \mysize C++ Eigen \\
\hline
Maximum dimensions &
7 (15 from Fortran 2008) &
Unlimited &
7 &
2
\\
\hline
Vector declaration &
\code{real,dimension(:)} &
&
\code{Vector} &
\code{VectorXd}
\\
Matrix declaration &
\code{real,dimension(:,:)} &
&
\code{Matrix} &
\code{MatrixXd, ArrayXd}
\\
3D array declaration &
\code{real,dimension(:,:,:)}&
&
\code{Array3D}
\\
Fixed matrix declaration &
\code{real,dimension(M,N)} &
&
\code{FixedMatrix<double,false,M,N>} &
\code{Matrix<double,M,N>}
\\
Diagonal matrix declaration&
&
&
\code{DiagMatrix} &
\code{DiagonalMatrix<double,Dynamic>}
\\
%Tridiagonal matrix &
%&
%&
%\code{TridiagMatrix} &
%\\
Symmetric matrix decl.&
&
&
\code{SymmMatrix}
\\
%Upper-triangular matrix &
%&
%&
%\code{UpperMatrix} &
%\\
Sparse matrix declaration&
&
%\code{sparse(A)}
&
&
\code{SparseMatrix<double>}
\\
\hline
Get rank &
\code{rank(A)} &
\code{ndims(A)} &
\code{A::rank}
\\
Get total size &
\code{size(A)} &
\code{numel(A)} &
\code{A.size()} &
\code{A.size()}
\\
Get size of dimension &
\code{size(A,i)} &
\code{size(A,i)} &
\code{A.size(i)} &
\code{A.rows()}, \code{A.cols()}
\\
Get all dimensions &
\code{shape(A)} &
\code{size(A)} &
\code{A.dimensions()}
\\
\hline
Resize &
\code{allocate(A(m,n))} &
\code{A = zeros(m,n)} &
\code{A.resize(m,n)} &
\code{A.resize(m,n)} 
\\
Clear &
\code{deallocate(A)} &
\code{A = []} &
\code{A.clear()} &
\code{A.resize(0,0)}
\\
Link/associate &
\code{A => B} &
&
\code{A >{}>= B} &
%Low-level access via \code{Map}
(Complicated)
\\
\hline
Set elements to constant &
\code{A = x} &
\code{A(:) = x} &
\code{A = x} &
\code{A.fill(x)}
\\
Fill vector with data &
\code{v = [0,1]} &
\code{v = [0,1]} &
\code{v <{}< 0,1} &
\code{v <{}< 0,1}
\\
Fill matrix with data &
\code{A=reshape([0,1,2,3],[2,2])} &
\code{A = [1 2; 3 4]} &
\code{A <{}< 1,2,3,4} or \code{A = \{\{1,2\},\{3,4\}\}} &
\code{A <{}< 1,2,3,4}
\\
Vector literal &
\code{[1.0, 2.0]} &
\code{[1.0 2.0]} &
\code{Vector\{1.0, 2.0\}} &
\\
\hline
Vector subset &
\code{v(i1:i2)} &
\code{v(i1:i2)} &
\code{v.subset(i1,i2)} &
\code{v.segment(i1,m)}
%\code{Map<VectorXd> w(v.data()+1,8)}
\\
Strided indexing &
\code{v(i1:i2:s)} &
\code{v(i1:s:i2)} &
\code{v(stride(i1,i2,s))} &
%\code{Map<VectorXd,0,InnerStride<> > w(v.data()+1,4,InnerStride<2>)}
(Complicated)
\\
Vector end indexing &
\code{v(i:)} &
\code{v(i:end)} &
\code{v.subset(i,end)} &
\code{v.tail(n)}
\\
Index relative to end &
&
\code{v(end-1)} &
\code{v(end-1)} &
\\
Index by int vector &
\code{v(index)} &
\code{v(index)} &
\code{v(index)}
\\
\hline
Matrix subset &
\code{A(i1:i2,j1:j2)} &
\code{A(i1:i2,j1:j2)} &
\code{A.subset(i1,i2,j1,j2)} &
\code{A.block(i1,j1,m,n)}
\\
Extract row &
\code{A(i,:)} &
\code{A(i,:)} &
\code{A(i,\_\_)}, \code{A[i]} &
\code{A.row(i)}
\\
Matrix end block &
\code{M(i:,j:)} &
\code{M(i:end,j:end)} &
\code{M.subset(i,end,j,end)} &
\code{M.bottomRightCorner(m,n)}
\\
Diagonal matrix from vector &
&
\code{diag(v)} &
\code{v.diag\_matrix()} &
\code{v.asDiagonal()}
\\
Matrix diagonals as vector &
&
\code{diag(A)} &
\code{A.diag\_vector()} &
\code{A.diagonal()} 
\\
Matrix off-diagonals &
&
\code{diag(A,i)} &
\code{A.diag\_vector(i)} &
\code{A.diagonal(i)} 
%\\
%Symmetric view &
%&
%&
%\code{%\color{red}
%A.symm\_matrix<UPPER>()
%}&
%\code{A.selfAdjointView<Upper>()}
%\\
%Upper-triangular view &
%&
%&
%\code{\color{red}A.upper\_matrix()} &
%\code{A.triangularView<Upper>()}
\\
\hline
Elementwise multiplication &
\code{A * B} & 
\code{A .* B} &
\code{A * B} &
\code{A.array() * B.array()}
\\
Elemental function &
\code{sqrt(A)} &
\code{sqrt(A)} &
\code{sqrt(A)} &
\code{A.array().sqrt()}
\\
Addition assignment &
\code{A = A + B} &
\code{A = A + B} &
\code{A += B} &
\code{A.array() += B}
\\
Power &
\code{A ** B} &
\code{A .\textasciicircum\ B} &
\code{pow(A,B)} &
\code{A.array().pow(B)}
\\
\hline
Matrix multiplication &
\code{matmul(A,B)} &
\code{A * B} &
\code{A ** B} &
\code{A * B}
\\
Dot product &
\code{dot\_product(v,w)} &
\code{dot(v,w)} &
\code{dot\_product(v,w)} &
\code{v.dot(w)}
\\
Matrix transpose &
\code{transpose(A)} &
\code{A'} &
\code{A.T()} &
\code{A.transpose()}
\\
In-place transpose &
&
&
\code{A.in\_place\_transpose()} &
\code{A.transposeInPlace()}
\\
Matrix solve &
&
\code{A \textbackslash\ b} &
\code{solve(A,b)} &
\code{A.colPivHouseholderQr().solve(b)}
\\
Matrix inverse &
&
\code{inv(A)} &
\code{inv(A)} &
\code{A.inverse()}
\\
\hline
``Find'' conditional assign &
&
\code{v(find(w<0)) = 0} &
\code{v(find(w<0)) = 0}
\\
``Where'' conditional assign &
\code{where(w<0) v = 0} &
&
\code{v.where(w<0) = 0} &
\code{v = (w<0).select(0,v)}
\\
``Where'' with both cases &
\code{...elsewhere v = 1} &
&
\code{v.where(w<0)=either\_or(0,1)} &
\code{v = (w<0).select(0,1)}
\\
\hline
Average all elements &
\code{mean(A)} & 
\code{mean(A(:))} &
\code{mean(A)} &
\code{A.mean()}
\\
Average along dimension &
\code{mean(A,i)} & 
\code{mean(A,i)} &
\code{mean(A,i)} &
\code{A.colwise().mean()}
\\
Maximum of all elements &
\code{maxval(A)} &
\code{max(A(:))} &
\code{maxval(A)} &
\code{A.maxCoeff()}
\\
Maximum of two arrays &
\code{max(A,B)} &
(Complicated) &
\code{max(A,B)}, \code{fmax(A,B)} &
\code{A.max(B)}
\\
Spread along new dimension &
\code{spread(A,dim,n)} &
&
\code{spread<dim>(A,n)}
\\
\hline
\end{tabular}
\end{center}
\end{table}
\end{document}


================================================
FILE: include/Makefile.am
================================================
# Top-level public headers, installed directly into $(includedir)
include_HEADERS = adept.h adept_arrays.h adept_optimize.h adept_source.h adept_fortran.h

# Component headers, installed into the package subdirectory of the
# include directory ($(pkgincludedir))
pkginclude_HEADERS = adept/Active.h adept/ActiveReference.h adept/Allocator.h \
	adept/Array.h adept/Expression.h adept/ExpressionSize.h \
	adept/IndexedArray.h adept/matmul.h adept/RangeIndex.h \
	adept/ScratchVector.h adept/SpecialMatrix.h adept/Stack.h \
	adept/StackStorage.h adept/StackStorageOrig.h \
	adept/StackStorageOrigStl.h adept/Statement.h adept/Storage.h \
	adept/array_shortcuts.h adept/base.h adept/reduce.h \
	adept/contiguous_matrix.h adept/exception.h adept/settings.h \
	adept/interp.h adept/ActiveConstReference.h adept/cppblas.h \
	adept/scalar_shortcuts.h adept/solve.h adept/traits.h adept/where.h \
	adept/vector_utilities.h adept/FixedArray.h adept/Packet.h \
	adept/UnaryOperation.h adept/BinaryOperation.h adept/ArrayWrapper.h \
	adept/outer_product.h adept/spread.h adept/inv.h adept/eval.h \
	adept/noalias.h adept/store_transpose.h adept/quick_e.h \
	adept/GradientIndex.h adept/Optimizable.h adept/Minimizer.h

# Files shipped in the distribution but not installed
EXTRA_DIST = Timer.h create_adept_source_header

# adept_source.h is rebuilt by running the create_adept_source_header
# script whenever a header or source file in the top-level adept
# directory, or the script itself, changes
adept_source.h: @top_srcdir@/adept/*.h @top_srcdir@/adept/*.cpp @srcdir@/create_adept_source_header
	@srcdir@/create_adept_source_header
# Make sure adept_source.h is brought up to date as part of "make all"
all-local: adept_source.h


================================================
FILE: include/Timer.h
================================================
/* Timer.h - Utility class for timing different parts of a program

  Copyright (C) 2012-2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

#ifndef Timer_H
#define Timer_H 1

#ifdef _WIN32
#include <windows.h>
#include <time.h>
#else
#include <sys/time.h>
#endif

#include <map>
#include <string>
#include <sstream>
#include <vector>
#include <iostream>

// The Timer class: all functions are inline
class Timer {
public:
  // Integer type used for activity tags and for counting activities
  typedef int TimerInt;

  // Constructor can specify a number of unnamed activities, which
  // are registered with the names "Activity 0", "Activity 1", ...
  Timer(TimerInt n_activities = 0)
    : current_activity_(-1), timer_on_(false), print_on_exit_(false) {
#ifdef _WIN32
    // Zero all tick counters so that no member is left indeterminate
    frequency.QuadPart = 0;
    win_time_.QuadPart = 0;
    win_last_time_.QuadPart = 0;
#else
    last_time_.tv_sec = 0;
    last_time_.tv_usec = 0;
#endif
    timings_.reserve(100);
    names_.reserve(100);
    for (TimerInt i = 0; i < n_activities; i++) {
      std::stringstream s;
      s << "Activity " << i;
      timings_.push_back(0.0);
      names_.push_back(s.str());
    }
  }

  // When the timer is destructed (typically at program exit), print
  // out the times spent in each activity, but only if this was
  // requested with print_on_exit()
  ~Timer() {
    if (print_on_exit_) {
      print();
    }
  }

  // Print to standard error the time spent in each activity followed
  // by the total over all activities
  void print() const {
    double sum = 0.0;
    std::cerr << size() << " activities:\n";
    for (TimerInt i = 0; i < size(); i++) {
      std::cerr.width(10);
      std::cerr << std::right << timings_[i] << " s: " << names_[i] << "\n";
      sum += timings_[i];
    }
    std::cerr.width(10);
    std::cerr << std::right << sum << " s: Total\n";
  }

  // Register a new activity with the specified name, returning the
  // tag to be used to specify it in future, as a TimerInt
  TimerInt new_activity(const std::string& name) {
    TimerInt tag = size();
    names_.push_back(name);
    timings_.push_back(0.0);
    return tag;
  }

  // Stop timing current activity, adding the time elapsed since the
  // last split to it; does nothing if the timer is not running
  void stop() {
    if (timer_on_) {
      timings_[current_activity_] += split_();
    }
    timer_on_ = false;
  }

  // Start timing specified activity; if another activity is being
  // timed then the elapsed time is first credited to it
  void start(TimerInt activity) {
    if (timer_on_) {
      timings_[current_activity_] += split_();
    }
    else {
      // Discard the interval during which the timer was off
      split_();
    }

    if (activity >= 0 && activity < size()) {
      current_activity_ = activity;
      timer_on_ = true;
    }
    else {
      // Activity out of range - to keep this inline function fast we
      // don't throw an exception but just don't record the time for
      // this event
      timer_on_ = false;
    }
  }

  // Set the timing for a specific activity back to zero; an
  // out-of-range activity is silently ignored
  void reset(TimerInt activity) {
    if (activity >= 0 && activity < size()) {
      timings_[activity] = 0.0;
    }
  }

  // Return the list of timings in seconds as a constant reference to
  // a vector of doubles
  const std::vector<double>& timings() const { return timings_; }

  // Return a single timing in seconds, or zero if the activity is
  // out of range
  double timing(TimerInt activity) const {
    if (activity >= 0 && activity < size()) {
      return timings_[activity];
    }
    else {
      return 0.0;
    }
  }

  // Number of registered activities, converted explicitly from
  // size_t to TimerInt
  TimerInt size() const {
    return static_cast<TimerInt>(timings_.size());
  }

  // Decide whether the contents of the timer class will be printed
  // when it is destructed
  void print_on_exit(bool b = true) {
    print_on_exit_ = b;
  }

private:
  // Return the number of seconds since the previous call to this
  // function and remember the current time for the next call. Uses
  // the performance counter on Windows and gettimeofday elsewhere.
  double split_() {
#ifdef _WIN32
    QueryPerformanceFrequency(&frequency);
    QueryPerformanceCounter(&win_time_);
    double dsec = (double) (win_time_.QuadPart - win_last_time_.QuadPart)
      / (double) frequency.QuadPart;
    win_last_time_ = win_time_;
    return dsec;
#else
    struct timeval time;
    gettimeofday(&time, NULL);
    double dsec = time.tv_sec - last_time_.tv_sec
      + 0.000001 * (time.tv_usec - last_time_.tv_usec);
    last_time_ = time;
    return dsec;
#endif
  }
  // Data
  std::vector<double> timings_;     // Accumulated seconds per activity
  std::vector<std::string> names_;  // Name of each activity
  TimerInt current_activity_;       // Tag of activity being timed
#ifdef _WIN32
  LARGE_INTEGER frequency;                 // ticks per second
  LARGE_INTEGER win_time_, win_last_time_; // ticks
#else
  timeval last_time_;               // Time of the most recent split
#endif
  bool timer_on_;                   // Is an activity being timed?
  bool print_on_exit_;              // Print timings in destructor?
};

#endif


================================================
FILE: include/adept/Active.h
================================================
/* Active.h -- Active scalar type for automatic differentiation

    Copyright (C) 2012-2014 University of Reading
    Copyright (C) 2015-2018 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

  
   The Active class describes a scalar variable that can participate
   in expressions to be differentiated. It is a generalization of the
   aReal (or adouble) class in Adept 1.0, which was always double
   precision; Active<T> takes a template argument T that is any
   floating-point type.

*/

#ifndef AdeptActive_H
#define AdeptActive_H

#include <iostream>
#include <vector>

#include <adept/Expression.h>
#include <adept/exception.h>
#include <adept/ExpressionSize.h>
#include <adept/Stack.h>

namespace adept {

  // ---------------------------------------------------------------------
  // Definition of Active class
  // ---------------------------------------------------------------------
  template <typename Type>
  class Active : public Expression<Type, Active<Type> > {
    // CONTENTS
    // 1. Preamble
    // 2. Constructors
    // 3. Operators
    // 4. Public member functions that don't modify the object
    // 5. Public member functions that modify the object
    // 6. Protected member functions
    // 7. Data

  public:
    // -------------------------------------------------------------------
    // 1. Preamble
    // -------------------------------------------------------------------

    // Static definitions to enable the properties of this type of
    // expression to be discerned at compile time
    static const bool is_active = true;
    static const bool is_lvalue = true;
    static const int  rank      = 0;
    static const int  n_active  = 1 + internal::is_complex<Type>::value;
    static const int  n_arrays  = 0;
    static const int  n_scratch = 0;
    typedef Type T; // Needed so that ADEPT_INIT_REAL_SNAN works

    // -------------------------------------------------------------------
    // 2. Constructors
    // -------------------------------------------------------------------

    // Constructor registers the new Active object with the currently
    // active stack.  Note that this object is not explicitly
    // initialized with a particular number; the user should not
    // assume that it is set to zero but should later assign it to a
    // particular value. Otherwise in the reverse pass the
    // corresponding gradient will not be set to zero.
#ifndef ADEPT_INIT_REAL
    // No initial value requested by the user: the value is set to
    // zero. In both variants only the gradient index is registered;
    // no statement is pushed on to the stack.
    Active()
      : val_(0.0),
        gradient_index_(ADEPT_ACTIVE_STACK->register_gradient())
    { }
#else
    // The value is set to the user-supplied ADEPT_INIT_REAL
    Active()
      : val_(ADEPT_INIT_REAL),
        gradient_index_(ADEPT_ACTIVE_STACK->register_gradient())
    { }
#endif

    // Constructor with a passive argument; this constructor is
    // invoked with either of the following:
    //   aReal x = 1.0;
    //   aReal x(1.0);
    template <typename PType>
    Active(const PType& rhs,
           typename internal::enable_if<internal::is_not_expression<PType>::value>::type* dummy = 0)
      : val_(rhs), gradient_index_(ADEPT_ACTIVE_STACK->register_gradient())
    {
#ifdef ADEPT_RECORDING_PAUSABLE
      // Nothing is pushed while recording is paused
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        return;
      }
#endif
      // A statement consisting of a left-hand side with no operations
      // makes the reverse pass zero the gradient of this object once
      // it has been used, which matters because the gradient entry
      // might be reused
      ADEPT_ACTIVE_STACK->push_lhs(gradient_index_);
    }

    // Constructor taking an element from an active array: the value
    // and gradient_index of the element are provided
    template <typename PType>
    Active(const PType& rhs, Index gradient_index)
      : val_(rhs), gradient_index_(ADEPT_ACTIVE_STACK->register_gradient())
    {
      // rhs is the passive value of the array element and
      // gradient_index is the index of its gradient; the new object
      // is recorded as a copy of that element (multiplier of 1.0)
#ifdef ADEPT_RECORDING_PAUSABLE
      if (ADEPT_ACTIVE_STACK->is_recording()) {
#endif
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
        // Check there is space in the operation stack for one more
        // entry, as the assignment operators do before pushing
        ADEPT_ACTIVE_STACK->check_space(1);
#endif
        ADEPT_ACTIVE_STACK->push_rhs(1.0,gradient_index);
        ADEPT_ACTIVE_STACK->push_lhs(gradient_index_);
#ifdef ADEPT_RECORDING_PAUSABLE
      }
#endif
    }
   
    // Constructor with an active argument

    // Normal copy construction: register the new object then treat
    // this as an assignment.  We need two versions because if we
    // don't provide the first then the compiler will provide it and
    // not use the second if Type==AType
    Active(const Active<Type>& rhs)
      : val_(0.0),
        gradient_index_(ADEPT_ACTIVE_STACK->register_gradient())
    {
      // Having registered a fresh gradient index, delegate to the
      // assignment operator to copy the value and record the statement
      this->operator=(rhs);
    }
    template <typename AType>
    Active(const Active<AType>& rhs)
      : val_(0.0),
        gradient_index_(ADEPT_ACTIVE_STACK->register_gradient())
    {
      // As for the same-type copy constructor, but rhs may hold a
      // different underlying type: register, then assign
      this->operator=(rhs);
    }

    // Construction with an expression.  This is primarily used so
    // that if we define a function func(aReal a), it will also accept
    // active expressions by implicitly converting them to an aReal.
    template<typename AType, class E>
    // Not marked explicit, so implicit conversion is permitted
    Active(const Expression<AType, E>& rhs,
           typename internal::enable_if<E::rank==0
                              && E::is_active>::type* dummy = 0)
      : gradient_index_(ADEPT_ACTIVE_STACK->register_gradient())
    {
#ifdef ADEPT_RECORDING_PAUSABLE
      // While recording is paused only the value is evaluated
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        val_ = rhs.scalar_value();
        return;
      }
#endif
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
      // Check there is enough space in the operation stack
      ADEPT_ACTIVE_STACK->check_space_static<E::n_active>();
#endif
      // Evaluate the expression; this also pushes its gradients on to
      // the operation stack, storing the right-hand side of the
      // statement
      val_ = rhs.scalar_value_and_gradient(*ADEPT_ACTIVE_STACK);
      // Push the gradient index of this object on to the statement
      // stack, storing the left-hand side of the statement
      ADEPT_ACTIVE_STACK->push_lhs(gradient_index_);
    }
	   
    // Destructor simply unregisters the object from the stack,
    // freeing up the gradient index for another
    ~Active() {
#ifdef ADEPT_RECORDING_PAUSABLE
      // The gradient index is only released while recording
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        return;
      }
#endif
      // Hand the gradient index back to the stack
      ADEPT_ACTIVE_STACK->unregister_gradient(gradient_index_);
    }


    // -------------------------------------------------------------------
    // 3. Operators
    // -------------------------------------------------------------------
	   
    // Assignment operator with an inactive variable on the rhs
    template <typename PType>
    typename internal::enable_if<internal::is_not_expression<PType>::value,
                       Active&>::type
    operator=(const PType& rhs) {
      // The value is always updated, whether or not we are recording
      val_ = rhs;
#ifdef ADEPT_RECORDING_PAUSABLE
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        return *this;
      }
#endif
      // A left-hand side with no corresponding operations ensures
      // that the gradient is set to zero in the reverse pass when it
      // is finished with
      ADEPT_ACTIVE_STACK->push_lhs(gradient_index_);
      return *this;
    }

    // Assignment operator with an active variable on the rhs: first a
    // non-template version because otherwise compiler will generate
    // its own
    Active& operator=(const Active& rhs) {
#ifdef ADEPT_RECORDING_PAUSABLE
      // Recording paused: copy the value only
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        val_ = rhs.scalar_value();
        return *this;
      }
#endif
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
      // Make sure the operation stack can hold one more entry
      ADEPT_ACTIVE_STACK->check_space(1);
#endif
      // Record the statement in the same way as construction from an
      // expression: right-hand side first, then left-hand side
      val_ = rhs.scalar_value_and_gradient(*ADEPT_ACTIVE_STACK);
      ADEPT_ACTIVE_STACK->push_lhs(gradient_index_);
      return *this;
    }

    // Assignment operator with an active variable on the rhs
    template <class AType>
    Active& operator=(const Active<AType>& rhs) {
#ifdef ADEPT_RECORDING_PAUSABLE
      // Recording paused: copy the value only
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        val_ = rhs.scalar_value();
        return *this;
      }
#endif
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
      // Make sure the operation stack can hold one more entry
      ADEPT_ACTIVE_STACK->check_space(1);
#endif
      // Record the statement in the same way as construction from an
      // expression: right-hand side first, then left-hand side
      val_ = rhs.scalar_value_and_gradient(*ADEPT_ACTIVE_STACK);
      ADEPT_ACTIVE_STACK->push_lhs(gradient_index_);
      return *this;
    }
    
    // Assignment operator with an expression on the rhs: very similar
    // to construction with an expression (defined above)
    template <typename AType, class E>
    typename internal::enable_if<E::is_active && E::rank==0, Active&>::type
    operator=(const Expression<AType, E>& rhs) {
#ifdef ADEPT_RECORDING_PAUSABLE
      // Recording paused: evaluate the value only
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        val_ = rhs.scalar_value();
        return *this;
      }
#endif
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
      // Make sure the operation stack can hold E::n_active entries
      ADEPT_ACTIVE_STACK->check_space_static<E::n_active>();
#endif
      // Right-hand side of the statement, then the left-hand side
      val_ = rhs.scalar_value_and_gradient(*ADEPT_ACTIVE_STACK);
      ADEPT_ACTIVE_STACK->push_lhs(gradient_index_);
      return *this;
    }
  
    // All the compound assignment operators are unpacked, i.e. a+=b
    // becomes a=a+b; first for an Expression on the rhs
    template<typename AType, class E>
    typename internal::enable_if<E::rank==0, Active&>::type
    operator+=(const Expression<AType,E>& rhs) {
      // a += b is recorded as a = a + b
      *this = (*this + rhs);
      return *this;
    }
    template<typename AType, class E>
    typename internal::enable_if<E::rank==0, Active&>::type
    operator-=(const Expression<AType,E>& rhs) {
      // a -= b is recorded as a = a - b
      *this = (*this - rhs);
      return *this;
    }
    template<typename AType, class E>
    typename internal::enable_if<E::rank==0, Active&>::type
    operator*=(const Expression<AType,E>& rhs) {
      // a *= b is recorded as a = a * b
      *this = (*this * rhs);
      return *this;
    }
    template<typename AType, class E>
    typename internal::enable_if<E::rank==0, Active&>::type
    operator/=(const Expression<AType,E>& rhs) {
      // a /= b is recorded as a = a / b
      *this = (*this / rhs);
      return *this;
    }

    // And likewise for a passive scalar on the rhs
    template <typename PType>
    typename internal::enable_if<internal::is_not_expression<PType>::value, Active&>::type
    operator+=(const PType& rhs) {
      // Only the stored value is changed; nothing is pushed on to the
      // stack for the addition of a passive scalar
      this->val_ += rhs;
      return *this;
    }
    template <typename PType>
    typename internal::enable_if<internal::is_not_expression<PType>::value, Active&>::type
    operator-=(const PType& rhs) {
      // Only the stored value is changed; nothing is pushed on to the
      // stack for the subtraction of a passive scalar
      this->val_ -= rhs;
      return *this;
    }
    template <typename PType>
    typename internal::enable_if<internal::is_not_expression<PType>::value, Active&>::type
    operator*=(const PType& rhs) {
      // a *= c is recorded as a = a * c
      *this = (*this * rhs);
      return *this;
    }
    template <typename PType>
    typename internal::enable_if<internal::is_not_expression<PType>::value, Active&>::type
    operator/=(const PType& rhs) {
      // a /= c is recorded as a = a / c
      *this = (*this / rhs);
      return *this;
    }

      
    // -------------------------------------------------------------------
    // 4. Public member functions that don't modify the object
    // -------------------------------------------------------------------

    // Get the underlying passive value of this object
    Type value() const
    {
      // Returned by value, so the caller cannot modify this object
      return this->val_;
    }

    // Get the index of the gradient information for this object
    const Index& gradient_index() const
    {
      return this->gradient_index_;
    }

    // If an expression leads to calc_gradient being called on an
    // active object, we push the multiplier and the gradient index on
    // to the operation stack (or 1.0 if no multiplier is specified)
    template <int Rank>
    void calc_gradient(Stack& stack,
                       const ExpressionSize<Rank>& /* unused */) const
    {
      // No multiplier supplied, so a multiplier of 1.0 is pushed
      stack.push_rhs(1.0, this->gradient_index_);
    }

    template <int Rank, typename MyType>
    void calc_gradient(Stack& stack,
                       const MyType& multiplier,
                       const ExpressionSize<Rank>& /* unused */) const
    {
      // Push the supplied multiplier with this object's gradient index
      stack.push_rhs(multiplier, this->gradient_index_);
    }

    // Set the value of the gradient, for initializing an adjoint;
    // note that the value of the gradient is not held in the active
    // object but rather held by the stack
    template <typename MyType>
    void set_gradient(const MyType& gradient) const {
      // Set the single stack entry [gradient_index_, gradient_index_+1)
      // from the one value supplied
      ADEPT_ACTIVE_STACK->set_gradients(gradient_index_,
                                        gradient_index_+1,
                                        &gradient);
    }

    // Get the value of the gradient, for extracting the adjoint after
    // calling reverse() on the stack
    template <typename MyType>
    void get_gradient(MyType& gradient) const {
      // Copy the single stack entry [gradient_index_, gradient_index_+1)
      // into the caller's variable
      ADEPT_ACTIVE_STACK->get_gradients(gradient_index_,
                                        gradient_index_+1,
                                        &gradient);
    }
    Type get_gradient() const {
      // Start from zero, then let the stack overwrite it with the
      // entry for this object's gradient index
      Type adjoint = 0;
      ADEPT_ACTIVE_STACK->get_gradients(gradient_index_,
                                        gradient_index_+1,
                                        &adjoint);
      return adjoint;
    }
 

    // For modular codes, some modules may have an existing
    // Jacobian code and possibly be unsuitable for automatic
    // differentiation using Adept (e.g. because they are written in
    // Fortran).  In this case, we can use the following two functions
    // to "wrap" the non-Adept code.

    // Suppose the non-adept code uses the double values from n aReal
    // objects pointed to by "x" to produce a single double value
    // "y_val" (to be assigned to an aReal object "y"), plus a pointer
    // to an array of forward derivatives "dy_dx".  Firstly you should
    // assign the value using simply "y = y_val;", then call
    // "y.add_derivative_dependence(x, dy_dx, n);" to specify how y
    // depends on x. A fourth argument "multiplier_stride" may be used
    // to stride the indexing to the derivatives, in case they are
    // part of a matrix that is oriented in a different sense.
    template <typename MyReal>
    typename internal::enable_if<internal::is_floating_point<MyReal>::value,
                       void>::type
    add_derivative_dependence(const Active* rhs,
                              const MyReal* multiplier,
                              int n,
                              int multiplier_stride = 1) const {
#ifdef ADEPT_RECORDING_PAUSABLE
      // Nothing is recorded while recording is paused
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        return;
      }
#endif
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
      // Check there is space in the operation stack for n entries
      ADEPT_ACTIVE_STACK->check_space(n);
#endif
      for (int irhs = 0; irhs < n; ++irhs) {
        const Real weight = multiplier[irhs*multiplier_stride];
        // Zero multipliers are skipped
        if (weight == 0.0) {
          continue;
        }
        // Add a pseudo-operation to the operation stack
        ADEPT_ACTIVE_STACK->push_rhs(weight, rhs[irhs].gradient_index());
      }
      // Complete the statement with this object as the left-hand side
      ADEPT_ACTIVE_STACK->push_lhs(gradient_index_);
    }

    // Suppose the non-Adept code uses double values from n aReal
    // objects pointed to by "x" and m aReal objects pointed to by "z"
    // to produce a single double value, plus pointers to arrays of
    // forward derivatives "dy_dx" and "dy_dz".  Firstly, as above,
    // you should assign the value using simply "y = y_val;", then
    // call "y.add_derivative_dependence(x, dy_dx, n);" to specify how
    // y depends on x.  To specify also how y depends on z, call
    // "y.append_derivative_dependence(z, dy_dz, m);".
    // Add further dependencies to the statement most recently
    // recorded for this object: rhs[i] contributes with derivative
    // multiplier[i*multiplier_stride], zero entries being skipped.
    // Throws wrong_gradient if the stack's update_lhs call reports
    // failure for this object's gradient index. Only enabled when
    // MyReal is a floating-point type.
    template <typename MyReal>
    typename internal::enable_if<internal::is_floating_point<MyReal>::value,
                                 void>::type
    append_derivative_dependence(const Active* rhs,
                                 const MyReal* multiplier,
                                 int n,
                                 int multiplier_stride = 1) const {
#ifdef ADEPT_RECORDING_PAUSABLE
      // Nothing is pushed while recording is switched off
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        return;
      }
#endif
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
      // Ask the stack to make room for up to n operations
      ADEPT_ACTIVE_STACK->check_space(n);
#endif
      for (int irhs = 0; irhs < n; ++irhs) {
        const Real deriv = multiplier[irhs*multiplier_stride];
        if (deriv == 0.0) {
          // A zero derivative contributes nothing
          continue;
        }
        // Each non-zero derivative becomes a pseudo-operation
        ADEPT_ACTIVE_STACK->push_rhs(deriv, rhs[irhs].gradient_index());
      }
      const bool lhs_matches = ADEPT_ACTIVE_STACK->update_lhs(gradient_index_);
      if (!lhs_matches) {
        throw wrong_gradient("Wrong gradient: append_derivative_dependence called on a different aReal object from the most recent add_derivative_dependence call"
                             ADEPT_EXCEPTION_LOCATION);
      }
    }

    // For only one independent variable on the rhs, these two
    // functions are convenient as they don't involve pointers
    // Single-variable form: forwards this object's gradient index,
    // the gradient index of "rhs" and the multiplier to the active
    // stack's add_derivative_dependence
    template <class T>
    void add_derivative_dependence(const T& rhs, Real multiplier) const {
      ADEPT_ACTIVE_STACK->add_derivative_dependence(
          gradient_index_, rhs.gradient_index(), multiplier);
    }
    // Single-variable form: forwards this object's gradient index,
    // the gradient index of "rhs" and the multiplier to the active
    // stack's append_derivative_dependence
    template <class T>
    void append_derivative_dependence(const T& rhs, Real multiplier) const {
      ADEPT_ACTIVE_STACK->append_derivative_dependence(
          gradient_index_, rhs.gradient_index(), multiplier);
    }
 
    // -------------------------------------------------------------------
    // 4.1. Public member functions used by other expressions
    // -------------------------------------------------------------------
    // Rank-0 dimension query: "dim" is left untouched and true is
    // always returned
    bool get_dimensions_(ExpressionSize<0>& dim) const {
      return true;
    }

    // Describe this object as the text "Active(<value>)"
    std::string expression_string_() const {
      std::stringstream description;
      description << "Active(";
      description << val_;
      description << ")";
      return description.str();
    }

    // Aliasing query for the memory range given by mem1 and mem2;
    // an Active object always answers false
    bool is_aliased_(const Type* mem1, const Type* mem2) const {
      return false;
    }

    // Both arguments are ignored: the stored value is returned
    // whatever j and len are
    Type value_with_len_(const Index& j, const Index& len) const {
      return val_;
    }

    // Deliberately does nothing: "loc" is not modified
    template <int MyArrayNum, int NArrays>
    void advance_location_(ExpressionSize<NArrays>& loc) const {
    }

    // The location is ignored: the stored value is always returned
    template <int MyArrayNum, int NArrays>
    Type value_at_location_(const ExpressionSize<NArrays>& loc) const {
      return val_;
    }
    
    // Returns the stored value; neither "loc" nor "scratch" is read
    // or written here
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    Type value_at_location_store_(const ExpressionSize<NArrays>& loc,
                                  internal::ScratchVector<NScratch>& scratch) const {
      return val_;
    }

    // Returns the stored value; neither "loc" nor "scratch" is read
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    Type value_stored_(const ExpressionSize<NArrays>& loc,
                       const internal::ScratchVector<NScratch>& scratch) const {
      return val_;
    }

    // Push this object's gradient index on to "stack" with a
    // multiplier of 1.0; "loc" and "scratch" are not used
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    void calc_gradient_(Stack& stack,
                        const ExpressionSize<NArrays>& loc,
                        const internal::ScratchVector<NScratch>& scratch) const
    {
      stack.push_rhs(1.0, gradient_index_);
    }

    // Push this object's gradient index on to "stack" together with
    // the supplied multiplier; "loc" and "scratch" are not used
    template <int MyArrayNum, int MyScratchNum,
              int NArrays, int NScratch, typename MyType>
    void calc_gradient_(Stack& stack,
                        const ExpressionSize<NArrays>& loc,
                        const internal::ScratchVector<NScratch>& scratch,
                        const MyType& multiplier) const
    {
      stack.push_rhs(multiplier, gradient_index_);
    }

    // Deliberately does nothing: neither argument is touched
    template <int MyArrayNum, int Rank, int NArrays>
    void set_location_(const ExpressionSize<Rank>& i,
                       ExpressionSize<NArrays>& index) const {
    }


    // The Stack::independent(x) and Stack::dependent(y) functions add
    // the gradient_index of objects x and y to std::vector<uIndex>
    // objects in Stack. Since x and y may be scalars or arrays, this
    // is best done by delegating to the Active or Array classes.
    // Append this object's single gradient index to the end of "vec"
    template <typename IndexType>
    void push_gradient_indices(std::vector<IndexType>& vec) const
    {
      vec.push_back(gradient_index_);
    }

    // -------------------------------------------------------------------
    // 5. Public member functions that modify the object
    // -------------------------------------------------------------------

    // Set the value 
    // Overwrite the stored value with x; nothing is pushed on to the
    // stack by this function
    template <typename MyType>
    void set_value(const MyType& x) {
      val_ = x;
    }

    // For use in creating active references, to get a non-const
    // reference to the underlying passive data
    // Non-const access to the stored value
    Type& lvalue() {
      return val_;
    }

    
    // -------------------------------------------------------------------
    // 6. Protected member functions
    // -------------------------------------------------------------------
  protected:
    
    // -------------------------------------------------------------------
    // 7. Data
    // -------------------------------------------------------------------
  private:
    // The numerical (passive) value of this active scalar
    Type val_;
    // Index to where the corresponding gradient will be held during
    // the adjoint calculation; the gradient itself is obtained from
    // the stack via this index, not stored in the object
    Index gradient_index_;
  }; // End of definition of Active


  // ---------------------------------------------------------------------
  // Helper function for Active class
  // ---------------------------------------------------------------------

  // A way of setting the initial values of an array of n aReal
  // objects without the expense of placing them on the stack
  // Copy data[0..n-1] into the values of a[0..n-1] by calling
  // set_value on each element
  template<typename Type>
  inline
  void set_values(Active<Type>* a, Index n, const Type* data)
  {
    Index i = 0;
    while (i < n) {
      a[i].set_value(data[i]);
      ++i;
    }
  }

  // Extract the values of an array of n aReal objects
  // Copy the values of a[0..n-1] into data[0..n-1]
  template<typename Type>
  inline
  void get_values(const Active<Type>* a, Index n, Type* data)
  {
    Index i = 0;
    while (i < n) {
      data[i] = a[i].value();
      ++i;
    }
  }
  
  // Set the initial gradients of an array of n aReal objects; this
  // should be done after the algorithm has been run and before the
  // Stack::forward or Stack::reverse functions are called
  // Call set_gradient(data[i]) on each of a[0..n-1]
  template<typename Type>
  inline
  void set_gradients(Active<Type>* a, Index n, const Type* data)
  {
    Index i = 0;
    while (i < n) {
      a[i].set_gradient(data[i]);
      ++i;
    }
  }
  
  // Extract the gradients from an array of aReal objects after the
  // Stack::forward or Stack::reverse functions have been called
  // Call get_gradient(data[i]) on each of a[0..n-1], so that data[i]
  // receives the gradient of a[i]
  template<typename Type>
  inline
  void get_gradients(const Active<Type>* a, Index n, Type* data)
  {
    Index i = 0;
    while (i < n) {
      a[i].get_gradient(data[i]);
      ++i;
    }
  }

  // Print an active scalar to a stream
  // Stream insertion for an active scalar: only the passive value
  // is written
  template<typename Type>
  inline
  std::ostream&
  operator<<(std::ostream& os, const Active<Type>& v)
  {
    return os << v.value();
  }

  // Print an active scalar expression to a stream
  // Stream insertion for an active expression of rank 0: the
  // expression's scalar_value() is written
  template <typename Type, class E>
  inline
  typename internal::enable_if<E::rank == 0 && E::is_active, std::ostream&>::type
  operator<<(std::ostream& os, const Expression<Type,E>& expr)
  {
    return os << expr.scalar_value();
  }

  namespace internal {

    // ---------------------------------------------------------------------
    // Definition of active_scalar
    // ---------------------------------------------------------------------

    // Type selector: active_scalar<T,false>::type is T itself, while
    // active_scalar<T,true>::type is Active<T>

    // General case: leave the type unchanged
    template <class T, bool IsActiveFlag>
    struct active_scalar
    {
      typedef T type;
    };

    // Active case: wrap the type in Active
    template <class T>
    struct active_scalar<T, true>
    {
      typedef Active<T> type;
    };

  }

} // End namespace adept

#endif


================================================
FILE: include/adept/ActiveConstReference.h
================================================
/* ActiveConstReference.h -- Const reference to an active element of an array

    Copyright (C) 2015-2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.


   Provide an active scalar type where the data is actually a
   reference to an element of array. This enables an active array to
   be indexed such that the returned value can be used as an r-value
   and participate in expressions to be differentiated.

*/

#ifndef AdeptActiveConstReference_H
#define AdeptActiveConstReference_H

#include <iostream>
#include <vector>

#include <adept/Active.h>

namespace adept {

  // ---------------------------------------------------------------------
  // Definition of ActiveConstReference class
  // ---------------------------------------------------------------------
  // Read-only active scalar whose value is a const reference to
  // data owned elsewhere (e.g. an element of an active array),
  // paired with the gradient index of that data. It can take part in
  // differentiated expressions as an r-value.
  template <typename Type>
  class ActiveConstReference : public Expression<Type, ActiveConstReference<Type> > {
    // CONTENTS
    // 1. Preamble
    // 2. Constructors
    // 3. Operators
    // 4. Public member functions that don't modify the object
    // 5. Public member functions that modify the object
    // 6. Protected member functions
    // 7. Data

  public:
    // -------------------------------------------------------------------
    // 1. Preamble
    // -------------------------------------------------------------------

    // Static definitions to enable the properties of this type of
    // expression to be discerned at compile time
    static const bool is_active = true;
    static const int  rank      = 0;
    static const int  n_active  = 1 + internal::is_complex<Type>::value;
    static const int  n_arrays  = 0;
    static const int  n_scratch = 0;

    // -------------------------------------------------------------------
    // 2. Constructors
    // -------------------------------------------------------------------

  private:
    // Default construction is not possible because the reference
    // member val_ must be bound to something. The constructor is
    // therefore declared private and deliberately left undefined, so
    // that any attempted use fails at compile or link time.
    ActiveConstReference();

  public:

    // Copy constructor: the new object refers to the same underlying
    // value and gradient index as rhs
    ActiveConstReference(const ActiveConstReference& rhs)
      : val_(rhs.value()), gradient_index_(rhs.gradient_index()) { }
    
    // In order to initialize this object, we pass in a reference to
    // the value as the first argument and the gradient index of that
    // value as the second. Only the reference is stored, so the
    // referenced value must outlive this object.
    ActiveConstReference(const Type& val, Index gradient_index)
      : val_(val), gradient_index_(gradient_index) { 
    }

    // Destructor does not unregister the object from the stack since
    // it is not the only reference to it.
    ~ActiveConstReference() { }


    // -------------------------------------------------------------------
    // 3. Operators
    // -------------------------------------------------------------------
	   
    // Assignment is not supported: the value is held by const
    // reference. The copy-assignment operator is declared private
    // and deliberately left undefined to stop the compiler
    // generating one.
  private:
    ActiveConstReference& operator=(const ActiveConstReference& rhs);

  public:
    // -------------------------------------------------------------------
    // 4. Public member functions that don't modify the object
    // -------------------------------------------------------------------

    // Get the underlying passive value of this object
    const Type& value() const {
      return val_; 
    }

    // Get the index of the gradient information for this object
    const Index& gradient_index() const { return gradient_index_; }

    // If an expression leads to calc_gradient being called on an
    // active object, we push the multiplier and the gradient index on
    // to the operation stack (or 1.0 if no multiplier is specified)
    template <int Rank>
    void calc_gradient(Stack& stack, const ExpressionSize<Rank>&) const {
      stack.push_rhs(1.0, gradient_index_);
    }

    template <int Rank, typename MyType>
    void calc_gradient(Stack& stack, const MyType& multiplier, 
		       const ExpressionSize<Rank>&) const {
      stack.push_rhs(multiplier, gradient_index_);
    }

    // Set the value of the gradient, for initializing an adjoint;
    // note that the value of the gradient is not held in the active
    // object but rather held by the stack
    template <typename MyType>
    void set_gradient(const MyType& gradient) const {
      return ADEPT_ACTIVE_STACK->set_gradients(gradient_index_,
					       gradient_index_+1, 
					       &gradient);
    }

    // Get the value of the gradient, for extracting the adjoint after
    // calling reverse() on the stack
    template <typename MyType>
    void get_gradient(MyType& gradient) const {
      return ADEPT_ACTIVE_STACK->get_gradients(gradient_index_,
					       gradient_index_+1, &gradient);
    }
    // As above, but returning the gradient by value
    Type get_gradient() const {
      Type gradient = 0;
      ADEPT_ACTIVE_STACK->get_gradients(gradient_index_,
					gradient_index_+1, &gradient);
      return gradient;
    }
 

    // For modular codes, some modules may have an existing
    // Jacobian code and possibly be unsuitable for automatic
    // differentiation using Adept (e.g. because they are written in
    // Fortran).  In this case, we can use the following two functions
    // to "wrap" the non-Adept code.

    // Suppose the non-Adept code uses the double values from n aReal
    // objects pointed to by "x" to produce a single double value
    // "y_val" (to be assigned to an aReal object "y"), plus a pointer
    // to an array of forward derivatives "dy_dx".  Firstly you should
    // assign the value using simply "y = y_val;", then call
    // "y.add_derivative_dependence(x, dy_dx, n);" to specify how y
    // depends on x. A fourth argument "multiplier_stride" may be used
    // to stride the indexing to the derivatives, in case they are
    // part of a matrix that is oriented in a different sense.
    void add_derivative_dependence(const Active<Type>* rhs,
				   const Real* multiplier,
				   int n, 
				   int multiplier_stride = 1) const {
#ifdef ADEPT_RECORDING_PAUSABLE
      if (ADEPT_ACTIVE_STACK->is_recording()) {
#endif
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
	// Check there is space in the operation stack for n entries
	ADEPT_ACTIVE_STACK->check_space(n);
#endif
	for (int i = 0; i < n; i++) {
	  Real mult = multiplier[i*multiplier_stride];
	  if (mult != 0.0) {
	    // For each non-zero multiplier, add a pseudo-operation to
	    // the operation stack
	    ADEPT_ACTIVE_STACK->push_rhs(mult,
					 rhs[i].gradient_index());
	  }
	}
	ADEPT_ACTIVE_STACK->push_lhs(gradient_index_);
#ifdef ADEPT_RECORDING_PAUSABLE
      }
#endif
    }

    // Suppose the non-Adept code uses double values from n aReal
    // objects pointed to by "x" and m aReal objects pointed to by "z"
    // to produce a single double value, plus pointers to arrays of
    // forward derivatives "dy_dx" and "dy_dz".  Firstly, as above,
    // you should assign the value using simply "y = y_val;", then
    // call "y.add_derivative_dependence(x, dy_dx, n);" to specify how
    // y depends on x.  To specify also how y depends on z, call
    // "y.append_derivative_dependence(z, dy_dz, m);".  A
    // wrong_gradient exception is thrown if the stack's update_lhs
    // call reports failure for this object's gradient index.
    void append_derivative_dependence(const Active<Type>* rhs,
				      const Real* multiplier,
				      int n,
				      int multiplier_stride = 1) const {
#ifdef ADEPT_RECORDING_PAUSABLE
      if (ADEPT_ACTIVE_STACK->is_recording()) {
#endif
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
	// Check there is space in the operation stack for n entries
	ADEPT_ACTIVE_STACK->check_space(n);
#endif
	for (int i = 0; i < n; i++) {
	  Real mult = multiplier[i*multiplier_stride];
	  if (mult != 0.0) {
	    // For each non-zero multiplier, add a pseudo-operation to
	    // the operation stack
	    ADEPT_ACTIVE_STACK->push_rhs(mult,
					 rhs[i].gradient_index());
	  }
	}
	if (!(ADEPT_ACTIVE_STACK->update_lhs(gradient_index_))) {
	  throw wrong_gradient("Wrong gradient: append_derivative_dependence called on a different aReal object from the most recent add_derivative_dependence call"
			       ADEPT_EXCEPTION_LOCATION);
	}
#ifdef ADEPT_RECORDING_PAUSABLE
      }
#endif
    }

    // For only one independent variable on the rhs, these two
    // functions are convenient as they don't involve pointers. The
    // rhs is taken by const reference so that temporaries (such as
    // the reference objects returned when indexing an array) are
    // accepted as well as named variables.
    template <class T>
    void add_derivative_dependence(const T& rhs, Real multiplier) const {
      ADEPT_ACTIVE_STACK->add_derivative_dependence(gradient_index_,
						    rhs.gradient_index(),
						    multiplier);
    }
    template <class T>
    void append_derivative_dependence(const T& rhs, Real multiplier) const {
      ADEPT_ACTIVE_STACK->append_derivative_dependence(gradient_index_,
						       rhs.gradient_index(),
						       multiplier);
    }
 
    // -------------------------------------------------------------------
    // 4.1. Public member functions used by other expressions
    // -------------------------------------------------------------------

    // Rank-0 dimension query: "dim" is left untouched and true is
    // always returned
    bool get_dimensions_(ExpressionSize<0>& dim) const { return true; }

    // Describe this object as "ActiveConstReference(<value>)"
    std::string expression_string_() const {
      std::stringstream s;
      s << "ActiveConstReference(" << val_ << ")";
      return s.str();
    }

    // True if the address of the referenced value lies within the
    // closed range [mem1, mem2]
    bool is_aliased_(const Type* mem1, const Type* mem2) const { 
      return &val_ >= mem1 && &val_ <= mem2; 
    }

    // The following accessors ignore their location/scratch
    // arguments and simply return the referenced value
    Type value_with_len_(const Index& j, const Index& len) const
    { return val_; }

    template <int MyArrayNum, int NArrays>
    void advance_location_(ExpressionSize<NArrays>& loc) const { } 

    template <int MyArrayNum, int NArrays>
    Type value_at_location_(const ExpressionSize<NArrays>& loc) const
    { return val_; }
    
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    Type value_at_location_store_(const ExpressionSize<NArrays>& loc,
				internal::ScratchVector<NScratch>& scratch) const
    { return val_; }

    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    Type value_stored_(const ExpressionSize<NArrays>& loc,
		     const internal::ScratchVector<NScratch>& scratch) const
    { return val_; }

    // Push the gradient index on to the stack with multiplier 1.0
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    void calc_gradient_(Stack& stack, 
			const ExpressionSize<NArrays>& loc,
			const internal::ScratchVector<NScratch>& scratch) const {
      stack.push_rhs(1.0, gradient_index_);
    }

    // Push the gradient index on to the stack with the supplied
    // multiplier
    template <int MyArrayNum, int MyScratchNum, 
	      int NArrays, int NScratch, typename MyType>
    void calc_gradient_(Stack& stack, 
			const ExpressionSize<NArrays>& loc,
			const internal::ScratchVector<NScratch>& scratch,
			const MyType& multiplier) const {
      stack.push_rhs(multiplier, gradient_index_);
    }

    template <int MyArrayNum, int Rank, int NArrays>
    void set_location_(const ExpressionSize<Rank>& i, 
		       ExpressionSize<NArrays>& index) const {}


    // The Stack::independent(x) and Stack::dependent(y) functions add
    // the gradient_index of objects x and y to std::vector<uIndex>
    // objects in Stack. Since x and y may be scalars or arrays, this
    // is best done by delegating to the ActiveConstReference or Array classes.
    template <typename IndexType>
    void push_gradient_indices(std::vector<IndexType>& vec) const {
      vec.push_back(gradient_index_);
    }

    // -------------------------------------------------------------------
    // 5. Public member functions that modify the object
    // -------------------------------------------------------------------

    // Set the value.
    // NOTE(review): val_ is a reference to const, so the assignment
    // below cannot compile if this member template is ever
    // instantiated; it is kept only so the declared interface is
    // unchanged. Confirm whether it can be removed.
    template <typename MyType>
    void set_value(const MyType& x) { val_ = x; }
    
    // -------------------------------------------------------------------
    // 6. Protected member functions
    // -------------------------------------------------------------------
  protected:
    
    // -------------------------------------------------------------------
    // 7. Data
    // -------------------------------------------------------------------
  private:
    const Type& val_;              // Reference to the numerical value
    Index gradient_index_;         // Index to where the corresponding
				   // gradient will be held during the
				   // adjoint calculation
  }; // End of definition of ActiveConstReference


  // ---------------------------------------------------------------------
  // Helper function for ActiveConstReference class
  // ---------------------------------------------------------------------

  // Stream insertion for a const active reference: only the
  // referenced passive value is written
  template<typename Type>
  inline
  std::ostream&
  operator<<(std::ostream& os, const ActiveConstReference<Type>& v)
  {
    return os << v.value();
  }


  namespace internal {

    // ---------------------------------------------------------------------
    // active_const_reference
    // ---------------------------------------------------------------------

    // Type selector: active_const_reference<T,false>::type is
    // "const T&", while active_const_reference<T,true>::type is
    // ActiveConstReference<T>

    // General case: a plain const reference
    template <class T, bool IsActiveFlag>
    struct active_const_reference
    {
      typedef const T& type;
    };

    // Active case: the const active reference class
    template <class T>
    struct active_const_reference<T, true>
    {
      typedef ActiveConstReference<T> type;
    };
  }


} // End namespace adept

#endif


================================================
FILE: include/adept/ActiveReference.h
================================================
/* ActiveReference.h -- Reference to an active element of an array

    Copyright (C) 2015-2018 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.


   Provide an active scalar type where the data is actually a
   reference to an element of array. This enables an active array to
   be indexed such that the returned value can be used as an l-value
   and participate in expressions to be differentiated.

*/

#ifndef AdeptActiveReference_H
#define AdeptActiveReference_H

#include <iostream>
#include <vector>

#include <adept/Active.h>

namespace adept {

  // ---------------------------------------------------------------------
  // Definition of ActiveReference class
  // ---------------------------------------------------------------------
  template <typename Type>
  class ActiveReference : public Expression<Type, ActiveReference<Type> > {
    // CONTENTS
    // 1. Preamble
    // 2. Constructors
    // 3. Operators
    // 4. Public member functions that don't modify the object
    // 5. Public member functions that modify the object
    // 6. Protected member functions
    // 7. Data

  public:
    // -------------------------------------------------------------------
    // 1. Preamble
    // -------------------------------------------------------------------

    // Static definitions to enable the properties of this type of
    // expression to be discerned at compile time
    // This expression type is flagged as active
    static const bool is_active = true;
    // Rank 0, i.e. a scalar
    static const int  rank      = 0;
    // 1, or 2 when internal::is_complex<Type>::value is true
    static const int  n_active  = 1 + internal::is_complex<Type>::value;
    // No array or scratch slots are declared for this expression
    static const int  n_arrays  = 0;
    static const int  n_scratch = 0;

    // -------------------------------------------------------------------
    // 2. Constructors
    // -------------------------------------------------------------------

  private:
    // There is only one way to construct an ActiveReference, so all
    // others that would otherwise be generated by the compiler are
    // made inaccessible
    // Private default constructor, so it cannot be used from
    // outside the class.
    // NOTE(review): if val_ is a reference member (as the other
    // constructors suggest), this body cannot be instantiated and a
    // declaration without a definition would be cleaner - confirm
    // against the data section.
    ActiveReference() { }

    // Private copy constructor taking a non-const reference; it
    // takes the value from rhs.lvalue() and copies the gradient index.
    // NOTE(review): being private, it is presumably the overload
    // chosen (and rejected on access) when copying from a non-const
    // lvalue, whereas the public const overload handles the other
    // cases - confirm this is intended.
    ActiveReference(ActiveReference& rhs)
      : val_(rhs.lvalue()), gradient_index_(rhs.gradient_index()) { }

  public:
    
    // In order to initialize this object, we pass in a reference to
    // the value as the first argument and the gradient index from
    // the location in the array as the second argument.
    ActiveReference(Type& val, Index gradient_index)
      : val_(val),
        gradient_index_(gradient_index)
    {
      // Nothing else to do: nothing is pushed on to the stack here
    }

    // 
    // Public copy constructor from a const object: the new object
    // takes its value from rhs.lvalue() and copies the gradient
    // index of rhs. The const_cast is needed because lvalue() is
    // called on rhs, which is const here (lvalue() is presumably a
    // non-const member returning Type& - confirm).
    ActiveReference(const ActiveReference& rhs)
      : val_(const_cast<ActiveReference<Type>&>(rhs).lvalue()),
	gradient_index_(rhs.gradient_index()) { }

    // Destructor does not unregister the object from the stack since
    // it is not the only reference to it.
    ~ActiveReference() {
      // Intentionally empty
    }


    // -------------------------------------------------------------------
    // 3. Operators
    // -------------------------------------------------------------------
	   
    // Assignment operator with an inactive variable on the rhs
    // Assign a passive (non-expression) value: the referenced value
    // is overwritten and, when recording, the gradient index is
    // pushed as a left-hand side with no right-hand-side operations
    template <typename PType>
    typename internal::enable_if<internal::is_not_expression<PType>::value,
                                 ActiveReference&>::type
    operator=(const PType& rhs) {
      val_ = rhs;
#ifdef ADEPT_RECORDING_PAUSABLE
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        return *this;
      }
#endif
      // Pushing the gradient index on to the statement stack with no
      // corresponding operations ensures that the gradient will be
      // set to zero in the reverse pass when it is finished with
      ADEPT_ACTIVE_STACK->push_lhs(gradient_index_);
      return *this;
    }

    // Assignment operator with an active variable on the rhs: first a
    // non-template version because otherwise compiler will generate
    // its own
    // Copy assignment from another ActiveReference. When recording,
    // the value and the gradient operation are obtained together
    // from rhs and this object's gradient index is pushed as the
    // left-hand side; when recording is paused only the value is
    // copied.
    ActiveReference& operator=(const ActiveReference& rhs) {
#ifdef ADEPT_RECORDING_PAUSABLE
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        val_ = rhs.scalar_value();
        return *this;
      }
#endif
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
      // Make sure the operation stack has room for one more entry
      ADEPT_ACTIVE_STACK->check_space(1);
#endif
      val_ = rhs.scalar_value_and_gradient(*ADEPT_ACTIVE_STACK);
      ADEPT_ACTIVE_STACK->push_lhs(gradient_index_);
      return *this;
    }

    // Assignment operator with an active variable on the rhs
    // Assignment from an Active scalar. When recording, the value
    // and the gradient operation are obtained together from rhs and
    // this object's gradient index is pushed as the left-hand side;
    // when recording is paused only the value is copied.
    template <class AType>
    ActiveReference& operator=(const Active<AType>& rhs) {
#ifdef ADEPT_RECORDING_PAUSABLE
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        val_ = rhs.scalar_value();
        return *this;
      }
#endif
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
      // Make sure the operation stack has room for one more entry
      ADEPT_ACTIVE_STACK->check_space(1);
#endif
      val_ = rhs.scalar_value_and_gradient(*ADEPT_ACTIVE_STACK);
      ADEPT_ACTIVE_STACK->push_lhs(gradient_index_);
      return *this;
    }
    
    // Assignment operator with an expression on the rhs
    // Assignment from an active expression of rank 0. When
    // recording, space for E::n_active entries is checked, the value
    // and gradient operations are obtained from rhs, and this
    // object's gradient index is pushed as the left-hand side; when
    // recording is paused only the value is evaluated.
    template <typename AType, class E>
    typename internal::enable_if<E::is_active && E::rank==0, ActiveReference&>::type
    operator=(const Expression<AType, E>& rhs) {
#ifdef ADEPT_RECORDING_PAUSABLE
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        val_ = rhs.scalar_value();
        return *this;
      }
#endif
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
      ADEPT_ACTIVE_STACK->check_space_static<E::n_active>();
#endif
      val_ = rhs.scalar_value_and_gradient(*ADEPT_ACTIVE_STACK);
      ADEPT_ACTIVE_STACK->push_lhs(gradient_index_);
      return *this;
    }
  
    // All the compound assignment operators are unpacked, i.e. a+=b
    // becomes a=a+b; first for an Expression on the rhs
    // a += expr is carried out as a = a + expr
    template<typename AType, class E>
    typename internal::enable_if<E::rank==0, ActiveReference&>::type
    operator+=(const Expression<AType,E>& rhs) {
      *this = (*this + rhs);
      return *this;
    }
    // a -= expr is carried out as a = a - expr
    template<typename AType, class E>
    typename internal::enable_if<E::rank==0, ActiveReference&>::type
    operator-=(const Expression<AType,E>& rhs) {
      *this = (*this - rhs);
      return *this;
    }
    // a *= expr is carried out as a = a * expr
    template<typename AType, class E>
    typename internal::enable_if<E::rank==0, ActiveReference&>::type
    operator*=(const Expression<AType,E>& rhs) {
      *this = (*this * rhs);
      return *this;
    }
    // a /= expr is carried out as a = a / expr
    template<typename AType, class E>
    typename internal::enable_if<E::rank==0, ActiveReference&>::type
    operator/=(const Expression<AType,E>& rhs) {
      *this = (*this / rhs);
      return *this;
    }

    // And likewise for a passive scalar on the rhs
    // Add a passive value: only the referenced value is updated and
    // nothing is pushed on to the stack here (adding a constant
    // leaves the derivative unchanged)
    template <typename PType>
    typename internal::enable_if<internal::is_not_expression<PType>::value,
                                 ActiveReference&>::type
    operator+=(const PType& rhs)
    {
      val_ += rhs;
      return *this;
    }
    // Subtract a passive value: only the referenced value is updated
    // and nothing is pushed on to the stack here (subtracting a
    // constant leaves the derivative unchanged)
    template <typename PType>
    typename internal::enable_if<internal::is_not_expression<PType>::value,
                                 ActiveReference&>::type
    operator-=(const PType& rhs)
    {
      val_ -= rhs;
      return *this;
    }
    // a *= passive is carried out as a = a * passive, so that the
    // scaling goes through the assignment operator
    template <typename PType>
    typename internal::enable_if<internal::is_not_expression<PType>::value,
                                 ActiveReference&>::type
    operator*=(const PType& rhs)
    {
      *this = (*this * rhs);
      return *this;
    }
    // a /= passive is carried out as a = a / passive, so that the
    // scaling goes through the assignment operator
    template <typename PType>
    typename internal::enable_if<internal::is_not_expression<PType>::value,
                                 ActiveReference&>::type
    operator/=(const PType& rhs)
    {
      *this = (*this / rhs);
      return *this;
    }

      
    // -------------------------------------------------------------------
    // 4. Public member functions that don't modify the object
    // -------------------------------------------------------------------

    // Get the underlying passive value of this object
    // Returns a copy of the underlying passive value
    Type value() const
    {
      return val_;
    }

    // Get the index of the gradient information for this object
    // Returns a const reference to the stored gradient index
    const Index& gradient_index() const {
      return gradient_index_;
    }

    // If an expression leads to calc_gradient being called on an
    // active object, we push the multiplier and the gradient index on
    // to the operation stack (or 1.0 if no multiplier is specified)
    // No multiplier given: push the gradient index with 1.0; the
    // ExpressionSize argument is unused
    template <int Rank>
    void calc_gradient(Stack& stack, const ExpressionSize<Rank>&) const
    {
      stack.push_rhs(1.0, gradient_index_);
    }

    // Push the gradient index with the supplied multiplier; the
    // ExpressionSize argument is unused
    template <int Rank, typename MyType>
    void calc_gradient(Stack& stack, const MyType& multiplier,
                       const ExpressionSize<Rank>&) const
    {
      stack.push_rhs(multiplier, gradient_index_);
    }

    // Set the value of the gradient, for initializing an adjoint;
    // note that the value of the gradient is not held in the active
    // object but rather held by the stack
    // Hand "gradient" to the active stack's set_gradients, passing
    // gradient_index_ and gradient_index_+1 as the index range
    template <typename MyType>
    void set_gradient(const MyType& gradient) const {
      ADEPT_ACTIVE_STACK->set_gradients(gradient_index_, gradient_index_+1,
                                        &gradient);
    }

    // Get the value of the gradient, for extracting the adjoint after
    // calling reverse() on the stack
    // Ask the active stack's get_gradients to write into "gradient",
    // passing gradient_index_ and gradient_index_+1 as the index
    // range
    template <typename MyType>
    void get_gradient(MyType& gradient) const {
      ADEPT_ACTIVE_STACK->get_gradients(gradient_index_, gradient_index_+1,
                                        &gradient);
    }
    // Convenience overload: fetch the gradient from the active stack
    // and return it by value
    Type get_gradient() const {
      Type result = 0;
      ADEPT_ACTIVE_STACK->get_gradients(gradient_index_, gradient_index_+1,
                                        &result);
      return result;
    }
 

    // For modular codes, some modules may have an existing
    // Jacobian code and possibly be unsuitable for automatic
    // differentiation using Adept (e.g. because they are written in
    // Fortran).  In this case, we can use the following two functions
    // to "wrap" the non-Adept code.

    // Suppose the non-adept code uses the double values from n aReal
    // objects pointed to by "x" to produce a single double value
    // "y_val" (to be assigned to an aReal object "y"), plus a pointer
    // to an array of forward derivatives "dy_dx".  Firstly you should
    // assign the value using simply "y = y_val;", then call
    // "y.add_derivative_dependence(x, dy_dx, n);" to specify how y
    // depends on x. A fourth argument "multiplier_stride" may be used
    // to stride the indexing to the derivatives, in case they are
    // part of a matrix that is oriented in a different sense.
    void add_derivative_dependence(const Active<Type>* rhs,
				   const Real* multiplier,
				   int n, 
				   int multiplier_stride = 1) const {
#ifdef ADEPT_RECORDING_PAUSABLE
      if (ADEPT_ACTIVE_STACK->is_recording()) {
#endif
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
	// Check there is space in the operation stack for n entries
	ADEPT_ACTIVE_STACK->check_space(n);
#endif
	for (int i = 0; i < n; i++) {
	  Real mult = multiplier[i*multiplier_stride];
	  if (mult != 0.0) {
	    // For each non-zero multiplier, add a pseudo-operation to
	    // the operation stack
	    ADEPT_ACTIVE_STACK->push_rhs(mult,
					 rhs[i].gradient_index());
	  }
	}
	ADEPT_ACTIVE_STACK->push_lhs(gradient_index_);
#ifdef ADEPT_RECORDING_PAUSABLE
      }
#endif
    }

    // Suppose the non-Adept code uses double values from n aReal
    // objects pointed to by "x" and m aReal objects pointed to by "z"
    // to produce a single double value, plus pointers to arrays of
    // forward derivatives "dy_dx" and "dy_dz".  Firstly, as above,
    // you should assign the value using simply "y = y_val;", then
    // call "y.add_derivative_dependence(x, dy_dx, n);" to specify how
    // y depends on x.  To specify also how y depends on z, call
    // "y.append_derivative_dependence(z, dy_dz, n);".
    template <typename T>
    void append_derivative_dependence(const Active<Type>* rhs,
				      const Real* multiplier,
				      int n,
				      int multiplier_stride = 1) const {
#ifdef ADEPT_RECORDING_PAUSABLE
      if (ADEPT_ACTIVE_STACK->is_recording()) {
#endif
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
	// Check there is space in the operation stack for n entries
	ADEPT_ACTIVE_STACK->check_space(n);
#endif
	for (int i = 0; i < n; i ++) {
	  Real mult = multiplier[i*multiplier_stride];
	  if (mult != 0.0) {
	    // For each non-zero multiplier, add a pseudo-operation to
	    // the operation stack
	    ADEPT_ACTIVE_STACK->push_rhs(mult,
					 rhs[i].gradient_index());
	  }
	}
	if (!(ADEPT_ACTIVE_STACK->update_lhs(gradient_index_))) {
	  throw wrong_gradient("Wrong gradient: append_derivative_dependence called on a different aReal object from the most recent add_derivative_dependence call"
			       ADEPT_EXCEPTION_LOCATION);
	}
#ifdef ADEPT_RECORDING_PAUSABLE
      }
#endif
    }

    // Pointer-free convenience forms for the case of a single
    // independent variable on the right-hand side: both simply hand
    // the two gradient indices and the derivative to the stack
    template <class T>
    void add_derivative_dependence(T& rhs, Real multiplier) const {
      ADEPT_ACTIVE_STACK->add_derivative_dependence(gradient_index_,
                                                    rhs.gradient_index(),
                                                    multiplier);
    }

    template <class T>
    void append_derivative_dependence(T& rhs, Real multiplier) const {
      ADEPT_ACTIVE_STACK->append_derivative_dependence(gradient_index_,
                                                       rhs.gradient_index(),
                                                       multiplier);
    }
     
 
    // -------------------------------------------------------------------
    // 4.1. Public member functions used by other expressions
    // -------------------------------------------------------------------
    // A scalar has no dimensions to report: "dim" is left untouched
    // and success is always returned
    bool get_dimensions_(ExpressionSize<0>& dim) const {
      return true;
    }

    // Return a string of the form "ActiveReference(<value>)"
    // describing this object
    std::string expression_string_() const {
      std::stringstream description;
      description << "ActiveReference(";
      description << val_;
      description << ")";
      return description.str();
    }

    bool is_aliased_(const Type* mem1, const Type* mem2) const { 
      return &val_ >= mem1 && &val_ <= mem2; 
    }

    // The following evaluation functions all ignore their location,
    // length and scratch arguments, since a scalar reference has only
    // one value to return

    // Value at index j of a vector of length len
    Type value_with_len_(const Index& j, const Index& len) const {
      return val_;
    }

    // Advancing the location has no effect
    template <int MyArrayNum, int NArrays>
    void advance_location_(ExpressionSize<NArrays>& loc) const {
    }

    // Value at the specified location
    template <int MyArrayNum, int NArrays>
    Type value_at_location_(const ExpressionSize<NArrays>& loc) const {
      return val_;
    }

    // Value at the specified location; nothing is written to the
    // scratch vector
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    Type value_at_location_store_(const ExpressionSize<NArrays>& loc,
                                  internal::ScratchVector<NScratch>& scratch) const {
      return val_;
    }

    // Value at the specified location; nothing is read from the
    // scratch vector
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    Type value_stored_(const ExpressionSize<NArrays>& loc,
                       const internal::ScratchVector<NScratch>& scratch) const {
      return val_;
    }

    // Push this variable onto the right-hand side of the statement
    // being recorded on "stack", with a multiplier of one; "loc" and
    // "scratch" are not used
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    void calc_gradient_(Stack& stack,
                        const ExpressionSize<NArrays>& loc,
                        const internal::ScratchVector<NScratch>& scratch) const {
      stack.push_rhs(1.0, gradient_index_);
    }

    // As above but with the multiplier supplied by the caller
    template <int MyArrayNum, int MyScratchNum,
              int NArrays, int NScratch, typename MyType>
    void calc_gradient_(Stack& stack,
                        const ExpressionSize<NArrays>& loc,
                        const internal::ScratchVector<NScratch>& scratch,
                        const MyType& multiplier) const {
      stack.push_rhs(multiplier, gradient_index_);
    }

    // Setting the location is a no-op for a scalar reference: neither
    // argument is used
    template <int MyArrayNum, int Rank, int NArrays>
    void set_location_(const ExpressionSize<Rank>& i,
                       ExpressionSize<NArrays>& index) const {
    }


    // Stack::independent(x) and Stack::dependent(y) collect the
    // gradient indices of x and y in std::vector<uIndex> objects held
    // by the Stack.  Because x and y may be scalars or arrays, the
    // work is delegated to the ActiveReference or Array classes; for
    // a scalar reference a single index is appended to "vec".
    template <typename IndexType>
    void push_gradient_indices(std::vector<IndexType>& vec) const {
      vec.push_back(gradient_index_);
    }

    // -------------------------------------------------------------------
    // 5. Public member functions that modify the object
    // -------------------------------------------------------------------

    // Overwrite the referenced numerical value with x; only val_ is
    // written here
    template <typename MyType>
    void set_value(const MyType& x) {
      val_ = x;
    }
    
    // -------------------------------------------------------------------
    // 6. Protected member functions
    // -------------------------------------------------------------------
  protected:
    
    // Provide a non-const reference to the underlying passive data,
    // for use when creating active references
    Type& lvalue() {
      return val_;
    }

    // -------------------------------------------------------------------
    // 7. Data
    // -------------------------------------------------------------------
  private:
    Type& val_;                    // Reference to the numerical value
    Index gradient_index_;         // Index to where the corresponding
				   // gradient will be held during the
				   // adjoint calculation
  }; // End of definition of ActiveReference


  // ---------------------------------------------------------------------
  // Helper function for ActiveReference class
  // ---------------------------------------------------------------------

  // Stream insertion: writes only the numerical value of the
  // reference, as returned by value()
  template<typename Type>
  inline std::ostream& operator<<(std::ostream& os,
                                  const ActiveReference<Type>& v) {
    os << v.value();
    return os;
  }


  namespace internal {
    
    // ---------------------------------------------------------------------
    // active_reference
    // ---------------------------------------------------------------------

    // Return the active reference version of Type if it is active,
    // otherwise just return Type&

    // General case (IsActive is false): a plain reference to Type
    template <class Type, bool IsActive>
    struct active_reference {
      typedef Type& type;
    };

    // Active case: an ActiveReference object wrapping Type
    template <class Type>
    struct active_reference<Type, true> {
      typedef ActiveReference<Type> type;
    };
  }


} // End namespace adept

#endif


================================================
FILE: include/adept/Allocator.h
================================================
/* Allocator.h -- Allocates elements to arrays

    Copyright (C) 2015 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/

#ifndef AdeptAllocator_H
#define AdeptAllocator_H 1

#include <adept/Array.h>
//#include <adept/SpecialMatrix.h>

namespace adept {
  namespace internal {
   
    // Helper object implementing the "A << 1, 2, 3" syntax for
    // filling an existing array "A" of rank Rank and type A.  It
    // keeps a cursor (coords_) into the target array; each scalar or
    // array expression passed via "<<" (or "," for scalars) is copied
    // to the cursor position and the cursor is advanced along the
    // final dimension.  When the end of a row is reached the cursor
    // moves on by the size of the objects that made up that row.  An
    // index_out_of_bounds exception is thrown if the array is
    // overfilled or if objects of different sizes are mixed in a row.
    template <int Rank, class A>
    class Allocator {
    public:
      // Create an allocator object for "array" and copy the first
      // object into it
      template <typename F>
      Allocator(A& array, const F& first_arg)
        : array_(array), size_(array.dimensions()),
          obj_size_(0), coords_(0),
          scalar_size_(1) {
        *this << first_arg;
      }

      // Copy a scalar into the array at the cursor position
      template <typename T>
      typename enable_if<is_not_expression<T>::value,Allocator&>::type
      operator<<(const T& x) {
        if (coords_[Rank-1] >= size_[Rank-1]) {
          // We have reached the end of the array: move to next row
          complete_row<Rank>();
          // All dimensions of this object are of length 1
          obj_size_.set_all(1);
        }
        else if (coords_[Rank-1] == 0) {
          // At the beginning of a row: set the size of the template
          // object to that of a scalar
          obj_size_ = scalar_size_;
        }
        else if (obj_size_ != scalar_size_) {
          // The template object size is not the same as a scalar,
          // indicating that dissimilar objects have been concatenated
          // in a row
          throw index_out_of_bounds("Scalar added to array with \"<<\" when previous objects on row were not scalar"
                                    ADEPT_EXCEPTION_LOCATION);
        }
        // Add the scalar to the array and increment the final index
        array_.get_lvalue(coords_) = x;
        ++coords_[Rank-1];
        return *this;
      }


      // Copy an expression (of rank no greater than that of the
      // target array) into the array at the cursor position
      template <typename T, class E>
      typename enable_if<(E::rank <= Rank), Allocator&>::type
      operator<<(const Expression<T,E>& x) {
        // Evaluate expression and store in an Array of the same rank
        // (if Expression is already an Array then this will make a
        // shallow copy). Ought to check for aliasing.
        const Array<E::rank,T,E::is_active> xx(x.cast());
        // Extent of the object in all but the final dimension of the
        // target array (missing leading dimensions count as 1)
        ExpressionSize<Rank-1> leading_dim;
        partial_copy(xx.dimensions(), leading_dim);

        if (coords_[Rank-1] >= size_[Rank-1]) {
          // We have reached the end of the array: move to next row
          complete_row<Rank>();
        }
        if (coords_[Rank-1] == 0) {
          // First object of a row defines the row's template size
          partial_copy(xx.dimensions(), obj_size_);
        }
        else if (obj_size_ != leading_dim) {
          // The template object size is not the same as the current
          // array, indicating that dissimilar objects have been
          // concatenated in a row
          throw index_out_of_bounds("Expression added to array with \"<<\" does not match size of previous objects on row"
                                    ADEPT_EXCEPTION_LOCATION);
        }

        // Before writing anything, verify that the object fits
        // between the cursor and the end of the array in every
        // dimension; without this an oversized object would be
        // written beyond the bounds of the target array
        if (coords_[Rank-1] + xx.dimension(E::rank-1) > size_[Rank-1]) {
          throw index_out_of_bounds("Row overflow in filling array with expression using \"<<\""
                                    ADEPT_EXCEPTION_LOCATION);
        }
        for (int idim = 0; idim < Rank-1; ++idim) {
          if (coords_[idim] + leading_dim[idim] > size_[idim]) {
            throw index_out_of_bounds("Dimension overflow in filling array with expression using \"<<\""
                                      ADEPT_EXCEPTION_LOCATION);
          }
        }

        // Add the object to the array and increment the final index
        ExpressionSize<Rank> i_lhs(coords_);
        ExpressionSize<E::rank> i_rhs(0);
        int rank;
        do {
          array_.get_lvalue(i_lhs) = xx.get_rvalue(i_rhs);
          advance_index(rank, i_lhs, i_rhs, xx.dimensions());
        }
        while (rank >= 0);

        coords_[Rank-1] += xx.dimension(E::rank-1);
        return *this;
      }

      // Step to the next element of the object being copied: i_rhs is
      // the index into the source (of extent "size") and i_lhs the
      // matching index into the target, whose trailing RhsRank
      // dimensions correspond to those of the source.  On return
      // "rank" is the dimension that was incremented, or negative if
      // every element has been visited.
      template <int RhsRank>
      void advance_index(int& rank, ExpressionSize<Rank>& i_lhs,
                         ExpressionSize<RhsRank>& i_rhs,
                         const ExpressionSize<RhsRank>& size) const {
        rank = RhsRank;
        while (--rank >= 0) {
          if (++i_rhs[rank] >= size[rank]) {
            // This dimension is exhausted: rewind it in both indices
            // and carry on to the next slower-varying dimension
            i_rhs[rank] = 0;
            i_lhs[rank+(Rank-RhsRank)] -= (size[rank]-1);
          }
          else {
            ++i_lhs[rank+(Rank-RhsRank)];
            break;
          }
        }
      }

      // Comma operator does the same as "<<" operator
      template <typename T>
      typename enable_if<is_not_expression<T>::value,Allocator&>::type
      operator,(const T& x) {
        return *this << x;
      }

    protected:
      // A vector should never complete a row as this indicates it has
      // been overfilled
      template <int MyRank>
      typename enable_if<(MyRank <= 1), void>::type
      complete_row() {
        throw index_out_of_bounds("Row overflow in filling Vector with \"<<\""
                                  ADEPT_EXCEPTION_LOCATION);
      }

      // Multi-dimensional arrays: move to next row, checking which
      // dimensions have been filled
      template <int MyRank>
      typename enable_if<(MyRank > 1), void>::type
      complete_row() {
        int next_dim = Rank-2;
        while (next_dim >= 0) {
          if (coords_[next_dim]+obj_size_[next_dim] < size_[next_dim]) {
            // Room remains in this dimension: advance by the size of
            // the objects in the row just completed and reset all
            // faster-varying coordinates
            coords_[next_dim] += obj_size_[next_dim];
            for (int i = next_dim+1; i < Rank; ++i) {
              coords_[i] = 0;
            }
            break;
          }
          --next_dim;
        }
        if (next_dim < 0) {
          throw index_out_of_bounds("Dimension overflow in filling array with \"<<\""
                                    ADEPT_EXCEPTION_LOCATION);
        }
        // The template size is set again by the first object of the
        // new row
        obj_size_.set_all(0);
      }

      // Copy all but the final dimension of "from" into the trailing
      // entries of "to", setting any leading entries of "to" that
      // have no counterpart in "from" to 1
      template <int MyRank>
      typename enable_if<(MyRank > 1), void>::type
      partial_copy(const ExpressionSize<MyRank>& from,
                   ExpressionSize<Rank-1>& to) const {
        for (int i = 0; i < Rank-MyRank; ++i) {
          to[i] = 1;
        }
        for (int i = Rank-MyRank; i < Rank-1; ++i) {
          to[i] = from[i+(MyRank-Rank)];
        }
      }

      // Objects of rank 1 or less have extent 1 in all but the final
      // dimension
      template <int MyRank>
      typename enable_if<(MyRank <= 1), void>::type
      partial_copy(const ExpressionSize<MyRank>& from,
                   ExpressionSize<Rank-1>& to) const {
        to.set_all(1);
      }


    protected:
      A& array_;                          // Array being filled
      const ExpressionSize<Rank> size_;   // Dimensions of array_
      ExpressionSize<Rank-1> obj_size_;   // Leading dimensions of the
                                          // objects in the current row
      ExpressionSize<Rank> coords_;       // Location of next write
      const ExpressionSize<Rank-1> scalar_size_; // obj_size_ of a scalar
    };
    
  }

  // Entry point for the "A << 1, 2, 3" filling syntax: returns an
  // Allocator bound to "array" that has already consumed the first
  // argument x.  Throws empty_array if the array is empty.
  template <int Rank, typename T, bool IsActive, typename E>
  internal::Allocator<Rank,Array<Rank,T,IsActive> >
  operator<<(Array<Rank,T,IsActive>& array, const E& x) {
    if (!array.empty()) {
      return internal::Allocator<Rank,Array<Rank,T,IsActive> >(array, x);
    }
    throw empty_array("Attempt to fill empty array with \"<<\""
                      ADEPT_EXCEPTION_LOCATION);
  }

}


#endif


================================================
FILE: include/adept/Array.h
================================================
/* Array.h -- active or inactive Array of arbitrary rank

    Copyright (C) 2014-2021 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.


   The Array class has functionality modelled on Fortran-90 arrays -
   they can have a rank up to 7 (above will work, but some forms of
   indexing these arrays will not work).

*/

#ifndef AdeptArray_H
#define AdeptArray_H 1

#include <iostream>
#include <sstream>
#include <limits>
#include <string>

#include <adept/base.h>

#ifdef ADEPT_CXX11_FEATURES
#include <initializer_list>
#endif

#include <adept/Storage.h>
#include <adept/Expression.h>
#include <adept/RangeIndex.h>
#include <adept/ActiveReference.h>
#include <adept/ActiveConstReference.h>
#include <adept/IndexedArray.h>
#include <adept/where.h>
#include <adept/noalias.h>
#include <adept/GradientIndex.h>

namespace adept {

  // Available styles for writing arrays to a stream
  enum ArrayPrintStyle {
    PRINT_STYLE_PLAIN  = 0,
    PRINT_STYLE_CSV    = 1,
    PRINT_STYLE_CURLY  = 2,
    PRINT_STYLE_MATLAB = 3
  };

  // Storage order of a matrix; the numerical values are fixed
  enum MatrixStorageOrder {
    ROW_MAJOR = 0,
    COL_MAJOR = 1
  };

  // Forward declarations to enable diag_matrix
  template <typename, class, bool> class SpecialMatrix;
  namespace internal {
    template <MatrixStorageOrder, Index, Index> struct BandEngine;
  }

  // Forward declaration to enable linking at construction and via
  // link to FixedArray
  template <typename, bool, Index, Index, Index, Index, Index, Index, Index>
  class FixedArray;

  namespace internal {

    // -------------------------------------------------------------------
    // Global variables
    // -------------------------------------------------------------------
    // The following global variables affect the behaviour of the
    // Array class, and are modified using set_*

    // This is "true" by default: row-major is the normal C/C++
    // convention
    extern bool array_row_major_order;

    // When arrays are sent to a stream the dimensions can be grouped
    // with curly brackets
    //    extern bool array_print_curly_brackets;

    // Variables describing how arrays are written to a stream
    extern ArrayPrintStyle array_print_style;
    extern std::string vector_separator;
    extern std::string vector_print_before;
    extern std::string vector_print_after;
    extern std::string array_opening_bracket;
    extern std::string array_closing_bracket;
    extern std::string array_contiguous_separator;
    extern std::string array_non_contiguous_separator;
    extern std::string array_print_before;
    extern std::string array_print_after;
    extern std::string array_print_empty_before;
    extern std::string array_print_empty_after;
    extern bool array_print_indent;
    extern bool array_print_empty_rank;

    // Forward declaration to enable Array::where()
    //    template <class A, class B> class Where;

    // -------------------------------------------------------------------
    // Helper classes
    // -------------------------------------------------------------------

    // Provide the name used by expression_string() for an array of a
    // given rank and activeness.  General case: inactive array of
    // arbitrary rank
    template <int Rank, bool IsActive>
    struct array_helper {
      const char* name() { return "Array"; }
    };

    // Active array of arbitrary rank
    template <int Rank>
    struct array_helper<Rank,true> {
      const char* name() { return "aArray"; }
    };

    // Rank-1 arrays are reported as vectors
    template <>
    struct array_helper<1,false> {
      const char* name() { return "Vector"; }
    };
    template <>
    struct array_helper<1,true> {
      const char* name() { return "aVector"; }
    };

    // Rank-2 arrays are reported as matrices
    template <>
    struct array_helper<2,false> {
      const char* name() { return "Matrix"; }
    };
    template <>
    struct array_helper<2,true> {
      const char* name() { return "aMatrix"; }
    };

  } // End namespace internal


  // -------------------------------------------------------------------
  // Definition of Array class
  // -------------------------------------------------------------------
  template<int Rank, typename Type = Real, bool IsActive = false>
  class Array
    : public Expression<Type,Array<Rank,Type,IsActive> >,
      protected internal::GradientIndex<IsActive> {

  public:
    // -------------------------------------------------------------------
    // Array: 1. Static Definitions
    // -------------------------------------------------------------------

    // The Expression base class needs access to some protected member
    // functions in section 5
    friend struct Expression<Type,Array<Rank,Type,IsActive> >;

    // Static definitions to enable the properties of this type of
    // expression to be discerned at compile time
    static const bool is_active  = IsActive;
    static const bool is_lvalue  = true;
    static const int  rank       = Rank;
    static const int  n_active   = IsActive * (1 + internal::is_complex<Type>::value);
    static const int  n_scratch  = 0;
    static const int  n_arrays   = 1;
    static const bool is_vectorizable = Packet<Type>::is_vectorized;

    // -------------------------------------------------------------------
    // Array: 2. Constructors
    // -------------------------------------------------------------------
    
    // Construct an empty array: null data and storage pointers and
    // all dimensions zero.  Instantiating an active array of integer
    // type triggers a compile-time assertion.
    Array()
      : data_(0), storage_(0), dimensions_(0)
    {
      ADEPT_STATIC_ASSERT(!(std::numeric_limits<Type>::is_integer && IsActive),
                          CANNOT_CREATE_ACTIVE_ARRAY_OF_INTEGERS);
    }

    // Construct an array of the specified size, with the dimensions
    // given either as a pointer to Index values or as an
    // ExpressionSize object; the work is done by resize()
    Array(const Index* dims)
      : storage_(0)
    {
      resize(dims);
    }

    Array(const ExpressionSize<Rank>& dims)
      : storage_(0)
    {
      resize(dims);
    }

    // Construct from a list of dimension lengths.  Only the
    // constructor whose number of arguments equals Rank can be used,
    // because resize_<x> is only defined for x==Rank.
    Array(Index m0)
      : storage_(0)
    { resize_<1>(m0); }

    Array(Index m0, Index m1)
      : storage_(0)
    { resize_<2>(m0, m1); }

    Array(Index m0, Index m1, Index m2)
      : storage_(0)
    { resize_<3>(m0, m1, m2); }

    Array(Index m0, Index m1, Index m2, Index m3)
      : storage_(0)
    { resize_<4>(m0, m1, m2, m3); }

    Array(Index m0, Index m1, Index m2, Index m3, Index m4)
      : storage_(0)
    { resize_<5>(m0, m1, m2, m3, m4); }

    Array(Index m0, Index m1, Index m2, Index m3, Index m4, Index m5)
      : storage_(0)
    { resize_<6>(m0, m1, m2, m3, m4, m5); }

    Array(Index m0, Index m1, Index m2, Index m3, Index m4, Index m5,
          Index m6)
      : storage_(0)
    { resize_<7>(m0, m1, m2, m3, m4, m5, m6); }

    // Construct an array directly from its constituent parts, as
    // needed when subsetting other arrays.  If a Storage object is
    // supplied then a link to it is added and the gradient index is
    // set from it.
    Array(Type* data, Storage<Type>* s, const ExpressionSize<Rank>& dims,
          const ExpressionSize<Rank>& offset)
      : data_(data), storage_(s), dimensions_(dims), offset_(offset)
    {
      if (!storage_) {
        // Active arrays need a gradient index so it is an error for
        // them to get to this point
        internal::GradientIndex<IsActive>::assert_inactive();
      }
      else {
        storage_->add_link();
        internal::GradientIndex<IsActive>::set(data_, storage_);
      }
    }

    // Construct an array from existing data with the gradient index
    // supplied explicitly and no Storage object; this is needed when
    // an active FixedArray is sliced, which produces an active Array.
    // The array starts data_offset elements after data0.
    Array(const Type* data0, Index data_offset,
          const ExpressionSize<Rank>& dims,
          const ExpressionSize<Rank>& offset, Index gradient_index0)
      : internal::GradientIndex<IsActive>(gradient_index0, data_offset),
        data_(const_cast<Type*>(data0)+data_offset),
        storage_(0),
        dimensions_(dims),
        offset_(offset)
    {
    }

    // Construct an array that points at existing data.  The null
    // storage_ pointer conveys that the data need not be deallocated
    // when this array is destructed.  Only inactive arrays may be
    // constructed this way.
    Array(Type* data, const ExpressionSize<Rank>& dims)
      : data_(data), storage_(0), dimensions_(dims)
    {
      ADEPT_STATIC_ASSERT(!IsActive,
                          CANNOT_CONSTRUCT_ACTIVE_ARRAY_WITHOUT_GRADIENT_INDEX);
      // Active arrays need a gradient index so it is an error for
      // them to get to this point
      internal::GradientIndex<IsActive>::assert_inactive();
      pack_contiguous_();
    }

    // Copy constructor: a shallow copy that links to the data of the
    // source array rather than duplicating it.  This makes returning
    // an Array from a function cheap: when the Array inside the
    // function is destructed it removes its link to the data, and
    // responsibility for deallocating the data passes to the Array in
    // the calling function.
    Array(Array& rhs)
      : internal::GradientIndex<IsActive>(rhs.gradient_index()),
        data_(rhs.data()),
        storage_(rhs.storage()),
        dimensions_(rhs.dimensions()),
        offset_(rhs.offset())
    {
      if (storage_) {
        storage_->add_link();
      }
#ifdef ADEPT_VERBOSE_FUNCTIONS
      std::cout << "  running constructor Array(Array&)\n";
#endif
    }

    // Copy constructor taking a const argument: also a shallow copy,
    // with the data and storage pointers obtained via link_()
    Array(const Array& rhs)
      : internal::GradientIndex<IsActive>(rhs.gradient_index()),
        dimensions_(rhs.dimensions()),
        offset_(rhs.offset())
    {
      link_(const_cast<Array&>(rhs));
#ifdef ADEPT_VERBOSE_FUNCTIONS
      std::cout << "  running constructor Array(const Array&)\n";
#endif
    }
  private:
    // Point this array at the data and Storage object of rhs, adding
    // a link to the Storage object if there is one
    void link_(Array& rhs) {
      data_    = const_cast<Type*>(rhs.data());
      storage_ = const_cast<Storage<Type>*>(rhs.storage());
      if (storage_) {
        storage_->add_link();
      }
    }

  public:

    // Construct from an expression of the same rank by evaluating it:
    // the array starts empty and the expression is then assigned to
    // it.  Because this constructor is not "explicit", expressions
    // may be passed to functions expecting an array; the "explicit"
    // keyword would prevent this implicit conversion.
    template<typename EType, class E>
    Array(const Expression<EType, E>& rhs,
          typename internal::enable_if<E::rank == Rank && (Rank > 0),
                                       int>::type = 0)
      : data_(0), storage_(0), dimensions_(0)
    {
#ifdef ADEPT_VERBOSE_FUNCTIONS
      std::cout << "  running constructor Array(const Expression&), implemented by assignment\n";
#endif
      *this = rhs;
    }

#ifdef ADEPT_CXX11_FEATURES
    // Construct from a (possibly nested) initializer list: the array
    // starts empty and the list is then assigned to it.  The
    // restrictions on initializer_list constructors mean that each
    // nesting depth from 1 to 7 needs its own constructor.

    // Depth 1
    template <typename T>
    Array(std::initializer_list<T> list)
      : data_(0), storage_(0), dimensions_(0)
    {
      *this = list;
    }

    // Depth 2
    template <typename T>
    Array(std::initializer_list<
          std::initializer_list<T> > list)
      : data_(0), storage_(0), dimensions_(0)
    {
      *this = list;
    }

    // Depth 3
    template <typename T>
    Array(std::initializer_list<
          std::initializer_list<
          std::initializer_list<T> > > list)
      : data_(0), storage_(0), dimensions_(0)
    {
      *this = list;
    }

    // Depth 4
    template <typename T>
    Array(std::initializer_list<
          std::initializer_list<
          std::initializer_list<
          std::initializer_list<T> > > > list)
      : data_(0), storage_(0), dimensions_(0)
    {
      *this = list;
    }

    // Depth 5
    template <typename T>
    Array(std::initializer_list<
          std::initializer_list<
          std::initializer_list<
          std::initializer_list<
          std::initializer_list<T> > > > > list)
      : data_(0), storage_(0), dimensions_(0)
    {
      *this = list;
    }

    // Depth 6
    template <typename T>
    Array(std::initializer_list<
          std::initializer_list<
          std::initializer_list<
          std::initializer_list<
          std::initializer_list<
          std::initializer_list<T> > > > > > list)
      : data_(0), storage_(0), dimensions_(0)
    {
      *this = list;
    }

    // Depth 7
    template <typename T>
    Array(std::initializer_list<
          std::initializer_list<
          std::initializer_list<
          std::initializer_list<
          std::initializer_list<
          std::initializer_list<
          std::initializer_list<T> > > > > > > list)
      : data_(0), storage_(0), dimensions_(0)
    {
      *this = list;
    }
    

#endif


    // Destructor: if the data are held in a Storage object, tell it
    // that one fewer object links to it; once the number of links
    // drops to zero it will destruct itself and deallocate the
    // memory.
    ~Array() {
      if (storage_) {
        storage_->remove_link();
      }
    }

    // -------------------------------------------------------------------
    // Array: 3. Assignment operators
    // -------------------------------------------------------------------

    // Copy assignment from another array: copies the data.  Ideally
    // this would simply fall back to operator=(const Expression&),
    // but if no copy assignment operator is defined then C++
    // generates a default one, so the forwarding is done explicitly.
    Array& operator=(const Array& rhs) {
#ifdef ADEPT_VERBOSE_FUNCTIONS
      std::cout << "  running Array::operator=(const Array&), implemented with operator=(const Expression&)\n";
#endif
      const Expression<Type,Array>& rhs_expr
        = static_cast<const Expression<Type,Array>&>(rhs);
      return (*this = rhs_expr);
    }

#ifdef ADEPT_MOVE_SEMANTICS
    // Move assignment: swap with rhs when that is safe, otherwise
    // perform a full copy
    Array& operator=(Array&& rhs) {
#ifdef ADEPT_VERBOSE_FUNCTIONS
      std::cout << "  running Array::operator=(Array&&)\n";
#endif
      // A fast "swap" can be performed only if the present ("this")
      // array is either empty, or its data is contained in a Storage
      // object with only one link to it (corresponding to the present
      // array).  A swap is not allowed if the data is not in a
      // Storage object, since it might be linked to another location
      // that is expecting the result of the assignment to change the
      // data in that location.
      bool can_swap = empty() || (storage_ && storage_->n_links() == 1);
      if (can_swap) {
        // We also require that the RHS data would otherwise be lost
        // (but it is not clear that this is necessary)
        can_swap = !rhs.storage() || rhs.storage()->n_links() == 1;
      }

      if (!can_swap) {
        // Need a full copy because other arrays are linked to the
        // Storage object
        *this = static_cast<const Expression<Type,Array>&> (rhs);
        return *this;
      }

      // We still need to check that the dimensions match
      if (!empty() && !internal::compatible(dimensions_, rhs.dimensions())) {
        std::string str = rhs.expression_string()
          + " assigned to " + expression_string_();
        throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
      }
      swap(*this, rhs);
      return *this;
    }

    // Exchange the complete state of two arrays: data pointer,
    // storage pointer, dimensions, offsets and gradient index
    friend void swap(Array& l, Array& r) noexcept {
#ifdef ADEPT_VERBOSE_FUNCTIONS
      std::cout << "  running swap(Array&,Array&)\n";
#endif
      // Raw pointers are exchanged via temporaries
      Type* const          r_data    = r.data_;
      Storage<Type>* const r_storage = r.storage_;
      r.data_    = l.data_;
      r.storage_ = l.storage_;
      l.data_    = r_data;
      l.storage_ = r_storage;

      swap(l.dimensions_, r.dimensions_);
      swap(l.offset_, r.offset_);

      // The gradient index is held by the GradientIndex base class
      internal::GradientIndex<IsActive>& l_gradient
        = static_cast<internal::GradientIndex<IsActive>&>(l);
      internal::GradientIndex<IsActive>& r_gradient
        = static_cast<internal::GradientIndex<IsActive>&>(r);
      l_gradient.swap_value(r_gradient);
    }

#endif


    // Assign an array expression of the same rank to this array.  An
    // empty array is resized to fit the expression; a non-empty array
    // must have compatible dimensions (unless dimension checking has
    // been disabled).  If the expression reads from the memory range
    // occupied by this array, it is first evaluated into a temporary.
    // Throws size_mismatch if the dimensions do not agree.
    template <typename EType, class E>
    inline //__attribute__((always_inline))
    typename internal::enable_if<E::rank == Rank, Array&>::type
    operator=(const Expression<EType,E>&  __restrict rhs) {
#ifdef ADEPT_VERBOSE_FUNCTIONS
      std::cout << "  running Array::operator=(const Expression&)\n";
#endif
#ifndef ADEPT_NO_DIMENSION_CHECKING
      // With checking: the expression must be internally consistent
      // and match the size of a non-empty target
      ExpressionSize<Rank> dims;
      if (!rhs.get_dimensions(dims)) {
        std::string str = "Array size mismatch in "
          + rhs.expression_string() + ".";
        throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
      }
      if (empty()) {
        resize(dims);
      }
      else if (!internal::compatible(dims, dimensions_)) {
        std::string str = "Expr";
        str += dims.str() + " object assigned to " + expression_string_();
        throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
      }
#else
      // Without checking: dimensions are only queried if they are
      // needed to size an empty target
      if (empty()) {
        ExpressionSize<Rank> dims;
        if (!rhs.get_dimensions(dims)) {
          std::string str = "Array size mismatch in "
            + rhs.expression_string() + ".";
          throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
        }
        resize(dims);
      }
#endif
      if (empty()) {
        // Nothing to assign
        return *this;
      }
#ifndef ADEPT_NO_ALIAS_CHECKING
      // Check for aliasing first
      Type const * ptr_begin;
      Type const * ptr_end;
      data_range(ptr_begin, ptr_end);
      if (rhs.is_aliased(ptr_begin, ptr_end)) {
        Array<Rank,Type,IsActive> copy;
        // It would be nice to wrap noalias around rhs, but then this
        // leads to infinite template recursion since the "=" operator
        // calls the current function but with a modified expression
        // type. Perhaps a better way would be to make
        // copy.assign_no_alias(rhs) work.
        copy = rhs;
        assign_expression_<Rank, IsActive, E::is_active>(copy);
        return *this;
      }
#endif
      // Select active/passive version by delegating to a protected
      // function.  The cast() is needed because assign_expression_
      // accepts its argument by value.
      assign_expression_<Rank, IsActive, E::is_active>(rhs.cast());
      return *this;
    }


    // Assign an array expression of the same rank to this array,
    // ignoring the activeness of the right-hand side.  An empty array
    // is resized to fit; otherwise the dimensions must be compatible.
    // Throws size_mismatch if the dimensions do not agree.
    template <typename EType, class E>
    typename internal::enable_if<E::rank == Rank, Array&>::type
    assign_inactive(const Expression<EType,E>& rhs) {
      ExpressionSize<Rank> dims;
      if (!rhs.get_dimensions(dims)) {
        std::string str = "Array size mismatch in "
          + rhs.expression_string() + ".";
        throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
      }
      if (empty()) {
        resize(dims);
      }
      else if (!internal::compatible(dims, dimensions_)) {
        std::string str = "Expr";
        str += dims.str() + " object assigned to " + expression_string_();
        throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
      }

      if (empty()) {
        // Nothing to assign
        return *this;
      }

      // Check for aliasing first
      Type const * ptr_begin;
      Type const * ptr_end;
      data_range(ptr_begin, ptr_end);
      if (!rhs.is_aliased(ptr_begin, ptr_end)) {
        assign_expression_<Rank, IsActive, false>(rhs.cast());
      }
      else {
        // The expression reads from this array's memory range:
        // evaluate it into a temporary before overwriting
        Array<Rank,Type,IsActive> copy;
        copy.assign_inactive(rhs);
        assign_expression_<Rank, IsActive, false>(copy);
      }
      return *this;
    }

    // Assign a single value to every element of the array; an empty
    // array is left unchanged
    template <typename RType>
    typename internal::enable_if<internal::is_not_expression<RType>::value
                                 // FIX
                                 || internal::is_active<Type>::value,
                                 Array&>::type
    operator=(RType rhs) {
      if (empty()) {
        return *this;
      }
      assign_inactive_scalar_<Rank,IsActive>(rhs);
      return *this;
    }

    // Assign an active scalar expression (rank 0, not an lvalue) to
    // an active array: the expression is first evaluated into an
    // active scalar, which is then assigned to the array
    template <typename EType, class E>
    typename internal::enable_if<E::rank == 0 && (Rank > 0)
                                 && IsActive && !E::is_lvalue,
                                 Array&>::type
    operator=(const Expression<EType,E>& rhs) {
      Active<EType> scalar = rhs;
      *this = scalar;
      return *this;
    }

    // Assign an active scalar to every element of an active array,
    // recording on the active stack one statement per element with
    // the scalar as its only right-hand-side term.  An empty array is
    // left unchanged.
    template <typename PType>
    // FIX
    typename internal::enable_if<!internal::is_active<PType>::value && IsActive,
                                 Array&>::type
    operator=(const Active<PType>& rhs) {
      ADEPT_STATIC_ASSERT(IsActive, ATTEMPT_TO_ASSIGN_ACTIVE_SCALAR_TO_INACTIVE_ARRAY);
      if (empty()) {
        return *this;
      }
#ifdef ADEPT_RECORDING_PAUSABLE
      // If recording is paused only the values are copied
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        assign_inactive_scalar_<Rank,false>(rhs.scalar_value());
        return *this;
      }
#endif
      static const int last = Rank-1;
      ExpressionSize<Rank> i(0);
      Index index = 0;
      int my_rank;
      // In case PType != Type we make a local copy to minimize type
      // conversions
      Type val = rhs.scalar_value();

      ADEPT_ACTIVE_STACK->check_space(size());
      do {
        // Innermost loop over the final dimension; "index" is the
        // memory offset of the current element
        for (i[last] = 0; i[last] < dimensions_[last];
             ++i[last], index += offset_[last]) {
          data_[index] = val;
          ADEPT_ACTIVE_STACK->push_rhs(1.0, rhs.gradient_index());
          ADEPT_ACTIVE_STACK->push_lhs(gradient_index()+index);
        }
        // Move on to the next row; my_rank becomes negative once
        // every element has been visited
        advance_index(index, my_rank, i);
      } while (my_rank >= 0);
      return *this;
    }

#define ADEPT_DEFINE_OPERATOR(OPNAME, OP)		\
    template <class RType>				\
    Array& OPNAME(const RType& rhs) {			\
      *this = noalias(*this OP rhs);			\
      return *this;					\
    }
    // Compound assignment: "A OP= rhs" is implemented as the
    // assignment "A = noalias(A OP rhs)" (noalias() presumably
    // suppresses the aliasing check in the assignment - confirm
    // against its definition)
    ADEPT_DEFINE_OPERATOR(operator+=, +)
    ADEPT_DEFINE_OPERATOR(operator-=, -)
    ADEPT_DEFINE_OPERATOR(operator*=, *)
    ADEPT_DEFINE_OPERATOR(operator/=, /)
  //    ADEPT_DEFINE_OPERATOR(operator&=, &);
  //    ADEPT_DEFINE_OPERATOR(operator|=, |);
#undef ADEPT_DEFINE_OPERATOR

    // Support for the "A.where(B) = C" construct.

    // A.where(B) returns a "Where<A,B>" object holding this array and
    // the boolean expression B, which must have the same rank as A.
    // Unless ADEPT_NO_DIMENSION_CHECKING is defined, a size_mismatch
    // exception is thrown if the dimensions of B cannot be obtained
    // or differ from those of this array.
    template <class B>
    typename internal::enable_if<B::rank == Rank,
                                 internal::Where<Array,B> >::type
    where(const Expression<bool,B>& bool_expr) {
#ifndef ADEPT_NO_DIMENSION_CHECKING
      ExpressionSize<Rank> mask_dims;
      if (!bool_expr.get_dimensions(mask_dims)) {
        std::string msg("Array size mismatch in ");
        msg += bool_expr.expression_string();
        msg += ".";
        throw size_mismatch(msg ADEPT_EXCEPTION_LOCATION);
      }
      if (mask_dims != dimensions_) {
        throw size_mismatch("Boolean expression of different size"
                            ADEPT_EXCEPTION_LOCATION);
      }
#endif
      return internal::Where<Array,B>(*this, bool_expr.cast());
    }
    
    // "Where<A,B> = C" is carried out by A.assign_conditional(B,C),
    // which has separate overloads for C being an inactive scalar
    // and C being an array expression.  This is the scalar overload:
    // it delegates to assign_conditional_inactive_scalar_ unless the
    // array is empty, in which case nothing is done.
    template <class B, typename C>
    typename internal::enable_if<internal::is_not_expression<C>::value,
                                 void>::type
    assign_conditional(const Expression<bool,B>& bool_expr,
                       C rhs) {
      if (empty()) {
        return;
      }
      assign_conditional_inactive_scalar_<IsActive>(bool_expr, rhs);
    }

    template <class B, typename T, class C>
    void assign_conditional(const Expression<bool,B>& bool_expr,
			    const Expression<T,C>& rhs) {
      // Assume size of bool_expr already checked
#ifndef ADEPT_NO_DIMENSION_CHECKING
      ExpressionSize<Rank> dims;
      if (!rhs.get_dimensions(dims)) {
	std::string str = "Array size mismatch in "
	  + rhs.expression_string() + ".";
	throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
      }
      else if (!internal::compatible(dims,dimensions_)) {
	throw size_mismatch("Right-hand-side of \"where\" construct of incompatible size"
			    ADEPT_EXCEPTION_LOCATION);
      }
#endif
      // Check for aliasing first
      Type const * ptr_begin;
      Type const * ptr_end;
      data_range(ptr_begin, ptr_end);
      if (rhs.is_aliased(ptr_begin, ptr_end)) {
	Array<Rank,Type,IsActive> copy;
	copy = rhs;
	assign_conditional_<IsActive>(bool_expr.cast(), copy);
      }
      else {
	// Select active/passive version by delegating to a
	// protected function
	assign_conditional_<IsActive>(bool_expr.cast(), rhs.cast());
      }
      //      return *this;
    }

#ifdef ADEPT_CXX11_FEATURES
    // Assignment of a flat initializer list to an Array; a static
    // assertion restricts this to Vectors (Rank==1).  An empty array
    // is resized to the length of the list, otherwise a size_mismatch
    // exception is thrown if the list is longer than the Vector.
    // Elements beyond the end of the list are left as zero.
    template <typename T>
    typename internal::enable_if<std::is_convertible<T,Type>::value,
                                 Array&>::type
    operator=(std::initializer_list<T> list) {
      ADEPT_STATIC_ASSERT(Rank==1,RANK_MISMATCH_IN_INITIALIZER_LIST);

      if (empty()) {
        resize(list.size());
      }
      else if (list.size() > static_cast<std::size_t>(dimensions_[0])) {
        throw size_mismatch("Initializer list is larger than Vector in assignment"
                            ADEPT_EXCEPTION_LOCATION);
      }
      // The whole array is zeroed via the scalar assignment operator
      // first in order that automatic differentiation works; the
      // values are then written directly into the data
      *this = 0;
      Index pos = 0;
      for (const T& value : list) {
        data_[pos*offset_[0]] = value;
        ++pos;
      }
      return *this;
    }

    // Assignment of a list of lists to an Array of rank 2 or more;
    // a static assertion checks that the nesting depth matches Rank.
    // An empty array is resized to the shape found by
    // shape_initializer_list_, otherwise a size_mismatch exception is
    // thrown if the outer list is longer than the first dimension.
    // Each sub-list is assigned in turn to (*this)[index].
    template <class IType>
    Array& operator=(std::initializer_list<std::initializer_list<IType> > list) {
      ADEPT_STATIC_ASSERT(Rank==internal::initializer_list_rank<IType>::value+2,
                          RANK_MISMATCH_IN_INITIALIZER_LIST);
      if (empty()) {
        Index shape[ADEPT_MAX_ARRAY_DIMENSIONS];
        int depth = 0;
        shape_initializer_list_(list, shape, depth);
        resize(shape);
      }
      else if (list.size() > static_cast<std::size_t>(dimensions_[0])) {
        throw size_mismatch("Multi-dimensional initializer list larger than slowest-varying dimension of Array"
                            ADEPT_EXCEPTION_LOCATION);
      }
      Index slice = 0;
      for (const std::initializer_list<IType>& sub_list : list) {
        (*this)[slice] = sub_list;
        ++slice;
      }
      return *this;
    }


  protected:
    // Innermost level of the initializer-list shape calculation:
    // store the length of a list of values in dims[ndims] and
    // increment ndims
    template <typename T>
    typename internal::enable_if<std::is_convertible<T,Type>::value>::type
    shape_initializer_list_(std::initializer_list<T> list,
                            Index* dims, int& ndims) const {
      dims[ndims++] = list.size();
    }
    // Nested level of the initializer-list shape calculation: store
    // the length of this list in dims[ndims], increment ndims, then
    // take the remaining dimensions from the first sub-list only
    template <class IType>
    void
    shape_initializer_list_(std::initializer_list<std::initializer_list<IType> > list,
                            Index* dims, int& ndims) const {
      dims[ndims++] = list.size();
      shape_initializer_list_(*list.begin(), dims, ndims);
    }


  public:

#endif


    // -------------------------------------------------------------------
    // Array: 4. Access functions, particularly operator()
    // -------------------------------------------------------------------
  
    // Return an l-value for the element at coordinates "i": the
    // coordinates are converted to a memory location by index_ and
    // the active or inactive form of get_lvalue_ is selected with
    // IsActive
    typename internal::active_reference<Type,IsActive>::type
    get_lvalue(const ExpressionSize<Rank>& i) {
      const Index loc = index_(i);
      return get_lvalue_<IsActive>(loc);
    }
    
    // Return an r-value for the element at coordinates "i": the
    // coordinates are converted to a memory location by index_ and
    // the active or inactive form of get_rvalue_ is selected with
    // IsActive
    typename internal::active_scalar<Type,IsActive>::type
    get_rvalue(const ExpressionSize<Rank>& i) const {
      const Index loc = index_(i);
      return get_rvalue_<IsActive>(loc);
    }

  protected:
    // Active case: the l-value is an ActiveReference built from the
    // element at "loc" and its gradient index, gradient_index()+loc
    template <bool MyIsActive>
    typename internal::enable_if<MyIsActive,
                                 ActiveReference<Type> >::type
    get_lvalue_(const Index& loc) {
      return ActiveReference<Type>(data_[loc],
                                   gradient_index()+loc);
    }
    // Inactive case: the l-value is a plain reference to the element
    // at "loc"
    template <bool MyIsActive>
    typename internal::enable_if<!MyIsActive,
                                 Type&>::type
    get_lvalue_(const Index& loc) {
      Type& element = data_[loc];
      return element;
    }

    // Active case: the r-value is an Active<Type> built from the
    // element at "loc" and its gradient index, gradient_index()+loc
    template <bool MyIsActive>
    typename internal::enable_if<MyIsActive,
                                 Active<Type> >::type
    get_rvalue_(const Index& loc) const {
      return Active<Type>(data_[loc],
                          gradient_index()+loc);
    }
    // Inactive case: the r-value is a constant reference to the
    // element at "loc"
    template <bool MyIsActive>
    typename internal::enable_if<!MyIsActive,
                                 const Type&>::type
    get_rvalue_(const Index& loc) const {
      const Type& element = data_[loc];
      return element;
    }

  public:
    // Get a constant reference to the element at the specified
    // location, ignoring whether it is active or not
    //    const Type& get(const ExpressionSize<Rank>& i) const {
    //      return data_[index_(i)];
    //    }

    // The following provide a way to access individual elements of
    // the array.  There must be the same number of arguments to
    // operator() as the rank of the array.  Each argument must be of
    // integer type, or a rank-0 expression of integer type (such as
    // "end" or "end-3"). Inactive arrays return a reference to the
    // element, while active arrays return an ActiveReference<Type>
    // object.  Up to 7 dimensions are supported.

    // Function-call operator on a non-const inactive vector: returns
    // a reference to element i0, where i0 is converted to an index
    // by get_index_with_len and scaled by the stride offset_[0]
    template <typename I0>
    typename internal::enable_if<Rank==1
                                 && internal::all_scalar_ints<1,I0>::value
                                 && !IsActive, Type&>::type
    operator()(I0 i0) {
      const Index loc
        = internal::get_index_with_len(i0,dimensions_[0])*offset_[0];
      return data_[loc];
    }

    // Function-call operator on a const inactive vector: returns a
    // constant reference to element i0, where i0 is converted to an
    // index by get_index_with_len and scaled by the stride offset_[0]
    template <typename I0>
    typename internal::enable_if<Rank==1
                                 && internal::all_scalar_ints<1,I0>::value
                                 && !IsActive, const Type&>::type
    operator()(I0 i0) const {
      const Index loc
        = internal::get_index_with_len(i0,dimensions_[0])*offset_[0];
      return data_[loc];
    }

    // Element-access operator on a non-const inactive vector: returns
    // a reference to element i0, where i0 is converted to an index
    // by get_index_with_len and scaled by the stride offset_[0]
    template <typename I0>
    typename internal::enable_if<Rank==1
                                 && internal::all_scalar_ints<1,I0>::value
                                 && !IsActive, Type&>::type
    operator[](I0 i0) {
      const Index loc
        = internal::get_index_with_len(i0,dimensions_[0])*offset_[0];
      return data_[loc];
    }

    // Element-access operator on a const inactive vector: returns a
    // constant reference to element i0, where i0 is converted to an
    // index by get_index_with_len and scaled by the stride offset_[0]
    template <typename I0>
    typename internal::enable_if<Rank==1
                                 && internal::all_scalar_ints<1,I0>::value
                                 && !IsActive, const Type&>::type
    operator[](I0 i0) const {
      const Index loc
        = internal::get_index_with_len(i0,dimensions_[0])*offset_[0];
      return data_[loc];
    }

  protected:
    // Inactive, non-const: a plain reference to the element at
    // location "offset" in the data
    template <bool MyIsActive>
    typename internal::enable_if<!MyIsActive,Type&>::type
    get_scalar_reference(const Index& offset) {
      Type& element = data_[offset];
      return element;
    }

    // Inactive, const: a constant reference to the element at
    // location "offset" in the data
    template <bool MyIsActive>
    typename internal::enable_if<!MyIsActive,const Type&>::type
    get_scalar_reference(const Index& offset) const {
      const Type& element = data_[offset];
      return element;
    }

    // Active, non-const: an ActiveReference built from the element at
    // location "offset" and its gradient index,
    // gradient_index()+offset
    template <bool MyIsActive>
    typename internal::enable_if<MyIsActive,ActiveReference<Type> >::type
    get_scalar_reference(const Index& offset) {
      return ActiveReference<Type>(data_[offset],
                                   gradient_index()+offset);
    }
    // Active, const: an ActiveConstReference built from the element
    // at location "offset" and its gradient index,
    // gradient_index()+offset
    template <bool MyIsActive>
    typename internal::enable_if<MyIsActive,ActiveConstReference<Type> >::type
    get_scalar_reference(const Index& offset) const {
      return ActiveConstReference<Type>(data_[offset],
                                        gradient_index()+offset);
    }

  public:

    // Function-call operator on a non-const active vector: returns an
    // ActiveReference to element i0, built from the element and its
    // gradient index
    template <typename I0>
    typename internal::enable_if<Rank==1
                                 && internal::all_scalar_ints<1,I0>::value
                                 && IsActive,
                                 ActiveReference<Type> >::type
    operator()(I0 i0) {
      // Location of the element in memory, accounting for the stride
      Index loc = internal::get_index_with_len(i0,dimensions_[0]);
      loc *= offset_[0];
      return ActiveReference<Type>(data_[loc], gradient_index()+loc);
    }
    
    // Function-call operator on a const active vector: returns an
    // ActiveConstReference to element i0, built from the element and
    // its gradient index
    template <typename I0>
    typename internal::enable_if<Rank==1
                                 && internal::all_scalar_ints<1,I0>::value
                                 && IsActive,
                                 ActiveConstReference<Type> >::type
    operator()(I0 i0) const {
      // Location of the element in memory, accounting for the stride
      Index loc = internal::get_index_with_len(i0,dimensions_[0]);
      loc *= offset_[0];
      return ActiveConstReference<Type>(data_[loc], gradient_index()+loc);
    }

    // Element-access operator on a non-const active vector: returns
    // an ActiveReference to element i0, built from the element and
    // its gradient index
    template <typename I0>
    typename internal::enable_if<Rank==1
                                 && internal::all_scalar_ints<1,I0>::value
                                 && IsActive,
                                 ActiveReference<Type> >::type
    operator[](I0 i0) {
      // Location of the element in memory, accounting for the stride
      Index loc = internal::get_index_with_len(i0,dimensions_[0]);
      loc *= offset_[0];
      return ActiveReference<Type>(data_[loc], gradient_index()+loc);
    }
    
    // Element-access operator on a const active vector: returns an
    // ActiveConstReference to element i0, built from the element and
    // its gradient index
    template <typename I0>
    typename internal::enable_if<Rank==1
                                 && internal::all_scalar_ints<1,I0>::value
                                 && IsActive,
                                 ActiveConstReference<Type> >::type
    operator[](I0 i0) const {
      // Location of the element in memory, accounting for the stride
      Index loc = internal::get_index_with_len(i0,dimensions_[0]);
      loc *= offset_[0];
      return ActiveConstReference<Type>(data_[loc], gradient_index()+loc);
    }
    
    // 2D array l-value access: each argument is converted to an index
    // by get_index_with_len, scaled by the stride of its dimension,
    // and the summed location is passed to get_scalar_reference
    template <typename I0, typename I1>
    typename internal::enable_if<Rank==2 && internal::all_scalar_ints<2,I0,I1>::value,
                       typename internal::active_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1) {
      Index loc = internal::get_index_with_len(i0,dimensions_[0])*offset_[0];
      loc      += internal::get_index_with_len(i1,dimensions_[1])*offset_[1];
      return get_scalar_reference<IsActive>(loc);
    }
    // 2D array r-value access: each argument is converted to an index
    // by get_index_with_len, scaled by the stride of its dimension,
    // and the summed location is passed to the const version of
    // get_scalar_reference
    template <typename I0, typename I1>
    typename internal::enable_if<Rank==2 && internal::all_scalar_ints<2,I0,I1>::value,
                       typename internal::active_const_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1) const {
      Index loc = internal::get_index_with_len(i0,dimensions_[0])*offset_[0];
      loc      += internal::get_index_with_len(i1,dimensions_[1])*offset_[1];
      return get_scalar_reference<IsActive>(loc);
    }

    // 3D array l-value access: each argument is converted to an index
    // by get_index_with_len, scaled by the stride of its dimension,
    // and the summed location is passed to get_scalar_reference
    template <typename I0, typename I1, typename I2>
    typename internal::enable_if<Rank==3 && internal::all_scalar_ints<3,I0,I1,I2>::value,
                       typename internal::active_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2) {
      Index loc = internal::get_index_with_len(i0,dimensions_[0])*offset_[0];
      loc      += internal::get_index_with_len(i1,dimensions_[1])*offset_[1];
      loc      += internal::get_index_with_len(i2,dimensions_[2])*offset_[2];
      return get_scalar_reference<IsActive>(loc);
    }
    // 3D array r-value access: each argument is converted to an index
    // by get_index_with_len, scaled by the stride of its dimension,
    // and the summed location is passed to the const version of
    // get_scalar_reference
    template <typename I0, typename I1, typename I2>
    typename internal::enable_if<Rank==3 && internal::all_scalar_ints<3,I0,I1,I2>::value,
                       typename internal::active_const_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2) const {
      Index loc = internal::get_index_with_len(i0,dimensions_[0])*offset_[0];
      loc      += internal::get_index_with_len(i1,dimensions_[1])*offset_[1];
      loc      += internal::get_index_with_len(i2,dimensions_[2])*offset_[2];
      return get_scalar_reference<IsActive>(loc);
    }

    // 4D array l-value access: each argument is converted to an index
    // by get_index_with_len, scaled by the stride of its dimension,
    // and the summed location is passed to get_scalar_reference
    template <typename I0, typename I1, typename I2, typename I3>
    typename internal::enable_if<Rank==4 && internal::all_scalar_ints<4,I0,I1,I2,I3>::value,
                       typename internal::active_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3) {
      Index loc = internal::get_index_with_len(i0,dimensions_[0])*offset_[0];
      loc      += internal::get_index_with_len(i1,dimensions_[1])*offset_[1];
      loc      += internal::get_index_with_len(i2,dimensions_[2])*offset_[2];
      loc      += internal::get_index_with_len(i3,dimensions_[3])*offset_[3];
      return get_scalar_reference<IsActive>(loc);
    }
    // 4D array r-value access: each argument is converted to an index
    // by get_index_with_len, scaled by the stride of its dimension,
    // and the summed location is passed to the const version of
    // get_scalar_reference
    template <typename I0, typename I1, typename I2, typename I3>
    typename internal::enable_if<Rank==4 && internal::all_scalar_ints<4,I0,I1,I2,I3>::value,
                       typename internal::active_const_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3) const {
      Index loc = internal::get_index_with_len(i0,dimensions_[0])*offset_[0];
      loc      += internal::get_index_with_len(i1,dimensions_[1])*offset_[1];
      loc      += internal::get_index_with_len(i2,dimensions_[2])*offset_[2];
      loc      += internal::get_index_with_len(i3,dimensions_[3])*offset_[3];
      return get_scalar_reference<IsActive>(loc);
    }

    // 5D array l-value access: each argument is converted to an index
    // by get_index_with_len, scaled by the stride of its dimension,
    // and the summed location is passed to get_scalar_reference
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4>
    typename internal::enable_if<Rank==5 && internal::all_scalar_ints<5,I0,I1,I2,I3,I4>::value,
                       typename internal::active_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4) {
      Index loc = internal::get_index_with_len(i0,dimensions_[0])*offset_[0];
      loc      += internal::get_index_with_len(i1,dimensions_[1])*offset_[1];
      loc      += internal::get_index_with_len(i2,dimensions_[2])*offset_[2];
      loc      += internal::get_index_with_len(i3,dimensions_[3])*offset_[3];
      loc      += internal::get_index_with_len(i4,dimensions_[4])*offset_[4];
      return get_scalar_reference<IsActive>(loc);
    }
    // 5D array r-value access: each argument is converted to an index
    // by get_index_with_len, scaled by the stride of its dimension,
    // and the summed location is passed to the const version of
    // get_scalar_reference
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4>
    typename internal::enable_if<Rank==5 && internal::all_scalar_ints<5,I0,I1,I2,I3,I4>::value,
                       typename internal::active_const_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4) const {
      Index loc = internal::get_index_with_len(i0,dimensions_[0])*offset_[0];
      loc      += internal::get_index_with_len(i1,dimensions_[1])*offset_[1];
      loc      += internal::get_index_with_len(i2,dimensions_[2])*offset_[2];
      loc      += internal::get_index_with_len(i3,dimensions_[3])*offset_[3];
      loc      += internal::get_index_with_len(i4,dimensions_[4])*offset_[4];
      return get_scalar_reference<IsActive>(loc);
    }

    // 6D array l-value access: each argument is converted to an index
    // by get_index_with_len, scaled by the stride of its dimension,
    // and the summed location is passed to get_scalar_reference
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4, typename I5>
    typename internal::enable_if<Rank==6 && internal::all_scalar_ints<6,I0,I1,I2,I3,I4,I5>::value,
                       typename internal::active_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5) {
      Index loc = internal::get_index_with_len(i0,dimensions_[0])*offset_[0];
      loc      += internal::get_index_with_len(i1,dimensions_[1])*offset_[1];
      loc      += internal::get_index_with_len(i2,dimensions_[2])*offset_[2];
      loc      += internal::get_index_with_len(i3,dimensions_[3])*offset_[3];
      loc      += internal::get_index_with_len(i4,dimensions_[4])*offset_[4];
      loc      += internal::get_index_with_len(i5,dimensions_[5])*offset_[5];
      return get_scalar_reference<IsActive>(loc);
    }
    // 6D array r-value access: each argument is converted to an index
    // by get_index_with_len, scaled by the stride of its dimension,
    // and the summed location is passed to the const version of
    // get_scalar_reference
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4, typename I5>
    typename internal::enable_if<Rank==6 && internal::all_scalar_ints<6,I0,I1,I2,I3,I4,I5>::value,
                       typename internal::active_const_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5) const {
      Index loc = internal::get_index_with_len(i0,dimensions_[0])*offset_[0];
      loc      += internal::get_index_with_len(i1,dimensions_[1])*offset_[1];
      loc      += internal::get_index_with_len(i2,dimensions_[2])*offset_[2];
      loc      += internal::get_index_with_len(i3,dimensions_[3])*offset_[3];
      loc      += internal::get_index_with_len(i4,dimensions_[4])*offset_[4];
      loc      += internal::get_index_with_len(i5,dimensions_[5])*offset_[5];
      return get_scalar_reference<IsActive>(loc);
    }

    // 7D array l-value access: each argument is converted to an index
    // by get_index_with_len, scaled by the stride of its dimension,
    // and the summed location is passed to get_scalar_reference
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4, typename I5, typename I6>
    typename internal::enable_if<Rank==7 && internal::all_scalar_ints<7,I0,I1,I2,I3,I4,I5,I6>::value,
                       typename internal::active_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6) {
      Index loc = internal::get_index_with_len(i0,dimensions_[0])*offset_[0];
      loc      += internal::get_index_with_len(i1,dimensions_[1])*offset_[1];
      loc      += internal::get_index_with_len(i2,dimensions_[2])*offset_[2];
      loc      += internal::get_index_with_len(i3,dimensions_[3])*offset_[3];
      loc      += internal::get_index_with_len(i4,dimensions_[4])*offset_[4];
      loc      += internal::get_index_with_len(i5,dimensions_[5])*offset_[5];
      loc      += internal::get_index_with_len(i6,dimensions_[6])*offset_[6];
      return get_scalar_reference<IsActive>(loc);
    }
     // 7D array r-value access: each argument is converted to an index
     // by get_index_with_len, scaled by the stride of its dimension,
     // and the summed location is passed to the const version of
     // get_scalar_reference
     template <typename I0, typename I1, typename I2, typename I3,
              typename I4, typename I5, typename I6>
    typename internal::enable_if<Rank==7 && internal::all_scalar_ints<7,I0,I1,I2,I3,I4,I5,I6>::value,
                       typename internal::active_const_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6) const {
      Index loc = internal::get_index_with_len(i0,dimensions_[0])*offset_[0];
      loc      += internal::get_index_with_len(i1,dimensions_[1])*offset_[1];
      loc      += internal::get_index_with_len(i2,dimensions_[2])*offset_[2];
      loc      += internal::get_index_with_len(i3,dimensions_[3])*offset_[3];
      loc      += internal::get_index_with_len(i4,dimensions_[4])*offset_[4];
      loc      += internal::get_index_with_len(i5,dimensions_[5])*offset_[5];
      loc      += internal::get_index_with_len(i6,dimensions_[6])*offset_[6];
      return get_scalar_reference<IsActive>(loc);
    }
   

    // The following define the case when operator() is called and one
    // of the arguments is a "range" object (an object that describes
    // a range of indices that are either contiguous or separated by a
    // fixed stride), while all others are of integer type (or a
    // rank-0 expression of integer type). An array object is returned
    // with a rank that may be reduced from that of the original
    // array, by one for each dimension that was indexed by an
    // integer. The new array points to a subset of the original data,
    // so modifying it will modify the original array.

    // Vector indexed by a single range object (non-const version).
    // The returned vector shares storage_ with this one: it starts at
    // the first element of the range, its length is
    // (end + stride - begin)/stride, and its stride is the stride of
    // the range multiplied by offset_[0].
    template <typename I0>
    typename internal::enable_if<internal::is_ranged<Rank,I0>::value,
                       Array<1,Type,IsActive> >::type
    operator()(I0 i0) {
      const Index len   = dimensions_[0];
      const Index last  = i0.end(len);
      const Index step  = i0.stride(len);
      const Index first = i0.begin(len);
      ExpressionSize<1> new_dim((last + step - first) / step);
      ExpressionSize<1> new_offset(step*offset_[0]);
#ifdef ADEPT_VERBOSE_FUNCTIONS
      std::cout << "  running Array::operator()(RANGED)\n";
#endif
      return Array<1,Type,IsActive>(data_ + first*offset_[0],
                                    storage_, new_dim, new_offset);
    }
    // Vector indexed by a single range object (const version).  The
    // returned const vector shares storage_ with this one: it starts
    // at the first element of the range, its length is
    // (end + stride - begin)/stride, and its stride is the stride of
    // the range multiplied by offset_[0].
    template <typename I0>
    typename internal::enable_if<internal::is_ranged<Rank,I0>::value,
                       const Array<1,Type,IsActive> >::type
    operator()(I0 i0) const {
      const Index len   = dimensions_[0];
      const Index last  = i0.end(len);
      const Index step  = i0.stride(len);
      const Index first = i0.begin(len);
      ExpressionSize<1> new_dim((last + step - first) / step);
      ExpressionSize<1> new_offset(step*offset_[0]);
#ifdef ADEPT_VERBOSE_FUNCTIONS
      std::cout << "  running Array::operator()(RANGED) const\n";
#endif
      return Array<1,Type,IsActive>(data_ + first*offset_[0],
                                    storage_, new_dim, new_offset);
    }

  private:
    // For multi-dimensional arrays, we need a helper function

    // Indexing of dimension "irank" by an integer: only the starting
    // location "ibegin" is advanced, by the index multiplied by the
    // stride of that dimension.  inew_rank, new_dim and new_offset
    // are not modified, so this dimension does not appear in the new
    // array.
    template <typename T, int NewRank>
    typename internal::enable_if<internal::is_scalar_int<T>::value, void>::type
    update_index(const Index& irank, const T& i, Index& inew_rank, Index& ibegin,
                 ExpressionSize<NewRank>& new_dim,
                 ExpressionSize<NewRank>& new_offset) const {
      const Index len = dimensions_[irank];
      ibegin += internal::get_index_with_len(i,len)*offset_[irank];
    }

    // Indexing of dimension "irank" by a "range" object: the starting
    // location "ibegin" is advanced to the first element of the
    // range, and the length and stride of the range are stored as
    // dimension "inew_rank" of the new array, after which inew_rank
    // is incremented.
    template <typename T, int NewRank>
    typename internal::enable_if<internal::is_range<T>::value, void>::type
    update_index(const Index& irank, const T& i, Index& inew_rank, Index& ibegin,
                 ExpressionSize<NewRank>& new_dim,
                 ExpressionSize<NewRank>& new_offset) const {
      const Index len   = dimensions_[irank];
      const Index first = i.begin(len);
      const Index step  = i.stride(len);
      ibegin += first*offset_[irank];
      new_dim[inew_rank]    = (i.end(len) + step - first) / step;
      new_offset[inew_rank] = step*offset_[irank];
      ++inew_rank;
    }

  public:

    // Now the individual overloads for each number of arguments, up
    // to 7, with separate r-value (const) and l-value (non-const)
    // versions
    // Two-argument indexing where at least one argument is a range
    // (non-const version).  update_index is applied to each
    // dimension in turn to accumulate the start location and the
    // shape of the result, which shares storage_ with this array.
    template <typename I0, typename I1>
    typename internal::enable_if<internal::is_ranged<Rank,I0,I1>::value,
                       Array<internal::is_ranged<Rank,I0,I1>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1) {
      static const int sub_rank = internal::is_ranged<Rank,I0,I1>::count;
      ExpressionSize<sub_rank> sub_dims;
      ExpressionSize<sub_rank> sub_strides;
      Index next_dim = 0; // Next dimension of the result to be filled
      Index start = 0;    // Location of first element in data_
      update_index(0, i0, next_dim, start, sub_dims, sub_strides);
      update_index(1, i1, next_dim, start, sub_dims, sub_strides);
      return Array<sub_rank,Type,IsActive>(data_ + start, storage_,
                                           sub_dims, sub_strides);
    }

    // Two-argument indexing where at least one argument is a range
    // (const version).  update_index is applied to each dimension in
    // turn to accumulate the start location and the shape of the
    // result, which shares storage_ with this array.
    template <typename I0, typename I1>
    typename internal::enable_if<internal::is_ranged<Rank,I0,I1>::value,
                       const Array<internal::is_ranged<Rank,I0,I1>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1) const {
      static const int sub_rank = internal::is_ranged<Rank,I0,I1>::count;
      ExpressionSize<sub_rank> sub_dims;
      ExpressionSize<sub_rank> sub_strides;
      Index next_dim = 0; // Next dimension of the result to be filled
      Index start = 0;    // Location of first element in data_
      update_index(0, i0, next_dim, start, sub_dims, sub_strides);
      update_index(1, i1, next_dim, start, sub_dims, sub_strides);
      return Array<sub_rank,Type,IsActive>(data_ + start, storage_,
                                           sub_dims, sub_strides);
    }

    // Three-argument indexing where at least one argument is a range
    // (non-const version).  update_index is applied to each
    // dimension in turn to accumulate the start location and the
    // shape of the result, which shares storage_ with this array.
    template <typename I0, typename I1, typename I2>
    typename internal::enable_if<internal::is_ranged<Rank,I0,I1,I2>::value,
               Array<internal::is_ranged<Rank,I0,I1,I2>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2) {
      static const int sub_rank = internal::is_ranged<Rank,I0,I1,I2>::count;
      ExpressionSize<sub_rank> sub_dims;
      ExpressionSize<sub_rank> sub_strides;
      Index next_dim = 0; // Next dimension of the result to be filled
      Index start = 0;    // Location of first element in data_
      update_index(0, i0, next_dim, start, sub_dims, sub_strides);
      update_index(1, i1, next_dim, start, sub_dims, sub_strides);
      update_index(2, i2, next_dim, start, sub_dims, sub_strides);
      return Array<sub_rank,Type,IsActive>(data_ + start, storage_,
                                           sub_dims, sub_strides);
    }

    // Three-argument indexing where at least one argument is a range
    // (const version).  update_index is applied to each dimension in
    // turn to accumulate the start location and the shape of the
    // result, which shares storage_ with this array.
    template <typename I0, typename I1, typename I2>
    typename internal::enable_if<internal::is_ranged<Rank,I0,I1,I2>::value,
               const Array<internal::is_ranged<Rank,I0,I1,I2>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2) const {
      static const int sub_rank = internal::is_ranged<Rank,I0,I1,I2>::count;
      ExpressionSize<sub_rank> sub_dims;
      ExpressionSize<sub_rank> sub_strides;
      Index next_dim = 0; // Next dimension of the result to be filled
      Index start = 0;    // Location of first element in data_
      update_index(0, i0, next_dim, start, sub_dims, sub_strides);
      update_index(1, i1, next_dim, start, sub_dims, sub_strides);
      update_index(2, i2, next_dim, start, sub_dims, sub_strides);
      return Array<sub_rank,Type,IsActive>(data_ + start, storage_,
                                           sub_dims, sub_strides);
    }

    // Four-argument indexing where at least one argument is a range
    // (non-const version).  update_index is applied to each
    // dimension in turn to accumulate the start location and the
    // shape of the result, which shares storage_ with this array.
    template <typename I0, typename I1, typename I2, typename I3>
    typename internal::enable_if<internal::is_ranged<Rank,I0,I1,I2,I3>::value,
       Array<internal::is_ranged<Rank,I0,I1,I2,I3>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3) {
      static const int sub_rank = internal::is_ranged<Rank,I0,I1,I2,I3>::count;
      ExpressionSize<sub_rank> sub_dims;
      ExpressionSize<sub_rank> sub_strides;
      Index next_dim = 0; // Next dimension of the result to be filled
      Index start = 0;    // Location of first element in data_
      update_index(0, i0, next_dim, start, sub_dims, sub_strides);
      update_index(1, i1, next_dim, start, sub_dims, sub_strides);
      update_index(2, i2, next_dim, start, sub_dims, sub_strides);
      update_index(3, i3, next_dim, start, sub_dims, sub_strides);
      return Array<sub_rank,Type,IsActive>(data_ + start, storage_,
                                           sub_dims, sub_strides);
    }

    // Four-argument indexing where at least one argument is a range
    // (const version).  update_index is applied to each dimension in
    // turn to accumulate the start location and the shape of the
    // result, which shares storage_ with this array.
    template <typename I0, typename I1, typename I2, typename I3>
    typename internal::enable_if<internal::is_ranged<Rank,I0,I1,I2,I3>::value,
       const Array<internal::is_ranged<Rank,I0,I1,I2,I3>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3) const {
      static const int sub_rank = internal::is_ranged<Rank,I0,I1,I2,I3>::count;
      ExpressionSize<sub_rank> sub_dims;
      ExpressionSize<sub_rank> sub_strides;
      Index next_dim = 0; // Next dimension of the result to be filled
      Index start = 0;    // Location of first element in data_
      update_index(0, i0, next_dim, start, sub_dims, sub_strides);
      update_index(1, i1, next_dim, start, sub_dims, sub_strides);
      update_index(2, i2, next_dim, start, sub_dims, sub_strides);
      update_index(3, i3, next_dim, start, sub_dims, sub_strides);
      return Array<sub_rank,Type,IsActive>(data_ + start, storage_,
                                           sub_dims, sub_strides);
    }

    // Five-argument indexing where at least one argument is a range
    // (non-const version).  update_index is applied to each
    // dimension in turn to accumulate the start location and the
    // shape of the result, which shares storage_ with this array.
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4>
    typename internal::enable_if<internal::is_ranged<Rank,I0,I1,I2,I3,I4>::value,
       Array<internal::is_ranged<Rank,I0,I1,I2,I3,I4>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4) {
      static const int sub_rank = internal::is_ranged<Rank,I0,I1,I2,I3,I4>::count;
      ExpressionSize<sub_rank> sub_dims;
      ExpressionSize<sub_rank> sub_strides;
      Index next_dim = 0; // Next dimension of the result to be filled
      Index start = 0;    // Location of first element in data_
      update_index(0, i0, next_dim, start, sub_dims, sub_strides);
      update_index(1, i1, next_dim, start, sub_dims, sub_strides);
      update_index(2, i2, next_dim, start, sub_dims, sub_strides);
      update_index(3, i3, next_dim, start, sub_dims, sub_strides);
      update_index(4, i4, next_dim, start, sub_dims, sub_strides);
      return Array<sub_rank,Type,IsActive>(data_ + start, storage_,
                                           sub_dims, sub_strides);
    }
  
    // Five-argument indexing where at least one argument is a range
    // (const version).  update_index is applied to each dimension in
    // turn to accumulate the start location and the shape of the
    // result, which shares storage_ with this array.
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4>
    typename internal::enable_if<internal::is_ranged<Rank,I0,I1,I2,I3,I4>::value,
       const Array<internal::is_ranged<Rank,I0,I1,I2,I3,I4>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4) const {
      static const int sub_rank = internal::is_ranged<Rank,I0,I1,I2,I3,I4>::count;
      ExpressionSize<sub_rank> sub_dims;
      ExpressionSize<sub_rank> sub_strides;
      Index next_dim = 0; // Next dimension of the result to be filled
      Index start = 0;    // Location of first element in data_
      update_index(0, i0, next_dim, start, sub_dims, sub_strides);
      update_index(1, i1, next_dim, start, sub_dims, sub_strides);
      update_index(2, i2, next_dim, start, sub_dims, sub_strides);
      update_index(3, i3, next_dim, start, sub_dims, sub_strides);
      update_index(4, i4, next_dim, start, sub_dims, sub_strides);
      return Array<sub_rank,Type,IsActive>(data_ + start, storage_,
                                           sub_dims, sub_strides);
    }
  
    // Six-argument indexing where at least one argument is a range
    // (non-const version).  update_index is applied to each
    // dimension in turn to accumulate the start location and the
    // shape of the result, which shares storage_ with this array.
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4, typename I5>
    typename internal::enable_if<internal::is_ranged<Rank,I0,I1,I2,I3,I4,I5>::value,
       Array<internal::is_ranged<Rank,I0,I1,I2,I3,I4,I5>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5) {
      static const int sub_rank = internal::is_ranged<Rank,I0,I1,I2,I3,I4,I5>::count;
      ExpressionSize<sub_rank> sub_dims;
      ExpressionSize<sub_rank> sub_strides;
      Index next_dim = 0; // Next dimension of the result to be filled
      Index start = 0;    // Location of first element in data_
      update_index(0, i0, next_dim, start, sub_dims, sub_strides);
      update_index(1, i1, next_dim, start, sub_dims, sub_strides);
      update_index(2, i2, next_dim, start, sub_dims, sub_strides);
      update_index(3, i3, next_dim, start, sub_dims, sub_strides);
      update_index(4, i4, next_dim, start, sub_dims, sub_strides);
      update_index(5, i5, next_dim, start, sub_dims, sub_strides);
      return Array<sub_rank,Type,IsActive>(data_ + start, storage_,
                                           sub_dims, sub_strides);
    }


    // Six-argument indexing where at least one argument is a range
    // (const version).  update_index is applied to each dimension in
    // turn to accumulate the start location and the shape of the
    // result, which shares storage_ with this array.
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4, typename I5>
    typename internal::enable_if<internal::is_ranged<Rank,I0,I1,I2,I3,I4,I5>::value,
       const Array<internal::is_ranged<Rank,I0,I1,I2,I3,I4,I5>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5) const {
      static const int sub_rank = internal::is_ranged<Rank,I0,I1,I2,I3,I4,I5>::count;
      ExpressionSize<sub_rank> sub_dims;
      ExpressionSize<sub_rank> sub_strides;
      Index next_dim = 0; // Next dimension of the result to be filled
      Index start = 0;    // Location of first element in data_
      update_index(0, i0, next_dim, start, sub_dims, sub_strides);
      update_index(1, i1, next_dim, start, sub_dims, sub_strides);
      update_index(2, i2, next_dim, start, sub_dims, sub_strides);
      update_index(3, i3, next_dim, start, sub_dims, sub_strides);
      update_index(4, i4, next_dim, start, sub_dims, sub_strides);
      update_index(5, i5, next_dim, start, sub_dims, sub_strides);
      return Array<sub_rank,Type,IsActive>(data_ + start, storage_,
                                           sub_dims, sub_strides);
    }

    // Seven-argument indexing where at least one argument is a range
    // (non-const version).  update_index is applied to each
    // dimension in turn to accumulate the start location and the
    // shape of the result, which shares storage_ with this array.
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4, typename I5, typename I6>
    typename internal::enable_if<internal::is_ranged<Rank,I0,I1,I2,I3,I4,I5,I6>::value,
       Array<internal::is_ranged<Rank,I0,I1,I2,I3,I4,I5,I6>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6) {
      static const int sub_rank = internal::is_ranged<Rank,I0,I1,I2,I3,I4,I5,I6>::count;
      ExpressionSize<sub_rank> sub_dims;
      ExpressionSize<sub_rank> sub_strides;
      Index next_dim = 0; // Next dimension of the result to be filled
      Index start = 0;    // Location of first element in data_
      update_index(0, i0, next_dim, start, sub_dims, sub_strides);
      update_index(1, i1, next_dim, start, sub_dims, sub_strides);
      update_index(2, i2, next_dim, start, sub_dims, sub_strides);
      update_index(3, i3, next_dim, start, sub_dims, sub_strides);
      update_index(4, i4, next_dim, start, sub_dims, sub_strides);
      update_index(5, i5, next_dim, start, sub_dims, sub_strides);
      update_index(6, i6, next_dim, start, sub_dims, sub_strides);
      return Array<sub_rank,Type,IsActive>(data_ + start, storage_,
                                           sub_dims, sub_strides);
    }

    // Const seven-index overload, enabled only when
    // internal::is_ranged<Rank,I0,...,I6>::value is true.  Identical
    // to the non-const version apart from the const-qualified return
    // type.
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4, typename I5, typename I6>
    typename internal::enable_if<internal::is_ranged<Rank,I0,I1,I2,I3,I4,I5,I6>::value,
       const Array<internal::is_ranged<Rank,I0,I1,I2,I3,I4,I5,I6>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6) const {
      static const int sliced_rank
        = internal::is_ranged<Rank,I0,I1,I2,I3,I4,I5,I6>::count;
      // Running state handed to every update_index call
      Index n_filled = 0; // counter over dimensions of the result
      Index start = 0;    // offset of the first element from data_
      ExpressionSize<sliced_rank> sliced_dim;
      ExpressionSize<sliced_rank> sliced_offset;
      // The dimensions are processed strictly in order 0..6
      update_index(0, i0, n_filled, start, sliced_dim, sliced_offset);
      update_index(1, i1, n_filled, start, sliced_dim, sliced_offset);
      update_index(2, i2, n_filled, start, sliced_dim, sliced_offset);
      update_index(3, i3, n_filled, start, sliced_dim, sliced_offset);
      update_index(4, i4, n_filled, start, sliced_dim, sliced_offset);
      update_index(5, i5, n_filled, start, sliced_dim, sliced_offset);
      update_index(6, i6, n_filled, start, sliced_dim, sliced_offset);
      return Array<sliced_rank,Type,IsActive>(data_ + start, storage_,
                                              sliced_dim, sliced_offset);
    }
  
    // If one or more of the indices is not guaranteed to be monotonic
    // at compile time then we must return an IndexedArray, now done
    // for all possible numbers of arguments

    // Indexing a 1D array
    // Non-const version: enabled for a rank-1 array when I0 satisfies
    // internal::is_int_vector but not internal::is_ranged; the
    // IndexedArray is constructed from *this and the index object
    template <typename I0>
    typename internal::enable_if<Rank == 1 && internal::is_int_vector<I0>::value
                       && !internal::is_ranged<Rank,I0>::value,
                       internal::IndexedArray<Rank,Type,IsActive,Array,I0> >::type
    operator()(const I0& i0) {
      typedef internal::IndexedArray<Rank,Type,IsActive,Array,I0> indexed_type;
      return indexed_type(*this, i0);
    }
    // Const version of the rank-1 irregular indexing operator.  The
    // IndexedArray constructor is given a non-const Array, so the
    // constness of *this is cast away; the returned object is itself
    // const-qualified.
    template <typename I0>
    typename internal::enable_if<Rank == 1 && internal::is_int_vector<I0>::value
                       && !internal::is_ranged<Rank,I0>::value,
                       const internal::IndexedArray<Rank,Type,IsActive,
                                          Array,I0> >::type
    operator()(const I0& i0) const {
      typedef internal::IndexedArray<Rank,Type,IsActive,Array,I0> indexed_type;
      Array* self = const_cast<Array*>(this);
      return indexed_type(*self, i0);
    }
  
    // Indexing a 2D array
    // Non-const version: enabled for a rank-2 array when
    // internal::is_irreg_indexed<Rank,I0,I1>::value is true.  The
    // rank of the returned IndexedArray is is_irreg_indexed<...>::count.
    template <typename I0, typename I1>
    typename internal::enable_if<Rank == 2 && internal::is_irreg_indexed<Rank,I0,I1>::value,
                       internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1>::count,
                                    Type,IsActive,Array,I0,I1> >::type
    operator()(const I0& i0, const I1& i1) {
      typedef internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1>::count,
                                     Type,IsActive,Array,I0,I1> indexed_type;
      return indexed_type(*this, i0, i1);
    }
    // Const version of the rank-2 irregular indexing operator; the
    // constness of *this is cast away to construct the IndexedArray,
    // which is returned const-qualified
    template <typename I0, typename I1>
    typename internal::enable_if<Rank == 2 && internal::is_irreg_indexed<Rank,I0,I1>::value,
                       const internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1>::count,
                                    Type,IsActive,Array,I0,I1> >::type
    operator()(const I0& i0, const I1& i1) const {
      typedef internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1>::count,
                                     Type,IsActive,Array,I0,I1> indexed_type;
      Array* self = const_cast<Array*>(this);
      return indexed_type(*self, i0, i1);
    }

    // Indexing a 3D array
    // Non-const version: enabled for a rank-3 array when
    // internal::is_irreg_indexed<Rank,I0,I1,I2>::value is true.  The
    // rank of the returned IndexedArray is is_irreg_indexed<...>::count.
    template <typename I0, typename I1, typename I2>
    typename internal::enable_if<Rank == 3 && internal::is_irreg_indexed<Rank,I0,I1,I2>::value,
                       internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1,I2>::count,
                                    Type,IsActive,Array,I0,I1,I2> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2) {
      typedef internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1,I2>::count,
                                     Type,IsActive,Array,I0,I1,I2> indexed_type;
      return indexed_type(*this, i0, i1, i2);
    }
    // Const version of the rank-3 irregular indexing operator; the
    // constness of *this is cast away to construct the IndexedArray,
    // which is returned const-qualified
    template <typename I0, typename I1, typename I2>
    typename internal::enable_if<Rank == 3 && internal::is_irreg_indexed<Rank,I0,I1,I2>::value,
                       const internal::IndexedArray<internal::is_irreg_indexed<Rank,
                                                           I0,I1,I2>::count,
                                    Type,IsActive,Array,I0,I1,I2> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2) const {
      typedef internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1,I2>::count,
                                     Type,IsActive,Array,I0,I1,I2> indexed_type;
      Array* self = const_cast<Array*>(this);
      return indexed_type(*self, i0, i1, i2);
    }

    // Indexing a 4D array
    // Non-const version: enabled for a rank-4 array when
    // internal::is_irreg_indexed<Rank,I0,...,I3>::value is true.  The
    // rank of the returned IndexedArray is is_irreg_indexed<...>::count.
    template <typename I0, typename I1, typename I2, typename I3>
    typename internal::enable_if<Rank == 4 && internal::is_irreg_indexed<Rank,I0,I1,I2,I3>::value,
                       internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1,I2,I3>::count,
                                    Type,IsActive,Array,I0,I1,I2,I3> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) {
      typedef internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1,I2,I3>::count,
                                     Type,IsActive,Array,I0,I1,I2,I3> indexed_type;
      return indexed_type(*this, i0, i1, i2, i3);
    }
    // Const version of the rank-4 irregular indexing operator; the
    // constness of *this is cast away to construct the IndexedArray,
    // which is returned const-qualified
    template <typename I0, typename I1, typename I2, typename I3>
    typename internal::enable_if<Rank == 4 && internal::is_irreg_indexed<Rank,I0,I1,I2,I3>::value,
                       const internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1,
                                                           I2,I3>::count,
                                    Type,IsActive,Array,I0,I1,I2,I3> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const {
      typedef internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1,I2,I3>::count,
                                     Type,IsActive,Array,I0,I1,I2,I3> indexed_type;
      Array* self = const_cast<Array*>(this);
      return indexed_type(*self, i0, i1, i2, i3);
    }

    // Indexing a 5D array
    // Non-const version: enabled for a rank-5 array when
    // internal::is_irreg_indexed<Rank,I0,...,I4>::value is true.  The
    // rank of the returned IndexedArray is is_irreg_indexed<...>::count.
    template <typename I0, typename I1, typename I2, typename I3, typename I4>
    typename internal::enable_if<Rank == 5
                       && internal::is_irreg_indexed<Rank,I0,I1,I2,I3,I4>::value,
                       internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1,I2,
                                                     I3,I4>::count,
                            Type,IsActive,Array,I0,I1,I2,I3,I4> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2,
               const I3& i3, const I4& i4) {
      typedef internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1,I2,I3,I4>::count,
                                     Type,IsActive,Array,I0,I1,I2,I3,I4> indexed_type;
      return indexed_type(*this, i0, i1, i2, i3, i4);
    }
    // Const version of the rank-5 irregular indexing operator; the
    // constness of *this is cast away to construct the IndexedArray,
    // which is returned const-qualified
    template <typename I0, typename I1, typename I2, typename I3, typename I4>
    typename internal::enable_if<Rank == 5
                       && internal::is_irreg_indexed<Rank,I0,I1,I2,I3,I4>::value,
                       const internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1,I2,
                                                           I3,I4>::count,
                                  Type,IsActive,Array,I0,I1,I2,I3,I4> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2,
               const I3& i3, const I4& i4) const {
      typedef internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1,I2,I3,I4>::count,
                                     Type,IsActive,Array,I0,I1,I2,I3,I4> indexed_type;
      Array* self = const_cast<Array*>(this);
      return indexed_type(*self, i0, i1, i2, i3, i4);
    }

    // Indexing a 6D array
    // Non-const version: enabled for a rank-6 array when
    // internal::is_irreg_indexed<Rank,I0,...,I5>::value is true.  The
    // rank of the returned IndexedArray is is_irreg_indexed<...>::count.
    template <typename I0, typename I1, typename I2,
              typename I3, typename I4, typename I5>
    typename internal::enable_if<Rank == 6
                       && internal::is_irreg_indexed<Rank,I0,I1,I2,I3,I4,I5>::value,
                       internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1,I2,I3,
                                                           I4,I5>::count,
                          Type,IsActive,Array,I0,I1,I2,I3,I4,I5> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2,
               const I3& i3, const I4& i4, const I5& i5) {
      typedef internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1,I2,I3,I4,I5>::count,
                                     Type,IsActive,Array,I0,I1,I2,I3,I4,I5> indexed_type;
      return indexed_type(*this, i0, i1, i2, i3, i4, i5);
    }
    // Const version of the rank-6 irregular indexing operator; the
    // constness of *this is cast away to construct the IndexedArray,
    // which is returned const-qualified
    template <typename I0, typename I1, typename I2,
              typename I3, typename I4, typename I5>
    typename internal::enable_if<Rank == 6
                       && internal::is_irreg_indexed<Rank,I0,I1,I2,I3,I4,I5>::value,
                       const internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1,I2,I3,
                                                           I4,I5>::count,
                          Type,IsActive,Array,I0,I1,I2,I3,I4,I5> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2,
               const I3& i3, const I4& i4, const I5& i5) const {
      typedef internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1,I2,I3,I4,I5>::count,
                                     Type,IsActive,Array,I0,I1,I2,I3,I4,I5> indexed_type;
      Array* self = const_cast<Array*>(this);
      return indexed_type(*self, i0, i1, i2, i3, i4, i5);
    }

    // Indexing a 7D array
    // Non-const version: enabled for a rank-7 array when
    // internal::is_irreg_indexed<Rank,I0,...,I6>::value is true.  The
    // rank of the returned IndexedArray is is_irreg_indexed<...>::count.
    // Note that all seven index types, including I6, must take part
    // in the enabling condition so that it agrees with the trait used
    // for the return type and in the function body.
    template <typename I0, typename I1, typename I2,
              typename I3, typename I4, typename I5, typename I6>
    typename internal::enable_if<Rank == 7
                       && internal::is_irreg_indexed<Rank,I0,I1,I2,I3,I4,I5,I6>::value,
                       internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1,I2,I3,
                                                     I4,I5,I6>::count,
                          Type,IsActive,Array,I0,I1,I2,I3,I4,I5,I6> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
               const I4& i4, const I5& i5, const I6& i6) {
      static const int new_rank = internal::is_irreg_indexed<Rank,I0,I1,I2,I3,
                                                   I4,I5,I6>::count;
      return internal::IndexedArray<new_rank,Type,IsActive,Array,I0,I1,I2,I3,I4,I5,
                          I6>(*this,i0,i1,i2,i3,i4,i5,i6);
    }
    // Const version of the rank-7 irregular indexing operator; the
    // constness of *this is cast away to construct the IndexedArray,
    // which is returned const-qualified.  All seven index types,
    // including I6, must take part in the enabling condition so that
    // it agrees with the trait used for the return type and in the
    // function body.
    template <typename I0, typename I1, typename I2,
              typename I3, typename I4, typename I5, typename I6>
    typename internal::enable_if<Rank == 7
                       && internal::is_irreg_indexed<Rank,I0,I1,I2,I3,I4,I5,I6>::value,
                       const internal::IndexedArray<internal::is_irreg_indexed<Rank,I0,I1,I2,I3,
                                                           I4,I5,I6>::count,
                          Type,IsActive,Array,I0,I1,I2,I3,I4,I5,I6> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
               const I4& i4, const I5& i5, const I6& i6) const {
      static const int new_rank = internal::is_irreg_indexed<Rank,I0,I1,I2,I3,
                                                   I4,I5,I6>::count;
      return internal::IndexedArray<new_rank,Type,IsActive,Array,I0,I1,I2,I3,I4,I5,
                          I6>(*const_cast<Array*>(this),i0,i1,i2,i3,i4,i5,i6);
    }


    // Provide a C-array-like array access: for a multidimensional
    // array, operator[](i), where i is of integer type, returns an
    // array of rank one less than the original array, where the new
    // array is "sliced" at index i of dimension 0.  For a vector,
    // operator[](i) returns an l-value to the element at i.  Thus for
    // a 3D array A, A[1][2][3] returns a single element. Note that
    // this will be slower than A(1,2,3) because each operator[]
    // creates a new array (although does not copy the data).
    template <typename T>
    typename internal::enable_if<internal::is_scalar_int<T>::value && (Rank > 1),
      Array<Rank-1,Type,IsActive> >::type
    operator[](T i) {
      // Distance in elements from data_ to the start of the slice
      int start = internal::get_index_with_len(i,dimensions_[0])*offset_[0];
      // The slice keeps dimensions 1..Rank-1 of this array, shifted
      // down by one position
      ExpressionSize<Rank-1> slice_dim;
      ExpressionSize<Rank-1> slice_offset;
      for (int k = 0; k < Rank-1; ++k) {
        slice_dim[k] = dimensions_[k+1];
        slice_offset[k] = offset_[k+1];
      }
      return Array<Rank-1,Type,IsActive>(data_ + start, storage_,
                                         slice_dim, slice_offset);
    }

    // The const version, alas, throws away the constness because we
    // don't have a way of returning an unmodifiable array
    template <typename T>
    typename internal::enable_if<internal::is_scalar_int<T>::value && (Rank > 1),
      Array<Rank-1,Type,IsActive> >::type
    operator[](T i) const {
      // Distance in elements from data_ to the start of the slice
      int start = internal::get_index_with_len(i,dimensions_[0])*offset_[0];
      // The slice keeps dimensions 1..Rank-1 of this array, shifted
      // down by one position
      ExpressionSize<Rank-1> slice_dim;
      ExpressionSize<Rank-1> slice_offset;
      for (int k = 0; k < Rank-1; ++k) {
        slice_dim[k] = dimensions_[k+1];
        slice_offset[k] = offset_[k+1];
      }
      // The returned (non-const) Array is given a mutable pointer
      Type* slice_data = const_cast<Type*>(data_) + start;
      return Array<Rank-1,Type,IsActive>(slice_data, storage_,
                                         slice_dim, slice_offset);
    }


    // diag_matrix(), where *this is a 1D array, returns a DiagMatrix
    // containing the data as the diagonal, pointing to the original
    // data. Can be used as an lvalue. Only declared here; the return
    // type is a SpecialMatrix using a row-major BandEngine with zero
    // sub- and super-diagonals.
    SpecialMatrix<Type, internal::BandEngine<ROW_MAJOR,0,0>, IsActive>
    diag_matrix();

    // Return a vector that views a diagonal of this square matrix
    // without copying: offdiag == 0 selects the main diagonal,
    // offdiag > 0 starts offdiag columns to the right and offdiag < 0
    // starts -offdiag rows down.  The returned vector shares storage_
    // with this array and steps by offset_[0]+offset_[1].
    //
    // An empty matrix yields an empty vector.  Throws
    // invalid_operation if the matrix is not square, and
    // index_out_of_bounds if offdiag lies so far outside the matrix
    // that the diagonal would have a negative length.
    Array<1,Type,IsActive>
    diag_vector(Index offdiag = 0) {
      ADEPT_STATIC_ASSERT(Rank == 2, DIAG_VECTOR_ONLY_WORKS_ON_SQUARE_MATRICES);
      if (empty()) {
        // Return an empty vector
        return Array<1,Type,IsActive>();
      }
      else if (dimensions_[0] != dimensions_[1]) {
        throw invalid_operation("diag_vector member function only applicable to square matrices"
                                ADEPT_EXCEPTION_LOCATION);
      }
      else if (offdiag > dimensions_[1] || offdiag < -dimensions_[0]) {
        // Without this check the length computed below would be
        // negative and the start pointer would lie outside the matrix
        throw index_out_of_bounds("Off-diagonal index out of range in diag_vector"
                                  ADEPT_EXCEPTION_LOCATION);
      }
      else if (offdiag >= 0) {
        // Superdiagonal: start offdiag columns along the first row
        Index new_dim = std::min(dimensions_[0], dimensions_[1]-offdiag);
        return Array<1,Type,IsActive>(data_+offset_[1]*offdiag, storage_,
                                      ExpressionSize<1>(new_dim),
                                      ExpressionSize<1>(offset_[0]+offset_[1]));
      }
      else {
        // Subdiagonal: offdiag is negative, so this starts -offdiag
        // rows down the first column
        Index new_dim = std::min(dimensions_[0]+offdiag, dimensions_[1]);
        return Array<1,Type,IsActive>(data_-offset_[0]*offdiag, storage_,
                                      ExpressionSize<1>(new_dim),
                                      ExpressionSize<1>(offset_[0]+offset_[1]));
      }
    }
  
    // Return a view of the square block of this square matrix that
    // spans rows and columns ibegin to iend inclusive, sharing
    // storage_ and the offsets of this array.  Throws
    // invalid_operation if the matrix is not square and
    // index_out_of_bounds if the requested range is invalid.
    Array
    submatrix_on_diagonal(Index ibegin, Index iend) {
      ADEPT_STATIC_ASSERT(Rank == 2,
                SUBMATRIX_ON_DIAGONAL_ONLY_WORKS_ON_SQUARE_MATRICES);
      if (dimensions_[0] != dimensions_[1]) {
        throw invalid_operation("submatrix_on_diagonal member function only applicable to square matrices"
                                ADEPT_EXCEPTION_LOCATION);
      }
      if (ibegin < 0 || ibegin > iend || iend >= dimensions_[0]) {
        throw index_out_of_bounds("Dimensions out of range in submatrix_on_diagonal"
                                  ADEPT_EXCEPTION_LOCATION);
      }
      // Both bounds are inclusive
      Index block_len = iend-ibegin+1;
      ExpressionSize<2> block_dim(block_len,block_len);
      // Element (ibegin,ibegin) is ibegin steps along both dimensions
      Type* block_data = data_+ibegin*(offset_[0]+offset_[1]);
      return Array(block_data, storage_, block_dim, offset_);
    }

    // For extracting contiguous sections out of an array use the
    // following. Currently this just indexes each dimension with the
    // contiguous range(a,b) index, but in future it may be optimized.

    // 1D array subset
    template <typename B0, typename E0>
    Array
    subset(const B0& ibegin0, const E0& iend0) {
      ADEPT_STATIC_ASSERT(Rank == 1,
                          SUBSET_WITH_2_ARGS_ONLY_ON_RANK_1_ARRAY);
      // Forward to the indexing operator with one range(begin,end)
      return this->operator()(range(ibegin0,iend0));
    }
    template <typename B0, typename E0>
    const Array
    subset(const B0& ibegin0, const E0& iend0) const {
      ADEPT_STATIC_ASSERT(Rank == 1,
                          SUBSET_WITH_2_ARGS_ONLY_ON_RANK_1_ARRAY);
      // Forward to the const indexing operator with one
      // range(begin,end)
      return this->operator()(range(ibegin0,iend0));
    }

    // 2D array subset
    template <typename B0, typename E0, typename B1, typename E1>
    Array
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1) {
      ADEPT_STATIC_ASSERT(Rank == 2,
                          SUBSET_WITH_4_ARGS_ONLY_ON_RANK_2_ARRAY);
      // Forward to the indexing operator, one range per dimension
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1));
    }
    template <typename B0, typename E0, typename B1, typename E1>
    const Array
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1) const {
      ADEPT_STATIC_ASSERT(Rank == 2,
                          SUBSET_WITH_4_ARGS_ONLY_ON_RANK_2_ARRAY);
      // Forward to the const indexing operator, one range per
      // dimension
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1));
    }

    // 3D array subset
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2>
    Array
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2) {
      ADEPT_STATIC_ASSERT(Rank == 3,
                          SUBSET_WITH_6_ARGS_ONLY_ON_RANK_3_ARRAY);
      // Forward to the indexing operator, one range per dimension
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2));
    }
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2>
    const Array
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2) const {
      ADEPT_STATIC_ASSERT(Rank == 3,
                          SUBSET_WITH_6_ARGS_ONLY_ON_RANK_3_ARRAY);
      // Forward to the const indexing operator, one range per
      // dimension
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2));
    }

    // 4D array subset
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2, typename B3, typename E3>
    Array
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2,
           const B3& ibegin3, const E3& iend3) {
      ADEPT_STATIC_ASSERT(Rank == 4,
                          SUBSET_WITH_8_ARGS_ONLY_ON_RANK_4_ARRAY);
      // Forward to the indexing operator, one range per dimension
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2),
                              range(ibegin3,iend3));
    }
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2, typename B3, typename E3>
    const Array
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2,
           const B3& ibegin3, const E3& iend3) const {
      ADEPT_STATIC_ASSERT(Rank == 4,
                          SUBSET_WITH_8_ARGS_ONLY_ON_RANK_4_ARRAY);
      // Forward to the const indexing operator, one range per
      // dimension
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2),
                              range(ibegin3,iend3));
    }

    // 5D array subset
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2, typename B3, typename E3,
              typename B4, typename E4>
    Array
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2,
           const B3& ibegin3, const E3& iend3,
           const B4& ibegin4, const E4& iend4) {
      ADEPT_STATIC_ASSERT(Rank == 5,
                          SUBSET_WITH_10_ARGS_ONLY_ON_RANK_5_ARRAY);
      // Forward to the indexing operator, one range per dimension
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2),
                              range(ibegin3,iend3),
                              range(ibegin4,iend4));
    }
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2, typename B3, typename E3,
              typename B4, typename E4>
    const Array
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2,
           const B3& ibegin3, const E3& iend3,
           const B4& ibegin4, const E4& iend4) const {
      ADEPT_STATIC_ASSERT(Rank == 5,
                          SUBSET_WITH_10_ARGS_ONLY_ON_RANK_5_ARRAY);
      // Forward to the const indexing operator, one range per
      // dimension
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2),
                              range(ibegin3,iend3),
                              range(ibegin4,iend4));
    }

    // 6D array subset
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2, typename B3, typename E3,
              typename B4, typename E4, typename B5, typename E5>
    Array
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2,
           const B3& ibegin3, const E3& iend3,
           const B4& ibegin4, const E4& iend4,
           const B5& ibegin5, const E5& iend5) {
      ADEPT_STATIC_ASSERT(Rank == 6,
                          SUBSET_WITH_12_ARGS_ONLY_ON_RANK_6_ARRAY);
      // Forward to the indexing operator, one range per dimension
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2),
                              range(ibegin3,iend3),
                              range(ibegin4,iend4),
                              range(ibegin5,iend5));
    }
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2, typename B3, typename E3,
              typename B4, typename E4, typename B5, typename E5>
    const Array
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2,
           const B3& ibegin3, const E3& iend3,
           const B4& ibegin4, const E4& iend4,
           const B5& ibegin5, const E5& iend5) const {
      ADEPT_STATIC_ASSERT(Rank == 6,
                          SUBSET_WITH_12_ARGS_ONLY_ON_RANK_6_ARRAY);
      // Forward to the const indexing operator, one range per
      // dimension
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2),
                              range(ibegin3,iend3),
                              range(ibegin4,iend4),
                              range(ibegin5,iend5));
    }

    // 7D array subset
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2, typename B3, typename E3,
              typename B4, typename E4, typename B5, typename E5,
              typename B6, typename E6>
    Array
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2,
           const B3& ibegin3, const E3& iend3,
           const B4& ibegin4, const E4& iend4,
           const B5& ibegin5, const E5& iend5,
           const B6& ibegin6, const E6& iend6) {
      ADEPT_STATIC_ASSERT(Rank == 7,
                          SUBSET_WITH_14_ARGS_ONLY_ON_RANK_7_ARRAY);
      // Forward to the indexing operator, one range per dimension
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2),
                              range(ibegin3,iend3),
                              range(ibegin4,iend4),
                              range(ibegin5,iend5),
                              range(ibegin6,iend6));
    }
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2, typename B3, typename E3,
              typename B4, typename E4, typename B5, typename E5,
              typename B6, typename E6>
    const Array
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2,
           const B3& ibegin3, const E3& iend3,
           const B4& ibegin4, const E4& iend4,
           const B5& ibegin5, const E5& iend5,
           const B6& ibegin6, const E6& iend6) const {
      ADEPT_STATIC_ASSERT(Rank == 7,
                          SUBSET_WITH_14_ARGS_ONLY_ON_RANK_7_ARRAY);
      // Forward to the const indexing operator, one range per
      // dimension
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2),
                              range(ibegin3,iend3),
                              range(ibegin4,iend4),
                              range(ibegin5,iend5),
                              range(ibegin6,iend6));
    }

    // -------------------------------------------------------------------
    // Array: 5. Public member functions
    // -------------------------------------------------------------------
  
    // Link to an existing array of the same rank, type and activeness
    // Make this array refer to the same data, storage object,
    // dimensions and offsets as rhs, releasing whatever it referred
    // to before.  Throws empty_array if rhs has no data.  Linking an
    // array to itself leaves it unchanged.  Returns *this.
    Array& link(Array& rhs) {
      if (!rhs.data()) {
        throw empty_array("Attempt to link to empty array"
                          ADEPT_EXCEPTION_LOCATION);
      }
      else if (this != &rhs) {
        // The self-link case must be skipped: clear() would otherwise
        // wipe rhs (which is *this) before its members are copied,
        // leaving the array empty and possibly releasing its data
        clear();
        data_ = rhs.data();
        storage_ = rhs.storage();
        dimensions_.copy(rhs.dimensions());
        offset_.copy(rhs.offset());
        if (storage_) {
          // Register this array as an additional user of the storage
          storage_->add_link();
        }
        if (IsActive) {
          internal::GradientIndex<IsActive>::set(data_, storage_);
        }
      }
      return *this;
    }

    // Fortran-like link syntax A >>= B
    Array& operator>>=(Array& rhs) {
      // Pure synonym for link(rhs)
      return link(rhs);
    }

#ifndef ADEPT_MOVE_SEMANTICS
    // Linking to a subset of another Array, for example
    // vec1.link(vec2(range(2,4))), passes a temporary, which cannot
    // bind to Array&.  Without move semantics (C++98) the overloads
    // below therefore accept const Array& and cast the constness
    // away before forwarding to link(Array&).  The unfortunate side
    // effect is that a non-const Array can be linked to a const Array.
    Array& link(const Array& rhs) {
      return link(const_cast<Array&>(rhs));
    }
    Array& operator>>=(const Array& rhs) {
      return link(const_cast<Array&>(rhs));
    }
#else
    // With move semantics (C++11) the overloads take an rvalue
    // reference instead, so that only temporary non-const Arrays are
    // accepted; the named parameter is passed on to link(Array&)
    Array& link(Array&& rhs) {
      return link(static_cast<Array&>(rhs));
    }
    Array& operator>>=(Array&& rhs) {
      return link(static_cast<Array&>(rhs));
    }
#endif

    // To prevent linking to an rvalue expression we write a templated
    // function that will fail to compile
    template<class E>
    typename internal::enable_if<!E::is_lvalue,void>::type
    link(const Expression<Type,E>&) {
      // This overload is only selected when E::is_lvalue is false, so
      // the assertion below always fails and reports the reason
      ADEPT_STATIC_ASSERT(E::is_lvalue,
                          CAN_ONLY_LINK_TO_AN_LVALUE_EXPRESSION);
    }
    template<class E>
    typename internal::enable_if<!E::is_lvalue,void>::type
    operator>>=(const Expression<Type,E>&) {
      // This overload is only selected when E::is_lvalue is false, so
      // the assertion below always fails and reports the reason
      ADEPT_STATIC_ASSERT(E::is_lvalue,
                          CAN_ONLY_LINK_TO_AN_LVALUE_EXPRESSION);
    }

    // STL-like size() returns total length of array
    Index size() const {
      // Product of the lengths of all Rank dimensions
      Index n_elements = 1;
      int idim = 0;
      while (idim < Rank) {
        n_elements *= dimensions_[idim];
        ++idim;
      }
      return n_elements;
    }

    // Return constant reference to dimensions
    const ExpressionSize<Rank>&
    dimensions() const {
      // The reference is to the member itself, not to a copy
      return dimensions_;
    }

    // Copy the dimensions of this array into dim; always returns true
    bool
    get_dimensions_(ExpressionSize<Rank>& dim) const {
      dim = dimensions_;
      return true;
    }

    // Return individual dimension - probably deprecate "dimension" in
    // favour of "size"
    Index
    dimension(int j) const {
      // Same result as size(j); j is passed straight to dimensions_
      return dimensions_[j];
    }
    // Length of dimension j; same result as dimension(j)
    Index
    size(int j) const {
      return dimensions_[j];
    }

    // Return individual offset
    Index
    offset(int j) const {
      // j is passed straight to offset_
      return offset_[j];
    }

    // Return constant reference to offsets
    const ExpressionSize<Rank>&
    offset() const {
      // The reference is to the member itself, not to a copy
      return offset_;
    }

    // Return a constant reference to the offset of the final
    // dimension (index Rank-1)
    const Index& last_offset() const {
      return offset_[Rank-1];
    }

    // Return true if the array is empty
    bool empty() const {
      // Only the first dimension is inspected
      return !dimensions_[0];
    }

    // Return a string describing the array
    std::string info_string() const {
      // Builds a single line listing the rank, dimensions, offsets
      // and data pointer, followed by the gradient index if the
      // array is active
      std::stringstream out;
      out << "Array<" << Rank << ">, dim=" << dimensions_;
      out << ", offset=" << offset_;
      out << ", data_location=" << data_;
      if (IsActive) {
        out << ", gradient_index=" << gradient_index();
      }
      return out.str();
    }

    // Return a pointer to the start of the data
    // A mutable pointer is only available from a non-const array
    Type* data() {
      return data_;
    }
    const Type* data() const {
      return data_;
    }
    // Explicitly read-only access to the same pointer
    const Type* const_data() const {
      return data_;
    }

    // Older style
    // These return the same pointer as data() and const_data()
    Type* data_pointer() {
      return data_;
    }
    const Type* data_pointer() const {
      return data_;
    }
    const Type* const_data_pointer() const {
      return data_;
    }

    // For vectors only, we allow a pointer to be returned to a
    // specified element
    Type* data_pointer(Index i) {
      ADEPT_STATIC_ASSERT(Rank == 1, CAN_ONLY_USE_DATA_POINTER_WITH_INDEX_ON_VECTORS);
      // A null pointer is returned if there is no data; otherwise
      // element i lies i strides from the start.  No check is made
      // here that i is within the bounds of the vector.
      return data_ ? data_ + offset_[0]*i : 0;
    }
    // Read-only pointer to element i of a vector
    const Type* const_data_pointer(Index i) const {
      ADEPT_STATIC_ASSERT(Rank == 1, CAN_ONLY_USE_CONST_DATA_POINTER_WITH_INDEX_ON_VECTORS);
      // A null pointer is returned if there is no data; otherwise
      // element i lies i strides from the start.  No check is made
      // here that i is within the bounds of the vector.
      return data_ ? data_ + offset_[0]*i : 0;
    }
   
    // Return a pointer to the storage object
    Storage<Type>* storage() {
      // May be null: several members test storage_ before using it
      return storage_;
    }

    // Reset the array to its original empty state, removing the link
    // to the data (which may deallocate the data if it was the only
    // link) and set the dimensions to zero
    void clear() {
      // Give up our link to the storage object, if there is one
      if (storage_) {
        storage_->remove_link();
        storage_ = 0;
      }
      // Return all remaining members to the empty state
      dimensions_.set_all(0);
      offset_.set_all(0);
      data_ = 0;
      internal::GradientIndex<IsActive>::clear();
    }

    // Resize an array
    // Discard the existing contents and allocate new storage for an
    // array whose Rank dimensions are read from dim[0..Rank-1].  If
    // force_contiguous is true the offsets are set with
    // pack_contiguous_(), otherwise with pack_().  If any dimension
    // is zero the array is cleared instead.  Throws invalid_dimension
    // if a negative dimension is encountered, in which case the array
    // is left exactly as it was.
    void
    resize(const Index* dim, bool force_contiguous = false) {

      ADEPT_STATIC_ASSERT(!(std::numeric_limits<Type>::is_integer
            && IsActive), CANNOT_CREATE_ACTIVE_ARRAY_OF_INTEGERS);

      // Check the requested dimensions before modifying any member:
      // releasing the storage first would leave data_ and dimensions_
      // describing memory that this array no longer holds a link to
      // if the exception below were thrown
      for (int i = 0; i < Rank; ++i) {
        if (dim[i] < 0) {
          throw invalid_dimension("Negative array dimension requested"
                                  ADEPT_EXCEPTION_LOCATION);
        }
        else if (dim[i] == 0) {
          // If any of the dimensions is zero, we clear the array
          // completely and all dimensions will be zero; clear() also
          // releases the link to the existing storage
          clear();
          return;
        }
      }
      // Release the link to the existing storage, if any
      if (storage_) {
        storage_->remove_link();
        storage_ = 0;
      }
      dimensions_.copy(dim); // Copy dimensions
      if (force_contiguous) {
        pack_contiguous_();
      }
      else {
        pack_();
      }
      // Number of elements to allocate
      Index data_vol;
      if (internal::array_row_major_order) {
        data_vol = offset_[0]*dimensions_[0];
      }
      else {
        data_vol = size();
      }
      storage_ = new Storage<Type>(data_vol, IsActive);
      data_ = storage_->data();
      internal::GradientIndex<IsActive>::set(data_, storage_);
    }

    // Resize with an ExpressionSize object
    void resize(const ExpressionSize<Rank>& dim) {
      resize(&dim[0]);
    }

    // Resize using contiguous storage with an ExpressionSize object
    void resize_contiguous(const ExpressionSize<Rank>& dim) {
      resize(&dim[0], true);
    }

    // Resize specifying order
    void resize_row_major(const ExpressionSize<Rank>& dim) {
      resize(&dim[0]);
      pack_row_major_();
    }
    void resize_row_major_contiguous(const ExpressionSize<Rank>& dim) {
      resize(&dim[0], true);
      pack_row_major_contiguous_();
    }
    void resize_column_major(const ExpressionSize<Rank>& dim) {
      resize(&dim[0]);
      pack_column_major_();
    }

    // Resize with integer arguments
    void
    resize(Index m0, Index m1=-1, Index m2=-1, Index m3=-1,
           Index m4=-1, Index m5=-1, Index m6=-1) {
      // Arguments that were not supplied keep their default of -1, so
      // a negative value among the first Rank entries means either
      // too few arguments or a genuinely negative dimension
      Index dim[7] = {m0, m1, m2, m3, m4, m5, m6};
      for (int r = Rank-1; r >= 0; --r) {
        if (dim[r] < 0) {
          throw invalid_dimension("Invalid dimensions in array resize"
                                  ADEPT_EXCEPTION_LOCATION);
        }
      }
      resize(dim);
    }

    // Resize with integer arguments, forwarding to resize_row_major
    void
    resize_row_major(Index m0, Index m1=-1, Index m2=-1, Index m3=-1,
           Index m4=-1, Index m5=-1, Index m6=-1) {
      // Arguments that were not supplied keep their default of -1, so
      // a negative value among the first Rank entries means either
      // too few arguments or a genuinely negative dimension
      Index dim[7] = {m0, m1, m2, m3, m4, m5, m6};
      for (int r = Rank-1; r >= 0; --r) {
        if (dim[r] < 0) {
          throw invalid_dimension("Invalid dimensions in array resize"
                                  ADEPT_EXCEPTION_LOCATION);
        }
      }
      resize_row_major(dim);
    }

    // Resize with integer arguments, forwarding to
    // resize_column_major
    void
    resize_column_major(Index m0, Index m1=-1, Index m2=-1, Index m3=-1,
           Index m4=-1, Index m5=-1, Index m6=-1) {
      // Arguments that were not supplied keep their default of -1, so
      // a negative value among the first Rank entries means either
      // too few arguments or a genuinely negative dimension
      Index dim[7] = {m0, m1, m2, m3, m4, m5, m6};
      for (int r = Rank-1; r >= 0; --r) {
        if (dim[r] < 0) {
          throw invalid_dimension("Invalid dimensions in array resize"
                                  ADEPT_EXCEPTION_LOCATION);
        }
      }
      resize_column_major(dim);
    }

    // Resize with contiguous storage and integer arguments
    void
    resize_contiguous(Index m0, Index m1=-1, Index m2=-1, Index m3=-1,
           Index m4=-1, Index m5=-1, Index m6=-1) {
      // Arguments that were not supplied keep their default of -1, so
      // a negative value among the first Rank entries means either
      // too few arguments or a genuinely negative dimension
      Index dim[7] = {m0, m1, m2, m3, m4, m5, m6};
      for (int r = Rank-1; r >= 0; --r) {
        if (dim[r] < 0) {
          throw invalid_dimension("Invalid dimensions in array resize"
                                  ADEPT_EXCEPTION_LOCATION);
        }
      }
      // Second argument requests contiguous packing
      resize(dim, true);
    }


  protected:
    // Initialize with "MyRank" explicit dimensions, the function
    // only being defined if MyRank is equal to the actual Rank of
    // the Array
    template <int MyRank>
    typename internal::enable_if<Rank == MyRank,void>::type
    resize_(Index m0, Index m1=-1, Index m2=-1, Index m3=-1,
           Index m4=-1, Index m5=-1, Index m6=-1) {
      // Gather the arguments and pass them to the pointer-based
      // resize, which performs the checking of the dimensions
      Index requested[7] = {m0, m1, m2, m3, m4, m5, m6};
      resize(requested);
    }

    // Vectorization of arrays of rank>1 is possible provided that the
    // fastest varying dimension has padding, if necessary, to ensure
    // alignment
    template <int ARank>
    typename internal::enable_if<ARank==1 || ((ARank>1)&&!Packet<Type>::is_vectorized), bool>::type
    columns_aligned_() const {
      // Selected for rank 1, or for any rank when Packet<Type> is not
      // vectorized: nothing needs to be checked
      return true;
    }
    // Selected for rank greater than 1 when Packet<Type> is
    // vectorized: true if the offset of the second-to-last dimension
    // is a whole number of packets
    template <int ARank>
    typename internal::enable_if<(ARank>1)&&Packet<Type>::is_vectorized,bool>::type
    columns_aligned_() const {
      return !(offset_[Rank-2] % Packet<Type>::size);
    }

  public:
  
    bool is_aliased_(const Type* mem1, const Type* mem2) const {
      Type const * ptr_begin;
      Type const * ptr_end;
      data_range(ptr_begin, ptr_end);
      if (ptr_begin <= mem2 && ptr_end >= mem1) {
	return true;
      }
      else {
	return false;
      }
    }
    // True if the final dimension has unit offset and
    // columns_aligned_<Rank>() also holds
    bool all_arrays_contiguous_() const {
      if (offset_[Rank-1] != 1) {
        return false;
      }
      return columns_aligned_<Rank>();
    }

    // Is the first data element aligned to a packet boundary?
    bool is_aligned_() const {
      // Interpret the data pointer as an integer and test that none of
      // the bits in the packet alignment mask are set.  (A union of
      // the pointer with a uintptr_t would avoid the cast, but there
      // is no guarantee that uintptr_t exists.)
      const std::size_t address = reinterpret_cast<std::size_t>(data_);
      return (address & Packet<Type>::align_mask) == 0;
    }

    // Return the number of unaligned elements before reaching the
    // first element on an alignment boundary, which is in units of
    // "n" Types. The first "%" argument finds how many elements the
    // first element is above an alignment boundary; the following bit
    // then works out how many elements to the next alignment
    // boundary.
    template <int n>
    int alignment_offset_() const {
      // This is rather slow!
      // Address of the first element expressed in units of Type
      const std::size_t element_address
        = reinterpret_cast<std::size_t>(reinterpret_cast<void*>(data_))/sizeof(Type);
      // Number of elements by which the first element lies above a
      // boundary of "n" elements
      const std::size_t above_boundary = element_address % n;
      // Elements remaining to the next boundary (0 if already on one)
      return (n - above_boundary) % n;
    }

    Type value_with_len_(const Index& j, const Index& len) const {
      ADEPT_STATIC_ASSERT(Rank == 1, CANNOT_USE_VALUE_WITH_LEN_ON_ARRAY_OF_RANK_OTHER_THAN_1);
      // Element j of a vector lies j strides from the start of the
      // data; the "len" argument is not needed here
      const Index memory_index = j*offset_[0];
      return data_[memory_index];
    }

    std::string expression_string_() const {
      // The short description (name from array_helper followed by the
      // dimensions) is always returned; the alternative of printing
      // the array contents is retained but switched off
      if (true) {
        std::string description = internal::array_helper<Rank,IsActive>().name();
        description += dimensions_.str();
        return description;
      }
      else {
        std::stringstream contents;
        print(contents);
        return contents.str();
      }
    }

    // The same as operator=(inactive scalar) but does not put
    // anything on the stack
    template <typename RType>
    typename internal::enable_if<internal::is_not_expression<RType>::value, Array&>::type
    set_value(RType x) {
      // Nothing to fill in an empty array
      if (empty()) {
        return *this;
      }
      // Use the passive assignment path (second template argument
      // false) regardless of whether this array is active
      assign_inactive_scalar_<Rank,false>(x);
      return *this;
    }
  

    // Is the array contiguous in memory?
    bool is_contiguous() const {
      // Walk from the last dimension back to the first, checking that
      // the last dimension has unit stride and that the stride of each
      // earlier dimension equals the product of the lengths of all the
      // dimensions after it, i.e. that there is no padding
      Index offset_expected = 1;
      // The counter must decrease: incrementing it would index beyond
      // the end of offset_ and dimensions_
      for (int i = Rank-1; i >= 0; --i) {
        if (offset_[i] != offset_expected) {
          return false;
        }
        offset_expected *= dimensions_[i];
      }
      return true;
    }
    
    // Determine whether rows or columns are contiguous in memory and
    // increasing, needed for calling the BLAS matrix multiplication
    // functions; the first can be used to check if the fastest
    // varying dimension is contiguous, to see if array indexes can be
    // incremented simply.
    bool is_row_contiguous() const {
      // In all cases the last dimension must have unit stride
      if (offset_[Rank-1] != 1) {
        return false;
      }
      // For rank 2 or more, the stride of the second-to-last dimension
      // must additionally be at least the length of the last dimension
      if (Rank > 1) {
        return offset_[Rank-2] >= dimensions_[Rank-1];
      }
      return true;
    }
    bool is_column_contiguous() const {
      ADEPT_STATIC_ASSERT(Rank == 2, CANNOT_CHECK_COLUMN_CONTIGUOUS_IF_NOT_MATRIX);
      return offset_[0] == 1;
    }

  public:
    // Return the gradient index for the first element in the array,
    // or -1 if not active
    Index gradient_index() const {
      // Delegate to GradientIndex<IsActive>, so that the same call
      // compiles for both active and inactive arrays
      const Index first_gradient_index = internal::GradientIndex<IsActive>::get();
      return first_gradient_index;
    }

    /*
    std::ostream& print(std::ostream& os) const {
      if (empty()) {
	os << "(empty " << Rank << "-D array)";
      }
      else if (adept::internal::array_print_curly_brackets) {
	adept::ExpressionSize<Rank> i(0);
	int my_rank = -1;
	if (Rank > 1) {
	  os << "\n";
	}
	do {
	  for (int r = 0; r < my_rank+1; r++)
	    { os << " "; }
	  for (int r = my_rank+1; r < Rank; r++)
	    { os << "{"; }
	  for (i[Rank-1] = 0; i[Rank-1] < dimensions_[Rank-1]-1; ++i[Rank-1])
	    { os << data_[index_(i)] << ", "; }
	  os << data_[index_(i)];
	  my_rank = Rank-1;
	  while (--my_rank >= 0) {
	    if (++i[my_rank] >= dimensions_[my_rank]) {
	      i[my_rank] = 0;
	      os << "}";
	    }
	    else {
	      os << "},\n";
	      break;
	    }
	  }
	} while (my_rank >= 0);
	if (Rank > 1) {
	  os << "}"; // "}/n"
	}
	else {
	  os << "}";
	}
      }
      else {
	adept::ExpressionSize<Rank> i(0);
	int my_rank;
	do {
	  for (i[Rank-1] = 0; i[Rank-1] < dimensions_[Rank-1]; ++i[Rank-1]) {
	    os << " " << data_[index_(i)];
	  }
	  my_rank = Rank-1;
	  while (--my_rank >= 0) {
	    if (++i[my_rank] >= dimensions_[my_rank]) {
	      i[my_rank] = 0;
	    }
	    else {
	      break;
	    }
	  }
	  os << "\n";
	} while (my_rank >= 0);
      }
      return os;
    }
    */

    // Write the array to the stream "os" and return the stream.  The
    // text emitted before, between and after elements is taken from
    // the formatting variables in the "internal" namespace
    // (array_print_before, vector_separator, array_opening_bracket
    // and so on) rather than being hard-coded here.
    std::ostream& print(std::ostream& os) const {
      using namespace internal;
      if (empty()) {
	// Empty array: optionally report the rank between the "before"
	// and "after" strings
	os << array_print_empty_before;
	if (array_print_empty_rank) {
	  os << Rank;
	}
	os << array_print_empty_after;
      }
      else if (Rank == 1) {
	// Print a vector
	os << vector_print_before << data_[0];
	// Remaining elements are each preceded by the separator;
	// offset_[0] is the stride between successive elements
	for (int i = 1; i < dimensions_[0]; ++i) {
	  os << vector_separator << data_[i*offset_[0]];
	}
	os << vector_print_after;
      }
      else {
	// Print a multi-dimensional array
	adept::ExpressionSize<Rank> i(0);
	// my_rank is -1 on the first pass; afterwards it holds the
	// dimension whose counter was last incremented without
	// wrapping round
	int my_rank = -1;
	os << array_print_before;
	do {
	  // On passes after the first, optionally indent the line
	  // according to the depth at which the previous line stopped
	  if (array_print_indent) {
	    if (my_rank >= 0) {
	      os << " ";
	      for (int r = 0; r < my_rank*static_cast<int>(array_opening_bracket.size()); r++) {
		os << " ";
	      }
	    }
	  }
	  // Open brackets: Rank-1 of them on the first pass, otherwise
	  // one for each dimension after my_rank
	  if (my_rank == -1) {
	    for (int r = 1; r < Rank; r++) {
	      os << array_opening_bracket;
	    }
	  }
	  else {
	    for (int r = my_rank+1; r < Rank; r++) {
	      os << array_opening_bracket;
	    }
	  }
	  // Elements along the last dimension: all but the final one
	  // are followed by the contiguous separator
	  for (i[Rank-1] = 0; i[Rank-1] < dimensions_[Rank-1]-1; ++i[Rank-1]) {
	    os << data_[index_(i)] << array_contiguous_separator;
	  }
	  os << data_[index_(i)];
	  // Advance the counters of the slower dimensions, closing a
	  // bracket for each dimension that is stepped; stop at the
	  // first dimension that does not wrap round to zero
	  my_rank = Rank-1;
	  while (--my_rank >= 0) {
	    if (++i[my_rank] >= dimensions_[my_rank]) {
	      i[my_rank] = 0;
	      os << array_closing_bracket;
	    }
	    else {
	      os << array_closing_bracket << array_non_contiguous_separator;
	      break;
	    }
	  }
	  // my_rank reaches -1 only when every counter has wrapped,
	  // i.e. the whole array has been printed
	} while (my_rank >= 0);
	os << array_print_after;
      }
      return os;
    }

    // Get pointers to the first and last data members in memory.  
    void data_range(Type const * &data_begin, Type const * &data_end) const {
      // Both pointers start at the first element...
      data_begin = data_;
      data_end = data_;
      // ...then each dimension extends the range by the distance to
      // its final element: upwards if the stride is non-negative,
      // downwards if it is negative
      for (int idim = 0; idim < Rank; ++idim) {
        const Index extent = (dimensions_[idim]-1)*offset_[idim];
        if (offset_[idim] < 0) {
          data_begin += extent;
        }
        else {
          data_end += extent;
        }
      }
    }

  
    // The Stack::independent(x) and Stack::dependent(y) functions add
    // the gradient_index of objects x and y to std::vector<uIndex>
    // objects in Stack. Since x and y may be scalars or arrays, this
    // is best done by delegating to the Active or Array classes.
    template <typename IndexType>
    void push_gradient_indices(std::vector<IndexType>& vec) const {
      ADEPT_STATIC_ASSERT(IsActive,
		  CANNOT_PUSH_GRADIENT_INDICES_FOR_INACTIVE_ARRAY); 
      ExpressionSize<Rank> i(0);
      Index gradient_ind = gradient_index();
      Index index = 0;
      int my_rank;
      vec.reserve(vec.size() + size());
      do {
	// Innermost loop - note that the counter is index, not max_index
	for (Index max_index = index + dimensions_[Rank-1]*offset_[Rank-1];
	     index < max_index;
	     index += offset_[Rank-1]) {
	  vec.push_back(gradient_ind + index);
	}
	// Increment counters appropriately depending on which
	// dimensions have been finished
	advance_index(index, my_rank, i);
      } while (my_rank >= 0);
    }

    // Return inactive array linked to original data
    Array<Rank, Type, false> inactive_link() {
      // Build an inactive array from the same data pointer, storage
      // object, dimensions and offsets as the present one
      typedef Array<Rank, Type, false> InactiveArray;
      return InactiveArray(data_, storage_, dimensions_, offset_);
    }

    // Perform an in-place transpose for 2D arrays only
    Array& in_place_transpose() {
      ADEPT_STATIC_ASSERT(Rank == 2,
                          IN_PLACE_TRANSPOSE_ONLY_POSSIBLE_WITH_2D_ARRAYS);
      // No data are moved: exchanging the lengths and the strides of
      // the two dimensions is sufficient
      const Index old_dim0 = dimensions_[0];
      const Index old_offset0 = offset_[0];
      dimensions_[0] = dimensions_[1];
      dimensions_[1] = old_dim0;
      offset_[0] = offset_[1];
      offset_[1] = old_offset0;
      return *this;
    }

    // Transpose helper functions
  protected:
    template<int MyRank>
    typename internal::enable_if<MyRank == 2, Array<2,Type,IsActive> >::type
    my_T() {
      typedef Array<2,Type,IsActive> Matrix;
      // Start from an array constructed from the present one, then
      // exchange its dimensions and offsets
      Matrix transposed(*this);
      return transposed.in_place_transpose();
    }
    template<int MyRank>
    typename internal::enable_if<MyRank == 2, const Array<2,Type,IsActive> >::type
    my_T() const {
      typedef Array<2,Type,IsActive> Matrix;
      // Constness is cast away only in order to construct the new
      // array from the present one; the result is returned as const
      Matrix transposed(const_cast<Array&>(*this));
      // Exchange dimensions and offsets of the new array
      return transposed.in_place_transpose();
    }

  public:
    // Out-of-place transpose
    Array<2,Type,IsActive>
    T() {
      ADEPT_STATIC_ASSERT(Rank == 1 || Rank == 2,
                          TRANSPOSE_ONLY_POSSIBLE_WITH_1D_OR_2D_ARRAYS);
      // Dispatch on the rank to the appropriate helper
      return this->template my_T<Rank>();
    }
    const Array<2,Type,IsActive>
    T() const {
      ADEPT_STATIC_ASSERT(Rank == 1 || Rank == 2,
                          TRANSPOSE_ONLY_POSSIBLE_WITH_1D_OR_2D_ARRAYS);
      // Dispatch on the rank to the appropriate const helper
      return this->template my_T<Rank>();
    }

    // "permute" is a generalized transpose, returning an Array linked
    // to the current one but with the dimensions rearranged according
    // to idim: idim[0] is the 0-based number of the dimension of the
    // current array that will be dimension 0 of the new array,
    // idim[1] is the number of the dimension of the current array
    // that will be dimension 1 of the new array and so on.
    Array permute(const Index* idim) {
      // idim must point to at least Rank entries that together form a
      // permutation of 0..Rank-1.  Throws empty_array if the array is
      // empty and invalid_dimension if an entry is out of range or if
      // a dimension is missing from the list.
      if (empty()) {
        throw empty_array("Attempt to permute an empty array"
                          ADEPT_EXCEPTION_LOCATION);
      }
      ExpressionSize<Rank> new_dims(0);
      ExpressionSize<Rank> new_offset;
      // Record which source dimensions have been used: since exactly
      // Rank entries are read, a repeated entry implies that some
      // other dimension is missing
      bool is_used[Rank];
      for (int i = 0; i < Rank; ++i) {
        is_used[i] = false;
      }
      for (int i = 0; i < Rank; ++i) {
        const Index source_dim = idim[i];
        if (source_dim < 0 || source_dim >= Rank) {
          throw invalid_dimension("Dimensions must be in range 0 to Rank-1 in permute"
                                  ADEPT_EXCEPTION_LOCATION);
        }
        if (is_used[source_dim]) {
          throw invalid_dimension("Missing dimension in permute"
                                  ADEPT_EXCEPTION_LOCATION);
        }
        is_used[source_dim] = true;
        new_dims[i] = dimensions_[source_dim];
        new_offset[i] = offset_[source_dim];
      }
      // Retain the original check that no resulting dimension has
      // zero length
      for (int i = 0; i < Rank; ++i) {
        if (new_dims[i] == 0) {
          throw invalid_dimension("Missing dimension in permute"
                                  ADEPT_EXCEPTION_LOCATION);
        }
      }
      // The result refers to the same data and storage, with the
      // dimensions and offsets rearranged
      return Array(data_, storage_, new_dims, new_offset);
    }

    Array permute(const ExpressionSize<Rank>& idim) {
      // Forward to the pointer version using the address of the first
      // entry
      const Index* first_entry = &idim[0];
      return permute(first_entry);
    }

    // Up to 7 dimensions we can specify the dimensions as separate
    // arguments
    typename internal::enable_if<(Rank < 7), Array>::type
    permute(Index i0, Index i1, Index i2 = -1, Index i3 = -1, Index i4 = -1,
            Index i5 = -1, Index i6 = -1) {
      // Arguments left at their default of -1 were not supplied by
      // the caller
      Index order[7] = {i0, i1, i2, i3, i4, i5, i6};
      // Each of the first Rank entries must have been given explicitly
      int n = 0;
      while (n < Rank) {
        if (order[n] == -1) {
          throw invalid_dimension("Incorrect number of dimensions provided to permute"
                                  ADEPT_EXCEPTION_LOCATION);
        }
        ++n;
      }
      return permute(order);
    }

    // Only applicable to vectors, return a multi-dimensional array
    // that links to the data in the vector
    template <int NewRank>
    Array<NewRank,Type,IsActive> reshape(const ExpressionSize<NewRank>& dims) {
      // Returns an array of rank NewRank that refers to the same data
      // and storage as this vector.  Throws invalid_dimension if the
      // product of the requested dimensions differs from the length
      // of the vector.
      ADEPT_STATIC_ASSERT(Rank == 1, CANNOT_RESHAPE_MULTIDIMENSIONAL_ARRAY);
      Index new_size = 1;
      for (int i = 0; i < NewRank; ++i) {
        new_size *= dims[i];
      }
      if (new_size != dimensions_[0]) {
        // Report the location, as the other exceptions thrown in this
        // class do
        throw invalid_dimension("Size of reshaped array does not match original vector"
                                ADEPT_EXCEPTION_LOCATION);
      }
      // The last dimension inherits the stride of the vector; the
      // stride of each earlier dimension is the length of the next
      // dimension multiplied by that dimension's stride
      ExpressionSize<NewRank> offset;
      offset[NewRank-1] = offset_[0];
      for (int i = NewRank-2; i >= 0; --i) {
        offset[i] = dims[i+1]*offset[i+1];
      }
      return Array<NewRank,Type,IsActive>(data_,storage_,dims,offset);
    }

    // More convenient interfaces to reshape providing a list of
    // integer dimensions
    Array<2,Type,IsActive> reshape(Index i0, Index i1)
    {
      // Bundle the two lengths and use the general implementation
      const ExpressionSize<2> new_dims(i0,i1);
      return reshape(new_dims);
    }
    Array<3,Type,IsActive> reshape(Index i0, Index i1, Index i2)
    {
      // The rank of the ExpressionSize determines the rank of the
      // result, so it must match the number of lengths supplied
      return reshape(ExpressionSize<3>(i0,i1,i2));
    }
    Array<4,Type,IsActive> reshape(Index i0, Index i1, Index i2, Index i3)
    {
      // The rank of the ExpressionSize determines the rank of the
      // result, so it must match the number of lengths supplied
      return reshape(ExpressionSize<4>(i0,i1,i2,i3));
    }
    Array<5,Type,IsActive> reshape(Index i0, Index i1, Index i2, Index i3, Index i4)
    {
      // The rank of the ExpressionSize determines the rank of the
      // result, so it must match the number of lengths supplied
      return reshape(ExpressionSize<5>(i0,i1,i2,i3,i4));
    }
    Array<6,Type,IsActive> reshape(Index i0, Index i1, Index i2, Index i3,
                                   Index i4, Index i5)
    {
      // The rank of the ExpressionSize determines the rank of the
      // result, so it must match the number of lengths supplied
      return reshape(ExpressionSize<6>(i0,i1,i2,i3,i4,i5));
    }
    Array<7,Type,IsActive> reshape(Index i0, Index i1, Index i2, Index i3,
                                   Index i4, Index i5, Index i6)
    {
      // The rank of the ExpressionSize determines the rank of the
      // result, so it must match the number of lengths supplied
      return reshape(ExpressionSize<7>(i0,i1,i2,i3,i4,i5,i6));
    }


    // Return an Array that is a "soft" link to the data in the
    // present array; that is, it does not copy the Storage object and
    // increase the reference counter therein. This is useful in a
    // multi-threaded environment when multiple threads may wish to
    // subset the same array.
    Array soft_link() {
      // A zero in place of the storage pointer means the new array
      // does not reference the Storage object
      const Index first_gradient = gradient_index();
      return Array(data_, 0, dimensions_, offset_, first_gradient);
    }
    const Array soft_link() const {
      // As the non-const version: a zero is passed in place of the
      // storage pointer
      const Index first_gradient = gradient_index();
      return Array(data_, 0, dimensions_, offset_, first_gradient);
    }


    // Place gradients associated with the present active array into
    // the equivalent passive array provided as an argument
    template <typename MyType>
    void get_gradient(Array<Rank,MyType,false>& gradient) const {
      // "gradient" is resized to the dimensions of the present array
      // if it is empty on entry; otherwise its dimensions must already
      // match or size_mismatch is thrown
      ADEPT_STATIC_ASSERT(IsActive,CANNOT_USE_GET_GRADIENT_ON_INACTIVE_ARRAY);
      if (gradient.empty()) {
	gradient.resize(dimensions_);
      }
      else if (gradient.dimensions() != dimensions_) {
	throw size_mismatch("Attempt to get_gradient with array of different dimensions"
			    ADEPT_EXCEPTION_LOCATION);
      }
      static const int last = Rank-1;
      ExpressionSize<Rank> target_offset = gradient.offset();
      ExpressionSize<Rank> i(0);
      // Offset of the current position relative to the first element
      // of the present array
      Index index = 0;
      int my_rank;
      // Offset of the corresponding position in the target array
      Index index_target = 0;
      // Memory distance covered by one pass along the last dimension
      Index last_dim_stretch = dimensions_[last]*offset_[last];
      MyType* target = gradient.data();
      do {
	i[last] = 0;
	// The target may have different strides, so recompute its
	// offset from the counters of all but the last dimension
	index_target = 0;
	for (int r = 0; r < Rank-1; r++) {
	  index_target += i[r]*target_offset[r];
	}
	// Request the gradients for one pass along the last dimension,
	// passing the source stride followed by the target stride
	ADEPT_ACTIVE_STACK->get_gradients(gradient_index()+index,
				  gradient_index()+index+last_dim_stretch,
				  target+index_target, offset_[last], target_offset[last]);
	// advance_index begins by subtracting the extent of the last
	// dimension, so add it here first
	index += last_dim_stretch;
	advance_index(index, my_rank, i);
      } while (my_rank >= 0);
    }

    // Return an inactive array of the same type and rank as the
    // present active array containing the gradients associated with
    // it
    Array<Rank,Type,false> get_gradient() const {
      // Start from an empty array, which the overload taking an
      // argument resizes to match the present array before filling it
      Array<Rank,Type,false> result;
      get_gradient(result);
      return result;
    }


    // Set gradients associated with the present active array to 
    // the equivalent passive array provided as an argument
    template <typename MyType>
    void set_gradient(const Array<Rank,MyType,false>& gradient) const {
      // The dimensions of "gradient" must match those of the present
      // array, otherwise size_mismatch is thrown
      ADEPT_STATIC_ASSERT(IsActive,CANNOT_USE_SET_GRADIENT_ON_INACTIVE_ARRAY);
      if (gradient.dimensions() != dimensions_) {
	throw size_mismatch("Attempt to set_gradient to an array of different dimensions"
			    ADEPT_EXCEPTION_LOCATION);
      }
      static const int last = Rank-1;
      ExpressionSize<Rank> src_offset = gradient.offset();
      ExpressionSize<Rank> i(0);
      // Offset of the current position relative to the first element
      // of the present array
      Index index = 0;
      int my_rank;
      // Offset of the corresponding position in the source array
      Index index_src = 0;
      // Memory distance covered by one pass along the last dimension
      Index last_dim_stretch = dimensions_[last]*offset_[last];
      const MyType* src = gradient.data();
      do {
	i[last] = 0;
	// The source may have different strides, so recompute its
	// offset from the counters of all but the last dimension
	index_src = 0;
	for (int r = 0; r < Rank-1; r++) {
	  index_src += i[r]*src_offset[r];
	}
	// NOTE(review): the source stride is passed before the stride
	// of the present array here, whereas get_gradient passes the
	// stride of the present array first - confirm against the
	// argument order of Stack::set_gradients
	ADEPT_ACTIVE_STACK->set_gradients(gradient_index()+index,
					  gradient_index()+index+last_dim_stretch,
					  src+index_src, src_offset[last], offset_[last]);
	// advance_index begins by subtracting the extent of the last
	// dimension, so add it here first
	index += last_dim_stretch;
	advance_index(index, my_rank, i);
      } while (my_rank >= 0);
    }

    
    // std::vector<typename internal::active_scalar<Type,IsActive>::type>
    // std_vector() const {
    //   ADEPT_STATIC_ASSERT(Rank == 1, STD_VECTOR_ONLY_AVAILABLE_FOR_RANK_1_ARRAYS);
    //   std::vector<typename internal::active_scalar<Type,IsActive>::type> data(dimensions_[0]);
    //   for (Index i = 0; i < dimensions_[0]; ++i) {
    // 	data[i] = (*this)(i);
    //   }
    //   return data;
    // }

    void
    put(std::vector<typename internal::active_scalar<Type,IsActive>::type>& data) const {
      ADEPT_STATIC_ASSERT(Rank == 1, PUT_ONLY_AVAILABLE_FOR_RANK_1_ARRAYS);
      // Make the std::vector the same length as this vector
      if (data.size() != dimensions_[0]) {
        data.resize(dimensions_[0]);
      }
      // Copy element by element from the array into the std::vector
      Index ielem = 0;
      while (ielem < dimensions_[0]) {
        data[ielem] = (*this)(ielem);
        ++ielem;
      }
    }

    void
    get(const std::vector<typename internal::active_scalar<Type,IsActive>::type>& data) {
      ADEPT_STATIC_ASSERT(Rank == 1, GET_ONLY_AVAILABLE_FOR_RANK_1_ARRAYS);
      // Make this vector the same length as the std::vector
      if (data.size() != dimensions_[0]) {
        resize(data.size());
      }
      // Copy element by element from the std::vector into the array
      Index ielem = 0;
      while (ielem < dimensions_[0]) {
        (*this)(ielem) = data[ielem];
        ++ielem;
      }
    }


    // -------------------------------------------------------------------
    // Array: 6. Member functions accessed by the Expression class
    // -------------------------------------------------------------------

    template <int MyArrayNum, int NArrays>
    void set_location_(const ExpressionSize<Rank>& i, 
		       ExpressionSize<NArrays>& index) const {
      index[MyArrayNum] = index_(i);
    }
    
    template <int MyArrayNum, int NArrays>
    Type value_at_location_(const ExpressionSize<NArrays>& loc) const {
      // Read the element at the memory index held for this array
      return *(data_ + loc[MyArrayNum]);
    }
    template <int MyArrayNum, int NArrays>
    Packet<Type> packet_at_location_(const ExpressionSize<NArrays>& loc) const {
      // Construct a packet from the address of the element at the
      // memory index held for this array
      Type* const source = data_ + loc[MyArrayNum];
      return Packet<Type>(source);
    }

    Type& lvalue_at_location(const Index& loc) {
      // Writable reference to the element at memory index "loc"
      return *(data_ + loc);
    }

    // Return a scalar
    template <bool IsAligned, int MyArrayNum, typename PacketType,
              int NArrays>
    typename internal::enable_if<internal::is_same<Type,PacketType>::value,
                                 Type>::type
    values_at_location_(const ExpressionSize<NArrays>& loc) const {
      // Selected when PacketType is simply Type: read one element
      return *(data_ + loc[MyArrayNum]);
    }

    // Return a Packet from an aligned memory address
    template <bool IsAligned, int MyArrayNum, typename PacketType,
              int NArrays>
    typename internal::enable_if<IsAligned
                                 && internal::is_same<Packet<Type>,PacketType>::value,
                                 PacketType>::type
    values_at_location_(const ExpressionSize<NArrays>& loc) const {
      // Selected when IsAligned is true and PacketType is Packet<Type>
      Type* const source = data_ + loc[MyArrayNum];
      return Packet<Type>(source);
    }

    // Return a Packet from an unaligned memory address
    template <bool IsAligned, int MyArrayNum, typename PacketType,
              int NArrays>
    typename internal::enable_if<!IsAligned
                                 && internal::is_same<Packet<Type>,PacketType>::value,
                                 PacketType>::type
    values_at_location_(const ExpressionSize<NArrays>& loc) const {
      // Selected when IsAligned is false and PacketType is
      // Packet<Type>; the integer dummy second argument indicates an
      // unaligned load
      Type* const source = data_ + loc[MyArrayNum];
      return Packet<Type>(source, 0);
    }

    // Return a scalar
    template <bool UseStored, bool IsAligned, int MyArrayNum, int MyScratchNum,
              typename PacketType, int NArrays, int NScratch>
    typename internal::enable_if<internal::is_same<Type,PacketType>::value,
                                 Type>::type
    values_at_location_store_(const ExpressionSize<NArrays>& loc,
                              internal::ScratchVector<NScratch,PacketType>& scratch) const {
      // Selected when PacketType is simply Type: read one element.
      // The scratch vector is not touched.
      return *(data_ + loc[MyArrayNum]);
    }

    // Return a Packet from an aligned memory address
    template <bool UseStored, bool IsAligned, int MyArrayNum, int MyScratchNum,
              typename PacketType, int NArrays, int NScratch>
    typename internal::enable_if<IsAligned
                                 && internal::is_same<Packet<Type>,PacketType>::value,
                                 PacketType>::type
    values_at_location_store_(const ExpressionSize<NArrays>& loc,
                              internal::ScratchVector<NScratch,PacketType>& scratch) const {
      // Selected when IsAligned is true and PacketType is
      // Packet<Type>.  The scratch vector is not touched.
      Type* const source = data_ + loc[MyArrayNum];
      return Packet<Type>(source);
    }
    // Return a Packet from an unaligned memory address
    template <bool UseStored, bool IsAligned, int MyArrayNum, int MyScratchNum,
              typename PacketType, int NArrays, int NScratch>
    typename internal::enable_if<!IsAligned
                                 && internal::is_same<Packet<Type>,PacketType>::value,
                                 PacketType>::type
    values_at_location_store_(const ExpressionSize<NArrays>& loc,
                              internal::ScratchVector<NScratch,PacketType>& scratch) const {
      // Selected when IsAligned is false and PacketType is
      // Packet<Type>; the extra integer argument is passed as in the
      // unaligned values_at_location_.  The scratch vector is not
      // touched.
      Type* const source = data_ + loc[MyArrayNum];
      return Packet<Type>(source, 0);
    }
   
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    Type value_at_location_store_(const ExpressionSize<NArrays>& loc,
                                  internal::ScratchVector<NScratch>& scratch) const {
      // An array element needs nothing saved in "scratch": simply
      // read it from memory
      return *(data_ + loc[MyArrayNum]);
    }

    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    Type value_stored_(const ExpressionSize<NArrays>& loc,
                       const internal::ScratchVector<NScratch>& scratch) const {
      // Nothing is kept in "scratch" for an array, so the value is
      // read again from memory
      return *(data_ + loc[MyArrayNum]);
    }

    template <int MyArrayNum, int NArrays>
    void advance_location_(ExpressionSize<NArrays>& loc) const {
      // Move this array's memory index on by one step along the last
      // dimension
      const Index last_dim_stride = offset_[Rank-1];
      loc[MyArrayNum] += last_dim_stride;
    }

    // If an expression leads to calc_gradient being called on an
    // active object, we push the multiplier and the gradient index on
    // to the operation stack (or 1.0 if no multiplier is specified)
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    void calc_gradient_(Stack& stack, const ExpressionSize<NArrays>& loc,
                        const internal::ScratchVector<NScratch>& scratch) const {
      // No multiplier supplied: push 1.0 together with the gradient
      // index of the element at the current location
      const Index element_gradient_index = gradient_index() + loc[MyArrayNum];
      stack.push_rhs(1.0, element_gradient_index);
    }
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, typename MyType>
    void calc_gradient_(Stack& stack, const ExpressionSize<NArrays>& loc,
                        const internal::ScratchVector<NScratch>& scratch,
                        const MyType& multiplier) const {
      // Push the supplied multiplier together with the gradient index
      // of the element at the current location
      const Index element_gradient_index = gradient_index() + loc[MyArrayNum];
      stack.push_rhs(multiplier, element_gradient_index);
    }
  
    template <int MyArrayNum, int MyScratchNum, int MyActiveNum,
              int NArrays, int NScratch, int NActive>
    void calc_gradient_packet_(Stack& stack,
                               const ExpressionSize<NArrays>& loc,
                               const internal::ScratchVector<NScratch,Packet<Real> >& scratch,
                               internal::ScratchVector<NActive,Packet<Real> >& gradients) const {
      // Push the gradient indices starting at the current location on
      // to the stack...
      const Index first_gradient_index = gradient_index() + loc[MyArrayNum];
      stack.push_rhs_indices<Packet<Real>::size,NActive>(first_gradient_index);
      // ...and, with no multiplier supplied, record a packet of ones
      // in this array's slot of "gradients"
      const Real unity = static_cast<Real>(1.0);
      gradients[MyActiveNum] = Packet<Real>(unity);
    }

    template <int MyArrayNum, int MyScratchNum, int MyActiveNum,
              int NArrays, int NScratch, int NActive, typename MyType>
    void calc_gradient_packet_(Stack& stack,
                               const ExpressionSize<NArrays>& loc,
                               const internal::ScratchVector<NScratch,Packet<Real> >& scratch,
                               internal::ScratchVector<NActive,Packet<Real> >& gradients,
                               const MyType& multiplier) const {
      // Push the gradient indices starting at the current location on
      // to the stack...
      const Index first_gradient_index = gradient_index() + loc[MyArrayNum];
      stack.push_rhs_indices<Packet<Real>::size,NActive>(first_gradient_index);
      // ...and record the supplied multiplier in this array's slot of
      // "gradients"
      gradients[MyActiveNum] = multiplier;
    }


    // -------------------------------------------------------------------
    // Array: 7. Protected member functions
    // -------------------------------------------------------------------
  protected:

    // Set the memory offsets from the array dimensions either
    // assuming C++-style row-major order, or Fortran-style
    // column-major order. The pack_() function spaces the data so
    // that all arrays are aligned to packet boundaries, to facilitate
    // vectorization.
    void pack_row_major_() {
      offset_[Rank-1] = 1;
      if (Rank > 1) {
	// Round up to nearest packet size so that all rows are aligned
	if (dimensions_[Rank-1] >= Packet<Type>::size*2) {
	  offset_[Rank-2] = ((dimensions_[Rank-1] + Packet<Type>::size - 1) / Packet<Type>::size) * Packet<Type>::size;
	}
	else {
	  offset_[Rank-2] = dimensions_[Rank-1];
	}
	for (int i = Rank-3; i >= 0; --i) {
	  offset_[i] = dimensions_[i+1]*offset_[i+1];
	}
      }
    }
    void pack_column_major_() {
      offset_[0] = 1;
      for (int i = 1; i < Rank; ++i) {
	offset_[i] = dimensions_[i-1]*offset_[i-1];
      }
    }
    void pack_() {
      // Select the layout according to the global ordering flag
      if (!internal::array_row_major_order) {
        pack_column_major_();
        return;
      }
      pack_row_major_();
    }

    // ...while the pack_contiguous_() function makes sure all data
    // are contiguous in memory
    void pack_row_major_contiguous_() {
      offset_[Rank-1] = 1;
      for (int i = Rank-2; i >= 0; --i) {
	offset_[i] = dimensions_[i+1]*offset_[i+1];
      }
    }

    void pack_contiguous_() {
      // Select the layout according to the global ordering flag; the
      // column-major packing is used unchanged
      if (!internal::array_row_major_order) {
        pack_column_major_();
        return;
      }
      pack_row_major_contiguous_();
    }

    // Return the memory index (relative to data_) for array element
    // indicated by j
    Index index_(Index j[Rank]) const {
      // Sum over dimensions of stride times index
      Index memory_index = 0;
      int idim = 0;
      while (idim < Rank) {
        memory_index += offset_[idim]*j[idim];
        ++idim;
      }
      return memory_index;
    }
    Index index_(const ExpressionSize<Rank>& j) const {
      // Sum over dimensions of stride times index
      Index memory_index = 0;
      int idim = 0;
      while (idim < Rank) {
        memory_index += offset_[idim]*j[idim];
        ++idim;
      }
      return memory_index;
    }

    // Used in traversing through an array
    void advance_index(Index& index, int& rank, ExpressionSize<Rank>& i) const {
      // Undo a complete pass along the last dimension
      index -= offset_[Rank-1]*dimensions_[Rank-1];
      // Work through the slower dimensions from the second-to-last to
      // the first.  On return "rank" is the dimension whose counter
      // was incremented without wrapping, or -1 if every counter
      // wrapped round.
      for (rank = Rank-2; rank >= 0; --rank) {
        ++i[rank];
        if (i[rank] < dimensions_[rank]) {
          // Still within this dimension: take one step along it
          index += offset_[rank];
          break;
        }
        // This dimension is finished: reset its counter, move back to
        // its start and carry on to the next slower dimension
        i[rank] = 0;
        index -= offset_[rank]*(dimensions_[rank]-1);
      }
    }

    // When assigning a scalar to a whole array, there may be
    // advantage in specialist behaviour depending on the rank of the
    // array. This is a generic one that copies the number but treats
    // the present array as passive.
    template <int LocalRank, bool LocalIsActive, typename X>
    typename internal::enable_if<!LocalIsActive,void>::type
    assign_inactive_scalar_(X x) {
      ExpressionSize<LocalRank> counter(0);
      Index mem_offset = 0;
      int finished_rank;
      do {
        // Sweep along the last dimension writing the value to each
        // element
        const Index sweep_end
          = mem_offset + dimensions_[LocalRank-1]*offset_[LocalRank-1];
        while (mem_offset < sweep_end) {
          data_[mem_offset] = x;
          mem_offset += offset_[LocalRank-1];
        }
        // Step the counters of the slower dimensions; finished_rank
        // becomes negative once the whole array has been visited
        advance_index(mem_offset, finished_rank, counter);
      } while (finished_rank >= 0);
    }

    // An active array being assigned the value of an inactive scalar
    template <int LocalRank, bool LocalIsActive, typename X>
    typename internal::enable_if<LocalIsActive,void>::type
    assign_inactive_scalar_(X x) {
#ifdef ADEPT_RECORDING_PAUSABLE
      // When the stack is not recording, defer to the inactive
      // version, which does not touch the stack
      if (! ADEPT_ACTIVE_STACK->is_recording()) {
        assign_inactive_scalar_<LocalRank, false, X>(x);
        return;
      }
#endif

      ExpressionSize<LocalRank> counter(0);
      const Index first_gradient = gradient_index();
      Index mem_offset = 0;
      int finished_rank;
      do {
        // Register one pass along the last dimension with the stack
        // (start gradient index, number of elements, stride)...
        ADEPT_ACTIVE_STACK->push_lhs_range(first_gradient+mem_offset,
                                           dimensions_[LocalRank-1],
                                           offset_[LocalRank-1]);
        // ...then write the value to each element of that pass
        const Index sweep_end
          = mem_offset + dimensions_[LocalRank-1]*offset_[LocalRank-1];
        while (mem_offset < sweep_end) {
          data_[mem_offset] = x;
          mem_offset += offset_[LocalRank-1];
        }

        // Step the counters of the slower dimensions; finished_rank
        // becomes negative once the whole array has been visited
        advance_index(mem_offset, finished_rank, counter);
      } while (finished_rank >= 0);
    }


    // When copying an expression to a whole array, there may be
    // advantage in specialist behaviour depending on the rank of the
    // array
    template<int LocalRank, bool LocalIsActive, bool EIsActive, class E>
    inline
    typename internal::enable_if<!LocalIsActive && (!internal::expr_cast<E>::is_vectorizable
					  || !internal::is_same<typename E::type,Type>::value),void>::type
    assign_expression_(const E& rhs) {
      // Scalar (non-vectorized) assignment of an inactive expression
      // to an inactive array: selected when the expression is not
      // vectorizable or its element type differs from Type
      ADEPT_STATIC_ASSERT(!EIsActive, CANNOT_ASSIGN_ACTIVE_EXPRESSION_TO_INACTIVE_ARRAY);
      // Counters for each dimension of the present array
      ExpressionSize<LocalRank> i(0);
      // Memory locations within each array of the expression
      ExpressionSize<internal::expr_cast<E>::n_arrays> ind(0);
      // Memory offset into the present array
      Index index = 0;
      int my_rank;
      static const int last = LocalRank-1;
      // FIX!!!
      // The contiguous branch is currently disabled by the constant
      // condition, so the general branch below is always taken
      if (false) { //rhs.all_arrays_contiguous()) {
	do {
	  i[last] = 0;
	  rhs.set_location(i, ind);
	  // Innermost loop
	  for ( ; i[last] < dimensions_[last]; ++i[last],
		  index += offset_[last]) {
	    // Note that this is faster as we know that all indices
	    // need to be incremented by 1
	    data_[index] = rhs.next_value_contiguous(ind);
	  }
	  advance_index(index, my_rank, i);
	} while (my_rank >= 0);
      }
      else {
	do {
	  // Position the expression at the start of the current pass
	  // along the last dimension
	  i[last] = 0;
	  rhs.set_location(i, ind);
	  // Innermost loop
	  for ( ; i[last] < dimensions_[last]; ++i[last],
		  index += offset_[last]) {
	    data_[index] = rhs.next_value(ind);
	  }
	  // Move on to the next pass; my_rank becomes negative when
	  // the whole array has been visited
	  advance_index(index, my_rank, i);
	} while (my_rank >= 0);
      }
    }

    // Vectorized version for Rank-1 arrays
    template<int LocalRank, bool LocalIsActive, bool EIsActive, class E>
    inline //__attribute__((always_inline))
    typename internal::enable_if<!LocalIsActive && internal::expr_cast<E>::is_vectorizable && LocalRank == 1
		       && internal::is_same<typename E::type,Type>::value,void>::type
      // Removing the reference speeds things up because otherwise E
      // is dereferenced each loop
      //  assign_expression_(const E& __restrict rhs) {
      assign_expression_(const E rhs) {
      // Assignment of an inactive, vectorizable expression of the same
      // element type to an inactive vector, using packets where the
      // data layout allows
      ADEPT_STATIC_ASSERT(!EIsActive, CANNOT_ASSIGN_ACTIVE_EXPRESSION_TO_INACTIVE_ARRAY);
      ExpressionSize<1> i(0);
      ExpressionSize<internal::expr_cast<E>::n_arrays> ind(0);

      // The packet path requires at least two packets' worth of
      // elements, unit stride in this vector and contiguous arrays on
      // the right-hand side
      if (dimensions_[0] >= Packet<Type>::size*2
	  && offset_[0] == 1
	  && rhs.all_arrays_contiguous()
	  ) {
	// Contiguous source and destination data
	// Elements [istartvec,iendvec) are processed as packets; those
	// before and after are processed one at a time
	Index istartvec = 0;
	Index iendvec = 0;

	istartvec = rhs.alignment_offset();
	// If the expression reports a negative offset, or its offset
	// differs from that of this vector, make the packet range
	// empty so that every element is handled by the scalar loops
	if (istartvec < 0 || istartvec != alignment_offset_<Packet<Type>::size>()) {
	  istartvec = iendvec = 0;
	}
	else  {
	  // Adjust iendvec such that iendvec-istartvec is a multiple
	  // of the packet size
	  iendvec = (dimensions_[0]-istartvec);
	  iendvec -= (iendvec % Packet<Type>::size);
	  iendvec += istartvec;
	}
	i[0] = 0;
	rhs.set_location(i, ind);
	Type* const __restrict t = data_; // Avoids an unnecessary load for some reason
	// Innermost loop
	for (int index = 0; index < istartvec; ++index) {
	  // Scalar version
	  t[index] = rhs.next_value_contiguous(ind);
	}
	for (int index = istartvec ; index < iendvec;
	     index += Packet<Type>::size) {
	  // Vectorized version
	  //	    rhs.next_packet(ind).put(data_+index)
	  // FIX may need unaligned store
	  rhs.next_packet(ind).put(t+index);
	}
	for (int index = iendvec ; index < dimensions_[0]; ++index) {
	  // Scalar version
	  t[index] = rhs.next_value_contiguous(ind);
	}
      }
      else {
	// Non-contiguous source or destination data
	i[0] = 0;
	rhs.set_location(i, ind);
	Type* const __restrict t = data_; // Avoids an unnecessary load for some reason
	// i[0] counts elements while index is the memory offset, which
	// advances by the stride of this vector
	for (int index = 0; i[0] < dimensions_[0]; ++i[0],
	       index += offset_[0]) {
	  t[index] = rhs.next_value(ind);
	}
      }
    }

    // Vectorized version
    template<int LocalRank, bool LocalIsActive, bool EIsActive, class E>
    inline
    typename internal::enable_if<!LocalIsActive && internal::expr_cast<E>::is_vectorizable && (LocalRank > 1)
                       && internal::is_same<typename E::type,Type>::value,void>::type
    // Removing the reference speeds things up because otherwise E
    // is dereferenced each loop
    //  assign_expression_(const E& rhs) 
      assign_expression_(const E rhs) {
      // Assignment of an inactive, vectorizable expression of the same
      // element type to an inactive array of rank 2 or more, using
      // packets along the last dimension where the data layout allows
      ADEPT_STATIC_ASSERT(!EIsActive, CANNOT_ASSIGN_ACTIVE_EXPRESSION_TO_INACTIVE_ARRAY);
      // Counters for each dimension of the present array
      ExpressionSize<LocalRank> i(0);
      // Memory locations within each array of the expression
      ExpressionSize<internal::expr_cast<E>::n_arrays> ind(0);
      // Memory offset into the present array
      Index index = 0;
      int my_rank;
      static const int last = LocalRank-1;
      
      // The packet path requires at least two packets' worth of
      // elements in the last dimension, and that both this array and
      // the arrays on the right-hand side pass their contiguity checks
      if (dimensions_[last] >= Packet<Type>::size*2
	  && all_arrays_contiguous_()
	  && rhs.all_arrays_contiguous()) {
	// Contiguous source and destination data
	// Within each pass along the last dimension, elements
	// [istartvec,iendvec) are processed as packets
	int iendvec;
	int istartvec = rhs.alignment_offset();
	// If the expression reports a negative offset, or its offset
	// differs from that of this array, make the packet range empty
	// so that every element is handled by the scalar loops
	if (istartvec < 0 || istartvec != alignment_offset_<Packet<Type>::size>()) {
	  istartvec = iendvec = 0;
	}
	else {
	  // Make iendvec-istartvec a multiple of the packet size
	  iendvec = (dimensions_[last]-istartvec);
	  iendvec -= (iendvec % Packet<Type>::size);
	  iendvec += istartvec;
	}


	do {
	  i[last] = 0;
	  rhs.set_location(i, ind);
	  // Innermost loop
	  for ( ; i[last] < istartvec; ++i[last], ++index) {
	    // Scalar version
	    data_[index] = rhs.next_value_contiguous(ind);
	  }
	  Type* const __restrict t = data_; // Avoids an unnecessary load for some reason
	  for ( ; i[last] < iendvec; i[last] += Packet<Type>::size,
		  index += Packet<Type>::size) {
	    // Vectorized version
	    //	    rhs.next_packet(ind).put(data_+index);
	    // FIX may need unaligned store
	    rhs.next_packet(ind).put(t+index);
	  }
	  for ( ; i[last] < dimensions_[last]; ++i[last], ++index) {
	    // Scalar version
	    data_[index] = rhs.next_value_contiguous(ind);
	  }
	  // Move on to the next pass; my_rank becomes negative when
	  // the whole array has been visited
	  advance_index(index, my_rank, i);
	} while (my_rank >= 0);
      }
      else {
	// Non-contiguous source or destination data
	do {
	  i[last] = 0;
	  rhs.set_location(i, ind);
	  // Innermost loop
	  for ( ; i[last] < dimensions_[last]; ++i[last],
		  index += offset_[last]) {
	    data_[index] = rhs.next_value(ind);
	  }
	  advance_index(index, my_rank, i);
	} while (my_rank >= 0);
      }
    }

    template<int LocalRank, bool LocalIsActive, bool EIsActive, class E>
    inline
    typename internal::enable_if<LocalIsActive && EIsActive,void>::type
  //    assign_expression_(const E& rhs) {
    assign_expression_(const E rhs) {
      // Assignment of an active expression to an active array: each
      // element is assigned its value and the corresponding
      // information is pushed on to the active stack
      // If recording has been paused then call the inactive version
#ifdef ADEPT_RECORDING_PAUSABLE
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
	assign_expression_<LocalRank,false,false>(rhs);
	return;
      }
#endif
      // Counters for each dimension of the present array
      ExpressionSize<LocalRank> i(0);
      // Memory locations within each array of the expression
      ExpressionSize<internal::expr_cast<E>::n_arrays> ind(0);
      // Memory offset into the present array
      Index index = 0;
      int my_rank;
      static const int last = LocalRank-1;

      // Ask the stack to check for space, in a single call, for the
      // number of active terms in the expression multiplied by the
      // number of elements
      ADEPT_ACTIVE_STACK->check_space(internal::expr_cast<E>::n_active * size());

      if (internal::expr_cast<E>::is_vectorizable && rhs.all_arrays_contiguous()) {
	// Contiguous source and destination data
	Type* const __restrict t = data_; // Avoids an unnecessary load for some reason
	do {
	  i[last] = 0;
	  rhs.set_location(i, ind);
	  // Innermost loop
	  for ( ; i[last] < dimensions_[last]; ++i[last],
		  index += offset_[last]) {
	    // The right-hand side is evaluated with the stack passed
	    // in, then the gradient index of the element being
	    // assigned is pushed as the left-hand side
	    t[index] = rhs.next_value_and_gradient_contiguous(*ADEPT_ACTIVE_STACK, ind);
	    ADEPT_ACTIVE_STACK->push_lhs(gradient_index()+index); // What if RHS not active?
	  }
	  // Move on to the next pass; my_rank becomes negative when
	  // the whole array has been visited
	  advance_index(index, my_rank, i);
	} while (my_rank >= 0);
      }
      else {
	// Non-contiguous source or destination data
	Type* const __restrict t = data_; // Avoids an unnecessary load for some reason
	do {
	  i[last] = 0;
	  rhs.set_location(i, ind);
	  // Innermost loop
	  for ( ; i[last] < dimensions_[last]; ++i[last],
		  index += offset_[last]) {
	    t[index] = rhs.next_value_and_gradient(*ADEPT_ACTIVE_STACK, ind);
	    ADEPT_ACTIVE_STACK->push_lhs(gradient_index()+index); // What if RHS not active?
	  }
	  advance_index(index, my_rank, i);
	} while (my_rank >= 0);
      }
    }

    // Assign the inactive expression "rhs" to this active array. For
    // each run along the final dimension a single push_lhs_range call
    // is made on the active stack (start gradient index, run length
    // and stride), and the values are then copied into the array.
    template<int LocalRank, bool LocalIsActive, bool EIsActive, class E>
    inline
    typename internal::enable_if<LocalIsActive && !EIsActive,void>::type
    assign_expression_(const E& rhs) {
#ifdef ADEPT_RECORDING_PAUSABLE
      // With recording paused, hand the work to the overload selected
      // by <LocalRank,false,false> instead
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        assign_expression_<LocalRank,false,false>(rhs);
        return;
      }
#endif
      // Index of the final dimension, which the innermost loop walks
      static const int inner = LocalRank-1;

      ExpressionSize<LocalRank> coord(0);  // Multi-dimensional position
      ExpressionSize<internal::expr_cast<E>::n_arrays> rhs_loc(0);
      Index lhs_index = 0;                 // Position within data_
      int outer_rank;                      // Set by advance_index
      Index first_gradient = gradient_index();

      do {
        coord[inner] = 0;
        rhs.set_location(coord, rhs_loc);
        // One call covers the whole run about to be written
        ADEPT_ACTIVE_STACK->push_lhs_range(first_gradient+lhs_index,
                                           dimensions_[inner],
                                           offset_[inner]);
        while (coord[inner] < dimensions_[inner]) {
          data_[lhs_index] = rhs.next_value(rhs_loc);
          ++coord[inner];
          lhs_index += offset_[inner];
        }
        advance_index(lhs_index, outer_rank, coord);
      } while (outer_rank >= 0);
    }


    // Inactive array: write the scalar "rhs" into every element for
    // which the corresponding value of "bool_expr" is true, leaving
    // all other elements untouched
    template<bool LocalIsActive, class B, typename C>
    typename internal::enable_if<!LocalIsActive,void>::type
    assign_conditional_inactive_scalar_(const B& bool_expr, C rhs) {
      // Index of the final dimension, which the innermost loop walks
      static const int inner = Rank-1;

      ExpressionSize<Rank> coord(0);       // Multi-dimensional position
      ExpressionSize<internal::expr_cast<B>::n_arrays> mask_loc(0);
      Index lhs_index = 0;                 // Position within data_
      int outer_rank;                      // Set by advance_index

      do {
        coord[inner] = 0;
        bool_expr.set_location(coord, mask_loc);
        while (coord[inner] < dimensions_[inner]) {
          if (bool_expr.next_value(mask_loc)) {
            data_[lhs_index] = rhs;
          }
          ++coord[inner];
          lhs_index += offset_[inner];
        }
        advance_index(lhs_index, outer_rank, coord);
      } while (outer_rank >= 0);
    }

    // Active array: write the scalar "rhs" into every element for
    // which the corresponding value of "bool_expr" is true, and push
    // the gradient index of each element written as a left-hand side
    // on the active stack
    template<bool LocalIsActive, class B, typename C>
    typename internal::enable_if<LocalIsActive,void>::type
    assign_conditional_inactive_scalar_(const B& bool_expr, C rhs) {
#ifdef ADEPT_RECORDING_PAUSABLE
      // With recording paused, use the <false, B, C> overload instead
      if (! ADEPT_ACTIVE_STACK->is_recording()) {
        assign_conditional_inactive_scalar_<false, B, C>(bool_expr, rhs);
        return;
      }
#endif
      // Index of the final dimension, which the innermost loop walks
      static const int inner = Rank-1;

      ExpressionSize<Rank> coord(0);       // Multi-dimensional position
      ExpressionSize<internal::expr_cast<B>::n_arrays> mask_loc(0);
      Index lhs_index = 0;                 // Position within data_
      int outer_rank;                      // Set by advance_index

      do {
        coord[inner] = 0;
        bool_expr.set_location(coord, mask_loc);
        while (coord[inner] < dimensions_[inner]) {
          if (bool_expr.next_value(mask_loc)) {
            data_[lhs_index] = rhs;
            ADEPT_ACTIVE_STACK->push_lhs(gradient_index()+lhs_index);
          }
          ++coord[inner];
          lhs_index += offset_[inner];
        }
        advance_index(lhs_index, outer_rank, coord);
      } while (outer_rank >= 0);
    }

    // Inactive array: copy elements of the expression "rhs" into this
    // array wherever "bool_expr" is true. The location within "rhs" is
    // only advanced by reading a value from it, so after one or more
    // elements have been skipped it is re-established with
    // set_location before the next read.
    template<bool LocalIsActive, class B, class C>
    typename internal::enable_if<!LocalIsActive,void>::type
    assign_conditional_(const B& bool_expr, const C& rhs) {
      // Index of the final dimension, which the innermost loop walks
      static const int inner = Rank-1;

      ExpressionSize<Rank> coord(0);       // Multi-dimensional position
      ExpressionSize<internal::expr_cast<B>::n_arrays> mask_loc(0);
      ExpressionSize<internal::expr_cast<C>::n_arrays> rhs_loc(0);
      Index lhs_index = 0;                 // Position within data_
      int outer_rank;                      // Set by advance_index
      bool rhs_stale = false;  // True once an element has been skipped

      do {
        coord[inner] = 0;
        rhs.set_location(coord, rhs_loc);
        bool_expr.set_location(coord, mask_loc);
        while (coord[inner] < dimensions_[inner]) {
          if (bool_expr.next_value(mask_loc)) {
            if (rhs_stale) {
              // Bring the location in "rhs" back in step with coord
              rhs.set_location(coord, rhs_loc);
              rhs_stale = false;
            }
            data_[lhs_index] = rhs.next_value(rhs_loc);
          }
          else {
            rhs_stale = true;
          }
          ++coord[inner];
          lhs_index += offset_[inner];
        }
        advance_index(lhs_index, outer_rank, coord);
      } while (outer_rank >= 0);
    }


    // Active array: copy elements of the expression "rhs" into this
    // array wherever "bool_expr" is true. Each value is requested
    // through a call that is also handed the active stack, and the
    // gradient index of every element written is pushed as a
    // left-hand side. After skipped elements the location within
    // "rhs" is re-established with set_location before the next read.
    template<bool LocalIsActive, class B, class C>
    typename internal::enable_if<LocalIsActive,void>::type
    assign_conditional_(const B& bool_expr, const C& rhs) {
#ifdef ADEPT_RECORDING_PAUSABLE
      // With recording paused, use the <false> overload instead
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        assign_conditional_<false>(bool_expr, rhs);
        return;
      }
#endif
      // Index of the final dimension, which the innermost loop walks
      static const int inner = Rank-1;

      ExpressionSize<Rank> coord(0);       // Multi-dimensional position
      ExpressionSize<internal::expr_cast<B>::n_arrays> mask_loc(0);
      ExpressionSize<internal::expr_cast<C>::n_arrays> rhs_loc(0);
      Index lhs_index = 0;                 // Position within data_
      int outer_rank;                      // Set by advance_index
      bool rhs_stale = false;  // True once an element has been skipped

      // Ask the stack for room before any element is processed
      ADEPT_ACTIVE_STACK->check_space(internal::expr_cast<C>::n_active * size());

      do {
        coord[inner] = 0;
        rhs.set_location(coord, rhs_loc);
        bool_expr.set_location(coord, mask_loc);
        while (coord[inner] < dimensions_[inner]) {
          if (bool_expr.next_value(mask_loc)) {
            if (rhs_stale) {
              // Bring the location in "rhs" back in step with coord
              rhs.set_location(coord, rhs_loc);
              rhs_stale = false;
            }
            data_[lhs_index]
              = rhs.next_value_and_gradient(*ADEPT_ACTIVE_STACK, rhs_loc);
            // Open question kept from the original: what if RHS not active?
            ADEPT_ACTIVE_STACK->push_lhs(gradient_index()+lhs_index);
          }
          else {
            rhs_stale = true;
          }
          ++coord[inner];
          lhs_index += offset_[inner];
        }
        advance_index(lhs_index, outer_rank, coord);
      } while (outer_rank >= 0);
    }


    // -------------------------------------------------------------------
    // Array: 8. Static variables
    // -------------------------------------------------------------------
  public:


    // Declaration only: the definition is not in this part of the
    // file. NOTE(review): presumably selects the style used when
    // arrays are printed -- confirm against the definition.
    void print_style(ArrayPrintStyle ps);


    // -------------------------------------------------------------------
    // Array: 9. Data
    // -------------------------------------------------------------------
  protected:
    // The assignment loops in this class address elements as
    // data_[index], stepping "index" by offset_[d] for each increment
    // along dimension d and bounding the loop by dimensions_[d].
    Type* __restrict data_;           // Pointer to values
    Storage<Type>* storage_;          // Pointer to Storage object
    ExpressionSize<Rank> dimensions_; // Size of each dimension
    ExpressionSize<Rank> offset_;     // Memory offset for each dimension

  }; // End of Array class


  // -------------------------------------------------------------------
  // Helper functions
  // -------------------------------------------------------------------

  // Choose the default storage ordering of arrays. Passing "true"
  // (the default argument) requests C-style row-major ordering, while
  // "false" requests Fortran-style column-major ordering. The choice
  // is stored in internal::array_row_major_order.
  inline void
  set_array_row_major_order(bool o = true)
  {
    internal::array_row_major_order = o;
  }

  // Set the print style (declaration only; the definition is not in
  // this header section)
  void set_array_print_style(ArrayPrintStyle ps);

  // Report the print style currently held in
  // internal::array_print_style
  inline
  ArrayPrintStyle
  get_array_print_style()
  {
    return internal::array_print_style;
  }

  // Switch the printing of curly brackets on or off for arrays sent
  // to a stream with the << operator: "true" (the default argument)
  // selects PRINT_STYLE_CURLY and "false" selects PRINT_STYLE_PLAIN
  inline void
  set_array_print_curly_brackets(bool o = true)
  {
    set_array_print_style(o ? PRINT_STYLE_CURLY : PRINT_STYLE_PLAIN);
  }

  // Stream insertion for arrays: forwards to the array's own print
  // member function and returns whatever stream that function returns
  template <int Rank, typename Type, bool IsActive>
  inline std::ostream&
  operator<<(std::ostream& os, const Array<Rank,Type,IsActive>& A)
  {
    return A.print(os);
  }


  // value() for an inactive array: the argument is already inactive,
  // so the very same array is handed back by reference. Together with
  // the overload for active arrays this lets value() be called
  // whether the argument is active or inactive.
  template <int Rank, typename Type>
  inline Array<Rank, Type, false>&
  value(Array<Rank, Type, false>& expr)
  {
    return expr;
  }
  // value() for an active array: returns, by value, the inactive
  // array produced by the argument's inactive_link() member function
  template <int Rank, typename Type>
  inline Array<Rank, Type, false>
  value(Array<Rank, Type, true>& expr)
  {
    return expr.inactive_link();
  }

  // Stream insertion for array expressions of rank greater than
  // zero: the expression is first assigned to a temporary inactive
  // array of the same rank, which is then printed
  template <typename Type, class E>
  inline
  typename internal::enable_if<(E::rank > 0), std::ostream&>::type
  operator<<(std::ostream& os, const Expression<Type,E>& expr)
  {
    Array<E::rank,Type,false> evaluated;
    evaluated.assign_inactive(expr);
    return evaluated.print(os);
  }

  // -------------------------------------------------------------------
  // Transpose function
  // -------------------------------------------------------------------

  // Transpose of a 2D array. The result is first constructed from
  // the input array (as a link to it, according to the original
  // author) and then transposed in place before being returned.
  template<typename Type, bool IsActive>
  inline Array<2,Type,IsActive>
  transpose(Array<2,Type,IsActive>& in)
  {
    Array<2,Type,IsActive> linked(in);
    return linked.in_place_transpose();
  }

  // Transpose of a 1D array, which is treated as a length-N column
  // vector so that the result is a 1xN 2D array. The result is built
  // on the data pointer and storage object of the input.
  template<typename Type, bool IsActive>
  inline Array<2,Type,IsActive>
  transpose(Array<1,Type,IsActive>& in)
  {
    // Dimensions 1 x N
    const ExpressionSize<2> dims(1, in.dimension(0));
    // Offsets: N times the input offset for the first dimension and
    // the input offset itself for the second
    const ExpressionSize<2> offsets(in.dimension(0)*in.offset(0),
                                    in.offset(0));
    return Array<2,Type,IsActive>(in.data(), in.storage(), dims, offsets);
  }

  // Transpose of a rank-2 expression: the expression is evaluated
  // into a new array, which is transposed in place and returned
  template<typename Type, class E>
  inline
  typename internal::enable_if<E::rank == 2, Array<2,Type,E::is_active> >::type
  transpose(const Expression<Type,E>& in)
  {
    Array<2,Type,E::is_active> evaluated(in);
    return evaluated.in_place_transpose();
  }

  // Transpose of a rank-1 expression: the expression is evaluated
  // into a 1D array of length N, and a 1xN 2D array built on the data
  // pointer and storage object of that array is returned
  template<typename Type, class E>
  inline
  typename internal::enable_if<E::rank == 1, Array<2,Type,E::is_active> >::type
  transpose(const Expression<Type,E>& in)
  {
    Array<1,Type,E::is_active> evaluated(in);
    // Dimensions 1 x N
    const ExpressionSize<2> dims(1, evaluated.dimension(0));
    // Offsets: N times the 1D offset for the first dimension and the
    // 1D offset itself for the second
    const ExpressionSize<2> offsets(evaluated.dimension(0)*evaluated.offset(0),
                                    evaluated.offset(0));
    return Array<2,Type,E::is_active>(evaluated.data(), evaluated.storage(),
                                      dims, offsets);
  }

  // Copy the gradients of the active array "a" into the inactive
  // array "data"; intended to be called after Stack::forward or
  // Stack::reverse. The gradients are obtained from the get_gradient
  // member function of "a" and assigned to "data".
  template<int Rank, typename Type, typename dType>
  inline void
  get_gradients(const Array<Rank,Type,true>& a, Array<Rank,dType,false>& data)
  {
    data = a.get_gradient();
  }

} // End namespace adept

#endif


================================================
FILE: include/adept/ArrayWrapper.h
================================================
/* ArrayWrapper.h -- Make Arrays work faster in expressions

    Copyright (C) 2016-2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#ifndef AdeptArrayWrapper_H
#define AdeptArrayWrapper_H 1

//#include <adept/Array.h>

namespace adept {

  // Forward declaration of Array class
  template <int Rank, typename Type, bool IsActive> class Array;
  
  namespace internal {

    // Expression type that stands in for an Array inside other
    // expressions. It keeps a const reference to the wrapped Array
    // together with a copy of the pointer returned by the Array's
    // const_data(); element and packet reads go straight through that
    // stored pointer, while every other member function forwards to
    // the wrapped Array. Because only a reference is held, the
    // wrapped Array must outlive the wrapper.
    template<int Rank, typename Type, bool IsActive>
    struct ArrayWrapper : public Expression<Type,ArrayWrapper<Rank,Type,IsActive> > {

      typedef Array<Rank,Type,IsActive> MyArray;

      // Static definitions to enable the properties of this type of
      // expression to be discerned at compile time
      static const bool is_active  = IsActive;
      static const bool is_lvalue  = true;
      static const int  rank       = Rank;
      static const int  n_active   = IsActive * (1 + is_complex<Type>::value);
      static const int  n_scratch  = 0;
      static const int  n_arrays   = 1;
      static const bool is_vectorizable = MyArray::is_vectorizable;
      
      // Wrap the array "a": store its const data pointer and a
      // reference to it
      ArrayWrapper(const MyArray& a) : data(a.const_data()), array(a) { }
      
      // Forwarded to the wrapped array
      bool get_dimensions_(ExpressionSize<Rank>& dim) const {
	return array.get_dimensions_(dim);
      }
      
      // The wrapped array's expression string prefixed by "wrapped"
      std::string expression_string_() const {
	return std::string("wrapped") + array.expression_string_();
      }
      
      // Forwarded to the wrapped array (note that this calls
      // is_aliased, without a trailing underscore)
      bool is_aliased_(const Type* mem1, const Type* mem2) const {
	return array.is_aliased(mem1, mem2);
      }
      
      // Forwarded to the wrapped array
      bool all_arrays_contiguous_() const { 
	return array.all_arrays_contiguous_();
      }
      
      // Forwarded to the wrapped array
      bool is_aligned_() const {
	return array.is_aligned_();
      }
      
      // Forwarded to the wrapped array
      template <int n>
      int alignment_offset_() const {
	return array.template alignment_offset_<n>();
      }
      
      // Forwarded to the wrapped array
      Type value_with_len_(const Index& j, const Index& len) const {
	return array.value_with_len_(j,len);
      }
      
      // Optimize by storing the offset of the fastest-varying dimension?
      template <int MyArrayNum, int NArrays>
      void advance_location_(ExpressionSize<NArrays>& loc) const {
	array.template advance_location_<MyArrayNum>(loc);
      }
      
      // Read the element at the memory index held in slot MyArrayNum
      // of "loc", via the stored data pointer
      template <int MyArrayNum, int NArrays>
      Type value_at_location_(const ExpressionSize<NArrays>& loc) const {
	return data[loc[MyArrayNum]];
      }
      
      // Construct a packet from the address of the element at the
      // memory index held in slot MyArrayNum of "loc"
      template <int MyArrayNum, int NArrays>
      Packet<Type> packet_at_location_(const ExpressionSize<NArrays>& loc) const {
	return Packet<Type>(data+loc[MyArrayNum]);
      }
      
      // Same read as value_at_location_; "scratch" is not touched
      // (n_scratch is zero for this expression type)
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_at_location_store_(const ExpressionSize<NArrays>& loc,
				       ScratchVector<NScratch>& scratch) const {
	return data[loc[MyArrayNum]];
      }
      
      // Same read as value_at_location_; "scratch" is not consulted
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_stored_(const ExpressionSize<NArrays>& loc,
		       const ScratchVector<NScratch>& scratch) const {
	return data[loc[MyArrayNum]];
      }
      
      // Forwarded to the wrapped array
      template <int MyArrayNum, int NArrays>
      void set_location_(const ExpressionSize<Rank>& i, 
			 ExpressionSize<NArrays>& index) const {
	array.template set_location_<MyArrayNum>(i, index);
      }

      // Forwarded to the wrapped array
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      void calc_gradient_(Stack& stack, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch) const {
	array.template calc_gradient_<MyArrayNum,MyScratchNum>(stack, loc, scratch);
      }

      // As the previous but passing "multiplier" on to the wrapped
      // array as well
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, typename MyType>
      void calc_gradient_(Stack& stack, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch,
			  MyType multiplier) const {
	array.template calc_gradient_<MyArrayNum,MyScratchNum>(stack, loc, scratch, multiplier);
      }
         
      
    protected:
      //      typedef Type __attribute__((aligned(32))) aligned_type;
      // Copy of the wrapped array's const data pointer, taken at
      // construction
      Type const * const __restrict data;
      //aligned_type const * const __restrict data;
      // The wrapped array itself, to which most calls are forwarded
      const MyArray& __restrict array;
    };
    
    // Unary and binary operations normally contain constant
    // references to their arguments, but if that reference is an
    // Array then the compiler represents this reference as a pointer
    // that must be dereferenced every time a value is extracted from
    // the Array. To speed this up, nested_expression<ExprType>::type
    // is used to obtain the type with which an argument of type
    // ExprType is held. In this primary template that is a constant
    // reference to ExprType; for Arrays the specialization below
    // yields an ArrayWrapper object instead, which is faster. Note
    // that the Array specialization is templated on IsActive, so it
    // is selected for active and inactive Arrays alike.
    template <class T>
    struct nested_expression {
      typedef const T& __restrict type;
    };

    // Specialization for Arrays of any rank, element type and
    // activeness: the argument is held as a const ArrayWrapper object
    // (by value) rather than as a reference to the Array
    template <int Rank, typename Type, bool IsActive>
    struct nested_expression<Array<Rank,Type,IsActive> > {
      typedef const ArrayWrapper<Rank,Type,IsActive> type;
    };

    // Forward declarations of the operation types, needed by the
    // specializations that follow
    template <class Type, template<class> class Op, class R>
    struct UnaryOperation;
    template <class Type, class L, class Op, class R>
    struct BinaryOperation;

    // Specializations for nested unary and binary operations: here
    // "type" is the operation type itself rather than a reference to
    // it, so an expression holding such an argument holds it by value
    // Should we check that rank is > 1?
    template <class Type, template<class> class Op, class R>
    struct nested_expression<UnaryOperation<Type,Op,R> > {
      typedef UnaryOperation<Type,Op,R> type;
    };
    template <class Type, class L, class Op, class R>
    struct nested_expression<BinaryOperation<Type,L,Op,R> > {
      typedef BinaryOperation<Type,L,Op,R> type;
    };
    
  }
}


#endif


================================================
FILE: include/adept/BinaryOperation.h
================================================
/* BinaryOperation.h -- Binary operations on Adept expressions

    Copyright (C) 2014-2018 European Centre for Medium-Range Weather Forecasts

    Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/

#ifndef AdeptBinaryOperation_H
#define AdeptBinaryOperation_H

#include <adept/Expression.h>

#include <adept/ArrayWrapper.h>

namespace adept {
  namespace internal {

    // ---------------------------------------------------------------------
    // SECTION 4.1: Binary operations: define BinaryOperation type
    // ---------------------------------------------------------------------

    // Binary operations derive from this class, where Op is a policy
    // class defining how to implement the operation and L and R are
    // the arguments to the operation
    template <class Type, class L, class Op, class R>
    struct BinaryOperation
      : public Expression<Type, BinaryOperation<Type, L, Op, R> >,
	protected Op {

      // Static data
      static const int  rank  = (L::rank > R::rank ? L::rank : R::rank);
      static const bool is_active = (L::is_active || R::is_active) 
	&& !is_same<Type, bool>::value;
      static const int  store_result = is_active * Op::store_result;
      static const int  n_active = expr_cast<L>::n_active + expr_cast<R>::n_active;
      // Assume the only local scratch variable is the result of the
      // binary expression
      static const int  n_local_scratch = store_result; 
      //	+ Op::n_scratch<L::is_active,R::is_active>::value
      static const int  n_scratch 
        = n_local_scratch + L::n_scratch + R::n_scratch;
      static const int  n_arrays  = L::n_arrays + R::n_arrays;
      static const bool is_vectorizable
	= L::is_vectorizable && R::is_vectorizable && Op::is_vectorized
	&& is_same<typename L::type,typename R::type>::value;

      using Op::is_operator;
      using Op::operation;
      using Op::operation_string;
      
      // DATA
      //const L& left;
      //const R& right;
      const typename nested_expression<L>::type left;
      const typename nested_expression<R>::type right;

      BinaryOperation(const Expression<typename L::type, L>& left_,
		      const Expression<typename R::type, R>& right_)
	: left(left_.cast()), right(right_.cast()) { 
      }
      
      template <int Rank>
      bool get_dimensions_(ExpressionSize<Rank>& dim) const {
	return my_get_dimensions<L::rank != 0, R::rank != 0>(dim);
      }

    protected:

      template <bool LIsArray, bool RIsArray, int Rank>
      typename enable_if<LIsArray && RIsArray, bool>::type
      my_get_dimensions(ExpressionSize<Rank>& dim) const {
	ExpressionSize<Rank> right_dim;
	return left.get_dimensions(dim)
	  && right.get_dimensions(right_dim)
	  && compatible(dim, right_dim);
      }

      template <bool LIsArray, bool RIsArray, int Rank>
      typename enable_if<LIsArray && !RIsArray, bool>::type
      my_get_dimensions(ExpressionSize<Rank>& dim) const {
	return left.get_dimensions(dim);
      }

      template <bool LIsArray, bool RIsArray, int Rank>
      typename enable_if<!LIsArray && RIsArray, bool>::type
      my_get_dimensions(ExpressionSize<Rank>& dim) const {
	return right.get_dimensions(dim);
      }

      template <bool LIsArray, bool RIsArray, int Rank>
      typename enable_if<!LIsArray && !RIsArray, bool>::type
      my_get_dimensions(ExpressionSize<Rank>& dim) const {
	return true;
      }

    public:

      std::string expression_string_() const {
	std::string str;
	if (is_operator) {
	  str = "(" + left.expression_string()
	    + operation_string()
	    + right.expression_string() + ")";
	}
	else {
	  str = operation_string();
	  str += "(" + left.expression_string()
	    + "," + right.expression_string() + ")";
	}
	return str;
      }

      bool is_aliased_(const Type* mem1, const Type* mem2) const {
	return left.is_aliased(mem1, mem2) || right.is_aliased(mem1, mem2);
      }
      bool all_arrays_contiguous_() const { 
	return left.all_arrays_contiguous_()
	  &&  right.all_arrays_contiguous_();
      }

      bool is_aligned_() const {
	return left.is_aligned_() && right.is_aligned_();
      }
      
      template <int n>
      int alignment_offset_() const {
	int l = left.template alignment_offset_<n>();
	int r = right.template alignment_offset_<n>();
	if (l == r) {
	  return l;
	}
	else if (l == n) {
	  return r;
	} else if (r == n) {
	  return l;
	}
	else {
	  return -1;
	}
      }

      Type value_with_len_(const Index& j, const Index& len) const {
	return operation(left.value_with_len(j,len), 
			right.value_with_len(j,len));
      }

      template <int MyArrayNum, int NArrays>
      void advance_location_(ExpressionSize<NArrays>& loc) const {
	left.template advance_location_<MyArrayNum>(loc);
	right.template advance_location_<MyArrayNum+L::n_arrays>(loc);
      }

      template <int MyArrayNum, int NArrays>
      Type value_at_location_(const ExpressionSize<NArrays>& loc) const {
	return operation(left.template value_at_location_<MyArrayNum>(loc),
			 right.template value_at_location_<MyArrayNum+L::n_arrays>(loc));
      }
      template <int MyArrayNum, int NArrays>
      Packet<Type> packet_at_location_(const ExpressionSize<NArrays>& loc) const {
	return operation(left.template packet_at_location_<MyArrayNum>(loc),
			 right.template packet_at_location_<MyArrayNum+L::n_arrays>(loc));
      }

      template <bool IsAligned,	int MyArrayNum, typename PacketType,
	int NArrays>
      PacketType values_at_location_(const ExpressionSize<NArrays>& loc) const {
	return operation(left.template  values_at_location_<IsAligned,MyArrayNum,PacketType>(loc),
			 right.template values_at_location_<IsAligned,MyArrayNum+L::n_arrays,PacketType>(loc));
      }

      template <bool UseStored, bool IsAligned,	int MyArrayNum, int MyScratchNum,
		typename PacketType, int NArrays, int NScratch>
      PacketType values_at_location_store_(const ExpressionSize<NArrays>& loc,
		   ScratchVector<NScratch,PacketType>& scratch) const {
	return my_values_at_location_store_<store_result,UseStored,IsAligned,
					    MyArrayNum,MyScratchNum>(loc, scratch);
      }

      // Adept-1.x did not store for addition and subtraction!
      // Moreover, we should ideally not ask inactive arguments to
      // store their result.
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_at_location_store_(const ExpressionSize<NArrays>& loc,
				    ScratchVector<NScratch>& scratch) const {
	return my_value_at_location_store_<store_result,MyArrayNum,MyScratchNum>(loc, scratch);
      }

      // Adept-1.x did not store for addition and subtraction!
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_stored_(const ExpressionSize<NArrays>& loc,
			 const ScratchVector<NScratch>& scratch) const {
	return my_value_stored_<store_result,MyArrayNum,MyScratchNum>(loc, scratch);
      }

    protected:
      template <int StoreResult, int MyArrayNum, int MyScratchNum, 
		int NArrays, int NScratch>
      typename enable_if<StoreResult==1, Type>::type
      my_value_at_location_store_(const ExpressionSize<NArrays>& loc,
				       ScratchVector<NScratch>& scratch) const {
	return scratch[MyScratchNum] 
	  = operation(left.template value_at_location_store_<MyArrayNum,MyScratchNum+n_local_scratch>(loc, scratch),
		      right.template value_at_location_store_<MyArrayNum+L::n_arrays,
						     MyScratchNum+L::n_scratch+n_local_scratch>(loc, scratch));
      }

      // In differentiating "a/b", it helps to store "1/b";
      // "operation_store" is only provided by Divide and Atan2
      template <int StoreResult, int MyArrayNum, int MyScratchNum, 
		int NArrays, int NScratch>
      typename enable_if<StoreResult==2, Type>::type
      my_value_at_location_store_(const ExpressionSize<NArrays>& loc,
				       ScratchVector<NScratch>& scratch) const {
	return scratch[MyScratchNum] 
	  = Op::operation_store(left.template value_at_location_store_<MyArrayNum,MyScratchNum+n_local_scratch>(loc, scratch),
			    right.template value_at_location_store_<MyArrayNum+L::n_arrays,
			    MyScratchNum+L::n_scratch+n_local_scratch>(loc, scratch),
			    scratch[MyScratchNum+1]);
      }

      // Adept-1.x did not store for addition and subtraction!
      template <int StoreResult, int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      typename enable_if<(StoreResult > 0), Type>::type
      my_value_stored_(const ExpressionSize<NArrays>& loc,
		       const ScratchVector<NScratch>& scratch) const {
	return scratch[MyScratchNum];
      }

      template <int StoreResult, int MyArrayNum, int MyScratchNum, 
		int NArrays, int NScratch>
      typename enable_if<StoreResult==0, Type>::type
      my_value_at_location_store_(const ExpressionSize<NArrays>& loc,
				       ScratchVector<NScratch>& scratch) const {
	return operation(left.template value_at_location_store_<MyArrayNum,MyScratchNum+n_local_scratch>(loc, scratch),
			 right.template value_at_location_store_<MyArrayNum+L::n_arrays,
			 MyScratchNum+L::n_scratch+n_local_scratch>(loc, scratch));
      }

      template <int StoreResult, int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      typename enable_if<StoreResult==0, Type>::type
      my_value_stored_(const ExpressionSize<NArrays>& loc,
		       const ScratchVector<NScratch>& scratch) const {
	return operation(left.template value_at_location_<MyArrayNum>(loc),
			 right.template value_at_location_<MyArrayNum+L::n_arrays>(loc));
      }
    
      template <int StoreResult, bool UseStored, bool IsAligned, int MyArrayNum, int MyScratchNum,
		typename PacketType, int NArrays, int NScratch>
      typename enable_if<StoreResult==1 && !UseStored, PacketType>::type
      my_values_at_location_store_(const ExpressionSize<NArrays>& loc,
				   ScratchVector<NScratch,PacketType>& scratch) const {
	return scratch[MyScratchNum]
	  = operation(left.template values_at_location_store_<UseStored,IsAligned,MyArrayNum,
		                                     MyScratchNum+n_local_scratch>(loc, scratch),
		      right.template values_at_location_store_<UseStored,IsAligned,MyArrayNum+L::n_arrays,
		                                     MyScratchNum+L::n_scratch+n_local_scratch>(loc, scratch));
      }

      template <int StoreResult, bool UseStored, bool IsAligned, int MyArrayNum, int MyScratchNum,
		typename PacketType, int NArrays, int NScratch>
      typename enable_if<StoreResult==2 && !UseStored, PacketType>::type
      my_values_at_location_store_(const ExpressionSize<NArrays>& loc,
				   ScratchVector<NScratch,PacketType>& scratch) const {
	return scratch[MyScratchNum]
	  = Op::operation_store(left.template values_at_location_store_<UseStored,IsAligned,MyArrayNum,
		                                     MyScratchNum+n_local_scratch>(loc, scratch),
				right.template values_at_location_store_<UseStored,IsAligned,MyArrayNum+L::n_arrays,
				                     MyScratchNum+L::n_scratch+n_local_scratch>(loc, scratch),
				scratch[MyScratchNum+1]);
      }

      template <int StoreResult, bool UseStored, bool IsAligned, int MyArrayNum, int MyScratchNum,
		typename PacketType, int NArrays, int NScratch>
      typename enable_if<(StoreResult>0) && UseStored, PacketType>::type
      my_values_at_location_store_(const ExpressionSize<NArrays>& loc,
				   ScratchVector<NScratch,PacketType>& scratch) const {
	return scratch[MyScratchNum];
      }

      template <int StoreResult, bool UseStored, bool IsAligned, int MyArrayNum, int MyScratchNum,
		typename PacketType, int NArrays, int NScratch>
      typename enable_if<StoreResult==0 && !UseStored, PacketType>::type
      my_values_at_location_store_(const ExpressionSize<NArrays>& loc,
				   ScratchVector<NScratch,PacketType>& scratch) const {
	return operation(left.template values_at_location_store_<UseStored,IsAligned,MyArrayNum,
		                                     MyScratchNum+n_local_scratch>(loc, scratch),
			 right.template values_at_location_store_<UseStored,IsAligned,MyArrayNum+L::n_arrays,
		                                     MyScratchNum+L::n_scratch+n_local_scratch>(loc, scratch));
      }

      template <int StoreResult, bool UseStored, bool IsAligned, int MyArrayNum, int MyScratchNum,
		typename PacketType, int NArrays, int NScratch>
      typename enable_if<StoreResult==0 && UseStored, PacketType>::type
      my_values_at_location_store_(const ExpressionSize<NArrays>& loc,
				   ScratchVector<NScratch,PacketType>& scratch) const {
	return operation(left.template values_at_location_<IsAligned,MyArrayNum,PacketType>(loc),
			 right.template values_at_location_<IsAligned,MyArrayNum+L::n_arrays,PacketType>(loc));
      }

    public:

      template <int MyArrayNum, int Rank, int NArrays>
      void set_location_(const ExpressionSize<Rank>& i, 
			 ExpressionSize<NArrays>& index) const {
	left.template set_location_<MyArrayNum>(i, index);
	right.template set_location_<MyArrayNum+L::n_arrays>(i, index);
      }


      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      void calc_gradient_(Stack& stack, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch) const {
        calc_left_ <MyArrayNum, MyScratchNum>(stack, left,  loc, scratch);
        calc_right_<MyArrayNum, MyScratchNum>(stack, right, loc, scratch);
      }
      // As the previous but multiplying the gradient by "multiplier"
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, typename MyType>
      void calc_gradient_(Stack& stack, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch,
			  MyType multiplier) const {
        calc_left_ <MyArrayNum, MyScratchNum>(stack, left,  loc, scratch, multiplier);
        calc_right_<MyArrayNum, MyScratchNum>(stack, right, loc, scratch, multiplier);
      }
    
    protected:
      // Only calculate gradients for left and right arguments if they
      // are active; otherwise do nothing
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class LType>
      typename enable_if<LType::is_active,void>::type
      calc_left_(Stack& stack, const LType& left, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch) const {
	Op::template calc_left<MyArrayNum, MyScratchNum>(stack, left, right, loc, scratch);
      }

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class LType>
      typename enable_if<!LType::is_active,void>::type
      calc_left_(Stack& stack, const LType& left, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch) const { }

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class RType>
      typename enable_if<RType::is_active,void>::type
      calc_right_(Stack& stack, const RType& right, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch) const {
	Op::template calc_right<MyArrayNum, MyScratchNum>(stack, left, right, loc, scratch);
      }

      // Right argument is inactive, no multiplier: there is no
      // gradient to compute, so this overload is deliberately empty
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class RType>
      typename enable_if<!RType::is_active, void>::type
      calc_right_(Stack&, const RType&,
                  const ExpressionSize<NArrays>&,
                  const ScratchVector<NScratch>&) const {
      }

      // Left argument is active, with multiplier: as the
      // multiplier-free overload, but "multiplier" is forwarded to the
      // operation policy's calc_left as well
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class LType, typename MyType>
      typename enable_if<LType::is_active, void>::type
      calc_left_(Stack& stack, const LType& left,
                 const ExpressionSize<NArrays>& loc,
                 const ScratchVector<NScratch>& scratch,
                 MyType multiplier) const {
        Op::template calc_left<MyArrayNum, MyScratchNum>(stack, left, right,
                                                         loc, scratch,
                                                         multiplier);
      }

      // Left argument is inactive, with multiplier: nothing to do
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class LType, typename MyType>
      typename enable_if<!LType::is_active, void>::type
      calc_left_(Stack&, const LType&,
                 const ExpressionSize<NArrays>&,
                 const ScratchVector<NScratch>&,
                 MyType) const {
      }


      // Right argument is active, with multiplier: as the
      // multiplier-free overload, but "multiplier" is forwarded to the
      // operation policy's calc_right as well
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class RType, typename MyType>
      typename enable_if<RType::is_active, void>::type
      calc_right_(Stack& stack, const RType& right,
                  const ExpressionSize<NArrays>& loc,
                  const ScratchVector<NScratch>& scratch,
                  MyType multiplier) const {
        Op::template calc_right<MyArrayNum, MyScratchNum>(stack, left, right,
                                                          loc, scratch,
                                                          multiplier);
      }

      // Right argument is inactive, with multiplier: nothing to do
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class RType, typename MyType>
      typename enable_if<!RType::is_active, void>::type
      calc_right_(Stack&, const RType&,
                  const ExpressionSize<NArrays>&,
                  const ScratchVector<NScratch>&,
                  MyType) const {
      }
    };
  

    // ---------------------------------------------------------------------
    // SECTION 4.2: policy classes for BinaryOperation: with scalars
    // ---------------------------------------------------------------------

    // Expression node for "scalar OP expression": the left-hand
    // argument is a non-Expression value of type L and the right-hand
    // argument is an Expression of type R.  The operation and its
    // derivatives are supplied by the policy class Op.
    template <class Type, typename L, class Op, class R>
    struct BinaryOpScalarLeft
      : public Expression<Type, BinaryOpScalarLeft<Type, L, Op, R> >,
        protected Op {

      // Static data: all array-related properties are taken from R,
      // the only argument that is an expression
      static const int  rank = R::rank;
      // A node whose result type is bool is never treated as active
      static const bool is_active
        = R::is_active && !is_same<Type, bool>::value;
      // Number of scratch entries written by this node; zero when the
      // node is inactive
      static const int  store_result = is_active * Op::store_result;
      static const int  n_active = expr_cast<R>::n_active;
      // The local scratch entries are assumed to be only those that
      // hold the result of this binary expression
      static const int  n_local_scratch = store_result;
      static const int  n_scratch = n_local_scratch + R::n_scratch;
      static const int  n_arrays = R::n_arrays;
      // Packet evaluation also requires the scalar to have the same
      // type as the elements of the right-hand expression
      static const bool is_vectorizable
        = R::is_vectorizable && Op::is_vectorized
        && is_same<L, typename R::type>::value;

      using Op::is_operator;
      using Op::operation;
      using Op::operation_string;

      // DATA: the scalar is held by value inside a Packet<L>, the
      // expression by const reference
      Packet<L> left;
      const R&  right;

      BinaryOpScalarLeft(L left_, const Expression<typename R::type, R>& right_)
        : left(left_), right(right_.cast()) {
      }

      // The dimensions of the node are those of the right-hand argument
      template <int Rank>
      bool get_dimensions_(ExpressionSize<Rank>& dim) const {
        return right.get_dimensions(dim);
      }

      // Text representation: "(left OP right)" if Op is an operator,
      // "OP(left,right)" if it is a function
      std::string expression_string_() const {
        std::stringstream out;
        if (is_operator) {
          out << "(" << left.value() << operation_string()
              << right.expression_string() << ")";
        }
        else {
          out << operation_string() << "(" << left.value() << ","
              << right.expression_string() << ")";
        }
        return out.str();
      }

      // Aliasing, contiguity and alignment queries are all answered
      // by the right-hand argument
      bool is_aliased_(const Type* mem1, const Type* mem2) const {
        return right.is_aliased(mem1, mem2);
      }

      bool all_arrays_contiguous_() const {
        return right.all_arrays_contiguous_();
      }

      bool is_aligned_() const {
        return right.is_aligned_();
      }

      template <int n>
      int alignment_offset_() const {
        return right.template alignment_offset_<n>();
      }

      Type value_with_len_(const Index& j, const Index& len) const {
        return operation(left.value(), right.value_with_len(j, len));
      }

      template <int MyArrayNum, int NArrays>
      void advance_location_(ExpressionSize<NArrays>& loc) const {
        right.template advance_location_<MyArrayNum>(loc);
      }

      // Scalar evaluation at a location
      template <int MyArrayNum, int NArrays>
      Type value_at_location_(const ExpressionSize<NArrays>& loc) const {
        return operation(left.value(),
                         right.template value_at_location_<MyArrayNum>(loc));
      }

      // Packet evaluation at a location; here the Packet "left" itself
      // is passed to the operation rather than left.value()
      template <int MyArrayNum, int NArrays>
      Packet<Type>
      packet_at_location_(const ExpressionSize<NArrays>& loc) const {
        return operation(left,
                         right.template packet_at_location_<MyArrayNum>(loc));
      }

      // Evaluate, recording in "scratch" whatever store_result
      // dictates; the work is done by the overload of
      // my_value_at_location_store_ selected by store_result
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_at_location_store_(const ExpressionSize<NArrays>& loc,
                                    ScratchVector<NScratch>& scratch) const {
        return my_value_at_location_store_<store_result,
                                           MyArrayNum,
                                           MyScratchNum>(loc, scratch);
      }

      // Return the value of this node after a previous call to
      // value_at_location_store_; dispatches on store_result
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_stored_(const ExpressionSize<NArrays>& loc,
                         const ScratchVector<NScratch>& scratch) const {
        return my_value_stored_<store_result,
                                MyArrayNum,
                                MyScratchNum>(loc, scratch);
      }

    protected:
      // StoreResult == 1: save the result in scratch[MyScratchNum];
      // the right-hand argument uses the scratch entries that follow
      // this node's local ones
      template <int StoreResult, int MyArrayNum, int MyScratchNum,
                int NArrays, int NScratch>
      typename enable_if<StoreResult == 1, Type>::type
      my_value_at_location_store_(const ExpressionSize<NArrays>& loc,
                                  ScratchVector<NScratch>& scratch) const {
        return scratch[MyScratchNum]
          = operation(left.value(),
                      right.template value_at_location_store_<
                        MyArrayNum, MyScratchNum+n_local_scratch>(loc, scratch));
      }

      // StoreResult == 2: as above, but Op::operation_store is also
      // given scratch[MyScratchNum+1] in which to save one extra value
      template <int StoreResult, int MyArrayNum, int MyScratchNum,
                int NArrays, int NScratch>
      typename enable_if<StoreResult == 2, Type>::type
      my_value_at_location_store_(const ExpressionSize<NArrays>& loc,
                                  ScratchVector<NScratch>& scratch) const {
        return scratch[MyScratchNum]
          = Op::operation_store(left.value(),
                                right.template value_at_location_store_<
                                  MyArrayNum, MyScratchNum+n_local_scratch>(loc, scratch),
                                scratch[MyScratchNum+1]);
      }

      // StoreResult > 0: the result is already in scratch
      template <int StoreResult, int MyArrayNum, int MyScratchNum,
                int NArrays, int NScratch>
      typename enable_if<(StoreResult > 0), Type>::type
      my_value_stored_(const ExpressionSize<NArrays>&,
                       const ScratchVector<NScratch>& scratch) const {
        return scratch[MyScratchNum];
      }

      // StoreResult == 0: nothing is saved for this node, but the
      // right-hand argument is still given the chance to store
      template <int StoreResult, int MyArrayNum, int MyScratchNum,
                int NArrays, int NScratch>
      typename enable_if<StoreResult == 0, Type>::type
      my_value_at_location_store_(const ExpressionSize<NArrays>& loc,
                                  ScratchVector<NScratch>& scratch) const {
        return operation(left.value(),
                         right.template value_at_location_store_<
                           MyArrayNum, MyScratchNum+n_local_scratch>(loc, scratch));
      }

      // StoreResult == 0: nothing was saved, so recompute the result
      // from the value of the right-hand argument at "loc"
      template <int StoreResult, int MyArrayNum, int MyScratchNum,
                int NArrays, int NScratch>
      typename enable_if<StoreResult == 0, Type>::type
      my_value_stored_(const ExpressionSize<NArrays>& loc,
                       const ScratchVector<NScratch>&) const {
        return operation(left.value(),
                         right.template value_at_location_<MyArrayNum>(loc));
      }

    public:

      template <int MyArrayNum, int Rank, int NArrays>
      void set_location_(const ExpressionSize<Rank>& i,
                         ExpressionSize<NArrays>& index) const {
        right.template set_location_<MyArrayNum>(i, index);
      }

      // Gradient calculation: only the right-hand argument can
      // contribute, since the left-hand one is a plain scalar
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      void calc_gradient_(Stack& stack,
                          const ExpressionSize<NArrays>& loc,
                          const ScratchVector<NScratch>& scratch) const {
        calc_right_<MyArrayNum, MyScratchNum>(stack, right, loc, scratch);
      }

      // As the previous but forwarding "multiplier" as well
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                typename MyType>
      void calc_gradient_(Stack& stack,
                          const ExpressionSize<NArrays>& loc,
                          const ScratchVector<NScratch>& scratch,
                          MyType multiplier) const {
        calc_right_<MyArrayNum, MyScratchNum>(stack, right, loc, scratch,
                                              multiplier);
      }

    protected:
      // Only calculate the gradient of the right-hand argument if it
      // is active; otherwise do nothing.  The scalar is wrapped in a
      // temporary Scalar<L> before being handed to the policy class.
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class RType>
      typename enable_if<RType::is_active, void>::type
      calc_right_(Stack& stack, const RType& right,
                  const ExpressionSize<NArrays>& loc,
                  const ScratchVector<NScratch>& scratch) const {
        Op::template calc_right<MyArrayNum, MyScratchNum>(
            stack, Scalar<L>(left.value()), right, loc, scratch);
      }

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class RType>
      typename enable_if<!RType::is_active, void>::type
      calc_right_(Stack&, const RType&,
                  const ExpressionSize<NArrays>&,
                  const ScratchVector<NScratch>&) const {
      }

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class RType, typename MyType>
      typename enable_if<RType::is_active, void>::type
      calc_right_(Stack& stack, const RType& right,
                  const ExpressionSize<NArrays>& loc,
                  const ScratchVector<NScratch>& scratch,
                  MyType multiplier) const {
        Op::template calc_right<MyArrayNum, MyScratchNum>(
            stack, Scalar<L>(left.value()), right, loc, scratch, multiplier);
      }

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class RType, typename MyType>
      typename enable_if<!RType::is_active, void>::type
      calc_right_(Stack&, const RType&,
                  const ExpressionSize<NArrays>&,
                  const ScratchVector<NScratch>&,
                  MyType) const {
      }
    };


    // Expression node for "expression OP scalar": the left-hand
    // argument is an Expression of type L and the right-hand argument
    // is a non-Expression value of type R.  The operation and its
    // derivatives are supplied by the policy class Op.
    template <class Type, typename L, class Op, class R>
    struct BinaryOpScalarRight
      : public Expression<Type, BinaryOpScalarRight<Type, L, Op, R> >,
        protected Op {

      // Static data: all array-related properties are taken from L,
      // the only argument that is an expression
      static const int  rank = L::rank;
      // A node whose result type is bool is never treated as active
      static const bool is_active
        = L::is_active && !is_same<Type, bool>::value;
      // Number of scratch entries written by this node; zero when the
      // node is inactive
      static const int  store_result = is_active * Op::store_result;
      static const int  n_active = expr_cast<L>::n_active;
      // The local scratch entries are assumed to be only those that
      // hold the result of this binary expression
      static const int  n_local_scratch = store_result;
      static const int  n_scratch = n_local_scratch + L::n_scratch;
      static const int  n_arrays = L::n_arrays;
      // Packet evaluation also requires the scalar to have the same
      // type as the elements of the left-hand expression
      static const bool is_vectorizable
        = L::is_vectorizable && Op::is_vectorized
        && is_same<typename L::type, R>::value;

      using Op::is_operator;
      using Op::operation;
      using Op::operation_string;

      // DATA: the expression is held by const reference, the scalar
      // by value inside a Packet<R>
      const L&  left;
      Packet<R> right;

      BinaryOpScalarRight(const Expression<typename L::type, L>& left_, R right_)
        : left(left_.cast()), right(right_) {
        // Operations with store_result==2 (divide and atan2) save one
        // extra piece of information during differentiation.  That
        // should not be needed when the right-hand side is a scalar,
        // so it has not been implemented here: reject it at compile
        // time.
        ADEPT_STATIC_ASSERT((!is_active || store_result<2), ERROR_IN_BINARY_OP_SCALAR_RIGHT);
      }

      // The dimensions of the node are those of the left-hand argument
      template <int Rank>
      bool get_dimensions_(ExpressionSize<Rank>& dim) const {
        return left.get_dimensions(dim);
      }

      // Text representation: "(left OP right)" if Op is an operator,
      // "OP(left,right)" if it is a function
      std::string expression_string_() const {
        std::stringstream out;
        if (is_operator) {
          out << "(" << left.expression_string() << operation_string()
              << right.value() << ")";
        }
        else {
          out << operation_string() << "("
              << left.expression_string() << ","
              << right.value() << ")";
        }
        return out.str();
      }

      // Aliasing, contiguity and alignment queries are all answered
      // by the left-hand argument
      bool is_aliased_(const Type* mem1, const Type* mem2) const {
        return left.is_aliased(mem1, mem2);
      }

      bool all_arrays_contiguous_() const {
        return left.all_arrays_contiguous_();
      }

      bool is_aligned_() const {
        return left.is_aligned_();
      }

      template <int n>
      int alignment_offset_() const {
        return left.template alignment_offset_<n>();
      }

      Type value_with_len_(const Index& j, const Index& len) const {
        return operation(left.value_with_len(j, len), right.value());
      }

      template <int MyArrayNum, int NArrays>
      void advance_location_(ExpressionSize<NArrays>& loc) const {
        left.template advance_location_<MyArrayNum>(loc);
      }

      // Scalar evaluation at a location
      template <int MyArrayNum, int NArrays>
      Type value_at_location_(const ExpressionSize<NArrays>& loc) const {
        return operation(left.template value_at_location_<MyArrayNum>(loc),
                         right.value());
      }

      // Packet evaluation at a location; here the Packet "right"
      // itself is passed to the operation rather than right.value()
      template <int MyArrayNum, int NArrays>
      Packet<Type>
      packet_at_location_(const ExpressionSize<NArrays>& loc) const {
        return operation(left.template packet_at_location_<MyArrayNum>(loc),
                         right);
      }

      // Evaluate, recording in "scratch" whatever store_result
      // dictates; the work is done by the overload of
      // my_value_at_location_store_ selected by store_result
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_at_location_store_(const ExpressionSize<NArrays>& loc,
                                    ScratchVector<NScratch>& scratch) const {
        return my_value_at_location_store_<store_result,
                                           MyArrayNum,
                                           MyScratchNum>(loc, scratch);
      }

      // Return the value of this node after a previous call to
      // value_at_location_store_; dispatches on store_result
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_stored_(const ExpressionSize<NArrays>& loc,
                         const ScratchVector<NScratch>& scratch) const {
        return my_value_stored_<store_result,
                                MyArrayNum,
                                MyScratchNum>(loc, scratch);
      }

    protected:
      // StoreResult > 0: save the result in scratch[MyScratchNum];
      // the left-hand argument uses the scratch entries that follow
      // this node's local ones
      template <int StoreResult, int MyArrayNum, int MyScratchNum,
                int NArrays, int NScratch>
      typename enable_if<(StoreResult > 0), Type>::type
      my_value_at_location_store_(const ExpressionSize<NArrays>& loc,
                                  ScratchVector<NScratch>& scratch) const {
        return scratch[MyScratchNum]
          = operation(left.template value_at_location_store_<
                        MyArrayNum, MyScratchNum+n_local_scratch>(loc, scratch),
                      right.value());
      }

      // StoreResult > 0: the result is already in scratch
      template <int StoreResult, int MyArrayNum, int MyScratchNum,
                int NArrays, int NScratch>
      typename enable_if<(StoreResult > 0), Type>::type
      my_value_stored_(const ExpressionSize<NArrays>&,
                       const ScratchVector<NScratch>& scratch) const {
        return scratch[MyScratchNum];
      }

      // StoreResult == 0: nothing is saved for this node, but the
      // left-hand argument is still given the chance to store
      template <int StoreResult, int MyArrayNum, int MyScratchNum,
                int NArrays, int NScratch>
      typename enable_if<StoreResult == 0, Type>::type
      my_value_at_location_store_(const ExpressionSize<NArrays>& loc,
                                  ScratchVector<NScratch>& scratch) const {
        return operation(left.template value_at_location_store_<
                           MyArrayNum, MyScratchNum+n_local_scratch>(loc, scratch),
                         right.value());
      }

      // StoreResult == 0: nothing was saved, so recompute the result
      // from the value of the left-hand argument at "loc"
      template <int StoreResult, int MyArrayNum, int MyScratchNum,
                int NArrays, int NScratch>
      typename enable_if<StoreResult == 0, Type>::type
      my_value_stored_(const ExpressionSize<NArrays>& loc,
                       const ScratchVector<NScratch>&) const {
        return operation(left.template value_at_location_<MyArrayNum>(loc),
                         right.value());
      }

    public:

      template <int MyArrayNum, int Rank, int NArrays>
      void set_location_(const ExpressionSize<Rank>& i,
                         ExpressionSize<NArrays>& index) const {
        left.template set_location_<MyArrayNum>(i, index);
      }

      // Gradient calculation: only the left-hand argument can
      // contribute, since the right-hand one is a plain scalar
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      void calc_gradient_(Stack& stack,
                          const ExpressionSize<NArrays>& loc,
                          const ScratchVector<NScratch>& scratch) const {
        calc_left_<MyArrayNum, MyScratchNum>(stack, left, loc, scratch);
      }

      // As the previous but forwarding "multiplier" as well
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                typename MyType>
      void calc_gradient_(Stack& stack,
                          const ExpressionSize<NArrays>& loc,
                          const ScratchVector<NScratch>& scratch,
                          MyType multiplier) const {
        calc_left_<MyArrayNum, MyScratchNum>(stack, left, loc, scratch,
                                             multiplier);
      }

    protected:
      // Only calculate the gradient of the left-hand argument if it
      // is active; otherwise do nothing.  The scalar is wrapped in a
      // temporary Scalar<R> before being handed to the policy class.
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class LType>
      typename enable_if<LType::is_active, void>::type
      calc_left_(Stack& stack, const LType& left,
                 const ExpressionSize<NArrays>& loc,
                 const ScratchVector<NScratch>& scratch) const {
        Op::template calc_left<MyArrayNum, MyScratchNum>(
            stack, left, Scalar<R>(right.value()), loc, scratch);
      }

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class LType>
      typename enable_if<!LType::is_active, void>::type
      calc_left_(Stack&, const LType&,
                 const ExpressionSize<NArrays>&,
                 const ScratchVector<NScratch>&) const {
      }

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class LType, typename MyType>
      typename enable_if<LType::is_active, void>::type
      calc_left_(Stack& stack, const LType& left,
                 const ExpressionSize<NArrays>& loc,
                 const ScratchVector<NScratch>& scratch,
                 MyType multiplier) const {
        Op::template calc_left<MyArrayNum, MyScratchNum>(
            stack, left, Scalar<R>(right.value()), loc, scratch, multiplier);
      }

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class LType, typename MyType>
      typename enable_if<!LType::is_active, void>::type
      calc_left_(Stack&, const LType&,
                 const ExpressionSize<NArrays>&,
                 const ScratchVector<NScratch>&,
                 MyType) const {
      }
    };
 
  } // End namespace internal


  namespace internal {

    // ---------------------------------------------------------------------
    // SECTION 4.3: policy classes for BinaryOperation: standard operators
    // ---------------------------------------------------------------------

    // Policy class implementing operator+.  Since d(a+b)/da and
    // d(a+b)/db are both 1, the gradient routines simply pass the
    // request (and any multiplier) on to the relevant argument.
    struct Add {
      // Printed as an infix operator by expression_string()
      static const bool is_operator   = true;
      // No scratch space is needed
      static const int  store_result  = 0;
      // Consulted by the expression classes when computing
      // is_vectorizable
      static const bool is_vectorized = true;

      // Symbol used by expression_string()
      const char* operation_string() const {
        return "+";
      }

      // The basic operation
      template <class LType, class RType>
      typename promote<LType, RType>::type
      operation(const LType& left, const RType& right) const {
        return left + right;
      }

      // Gradient of the left-hand argument.  Its scratch entries
      // start after this node's own store_result entries.
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class L, class R>
      void calc_left(Stack& stack, const L& left, const R&,
                     const ExpressionSize<NArrays>& loc,
                     const ScratchVector<NScratch>& scratch) const {
        left.template calc_gradient_<MyArrayNum,
                                     MyScratchNum+store_result>(stack, loc, scratch);
      }

      // Gradient of the right-hand argument.  Its array and scratch
      // indices are offset by those consumed by the left-hand
      // argument.
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class L, class R>
      void calc_right(Stack& stack, const L&, const R& right,
                      const ExpressionSize<NArrays>& loc,
                      const ScratchVector<NScratch>& scratch) const {
        right.template calc_gradient_<MyArrayNum+L::n_arrays,
                                      MyScratchNum+L::n_scratch+store_result>(stack, loc, scratch);
      }

      // Gradient of the left-hand argument, scaled by "multiplier"
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class L, class R, typename MyType>
      void calc_left(Stack& stack, const L& left, const R&,
                     const ExpressionSize<NArrays>& loc,
                     const ScratchVector<NScratch>& scratch,
                     MyType multiplier) const {
        left.template calc_gradient_<MyArrayNum,
                                     MyScratchNum+store_result>(stack, loc, scratch,
                                                                multiplier);
      }

      // Gradient of the right-hand argument, scaled by "multiplier"
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class L, class R, typename MyType>
      void calc_right(Stack& stack, const L&, const R& right,
                      const ExpressionSize<NArrays>& loc,
                      const ScratchVector<NScratch>& scratch,
                      MyType multiplier) const {
        right.template calc_gradient_<MyArrayNum+L::n_arrays,
                                      MyScratchNum+L::n_scratch+store_result>(stack, loc, scratch,
                                                                              multiplier);
      }
    };

    // Policy class implementing operator-.  Since d(a-b)/da = 1 and
    // d(a-b)/db = -1, the gradient routines pass the request on to
    // the relevant argument, negating the multiplier for the
    // right-hand one.
    struct Subtract {
      static const bool is_operator  = true;  // Operator or function for expression_string()
      // No scratch space is needed: none of the gradient routines
      // below reads the stored result of the subtraction, so (as for
      // Add) reserving a scratch entry would only cost a redundant
      // store per element.  All scratch offsets below are expressed in
      // terms of store_result, so they remain consistent.
      static const int  store_result = 0;
      static const bool is_vectorized = true;

      const char* operation_string() const { return "-"; } // For expression_string()
      
      // Implement the basic operation
      template <class LType, class RType>
      typename promote<LType, RType>::type
      operation(const LType& left, const RType& right) const { return left - right; }
      
      // Calculate the gradient of the left-hand argument: df/da = 1,
      // so no multiplier is needed
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R>
      void calc_left(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
			       const ScratchVector<NScratch>& scratch) const {
        left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(stack, loc, scratch);
      }

      // Calculate the gradient of the right-hand argument: df/db = -1.
      // The array and scratch indices of the right-hand argument are
      // offset by those consumed by the left-hand one.
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R>
      void calc_right(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
			       const ScratchVector<NScratch>& scratch) const {
        right.template calc_gradient_<MyArrayNum+L::n_arrays, MyScratchNum+L::n_scratch+store_result>(stack, loc, scratch, -1.0);
      }

      // Calculate the gradient of the left-hand argument with a multiplier
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R, typename MyType>
      void calc_left(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
			       const ScratchVector<NScratch>& scratch, MyType multiplier) const {
        left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(stack, loc, scratch, multiplier);
      }

      // Calculate the gradient of the right-hand argument with a
      // multiplier, which is negated because df/db = -1
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R, typename MyType>
      void calc_right(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
			       const ScratchVector<NScratch>& scratch, MyType multiplier) const {
        right.template calc_gradient_<MyArrayNum+L::n_arrays, MyScratchNum+L::n_scratch+store_result>(stack, loc, scratch, -multiplier);
      }
    };


    // Policy class implementing operator*.  For f(a,b) = a*b,
    // df/da = b and df/db = a, so each gradient routine uses the
    // stored value of the *other* argument as the multiplier.
    struct Multiply {
      // Printed as an infix operator by expression_string()
      static const bool is_operator   = true;
      // Scratch space used for the result (this can be 0 or 1)
      static const int  store_result  = 1;
      // Consulted by the expression classes when computing
      // is_vectorizable
      static const bool is_vectorized = true;

      // Symbol used by expression_string()
      const char* operation_string() const {
        return "*";
      }

      // The basic operation
      template <class LType, class RType>
      typename promote<LType, RType>::type
      operation(const LType& left, const RType& right) const {
        return left * right;
      }

      // Gradient of the left-hand argument: multiplier is the stored
      // value of the right-hand argument
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class L, class R>
      static void calc_left(Stack& stack, const L& left, const R& right,
                            const ExpressionSize<NArrays>& loc,
                            const ScratchVector<NScratch>& scratch) {
        left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(
            stack, loc, scratch,
            right.template value_stored_<MyArrayNum+L::n_arrays,
                                         MyScratchNum+L::n_scratch+store_result>(loc, scratch));
      }

      // Gradient of the right-hand argument: multiplier is the stored
      // value of the left-hand argument
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class L, class R>
      static void calc_right(Stack& stack, const L& left, const R& right,
                             const ExpressionSize<NArrays>& loc,
                             const ScratchVector<NScratch>& scratch) {
        right.template calc_gradient_<MyArrayNum+L::n_arrays,
                                      MyScratchNum+L::n_scratch+store_result>(
            stack, loc, scratch,
            left.template value_stored_<MyArrayNum,
                                        MyScratchNum+store_result>(loc, scratch));
      }

      // Gradient of the left-hand argument, additionally scaled by
      // "multiplier"
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class L, class R, typename MyType>
      static void calc_left(Stack& stack, const L& left, const R& right,
                            const ExpressionSize<NArrays>& loc,
                            const ScratchVector<NScratch>& scratch,
                            MyType multiplier) {
        left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(
            stack, loc, scratch,
            multiplier
            *right.template value_stored_<MyArrayNum+L::n_arrays,
                                          MyScratchNum+L::n_scratch+store_result>(loc, scratch));
      }

      // Gradient of the right-hand argument, additionally scaled by
      // "multiplier"
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class L, class R, typename MyType>
      static void calc_right(Stack& stack, const L& left, const R& right,
                             const ExpressionSize<NArrays>& loc,
                             const ScratchVector<NScratch>& scratch,
                             MyType multiplier) {
        right.template calc_gradient_<MyArrayNum+L::n_arrays,
                                      MyScratchNum+L::n_scratch+store_result>(
            stack, loc, scratch,
            multiplier*left.template value_stored_<MyArrayNum,
                                                   MyScratchNum+store_result>(loc, scratch));
      }
    };

    // Policy class implementing operator/.  With store_result==2 two
    // scratch entries are used per node: scratch[MyScratchNum] holds
    // the quotient f = a/b and scratch[MyScratchNum+1] holds 1/b
    // (written by operation_store), so that the gradient routines
    // need only multiplications.
    struct Divide {
      // Printed as an infix operator by expression_string()
      static const bool is_operator   = true;
      // Scratch space used (this can be 1 or 2, but the gradient
      // routines below are written for 2; see the notes in each on
      // what to use instead if it is set to 1)
      static const int  store_result  = 2;
      // Consulted by the expression classes when computing
      // is_vectorizable
      static const bool is_vectorized = true;

      // Symbol used by expression_string()
      const char* operation_string() const {
        return "/";
      }

      // The basic operation
      template <class LType, class RType>
      typename promote<LType, RType>::type
      operation(const LType& left, const RType& right) const {
        return left / right;
      }

      // The basic operation, also saving the reciprocal of the
      // right-hand argument in "one_over_right"; the quotient is
      // formed from that reciprocal
      template <class LType, class RType>
      typename promote<LType, RType>::type
      operation_store(const LType& left, const RType& right,
                      Real& one_over_right) const {
        one_over_right = 1.0 / right;
        return left * one_over_right;
      }

      // Gradient of the left-hand argument.
      // If f(a,b) = a/b then df/da = 1/b, which is the stored
      // reciprocal.  (If store_result==1 the multiplier would instead
      // be 1.0 / right.template value_stored_<MyArrayNum+L::n_arrays,
      // MyScratchNum+L::n_scratch+store_result>(loc, scratch).)
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class L, class R>
      void calc_left(Stack& stack, const L& left, const R&,
                     const ExpressionSize<NArrays>& loc,
                     const ScratchVector<NScratch>& scratch) const {
        left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(
            stack, loc, scratch,
            scratch[MyScratchNum+1]);
      }

      // Gradient of the right-hand argument.
      // If f(a,b) = a/b then df/db = -a/(b*b) = -f/b, i.e. minus the
      // stored quotient times the stored reciprocal.  (If
      // store_result==1 the multiplier would instead be
      // -scratch[MyScratchNum] / right.template value_stored_<
      // MyArrayNum+L::n_arrays,
      // MyScratchNum+L::n_scratch+store_result>(loc, scratch).)
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class L, class R>
      void calc_right(Stack& stack, const L&, const R& right,
                      const ExpressionSize<NArrays>& loc,
                      const ScratchVector<NScratch>& scratch) const {
        right.template calc_gradient_<MyArrayNum+L::n_arrays,
                                      MyScratchNum+L::n_scratch+store_result>(
            stack, loc, scratch,
            -scratch[MyScratchNum] * scratch[MyScratchNum+1]);
      }

      // Gradient of the left-hand argument with a multiplier.
      // If f(a,b) = a/b then w*df/da = w/b.  (If store_result==1 the
      // multiplier would instead be divided by
      // right.template value_stored_<MyArrayNum+L::n_arrays,
      // MyScratchNum+L::n_scratch+store_result>(loc, scratch).)
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class L, class R, typename MyType>
      void calc_left(Stack& stack, const L& left, const R&,
                     const ExpressionSize<NArrays>& loc,
                     const ScratchVector<NScratch>& scratch,
                     MyType multiplier) const {
        left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(
            stack, loc, scratch,
            multiplier*scratch[MyScratchNum+1]);
      }

      // Gradient of the right-hand argument with a multiplier.
      // If f(a,b) = a/b then w*df/db = -w*a/(b*b) = -w*f/b.  (If
      // store_result==1 the multiplier would instead be
      // -multiplier * scratch[MyScratchNum]
      // / right.template value_stored_<MyArrayNum+L::n_arrays,
      // MyScratchNum+L::n_scratch+store_result>(loc, scratch).)
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class L, class R, typename MyType>
      void calc_right(Stack& stack, const L&, const R& right,
                      const ExpressionSize<NArrays>& loc,
                      const ScratchVector<NScratch>& scratch,
                      MyType multiplier) const {
        right.template calc_gradient_<MyArrayNum+L::n_arrays,
                                      MyScratchNum+L::n_scratch+store_result>(
            stack, loc, scratch,
            -multiplier * scratch[MyScratchNum] * scratch[MyScratchNum+1]);
      }
    };

    // Policy class implementing function pow.  For f(a,b) = pow(a,b),
    // df/da = b*pow(a,b-1) and df/db = f*log(a).  The latter reads
    // the stored result f from scratch[MyScratchNum], which is why
    // store_result must be 1.
    struct Pow {
      // Printed in function-call form by expression_string()
      static const bool is_operator   = false;
      // Scratch space used for the result (this CANNOT be changed)
      static const int  store_result  = 1;
      // Consulted by the expression classes when computing
      // is_vectorizable
      static const bool is_vectorized = false;

      // Name used by expression_string()
      const char* operation_string() const {
        return "pow";
      }

      // The basic operation
      template <class LType, class RType>
      typename promote<LType, RType>::type
      operation(const LType& left, const RType& right) const {
        using std::pow;
        return pow(left, right);
      }

      // Gradient of the left-hand argument: multiplier is
      // b * pow(a, b-1), formed from the stored values of both
      // arguments
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class L, class R>
      void calc_left(Stack& stack, const L& left, const R& right,
                     const ExpressionSize<NArrays>& loc,
                     const ScratchVector<NScratch>& scratch) const {
        using std::pow;
        left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(
            stack, loc, scratch,
            right.template value_stored_<MyArrayNum+L::n_arrays,
                                         MyScratchNum+L::n_scratch+store_result>(loc, scratch)
            *pow(left.template value_stored_<MyArrayNum,
                                             MyScratchNum+store_result>(loc, scratch),
                 right.template value_stored_<MyArrayNum+L::n_arrays,
                                              MyScratchNum+L::n_scratch+store_result>(loc, scratch) - 1.0));
      }

      // Gradient of the right-hand argument: multiplier is the stored
      // result times log(a)
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class L, class R>
      void calc_right(Stack& stack, const L& left, const R& right,
                      const ExpressionSize<NArrays>& loc,
                      const ScratchVector<NScratch>& scratch) const {
        using std::log;
        right.template calc_gradient_<MyArrayNum+L::n_arrays,
                                      MyScratchNum+L::n_scratch+store_result>(
            stack, loc, scratch,
            scratch[MyScratchNum]
            * log(left.template value_stored_<MyArrayNum,
                                              MyScratchNum+store_result>(loc, scratch)));
      }

      // Gradient of the left-hand argument, additionally scaled by
      // "multiplier"
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class L, class R, typename MyType>
      void calc_left(Stack& stack, const L& left, const R& right,
                     const ExpressionSize<NArrays>& loc,
                     const ScratchVector<NScratch>& scratch,
                     MyType multiplier) const {
        using std::pow;
        left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(
            stack, loc, scratch,
            multiplier
            *right.template value_stored_<MyArrayNum+L::n_arrays,
                                          MyScratchNum+L::n_scratch+store_result>(loc, scratch)
            *pow(left.template value_stored_<MyArrayNum,
                                             MyScratchNum+store_result>(loc, scratch),
                 right.template value_stored_<MyArrayNum+L::n_arrays,
                                              MyScratchNum+L::n_scratch+store_result>(loc, scratch) - 1.0));
      }

      // Gradient of the right-hand argument, additionally scaled by
      // "multiplier"
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                class L, class R, typename MyType>
      void calc_right(Stack& stack, const L& left, const R& right,
                      const ExpressionSize<NArrays>& loc,
                      const ScratchVector<NScratch>& scratch,
                      MyType multiplier) const {
        using std::log;
        right.template calc_gradient_<MyArrayNum+L::n_arrays,
                                      MyScratchNum+L::n_scratch+store_result>(
            stack, loc, scratch,
            multiplier * scratch[MyScratchNum]
            * log(left.template value_stored_<MyArrayNum,
                                              MyScratchNum+store_result>(loc, scratch)));
      }
    };


    // Policy class implementing function atan2(left, right).  The
    // gradient factors used below are
    //   left-hand argument:   right * scratch[MyScratchNum+1]
    //   right-hand argument: -left  * scratch[MyScratchNum+1]
    // NOTE(review): scratch[MyScratchNum+1] is presumably the
    // "saved_term" 1/(left^2+right^2) written by operation_store();
    // the code filling the scratch vector is not part of this struct
    // - confirm against the caller.
    struct Atan2 {
      static const bool is_operator  = false; // Operator or function for expression_string()
      static const int  store_result = 2;     // Do we need any scratch space? Yes: for 1/(left^2+right^2)
      static const bool is_vectorized = false;

      const char* operation_string() const { return "atan2"; } // For expression_string()
      
      // Implement the basic operation
      template <class LType, class RType>
      typename promote<LType, RType>::type
      operation(const LType& left, const RType& right) const {
	using std::atan2;
	return atan2(left, right);
      }
      // Implement the basic operation and additionally write the
      // reciprocal 1/(left^2+right^2) to "saved_term"
      template <class LType, class RType>
      typename promote<LType, RType>::type
      operation_store(const LType& left, const RType& right, Real& saved_term) const {
	using std::atan2;
	saved_term = 1.0 / (left*left + right*right);
	return atan2(left, right);
      }
            
      // Calculate the gradient of the left-hand argument: the factor
      // passed on is right*scratch[MyScratchNum+1]
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R>
      void calc_left(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
			       const ScratchVector<NScratch>& scratch) const {
        left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(stack, loc, scratch, 
	   right.template value_stored_<MyArrayNum+L::n_arrays,MyScratchNum+L::n_scratch+store_result>(loc, scratch)
	    *scratch[MyScratchNum+1]);
      }

      // Calculate the gradient of the right-hand argument: the factor
      // passed on is -left*scratch[MyScratchNum+1]
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R>
      void calc_right(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
			       const ScratchVector<NScratch>& scratch) const {
        right.template calc_gradient_<MyArrayNum+L::n_arrays, MyScratchNum+L::n_scratch+store_result>(stack, loc, scratch, 
	  -left.template value_stored_<MyArrayNum,MyScratchNum+store_result>(loc, scratch)*scratch[MyScratchNum+1]);
      }

      // Calculate the gradient of the left-hand argument with a
      // multiplier: as above, with the factor scaled by "multiplier"
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R, typename MyType>
      void calc_left(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
			       const ScratchVector<NScratch>& scratch, MyType multiplier) const {
        left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(stack, loc, scratch, 
	   right.template value_stored_<MyArrayNum+L::n_arrays,MyScratchNum+L::n_scratch+store_result>(loc, scratch)
	    *scratch[MyScratchNum+1]*multiplier);
      }

      // Calculate the gradient of the right-hand argument with a
      // multiplier: as above, with the factor scaled by "multiplier"
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R, typename MyType>
      void calc_right(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
			       const ScratchVector<NScratch>& scratch, MyType multiplier) const {
        right.template calc_gradient_<MyArrayNum+L::n_arrays, MyScratchNum+L::n_scratch+store_result>(stack, loc, scratch, 
	  -left.template value_stored_<MyArrayNum,MyScratchNum+store_result>(loc, scratch)*scratch[MyScratchNum+1]*multiplier);
      }
    };


    // Policy class implementing function max
    struct Max {
      static const bool is_operator  = false; // Operator or function for expression_string()
      static const int  store_result = 0;    // Do we need any scratch space? (this can be 0 or 1)
      static const bool is_vectorized = true;

      const char* operation_string() const { return "max"; } // For expression_string()
      
      // Implement the basic operation - first the version for packets
      template <class LType, class RType>
      typename enable_if<is_packet<LType>::value,LType>::type
      operation(const LType& left, const RType& right) const
      { return adept::internal::fmax(left,right); }

#ifndef ADEPT_CXX11_FEATURES
      // For C++98, use simple ternary operation
      template <class LType, class RType>
      typename enable_if<!is_packet<LType>::value,typename promote<LType, RType>::type>::type
      operation(const LType& left, const RType& right) const { return left < right ? right : left; }
#else
      // For C++11 use the (hopefully faster) fmax function for floating-point functions
      template <class LType, class RType>
      typename enable_if<!is_packet<LType>::value &&
                         (!is_floating_point<LType>::value
			  || !is_floating_point<RType>::value),
			 typename promote<LType, RType>::type>::type
      operation(const LType& left, const RType& right) const { return left < right ? right : left; }

      template <class LType, class RType>
      typename enable_if<!is_packet<LType>::value &&
                         (is_floating_point<LType>::value
			  && is_floating_point<RType>::value),
			 typename promote<LType, RType>::type>::type
      operation(const LType& left, const RType& right) const { return std::fmax(left,right); }
#endif
      // Calculate the gradient of the left-hand argument
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R>
      void calc_left(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
			       const ScratchVector<NScratch>& scratch) const {
	if (is_left<MyArrayNum,MyScratchNum>(left,right,loc,scratch)) {
	  left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(stack, loc, scratch);
	}
      }

      // Calculate the gradient of the right-hand argument
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R>
      void calc_right(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
			       const ScratchVector<NScratch>& scratch) const {
	if (!is_left<MyArrayNum,MyScratchNum>(left,right,loc,scratch)) {
	  right.template calc_gradient_<MyArrayNum+L::n_arrays, MyScratchNum+L::n_scratch+store_result>(stack, loc, scratch);
	}
      }

      // Calculate the gradient of the left-hand argument with a multiplier
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R, typename MyType>
      void calc_left(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
			       const ScratchVector<NScratch>& scratch, MyType multiplier) const {
	if (is_left<MyArrayNum,MyScratchNum>(left,right,loc,scratch)) {
	  left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(stack, loc, scratch, multiplier);
	}
      }

      // Calculate the gradient of the right-hand argument with a multiplier
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R, typename MyType>
      void calc_right(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
			       const ScratchVector<NScratch>& scratch, MyType multiplier) const {
	if (!is_left<MyArrayNum,MyScratchNum>(left,right,loc,scratch)) {
	  right.template calc_gradient_<MyArrayNum+L::n_arrays, MyScratchNum+L::n_scratch+store_result>(stack, loc, scratch, multiplier);
	}
      }

    private:
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R>
      bool is_left(const L& left, const R& right, const ExpressionSize<NArrays>& loc,
		   const ScratchVector<NScratch>& scratch) const {
	return left.template value_stored_<MyArrayNum,MyScratchNum+store_result>(loc, scratch)
	  > right.template value_stored_<MyArrayNum+L::n_arrays,MyScratchNum+L::n_scratch+store_result>(loc, scratch);
      }
    };


    // Policy class providing the "min" operation and its gradient.
    // The gradient is routed entirely to one argument: to the left
    // one if its value is less than or equal to the right one, and to
    // the right one otherwise.
    struct Min {
      // Printed by expression_string() as a function rather than an
      // infix operator
      static const bool is_operator  = false;
      // Do we need any scratch space? (this can be 0 or 1)
      static const int  store_result = 0;
      // A packet overload of operation() is provided
      static const bool is_vectorized = true;

      // Name used by expression_string()
      const char* operation_string() const {
        return "min";
      }

      // Forward evaluation for packets
      template <class LType, class RType>
      typename enable_if<is_packet<LType>::value,LType>::type
      operation(const LType& left, const RType& right) const {
        return adept::internal::fmin(left,right);
      }
#ifndef ADEPT_CXX11_FEATURES
      // Forward evaluation for non-packets in C++98: a simple ternary
      // operation
      template <class LType, class RType>
      typename enable_if<!is_packet<LType>::value,typename promote<LType, RType>::type>::type
      operation(const LType& left, const RType& right) const {
        return left < right ? left : right;
      }
#else
      // Forward evaluation for non-packets in C++11 when the two
      // arguments are not both floating point: a simple ternary
      // operation
      template <class LType, class RType>
      typename enable_if<!(is_packet<LType>::value
                           || (is_floating_point<LType>::value
                               && is_floating_point<RType>::value)),
                         typename promote<LType, RType>::type>::type
      operation(const LType& left, const RType& right) const {
        return left < right ? left : right;
      }

      // Forward evaluation for non-packets in C++11 when both
      // arguments are floating point: use the (hopefully faster) fmin
      // function
      template <class LType, class RType>
      typename enable_if<!is_packet<LType>::value
                         && is_floating_point<LType>::value
                         && is_floating_point<RType>::value,
                         typename promote<LType, RType>::type>::type
      operation(const LType& left, const RType& right) const {
        return std::fmin(left,right);
      }
#endif

      // Gradient of the left-hand argument: passed through unchanged
      // if the left value does not exceed the right value, otherwise
      // nothing is done
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R>
      void calc_left(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
                     const ScratchVector<NScratch>& scratch) const {
        if (!left_not_above_right<MyArrayNum,MyScratchNum>(left,right,loc,scratch)) {
          return;
        }
        left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(stack, loc, scratch);
      }

      // Gradient of the right-hand argument: passed through unchanged
      // only if the left value exceeds the right value
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R>
      void calc_right(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
                      const ScratchVector<NScratch>& scratch) const {
        if (left_not_above_right<MyArrayNum,MyScratchNum>(left,right,loc,scratch)) {
          return;
        }
        right.template calc_gradient_<MyArrayNum+L::n_arrays, MyScratchNum+L::n_scratch+store_result>(stack, loc, scratch);
      }

      // Gradient of the left-hand argument scaled by "multiplier"
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R, typename MyType>
      void calc_left(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
                     const ScratchVector<NScratch>& scratch, MyType multiplier) const {
        if (!left_not_above_right<MyArrayNum,MyScratchNum>(left,right,loc,scratch)) {
          return;
        }
        left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(stack, loc, scratch, multiplier);
      }

      // Gradient of the right-hand argument scaled by "multiplier"
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R, typename MyType>
      void calc_right(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
                      const ScratchVector<NScratch>& scratch, MyType multiplier) const {
        if (left_not_above_right<MyArrayNum,MyScratchNum>(left,right,loc,scratch)) {
          return;
        }
        right.template calc_gradient_<MyArrayNum+L::n_arrays, MyScratchNum+L::n_scratch+store_result>(stack, loc, scratch, multiplier);
      }

    private:
      // True if the value of the left-hand argument is less than or
      // equal to the value of the right-hand argument
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R>
      bool left_not_above_right(const L& left, const R& right, const ExpressionSize<NArrays>& loc,
                                const ScratchVector<NScratch>& scratch) const {
        return left.template value_stored_<MyArrayNum,MyScratchNum+store_result>(loc, scratch)
          <= right.template value_stored_<MyArrayNum+L::n_arrays,MyScratchNum+L::n_scratch+store_result>(loc, scratch);
      }
    };

    // Policy class implementing copysign(left, right): the result has
    // the magnitude of "left" and the sign of "right", matching the
    // behaviour of std::copysign for ordinary values.  Signs are
    // determined by comparison with zero, so a negative zero counts
    // as non-negative.
    //
    // The result equals "left" when the two arguments have the same
    // sign and "-left" otherwise, so the derivative with respect to
    // "left" is +1 or -1 respectively, and the derivative with
    // respect to "right" is zero.
    struct CopySign {
      static const bool is_operator  = false;  // Operator or function for expression_string()
      static const int  store_result = 0;     // Do we need any scratch space?
      static const bool is_vectorized = false;

      const char* operation_string() const { return "copysign"; } // For expression_string()

      // Implement the basic operation
      template <class LType, class RType>
      typename promote<LType, RType>::type
      operation(const LType& left, const RType& right) const {
        // Not very efficient but no guarantee that copysign function
        // is available, and also would need to check for
        // compatibility of left and right types.  The sign of "left"
        // must be taken into account: returning "left" whenever
        // "right" is non-negative would wrongly give a negative
        // result for negative "left".
        if ((left >= 0) == (right >= 0)) {
          return left;
        }
        else {
          return -left;
        }
      }

      // Calculate the gradient of the left-hand argument: +1 if the
      // sign of "left" is retained, -1 if it is flipped
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R>
      void calc_left(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
                     const ScratchVector<NScratch>& scratch) const {
        if (is_sign_retained<MyArrayNum,MyScratchNum>(left,right,loc,scratch)) {
          left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(stack, loc, scratch);
        }
        else {
          left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(stack, loc, scratch, -1.0);
        }
      }

      // Calculate the gradient of the right-hand argument
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R>
      void calc_right(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
                      const ScratchVector<NScratch>& scratch) const {
        // Do nothing: gradient of RHS is zero
      }

      // Calculate the gradient of the left-hand argument with a
      // multiplier: the multiplier is negated if the sign of "left"
      // is flipped
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R, typename MyType>
      void calc_left(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
                     const ScratchVector<NScratch>& scratch, MyType multiplier) const {
        if (is_sign_retained<MyArrayNum,MyScratchNum>(left,right,loc,scratch)) {
          left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(stack, loc, scratch, multiplier);
        }
        else {
          left.template calc_gradient_<MyArrayNum, MyScratchNum+store_result>(stack, loc, scratch, -multiplier);
        }
      }

      // Calculate the gradient of the right-hand argument with a multiplier
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R, typename MyType>
      void calc_right(Stack& stack, const L& left, const R& right, const ExpressionSize<NArrays>& loc,
                      const ScratchVector<NScratch>& scratch, MyType multiplier) const {
        // Do nothing: gradient of RHS is zero
      }
    private:
      // True if the result of the operation is "left" rather than
      // "-left", i.e. if the values of the two arguments are either
      // both non-negative or both negative
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class L, class R>
      bool is_sign_retained(const L& left, const R& right, const ExpressionSize<NArrays>& loc,
                            const ScratchVector<NScratch>& scratch) const {
        return (left.template value_stored_<MyArrayNum,MyScratchNum+store_result>(loc, scratch)
                >= 0.0)
          == (right.template value_stored_<MyArrayNum+L::n_arrays,MyScratchNum+L::n_scratch+store_result>(loc, scratch)
              >= 0.0);
      }

    };

    
  } // End namespace internal


// ADEPT_DEFINE_OPERATION(NAME, OPERATOR) defines two overloads of the
// function or operator OPERATOR that build expression objects using
// the policy class internal::NAME:
//  - Expression OP Expression, enabled only when the ranks of the
//    two arguments satisfy internal::rank_compatible; returns an
//    internal::BinaryOperation
//  - Scalar OP Expression, enabled only when the left-hand type
//    satisfies internal::is_not_expression; returns an
//    internal::BinaryOpScalarLeft
// In both cases the element type of the result is given by
// internal::promote applied to the two argument types.
#define ADEPT_DEFINE_OPERATION(NAME, OPERATOR)				\
  template<class L, class R>						\
  inline								\
  typename internal::enable_if<internal::rank_compatible<L::rank, R::rank>::value, \
			       internal::BinaryOperation<typename internal::promote<typename L::type, \
										    typename R::type>::type, \
							 L, internal:: NAME, R> >::type	\
  OPERATOR(const Expression<typename L::type, L>& l,			\
	   const Expression<typename R::type, R>& r)	{		\
    using namespace adept::internal;					\
    return BinaryOperation<typename promote<typename L::type,		\
					    typename R::type>::type,	\
			   L, NAME, R>(l.cast(), r.cast());		\
  }									\
									\
  template<typename LType, class R>					\
  inline								\
  typename internal::enable_if<internal::is_not_expression<LType>::value, \
			       internal::BinaryOpScalarLeft<typename internal::promote<LType, \
										       typename R::type>::type, \
							    LType, internal:: NAME, R> >::type \
  OPERATOR(const LType& l, const Expression<typename R::type, R>& r)	{ \
    using namespace adept::internal;					\
    return BinaryOpScalarLeft<typename promote<LType, typename R::type>::type, \
      LType, NAME, R>(l, r.cast());					\
  }

// ADEPT_DEFINE_SCALAR_RHS_OPERATION(NAME, OPERATOR) defines the
// Expression OP Scalar overload of the function or operator OPERATOR
// using the policy class internal::NAME.  It is enabled only when
// the right-hand type satisfies internal::is_not_expression, and
// returns an internal::BinaryOpScalarRight whose element type is
// given by internal::promote applied to the two argument types.
#define ADEPT_DEFINE_SCALAR_RHS_OPERATION(NAME, OPERATOR)		\
  template<class L, typename RType>					\
  inline								\
  typename internal::enable_if<internal::is_not_expression<RType>::value, \
			       internal::BinaryOpScalarRight<typename internal::promote<typename L::type, \
											RType>::type, \
							     L, internal:: NAME, RType> >::type \
  OPERATOR(const Expression<typename L::type, L>& l, const RType& r) {	\
    using namespace adept::internal;					\
    return BinaryOpScalarRight<typename promote<typename L::type, RType>::type, \
      L, NAME, RType>(l.cast(), r);		\
  }

  // The following define the Expr*Expr and Scalar*Expr overloads of
  // each operation
  ADEPT_DEFINE_OPERATION(Add, operator+)
  ADEPT_DEFINE_OPERATION(Subtract, operator-)
  ADEPT_DEFINE_OPERATION(Multiply, operator*)
  ADEPT_DEFINE_OPERATION(Divide, operator/)
  ADEPT_DEFINE_OPERATION(Pow, pow)
  ADEPT_DEFINE_OPERATION(Atan2, atan2)
  ADEPT_DEFINE_OPERATION(Max, max)
  ADEPT_DEFINE_OPERATION(Min, min)
  // If std::max has been brought into scope via a "using" directive
  // then calling "max" with two arguments of the same type will call
  // the std::max rather than adept::max function, even if these
  // arguments are from the adept namespace. This will cause a compile
  // failure. Likewise with std::min. To avoid this, either don't use
  // "using std::max", or alternatively use Adept's "fmax" and "fmin"
  // functions, which do the same thing but match the C++11 functions
  // std::fmax and std::fmin for floating-point types.  Note that you
  // can use these Adept functions even if you are not using C++11.
  ADEPT_DEFINE_OPERATION(Max, fmax)
  ADEPT_DEFINE_OPERATION(Min, fmin)
  ADEPT_DEFINE_OPERATION(CopySign, copysign)

  // The following define the Expr*Scalar overloads; those in the
  // list above but not below (e.g. Divide) use a custom
  // implementation of Expr*Scalar.
  // NOTE(review): Atan2 is also absent from the list below and no
  // Expr*Scalar overload of atan2 is visible in this part of the
  // file - confirm whether one is defined elsewhere or whether the
  // omission is intentional.
  ADEPT_DEFINE_SCALAR_RHS_OPERATION(Add, operator+)
  ADEPT_DEFINE_SCALAR_RHS_OPERATION(Subtract, operator-)
  ADEPT_DEFINE_SCALAR_RHS_OPERATION(Multiply, operator*)
  ADEPT_DEFINE_SCALAR_RHS_OPERATION(Pow, pow)
  ADEPT_DEFINE_SCALAR_RHS_OPERATION(Max, max)
  ADEPT_DEFINE_SCALAR_RHS_OPERATION(Min, min)
  ADEPT_DEFINE_SCALAR_RHS_OPERATION(Max, fmax)
  ADEPT_DEFINE_SCALAR_RHS_OPERATION(Min, fmin)
  ADEPT_DEFINE_SCALAR_RHS_OPERATION(CopySign, copysign)

  // The helper macros are not needed beyond this point
#undef ADEPT_DEFINE_OPERATION
#undef ADEPT_DEFINE_SCALAR_RHS_OPERATION

  // Division of an expression by a scalar that is either of
  // floating-point type, or of any type when the expression is
  // active: the scalar is converted to the promoted type and the
  // division is expressed as the more efficient multiplication by
  // its reciprocal
  template<class L, typename RType>
  inline
  typename internal::enable_if<internal::is_not_expression<RType>::value 
                               && (internal::is_floating_point<RType>::value || L::is_active),
                               internal::BinaryOpScalarRight<typename internal::promote<typename L::type,
                                                                                        RType>::type,
                                                             L, internal::Multiply, 
                                                             typename internal::promote<typename L::type,
                                                                                        RType>::type> >::type
  operator/(const Expression<typename L::type, L>& l, const RType& r) {
    // Element type of the result, also used to hold the reciprocal
    typedef typename internal::promote<typename L::type, RType>::type Promoted;
    // Expression object representing "l * reciprocal"
    typedef internal::BinaryOpScalarRight<Promoted, L, internal::Multiply, Promoted> Product;
    return Product(l.cast(), 1.0/static_cast<Promoted>(r));
  }

  // Division of an inactive expression by a scalar that is not of
  // floating-point type is kept as a genuine division, with the
  // scalar first converted to the promoted type.  Differentiation is
  // not properly implemented for dividing by a scalar, which is why
  // an active left-hand side is excluded here and is handled by the
  // overload that converts the division to a multiplication.
  template<class L, typename RType>
  inline
  typename internal::enable_if<internal::is_not_expression<RType>::value
                               && (!internal::is_floating_point<RType>::value && !L::is_active),
                               internal::BinaryOpScalarRight<typename internal::promote<typename L::type,
                                                                                        RType>::type,
                                                             L, internal::Divide, 
                                                             typename internal::promote<typename L::type,
                                                                                        RType>::type> >::type
  operator/(const Expression<typename L::type, L>& l, const RType& r) {
    // Element type of the result, also used to hold the divisor
    typedef typename internal::promote<typename L::type, RType>::type Promoted;
    // Expression object representing "l / divisor"
    typedef internal::BinaryOpScalarRight<Promoted, L, internal::Divide, Promoted> Quotient;
    return Quotient(l.cast(), static_cast<Promoted>(r));
  }

// Now the operators returning boolean results

// ADEPT_DEFINE_OPERATOR(NAME, OPERATOR, OPSYMBOL, OPSTRING) defines:
//  - a policy class internal::NAME whose operation() returns the
//    bool result of "left OPSYMBOL right" and whose
//    operation_string() returns OPSTRING;
//  - three overloads of OPERATOR (Expr-Expr, Scalar-Expr and
//    Expr-Scalar) returning a bool-valued expression object; these
//    are enabled only when at least one expression argument has rank
//    greater than zero (and, for Expr-Expr, when the ranks satisfy
//    internal::rank_compatible);
//  - three further overloads of OPERATOR (Expr-Expr, Scalar-Expr and
//    Expr-Scalar) returning a plain bool; these are enabled only
//    when every expression argument has rank zero, and compare the
//    results of scalar_value().
#define ADEPT_DEFINE_OPERATOR(NAME, OPERATOR, OPSYMBOL, OPSTRING)	\
  namespace internal {							\
    struct NAME {							\
      static const bool is_operator  = true;				\
      static const int  store_result = 0;	                        \
      static const bool is_vectorized = false;				\
      const char* operation_string() const { return OPSTRING; }		\
      									\
      template <class LType, class RType>				\
      bool operation(const LType& left, const RType& right) const	\
      { return left OPSYMBOL right; }					\
    };									\
  }									\
									\
  template<class L, class R>						\
  inline								\
  typename internal::enable_if<internal::rank_compatible<L::rank, R::rank>::value \
			       && (L::rank > 0 || R::rank > 0) ,	\
	    internal::BinaryOperation<bool,L,internal:: NAME, R> >::type \
  OPERATOR(const Expression<typename L::type, L>& l,			\
	   const Expression<typename R::type, R>& r)	{		\
    using namespace adept::internal;					\
    return BinaryOperation<bool, L, NAME, R>(l.cast(), r.cast());	\
  }									\
  									\
  template<typename LType, class R>					\
  inline								\
  typename internal::enable_if<internal::is_not_expression<LType>::value \
			       && (R::rank > 0) ,			\
			       internal::BinaryOpScalarLeft<bool,LType,internal:: NAME, R> >::type \
  OPERATOR(const LType& l, const Expression<typename R::type, R>& r) {	\
    using namespace adept::internal;					\
    return BinaryOpScalarLeft<bool, LType, NAME, R>(l, r.cast());	\
  }									\
  									\
  template<class L, typename RType>					\
  inline								\
  typename internal::enable_if<internal::is_not_expression<RType>::value \
		       && (L::rank > 0),			\
       internal::BinaryOpScalarRight<bool, L, internal:: NAME, RType> >::type \
  OPERATOR(const Expression<typename L::type, L>& l, const RType& r) {	\
    using namespace adept::internal;					\
    return BinaryOpScalarRight<bool, L, NAME, RType>(l.cast(), r);	\
  }									\
									\
  template<class L, class R>						\
  inline								\
  typename internal::enable_if<L::rank == 0 && R::rank == 0,		\
			       bool>::type				\
  OPERATOR(const Expression<typename L::type, L>& l,			\
	   const Expression<typename R::type, R>& r) {			\
    return l.scalar_value() OPSYMBOL r.scalar_value();			\
  }									\
  									\
  template<typename LType, class R>					\
  inline								\
  typename internal::enable_if<internal::is_not_expression<LType>::value \
			       && R::rank == 0, bool>::type		\
  OPERATOR(const LType& l, const Expression<typename R::type, R>& r) {	\
    return l OPSYMBOL r.scalar_value();					\
  }									\
  									\
  template<class L, typename RType>					\
  inline								\
  typename internal::enable_if<internal::is_not_expression<RType>::value \
			       && L::rank == 0, bool>::type		\
  OPERATOR(const Expression<typename L::type, L>& l, const RType& r) {	\
    return l.scalar_value() OPSYMBOL r;					\
  }


// These return bool expressions when applied to expressions of rank
// greater than zero, and a plain bool when every expression argument
// has rank zero
ADEPT_DEFINE_OPERATOR(GreaterThan, operator>, >, " > ")
ADEPT_DEFINE_OPERATOR(LessThan, operator<, <, " < ")
ADEPT_DEFINE_OPERATOR(GreaterThanEqualTo, operator>=, >=, " >= ")
ADEPT_DEFINE_OPERATOR(LessThanEqualTo, operator<=, <=, " <= ")
ADEPT_DEFINE_OPERATOR(EqualTo, operator==, ==, " == ")
ADEPT_DEFINE_OPERATOR(NotEqualTo, operator!=, !=, " != ")

// These should only work on bool expressions.  Note that the macro
// itself places no restriction on the element type of the arguments,
// and that overloaded || and && always evaluate both operands (there
// is no short-circuit evaluation).
ADEPT_DEFINE_OPERATOR(Or, operator||, ||, " || ")
ADEPT_DEFINE_OPERATOR(And, operator&&, &&, " && ")

// The helper macro is not needed beyond this point
#undef ADEPT_DEFINE_OPERATOR

  // Return the value of a rank-0 expression as a plain "Type"; this
  // overload does not participate for expressions of any other rank
  template <typename Type, class R>
  inline
  typename internal::enable_if<R::rank == 0,Type>::type
  value(const Expression<Type, R>& r) {
    const Type scalar = r.scalar_value();
    return scalar;
  }

} // End namespace adept


#endif


================================================
FILE: include/adept/Expression.h
================================================
/* Expression.h -- Base class for arrays and active objects

    Copyright (C) 2014-2017 European Centre for Medium-Range Weather Forecasts

    Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/

#ifndef AdeptExpression_H
#define AdeptExpression_H

#include <sstream>
#include <cmath>

#include <adept/ExpressionSize.h>
#include <adept/traits.h>
#include <adept/exception.h>
#include <adept/ScratchVector.h>
#include <adept/Packet.h>

namespace adept {

  using internal::Packet;

  // ---------------------------------------------------------------------
  // SECTION 0: Forward declarations 
  // ---------------------------------------------------------------------
  
  class Stack;

  // ---------------------------------------------------------------------
  // SECTION 1: Definition of Expression type
  // ---------------------------------------------------------------------

  // All types of expression derive from Expression.  "A" is the
  // actual type of the expression (a use of the Curiously Recurring
  // Template Pattern).  Almost every member function below simply
  // casts "this" to A and forwards to a member of A whose name carries
  // a trailing underscore; "Type" is the element type the expression
  // evaluates to.
  template <typename Type, class A>
  struct Expression {

    // Static information about the expression
  public:
    typedef Type type;
    typedef Type value_type; // STL-style

    // There are several "static const" members in the derived
    // classes, some of which require fall-back values, defined here:

    // By default an expression is not vectorizable.
    static const bool is_vectorizable = false;

    // Classes derived from this one that do not define how many
    // scratch variables, active variables or arrays they contain are
    // assumed to need zero
    static const int  n_scratch = 0;

    // Number of active variables in the expression (where each array
    // counts as 1), used to work out how much space must be reserved
    // on the operation stack
    static const int  n_active = 0;

    // Is this an active expression?
    static const bool is_active = false;

    // Expressions cannot be lvalues by default
    static const bool is_lvalue = false;

    // The presence of _adept_expression_flag is used to define the
    // adept::is_not_expression trait
    typedef bool _adept_expression_flag;

    // Cast the expression to its true type, given by the template
    // argument
    const A& cast() const { return static_cast<const A&>(*this); }
    
    // Return the dimensions of the expression in "dim"; the bool
    // result is whatever A::get_dimensions_ returns
    template <int Rank>
    bool get_dimensions(ExpressionSize<Rank>& dim) const {
      return cast().get_dimensions_(dim);
    }

    // Return a string representation of the expression
    std::string expression_string() const {
      return cast().expression_string_();
    }
    
    // Forward to A::value_with_len_(j, len); a compile-time check
    // restricts this to expressions of rank 0 or 1
    Type value_with_len(Index j, Index len) const {
      ADEPT_STATIC_ASSERT(A::rank<=1,
		  VALUE_WITH_LEN_ONLY_APPLICABLE_TO_ARRAYS_OF_RANK_0_OR_1);
      return cast().value_with_len_(j, len);
    }

    // These functions are for rank-0 expressions where there is no
    // indexing required: a dummy zero-length index is passed to
    // value_at_location_
    Type scalar_value() const { 
      ExpressionSize<0> dummy_index;
      return cast().template value_at_location_<0>(dummy_index);
    }

    // Return true if any memory in the expression lies between mem1
    // and mem2: used to test for aliasing when doing assignment.
    bool is_aliased(const Type* mem1, const Type* mem2) const {
      return cast().is_aliased_(mem1, mem2);
    }

    // Return true if the fastest varying dimension of all the arrays
    // in the expression are contiguous and increasing.  If so, we can
    // more simply increment their indices.
    bool all_arrays_contiguous() const {
      return cast().all_arrays_contiguous_();
    }

    // By default, arrays are contiguous (this fall-back used for
    // objects that aren't arrays)
    bool all_arrays_contiguous_() const { return true; }

    // Are all the arrays in the expression aligned to a Packet<Type>
    // boundary?
    // NOTE(review): unlike the other forwarding functions this calls
    // cast().is_aligned() rather than a trailing-underscore member, so
    // a derived class that does not declare its own is_aligned() would
    // resolve back to this function -- confirm against the derived
    // classes.
    bool is_aligned() const {
      return cast().is_aligned();
    }

    // In order to perform optimal vectorization, the first memory
    // addresses of each inner dimension must be aligned
    // appropriately, or they should all have the same offset so that
    // this number of scalar operations can be performed at the start
    // before beginning on vector instructions.  This function returns
    // the offset of the data in any arrays in the expression, or -1 if
    // there is a clash in offsets.
    int alignment_offset() const {
      int val = cast().template alignment_offset_<Packet<Type>::size>();
      if (val < Packet<Type>::size) {
	return val;
      }
      else {
	// Note that if an object returns val==Packet<Type>::size then
	// it indicates that alignment does not matter for this object
	return 0;
      }
    }
    
    // Fall-back position is that alignment doesn't matter for this
    // object, which is encoded by returning n
    template <int n>
    int alignment_offset_() const { return n; }

    // If the sub-expression is of a different type from that
    // requested then we assume there must be no aliasing.
    template <typename MyType>
    typename internal::enable_if<!internal::is_same<MyType,Type>::value, bool>::type
    is_aliased(const MyType* mem1, const MyType* mem2) const {
      return false;
    }
  
    // Rank-0 counterpart of next_value_and_gradient: evaluate the
    // expression, storing intermediate results in a scratch vector of
    // A::n_scratch elements, then call calc_gradient_ with "stack"
    // and that scratch vector, and return the value
    Type 
    scalar_value_and_gradient(Stack& stack) const {
      internal::ScratchVector<A::n_scratch> scratch;
      ExpressionSize<0> dummy_index;
      Type val = cast().template value_at_location_store_<0,0>(dummy_index, scratch);
      cast().template calc_gradient_<0,0>(stack, dummy_index, scratch);
      return val;
    }
 
    // For each array in the expression use location "i" to return the
    // memory index
    template <int Rank, int NArrays>
    void
    set_location(const ExpressionSize<Rank>& i, 
		 ExpressionSize<NArrays>& index) const {
      cast().template set_location_<0>(i, index);
    }

    // Get the value at the specified location and move to the next
    // location
    template <int NArrays>
    Type next_value(ExpressionSize<NArrays>& index) const {
      Type val = cast().template value_at_location_<0>(index);
      cast().template advance_location_<0>(index);
      return val;
    }
    // If all arrays have an inner dimension that is contiguous
    // and increasing then their indices may be incremented all
    // together, which is more efficient
    template <int NArrays>
    Type next_value_contiguous(ExpressionSize<NArrays>& index) const {
      Type val = cast().template value_at_location_<0>(index);
      ++index;
      return val;
    }

    // Get a packet of values at the specified location and increase
    // every element of "index" by Packet<Type>::size
    template <int NArrays>
    Packet<Type> next_packet(ExpressionSize<NArrays>& index) const {
      Packet<Type> val
      	= cast().template packet_at_location_<0>(index);
      index += Packet<Type>::size;
      return val;
    }

    // The two halves of next_value() exposed separately: get the
    // value at the specified location without moving...
    template <int NArrays>
    Type value_at_location(ExpressionSize<NArrays>& index) const {
      return cast().template value_at_location_<0>(index);
    }
    // ...and move to the next location without evaluating
    template <int NArrays>
    void advance_location(ExpressionSize<NArrays>& index) const {
      cast().template advance_location_<0>(index);
    }

    // Get the value at the specified location, calculate the gradient
    // and move to the next location
    template <int NArrays>
    Type next_value_and_gradient(Stack& stack,
				 ExpressionSize<NArrays>& index) const {
      internal::ScratchVector<A::n_scratch> scratch;
      Type val = cast().template value_at_location_store_<0,0>(index, scratch);
      cast().template calc_gradient_<0,0>(stack, index, scratch);
      cast().template advance_location_<0>(index);
      //++index;
      return val;
    }
    // As next_value_and_gradient, but the location is advanced by
    // incrementing every element of "index" together rather than by
    // calling advance_location_
    template <int NArrays>
    Type next_value_and_gradient_contiguous(Stack& stack,
				 ExpressionSize<NArrays>& index) const {
      internal::ScratchVector<A::n_scratch> scratch;
      Type val = cast().template value_at_location_store_<0,0>(index, scratch);
      cast().template calc_gradient_<0,0>(stack, index, scratch);
      //cast().template advance_location_<0>(index);
      ++index;
      return val;
    }

    // This is used in product().  As next_value_and_gradient, except
    // that "multiplier" is passed on to calc_gradient_.
    template <int NArrays, typename MyType>
    Type next_value_and_gradient_special(Stack& stack,
				 ExpressionSize<NArrays>& index,
				 const MyType& multiplier) const {
      internal::ScratchVector<A::n_scratch> scratch;
      Type val = cast().template value_at_location_store_<0,0>(index, scratch);
      cast().template calc_gradient_<0,0>(stack, index, scratch, multiplier);
      cast().template advance_location_<0>(index);
      return val;
    }

    // This is used in norm2().  As next_value_and_gradient_special,
    // except that calc_gradient_ receives multiplier*val, where val
    // is the value just computed.
    template <int NArrays, typename MyType>
    Type next_value_and_gradient_special2(Stack& stack,
				 ExpressionSize<NArrays>& index,
				 const MyType& multiplier) const {
      internal::ScratchVector<A::n_scratch> scratch;
      Type val = cast().template value_at_location_store_<0,0>(index, scratch);
      cast().template calc_gradient_<0,0>(stack, index, scratch, multiplier*val);
      cast().template advance_location_<0>(index);
      return val;
    }

    // Inaccessible methods
    //  private:
    //    Expression(const Expression&) { }

  }; // End struct Expression


  // ---------------------------------------------------------------------
  // SECTION 2: Definition of Scalar type
  // ---------------------------------------------------------------------

  // Specific types of operation are in the adept::internal namespace
  namespace internal {

    // SCALAR

    // A Scalar holds one constant of type Type so that it can take
    // part in an expression as an inactive, rank-0 term.  It contains
    // no arrays, so every location-related member does nothing, every
    // value query returns the stored constant and every gradient
    // calculation is empty.
    template <typename Type>
    struct Scalar : public Expression<Type, Scalar<Type> > {
      static const int  rank       = 0;
      static const int  n_arrays   = 0;
      static const int  n_scratch  = 0;
      static const int  n_active   = 0;
      static const bool is_active  = false;
      static const bool is_vectorizable = true;

      Scalar(const Type& value) : val_(value) { }

      // There are no dimensions to report, so the argument is left
      // untouched and success is always returned
      bool get_dimensions_(ExpressionSize<0>&) const { return true; }

      // The textual form of a scalar is its value sent to a stream
      std::string expression_string_() const {
        std::ostringstream text;
        text << val_;
        return text.str();
      }

      // A scalar never reports aliasing with the supplied memory range
      bool is_aliased_(const Type*, const Type*) const { return false; }

      Type value_with_len_(const Index&, const Index&) const {
        return val_;
      }

      // Nothing to advance
      template <int MyArrayNum, int NArrays>
      void advance_location_(ExpressionSize<NArrays>&) const { }

      // Scalar value queries: the location and scratch arguments are
      // ignored
      template <int MyArrayNum, int NArrays>
      Type value_at_location_(const ExpressionSize<NArrays>&) const {
        return val_;
      }

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_at_location_store_(const ExpressionSize<NArrays>&,
                                    ScratchVector<NScratch>&) const {
        return val_;
      }

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_stored_(const ExpressionSize<NArrays>&,
                         const ScratchVector<NScratch>&) const {
        return val_;
      }

      // Packet value queries: a packet constructed from the stored
      // value is returned
      template <int MyArrayNum, int NArrays>
      Packet<Type>
      packet_at_location_(const ExpressionSize<NArrays>&) const {
        return Packet<Type>(val_);
      }

      template <bool IsAligned, int MyArrayNum, typename PacketType,
                int NArrays>
      PacketType values_at_location_(const ExpressionSize<NArrays>&) const {
        return PacketType(val_);
      }

      template <bool UseStored, bool IsAligned, int MyArrayNum,
                int MyScratchNum, typename PacketType, int NArrays,
                int NScratch>
      PacketType
      values_at_location_store_(const ExpressionSize<NArrays>&,
                                ScratchVector<NScratch,PacketType>&) const {
        return PacketType(val_);
      }

      // Gradient calculations: all four overloads are empty
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      void calc_gradient_(Stack&,
                          const ExpressionSize<NArrays>&,
                          const ScratchVector<NScratch>&) const { }

      template <int MyArrayNum, int MyScratchNum,
                int NArrays, int NScratch, typename MyType>
      void calc_gradient_(Stack&,
                          const ExpressionSize<NArrays>&,
                          const ScratchVector<NScratch>&,
                          const MyType&) const { }

      template <bool IsAligned, int MyArrayNum, int MyScratchNum,
                int MyActiveNum, int NArrays, int NScratch, int NActive>
      void calc_gradient_packet_(Stack&,
                                 const ExpressionSize<NArrays>&,
                                 const ScratchVector<NScratch,Packet<Real> >&,
                                 ScratchVector<NActive,Packet<Real> >&) const { }

      template <bool IsAligned, int MyArrayNum, int MyScratchNum,
                int MyActiveNum, int NArrays, int NScratch, int NActive,
                typename MyType>
      void calc_gradient_packet_(Stack&,
                                 const ExpressionSize<NArrays>&,
                                 const ScratchVector<NScratch,Packet<Real> >&,
                                 ScratchVector<NActive,Packet<Real> >&,
                                 const MyType&) const { }

      // Nothing to locate
      template <int MyArrayNum, int Rank, int NArrays>
      void set_location_(const ExpressionSize<Rank>&,
                         ExpressionSize<NArrays>&) const { }

    protected:
      // The wrapped constant
      Type val_;

    };


    // ---------------------------------------------------------------------
    // SECTION 3. "expr_cast" helper 
    // ---------------------------------------------------------------------

    // expr_cast exposes the compile-time properties of an expression
    // type, and is useful when you don't know whether a template
    // argument to a function is an Expression or a class derived
    // from it.  Thus expr_cast<Expression<double,Array> >::is_vectorizable
    // and expr_cast<Array>::is_vectorizable would both return
    // Array::is_vectorizable.  This primary template handles the
    // case where E is the derived class itself.
    template <class E>
    struct expr_cast {
      // Rank of the array
      static const int  rank = E::rank;
      // Is this an array expression?
      static const bool is_array = (rank > 0);
      // Is this an array expression with dimension of 2 or more?
      static const bool is_multidimensional = (rank > 1);

      // Number of scratch floating-point variables needed in the
      // expression, for example to store the result of a calculation
      // when it is needed again to compute the equivalent
      // differential statement
      static const int  n_scratch = E::n_scratch;
      // Number of arrays within the expression; more specifically,
      // the number of indices required to store the location of an
      // element of the array
      static const int  n_arrays = E::n_arrays;
      // Number of active terms in the expression
      static const int  n_active = E::n_active;

      // Is this an active expression?
      static const bool is_active = E::is_active;
      // Is this expression actually an lvalue such as Array or
      // FixedArray?
      static const bool is_lvalue = E::is_lvalue;
      // Is this expression vectorizable (conditional on a few extra
      // run-time checks)?
      static const bool is_vectorizable = E::is_vectorizable;
    };

    // Partial specialization for the case where the argument is the
    // Expression base class: each property is read from the derived
    // class E rather than from the Expression base, so the results
    // match expr_cast<E>.
    template <typename T, class E>
    struct expr_cast<Expression<T,E> > {
      static const int  rank = E::rank;
      static const bool is_array = (rank > 0);
      static const bool is_multidimensional = (rank > 1);

      static const int  n_scratch = E::n_scratch;
      static const int  n_arrays = E::n_arrays;
      static const int  n_active = E::n_active;

      static const bool is_active = E::is_active;
      static const bool is_lvalue = E::is_lvalue;
      static const bool is_vectorizable = E::is_vectorizable;
    };

  }
}

#endif // AdeptExpression_H


================================================
FILE: include/adept/ExpressionSize.h
================================================
/* ExpressionSize.h -- Class for describing array sizes

    Copyright (C) 2014-2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.


   The ExpressionSize class is used to pass information between
   components of an expression on the dimensions (e.g. rows/columns,
   but works in any number of dimensions) of that part of an
   expression, and to check that the dimensions match.  Since
   ExpressionSize objects can be used to index arrays, they may be
   useful to users and so are not placed in the "adept::internal"
   namespace.

*/

#ifndef AdeptExpressionSize_H
#define AdeptExpressionSize_H

#include <string>
#include <sstream>

#include <adept/base.h>
#include <adept/traits.h>

namespace adept {

  // Definition of ExpressionSize class: a fixed-length list of
  // "Rank" Index values that can hold either the dimensions of an
  // expression or a set of indices into it
  template <int Rank>
  class ExpressionSize {
  public:
    // Constructors

    // By default no initialization is done
    ExpressionSize() { }

    // A negative value is stored in the first dimension only (usually
    // to indicate an invalid expression); any other value is copied
    // to all dimensions - usually 0 (empty array) or 1 (scalar)
    ExpressionSize(Index j) {
      if (j < 0) {
        dim[0] = j;
      }
      else {
        set_all(j);
      }
    }

    // Element-by-element constructors for two to seven dimensions
    ExpressionSize(Index j0, Index j1) {
      dim[0] = j0;
      dim[1] = j1;
    }
    ExpressionSize(Index j0, Index j1, Index j2) {
      dim[0] = j0;
      dim[1] = j1;
      dim[2] = j2;
    }
    ExpressionSize(Index j0, Index j1, Index j2, Index j3) {
      dim[0] = j0;
      dim[1] = j1;
      dim[2] = j2;
      dim[3] = j3;
    }
    ExpressionSize(Index j0, Index j1, Index j2, Index j3, Index j4) {
      dim[0] = j0;
      dim[1] = j1;
      dim[2] = j2;
      dim[3] = j3;
      dim[4] = j4;
    }
    ExpressionSize(Index j0, Index j1, Index j2, Index j3, Index j4,
                   Index j5) {
      dim[0] = j0;
      dim[1] = j1;
      dim[2] = j2;
      dim[3] = j3;
      dim[4] = j4;
      dim[5] = j5;
    }
    ExpressionSize(Index j0, Index j1, Index j2, Index j3, Index j4,
                   Index j5, Index j6) {
      dim[0] = j0;
      dim[1] = j1;
      dim[2] = j2;
      dim[3] = j3;
      dim[4] = j4;
      dim[5] = j5;
      dim[6] = j6;
    }

    // The implicitly generated copy constructor is relied upon to
    // copy the elements of dim

    // A negative first element conveys an "invalid" expression, i.e.
    // one involving a mismatch of array sizes
    bool invalid_expression() const {
      return dim[0] < 0;
    }

    // Assign the same value to every dimension
    void set_all(Index j) {
      for (int k = 0; k < Rank; ++k) {
        dim[k] = j;
      }
    }

    // Copy every element from an ExpressionSize object of the same
    // rank...
    void copy(const ExpressionSize& d) {
      for (int k = 0; k < Rank; ++k) {
        dim[k] = d[k];
      }
    }
    // ...or from the first Rank elements of a raw array
    void copy(const Index* d) {
      for (int k = 0; k < Rank; ++k) {
        dim[k] = d[k];
      }
    }

    // Copy from an ExpressionSize object of a different rank: the
    // leading dimensions common to both are copied and any remaining
    // dimensions of this object are set to 1
    template <int MyRank>
    void copy_dissimilar(const ExpressionSize<MyRank>& d) {
      const int n_shared = (MyRank < Rank) ? MyRank : Rank;
      for (int k = 0; k < n_shared; ++k) {
        dim[k] = d[k];
      }
      for (int k = n_shared; k < Rank; ++k) {
        dim[k] = 1;
      }
    }

    // Comma-separated list of the elements in square brackets
    std::string str() const {
      std::ostringstream text;
      text << "[" << dim[0];
      for (int k = 1; k < Rank; ++k) {
        text << "," << dim[k];
      }
      text << "]";
      return text.str();
    }

    // Total number of elements, i.e. the product of all dimensions
    Index size() const {
      if (Rank == 0) {
        return 1;
      }
      Index prod = dim[0];
      for (int k = 1; k < Rank; ++k) {
        prod *= dim[k];
      }
      return prod;
    }

    // Add one to every element
    ExpressionSize& operator++() {
      for (int k = 0; k < Rank; ++k) {
        ++dim[k];
      }
      return *this;
    }
    // Add "inc" to every element
    ExpressionSize& operator+=(Index inc) {
      for (int k = 0; k < Rank; ++k) {
        dim[k] += inc;
      }
      return *this;
    }

    // Two objects are equal if all their elements match
    bool operator==(const ExpressionSize<Rank>& rhs) const {
      for (int k = 0; k < Rank; ++k) {
        if (dim[k] != rhs[k]) {
          return false;
        }
      }
      return true;
    }
    bool operator!=(const ExpressionSize<Rank>& rhs) const {
      return !operator==(rhs);
    }

#ifdef ADEPT_MOVE_SEMANTICS
    // Exchange the contents of two objects element by element
    friend void swap(ExpressionSize<Rank>& l,
                     ExpressionSize<Rank>& r) noexcept {
      for (int k = 0; k < Rank; ++k) {
        const Index held = l.dim[k];
        l.dim[k] = r.dim[k];
        r.dim[k] = held;
      }
    }
#endif

    // Unchecked element access, non-const and const
    Index& operator[](int i) { return dim[i]; }
    const Index& operator[](int i) const { return dim[i]; }

  private:
    Index dim[Rank];
  };

  // Specialization for scalars (zero-rank arrays) known at compile
  // time
  template <>
  class ExpressionSize<0> {
  public:
    ExpressionSize() { }
    ExpressionSize(Index j) { }
    bool invalid_expression() const { return false; }
    std::string str() const { return ""; }
    void set_all(Index) const { }
    bool operator==(const ExpressionSize<0>&) const { return true; }
    bool operator!=(const ExpressionSize<0>&) const { return false; }
    bool operator[](int) const { return 0; }
    template <int MyRank>
    void copy_dissimilar(const ExpressionSize<MyRank>&) { }
  };

  // Send the size of an expression to a stream as a comma-separated
  // list of dimensions in parentheses, e.g. "(3,4)".  Nothing is
  // written for a rank-0 size.  The stream is returned in all cases:
  // previously control fell off the end of the function when Rank
  // was zero, which is undefined behaviour for a function returning
  // a reference.
  template <int Rank>
  inline
  std::ostream& operator<<(std::ostream& os, const ExpressionSize<Rank>& s) {
    if (Rank > 0) {
      os << "(" << s[0];
      for (int i = 1; i < Rank; i++) {
	os << "," << s[i];
      }
      os << ")";
    }
    return os;
  }
 

  namespace internal {
    // The following are only used within the Adept library

    // Check whether the size of one expression is compatible with
    // that of another for arithmetic operations: this is "true" if
    // the rank is the same and the dimensions match, or if one of the
    // expressions is a scalar (zero rank).  If the ranks don't match
    // and neither is zero then the program won't compile.

    // Equal ranks of two or more: every dimension must match
    template <int LRank, int RRank>
    inline
    typename enable_if<LRank==RRank && (LRank>1), bool>::type
    compatible(const ExpressionSize<LRank>& l, const ExpressionSize<RRank>& r) {
      for (int k = 0; k < RRank; ++k) {
        if (!(l[k] == r[k])) {
          return false;
        }
      }
      return true;
    }

    // Two rank-1 sizes are compatible if their single dimensions
    // are equal
    template <int LRank, int RRank>
    inline
    typename enable_if<LRank==1 && RRank==1, bool>::type
    compatible(const ExpressionSize<LRank>& l, const ExpressionSize<RRank>& r) {
      const bool same_length = (l[0] == r[0]);
      return same_length;
    }

    // If either rank is zero the sizes are always compatible, so the
    // arguments are not inspected
    template <int LRank, int RRank>
    inline
    typename enable_if<LRank==0 || RRank==0, bool>::type
    compatible(const ExpressionSize<LRank>&, const ExpressionSize<RRank>&) {
      return true;
    }

    // Return an ExpressionSize object of specified rank that expresses
    // an invalid expression; it is constructed from the single value
    // -1
    template <int Rank>
    inline
    ExpressionSize<Rank> invalid_expression_size() {
      ExpressionSize<Rank> invalid(-1);
      return invalid;
    }

  } // End namespace internal

  // Deprecated: build a rank-1 or rank-2 ExpressionSize from its
  // dimensions
  inline ExpressionSize<1> expression_size(Index j0) {
    ExpressionSize<1> dims(j0);
    return dims;
  }
  inline ExpressionSize<2> expression_size(Index j0, Index j1) {
    ExpressionSize<2> dims(j0, j1);
    return dims;
  }

  // Use this instead: build an ExpressionSize whose rank equals the
  // number of arguments (one to seven)
  inline ExpressionSize<1> dimensions(Index j0) {
    ExpressionSize<1> dims(j0);
    return dims;
  }
  inline ExpressionSize<2> dimensions(Index j0, Index j1) {
    ExpressionSize<2> dims(j0, j1);
    return dims;
  }
  inline ExpressionSize<3> dimensions(Index j0, Index j1, Index j2) {
    ExpressionSize<3> dims(j0, j1, j2);
    return dims;
  }
  inline ExpressionSize<4> dimensions(Index j0, Index j1, Index j2,
                                      Index j3) {
    ExpressionSize<4> dims(j0, j1, j2, j3);
    return dims;
  }
  inline ExpressionSize<5> dimensions(Index j0, Index j1, Index j2,
                                      Index j3, Index j4) {
    ExpressionSize<5> dims(j0, j1, j2, j3, j4);
    return dims;
  }
  inline ExpressionSize<6> dimensions(Index j0, Index j1, Index j2,
                                      Index j3, Index j4, Index j5) {
    ExpressionSize<6> dims(j0, j1, j2, j3, j4, j5);
    return dims;
  }
  inline ExpressionSize<7> dimensions(Index j0, Index j1, Index j2,
                                      Index j3, Index j4, Index j5,
                                      Index j6) {
    ExpressionSize<7> dims(j0, j1, j2, j3, j4, j5, j6);
    return dims;
  }


} // End namespace adept

#endif // AdeptExpressionSize_H


================================================
FILE: include/adept/FixedArray.h
================================================
/* FixedArray.h -- active or inactive FixedArray of arbitrary rank

    Copyright (C) 2014-2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.


   The FixedArray class has functionality modelled on Fortran-90 arrays -
   they can have a rank up to 7 (above will work, but some forms of
   indexing these arrays will not work).

*/

#ifndef AdeptFixedArray_H
#define AdeptFixedArray_H 1

#include <iostream>
#include <sstream>
#include <limits>
#include <complex>

#include <adept/Array.h>
#include <adept/Allocator.h>

namespace adept {

  namespace internal {

    // -------------------------------------------------------------------
    // Helper classes
    // -------------------------------------------------------------------

    // The following are used by expression_string() to obtain the
    // name of a fixed array type from its rank and whether it is
    // active.  Active names carry an "a" prefix; ranks 1 and 2 are
    // named "Vector" and "Matrix" and every other rank "Array".
    template <int Rank, bool IsActive>
    struct fixed_array_helper {
      const char* name() { return "FixedArray"; }
    };

    template <int Rank>
    struct fixed_array_helper<Rank,true> {
      const char* name() { return "aFixedArray"; }
    };

    // Rank 1
    template <>
    struct fixed_array_helper<1,false> {
      const char* name() { return "FixedVector"; }
    };

    template <>
    struct fixed_array_helper<1,true> {
      const char* name() { return "aFixedVector"; }
    };

    // Rank 2
    template <>
    struct fixed_array_helper<2,false> {
      const char* name() { return "FixedMatrix"; }
    };

    template <>
    struct fixed_array_helper<2,true> {
      const char* name() { return "aFixedMatrix"; }
    };

    // Compile-time properties of a fixed array with dimensions J0 to
    // J6.  "rank" is the number of leading dimensions that are
    // greater than zero, counting stops at the first that is not.
    // "length" is the product of all seven dimensions after adding
    // one to any dimension that is less than one, so that unused
    // (zero) dimensions contribute a factor of one.
    template<Index J0, Index J1, Index J2, Index J3,
             Index J4, Index J5, Index J6>
    struct fixed_array {
      static const int rank
        = (J0 > 0) ? 1 +
         ((J1 > 0) ? 1 +
         ((J2 > 0) ? 1 +
         ((J3 > 0) ? 1 +
         ((J4 > 0) ? 1 +
         ((J5 > 0) ? 1 +
         ((J6 > 0) ? 1 : 0) : 0) : 0) : 0) : 0) : 0) : 0;
      static const Index length
        = ((J0 < 1) ? J0 + 1 : J0)
        * ((J1 < 1) ? J1 + 1 : J1)
        * ((J2 < 1) ? J2 + 1 : J2)
        * ((J3 < 1) ? J3 + 1 : J3)
        * ((J4 < 1) ? J4 + 1 : J4)
        * ((J5 < 1) ? J5 + 1 : J5)
        * ((J6 < 1) ? J6 + 1 : J6);
    };

  } // End namespace internal


  // -------------------------------------------------------------------
  // Definition of FixedArray class
  // -------------------------------------------------------------------
  template<typename Type, bool IsActive, Index J0, Index J1 = 0, 
	   Index J2 = 0, Index J3 = 0, Index J4 = 0, Index J5 = 0, Index J6 = 0>
  class FixedArray
    : public Expression<Type,FixedArray<Type,IsActive,J0,J1,J2,J3,J4,J5,J6> >,
      protected internal::GradientIndex<IsActive> {

  public:
    // -------------------------------------------------------------------
    // FixedArray: 1. Static Definitions
    // -------------------------------------------------------------------

    // The Expression base class needs access to some protected member
    // functions in section 5
    friend struct Expression<Type,FixedArray<Type,IsActive,J0,J1,J2,J3,J4,J5,J6> >;

    // Static definitions to enable the properties of this type of
    // expression to be discerned at compile time
    static const bool is_active  = IsActive;
    static const bool is_lvalue  = true;
    static const int  rank       = internal::fixed_array<J0,J1,J2,J3,J4,J5,J6>::rank;
    static const int  length_    = internal::fixed_array<J0,J1,J2,J3,J4,J5,J6>::length;
    static const int  n_active   = IsActive * (1 + internal::is_complex<Type>::value);
    static const int  n_scratch  = 0;
    static const int  n_arrays   = 1;
    static const bool is_vectorizable = Packet<Type>::is_vectorized;

  protected:
    // Compile-time selector: dimension_alias<Dim,X0,...,X6>::value is
    // the Dim-th of the seven values X0 to X6.  The primary template
    // is empty, so a Dim outside the range 0 to 6 has no "value"
    // member and its use fails to compile.
    template <int Dim, Index X0, Index X1, Index X2,
	      Index X3, Index X4, Index X5, Index X6>
    struct dimension_alias { };
    // One partial specialization for each valid Dim
    template <Index X0, Index X1, Index X2,
	      Index X3, Index X4, Index X5, Index X6>
    struct dimension_alias<0,X0,X1,X2,X3,X4,X5,X6>
    { static const Index value = X0; };
    template <Index X0, Index X1, Index X2,
	      Index X3, Index X4, Index X5, Index X6>
    struct dimension_alias<1,X0,X1,X2,X3,X4,X5,X6>
    { static const Index value = X1; };
    template <Index X0, Index X1, Index X2,
	      Index X3, Index X4, Index X5, Index X6>
    struct dimension_alias<2,X0,X1,X2,X3,X4,X5,X6>
    { static const Index value = X2; };
    template <Index X0, Index X1, Index X2,
	      Index X3, Index X4, Index X5, Index X6>
    struct dimension_alias<3,X0,X1,X2,X3,X4,X5,X6>
    { static const Index value = X3; };
    template <Index X0, Index X1, Index X2,
	      Index X3, Index X4, Index X5, Index X6>
    struct dimension_alias<4,X0,X1,X2,X3,X4,X5,X6>
    { static const Index value = X4; };
    template <Index X0, Index X1, Index X2,
	      Index X3, Index X4, Index X5, Index X6>
    struct dimension_alias<5,X0,X1,X2,X3,X4,X5,X6>
    { static const Index value = X5; };
    template <Index X0, Index X1, Index X2,
	      Index X3, Index X4, Index X5, Index X6>
    struct dimension_alias<6,X0,X1,X2,X3,X4,X5,X6>
    { static const Index value = X6; };

  public:
    // dimension_<Dim>::value is the length of dimension Dim of this
    // array, i.e. the Dim-th of the template arguments J0 to J6.
    // Note that it is stored as an int although the arguments are of
    // type Index.
    template <int Dim> struct dimension_ { static const int value 
      = dimension_alias<Dim,J0,J1,J2,J3,J4,J5,J6>::value; };

    // Recursive helper for offset_ below.  For RankMinusDim of 2 or
    // more the value is the length of dimension Dim+1 multiplied by
    // the helper's value for the next dimension; the recursion stops
    // when RankMinusDim reaches 1, where the value is 1.
    template <int RankMinusDim, int Dim>
    struct offset_helper { 
      static const Index value = // Dim == Rank-1 ? 1 :
	dimension_<Dim+1>::value*offset_helper<RankMinusDim-1, Dim+1>::value; 
    };
    template <int Dim>
    struct offset_helper<1,Dim> { static const Index value = 1; };
    // RankMinusDim of 0 down to -5 arises when Dim is not less than
    // the rank; the value is 1 in these cases too
    template <int Dim>
    struct offset_helper<0,Dim> { static const Index value = 1; };
    template <int Dim>
    struct offset_helper<-1,Dim> { static const Index value = 1; };
    template <int Dim>
    struct offset_helper<-2,Dim> { static const Index value = 1; };
    template <int Dim>
    struct offset_helper<-3,Dim> { static const Index value = 1; };
    template <int Dim>
    struct offset_helper<-4,Dim> { static const Index value = 1; };
    template <int Dim>
    struct offset_helper<-5,Dim> { static const Index value = 1; };

    // offset_<Dim>::value is the product of the lengths of the
    // dimensions after Dim (Dim+1 up to rank-1), and 1 for the last
    // dimension.  Presumably this is the separation in memory of
    // consecutive elements along dimension Dim -- confirm against the
    // indexing code.
    template <int Dim> struct offset_ { static const Index value
      = offset_helper<rank-Dim, Dim>::value; };


    // -------------------------------------------------------------------
    // FixedArray: 2. Constructors
    // -------------------------------------------------------------------
    
    // Default constructor.  The GradientIndex base is constructed
    // with the total number of elements, and a compile-time check
    // rejects active arrays of an integer type.  The elements are
    // left unset unless ADEPT_REAL_INIT is defined, in which case
    // they are filled by initialize().
    FixedArray() : internal::GradientIndex<IsActive>(length_, false) {
      ADEPT_STATIC_ASSERT(!(std::numeric_limits<Type>::is_integer
			    && IsActive), CANNOT_CREATE_ACTIVE_FIXED_ARRAY_OF_INTEGERS);
#ifdef ADEPT_REAL_INIT
      initialize<Type>();
#endif 
    }

#ifdef ADEPT_REAL_INIT
  private:

    // Initialize to zero, NaN or whatever for debugging.  Three
    // overloads are selected at compile time according to the type
    // T, which the constructor supplies as the element type.

    // Floating-point elements: every element is set to
    // ADEPT_INIT_REAL
    template <typename T>
    typename internal::enable_if<internal::is_floating_point<T>::value, void>::type
    initialize() {
      for (int i = 0; i < length_; ++i) {
	data_[i] = ADEPT_INIT_REAL;
      }
    }
    // Complex elements: the real and imaginary parts are both set to
    // a signaling NaN if ADEPT_INIT_REAL_SNAN is defined, otherwise
    // to ADEPT_INIT_REAL
    template <typename T>
    typename internal::enable_if<internal::is_complex<T>::value, void>::type
    initialize() {
      for (int i = 0; i < length_; ++i) {
#ifdef ADEPT_INIT_REAL_SNAN
        data_[i] = std::complex<typename Type::value_type>(
          std::numeric_limits<typename Type::value_type>::signaling_NaN(),
	  std::numeric_limits<typename Type::value_type>::signaling_NaN());
#else
	data_[i] = std::complex<typename Type::value_type>(ADEPT_INIT_REAL, ADEPT_INIT_REAL);
#endif
      }
    }

    // Dummy initialize for non-floats: the elements are left as they
    // are
    template <typename T>
    typename internal::enable_if<!internal::is_floating_point<T>::value
				 && !internal::is_complex<T>::value, void>::type
    initialize() { }

  public:
#endif

    // Copy constructor copies the data, unlike in the Array class.
    // The GradientIndex base is constructed afresh, exactly as in the
    // default constructor, and the contents are then transferred by
    // the assignment operator.
    FixedArray(const FixedArray& rhs) 
      : internal::GradientIndex<IsActive>(length_, false)
    { *this = rhs; }

  public:
    // Initialize with an expression on the right hand side by
    // evaluating the expression, requiring the ranks to be equal.
    // The rank requirement is enforced at compile time by the unused
    // second argument, which removes this constructor from
    // consideration when E::rank differs from rank; the evaluation
    // itself is performed by the assignment operator.
    // Note that this constructor enables expressions to be used as
    // arguments to functions that expect an array - to prevent this
    // implicit conversion, use the "explicit" keyword.
    template<typename EType, class E>
    FixedArray(const Expression<EType, E>& rhs,
	  typename internal::enable_if<E::rank == rank,int>::type = 0)
      : internal::GradientIndex<IsActive>(length_, false)
    { *this = rhs; }

#ifdef ADEPT_CXX11_FEATURES
    // Construct from a flat initializer list by forwarding it to the
    // matching assignment operator
    template <typename T>
    FixedArray(std::initializer_list<T> list)
      : internal::GradientIndex<IsActive>(length_, false)
    {
      this->operator=(list);
    }

    // Initializer-list constructors cannot be written generically
    // over the nesting depth, so every supported rank gets its own
    // overload; this is the two-level version
    template <typename T>
    FixedArray(std::initializer_list<std::initializer_list<T> > list)
      : internal::GradientIndex<IsActive>(length_, false)
    {
      this->operator=(list);
    }

    // Three-level nested initializer list
    template <typename T>
    FixedArray(std::initializer_list<std::initializer_list<
               std::initializer_list<T> > > list)
      : internal::GradientIndex<IsActive>(length_, false)
    {
      this->operator=(list);
    }

    // Four-level nested initializer list
    template <typename T>
    FixedArray(std::initializer_list<std::initializer_list<
               std::initializer_list<std::initializer_list<T> > > > list)
      : internal::GradientIndex<IsActive>(length_, false)
    {
      this->operator=(list);
    }

    // Five-level nested initializer list
    template <typename T>
    FixedArray(std::initializer_list<std::initializer_list<
               std::initializer_list<std::initializer_list<
               std::initializer_list<T> > > > > list)
      : internal::GradientIndex<IsActive>(length_, false)
    {
      this->operator=(list);
    }

    // Six-level nested initializer list
    template <typename T>
    FixedArray(std::initializer_list<std::initializer_list<
               std::initializer_list<std::initializer_list<
               std::initializer_list<std::initializer_list<T> > > > > > list)
      : internal::GradientIndex<IsActive>(length_, false)
    {
      this->operator=(list);
    }

    // Seven-level nested initializer list
    template <typename T>
    FixedArray(std::initializer_list<std::initializer_list<
               std::initializer_list<std::initializer_list<
               std::initializer_list<std::initializer_list<
               std::initializer_list<T> > > > > > > list)
      : internal::GradientIndex<IsActive>(length_, false)
    {
      this->operator=(list);
    }
    
#endif

    // Destructor: the only action taken is to call unregister() on
    // the GradientIndex base for all length_ elements (presumably
    // releasing the gradient indices reserved at construction -
    // confirm against GradientIndex).
    ~FixedArray()
    {
      internal::GradientIndex<IsActive>::unregister(length_);
    }

    // -------------------------------------------------------------------
    // FixedArray: 3. Assignment operators
    // -------------------------------------------------------------------

    // Copy assignment: the elements of rhs are copied.  A copy
    // assignment operator has to be written out explicitly because
    // C++ would otherwise generate a default one; all it does is view
    // rhs as an Expression so that the operator=(const Expression&)
    // overload performs the work.
    FixedArray& operator=(const FixedArray& rhs) {
      const Expression<Type,FixedArray>& rhs_expr = rhs;
      *this = rhs_expr;
      return *this;
    }

    // Assign an array expression of the same rank.  Unless
    // ADEPT_NO_DIMENSION_CHECKING is defined, a size_mismatch
    // exception is thrown if the expression is internally
    // inconsistent or not compatible with the size of this array.
    template <typename EType, class E>
    inline
    typename internal::enable_if<E::rank == rank, FixedArray&>::type
    operator=(const Expression<EType,E>& rhs) {
#ifndef ADEPT_NO_DIMENSION_CHECKING
      ExpressionSize<rank> rhs_dims;
      if (!rhs.get_dimensions(rhs_dims)) {
        std::string msg = "FixedArray size mismatch in ";
        msg += rhs.expression_string();
        msg += ".";
        throw size_mismatch(msg ADEPT_EXCEPTION_LOCATION);
      }
      if (!internal::compatible(rhs_dims, dimensions())) {
        std::string msg = "Expr";
        msg += rhs_dims.str() + " object assigned to " + expression_string_();
        throw size_mismatch(msg ADEPT_EXCEPTION_LOCATION);
      }
#endif
      // The active or passive implementation is picked by the
      // template arguments of this protected helper
      assign_expression_<rank, IsActive, E::is_active>(rhs);

      return *this;
    }

    // Assign a single non-expression value; the value is copied to
    // every element of the array
    template <typename RType>
    typename internal::enable_if<internal::is_not_expression<RType>::value,
                                 FixedArray&>::type
    operator=(RType rhs)
    {
      assign_inactive_scalar_<rank,IsActive>(rhs);
      return *this;
    }

    // Assign a rank-0 active expression (one that is not an l-value)
    // to an active array: the expression is first evaluated into an
    // Active scalar, which is then assigned to the array
    template <typename EType, class E>
    typename internal::enable_if<E::rank == 0 && (rank > 0) && IsActive && !E::is_lvalue,
                                 FixedArray&>::type
    operator=(const Expression<EType,E>& rhs) {
      Active<EType> evaluated = rhs;
      return *this = evaluated;
    }

    // Assign an active scalar to an active array: every element takes
    // the scalar's value, and for each element one statement with
    // multiplier 1.0 on the scalar's gradient index is pushed onto
    // the active stack.  A compile-time error results if the array is
    // inactive.
    template <typename PType>
    FixedArray&
    operator=(const Active<PType>& rhs) {
      ADEPT_STATIC_ASSERT(IsActive, ATTEMPT_TO_ASSIGN_ACTIVE_SCALAR_TO_INACTIVE_FIXED_ARRAY);
#ifdef ADEPT_RECORDING_PAUSABLE
      // If nothing is being recorded, only the values are copied
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        assign_inactive_scalar_<rank,IsActive>(rhs.scalar_value());
        return *this;
      }
#endif
      // Convert once to the element type, in case PType != Type
      const Type scalar = rhs.scalar_value();

      ADEPT_ACTIVE_STACK->check_space(length_);
      Index i = 0;
      while (i < length_) {
        data_[i] = scalar;
        ADEPT_ACTIVE_STACK->push_rhs(1.0, rhs.gradient_index());
        ADEPT_ACTIVE_STACK->push_lhs(gradient_index()+i);
        ++i;
      }

      return *this;
    }
    
#define ADEPT_DEFINE_OPERATOR(NAME, OP)                         \
    template <class RType>                                      \
    FixedArray& NAME(const RType& rhs) {                        \
      return this->operator=(noalias(*this OP rhs));            \
    }
    // Each compound assignment "A op= B" is carried out as
    // "A = noalias(A op B)"
    ADEPT_DEFINE_OPERATOR(operator+=, +)
    ADEPT_DEFINE_OPERATOR(operator-=, -)
    ADEPT_DEFINE_OPERATOR(operator*=, *)
    ADEPT_DEFINE_OPERATOR(operator/=, /)
  //    ADEPT_DEFINE_OPERATOR(operator&=, &);
  //    ADEPT_DEFINE_OPERATOR(operator|=, |);
#undef ADEPT_DEFINE_OPERATOR

    // Enable the A.where(B) = C construct.
  
    // A.where(B) returns a "Where<A,B>" object holding this array and
    // the boolean expression B.  Unless ADEPT_NO_DIMENSION_CHECKING
    // is defined, size_mismatch is thrown if B is inconsistent or its
    // dimensions differ from those of this array.
    template <class B>
    typename internal::enable_if<B::rank == rank, internal::Where<FixedArray,B> >::type
    where(const Expression<bool,B>& bool_expr) {
#ifndef ADEPT_NO_DIMENSION_CHECKING
      ExpressionSize<rank> mask_dims;
      if (!bool_expr.get_dimensions(mask_dims)) {
        std::string msg = "FixedArray size mismatch in ";
        msg += bool_expr.expression_string();
        msg += ".";
        throw size_mismatch(msg ADEPT_EXCEPTION_LOCATION);
      }
      if (mask_dims != dimensions()) {
        throw size_mismatch("Boolean expression of different size"
                            ADEPT_EXCEPTION_LOCATION);
      }
#endif
      return internal::Where<FixedArray,B>(*this, bool_expr.cast());
    }
    
    // "Where<A,B> = C" ends up calling A.assign_conditional(B,C).
    // There are two overloads: this one for C being a non-expression
    // (inactive scalar) value, and another for C being an array
    // expression.  Nothing is done if the array is empty.
    template <class B, typename C>
    typename internal::enable_if<internal::is_not_expression<C>::value, void>::type
    assign_conditional(const Expression<bool,B>& bool_expr,
                       C rhs) {
      if (empty()) {
        return;
      }
      assign_conditional_inactive_scalar_<IsActive>(bool_expr, rhs);
    }
    
    template <class B, typename T, class C>
    void assign_conditional(const Expression<bool,B>& bool_expr,
			    const Expression<T,C>& rhs) {
#ifndef ADEPT_NO_DIMENSION_CHECKING
      // Assume size of bool_expr already checked
      ExpressionSize<rank> dims;
      if (!rhs.get_dimensions(dims)) {
	std::string str = "FixedArray size mismatch in "
	  + rhs.expression_string() + ".";
	throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
      }
      else if (!internal::compatible(dims,dimensions())) {
	throw size_mismatch("Right-hand-side of \"where\" construct of incompatible size"
			    ADEPT_EXCEPTION_LOCATION);
      }
#endif
      // Select active/passive version by delegating to a
      // protected function
      assign_conditional_<IsActive>(bool_expr.cast(), rhs.cast());
      //      return *this;
    }

#ifdef ADEPT_CXX11_FEATURES
    // Assign a flat initializer list to a rank-1 FixedArray (any
    // other rank fails to compile).  The list may be shorter than the
    // array, in which case the remaining elements are zero; a longer
    // list causes size_mismatch to be thrown.
    template <typename T>
    typename internal::enable_if<std::is_convertible<T,Type>::value, FixedArray&>::type
    operator=(std::initializer_list<T> list) {
      ADEPT_STATIC_ASSERT(rank==1, RANK_MISMATCH_IN_INITIALIZER_LIST);

      if (list.size() > J0) {
        throw size_mismatch("Initializer list is larger than Vector in assignment"
                            ADEPT_EXCEPTION_LOCATION);
      }
      // The whole array is zeroed first in order that automatic
      // differentiation works
      *this = 0;
      Index index = 0;
      for (const T& value : list) {
        data_[index*offset_<0>::value] = value;
        ++index;
      }
      return *this;
    }

    // Assign a nested initializer list to a FixedArray of rank 2 or
    // more; the nesting depth must equal the rank (checked at compile
    // time).  size_mismatch is thrown if the outermost list has more
    // entries than the first dimension J0.
    template <class IType>
    FixedArray& operator=(std::initializer_list<std::initializer_list<IType> > list) {
      ADEPT_STATIC_ASSERT(rank==internal::initializer_list_rank<IType>::value+2,
                          RANK_MISMATCH_IN_INITIALIZER_LIST);
      if (list.size() > J0) {
        throw size_mismatch("Multi-dimensional initializer list larger than slowest-varying dimension of Array"
                            ADEPT_EXCEPTION_LOCATION);
      }
      // The whole array is zeroed first in order that automatic
      // differentiation works
      *this = 0;

      // The values are then filled in by the Array implementation,
      // via an inactive link to this array
      inactive_link() = list;
      return *this;
    }
#endif
  
    // -------------------------------------------------------------------
    // FixedArray: 4. Access functions, particularly operator()
    // -------------------------------------------------------------------
  
    // Return an l-value for the element at coordinates i; the
    // coordinates are converted to a location by index_()
    typename internal::active_reference<Type,IsActive>::type
    get_lvalue(const ExpressionSize<rank>& i) {
      const Index loc = index_(i);
      return get_lvalue_<IsActive>(loc);
    }
    
    // Return an r-value for the element at coordinates i; the
    // coordinates are converted to a location by index_()
    typename internal::active_scalar<Type,IsActive>::type
    get_rvalue(const ExpressionSize<rank>& i) const {
      const Index loc = index_(i);
      return get_rvalue_<IsActive>(loc);
    }

  protected:
    // Active l-value: an ActiveReference built from the element at
    // loc and its gradient index
    template <bool MyIsActive>
    typename internal::enable_if<MyIsActive, ActiveReference<Type> >::type
    get_lvalue_(const Index& loc)
    {
      return ActiveReference<Type>(data_[loc],
                                   gradient_index()+loc);
    }
    // Inactive l-value: a plain reference to the element at loc
    template <bool MyIsActive>
    typename internal::enable_if<!MyIsActive, Type&>::type
    get_lvalue_(const Index& loc)
    {
      return data_[loc];
    }

    // Active r-value: an Active scalar built from the element at loc
    // and its gradient index
    template <bool MyIsActive>
    typename internal::enable_if<MyIsActive, Active<Type> >::type
    get_rvalue_(const Index& loc) const
    {
      return Active<Type>(data_[loc],
                          gradient_index()+loc);
    }
    // Inactive r-value: a constant reference to the element at loc
    template <bool MyIsActive>
    typename internal::enable_if<!MyIsActive, const Type&>::type
    get_rvalue_(const Index& loc) const
    {
      return data_[loc];
    }

  public:
    // Get a constant reference to the element at the specified
    // location, ignoring whether it is active or not
    //    const Type& get(const ExpressionSize<rank>& i) const {
    //      return data_[index_(i)];
    //    }

    // The following provide a way to access individual elements of
    // the array.  There must be the same number of arguments to
    // operator() as the rank of the array.  Each argument must be of
    // integer type, or a rank-0 expression of integer type (such as
    // "end" or "end-3"). Inactive arrays return a reference to the
    // element, while active arrays return an ActiveReference<Type>
    // object.  Up to 7 dimensions are supported.

    // Inactive vector, l-value access via A(i): the index is resolved
    // against the length J0 by get_index_with_len
    template <typename I0>
    typename internal::enable_if<rank==1 && internal::all_scalar_ints<1,I0>::value && !IsActive, Type&>::type
    operator()(I0 i0) {
      const Index loc = internal::get_index_with_len(i0,J0);
      return data_[loc];
    }
    
    // Inactive vector, r-value access via A(i): the index is resolved
    // against the length J0 by get_index_with_len
    template <typename I0>
    typename internal::enable_if<rank==1 && internal::all_scalar_ints<1,I0>::value && !IsActive, const Type&>::type
    operator()(I0 i0) const {
      const Index loc = internal::get_index_with_len(i0,J0);
      return data_[loc];
    }

    // Inactive vector, l-value access via A[i]: the index is resolved
    // against the length J0 by get_index_with_len
    template <typename I0>
    typename internal::enable_if<rank==1 && internal::all_scalar_ints<1,I0>::value && !IsActive, Type&>::type
    operator[](I0 i0) {
      const Index loc = internal::get_index_with_len(i0,J0);
      return data_[loc];
    }

    // Inactive vector, r-value access via A[i]: the index is resolved
    // against the length J0 by get_index_with_len
    template <typename I0>
    typename internal::enable_if<rank==1 && internal::all_scalar_ints<1,I0>::value && !IsActive, const Type&>::type
    operator[](I0 i0) const {
      const Index loc = internal::get_index_with_len(i0,J0);
      return data_[loc];
    }
 
  protected:
    // Inactive case, non-const: reference to the element at "offset"
    template <bool MyIsActive>
    typename internal::enable_if<!MyIsActive,Type&>::type
    get_scalar_reference(const Index& offset) {
      return data_[offset];
    }

    // Inactive case, const: constant reference to the element at
    // "offset"
    template <bool MyIsActive>
    typename internal::enable_if<!MyIsActive,const Type&>::type
    get_scalar_reference(const Index& offset) const {
      return data_[offset];
    }

    // Active case, non-const: ActiveReference to the element at
    // "offset", paired with the gradient index shifted by the same
    // amount
    template <bool MyIsActive>
    typename internal::enable_if<MyIsActive,ActiveReference<Type> >::type
    get_scalar_reference(const Index& offset) {
      return ActiveReference<Type>(data_[offset],
                                   gradient_index()+offset);
    }
    // Active case, const: ActiveConstReference to the element at
    // "offset", paired with the gradient index shifted by the same
    // amount
    template <bool MyIsActive>
    typename internal::enable_if<MyIsActive,ActiveConstReference<Type> >::type
    get_scalar_reference(const Index& offset) const {
      return ActiveConstReference<Type>(data_[offset],
                                        gradient_index()+offset);
    }

  public:

    // Active vector, l-value access via A(i): returns an
    // ActiveReference to the element, obtained in the same way as in
    // the multi-dimensional accessors
    template <typename I0>
    typename internal::enable_if<rank==1 && internal::all_scalar_ints<1,I0>::value && IsActive,
                       ActiveReference<Type> >::type
    operator()(I0 i0) {
      return get_scalar_reference<IsActive>(internal::get_index_with_len(i0,J0));
    }
    
    // Active vector, r-value access via A(i): returns an
    // ActiveConstReference to the element, obtained in the same way
    // as in the multi-dimensional accessors
    template <typename I0>
    typename internal::enable_if<rank==1 && internal::all_scalar_ints<1,I0>::value && IsActive,
                       ActiveConstReference<Type> >::type
    operator()(I0 i0) const {
      return get_scalar_reference<IsActive>(internal::get_index_with_len(i0,J0));
    }
  
    // Active vector, l-value access via A[i]: returns an
    // ActiveReference to the element, obtained in the same way as in
    // the multi-dimensional accessors
    template <typename I0>
    typename internal::enable_if<rank==1 && internal::all_scalar_ints<1,I0>::value && IsActive,
                       ActiveReference<Type> >::type
    operator[](I0 i0) {
      return get_scalar_reference<IsActive>(internal::get_index_with_len(i0,J0));
    }
    
    // Active vector, r-value access via A[i]: returns an
    // ActiveConstReference to the element, obtained in the same way
    // as in the multi-dimensional accessors
    template <typename I0>
    typename internal::enable_if<rank==1 && internal::all_scalar_ints<1,I0>::value && IsActive,
                       ActiveConstReference<Type> >::type
    operator[](I0 i0) const {
      return get_scalar_reference<IsActive>(internal::get_index_with_len(i0,J0));
    }
      
    // 2D element access (l-value).  The flat location is built up one
    // dimension at a time, the last index varying fastest.
    template <typename I0, typename I1>
    typename internal::enable_if<rank==2 && internal::all_scalar_ints<2,I0,I1>::value,
                       typename internal::active_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1) {
      Index loc = internal::get_index_with_len(i0,J0);
      loc = loc*J1 + internal::get_index_with_len(i1,J1);
      return get_scalar_reference<IsActive>(loc);
    }
    // 2D element access (r-value).  The flat location is built up one
    // dimension at a time, the last index varying fastest.
    template <typename I0, typename I1>
    typename internal::enable_if<rank==2 && internal::all_scalar_ints<2,I0,I1>::value,
                       typename internal::active_const_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1) const {
      Index loc = internal::get_index_with_len(i0,J0);
      loc = loc*J1 + internal::get_index_with_len(i1,J1);
      return get_scalar_reference<IsActive>(loc);
    }
  
    // 3D element access (l-value).  The flat location is built up one
    // dimension at a time, the last index varying fastest.
    template <typename I0, typename I1, typename I2>
    typename internal::enable_if<rank==3 && internal::all_scalar_ints<3,I0,I1,I2>::value,
                       typename internal::active_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2) {
      Index loc = internal::get_index_with_len(i0,J0);
      loc = loc*J1 + internal::get_index_with_len(i1,J1);
      loc = loc*J2 + internal::get_index_with_len(i2,J2);
      return get_scalar_reference<IsActive>(loc);
    }
    // 3D element access (r-value).  The flat location is built up one
    // dimension at a time, the last index varying fastest.
    template <typename I0, typename I1, typename I2>
    typename internal::enable_if<rank==3 && internal::all_scalar_ints<3,I0,I1,I2>::value,
                       typename internal::active_const_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2) const {
      Index loc = internal::get_index_with_len(i0,J0);
      loc = loc*J1 + internal::get_index_with_len(i1,J1);
      loc = loc*J2 + internal::get_index_with_len(i2,J2);
      return get_scalar_reference<IsActive>(loc);
    }

    // 4D element access (l-value).  The flat location is built up one
    // dimension at a time, the last index varying fastest.
    template <typename I0, typename I1, typename I2, typename I3>
    typename internal::enable_if<rank==4 && internal::all_scalar_ints<4,I0,I1,I2,I3>::value,
                       typename internal::active_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3) {
      Index loc = internal::get_index_with_len(i0,J0);
      loc = loc*J1 + internal::get_index_with_len(i1,J1);
      loc = loc*J2 + internal::get_index_with_len(i2,J2);
      loc = loc*J3 + internal::get_index_with_len(i3,J3);
      return get_scalar_reference<IsActive>(loc);
    }
    // 4D element access (r-value).  The flat location is built up one
    // dimension at a time, the last index varying fastest.
    template <typename I0, typename I1, typename I2, typename I3>
    typename internal::enable_if<rank==4 && internal::all_scalar_ints<4,I0,I1,I2,I3>::value,
                       typename internal::active_const_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3) const {
      Index loc = internal::get_index_with_len(i0,J0);
      loc = loc*J1 + internal::get_index_with_len(i1,J1);
      loc = loc*J2 + internal::get_index_with_len(i2,J2);
      loc = loc*J3 + internal::get_index_with_len(i3,J3);
      return get_scalar_reference<IsActive>(loc);
    }

    // 5D element access (l-value).  The flat location is built up one
    // dimension at a time, the last index varying fastest.
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4>
    typename internal::enable_if<rank==5 && internal::all_scalar_ints<5,I0,I1,I2,I3,I4>::value,
                       typename internal::active_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4) {
      Index loc = internal::get_index_with_len(i0,J0);
      loc = loc*J1 + internal::get_index_with_len(i1,J1);
      loc = loc*J2 + internal::get_index_with_len(i2,J2);
      loc = loc*J3 + internal::get_index_with_len(i3,J3);
      loc = loc*J4 + internal::get_index_with_len(i4,J4);
      return get_scalar_reference<IsActive>(loc);
    }
    // 5D element access (r-value).  The flat location is built up one
    // dimension at a time, the last index varying fastest.
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4>
    typename internal::enable_if<rank==5 && internal::all_scalar_ints<5,I0,I1,I2,I3,I4>::value,
                       typename internal::active_const_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4) const {
      Index loc = internal::get_index_with_len(i0,J0);
      loc = loc*J1 + internal::get_index_with_len(i1,J1);
      loc = loc*J2 + internal::get_index_with_len(i2,J2);
      loc = loc*J3 + internal::get_index_with_len(i3,J3);
      loc = loc*J4 + internal::get_index_with_len(i4,J4);
      return get_scalar_reference<IsActive>(loc);
    }

    // 6D element access (l-value).  The flat location is built up one
    // dimension at a time, the last index varying fastest.
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4, typename I5>
    typename internal::enable_if<rank==6 && internal::all_scalar_ints<6,I0,I1,I2,I3,I4,I5>::value,
                       typename internal::active_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5) {
      Index loc = internal::get_index_with_len(i0,J0);
      loc = loc*J1 + internal::get_index_with_len(i1,J1);
      loc = loc*J2 + internal::get_index_with_len(i2,J2);
      loc = loc*J3 + internal::get_index_with_len(i3,J3);
      loc = loc*J4 + internal::get_index_with_len(i4,J4);
      loc = loc*J5 + internal::get_index_with_len(i5,J5);
      return get_scalar_reference<IsActive>(loc);
    }
    // 6D element access (r-value).  The flat location is built up one
    // dimension at a time, the last index varying fastest.
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4, typename I5>
    typename internal::enable_if<rank==6 && internal::all_scalar_ints<6,I0,I1,I2,I3,I4,I5>::value,
                       typename internal::active_const_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5) const {
      Index loc = internal::get_index_with_len(i0,J0);
      loc = loc*J1 + internal::get_index_with_len(i1,J1);
      loc = loc*J2 + internal::get_index_with_len(i2,J2);
      loc = loc*J3 + internal::get_index_with_len(i3,J3);
      loc = loc*J4 + internal::get_index_with_len(i4,J4);
      loc = loc*J5 + internal::get_index_with_len(i5,J5);
      return get_scalar_reference<IsActive>(loc);
    }

    // 7D element access (l-value).  The flat location is built up one
    // dimension at a time, the last index varying fastest.
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4, typename I5, typename I6>
    typename internal::enable_if<rank==7 && internal::all_scalar_ints<7,I0,I1,I2,I3,I4,I5,I6>::value,
                       typename internal::active_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6) {
      Index loc = internal::get_index_with_len(i0,J0);
      loc = loc*J1 + internal::get_index_with_len(i1,J1);
      loc = loc*J2 + internal::get_index_with_len(i2,J2);
      loc = loc*J3 + internal::get_index_with_len(i3,J3);
      loc = loc*J4 + internal::get_index_with_len(i4,J4);
      loc = loc*J5 + internal::get_index_with_len(i5,J5);
      loc = loc*J6 + internal::get_index_with_len(i6,J6);
      return get_scalar_reference<IsActive>(loc);
    }
    // 7D element access (r-value).  The flat location is built up one
    // dimension at a time, the last index varying fastest.
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4, typename I5, typename I6>
    typename internal::enable_if<rank==7 && internal::all_scalar_ints<7,I0,I1,I2,I3,I4,I5,I6>::value,
                       typename internal::active_const_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6) const {
      Index loc = internal::get_index_with_len(i0,J0);
      loc = loc*J1 + internal::get_index_with_len(i1,J1);
      loc = loc*J2 + internal::get_index_with_len(i2,J2);
      loc = loc*J3 + internal::get_index_with_len(i3,J3);
      loc = loc*J4 + internal::get_index_with_len(i4,J4);
      loc = loc*J5 + internal::get_index_with_len(i5,J5);
      loc = loc*J6 + internal::get_index_with_len(i6,J6);
      return get_scalar_reference<IsActive>(loc);
    }
   

    // The following define the case when operator() is called and one
    // of the arguments is a "range" object (an object that describes
    // a range of indices that are either contiguous or separated by a
    // fixed stride), while all others are of integer type (or a
    // rank-0 expression of integer type). An Array object is returned
    // with a rank that may be reduced from that of the original
    // array, by one for each dimension that was indexed by an
    // integer. The new array points to a subset of the original data,
    // so modifying it will modify the original array.

    // Vector indexed by a single "range" object (l-value version):
    // returns a rank-1 Array over this array's data that starts at
    // the range's first index, uses the range's stride as its offset
    // and has (end + stride - begin)/stride elements
    template <typename I0>
    typename internal::enable_if<internal::is_ranged<rank,I0>::value,
                       Array<1,Type,IsActive> >::type
    operator()(I0 i0) {
      const Index first = i0.begin(J0);
      const Index step  = i0.stride(J0);
      ExpressionSize<1> slice_dim((i0.end(J0) + step - first)/step);
      ExpressionSize<1> slice_offset(step);
      return Array<1,Type,IsActive>(data_, first, slice_dim, slice_offset,
                                    internal::GradientIndex<IsActive>::get());
    }
    // Vector indexed by a single "range" object (r-value version):
    // returns a constant rank-1 Array over this array's data that
    // starts at the range's first index, uses the range's stride as
    // its offset and has (end + stride - begin)/stride elements
    template <typename I0>
    typename internal::enable_if<internal::is_ranged<rank,I0>::value,
                       const Array<1,Type,IsActive> >::type
    operator()(I0 i0) const {
      const Index first = i0.begin(J0);
      const Index step  = i0.stride(J0);
      ExpressionSize<1> slice_dim((i0.end(J0) + step - first)/step);
      ExpressionSize<1> slice_offset(step);
      return Array<1,Type,IsActive>(data_, first, slice_dim, slice_offset,
                                    internal::GradientIndex<IsActive>::get());
    }

  private:
    // For multi-dimensional arrays, we need a helper function

    // Handle dimension "Rank" when its index is of integer type: the
    // dimension is dropped from the result, so only the start
    // location ibegin is advanced; the remaining arguments are left
    // untouched
    template <int Rank, typename T, int NewRank>
    typename internal::enable_if<internal::is_scalar_int<T>::value, void>::type
    update_index(const T& i, Index& inew_rank, Index& ibegin,
                 ExpressionSize<NewRank>& new_dim,
                 ExpressionSize<NewRank>& new_offset) const {
      const Index pos = internal::get_index_with_len(i, dimension_<Rank>::value);
      ibegin += pos*offset_<Rank>::value;
    }

    // Handle dimension "Rank" when its index is a "range" object: the
    // start location ibegin is advanced to the first element of the
    // range, and the length and offset of the corresponding dimension
    // of the new array are stored at position inew_rank, which is
    // then incremented
    template <int Rank, typename T, int NewRank>
    typename internal::enable_if<internal::is_range<T>::value, void>::type
    update_index(const T& i, Index& inew_rank, Index& ibegin,
                 ExpressionSize<NewRank>& new_dim,
                 ExpressionSize<NewRank>& new_offset) const {
      const Index len   = dimension_<Rank>::value;
      const Index pitch = offset_<Rank>::value;
      const Index first = i.begin(len);
      const Index step  = i.stride(len);

      ibegin += first*pitch;
      new_dim[inew_rank]    = (i.end(len) + step - first) / step;
      new_offset[inew_rank] = step*pitch;
      ++inew_rank;
    }
  
  public:

    // Now the individual overloads for each number of arguments, up
    // to 7, with separate r-value (const) and l-value (non-const)
    // versions
    // Two-argument slice (l-value version): each argument is folded
    // in by update_index, then an Array over this array's data is
    // returned whose rank is given by is_ranged<...>::count
    template <typename I0, typename I1>
    typename internal::enable_if<internal::is_ranged<rank,I0,I1>::value,
                       Array<internal::is_ranged<rank,I0,I1>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1) {
      static const int slice_rank = internal::is_ranged<rank,I0,I1>::count;
      ExpressionSize<slice_rank> slice_dim;
      ExpressionSize<slice_rank> slice_offset;
      Index next_dim = 0;
      Index start = 0;
      update_index<0>(i0, next_dim, start, slice_dim, slice_offset);
      update_index<1>(i1, next_dim, start, slice_dim, slice_offset);
      return Array<slice_rank,Type,IsActive>(data_, start, slice_dim, slice_offset,
                                             internal::GradientIndex<IsActive>::get());
    }

    // Two-argument slice (r-value version): each argument is folded
    // in by update_index, then a constant Array over this array's
    // data is returned whose rank is given by is_ranged<...>::count
    template <typename I0, typename I1>
    typename internal::enable_if<internal::is_ranged<rank,I0,I1>::value,
                       const Array<internal::is_ranged<rank,I0,I1>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1) const {
      static const int slice_rank = internal::is_ranged<rank,I0,I1>::count;
      ExpressionSize<slice_rank> slice_dim;
      ExpressionSize<slice_rank> slice_offset;
      Index next_dim = 0;
      Index start = 0;
      update_index<0>(i0, next_dim, start, slice_dim, slice_offset);
      update_index<1>(i1, next_dim, start, slice_dim, slice_offset);
      return Array<slice_rank,Type,IsActive>(data_, start, slice_dim, slice_offset,
                                             internal::GradientIndex<IsActive>::get());
    }

    // Three-argument slice (l-value version): each argument is folded
    // in by update_index, then an Array over this array's data is
    // returned whose rank is given by is_ranged<...>::count
    template <typename I0, typename I1, typename I2>
    typename internal::enable_if<internal::is_ranged<rank,I0,I1,I2>::value,
               Array<internal::is_ranged<rank,I0,I1,I2>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2) {
      static const int slice_rank = internal::is_ranged<rank,I0,I1,I2>::count;
      ExpressionSize<slice_rank> slice_dim;
      ExpressionSize<slice_rank> slice_offset;
      Index next_dim = 0;
      Index start = 0;
      update_index<0>(i0, next_dim, start, slice_dim, slice_offset);
      update_index<1>(i1, next_dim, start, slice_dim, slice_offset);
      update_index<2>(i2, next_dim, start, slice_dim, slice_offset);
      return Array<slice_rank,Type,IsActive>(data_, start, slice_dim, slice_offset,
                                             internal::GradientIndex<IsActive>::get());
    }

    // Three-argument slice (r-value version): each argument is folded
    // in by update_index, then a constant Array over this array's
    // data is returned whose rank is given by is_ranged<...>::count
    template <typename I0, typename I1, typename I2>
    typename internal::enable_if<internal::is_ranged<rank,I0,I1,I2>::value,
               const Array<internal::is_ranged<rank,I0,I1,I2>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2) const {
      static const int slice_rank = internal::is_ranged<rank,I0,I1,I2>::count;
      ExpressionSize<slice_rank> slice_dim;
      ExpressionSize<slice_rank> slice_offset;
      Index next_dim = 0;
      Index start = 0;
      update_index<0>(i0, next_dim, start, slice_dim, slice_offset);
      update_index<1>(i1, next_dim, start, slice_dim, slice_offset);
      update_index<2>(i2, next_dim, start, slice_dim, slice_offset);
      return Array<slice_rank,Type,IsActive>(data_, start, slice_dim, slice_offset,
                                             internal::GradientIndex<IsActive>::get());
    }

    // Four-argument slice (l-value version): each argument is folded
    // in by update_index, then an Array over this array's data is
    // returned whose rank is given by is_ranged<...>::count
    template <typename I0, typename I1, typename I2, typename I3>
    typename internal::enable_if<internal::is_ranged<rank,I0,I1,I2,I3>::value,
       Array<internal::is_ranged<rank,I0,I1,I2,I3>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3) {
      static const int slice_rank = internal::is_ranged<rank,I0,I1,I2,I3>::count;
      ExpressionSize<slice_rank> slice_dim;
      ExpressionSize<slice_rank> slice_offset;
      Index next_dim = 0;
      Index start = 0;
      update_index<0>(i0, next_dim, start, slice_dim, slice_offset);
      update_index<1>(i1, next_dim, start, slice_dim, slice_offset);
      update_index<2>(i2, next_dim, start, slice_dim, slice_offset);
      update_index<3>(i3, next_dim, start, slice_dim, slice_offset);
      return Array<slice_rank,Type,IsActive>(data_, start, slice_dim, slice_offset,
                                             internal::GradientIndex<IsActive>::get());
    }

    // Four-argument slice (r-value version): each argument is folded
    // in by update_index, then a constant Array over this array's
    // data is returned whose rank is given by is_ranged<...>::count
    template <typename I0, typename I1, typename I2, typename I3>
    typename internal::enable_if<internal::is_ranged<rank,I0,I1,I2,I3>::value,
       const Array<internal::is_ranged<rank,I0,I1,I2,I3>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3) const {
      static const int slice_rank = internal::is_ranged<rank,I0,I1,I2,I3>::count;
      ExpressionSize<slice_rank> slice_dim;
      ExpressionSize<slice_rank> slice_offset;
      Index next_dim = 0;
      Index start = 0;
      update_index<0>(i0, next_dim, start, slice_dim, slice_offset);
      update_index<1>(i1, next_dim, start, slice_dim, slice_offset);
      update_index<2>(i2, next_dim, start, slice_dim, slice_offset);
      update_index<3>(i3, next_dim, start, slice_dim, slice_offset);
      return Array<slice_rank,Type,IsActive>(data_, start, slice_dim, slice_offset,
                                             internal::GradientIndex<IsActive>::get());
    }

    // Five-argument slice (l-value version): each argument is folded
    // in by update_index, then an Array over this array's data is
    // returned whose rank is given by is_ranged<...>::count
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4>
    typename internal::enable_if<internal::is_ranged<rank,I0,I1,I2,I3,I4>::value,
       Array<internal::is_ranged<rank,I0,I1,I2,I3,I4>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4) {
      static const int slice_rank = internal::is_ranged<rank,I0,I1,I2,I3,I4>::count;
      ExpressionSize<slice_rank> slice_dim;
      ExpressionSize<slice_rank> slice_offset;
      Index next_dim = 0;
      Index start = 0;
      update_index<0>(i0, next_dim, start, slice_dim, slice_offset);
      update_index<1>(i1, next_dim, start, slice_dim, slice_offset);
      update_index<2>(i2, next_dim, start, slice_dim, slice_offset);
      update_index<3>(i3, next_dim, start, slice_dim, slice_offset);
      update_index<4>(i4, next_dim, start, slice_dim, slice_offset);
      return Array<slice_rank,Type,IsActive>(data_, start, slice_dim, slice_offset,
                                             internal::GradientIndex<IsActive>::get());
    }
  
    // Five-argument slice (r-value version): each argument is folded
    // in by update_index, then a constant Array over this array's
    // data is returned whose rank is given by is_ranged<...>::count
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4>
    typename internal::enable_if<internal::is_ranged<rank,I0,I1,I2,I3,I4>::value,
       const Array<internal::is_ranged<rank,I0,I1,I2,I3,I4>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4) const {
      static const int slice_rank = internal::is_ranged<rank,I0,I1,I2,I3,I4>::count;
      ExpressionSize<slice_rank> slice_dim;
      ExpressionSize<slice_rank> slice_offset;
      Index next_dim = 0;
      Index start = 0;
      update_index<0>(i0, next_dim, start, slice_dim, slice_offset);
      update_index<1>(i1, next_dim, start, slice_dim, slice_offset);
      update_index<2>(i2, next_dim, start, slice_dim, slice_offset);
      update_index<3>(i3, next_dim, start, slice_dim, slice_offset);
      update_index<4>(i4, next_dim, start, slice_dim, slice_offset);
      return Array<slice_rank,Type,IsActive>(data_, start, slice_dim, slice_offset,
                                             internal::GradientIndex<IsActive>::get());
    }
  
    // Six-argument slice (l-value version): each argument is folded
    // in by update_index, then an Array over this array's data is
    // returned whose rank is given by is_ranged<...>::count
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4, typename I5>
    typename internal::enable_if<internal::is_ranged<rank,I0,I1,I2,I3,I4,I5>::value,
       Array<internal::is_ranged<rank,I0,I1,I2,I3,I4,I5>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5) {
      static const int slice_rank = internal::is_ranged<rank,I0,I1,I2,I3,I4,I5>::count;
      ExpressionSize<slice_rank> slice_dim;
      ExpressionSize<slice_rank> slice_offset;
      Index next_dim = 0;
      Index start = 0;
      update_index<0>(i0, next_dim, start, slice_dim, slice_offset);
      update_index<1>(i1, next_dim, start, slice_dim, slice_offset);
      update_index<2>(i2, next_dim, start, slice_dim, slice_offset);
      update_index<3>(i3, next_dim, start, slice_dim, slice_offset);
      update_index<4>(i4, next_dim, start, slice_dim, slice_offset);
      update_index<5>(i5, next_dim, start, slice_dim, slice_offset);
      return Array<slice_rank,Type,IsActive>(data_, start, slice_dim, slice_offset,
                                             internal::GradientIndex<IsActive>::get());
    }


    // Const six-argument regular indexing.  The result is a const
    // Array whose rank is is_ranged<...>::count for the supplied index
    // types; it is constructed from data_, a start offset and the
    // dimensions/offsets accumulated by update_index.
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4, typename I5>
    typename internal::enable_if<internal::is_ranged<rank,I0,I1,I2,I3,I4,I5>::value,
       const Array<internal::is_ranged<rank,I0,I1,I2,I3,I4,I5>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5) const {
      static const int result_rank = internal::is_ranged<rank,I0,I1,I2,I3,I4,I5>::count;
      // State threaded through update_index, one call per argument
      Index start = 0;
      Index next_dim = 0;
      ExpressionSize<result_rank> sub_dims;
      ExpressionSize<result_rank> sub_offsets;
      update_index<0>(i0, next_dim, start, sub_dims, sub_offsets);
      update_index<1>(i1, next_dim, start, sub_dims, sub_offsets);
      update_index<2>(i2, next_dim, start, sub_dims, sub_offsets);
      update_index<3>(i3, next_dim, start, sub_dims, sub_offsets);
      update_index<4>(i4, next_dim, start, sub_dims, sub_offsets);
      update_index<5>(i5, next_dim, start, sub_dims, sub_offsets);
      return Array<result_rank,Type,IsActive>(data_, start, sub_dims, sub_offsets,
                                              internal::GradientIndex<IsActive>::get());
    }

    // Non-const seven-argument regular indexing.  The result is an
    // Array whose rank is is_ranged<...>::count for the supplied index
    // types; it is constructed from data_, a start offset and the
    // dimensions/offsets accumulated by update_index.
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4, typename I5, typename I6>
    typename internal::enable_if<internal::is_ranged<rank,I0,I1,I2,I3,I4,I5,I6>::value,
       Array<internal::is_ranged<rank,I0,I1,I2,I3,I4,I5,I6>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6) {
      static const int result_rank = internal::is_ranged<rank,I0,I1,I2,I3,I4,I5,I6>::count;
      // State threaded through update_index, one call per argument
      Index start = 0;
      Index next_dim = 0;
      ExpressionSize<result_rank> sub_dims;
      ExpressionSize<result_rank> sub_offsets;
      update_index<0>(i0, next_dim, start, sub_dims, sub_offsets);
      update_index<1>(i1, next_dim, start, sub_dims, sub_offsets);
      update_index<2>(i2, next_dim, start, sub_dims, sub_offsets);
      update_index<3>(i3, next_dim, start, sub_dims, sub_offsets);
      update_index<4>(i4, next_dim, start, sub_dims, sub_offsets);
      update_index<5>(i5, next_dim, start, sub_dims, sub_offsets);
      update_index<6>(i6, next_dim, start, sub_dims, sub_offsets);
      return Array<result_rank,Type,IsActive>(data_, start, sub_dims, sub_offsets,
                                              internal::GradientIndex<IsActive>::get());
    }

    // Const seven-argument regular indexing.  The result is a const
    // Array whose rank is is_ranged<...>::count for the supplied index
    // types; it is constructed from data_, a start offset and the
    // dimensions/offsets accumulated by update_index.
    template <typename I0, typename I1, typename I2, typename I3,
              typename I4, typename I5, typename I6>
    typename internal::enable_if<internal::is_ranged<rank,I0,I1,I2,I3,I4,I5,I6>::value,
       const Array<internal::is_ranged<rank,I0,I1,I2,I3,I4,I5,I6>::count,Type,IsActive> >::type
    operator()(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6) const {
      static const int result_rank = internal::is_ranged<rank,I0,I1,I2,I3,I4,I5,I6>::count;
      // State threaded through update_index, one call per argument
      Index start = 0;
      Index next_dim = 0;
      ExpressionSize<result_rank> sub_dims;
      ExpressionSize<result_rank> sub_offsets;
      update_index<0>(i0, next_dim, start, sub_dims, sub_offsets);
      update_index<1>(i1, next_dim, start, sub_dims, sub_offsets);
      update_index<2>(i2, next_dim, start, sub_dims, sub_offsets);
      update_index<3>(i3, next_dim, start, sub_dims, sub_offsets);
      update_index<4>(i4, next_dim, start, sub_dims, sub_offsets);
      update_index<5>(i5, next_dim, start, sub_dims, sub_offsets);
      update_index<6>(i6, next_dim, start, sub_dims, sub_offsets);
      return Array<result_rank,Type,IsActive>(data_, start, sub_dims, sub_offsets,
                                              internal::GradientIndex<IsActive>::get());
    }
  
    // If one or more of the indices is not guaranteed to be monotonic
    // at compile time then we must return an IndexedArray, now done
    // for all possible numbers of arguments

    // Indexing a 1D array
    // Non-const overload: enabled for a rank-1 array when I0 is an
    // integer vector that is not a ranged index; returns an
    // IndexedArray constructed from this array and i0.
    template <typename I0>
    typename internal::enable_if<rank == 1 && internal::is_int_vector<I0>::value
                       && !internal::is_ranged<rank,I0>::value,
                       internal::IndexedArray<rank,Type,IsActive,FixedArray,I0> >::type
    operator()(const I0& i0) {
      typedef internal::IndexedArray<rank,Type,IsActive,FixedArray,I0> indexed_type;
      return indexed_type(*this, i0);
    }
    // Const overload: same enabling condition as the non-const form,
    // but returns a const IndexedArray.  The IndexedArray constructor
    // is handed a non-const reference, so constness is cast away from
    // *this before passing it on.
    template <typename I0>
    typename internal::enable_if<rank == 1 && internal::is_int_vector<I0>::value
                       && !internal::is_ranged<rank,I0>::value,
                       const internal::IndexedArray<rank,Type,IsActive,
                                          FixedArray,I0> >::type
    operator()(const I0& i0) const {
      typedef internal::IndexedArray<rank,Type,IsActive,FixedArray,I0> indexed_type;
      FixedArray& self = const_cast<FixedArray&>(*this);
      return indexed_type(self, i0);
    }
  
    // Indexing a 2D array
    // Non-const overload: enabled for a rank-2 array when
    // is_irreg_indexed<rank,I0,I1>::value holds; the rank of the
    // returned IndexedArray is is_irreg_indexed<...>::count.
    template <typename I0, typename I1>
    typename internal::enable_if<rank == 2 && internal::is_irreg_indexed<rank,I0,I1>::value,
                       internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1>::count,
                                    Type,IsActive,FixedArray,I0,I1> >::type
    operator()(const I0& i0, const I1& i1) {
      typedef internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1>::count,
                                     Type,IsActive,FixedArray,I0,I1> indexed_type;
      return indexed_type(*this, i0, i1);
    }
    // Const overload of the rank-2 irregular indexing: returns a const
    // IndexedArray; constness is cast away from *this when it is
    // passed to the IndexedArray constructor.
    template <typename I0, typename I1>
    typename internal::enable_if<rank == 2 && internal::is_irreg_indexed<rank,I0,I1>::value,
                       const internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1>::count,
                                    Type,IsActive,FixedArray,I0,I1> >::type
    operator()(const I0& i0, const I1& i1) const {
      typedef internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1>::count,
                                     Type,IsActive,FixedArray,I0,I1> indexed_type;
      FixedArray& self = const_cast<FixedArray&>(*this);
      return indexed_type(self, i0, i1);
    }

    // Indexing a 3D array
    // Non-const overload: enabled for a rank-3 array when
    // is_irreg_indexed<rank,I0,I1,I2>::value holds; the rank of the
    // returned IndexedArray is is_irreg_indexed<...>::count.
    template <typename I0, typename I1, typename I2>
    typename internal::enable_if<rank == 3 && internal::is_irreg_indexed<rank,I0,I1,I2>::value,
                       internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1,I2>::count,
                                    Type,IsActive,FixedArray,I0,I1,I2> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2) {
      typedef internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1,I2>::count,
                                     Type,IsActive,FixedArray,I0,I1,I2> indexed_type;
      return indexed_type(*this, i0, i1, i2);
    }
    // Const overload of the rank-3 irregular indexing: returns a const
    // IndexedArray; constness is cast away from *this when it is
    // passed to the IndexedArray constructor.
    template <typename I0, typename I1, typename I2>
    typename internal::enable_if<rank == 3 && internal::is_irreg_indexed<rank,I0,I1,I2>::value,
                       const internal::IndexedArray<internal::is_irreg_indexed<rank,
                                                           I0,I1,I2>::count,
                                    Type,IsActive,FixedArray,I0,I1,I2> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2) const {
      typedef internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1,I2>::count,
                                     Type,IsActive,FixedArray,I0,I1,I2> indexed_type;
      FixedArray& self = const_cast<FixedArray&>(*this);
      return indexed_type(self, i0, i1, i2);
    }

    // Indexing a 4D array
    // Non-const overload: enabled for a rank-4 array when
    // is_irreg_indexed<rank,I0,I1,I2,I3>::value holds; the rank of the
    // returned IndexedArray is is_irreg_indexed<...>::count.
    template <typename I0, typename I1, typename I2, typename I3>
    typename internal::enable_if<rank == 4 && internal::is_irreg_indexed<rank,I0,I1,I2,I3>::value,
                       internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1,I2,I3>::count,
                                    Type,IsActive,FixedArray,I0,I1,I2,I3> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) {
      typedef internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1,I2,I3>::count,
                                     Type,IsActive,FixedArray,I0,I1,I2,I3> indexed_type;
      return indexed_type(*this, i0, i1, i2, i3);
    }
    // Const overload of the rank-4 irregular indexing: returns a const
    // IndexedArray; constness is cast away from *this when it is
    // passed to the IndexedArray constructor.
    template <typename I0, typename I1, typename I2, typename I3>
    typename internal::enable_if<rank == 4 && internal::is_irreg_indexed<rank,I0,I1,I2,I3>::value,
                       const internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1,
                                                           I2,I3>::count,
                                    Type,IsActive,FixedArray,I0,I1,I2,I3> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const {
      typedef internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1,I2,I3>::count,
                                     Type,IsActive,FixedArray,I0,I1,I2,I3> indexed_type;
      FixedArray& self = const_cast<FixedArray&>(*this);
      return indexed_type(self, i0, i1, i2, i3);
    }

    // Indexing a 5D array
    // Non-const overload: enabled for a rank-5 array when
    // is_irreg_indexed<rank,I0,...,I4>::value holds; the rank of the
    // returned IndexedArray is is_irreg_indexed<...>::count.
    template <typename I0, typename I1, typename I2, typename I3, typename I4>
    typename internal::enable_if<rank == 5
                       && internal::is_irreg_indexed<rank,I0,I1,I2,I3,I4>::value,
                       internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1,I2,
                                                     I3,I4>::count,
                            Type,IsActive,FixedArray,I0,I1,I2,I3,I4> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2,
               const I3& i3, const I4& i4) {
      typedef internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1,I2,I3,I4>::count,
                                     Type,IsActive,FixedArray,I0,I1,I2,I3,I4> indexed_type;
      return indexed_type(*this, i0, i1, i2, i3, i4);
    }
    // Const overload of the rank-5 irregular indexing: returns a const
    // IndexedArray; constness is cast away from *this when it is
    // passed to the IndexedArray constructor.
    template <typename I0, typename I1, typename I2, typename I3, typename I4>
    typename internal::enable_if<rank == 5
                       && internal::is_irreg_indexed<rank,I0,I1,I2,I3,I4>::value,
                       const internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1,I2,
                                                           I3,I4>::count,
                                  Type,IsActive,FixedArray,I0,I1,I2,I3,I4> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2,
               const I3& i3, const I4& i4) const {
      typedef internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1,I2,I3,I4>::count,
                                     Type,IsActive,FixedArray,I0,I1,I2,I3,I4> indexed_type;
      FixedArray& self = const_cast<FixedArray&>(*this);
      return indexed_type(self, i0, i1, i2, i3, i4);
    }

    // Indexing a 6D array
    // Non-const overload: enabled for a rank-6 array when
    // is_irreg_indexed<rank,I0,...,I5>::value holds; the rank of the
    // returned IndexedArray is is_irreg_indexed<...>::count.
    template <typename I0, typename I1, typename I2,
              typename I3, typename I4, typename I5>
    typename internal::enable_if<rank == 6
                       && internal::is_irreg_indexed<rank,I0,I1,I2,I3,I4,I5>::value,
                       internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1,I2,I3,
                                                           I4,I5>::count,
                          Type,IsActive,FixedArray,I0,I1,I2,I3,I4,I5> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2,
               const I3& i3, const I4& i4, const I5& i5) {
      typedef internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1,I2,I3,I4,I5>::count,
                                     Type,IsActive,FixedArray,I0,I1,I2,I3,I4,I5> indexed_type;
      return indexed_type(*this, i0, i1, i2, i3, i4, i5);
    }
    // Const overload of the rank-6 irregular indexing: returns a const
    // IndexedArray; constness is cast away from *this when it is
    // passed to the IndexedArray constructor.
    template <typename I0, typename I1, typename I2,
              typename I3, typename I4, typename I5>
    typename internal::enable_if<rank == 6
                       && internal::is_irreg_indexed<rank,I0,I1,I2,I3,I4,I5>::value,
                       const internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1,I2,I3,
                                                           I4,I5>::count,
                          Type,IsActive,FixedArray,I0,I1,I2,I3,I4,I5> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2,
               const I3& i3, const I4& i4, const I5& i5) const {
      typedef internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1,I2,I3,I4,I5>::count,
                                     Type,IsActive,FixedArray,I0,I1,I2,I3,I4,I5> indexed_type;
      FixedArray& self = const_cast<FixedArray&>(*this);
      return indexed_type(self, i0, i1, i2, i3, i4, i5);
    }

    // Indexing a 7D array
    // Non-const overload: enabled for a rank-7 array when
    // is_irreg_indexed<rank,I0,...,I6>::value holds; the rank of the
    // returned IndexedArray is is_irreg_indexed<...>::count.  All
    // seven index types, including I6, take part in the enabling
    // condition so that it agrees with the "count" used for the result
    // rank and with the lower-rank overloads.
    template <typename I0, typename I1, typename I2,
              typename I3, typename I4, typename I5, typename I6>
    typename internal::enable_if<rank == 7
                       && internal::is_irreg_indexed<rank,I0,I1,I2,I3,I4,I5,I6>::value,
                       internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1,I2,I3,
                                                     I4,I5,I6>::count,
                          Type,IsActive,FixedArray,I0,I1,I2,I3,I4,I5,I6> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
               const I4& i4, const I5& i5, const I6& i6) {
      static const int new_rank = internal::is_irreg_indexed<rank,I0,I1,I2,I3,
                                                   I4,I5,I6>::count;
      return internal::IndexedArray<new_rank,Type,IsActive,FixedArray,I0,I1,I2,I3,I4,I5,
                          I6>(*this,i0,i1,i2,i3,i4,i5,i6);
    }
    // Const overload of the rank-7 irregular indexing: returns a const
    // IndexedArray; constness is cast away from *this when it is
    // passed to the IndexedArray constructor.  All seven index types,
    // including I6, take part in the enabling condition so that it
    // agrees with the "count" used for the result rank and with the
    // lower-rank overloads.
    template <typename I0, typename I1, typename I2,
              typename I3, typename I4, typename I5, typename I6>
    typename internal::enable_if<rank == 7
                       && internal::is_irreg_indexed<rank,I0,I1,I2,I3,I4,I5,I6>::value,
                       const internal::IndexedArray<internal::is_irreg_indexed<rank,I0,I1,I2,I3,
                                                           I4,I5,I6>::count,
                          Type,IsActive,FixedArray,I0,I1,I2,I3,I4,I5,I6> >::type
    operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
               const I4& i4, const I5& i5, const I6& i6) const {
      static const int new_rank = internal::is_irreg_indexed<rank,I0,I1,I2,I3,
                                                   I4,I5,I6>::count;
      return internal::IndexedArray<new_rank,Type,IsActive,FixedArray,I0,I1,I2,I3,I4,I5,
                          I6>(*const_cast<FixedArray*>(this),i0,i1,i2,i3,i4,i5,i6);
    }


    // C-array-like access for arrays of rank two or more: with an
    // integer argument i, operator[](i) returns an Array of rank one
    // less than the original, "sliced" at index i of dimension 0 and
    // constructed from the same data_ pointer.  This overload is
    // disabled for rank 1.  Thus for a 3D array A, A[1][2] yields a
    // vector; note that chained use is slower than A(1,2,3) because
    // each operator[] creates a new Array object.
    template <typename T>
    typename internal::enable_if<internal::is_scalar_int<T>::value && (rank > 1),
      Array<rank-1,Type,IsActive> >::type
    operator[](T i) {
      // Offset of the start of the slice along dimension 0
      int start = internal::get_index_with_len(i,J0)*offset_<0>::value;
      ExpressionSize<rank> all_dims = dimensions();
      ExpressionSize<rank> all_offsets = offset();
      // Drop dimension 0: entry k of the slice is entry k+1 of the original
      ExpressionSize<rank-1> sliced_dims;
      ExpressionSize<rank-1> sliced_offsets;
      for (int k = 0; k < rank-1; ++k) {
        sliced_dims[k] = all_dims[k+1];
        sliced_offsets[k] = all_offsets[k+1];
      }
      return Array<rank-1,Type,IsActive>(data_, start, sliced_dims, sliced_offsets,
                                          internal::GradientIndex<IsActive>::get());
    }
    
    // diag_matrix(), where *this is a 1D array, returns a DiagMatrix
    // whose diagonal is the data of this array; the result points to
    // the original data and can be used as an lvalue.  Only declared
    // here: the definition is in SpecialMatrix.h.
    SpecialMatrix<Type, internal::BandEngine<ROW_MAJOR,0,0>, IsActive>
    diag_matrix();
    
    // Return a vector constructed from data_ that steps along a
    // diagonal of this rank-2 array.  offdiag >= 0 starts offdiag
    // columns to the right of the main diagonal; offdiag < 0 starts
    // -offdiag rows below it.  An empty array yields an empty vector;
    // a non-square array throws invalid_operation.  The magnitude of
    // offdiag is not range-checked here.
    Array<1,Type,IsActive>
    diag_vector(Index offdiag = 0) {
      ADEPT_STATIC_ASSERT(rank == 2, DIAG_VECTOR_ONLY_WORKS_ON_SQUARE_MATRICES);
      if (empty()) {
        // Return an empty vector
        return Array<1,Type,IsActive>();
      }
      if (J0 != J1) {
        throw invalid_operation("diag_vector member function only applicable to square matrices"
                                ADEPT_EXCEPTION_LOCATION);
      }
      // Length of the diagonal and position of its first element
      Index diag_len;
      Index start;
      if (offdiag >= 0) {
        diag_len = std::min(J0, J1-offdiag);
        start = offset_<1>::value*offdiag;
      }
      else {
        diag_len = std::min(J0+offdiag, J1);
        start = -offset_<0>::value*offdiag;
      }
      // One step along the diagonal advances one row and one column
      return Array<1,Type,IsActive>(data_, start,
                                    ExpressionSize<1>(diag_len),
                                    ExpressionSize<1>(offset_<0>::value+offset_<1>::value),
                                    internal::GradientIndex<IsActive>::get());
    }
  

    // Return the square block covering rows and columns ibegin to iend
    // (inclusive) of this square rank-2 array, as an Array constructed
    // from data_.  Throws invalid_operation if the array is not
    // square and index_out_of_bounds if the requested range is not
    // within 0 <= ibegin <= iend < J0.
    Array<2,Type,IsActive>
    submatrix_on_diagonal(Index ibegin, Index iend) {
      ADEPT_STATIC_ASSERT(rank == 2,
                SUBMATRIX_ON_DIAGONAL_ONLY_WORKS_ON_SQUARE_MATRICES);
      if (J0 != J1) {
        throw invalid_operation("submatrix_on_diagonal member function only applicable to square matrices"
                                ADEPT_EXCEPTION_LOCATION);
      }
      if (ibegin < 0 || ibegin > iend || iend >= J0) {
        throw index_out_of_bounds("Dimensions out of range in submatrix_on_diagonal"
                                  ADEPT_EXCEPTION_LOCATION);
      }
      const Index block_len = iend-ibegin+1;
      ExpressionSize<2> block_dims(block_len,block_len);
      // The block starts at element (ibegin,ibegin) and keeps the
      // offsets of the parent array
      return Array<2,Type,IsActive>(data_, ibegin*(offset_<0>::value + offset_<1>::value),
                                    block_dims, offset(), internal::GradientIndex<IsActive>::get());
    }

    // For extracting contiguous sections out of an array use the
    // following. Currently this just indexes each dimension with the
    // contiguous range(a,b) index, but in future it may be optimized.

    // 1D array subset
    // Non-const form: indexes the vector with range(ibegin0,iend0).
    // Only compiles for rank-1 arrays.
    template <typename B0, typename E0>
    Array<1,Type,IsActive>
    subset(const B0& ibegin0, const E0& iend0) {
      ADEPT_STATIC_ASSERT(rank == 1, SUBSET_WITH_2_ARGS_ONLY_ON_RANK_1_ARRAY);
      return this->operator()(range(ibegin0,iend0));
    }
    // Const form: indexes the vector with range(ibegin0,iend0) and
    // returns a const Array.  Only compiles for rank-1 arrays.
    template <typename B0, typename E0>
    const Array<1,Type,IsActive>
    subset(const B0& ibegin0, const E0& iend0) const {
      ADEPT_STATIC_ASSERT(rank == 1, SUBSET_WITH_2_ARGS_ONLY_ON_RANK_1_ARRAY);
      return this->operator()(range(ibegin0,iend0));
    }

    // 2D array subset
    // Non-const form: indexes each of the two dimensions with the
    // corresponding range(begin,end).  Only compiles for rank-2 arrays.
    template <typename B0, typename E0, typename B1, typename E1>
    Array<2,Type,IsActive>
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1) {
      ADEPT_STATIC_ASSERT(rank == 2, SUBSET_WITH_4_ARGS_ONLY_ON_RANK_2_ARRAY);
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1));
    }
    // Const form: indexes each of the two dimensions with the
    // corresponding range(begin,end) and returns a const Array.  Only
    // compiles for rank-2 arrays.
    template <typename B0, typename E0, typename B1, typename E1>
    const Array<2,Type,IsActive>
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1) const {
      ADEPT_STATIC_ASSERT(rank == 2, SUBSET_WITH_4_ARGS_ONLY_ON_RANK_2_ARRAY);
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1));
    }

    // 3D array subset
    // Non-const form: indexes each of the three dimensions with the
    // corresponding range(begin,end).  Only compiles for rank-3 arrays.
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2>
    Array<3,Type,IsActive>
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2) {
      ADEPT_STATIC_ASSERT(rank == 3, SUBSET_WITH_6_ARGS_ONLY_ON_RANK_3_ARRAY);
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2));
    }
    // Const form: indexes each of the three dimensions with the
    // corresponding range(begin,end) and returns a const Array.  Only
    // compiles for rank-3 arrays.
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2>
    const Array<3,Type,IsActive>
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2) const {
      ADEPT_STATIC_ASSERT(rank == 3, SUBSET_WITH_6_ARGS_ONLY_ON_RANK_3_ARRAY);
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2));
    }

    // 4D array subset
    // Non-const form: indexes each of the four dimensions with the
    // corresponding range(begin,end).  Only compiles for rank-4 arrays.
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2, typename B3, typename E3>
    Array<4,Type,IsActive>
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2,
           const B3& ibegin3, const E3& iend3) {
      ADEPT_STATIC_ASSERT(rank == 4, SUBSET_WITH_8_ARGS_ONLY_ON_RANK_4_ARRAY);
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2),
                              range(ibegin3,iend3));
    }
    // Const form: indexes each of the four dimensions with the
    // corresponding range(begin,end) and returns a const Array.  Only
    // compiles for rank-4 arrays.
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2, typename B3, typename E3>
    const Array<4,Type,IsActive>
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2,
           const B3& ibegin3, const E3& iend3) const {
      ADEPT_STATIC_ASSERT(rank == 4, SUBSET_WITH_8_ARGS_ONLY_ON_RANK_4_ARRAY);
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2),
                              range(ibegin3,iend3));
    }

    // 5D array subset
    // Non-const form: indexes each of the five dimensions with the
    // corresponding range(begin,end).  Only compiles for rank-5 arrays.
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2, typename B3, typename E3,
              typename B4, typename E4>
    Array<5,Type,IsActive>
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2,
           const B3& ibegin3, const E3& iend3,
           const B4& ibegin4, const E4& iend4) {
      ADEPT_STATIC_ASSERT(rank == 5, SUBSET_WITH_10_ARGS_ONLY_ON_RANK_5_ARRAY);
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2),
                              range(ibegin3,iend3),
                              range(ibegin4,iend4));
    }
    // Const form: indexes each of the five dimensions with the
    // corresponding range(begin,end) and returns a const Array.  Only
    // compiles for rank-5 arrays.
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2, typename B3, typename E3,
              typename B4, typename E4>
    const Array<5,Type,IsActive>
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2,
           const B3& ibegin3, const E3& iend3,
           const B4& ibegin4, const E4& iend4) const {
      ADEPT_STATIC_ASSERT(rank == 5, SUBSET_WITH_10_ARGS_ONLY_ON_RANK_5_ARRAY);
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2),
                              range(ibegin3,iend3),
                              range(ibegin4,iend4));
    }

    // 6D array subset
    // Non-const form: indexes each of the six dimensions with the
    // corresponding range(begin,end).  Only compiles for rank-6 arrays.
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2, typename B3, typename E3,
              typename B4, typename E4, typename B5, typename E5>
    Array<6,Type,IsActive>
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2,
           const B3& ibegin3, const E3& iend3,
           const B4& ibegin4, const E4& iend4,
           const B5& ibegin5, const E5& iend5) {
      ADEPT_STATIC_ASSERT(rank == 6, SUBSET_WITH_12_ARGS_ONLY_ON_RANK_6_ARRAY);
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2),
                              range(ibegin3,iend3),
                              range(ibegin4,iend4),
                              range(ibegin5,iend5));
    }
    // Const form: indexes each of the six dimensions with the
    // corresponding range(begin,end) and returns a const Array.  Only
    // compiles for rank-6 arrays.
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2, typename B3, typename E3,
              typename B4, typename E4, typename B5, typename E5>
    const Array<6,Type,IsActive>
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2,
           const B3& ibegin3, const E3& iend3,
           const B4& ibegin4, const E4& iend4,
           const B5& ibegin5, const E5& iend5) const {
      ADEPT_STATIC_ASSERT(rank == 6, SUBSET_WITH_12_ARGS_ONLY_ON_RANK_6_ARRAY);
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2),
                              range(ibegin3,iend3),
                              range(ibegin4,iend4),
                              range(ibegin5,iend5));
    }

    // 7D array subset
    // Non-const form: indexes each of the seven dimensions with the
    // corresponding range(begin,end).  Only compiles for rank-7 arrays.
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2, typename B3, typename E3,
              typename B4, typename E4, typename B5, typename E5,
              typename B6, typename E6>
    Array<7,Type,IsActive>
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2,
           const B3& ibegin3, const E3& iend3,
           const B4& ibegin4, const E4& iend4,
           const B5& ibegin5, const E5& iend5,
           const B6& ibegin6, const E6& iend6) {
      ADEPT_STATIC_ASSERT(rank == 7, SUBSET_WITH_14_ARGS_ONLY_ON_RANK_7_ARRAY);
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2),
                              range(ibegin3,iend3),
                              range(ibegin4,iend4),
                              range(ibegin5,iend5),
                              range(ibegin6,iend6));
    }
    // Const form: indexes each of the seven dimensions with the
    // corresponding range(begin,end) and returns a const Array.  Only
    // compiles for rank-7 arrays.
    template <typename B0, typename E0, typename B1, typename E1,
              typename B2, typename E2, typename B3, typename E3,
              typename B4, typename E4, typename B5, typename E5,
              typename B6, typename E6>
    const Array<7,Type,IsActive>
    subset(const B0& ibegin0, const E0& iend0,
           const B1& ibegin1, const E1& iend1,
           const B2& ibegin2, const E2& iend2,
           const B3& ibegin3, const E3& iend3,
           const B4& ibegin4, const E4& iend4,
           const B5& ibegin5, const E5& iend5,
           const B6& ibegin6, const E6& iend6) const {
      ADEPT_STATIC_ASSERT(rank == 7, SUBSET_WITH_14_ARGS_ONLY_ON_RANK_7_ARRAY);
      return this->operator()(range(ibegin0,iend0),
                              range(ibegin1,iend1),
                              range(ibegin2,iend2),
                              range(ibegin3,iend3),
                              range(ibegin4,iend4),
                              range(ibegin5,iend5),
                              range(ibegin6,iend6));
    }

    // -------------------------------------------------------------------
    // FixedArray: 5. Public member functions
    // -------------------------------------------------------------------
  
    // STL-like size(): the total number of elements, i.e. length_
    Index size() const {
      return length_;
    }

    // Fill "dims" with the compile-time dimensions J0, J1, ... and
    // return true.  Filling stops at the first J that is not positive,
    // so entries of dims beyond that point are left untouched.
    bool get_dimensions_(ExpressionSize<rank>& dims) const {
      dims[0] = J0;
      if (!(J1 > 0)) { return true; }
      dims[1] = J1;
      if (!(J2 > 0)) { return true; }
      dims[2] = J2;
      if (!(J3 > 0)) { return true; }
      dims[3] = J3;
      if (!(J4 > 0)) { return true; }
      dims[4] = J4;
      if (!(J5 > 0)) { return true; }
      dims[5] = J5;
      if (!(J6 > 0)) { return true; }
      dims[6] = J6;
      return true;
    }

    // Return the dimensions of the array by value, as filled in by
    // get_dimensions_
    ExpressionSize<rank> dimensions() const {
      ExpressionSize<rank> result;
      get_dimensions_(result);
      return result;
    }

    // Return the length of dimension j.  Any j >= rank gives 0; j in
    // the range 0 to 5 gives the matching J0..J5; every other value
    // (6, but also any negative j) gives J6.
    Index size(int j) const {
      if (j >= rank) { return 0; }
      switch (j) {
      case 0:  return J0;
      case 1:  return J1;
      case 2:  return J2;
      case 3:  return J3;
      case 4:  return J4;
      case 5:  return J5;
      default: return J6;
      }
    }
    // Synonym for size(j): the length of dimension j
    Index dimension(int j) const {
      const Index len = size(j);
      return len;
    }

    // Return the offset (offset_<j>::value) of dimension j.  Any
    // j >= rank gives 0; a j that is neither >= rank nor in the range
    // 0 to 6 (i.e. negative) throws invalid_dimension.
    Index offset(int j) const {
      if (j >= rank) { return 0; }
      switch (j) {
      case 0:  return offset_<0>::value;
      case 1:  return offset_<1>::value;
      case 2:  return offset_<2>::value;
      case 3:  return offset_<3>::value;
      case 4:  return offset_<4>::value;
      case 5:  return offset_<5>::value;
      case 6:  return offset_<6>::value;
      default: throw invalid_dimension();
      }
    }

    // Return the offsets of all dimensions by value.  Entry k is
    // offset_<k>::value; filling stops at the first J that is not
    // positive, so later entries keep whatever the default-constructed
    // ExpressionSize holds.
    ExpressionSize<rank> offset() const {
      ExpressionSize<rank> result;
      result[0] = offset_<0>::value;
      if (!(J1 > 0)) { return result; }
      result[1] = offset_<1>::value;
      if (!(J2 > 0)) { return result; }
      result[2] = offset_<2>::value;
      if (!(J3 > 0)) { return result; }
      result[3] = offset_<3>::value;
      if (!(J4 > 0)) { return result; }
      result[4] = offset_<4>::value;
      if (!(J5 > 0)) { return result; }
      result[5] = offset_<5>::value;
      if (!(J6 > 0)) { return result; }
      result[6] = offset_<6>::value;
      return result;
    }

    // Offset of the final dimension (rank-1), returned by reference to
    // offset_<rank-1>::value
    const Index& last_offset() const {
      return offset_<rank-1>::value;
    }

    // Return true if the array is empty, which is judged solely by
    // the first dimension J0 being zero
    bool empty() const {
      return J0 == 0;
    }

    // Return a one-line description of the array: its rank, its
    // dimensions, the address held in data_ and, for active arrays
    // only, the gradient index
    std::string info_string() const {
      std::stringstream description;
      description << "FixedArray<" << rank << ">, dim=" << dimensions();
      description << ", data_location=" << data_;
      if (IsActive) {
        description << ", gradient_index=" << gradient_index();
      }
      return description.str();
    }

    // Pointer to the start of the data: a mutable pointer from a
    // non-const array, a pointer-to-const from a const array, and
    // const_data() which always yields a pointer-to-const
    Type* data() {
      return data_;
    }
    const Type* data() const {
      return data_;
    }
    const Type* const_data() const {
      return data_;
    }

    // Older-style names for the same three accessors to the start of
    // the data
    Type* data_pointer() {
      return data_;
    }
    const Type* data_pointer() const {
      return data_;
    }
    const Type* const_data_pointer() const {
      return data_;
    }

    // For vectors only (enforced at compile time): return a pointer
    // i elements past the start of the data, or a null pointer if
    // data_ tests false.  The index i is not range-checked.
    Type* data_pointer(Index i) {
      ADEPT_STATIC_ASSERT(rank == 1, CAN_ONLY_USE_DATA_POINTER_WITH_INDEX_ON_VECTORS);
      return data_ ? data_ + i : 0;
    }
    // Const counterpart of data_pointer(i), for vectors only: return a
    // pointer-to-const i elements past the start of the data, or a
    // null pointer if data_ tests false.  The index i is not
    // range-checked.
    const Type* const_data_pointer(Index i) const {
      ADEPT_STATIC_ASSERT(rank == 1, CAN_ONLY_USE_CONST_DATA_POINTER_WITH_INDEX_ON_VECTORS);
      return data_ ? data_ + i : 0;
    }
   
    bool is_aliased_(const Type* mem1, const Type* mem2) const {
      Type const * ptr_begin;
      Type const * ptr_end;
      data_range(ptr_begin, ptr_end);
      if (ptr_begin <= mem2 && ptr_end >= mem1) {
	return true;
      }
      else {
	return false;
      }
    }

    // FixedArrays are row-major by design, with contiguous row-wise
    // access, so this is unconditionally true
    bool all_arrays_contiguous_() const {
      return true;
    }
 
    // Return true if none of the bits of the data address selected by
    // Packet<Type>::align_mask are set
    bool is_aligned_() const {
      const std::size_t address = reinterpret_cast<std::size_t>(data_);
      return (address & Packet<Type>::align_mask) == 0;
    }

    // Return the data address, measured in units of sizeof(Type),
    // modulo n
    template <int n>
    int alignment_offset_() const {
      const std::size_t address_in_elements
        = reinterpret_cast<std::size_t>(data_)/sizeof(Type);
      return address_in_elements % n;
    }

    // For vectors only (enforced at compile time): return element j of
    // the data by value.  The "len" argument is not used here.
    Type value_with_len_(const Index& j, const Index& len) const {
      ADEPT_STATIC_ASSERT(rank == 1,
                          CANNOT_USE_VALUE_WITH_LEN_ON_ARRAY_OF_RANK_OTHER_THAN_1);
      return *(data_ + j);
    }

    // Return a short string representing this array in an expression:
    // the name supplied by fixed_array_helper for this rank and
    // activeness, followed by the dimensions.  (An unreachable
    // alternative that printed the array contents has been removed.)
    std::string expression_string_() const {
      std::string a = internal::fixed_array_helper<rank,IsActive>().name();
      a += dimensions().str();
      return a;
    }

    // The same as operator=(inactive scalar) but does not put
    // anything on the stack.  Does nothing for an empty array; returns
    // *this in either case.
    template <typename RType>
    typename internal::enable_if<internal::is_not_expression<RType>::value, FixedArray&>::type
    set_value(RType x) {
      if (empty()) {
        return *this;
      }
      assign_inactive_scalar_<rank,false>(x);
      return *this;
    }
  
    
    // Return the gradient index for the first element in the array,
    // or -1 if not active; the value comes straight from the
    // GradientIndex base
    Index gradient_index() const {
      const Index first = internal::GradientIndex<IsActive>::get();
      return first;
    }

    std::ostream& print(std::ostream& os) const {
      const Array<rank,Type,IsActive> x(*this);
      x.print(os);
      return os;
    }

    // Get pointers to the first and last data members in memory; note
    // that data_end points at the last element, not one past it
    void data_range(Type const * &data_begin, Type const * &data_end) const {
      data_begin = data_;
      data_end = data_ + (length_ - 1);
    }

  
    // The Stack::independent(x) and Stack::dependent(y) functions add
    // the gradient_index of objects x and y to std::vector<uIndex>
    // objects in Stack. Since x and y may be scalars or arrays, this
    // is best done by delegating to the Active or FixedArray classes.
    template <typename IndexType>
    void push_gradient_indices(std::vector<IndexType>& vec) const {
      ADEPT_STATIC_ASSERT(IsActive,
		  CANNOT_PUSH_GRADIENT_INDICES_FOR_INACTIVE_ARRAY); 
      ExpressionSize<rank> i(0);
      Index gradient_ind = gradient_index();
      Index index = 0;
      int my_rank;
      vec.reserve(vec.size() + size());
      do {
	// Innermost loop - note that the counter is index, not max_index
	for (Index max_index = index + dimension_<rank-1>::value*offset_<rank-1>::value;
	     index < max_index;
	     index += offset_<rank-1>::value) {
	  vec.push_back(gradient_ind + index);
	}
	// Increment counters appropriately depending on which
	// dimensions have been finished
	advance_index(index, my_rank, i);
      } while (my_rank >= 0);
    }

    // Return an inactive Array constructed from data_ with the same
    // dimensions and offsets as this array
    Array<rank, Type, false> inactive_link() {
      ExpressionSize<rank> dims = dimensions();
      ExpressionSize<rank> offs = offset();
      return Array<rank, Type, false>(data_, 0, dims, offs,
                                       internal::GradientIndex<IsActive>::get());
    }

    // Transpose helper functions
  protected:
    // Helper for T(), enabled only when MyRank == 2: constructs an
    // Array from *this, then returns the result of its
    // in_place_transpose(), so the FixedArray itself is not modified
    template<int MyRank>
    typename internal::enable_if<MyRank == 2, Array<2,Type,IsActive> >::type
    my_T() {
      Array<2,Type,IsActive> linked(*this);
      return linked.in_place_transpose();
    }
    // Const helper for T(), enabled only when MyRank == 2: as the
    // non-const form, but constness is cast away from *this in order
    // to construct the intermediate Array, and the result is returned
    // as a const Array
    template<int MyRank>
    typename internal::enable_if<MyRank == 2, const Array<2,Type,IsActive> >::type
    my_T() const {
      FixedArray& self = const_cast<FixedArray&>(*this);
      Array<2,Type,IsActive> linked(self);
      return linked.in_place_transpose();
    }

  public:
    // Out-of-place transpose
    Array<2,Type,IsActive>
    T() {
      // Reject ranks other than 1 and 2 at compile time, then
      // delegate to the rank-specific helper
      ADEPT_STATIC_ASSERT(rank == 1 || rank == 2,
                          TRANSPOSE_ONLY_POSSIBLE_WITH_1D_OR_2D_ARRAYS);
      return this->template my_T<rank>();
    }
    const Array<2,Type,IsActive>
    T() const {
      // Const counterpart: same compile-time rank check, then
      // delegate to the const rank-specific helper
      ADEPT_STATIC_ASSERT(rank == 1 || rank == 2,
                          TRANSPOSE_ONLY_POSSIBLE_WITH_1D_OR_2D_ARRAYS);
      return this->template my_T<rank>();
    }

    // "permute" is a generalized transpose, returning an FixedArray linked
    // to the current one but with the dimensions rearranged according
    // to idim: idim[0] is the 0-based number of the dimension of the
    // current array that will be dimension 0 of the new array,
    // idim[1] is the number of the dimension of the current array
    // that will be dimension 1 of the new array and so on.
    // Return an Array over this object's data_ whose dimension i is
    // dimension idim[i] of the present array.  "idim" must point to
    // "rank" values that form a permutation of 0..rank-1.
    //
    // Throws empty_array if the present array is empty, and
    // invalid_dimension if any entry is outside 0..rank-1 or if any
    // dimension is listed more than once (which means another one is
    // missing).
    Array<rank,Type,IsActive> permute(const Index* idim) {
      if (empty()) {
        throw empty_array("Attempt to permute an empty array"
                          ADEPT_EXCEPTION_LOCATION);
      }
      ExpressionSize<rank> new_dims(0);
      ExpressionSize<rank> new_offset;
      ExpressionSize<rank> dims, offs;
      dims = dimensions();
      offs = offset();
      // First pass: validate the range of every entry and gather the
      // permuted dimensions and offsets
      for (int i = 0; i < rank; ++i) {
        if (idim[i] >= 0 && idim[i] < rank) {
          new_dims[i] = dims[idim[i]];
          new_offset[i] = offs[idim[i]];
        }
        else {
          throw invalid_dimension("Dimensions must be in range 0 to rank-1 in permute"
                                  ADEPT_EXCEPTION_LOCATION);
        }
      }
      // Second pass: every source dimension must appear exactly once.
      // Testing new_dims for zero cannot detect this, because each
      // entry has already been filled from a non-empty dimension, so
      // track explicitly which source dimensions have been used.
      bool is_used[rank];
      for (int i = 0; i < rank; ++i) {
        is_used[i] = false;
      }
      for (int i = 0; i < rank; ++i) {
        if (is_used[idim[i]]) {
          throw invalid_dimension("Missing dimension in permute"
                                  ADEPT_EXCEPTION_LOCATION);
        }
        is_used[idim[i]] = true;
      }
      return Array<rank,Type,IsActive>(data_, 0, new_dims, new_offset,
                                       internal::GradientIndex<IsActive>::get());
    }

    // Overload taking the new dimension order as an ExpressionSize
    Array<rank,Type,IsActive> permute(const ExpressionSize<rank>& idim) {
      const Index* first = &idim[0];
      return permute(first);
    }

    // Up to 7 dimensions we can specify the dimensions as separate
    // arguments
    typename internal::enable_if<(rank < 7), Array<rank,Type,IsActive> >::type
    permute(Index i0, Index i1, Index i2 = -1, Index i3 = -1, Index i4 = -1,
            Index i5 = -1, Index i6 = -1) {
      // Gather the arguments; -1 marks one that was not supplied
      Index idim[7] = {i0, i1, i2, i3, i4, i5, i6};
      // Each of the first "rank" entries must have been provided
      int n = 0;
      while (n < rank) {
        if (idim[n] == -1) {
          throw invalid_dimension("Incorrect number of dimensions provided to permute"
                                  ADEPT_EXCEPTION_LOCATION);
        }
        ++n;
      }
      return permute(idim);
    }

    // Copy the gradients associated with the present active fixed
    // array into the inactive array "gradient", which must either be
    // empty (in which case it is resized) or have matching dimensions
    // Fill "gradient" with the gradients of this active array, one
    // sweep of the last dimension at a time.  An empty "gradient" is
    // resized to the dimensions of this array; otherwise a
    // size_mismatch exception is thrown if its dimensions differ.
    template <typename MyType>
    void get_gradient(Array<rank,MyType,false>& gradient) const {
      ADEPT_STATIC_ASSERT(IsActive,CANNOT_USE_GET_GRADIENT_ON_INACTIVE_ARRAY);
      if (gradient.empty()) {
	gradient.resize(dimensions());
      }
      else if (gradient.dimensions() != dimensions()) {
	throw size_mismatch("Attempt to get_gradient with array of different dimensions"
			    ADEPT_EXCEPTION_LOCATION);
      }
      static const int last = rank-1;
      // The target array may be laid out differently in memory from
      // the present one, so its offsets are tracked separately
      ExpressionSize<rank> target_offset = gradient.offset();
      ExpressionSize<rank> i(0);
      // Memory offset into the present array, relative to its first element
      Index index = 0;
      int my_rank;
      // Memory offset into the target array for the start of the current sweep
      Index index_target = 0;
      // Memory distance covered by one full sweep of the last dimension
      Index last_dim_stretch = dimension_<rank-1>::value*offset_<rank-1>::value;
      MyType* target = gradient.data();
      do {
	i[last] = 0;
	// Recompute the start of this sweep in the target array from
	// the counters of all dimensions except the last
	index_target = 0;
	for (int r = 0; r < rank-1; r++) {
	  index_target += i[r]*target_offset[r];
	}
	// NOTE(review): presumably copies the gradients with indices in
	// [first argument, second argument) using the source stride
	// offset_<rank-1>::value and the target stride
	// target_offset[last] - confirm against Stack::get_gradients
	ADEPT_ACTIVE_STACK->get_gradients(gradient_index()+index,
				  gradient_index()+index+last_dim_stretch,
					  target+index_target, offset_<rank-1>::value, target_offset[last]);
	// advance_index first subtracts one stretch of the last
	// dimension, so add it here as if the sweep had been walked
	index += last_dim_stretch;
	advance_index(index, my_rank, i);
      } while (my_rank >= 0);
    }


    // Return an inactive array of the same type and rank as the
    // present active array containing the gradients associated with
    // it
    Array<rank,Type,false> get_gradient() const {
      // Start from a default-constructed array and let the
      // argument-taking overload fill it in
      Array<rank,Type,false> result;
      get_gradient(result);
      return result;
    }

    // Copy the J0 elements of this rank-1 array into "data", which
    // is resized to length J0 if necessary
    void
    put(std::vector<typename internal::active_scalar<Type,IsActive>::type>& data) const {
      ADEPT_STATIC_ASSERT(rank == 1, PUT_ONLY_AVAILABLE_FOR_RANK_1_ARRAYS);
      // resize leaves the vector alone if it already has J0 elements
      data.resize(J0);
      Index k = 0;
      while (k < J0) {
        data[k] = (*this)(k);
        ++k;
      }
    }

    // Copy the first J0 elements of "data" into this rank-1 array.
    // If the length of "data" differs from J0 then resize() is called
    // with that length before copying.
    // NOTE(review): resize() is not visible here; if it does not
    // throw for a vector shorter than J0, the loop below reads past
    // the end of "data" - confirm.
    void
    get(const std::vector<typename internal::active_scalar<Type,IsActive>::type>& data) {
      ADEPT_STATIC_ASSERT(rank == 1, GET_ONLY_AVAILABLE_FOR_RANK_1_ARRAYS);
      const bool length_matches = (data.size() == J0);
      if (!length_matches) {
        resize(data.size());
      }
      Index k = 0;
      while (k < J0) {
        (*this)(k) = data[k];
        ++k;
      }
    }


    // -------------------------------------------------------------------
    // FixedArray: 6. Member functions accessed by the Expression class
    // -------------------------------------------------------------------

    template <int MyArrayNum, int NArrays>
    void set_location_(const ExpressionSize<rank>& i, 
		       ExpressionSize<NArrays>& index) const {
      index[MyArrayNum] = index_(i);
    }
    
    // Value of the element at the memory offset held in loc[MyArrayNum]
    template <int MyArrayNum, int NArrays>
    Type value_at_location_(const ExpressionSize<NArrays>& loc) const {
      return *(data_ + loc[MyArrayNum]);
    }
    // Packet constructed from the address of the element at the
    // memory offset held in loc[MyArrayNum]
    template <int MyArrayNum, int NArrays>
    Packet<Type> packet_at_location_(const ExpressionSize<NArrays>& loc) const {
      const Type* first = data_ + loc[MyArrayNum];
      return Packet<Type>(first);
    }

    // Writable reference to the element at memory offset loc
    Type& lvalue_at_location(const Index& loc) {
      return *(data_ + loc);
    }

    // Value of the element at the memory offset held in
    // loc[MyArrayNum]; the scratch vector is not used
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    Type value_at_location_store_(const ExpressionSize<NArrays>& loc,
                                  internal::ScratchVector<NScratch>& scratch) const {
      return *(data_ + loc[MyArrayNum]);
    }

    // Value of the element at the memory offset held in
    // loc[MyArrayNum]; it is read straight from data_ and the scratch
    // vector is not used
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    Type value_stored_(const ExpressionSize<NArrays>& loc,
                       const internal::ScratchVector<NScratch>& scratch) const {
      return *(data_ + loc[MyArrayNum]);
    }

    // Move loc[MyArrayNum] on by the memory offset of the last
    // dimension, i.e. to the next element along that dimension
    template <int MyArrayNum, int NArrays>
    void advance_location_(ExpressionSize<NArrays>& loc) const {
      static const int last = rank-1;
      loc[MyArrayNum] += offset_<last>::value;
    }

    // If an expression leads to calc_gradient being called on an
    // active object, we push the multiplier (or 1.0 if no multiplier
    // is specified) and the gradient index on to the operation stack
    // No multiplier supplied: push 1.0 together with the gradient
    // index of the element at loc[MyArrayNum]
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    void calc_gradient_(Stack& stack, const ExpressionSize<NArrays>& loc,
                        const internal::ScratchVector<NScratch>& scratch) const {
      const Index element_gradient = gradient_index() + loc[MyArrayNum];
      stack.push_rhs(1.0, element_gradient);
    }
    // Multiplier supplied: push it together with the gradient index
    // of the element at loc[MyArrayNum]
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, typename MyType>
    void calc_gradient_(Stack& stack, const ExpressionSize<NArrays>& loc,
                        const internal::ScratchVector<NScratch>& scratch,
                        const MyType& multiplier) const {
      const Index element_gradient = gradient_index() + loc[MyArrayNum];
      stack.push_rhs(multiplier, element_gradient);
    }
  

    // -------------------------------------------------------------------
    // FixedArray: 7. Protected member functions
    // -------------------------------------------------------------------
  protected:

    // Return the memory index (relative to data_) for array element
    // indicated by j
    Index index_(Index j[rank]) const {
      // Sum over dimensions of (index along dimension) * (memory
      // offset of that dimension)
      ExpressionSize<rank> strides = offset();
      Index mem_index = 0;
      int dim = 0;
      while (dim < rank) {
        mem_index += j[dim]*strides[dim];
        ++dim;
      }
      return mem_index;
    }
    Index index_(const ExpressionSize<rank>& j) const {
      // Same computation as the raw-array overload, but taking the
      // offset of each dimension individually
      Index mem_index = 0;
      int dim = 0;
      while (dim < rank) {
        mem_index += j[dim]*offset(dim);
        ++dim;
      }
      return mem_index;
    }

    // Used in traversing through an array
    // Step the traversal state on to the start of the next sweep of
    // the last dimension.  On entry "index" is expected to have been
    // moved on by one complete sweep of the last dimension (as the
    // callers in this class do).  On exit "index" is the memory
    // offset of the first element of the next sweep, "i" holds the
    // updated counters of all dimensions except the last (which is
    // not touched), and "my_rank" is the dimension whose counter was
    // incremented without wrapping, or -1 if every such counter
    // wrapped, meaning that the traversal is complete.
    void advance_index(Index& index, int& my_rank, ExpressionSize<rank>& i) const {
      // Rewind the sweep of the last dimension
      index -= offset_<rank-1>::value*dimension_<rank-1>::value;
      my_rank = rank-1;
      // Work outwards from the second-to-last dimension; for a rank-1
      // array the loop body never runs and my_rank ends up as -1
      while (--my_rank >= 0) {
	if (++i[my_rank] >= dimension(my_rank)) {
	  // This dimension is finished: reset its counter, rewind
	  // index to its first element and carry on to the next
	  // dimension out
	  i[my_rank] = 0;
	  index -= offset(my_rank)*(dimension(my_rank)-1);
	}
	else {
	  // Still within this dimension: step to its next element
	  index += offset(my_rank);
	  break;
	}
      }
    }

    // When assigning a scalar to a whole array, there may be
    // advantage in specialist behaviour depending on the rank of the
    // array. This is a generic one that copies the number but treats
    // the present array as passive.
    // Inactive array: write the scalar x to every element
    template <int LocalRank, bool LocalIsActive, typename X>
    typename internal::enable_if<!LocalIsActive,void>::type
    assign_inactive_scalar_(X x) {
      // Memory distance covered by one full sweep of the last dimension
      const Index row_span
        = dimension_<LocalRank-1>::value*offset_<LocalRank-1>::value;
      ExpressionSize<LocalRank> counter(0);
      Index pos = 0;
      int outer_rank;
      do {
        // Sweep along the last dimension, "pos" being the running
        // memory offset
        const Index row_end = pos + row_span;
        while (pos < row_end) {
          data_[pos] = x;
          pos += offset_<LocalRank-1>::value;
        }
        // Step the counters of the remaining dimensions
        advance_index(pos, outer_rank, counter);
      } while (outer_rank >= 0);
    }

    // An active array being assigned the value of an inactive scalar
    // Active array assigned an inactive scalar: write x to every
    // element and push each sweep of the last dimension on to the
    // active stack as a range of left-hand sides
    template <int LocalRank, bool LocalIsActive, typename X>
    typename internal::enable_if<LocalIsActive,void>::type
    assign_inactive_scalar_(X x) {
#ifdef ADEPT_RECORDING_PAUSABLE
      // While recording is paused, fall back to the inactive version
      if (! ADEPT_ACTIVE_STACK->is_recording()) {
        assign_inactive_scalar_<LocalRank, false, X>(x);
        return;
      }
#endif

      // Memory distance covered by one full sweep of the last dimension
      const Index row_span
        = dimension_<LocalRank-1>::value*offset_<LocalRank-1>::value;
      const Index base = gradient_index();
      ExpressionSize<LocalRank> counter(0);
      Index pos = 0;
      int outer_rank;
      do {
        // Register the whole sweep with the stack before filling it
        ADEPT_ACTIVE_STACK->push_lhs_range(base+pos, dimension_<LocalRank-1>::value,
                                           offset_<LocalRank-1>::value);
        const Index row_end = pos + row_span;
        while (pos < row_end) {
          data_[pos] = x;
          pos += offset_<LocalRank-1>::value;
        }
        // Step the counters of the remaining dimensions
        advance_index(pos, outer_rank, counter);
      } while (outer_rank >= 0);
    }

    // When copying an expression to a whole array, there may be
    // advantage in specialist behaviour depending on the rank of the
    // array
    // Inactive array: evaluate the (necessarily inactive) expression
    // rhs element by element into data_
    template<int LocalRank, bool LocalIsActive, bool EIsActive, class E>
    typename internal::enable_if<!LocalIsActive,void>::type
    assign_expression_(const E& rhs) {
      ADEPT_STATIC_ASSERT(!EIsActive, CANNOT_ASSIGN_ACTIVE_EXPRESSION_TO_INACTIVE_ARRAY);
      static const int last = LocalRank-1;
      ExpressionSize<LocalRank> counter(0);
      ExpressionSize<internal::expr_cast<E>::n_arrays> rhs_loc(0);
      Index pos = 0;
      int outer_rank;
      do {
        // Point the expression at the start of this sweep
        counter[last] = 0;
        rhs.set_location(counter, rhs_loc);
        // Sweep along the last dimension
        while (counter[last] < dimension_<LocalRank-1>::value) {
          data_[pos] = rhs.next_value(rhs_loc);
          ++counter[last];
          pos += offset_<LocalRank-1>::value;
        }
        advance_index(pos, outer_rank, counter);
      } while (outer_rank >= 0);
    }

    // Active array assigned an active expression: evaluate values and
    // gradients element by element, pushing one left-hand side per
    // element on to the active stack
    template<int LocalRank, bool LocalIsActive, bool EIsActive, class E>
    typename internal::enable_if<LocalIsActive && EIsActive,void>::type
    assign_expression_(const E& rhs) {
#ifdef ADEPT_RECORDING_PAUSABLE
      // While recording is paused, fall back to the inactive version
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        assign_expression_<LocalRank,false,false>(rhs);
        return;
      }
#endif
      static const int last = LocalRank-1;
      ExpressionSize<LocalRank> counter(0);
      ExpressionSize<internal::expr_cast<E>::n_arrays> rhs_loc(0);
      Index pos = 0;
      int outer_rank;

      // Ask the stack for room for every active operand of every element
      ADEPT_ACTIVE_STACK->check_space(internal::expr_cast<E>::n_active * size());
      do {
        // Point the expression at the start of this sweep
        counter[last] = 0;
        rhs.set_location(counter, rhs_loc);
        // Sweep along the last dimension
        while (counter[last] < dimension_<LocalRank-1>::value) {
          data_[pos] = rhs.next_value_and_gradient(*ADEPT_ACTIVE_STACK, rhs_loc);
          ADEPT_ACTIVE_STACK->push_lhs(gradient_index()+pos); // What if RHS not active?
          ++counter[last];
          pos += offset_<LocalRank-1>::value;
        }
        advance_index(pos, outer_rank, counter);
      } while (outer_rank >= 0);
    }

    // Active array assigned an inactive expression: evaluate values
    // element by element and push each sweep of the last dimension
    // on to the active stack as a range of left-hand sides
    template<int LocalRank, bool LocalIsActive, bool EIsActive, class E>
    typename internal::enable_if<LocalIsActive && !EIsActive,void>::type
    assign_expression_(const E& rhs) {
#ifdef ADEPT_RECORDING_PAUSABLE
      // While recording is paused, fall back to the inactive version
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        assign_expression_<LocalRank,false,false>(rhs);
        return;
      }
#endif
      static const int last = LocalRank-1;
      const Index base = gradient_index();
      ExpressionSize<LocalRank> counter(0);
      ExpressionSize<internal::expr_cast<E>::n_arrays> rhs_loc(0);
      Index pos = 0;
      int outer_rank;
      do {
        // Point the expression at the start of this sweep
        counter[last] = 0;
        rhs.set_location(counter, rhs_loc);
        // Register the whole sweep with the stack before filling it
        ADEPT_ACTIVE_STACK->push_lhs_range(base+pos, dimension_<LocalRank-1>::value,
                                           offset_<LocalRank-1>::value);
        while (counter[last] < dimension_<LocalRank-1>::value) {
          data_[pos] = rhs.next_value(rhs_loc);
          ++counter[last];
          pos += offset_<LocalRank-1>::value;
        }
        advance_index(pos, outer_rank, counter);
      } while (outer_rank >= 0);
    }


    // Inactive array: write the scalar rhs to those elements for
    // which bool_expr evaluates to true
    template<bool LocalIsActive, class B, typename C>
    typename internal::enable_if<!LocalIsActive,void>::type
    assign_conditional_inactive_scalar_(const B& bool_expr, C rhs) {
      static const int last = rank-1;
      ExpressionSize<rank> counter(0);
      ExpressionSize<internal::expr_cast<B>::n_arrays> mask_loc(0);
      Index pos = 0;
      int outer_rank;

      do {
        // Point the boolean expression at the start of this sweep
        counter[last] = 0;
        bool_expr.set_location(counter, mask_loc);
        // Sweep along the last dimension
        while (counter[last] < dimension_<rank-1>::value) {
          if (bool_expr.next_value(mask_loc)) {
            data_[pos] = rhs;
          }
          ++counter[last];
          pos += offset_<rank-1>::value;
        }
        advance_index(pos, outer_rank, counter);
      } while (outer_rank >= 0);
    }

    // Active array: write the inactive scalar rhs to those elements
    // for which bool_expr evaluates to true, pushing a left-hand side
    // on to the active stack for each element written
    template<bool LocalIsActive, class B, typename C>
    typename internal::enable_if<LocalIsActive,void>::type
    assign_conditional_inactive_scalar_(const B& bool_expr, C rhs) {
#ifdef ADEPT_RECORDING_PAUSABLE
      // While recording is paused, fall back to the inactive version
      if (! ADEPT_ACTIVE_STACK->is_recording()) {
        assign_conditional_inactive_scalar_<false, B, C>(bool_expr, rhs);
        return;
      }
#endif

      static const int last = rank-1;
      ExpressionSize<rank> counter(0);
      ExpressionSize<internal::expr_cast<B>::n_arrays> mask_loc(0);
      Index pos = 0;
      int outer_rank;

      do {
        // Point the boolean expression at the start of this sweep
        counter[last] = 0;
        bool_expr.set_location(counter, mask_loc);
        // Sweep along the last dimension
        while (counter[last] < dimension_<rank-1>::value) {
          if (bool_expr.next_value(mask_loc)) {
            data_[pos] = rhs;
            ADEPT_ACTIVE_STACK->push_lhs(gradient_index()+pos);
          }
          ++counter[last];
          pos += offset_<rank-1>::value;
        }
        advance_index(pos, outer_rank, counter);
      } while (outer_rank >= 0);
    }

    // Inactive array: assign elements of the expression rhs to those
    // elements of the present array for which bool_expr evaluates to
    // true; the other elements are left unchanged.
    template<bool LocalIsActive, class B, class C>
    typename internal::enable_if<!LocalIsActive,void>::type
    assign_conditional_(const B& bool_expr, const C& rhs) {
      ExpressionSize<rank> i(0);
      ExpressionSize<internal::expr_cast<B>::n_arrays> bool_ind(0);
      ExpressionSize<internal::expr_cast<C>::n_arrays> rhs_ind(0);
      Index index = 0;
      int my_rank;
      static const int last = rank-1;
      // True once one or more elements have been skipped, so that
      // rhs_ind no longer matches the current position
      bool is_gap = false;

      do {
	i[last] = 0;
	// Point both expressions at the start of this sweep
	rhs.set_location(i, rhs_ind);
	bool_expr.set_location(i, bool_ind);
	// Innermost loop
	for ( ; i[last] < dimension_<rank-1>::value; ++i[last],
		index += offset_<rank-1>::value) {
	  if (bool_expr.next_value(bool_ind)) {
	    if (is_gap) {
	      // rhs is only advanced by next_value, so after skipping
	      // elements it must be repositioned at the current one
	      rhs.set_location(i, rhs_ind);
	      is_gap = false;
	    }
	    data_[index] = rhs.next_value(rhs_ind);
	  }
	  else {
	    is_gap = true;
	  }
	}
	// is_gap is deliberately not reset here: if it is still set, the
	// first assignment of the next sweep merely repeats the
	// set_location call made at the top of the loop
	advance_index(index, my_rank, i);
      } while (my_rank >= 0);
    }


    // Active array: assign elements of the expression rhs, with their
    // gradients, to those elements of the present array for which
    // bool_expr evaluates to true; the other elements are left
    // unchanged and nothing is pushed on to the stack for them.
    template<bool LocalIsActive, class B, class C>
    typename internal::enable_if<LocalIsActive,void>::type
    assign_conditional_(const B& bool_expr, const C& rhs) {
      // If recording has been paused then call the inactive version
#ifdef ADEPT_RECORDING_PAUSABLE
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
	assign_conditional_<false>(bool_expr, rhs);
	return;
      }
#endif
      ExpressionSize<rank> i(0);
      ExpressionSize<internal::expr_cast<B>::n_arrays> bool_ind(0);
      ExpressionSize<internal::expr_cast<C>::n_arrays> rhs_ind(0);
      Index index = 0;
      int my_rank;
      static const int last = rank-1;
      // True once one or more elements have been skipped, so that
      // rhs_ind no longer matches the current position
      bool is_gap = false;

      // Ask the stack for room as if every element were assigned
      ADEPT_ACTIVE_STACK->check_space(internal::expr_cast<C>::n_active * size());
      do {
	i[last] = 0;
	// Point both expressions at the start of this sweep
	rhs.set_location(i, rhs_ind);
	bool_expr.set_location(i, bool_ind);
	// Innermost loop
	for ( ; i[last] < dimension_<rank-1>::value; ++i[last],
		index += offset_<rank-1>::value) {
	  if (bool_expr.next_value(bool_ind)) {
	    if (is_gap) {
	      // rhs is only advanced by next_value_and_gradient, so
	      // after skipping elements it must be repositioned
	      rhs.set_location(i, rhs_ind);
	      is_gap = false;
	    }
	    // The right-hand side is evaluated (and its gradient
	    // information passed to the stack) before the left-hand
	    // side is pushed
	    data_[index] = rhs.next_value_and_gradient(*ADEPT_ACTIVE_STACK, rhs_ind);
	    ADEPT_ACTIVE_STACK->push_lhs(gradient_index()+index); // What if RHS not active?
	  }
	  else {
	    is_gap = true;
	  }
	}
	advance_index(index, my_rank, i);
      } while (my_rank >= 0);
    }


    // -------------------------------------------------------------------
    // FixedArray: 8. Data
    // -------------------------------------------------------------------
  protected:
    Type data_[length_]; // Stored on the stack

  }; // End of FixedArray class


  // -------------------------------------------------------------------
  // Helper functions
  // -------------------------------------------------------------------

  // Print array on a stream
  template <typename Type, bool IsActive, Index J0,Index J1,
	    Index J2,Index J3,Index J4,Index J5,Index J6>
  inline
  std::ostream&
  operator<<(std::ostream& os, const FixedArray<Type,IsActive,J0,J1,J2,J3,J4,J5,J6>& A) {
    const Array<internal::fixed_array<J0,J1,J2,J3,J4,J5,J6>::rank,Type,IsActive> B = A; // link to original data
    return B.print(os);
  }


  // Extract inactive part of array, working correctly depending on
  // whether argument is active or inactive
  // Inactive argument: there is nothing to strip, so the array
  // itself is returned by reference
  template <typename Type, Index J0,Index J1,Index J2,Index J3,
            Index J4,Index J5,Index J6>
  inline
  FixedArray<Type, false,J0,J1,J2,J3,J4,J5,J6>&
  value(FixedArray<Type, false,J0,J1,J2,J3,J4,J5,J6>& expr)
  {
    return expr;
  }
  // Active argument: return an inactive fixed array initialized
  // from the inactive link to the argument's data
  template <typename Type, Index J0,Index J1,Index J2, Index J3,
            Index J4,Index J5,Index J6>
  inline
  FixedArray<Type, false,J0,J1,J2,J3,J4,J5,J6>
  value(FixedArray<Type, true,J0,J1,J2,J3,J4,J5,J6>& expr)
  {
    return expr.inactive_link();
  }

  // -------------------------------------------------------------------
  // Transpose function
  // -------------------------------------------------------------------

  // Transpose 2D array
  // Transpose of a rank-2 fixed array, returned as an Array
  template<typename Type, bool IsActive, Index J0, Index J1>
  inline
  Array<2,Type,IsActive>
  transpose(FixedArray<Type,IsActive,J0,J1>& in) {
    typedef Array<2,Type,IsActive> matrix_type;
    // Start from an Array constructed from the input...
    matrix_type linked(in);
    // ...then return the result of transposing it in place
    return linked.in_place_transpose();
  }

  // Extract the gradients from an active FixedArray after the
  // Stack::forward or Stack::reverse functions have been called
  // Copy the gradients of the active fixed array "a" into the
  // inactive fixed array "data" of the same dimensions
  template<typename Type, typename dType, Index J0, Index J1,
           Index J2, Index J3, Index J4, Index J5, Index J6>
  inline
  void get_gradients(const FixedArray<Type,true,J0,J1,J2,J3,J4,J5,J6>& a,
                     FixedArray<dType,false,J0,J1,J2,J3,J4,J5,J6>& data) {
    // The member function returns the gradients as a new array,
    // which is then assigned to the caller's array
    data = a.get_gradient();
  }

  // "array << x": return an Allocator object constructed from the
  // array and the first value x
  template <typename T, bool IsActive, typename E, Index J0,
            Index J1, Index J2, Index J3, Index J4, Index J5, Index J6>
  internal::Allocator<internal::fixed_array<J0,J1,J2,J3,J4,J5,J6>::rank,
                      FixedArray<T,IsActive,J0,J1,J2,J3,J4,J5,J6> >
  operator<<(FixedArray<T,IsActive,J0,J1,J2,J3,J4,J5,J6>& array, const E& x) {
    typedef internal::Allocator<internal::fixed_array<J0,J1,J2,J3,J4,J5,J6>::rank,
                                FixedArray<T,IsActive,J0,J1,J2,J3,J4,J5,J6> > allocator_type;
    return allocator_type(array, x);
  }


} // End namespace adept

#endif


================================================
FILE: include/adept/GradientIndex.h
================================================


#ifndef AdeptGradientIndex_H
#define AdeptGradientIndex_H 1

#include <adept/Stack.h>

namespace adept {
  namespace internal {

    // Arrays inherit from this class to provide optional storage of
    // the gradient index of the first value of the array depending on
    // whether the array is active or not
    // Generic (active) version: holds the gradient index in value_.
    // The value -9999 marks an index that has not been set.
    template <bool IsActive>
    struct GradientIndex {
      // Link to existing data whose gradient index is already known
      GradientIndex(Index val = -9999)
        : value_(val) { }

      // Fixed array objects of known length n: register n gradients
      // with the active stack and store the index it returns
      GradientIndex(Index n, bool)
        : value_(ADEPT_ACTIVE_STACK->register_gradients(n)) { }

      // Known gradient index plus an offset from it
      GradientIndex(Index val, Index offset)
        : value_(val+offset) { }

      Index get() const {
        return value_;
      }

      void set(Index val) {
        value_ = val;
      }

      // Return to the "not set" marker value
      void clear() {
        value_ = -9999;
      }

      // Gradient index of the element at "data" within "storage":
      // the gradient index of the storage plus the distance of
      // "data" from the start of the stored data
      template <typename Type>
      void set(const Type* data, const Storage<Type>* storage) {
        value_ = storage->gradient_index() + (data - storage->data());
      }

      // In this active version the call always throws
      void assert_inactive() {
        throw invalid_operation("Operation applied that is invalid with active arrays"
                                ADEPT_EXCEPTION_LOCATION);
      }

      // Unregister n gradients, starting at the stored index, from
      // the active stack
      void unregister(Index n) {
        ADEPT_ACTIVE_STACK->unregister_gradients(value_, n);
      }

#ifdef ADEPT_MOVE_SEMANTICS
      // Exchange the stored index with that of rhs
      void swap_value(GradientIndex& rhs) noexcept {
        const Index mine = value_;
        value_ = rhs.get();
        rhs.set(mine);
      }
#endif

    private:
      Index value_;
    };

    template <>
    struct GradientIndex<false> {
      GradientIndex(Index val = -9999) { }
      GradientIndex(Index, bool) { }
      GradientIndex(Index val, Index offset) { }
      Index get() const { return -9999; }
      void set(Index val) { }
      void clear() { }
      template <typename Type>
      void set(const Type* data, const Storage<Type>* storage) { }
      void assert_inactive() { }
      void unregister(Index) { }
#ifdef ADEPT_MOVE_SEMANTICS
      void swap_value(GradientIndex& rhs) noexcept { }
#endif
    };

  };
};

#endif


================================================
FILE: include/adept/IndexedArray.h
================================================
/* IndexedArray.h -- Support for indexed arrays

    Copyright (C) 2015-2018 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

  
   If an Array is indexed via A(i,j,...) then there are three possible
   return values: (1) a scalar, if all indices are scalar integers
   (including 0-rank expressions such as "end"); (2) an Array that
   links to a subset of the data in the original Array, if one or more
   of the indices is a RangeIndex object and all the rest are scalar
   integers; and (3) an IndexedArray object, if one or more of the
   indices is a vector of integers.  All of these return values can be
   used on the left-hand-side of an expression.

   This file treats the last case.  The code is quite complex because
   the rank of the IndexedArray may be reduced compared to the
   original Array, since dimensions indexed by scalar integers are
   removed in IndexedArray.

*/


#ifndef AdeptIndexedArray_H
#define AdeptIndexedArray_H 1

#include <vector>

#include <adept/Expression.h>

namespace adept {

  // ---------------------------------------------------------------------
  // Section 0: Forward declarations 
  // ---------------------------------------------------------------------
  
  template <int Rank, typename Type, bool IsActive> class Array;

  
  namespace internal {
    
    // ---------------------------------------------------------------------
    // Section 1. get_size_with_len
    // ---------------------------------------------------------------------
    // Return the size of an index to an individual dimension, with
    // specializations for the different types of index. The second
    // argument passes in the length of the dimension being indexed;
    // that way if any of the indices are expressions containing
    // "end", this will be replaced by that dimension length minus 1.

    // A scalar integer and rank-0 expression have a size of unity
    // Scalar integer index: always of size one, whatever its value
    // and the length of the dimension
    inline
    Index get_size_with_len(const Index&, const Index&) {
      return 1;
    }

    // Rank-0 integer expression: always of size one
    template <typename T, class E>
    inline
    typename enable_if<std::numeric_limits<T>::is_integer
                       && E::rank == 0, Index>::type
    get_size_with_len(const Expression<T,E>&, const Index&) {
      return 1;
    }

    // Extract the length of an IntVector
    // Rank-1 integer expression that is not a range: its size is its
    // only dimension; the dimension length "len" is not needed
    template <typename T, class E>
    inline
    typename enable_if<std::numeric_limits<T>::is_integer
                       && E::rank == 1 && !is_range<E>::value, Index>::type
    get_size_with_len(const Expression<T,E>& e, const Index& len) {
      ExpressionSize<1> dims;
      e.get_dimensions(dims);
      return dims[0];
    }

    // Extract the length of a RangeIndex object, which might be
    // dependent on len if "end" is present
    // Range index: ask the range object itself, passing the length
    // of the dimension being indexed
    template <typename T, class E>
    inline
    typename enable_if<std::numeric_limits<T>::is_integer
                       && is_range<E>::value, Index>::type
    get_size_with_len(const Expression<T,E>& e, const Index& len) {
      return e.cast().size_with_len_(len);
    }

    // Allow std::vector to be used to index Arrays
    // std::vector of integers: its size is the number of elements
    template <typename T>
    inline
    typename enable_if<std::numeric_limits<T>::is_integer, Index>::type
    get_size_with_len(const std::vector<T>& v, const Index&) {
      return static_cast<Index>(v.size());
    }


    // ---------------------------------------------------------------------
    // Section 2. get_value_with_len
    // ---------------------------------------------------------------------
    // Return the j'th value of index ind.

#ifndef ADEPT_BOUNDS_CHECKING
    // For scalar indices there is only one value to return - j ought
    // to be zero but we don't check this
    // Scalar index, no bounds checking: the index is returned
    // whatever the values of j and the dimension length
    inline
    Index get_value_with_len(const Index& ind, const Index&, const Index&) {
      return ind;
    }

    // Integer expression of rank 0 or 1, no bounds checking: return
    // its j'th value, evaluated with the dimension length "len"
    template <typename T, class E>
    inline
    typename enable_if<std::numeric_limits<T>::is_integer
                       && (E::rank < 2), Index>::type
    get_value_with_len(const Expression<T,E>& ind, const Index& j,
                       const Index& len) {
      const Index result = ind.value_with_len(j, len);
      return result;
    }

    // std::vector index, no bounds checking: return its j'th element
    template <typename T>
    inline
    Index get_value_with_len(const std::vector<T>& ind, const Index& j,
                             const Index&) {
      const Index result = ind[j];
      return result;
    }
#else
    // For scalar indices there is only one value to return, so with
    // bounds checking enabled we verify that j is zero and that the
    // index itself lies within the dimension
    // Scalar index with bounds checking: throws index_out_of_bounds
    // unless j is zero and the index lies in the range 0 to len-1
    inline
    Index get_value_with_len(const Index& ind, const Index& j, const Index& len) {
      if (j != 0) {
        throw index_out_of_bounds("Index to IndexedArray is out of bounds"
                                  ADEPT_EXCEPTION_LOCATION);
      }
      const bool in_range = (ind >= 0 && ind < len);
      if (!in_range) {
        throw index_out_of_bounds("Scalar index out of bounds in IndexedArray"
                                  ADEPT_EXCEPTION_LOCATION);
      }
      return ind;
    }

    // Integer expression of rank 0 or 1 with bounds checking: return
    // its j'th value, throwing index_out_of_bounds unless that value
    // lies in the range 0 to len-1
    template <typename T, class E>
    inline
    typename enable_if<std::numeric_limits<T>::is_integer
                       && (E::rank < 2), Index>::type
    get_value_with_len(const Expression<T,E>& ind, const Index& j,
                       const Index& len) {
      const Index result = ind.value_with_len(j, len);
      const bool in_range = (result >= 0 && result < len);
      if (!in_range) {
        throw index_out_of_bounds("Index out of bounds in IndexedArray"
                                  ADEPT_EXCEPTION_LOCATION);
      }
      return result;
    }

    // std::vector index with bounds checking: return its j'th
    // element, throwing index_out_of_bounds unless that element lies
    // in the range 0 to len-1 (j itself is not checked)
    template <typename T>
    inline
    Index get_value_with_len(const std::vector<T>& ind, const Index& j,
                             const Index& len) {
      const Index result = ind[j];
      const bool in_range = (result >= 0 && result < len);
      if (!in_range) {
        throw index_out_of_bounds("Index from std::vector out of bounds in IndexedArray"
                                  ADEPT_EXCEPTION_LOCATION);
      }
      return result;
    }
#endif

    // ---------------------------------------------------------------------
    // Section 3. is_int_vector
    // ---------------------------------------------------------------------
    // is_int_vector<Type>::value is "true" if Type is a rank-1
    // integer expression (including RangeIndex objects), false
    // otherwise.

    // Primary template: deliberately has no "value" member; one of the
    // two partial specializations below is selected according to
    // is_not_expression<T>::value.
    template <typename T, class Enable = void>
    struct is_int_vector { };

    // Types that are not Adept expressions can never be integer
    // vectors
    template <typename T>
    struct is_int_vector<T,
	 typename enable_if<is_not_expression<T>::value>::type>
    { static const bool value = false; };

    // Expression types: the element type (T::type) must be an integer
    // type according to std::numeric_limits, and the expression rank
    // reported by expr_cast<T> must be exactly 1
    template <typename T>
    struct is_int_vector<T,
       typename enable_if<!is_not_expression<T>::value>::type>
    {
      static const bool value 
      = std::numeric_limits<typename T::type>::is_integer
	&& expr_cast<T>::rank == 1;
    };
    
    // is_index<T>::value is true if T can be used as an index at all:
    // either it satisfies is_regular_index<T> or it is an integer
    // vector as defined by is_int_vector<T>.  "count" is the same
    // result as an int (0 or 1) so that it can be summed.
    template <typename T>
    struct is_index {
      static const bool value = is_regular_index<T>::value 
	|| is_int_vector<T>::value;
      static const int count = value;
    };

    // is_irregular_index<T>::value is true if T is an integer vector
    // (is_int_vector<T>) that is not a range (is_range<T>), i.e. an
    // index whose elements need not be regularly spaced.  "count" is
    // the same result as an int (0 or 1) so that it can be summed.
    template <typename T>
    struct is_irregular_index {
      static const bool value = !is_range<T>::value 
	&& is_int_vector<T>::value;
      static const int count = value;
    };
    
    
    // ---------------------------------------------------------------------
    // Section 4. is_irreg_indexed
    // ---------------------------------------------------------------------

    // is_irreg_indexed<Rank,I0,I1,...>::value is "true" if the
    // indices I0 to I6 contain at least one integer vector that could
    // be irregularly spaced, and all of them are valid indices
    // (unused trailing indices default to the scalar type Index).
    // The ::count member is computed as 7 minus the number of indices
    // for which is_scalar_int<I>::count is non-zero, i.e. the number
    // of non-scalar indices, which is the rank of the IndexedArray
    // objects resulting from indexing an Array with these indices.
    // NOTE(review): the Rank template parameter is not referenced in
    // the body of this struct; all seven index slots are always
    // examined.
    template <int Rank, typename I0, typename I1 = Index, 
	      typename I2 = Index, typename I3 = Index,
	      typename I4 = Index, typename I5 = Index,
	      typename I6 = Index>
    struct is_irreg_indexed {
      // At least one irregular index AND every index is a valid index
      static const bool value
        = (   is_irregular_index<I0>::value || is_irregular_index<I1>::value
	   || is_irregular_index<I2>::value || is_irregular_index<I3>::value
	   || is_irregular_index<I4>::value || is_irregular_index<I5>::value
	   || is_irregular_index<I6>::value)
	&& (   is_index<I0>::value && is_index<I1>::value
	    && is_index<I2>::value && is_index<I3>::value
	    && is_index<I4>::value && is_index<I5>::value
	    && is_index<I6>::value);
      // Seven index slots minus those occupied by scalar integers
      static const int count 
         = 7 - (  is_scalar_int<I0>::count + is_scalar_int<I1>::count
		+ is_scalar_int<I2>::count + is_scalar_int<I3>::count
		+ is_scalar_int<I4>::count + is_scalar_int<I5>::count
		+ is_scalar_int<I6>::count);
    };
    

    // ---------------------------------------------------------------------
    // Section 5. IndexedArray class
    // ---------------------------------------------------------------------
    // A class holding references to an Array to be indexed, plus
    // references to the objects corresponding to each of its
    // dimension being indexed.  IndexedArray objects are temporary,
    // generated by indexing an Array object "A" via A(i,j,...) within
    // an expression.  The indices themselves may be temporary results
    // of integer expressions, but by C++ rules they will not be
    // deleted until the full expression is complete.
    //
    // Template parameters: Rank is the rank presented by the
    // IndexedArray itself; Type and IsActive describe the element
    // type and whether it is differentiated; ArrayType is the type of
    // the array being indexed; I0 to I6 are the types of the index
    // objects, with unused trailing slots defaulting to Index.
    template <int Rank, typename Type, bool IsActive, 
	      class ArrayType, class I0, 
	      class I1 = Index, class I2 = Index, 
	      class I3 = Index, class I4 = Index, 
	      class I5 = Index, class I6 = Index>
    class IndexedArray : public Expression<Type, 
		   IndexedArray<Rank, Type, IsActive, ArrayType, 
				I0, I1, I2, I3, I4, I5, I6> > {
    public:
      // ---------------------------------------------------------------------
      // Section 5.1. IndexedArray: Static definitions
      // ---------------------------------------------------------------------
      static const int  rank       = Rank;
      // One scratch slot: value_at_location_store_ writes the element
      // value into scratch[MyScratchNum]
      static const int  n_scratch  = 1;
      // IsActive converted to an int, so 1 if active and 0 otherwise
      static const int  n_active   = IsActive;

      // We require three indices to be stored to optimize the
      // calculation of the location: first the location of the start
      // of the row, second the index to i[Rank-1] (0, 1, 2...), and 
      // third the location passed to the Array
      static const int  n_arrays   = 3;
      static const bool is_active  = IsActive;

      // The rank of the array being indexed may be higher than the
      // result of the index due to singleton indices
      // (e.g. Matrix(IntVector,int) has rank 1 even though Matrix has
      // rank 2).
      static const int  a_rank      = ArrayType::rank;


      // ---------------------------------------------------------------------
      // Section 5.2. IndexedArray: Constructors
      // ---------------------------------------------------------------------
      // Make default constructor that the compiler might generate
      // itself unreachable
    private:
      IndexedArray() { }

    public:
      // The constructor sets all unused indices to an integer of zero.
      // The array and every index object are held by reference (see
      // the data members), not copied; only the dimensions of the
      // array are copied, into a_dims_.
      // NOTE(review): when a defaulted index argument is used, the
      // stored reference refers to the temporary created for the
      // default argument, which only lives until the end of the
      // full-expression containing the constructor call - consistent
      // with IndexedArray objects being temporaries, but confirm no
      // caller stores one for longer.
      IndexedArray(ArrayType& a, const I0& i0,
		   const I1& i1 = 0, const I2& i2 = 0,
		   const I3& i3 = 0, const I4& i4 = 0,
		   const I5& i5 = 0, const I6& i6 = 0)
	: a_(a), i0_(i0), i1_(i1), i2_(i2), i3_(i3), 
	  i4_(i4), i5_(i5), i6_(i6), a_dims_(a.dimensions())
      {
	// Compute the dimensions of the IndexedArray objects from the
	// lengths of the non-singleton indices to Array
	set_dimensions_<0,0>(); 

	// For stepping through memory efficiently in the inner loop,
	// we store the distance between elements in the fastest
	// varying dimension in Array
	last_offset_ = a.offset()[a_fastest_varying_dim];
      }

      // ---------------------------------------------------------------------
      // Section 5.3. IndexedArray: Functions facilitating Expression functionality
      // ---------------------------------------------------------------------
      // Copy the shape of this indexed view into "dim".  Always
      // returns true: the shape was computed once in the constructor.
      bool get_dimensions_(ExpressionSize<Rank>& dim) const {
        dim = dimensions_;
        return true;
      }
      
      // Return a diagnostic description: the expression string
      // followed by the dimensions of the indexed array, the
      // dimensions of this object and the stored element separation
      std::string info_string() const {
        std::stringstream description;
        description << expression_string_();
        description << ", array-dim=" << a_dims_;
        description << ", dim=" << dimensions_;
        description << ", last-offset_=" << last_offset_;
        return description.str();
      }

      // Build a textual form "array(i0,i1,...)" of this expression.
      // Exactly a_rank indices are printed, i.e. one per dimension of
      // the array being indexed; unused index slots are omitted.
      std::string expression_string_() const {
        std::string text = a_.expression_string() + "(";
        text += expr_string(i0_);
        // Each test implies all the previous ones, so a flat sequence
        // prints the first a_rank indices in order
        if (a_rank > 1) {
          text += std::string(",") + expr_string(i1_);
        }
        if (a_rank > 2) {
          text += std::string(",") + expr_string(i2_);
        }
        if (a_rank > 3) {
          text += std::string(",") + expr_string(i3_);
        }
        if (a_rank > 4) {
          text += std::string(",") + expr_string(i4_);
        }
        if (a_rank > 5) {
          text += std::string(",") + expr_string(i5_);
        }
        if (a_rank > 6) {
          text += std::string(",") + expr_string(i6_);
        }
        text += ")";
        return text;
      }
     
    protected:
      // Helpers used by expression_string_() to print one index.
      // Index objects that are Adept expressions describe themselves...
      template <typename T, typename E>
      std::string expr_string(const Expression<T,E>& e) const {
        return e.expression_string();
      }
      // ...whereas anything else is printed with its stream insertion
      // operator
      template <typename T>
      typename enable_if<is_not_expression<T>::value, std::string>::type
      expr_string(const T& e) const {
        std::stringstream buffer;
        buffer << e;
        return buffer.str();
      }

    public:
      // Aliasing is decided entirely by the array being indexed: the
      // query for the memory range [mem1,mem2] is forwarded to it
      bool is_aliased_(const Type* mem1, const Type* mem2) const {
        const bool overlaps = a_.is_aliased(mem1, mem2);
        return overlaps;
      }

      // Return element "i" of this object when it is itself used as a
      // one-dimensional expression: "i" is mapped through index object
      // number Rank-1 and the array is then accessed with that single
      // index.  The "len" argument is not used here.
      Type value_with_len_(const Index& i, const Index& len) const {
	// Treat as one dimensional
	return a_(get_value_with_len_<Rank-1>(i));
      }
      
      template <int MyArrayNum, int NArrays>
      void set_location_(const ExpressionSize<Rank>& coords,
			 ExpressionSize<NArrays>& loc) const {
	ExpressionSize<a_rank> a_coords;
	translate_coords_<0,0>(coords, a_coords);
	// Location of start of most rapidly varying dimension in
	// Array
	a_.template set_location_<MyArrayNum>(a_coords, loc);
	// Index to most rapidly varying dimension in IndexedArray
	loc[MyArrayNum+1] = coords[Rank-1];
	loc[MyArrayNum+2] = loc[MyArrayNum] + last_offset_
	  * get_value_with_len_<a_fastest_varying_dim>(loc[MyArrayNum+1]);
      }

      // Step to the next element along dimension Rank-1: increment
      // the counter slot and, if it still addresses a valid element,
      // recompute the element location from the row start
      template <int MyArrayNum, int NArrays>
      void advance_location_(ExpressionSize<NArrays>& loc) const {
        const int counter_slot = MyArrayNum + 1;
        ++loc[counter_slot];
        // next_value calls advance_location even after the final
        // element of a row; looking up the index object there would
        // read it out of bounds, so leave the element slot untouched
        if (loc[counter_slot] >= dimensions_[Rank-1]) {
          return;
        }
        const Index picked
          = get_value_with_len_<a_fastest_varying_dim>(loc[counter_slot]);
        loc[MyArrayNum+2] = loc[MyArrayNum] + last_offset_ * picked;
      }

      // Read the current element: the array is asked for the value at
      // the third of this object's location slots (MyArrayNum+2),
      // which holds the location of the selected element
      template <int MyArrayNum, int NArrays>
      Type value_at_location_(const ExpressionSize<NArrays>& loc) const {
        static const int element_slot = MyArrayNum + 2;
        return a_.template value_at_location_<element_slot>(loc);
      }

      // Read the current element as value_at_location_ does, and also
      // record it in this object's scratch slot.  The returned value
      // is the one read back from the scratch slot.
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_at_location_store_(const ExpressionSize<NArrays>& loc,
                                    ScratchVector<NScratch>& scratch) const {
        // The scratch numbering used here relies on the array itself
        // needing no scratch space
        ADEPT_STATIC_ASSERT(ArrayType::n_scratch == 0,
                            ASSUMING_ARRAY_N_SCRATCH_IS_ZERO);
        static const int element_slot = MyArrayNum + 2;
        scratch[MyScratchNum]
          = a_.template value_at_location_<element_slot>(loc);
        return scratch[MyScratchNum];
      }
      // Return the value held in this object's scratch slot (as
      // written by value_at_location_store_); "loc" is not consulted
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_stored_(const ExpressionSize<NArrays>& loc,
                         const ScratchVector<NScratch>& scratch) const {
        return scratch[MyScratchNum];
      }

      // Gradient contribution of the current element: forwarded to
      // the array being indexed, using the element location slot
      // (MyArrayNum+2) and the scratch number following this object's
      // own slot (MyScratchNum+1)
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      void calc_gradient_(Stack& stack,
                          const ExpressionSize<NArrays>& loc,
                          const ScratchVector<NScratch>& scratch) const {
        static const int element_slot = MyArrayNum + 2;
        static const int next_scratch = MyScratchNum + 1;
        a_.template calc_gradient_<element_slot, next_scratch>(stack, loc,
                                                               scratch);
      }

      // As above, but with a multiplier that is passed on unchanged
      // to the array's calc_gradient_
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                typename MyType>
      void calc_gradient_(Stack& stack,
                          const ExpressionSize<NArrays>& loc,
                          const ScratchVector<NScratch>& scratch,
                          MyType multiplier) const {
        static const int element_slot = MyArrayNum + 2;
        static const int next_scratch = MyScratchNum + 1;
        a_.template calc_gradient_<element_slot, next_scratch>(stack, loc,
                                                               scratch,
                                                               multiplier);
      }


      // ---------------------------------------------------------------------
      // Section 5.4. IndexedArray: Operators
      // ---------------------------------------------------------------------
      // Assignment operators allow an IndexedArray to appear on the
      // left-hand-side of a statement.  Copy assignment from another
      // IndexedArray of the same type views the source as a generic
      // expression and delegates to the expression assignment.
      IndexedArray& operator=(const IndexedArray& src) {
        const Expression<Type,IndexedArray>& src_expr = src;
        *this = src_expr;
        return *this;
      }

      // Assigning a single non-expression value copies it to every
      // selected element.  Nothing is done if this object is empty.
      template <typename RType>
      typename enable_if<is_not_expression<RType>::value, IndexedArray&>::type
      operator=(RType rhs) {
        if (empty()) {
          return *this;
        }
#ifdef ADEPT_RECORDING_PAUSABLE
        // With recording paused the assignment is performed as if the
        // array were inactive
        if (!ADEPT_ACTIVE_STACK->is_recording()) {
          assign_inactive_scalar_<false>(rhs);
          return *this;
        }
#endif
        assign_inactive_scalar_<IsActive>(rhs);
        return *this;
      }

    public:
      // Assignment to an array expression of the same rank.  Throws
      // size_mismatch if the right-hand-side cannot report consistent
      // dimensions or if they are not compatible with the dimensions
      // of this object.  Unless ADEPT_NO_ALIAS_CHECKING is defined, a
      // right-hand-side that aliases the memory of the indexed array
      // is first evaluated into a temporary Array.
      template <typename EType, class E>
      typename enable_if<E::rank == Rank, IndexedArray&>::type
      operator=(const Expression<EType,E>& rhs) {
      // NOTE(review): a previous comment here stated that the
      // definition had been moved to Array.h; the body is in fact
      // defined in place and depends on the Array class for the
      // temporary used in the aliasing branch below
	ExpressionSize<Rank> dims;
	if (!rhs.get_dimensions(dims)) {
	  std::string str = "Array size mismatch in "
	    + rhs.expression_string() + ".";
	  throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
	}
	else if (!compatible(dims, dimensions_)) {
	  std::string str = "Expr";
	  str += dims.str() + " object assigned to " + expression_string_();
	  throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
	}

	if (!empty()) {
#ifndef ADEPT_NO_ALIAS_CHECKING
	  // Check for aliasing first
	  Type const * ptr_begin;
	  Type const * ptr_end;
	  a_.data_range(ptr_begin, ptr_end);
	  if (rhs.is_aliased(ptr_begin, ptr_end)) {
	    // Evaluate the right-hand-side completely before any
	    // element of the indexed array is overwritten
	    Array<Rank,Type,IsActive> copy;
	    copy = noalias(rhs);
	    assign_expression_<IsActive, E::is_active>(copy);
	  }
	  else {
#endif
	    assign_expression_<IsActive, E::is_active>(rhs);
#ifndef ADEPT_NO_ALIAS_CHECKING
	  }
#endif
	}
	return *this;
      }


      // Assign an active scalar (rank-0) expression that is not an
      // lvalue to an active array: the expression is first evaluated
      // into an Active scalar, which is then assigned to this object
      template <typename EType, class E>
      typename enable_if<E::rank == 0 && (Rank > 0)
                         && IsActive && !E::is_lvalue,
        IndexedArray&>::type
      operator=(const Expression<EType,E>& rhs) {
        Active<EType> evaluated = rhs;
        *this = evaluated;
        return *this;
      }

      // Assign an active scalar to an active array: every selected
      // element receives the value of the scalar, and one statement is
      // recorded per element consisting of a single operation (unit
      // multiplier applied to the gradient of the scalar) followed by
      // the left-hand-side gradient index of the element.  If
      // recording is paused (ADEPT_RECORDING_PAUSABLE builds) only
      // the values are assigned.  Nothing is done if this object is
      // empty.
      template <typename PType>
      typename enable_if<!internal::is_active<PType>::value && IsActive, IndexedArray&>::type
      operator=(const Active<PType>& rhs) {
        ADEPT_STATIC_ASSERT(IsActive, ATTEMPT_TO_ASSIGN_ACTIVE_SCALAR_TO_INACTIVE_INDEXED_ARRAY);
        if (!empty()) {
#ifdef ADEPT_RECORDING_PAUSABLE
          if (!ADEPT_ACTIVE_STACK->is_recording()) {
            assign_inactive_scalar_<false>(rhs.scalar_value());
            return *this;
          }
#endif

          ExpressionSize<Rank> coords(0);
          ExpressionSize<a_rank> a_coords(0);
          ExpressionSize<1> a_loc(0);
          Type val = rhs.scalar_value();
          int dim;
          static const int last = Rank-1;
          do {
            coords[last] = 0;
            // Convert between the coordinates of the IndexedArray
            // object to the coordinates of the Array object
            translate_coords_<0,0>(coords, a_coords);
            a_.set_location(a_coords, a_loc);
            // Reserve room on the operation stack for this row: one
            // operation is pushed per element below.  The
            // active-expression assignment in this class likewise
            // calls check_space before pushing operations, whereas
            // this path previously pushed without reserving any
            // space.
            ADEPT_ACTIVE_STACK->check_space(dimensions_[last]);
            // Innermost loop
            for ( ; coords[last] < dimensions_[last]; ++coords[last]) {
              Index index = a_loc[0]
                + last_offset_
                * get_value_with_len_<a_fastest_varying_dim>(coords[last]);
              a_.data()[index] = val;
              ADEPT_ACTIVE_STACK->push_rhs(1.0, rhs.gradient_index());
              ADEPT_ACTIVE_STACK->push_lhs(a_.gradient_index()+index);
            }
            advance_index(dim, coords);
          } while (dim >= 0);
        }
        return *this;
      }


// Compound assignment operators are generated by a temporary macro:
// each one evaluates "noalias(*this) OPSYMBOL rhs" and assigns the
// result back to this object via operator=.  The macro is undefined
// again immediately after use.
#define ADEPT_DEFINE_OPERATOR(OPERATOR, OPSYMBOL)	\
    template <class RType>			\
    IndexedArray& OPERATOR(const RType& rhs) {	\
    return *this = noalias(*this) OPSYMBOL rhs;	\
    }
    ADEPT_DEFINE_OPERATOR(operator+=, +)
    ADEPT_DEFINE_OPERATOR(operator-=, -)
    ADEPT_DEFINE_OPERATOR(operator*=, *)
    ADEPT_DEFINE_OPERATOR(operator/=, /)
    // The bitwise compound assignments are currently disabled:
    //    ADEPT_DEFINE_OPERATOR(operator&=, &);
    //    ADEPT_DEFINE_OPERATOR(operator|=, |);
#undef ADEPT_DEFINE_OPERATOR

#ifdef ADEPT_CXX11_FEATURES

    // Assignment from (nested) initializer lists.  The strategy is
    // simple rather than efficient: the list is first used to
    // initialize a temporary inactive Array of the same rank, which
    // is then assigned to this object as an ordinary expression.
    // Each overload asserts at compile time that the nesting depth
    // of the list equals Rank.

    // Rank 1: { a, b, ... }
    template <class IType>
    IndexedArray& operator=(std::initializer_list<IType> list) {
      ADEPT_STATIC_ASSERT(Rank==1,RANK_MISMATCH_IN_INITIALIZER_LIST);
      Array<Rank,Type,false> staged = list;
      *this = staged;
      return *this;
    }
    // Rank 2
    template <class IType>
    IndexedArray& operator=(std::initializer_list<
                            std::initializer_list<IType> > list) {
      ADEPT_STATIC_ASSERT(Rank==2,RANK_MISMATCH_IN_INITIALIZER_LIST);
      Array<Rank,Type,false> staged = list;
      *this = staged;
      return *this;
    }
    // Rank 3
    template <class IType>
    IndexedArray& operator=(std::initializer_list<
                            std::initializer_list<
                            std::initializer_list<IType> > > list) {
      ADEPT_STATIC_ASSERT(Rank==3,RANK_MISMATCH_IN_INITIALIZER_LIST);
      Array<Rank,Type,false> staged = list;
      *this = staged;
      return *this;
    }
    // Rank 4
    template <class IType>
    IndexedArray& operator=(std::initializer_list<
                            std::initializer_list<
                            std::initializer_list<
                            std::initializer_list<IType> > > > list) {
      ADEPT_STATIC_ASSERT(Rank==4,RANK_MISMATCH_IN_INITIALIZER_LIST);
      Array<Rank,Type,false> staged = list;
      *this = staged;
      return *this;
    }
    // Rank 5
    template <class IType>
    IndexedArray& operator=(std::initializer_list<
                            std::initializer_list<
                            std::initializer_list<
                            std::initializer_list<
                            std::initializer_list<IType> > > > > list) {
      ADEPT_STATIC_ASSERT(Rank==5,RANK_MISMATCH_IN_INITIALIZER_LIST);
      Array<Rank,Type,false> staged = list;
      *this = staged;
      return *this;
    }
    // Rank 6
    template <class IType>
    IndexedArray& operator=(std::initializer_list<
                            std::initializer_list<
                            std::initializer_list<
                            std::initializer_list<
                            std::initializer_list<
                            std::initializer_list<IType> > > > > > list) {
      ADEPT_STATIC_ASSERT(Rank==6,RANK_MISMATCH_IN_INITIALIZER_LIST);
      Array<Rank,Type,false> staged = list;
      *this = staged;
      return *this;
    }

#endif


    protected:
      // ---------------------------------------------------------------------
      // Section 5.5. IndexedArray: Internal functions facilitating operator=
      // ---------------------------------------------------------------------

      // Assigning an inactive scalar to an indexed array comes in two
      // versions selected by LocalIsActive.  This is the inactive
      // one: the value is simply written to every selected element
      // and nothing is recorded.
      template <bool LocalIsActive, typename X>
      typename enable_if<!LocalIsActive,void>::type
      assign_inactive_scalar_(X x) {
        static const int inner = Rank-1;
        ExpressionSize<Rank> pos(0);        // Position in this object
        ExpressionSize<a_rank> a_pos(0);    // Position in the array
        ExpressionSize<1> row_start(0);     // Array location of the row
        int outer_dim;
        do {
          pos[inner] = 0;
          // Express the start of this row in the coordinates of the
          // array being indexed and find its location
          translate_coords_<0,0>(pos, a_pos);
          a_.set_location(a_pos, row_start);
          // Walk along the row
          while (pos[inner] < dimensions_[inner]) {
            const Index offset = last_offset_
              * get_value_with_len_<a_fastest_varying_dim>(pos[inner]);
            a_.data()[row_start[0] + offset] = x;
            ++pos[inner];
          }
          advance_index(outer_dim, pos);
        } while (outer_dim >= 0);
      }

      // Active version of assigning an inactive scalar: in addition
      // to writing the value, the gradient index of every element
      // written is pushed as a left-hand-side with no right-hand-side
      // operations
      template <bool LocalIsActive, typename X>
      typename enable_if<LocalIsActive,void>::type
      assign_inactive_scalar_(X x) {
#ifdef ADEPT_RECORDING_PAUSABLE
        // With recording paused, fall back to the inactive version
        if (!ADEPT_ACTIVE_STACK->is_recording()) {
          assign_inactive_scalar_<false, X>(x);
          return;
        }
#endif
        static const int inner = Rank-1;
        ExpressionSize<Rank> pos(0);        // Position in this object
        ExpressionSize<a_rank> a_pos(0);    // Position in the array
        ExpressionSize<1> row_start(0);     // Array location of the row
        int outer_dim;
        do {
          pos[inner] = 0;
          // Express the start of this row in the coordinates of the
          // array being indexed and find its location
          translate_coords_<0,0>(pos, a_pos);
          a_.set_location(a_pos, row_start);
          // Walk along the row
          while (pos[inner] < dimensions_[inner]) {
            const Index offset = last_offset_
              * get_value_with_len_<a_fastest_varying_dim>(pos[inner]);
            const Index target = row_start[0] + offset;
            a_.data()[target] = x;
            ADEPT_ACTIVE_STACK->push_lhs(a_.gradient_index()+target);
            ++pos[inner];
          }
          advance_index(outer_dim, pos);
        } while (outer_dim >= 0);
      }
      

      // Assigning an expression has passive and active versions.
      // This is the passive one (inactive left-hand-side): values of
      // the right-hand-side are copied element by element and nothing
      // is recorded.  An active right-hand-side is rejected at compile
      // time.
      template<bool LeftIsActive, bool RightIsActive, class E>
      typename enable_if<!LeftIsActive,void>::type
      assign_expression_(const E& rhs) {
        ADEPT_STATIC_ASSERT(!RightIsActive,
                    CANNOT_ASSIGN_ACTIVE_EXPRESSION_TO_INACTIVE_INDEXED_ARRAY);
        static const int inner = Rank-1;
        ExpressionSize<Rank> pos(0);        // Position in this object
        ExpressionSize<a_rank> a_pos(0);    // Position in the array
        ExpressionSize<expr_cast<E>::n_arrays> rhs_loc(0);
        ExpressionSize<1> row_start(0);     // Array location of the row
        int outer_dim;
        do {
          pos[inner] = 0;
          rhs.set_location(pos, rhs_loc);
          // Express the start of this row in the coordinates of the
          // array being indexed and find its location
          translate_coords_<0,0>(pos, a_pos);
          a_.set_location(a_pos, row_start);
          // Walk along the row
          while (pos[inner] < dimensions_[inner]) {
            a_.data()[row_start[0]
                      + last_offset_
                      * get_value_with_len_<a_fastest_varying_dim>(pos[inner])]
              = rhs.next_value(rhs_loc);
            ++pos[inner];
          }
          advance_index(outer_dim, pos);
        } while (outer_dim >= 0);
      }

      // Active left-hand-side, passive right-hand-side: values are
      // copied element by element and the gradient index of each
      // element written is pushed as a left-hand-side with no
      // right-hand-side operations
      template<bool LeftIsActive, bool RightIsActive, class E>
      typename enable_if<LeftIsActive && !RightIsActive,void>::type
      assign_expression_(const E& rhs) {
#ifdef ADEPT_RECORDING_PAUSABLE
        // With recording paused, fall back to the passive version
        if (!ADEPT_ACTIVE_STACK->is_recording()) {
          assign_expression_<false,false>(rhs);
          return;
        }
#endif
        static const int inner = Rank-1;
        ExpressionSize<Rank> pos(0);        // Position in this object
        ExpressionSize<a_rank> a_pos(0);    // Position in the array
        ExpressionSize<expr_cast<E>::n_arrays> rhs_loc(0);
        ExpressionSize<1> row_start(0);     // Array location of the row
        int outer_dim;
        do {
          pos[inner] = 0;
          rhs.set_location(pos, rhs_loc);
          // Express the start of this row in the coordinates of the
          // array being indexed and find its location
          translate_coords_<0,0>(pos, a_pos);
          a_.set_location(a_pos, row_start);
          // Walk along the row
          while (pos[inner] < dimensions_[inner]) {
            const Index target = row_start[0]
              + last_offset_
              * get_value_with_len_<a_fastest_varying_dim>(pos[inner]);
            a_.data()[target] = rhs.next_value(rhs_loc);
            ADEPT_ACTIVE_STACK->push_lhs(a_.gradient_index()+target);
            ++pos[inner];
          }
          advance_index(outer_dim, pos);
        } while (outer_dim >= 0);
      }

      // Active left-hand-side, active right-hand-side: for every
      // element the value and gradient of the right-hand-side are
      // evaluated (pushing the right-hand-side operations on to the
      // stack), followed by the gradient index of the element written
      // as the left-hand-side of the statement.
      template<bool LeftIsActive, bool RightIsActive, class E>
      typename enable_if<LeftIsActive && RightIsActive,void>::type
      assign_expression_(const E& rhs) {
#ifdef ADEPT_RECORDING_PAUSABLE
        // With recording paused, fall back to the passive version
        if (!ADEPT_ACTIVE_STACK->is_recording()) {
          assign_expression_<false,false>(rhs);
          return;
        }
#endif
        ExpressionSize<Rank> coords(0);
        ExpressionSize<a_rank> a_coords(0);
        ExpressionSize<expr_cast<E>::n_arrays> loc(0);
        ExpressionSize<1> a_loc(0);
        int dim;
        static const int last = Rank-1;

        do {
          coords[last] = 0;
          rhs.set_location(coords, loc);
          // Convert between the coordinates of the IndexedArray
          // object to the coordinates of the Array object
          translate_coords_<0,0>(coords, a_coords);
          a_.set_location(a_coords, a_loc);
          // Reserve operation-stack space for the whole of this row:
          // up to n_active operations per element.  The reservation
          // is made per row because a single up-front reservation
          // sized by dimensions_[0] alone only covers all elements
          // when Rank is 1.
          ADEPT_ACTIVE_STACK->check_space(expr_cast<E>::n_active
                                          * dimensions_[last]);
          // Innermost loop
          for ( ; coords[last] < dimensions_[last]; ++coords[last]) {
            Index index = a_loc[0]
              + last_offset_
              * get_value_with_len_<a_fastest_varying_dim>(coords[last]);
            a_.data()[index] = rhs.next_value_and_gradient(*ADEPT_ACTIVE_STACK, loc);
            ADEPT_ACTIVE_STACK->push_lhs(a_.gradient_index()+index);
          }
          advance_index(dim, coords);
        } while (dim >= 0);
      }

      // Move "coords" to the start of the next row.  Dimensions
      // Rank-2 down to 0 are treated like the digits of an odometer:
      // the first one that can be incremented without reaching its
      // length stops the carry.  On return "dim" is that dimension,
      // or -1 if every dimension wrapped round to zero (or there are
      // no dimensions other than the last), i.e. all rows are done.
      void advance_index(int& dim, ExpressionSize<Rank>& coords) const {
        for (dim = Rank-2; dim >= 0; --dim) {
          if (++coords[dim] < dimensions_[dim]) {
            // No carry needed: this is the next row
            break;
          }
          // Wrapped: reset and carry into the next slower dimension
          coords[dim] = 0;
        }
      }


      bool empty() { return dimensions_[0] == 0; }
      
      // Forward declaration of Ix, which maps a dimension number to
      // the type of the corresponding index object: it is used in the
      // signatures below before it is defined later in the class
      template<int Dim> struct Ix;

      // Translate coordinates in terms of the IndexedArray object in
      // to coordinates to the Array object it wraps, accounting for
      // singleton dimensions in Array that are not included in the
      // dimensions that IndexedArray presents to external objects.
      // InDim counts dimensions of the IndexedArray ("in") and OutDim
      // counts dimensions of the Array ("out"); the overloads below
      // call each other recursively at compile time, with enable_if
      // selecting exactly one of them for each (InDim,OutDim) pair.

      // Case 1: non-scalar index that is not the last dimension of
      // the IndexedArray - look up the array index from the index
      // object
      template <int InDim, int OutDim>
      typename enable_if<!is_scalar_int<typename Ix<OutDim>::type>::value
                         && (InDim < Rank-1), void>::type
      translate_coords_(const ExpressionSize<Rank>& in,
		       ExpressionSize<a_rank>& out) const {
	// Compute the index of the OutDim dimension of Array
	out[OutDim] = get_value_with_len(index_object_<OutDim>(),
					 in[InDim],a_dims_[OutDim]);
	// Move on to the next dimension
	translate_coords_<InDim+1,OutDim+1>(in, out);
      }

      // Case 2: scalar index - consumes a dimension of the Array but
      // none of the IndexedArray (InDim is not advanced)
      template <int InDim, int OutDim>
      typename enable_if<(OutDim < a_rank)
	                 && is_scalar_int<typename Ix<OutDim>::type>::value,
			 void>::type
      translate_coords_(const ExpressionSize<Rank>& in,
		        ExpressionSize<a_rank>& out) const {
	// This is a singleton dimension so the 0th element is the
	// only element
	out[OutDim] = get_value_with_len(index_object_<OutDim>(),
					  0,a_dims_[OutDim]);
	// Move on to the next OutDim dimension of Array
	translate_coords_<InDim,OutDim+1>(in, out);
      }

      // Case 3: non-scalar index that is the last dimension of the
      // IndexedArray - in[InDim] is deliberately not consulted here;
      // callers add the position along this dimension separately
      // using last_offset_
      template <int InDim, int OutDim>
      typename enable_if<!is_scalar_int<typename Ix<OutDim>::type>::value
                         && InDim == Rank-1, void>::type
      translate_coords_(const ExpressionSize<Rank>& in,
		       ExpressionSize<a_rank>& out) const {
	// The final non-singleton dimension is set to zero, since it
	// will be incremented later by advance_location
	out[OutDim] = 0;
	// Do any further dimensions, which must be singletons
	translate_coords_<InDim+1,OutDim+1>(in, out);
      }

      // Run out of dimensions: do nothing
      template <int InDim, int OutDim>
      typename enable_if<InDim == Rank && OutDim == a_rank, void>::type
      translate_coords_(const ExpressionSize<Rank>& in,
		       ExpressionSize<a_rank>& out) const { }

      // Return element "j" of the index object for dimension "Dim" of
      // the array being indexed.  The length passed on is that of the
      // corresponding dimension of the array (a_dims_), not of this
      // IndexedArray's own dimensions.
      template <int Dim>
      Index get_value_with_len_(const Index& j) const {
        const Index array_len = a_dims_[Dim];
        return get_value_with_len(index_object_<Dim>(), j, array_len);
      }


      // ---------------------------------------------------------------------
      // Section 5.6. IndexedArray: Helper functions for the constructor
      // ---------------------------------------------------------------------
      // Helper function for translating between the dimensions of the
      // Array object and that of the IndexedArray, the latter of
      // which has removed the singleton dimensions of the former.
      // InDim counts dimensions of the IndexedArray and OutDim counts
      // dimensions of the Array; the overloads recurse at compile
      // time from set_dimensions_<0,0>() called by the constructor.

      // Non-scalar index: the length of this IndexedArray dimension
      // is the size reported by get_size_with_len for the index
      // object, given the length of the Array dimension
      template <int InDim, int OutDim>
      typename enable_if<(OutDim < a_rank)
	&& !is_scalar_int<typename Ix<OutDim>::type>::value,void>::type
      set_dimensions_() {
	dimensions_[InDim] = get_size_with_len(index_object_<OutDim>(),
					      a_dims_[OutDim]);
	set_dimensions_<InDim+1, OutDim+1>();
      }
      // Scalar index: contributes no dimension to the IndexedArray,
      // so only OutDim advances
      template <int InDim, int OutDim>
      typename enable_if<(OutDim < a_rank)
	&& is_scalar_int<typename Ix<OutDim>::type>::value,void>::type
      set_dimensions_() {
	set_dimensions_<InDim, OutDim+1>();
      }
      // All dimensions of the Array processed: end of recursion
      template <int InDim, int OutDim>
      typename enable_if<OutDim == a_rank,void>::type
      set_dimensions_() { }


      // ---------------------------------------------------------------------
      // Section 5.7. IndexedArray: Low-level helper sub-classes and functions
      // ---------------------------------------------------------------------

      // The individual indices are stored in objects of type I0 to
      // I6.  The following sub-class "index_alias" enables the
      // definition of the sub-class "Ix" that is used such that
      // Ix<Dim>::type returns the type of index "Dim" at compile time.
      // index_alias takes the seven index types as template
      // parameters of its own, so that each case below is a partial
      // specialization on the dimension number alone.  The primary
      // template has no "type" member, so only dimensions 0 to 6 are
      // valid.
      template <int Dim,class X0,class X1,class X2,class X3,class X4,
		class X5,class X6> struct index_alias { };

      template<class X0,class X1,class X2,class X3,class X4,class X5,class X6> 
      struct index_alias<0,X0,X1,X2,X3,X4,X5,X6> { typedef X0 type; };

      template<class X0,class X1,class X2,class X3,class X4,class X5,class X6> 
      struct index_alias<1,X0,X1,X2,X3,X4,X5,X6> { typedef X1 type; };

      template<class X0,class X1,class X2,class X3,class X4,class X5,class X6> 
      struct index_alias<2,X0,X1,X2,X3,X4,X5,X6> { typedef X2 type; };

      template<class X0,class X1,class X2,class X3,class X4,class X5,class X6> 
      struct index_alias<3,X0,X1,X2,X3,X4,X5,X6> { typedef X3 type; };

      template<class X0,class X1,class X2,class X3,class X4,class X5,class X6> 
      struct index_alias<4,X0,X1,X2,X3,X4,X5,X6> { typedef X4 type; };

      template<class X0,class X1,class X2,class X3,class X4,class X5,class X6> 
      struct index_alias<5,X0,X1,X2,X3,X4,X5,X6> { typedef X5 type; };

      template<class X0,class X1,class X2,class X3,class X4,class X5,class X6> 
      struct index_alias<6,X0,X1,X2,X3,X4,X5,X6> { typedef X6 type; };

      // Ix<Dim>::type is the type of index number Dim of this
      // IndexedArray
      template<int Dim> struct Ix { 
	typedef typename index_alias<Dim,I0,I1,I2,I3,I4,I5,I6>::type type; 
      };

      // Similarly, the following enables us to return not just the
      // type but also a reference to the actual index object via
      // index_object_<Dim>().  There is one overload per dimension,
      // of which enable_if leaves exactly one viable for a given Dim
      // in the range 0 to 6; each returns the stored reference to the
      // corresponding index.
      template <int Dim> typename enable_if<Dim == 0, const I0&>::type
      index_object_() const { return i0_; }
      template <int Dim> typename enable_if<Dim == 1, const I1&>::type
      index_object_() const { return i1_; }
      template <int Dim> typename enable_if<Dim == 2, const I2&>::type
      index_object_() const { return i2_; }
      template <int Dim> typename enable_if<Dim == 3, const I3&>::type
      index_object_() const { return i3_; }
      template <int Dim> typename enable_if<Dim == 4, const I4&>::type
      index_object_() const { return i4_; }
      template <int Dim> typename enable_if<Dim == 5, const I5&>::type
      index_object_() const { return i5_; }
      template <int Dim> typename enable_if<Dim == 6, const I6&>::type
      index_object_() const { return i6_; }

      // The following sub-class "fastest_varying" enables the
      // definition of "a_fastest_varying_dim" static constant integer
      // that contains the dimension of Array that varies fastest when
      // progressing through memory and is not a singleton.  This
      // corresponds to the dimension "Rank-1" of IndexedArray.
      // The search starts at dimension 6 and steps down past every
      // dimension whose index type is a scalar integer, stopping at
      // the first one that is not; the specialization for dimension 0
      // ends the recursion with the value 0.
      template<int Dim, class X0,class X1,class X2,
	       class X3,class X4,class X5,class X6> 
      struct fastest_varying {
	static const int value
	  = is_scalar_int<typename index_alias<Dim,X0,X1,X2,X3,X4,X5,X6>::type>::value 
	  ? fastest_varying<Dim-1,X0,X1,X2,X3,X4,X5,X6>::value
	  : Dim;
      };
      template<class X0,class X1,class X2,class X3,class X4,class X5,class X6> 
      struct fastest_varying<0,X0,X1,X2,X3,X4,X5,X6> {
	static const int value = 0;
      };

      // Highest-numbered dimension of the Array whose index is not a
      // scalar integer
      static const int a_fastest_varying_dim 
        = fastest_varying<6,I0,I1,I2,I3,I4,I5,I6>::value;

      // ---------------------------------------------------------------------
      // Section 5.8. IndexedArray: Data
      // ---------------------------------------------------------------------
      // Reference to the array being indexed
      ArrayType& a_;
      // Individual indices to up to seven dimensions, held by
      // reference: the index objects must outlive this IndexedArray
      const I0& i0_;
      const I1& i1_;
      const I2& i2_;
      const I3& i3_;
      const I4& i4_;
      const I5& i5_;
      const I6& i6_;
      // Dimensions of the array being indexed (cannot be a reference
      // because FixedArrays do not store their dimensions explicitly)
      ExpressionSize<ArrayType::rank> a_dims_;
      // Dimensions of the IndexedArray
      ExpressionSize<Rank> dimensions_;
      // Separation of elements of the array object in the dimension
      // that varies fastest (set in the constructor from the array's
      // offset for dimension a_fastest_varying_dim)
      Index last_offset_;

    }; // End class IndexedArray

  } // End namespace internal
  
} // End namespace adept

#endif 


================================================
FILE: include/adept/Minimizer.h
================================================
/* Minimizer.h -- class for minimizing the cost function of an optimizable object

    Copyright (C) 2020 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#ifndef AdeptMinimizer_H
#define AdeptMinimizer_H 1

#include <adept/Optimizable.h>

namespace adept {

  // Identifiers for the minimization algorithms that may be selected
  // in a Minimizer. Values are assigned implicitly, counting up from
  // zero, so the final enumerator equals the number of algorithms
  // listed before it; new algorithms should be inserted before it.
  enum MinimizerAlgorithm {
    MINIMIZER_ALGORITHM_LIMITED_MEMORY_BFGS = 0,
    MINIMIZER_ALGORITHM_CONJUGATE_GRADIENT,    // Polak-Ribiere
    MINIMIZER_ALGORITHM_CONJUGATE_GRADIENT_FR, // Fletcher-Reeves
    MINIMIZER_ALGORITHM_LEVENBERG,
    MINIMIZER_ALGORITHM_LEVENBERG_MARQUARDT,
    MINIMIZER_ALGORITHM_NUMBER_AVAILABLE       // Count of the above, not an algorithm
  };

  // Status codes describing the outcome of a minimization or of a
  // line search. Values are assigned implicitly, counting up from
  // zero, so MINIMIZER_STATUS_NUMBER_AVAILABLE equals the number of
  // codes listed before it.
  enum MinimizerStatus {
    MINIMIZER_STATUS_SUCCESS = 0,
    MINIMIZER_STATUS_EMPTY_STATE,
    MINIMIZER_STATUS_MAX_ITERATIONS_REACHED,
    MINIMIZER_STATUS_FAILED_TO_CONVERGE,
    MINIMIZER_STATUS_DIRECTION_UPHILL,
    MINIMIZER_STATUS_BOUND_REACHED, // Only returned from line-search
    MINIMIZER_STATUS_INVALID_COST_FUNCTION,
    MINIMIZER_STATUS_INVALID_GRADIENT,
    MINIMIZER_STATUS_INVALID_BOUNDS,
    MINIMIZER_STATUS_NUMBER_AVAILABLE, // Count of the codes above
    // NOTE(review): deliberately placed after the count, so
    // presumably an internal "keep iterating" code rather than a
    // final result - confirm against minimizer_status_string
    MINIMIZER_STATUS_NOT_YET_CONVERGED
  };

  // Return a C string describing the minimizer status
  const char* minimizer_status_string(MinimizerStatus status);

  // Return the order of a minimization algorithm: 0 indicates only
  // the cost function is required, 1 indicates the first derivative
  // is required, 2 indicates the second derivative is required, while
  // -1 indicates that the algorithm is not recognized.
  inline int minimizer_algorithm_order(MinimizerAlgorithm algo) {
    // Algorithms driven by the first derivative (gradient) only
    if (algo == MINIMIZER_ALGORITHM_LIMITED_MEMORY_BFGS
	|| algo == MINIMIZER_ALGORITHM_CONJUGATE_GRADIENT
	|| algo == MINIMIZER_ALGORITHM_CONJUGATE_GRADIENT_FR) {
      return 1;
    }
    // Algorithms that also require the second derivative
    if (algo == MINIMIZER_ALGORITHM_LEVENBERG
	|| algo == MINIMIZER_ALGORITHM_LEVENBERG_MARQUARDT) {
      return 2;
    }
    // Any other value, including the count sentinel
    // MINIMIZER_ALGORITHM_NUMBER_AVAILABLE, is not recognized
    return -1;
  }

  // Convenience function for initializing vectors representing the
  // lower and upper bounds on state variables
  inline void minimizer_initialize_bounds(int nx, adept::Vector& x_lower,
					  adept::Vector& x_upper) {
    x_lower.resize(nx);
    x_upper.resize(nx);
    x_lower = -std::numeric_limits<Real>::max();
    x_upper =  std::numeric_limits<Real>::max();
  }

  // A class that can minimize a function using various algorithms
  class Minimizer {

  public:

    // Tedious C++98 initializations
    Minimizer(MinimizerAlgorithm algo) {
      initialize_default_settings();
      set_algorithm(algo);
    }

    Minimizer(const std::string& algo) {
      initialize_default_settings();
      set_algorithm(algo);
    }

    void initialize_default_settings() {
      max_iterations_ = 100; // <=0 means no limit
      max_step_size_ = -1.0;
      converged_gradient_norm_ = 0.1;
      ensure_updated_state_ = -1;
      levenberg_damping_min_ = 1.0/128.0;
      levenberg_damping_max_ = 100000.0;
      levenberg_damping_multiplier_ = 2.0;
      levenberg_damping_divider_ = 5.0;
      levenberg_damping_start_ = 0.0;
      levenberg_damping_restart_ = 1.0/4.0;
      max_line_search_iterations_ = 10;
      armijo_coeff_ = 1.0e-4;
      cg_curvature_coeff_ = 0.1;
      lbfgs_curvature_coeff_ = 0.9;
      lbfgs_n_states_ = 6;
    }

    // Unconstrained minimization
    MinimizerStatus minimize(Optimizable& optimizable, Vector x);
    // Constrained minimization
    MinimizerStatus minimize(Optimizable& optimizable, Vector x,
			     const Vector& x_lower, const Vector& x_upper);

    // Functions to set parameters defining the general behaviour of
    // minimization algorithms
    void set_algorithm(MinimizerAlgorithm algo) { algorithm_ = algo; }
    void set_algorithm(const std::string& algo);
    void set_max_iterations(int mi)             { max_iterations_ = mi; }
    void set_converged_gradient_norm(Real cgn)  { converged_gradient_norm_ = cgn; }
    void set_max_step_size(Real mss)            { max_step_size_ = mss; }

    // Ensure that the last call to compute the cost function uses the
    // "solution" state vector returned by minimize. This ensures that
    // any variables in user classes that inherit from Optimizable are
    // up to date with the returned state vector. The "order" argument
    // indicates which the order of derivatives required (provided
    // they are supported by the minimizing algorithm):
    // 0=cost_function, 1=cost_function_gradient,
    // 2=cost_function_gradient_hessian.
    void ensure_updated_state(int order = 2)    { ensure_updated_state_ = order; }
    
    // Return parameters defining behaviour of minimization algorithms
    MinimizerAlgorithm algorithm() { return algorithm_; }
    std::string algorithm_name();
    int max_iterations() { return max_iterations_; }
    Real converged_gradient_norm() { return converged_gradient_norm_; }      

    // Functions to set parameters defining the behaviour of the
    // Levenberg and Levenberg-Marquardt algorithm
    void set_levenberg_damping_limits(Real damp_min, Real damp_max);
    void set_levenberg_damping_start(Real damp_start);
    void set_levenberg_damping_restart(Real damp_restart);
    void set_levenberg_damping_multiplier(Real damp_multiply, Real damp_divide);

    // Functions to set parameters used by the L-BFGS and
    // Conjugate-Gradient algorithms
    void set_max_line_search_iterations(int mi) { max_line_search_iterations_ = mi; }
    void set_armijo_coeff(Real ac)              {
      if (ac <= 0.0 || ac >= 1.0) {
	throw optimization_exception("Armijo coefficient must be greater than 0 and less than 1");
      }
      else {
	armijo_coeff_ = ac;
      }
    }
    void set_lbfgs_curvature_coeff(Real lcc) {
      if (lcc <= 0.0 || lcc >= 1.0) {
	throw optimization_exception("L-BFGS curvature coefficient must be greater than 0 and less than 1");
      }
      else {
	lbfgs_curvature_coeff_ = lcc;
      }
    }
    void set_cg_curvature_coeff(Real cgcc) {
      if (cgcc <= 0.0 || cgcc >= 1.0) {
	throw optimization_exception("Conjugate-Gradient curvature coefficient must be greater than 0 and less than 1");
      }
      else {
	cg_curvature_coeff_ = cgcc;
      }
    }

    // Query aspects of the algorithm progress after it has completed
    int  n_iterations()        const { return n_iterations_; }
    int  n_samples()           const { return n_samples_; }
    Real cost_function()       const { return cost_function_; }
    Real gradient_norm()       const { return gradient_norm_; }
    Real start_cost_function() const { return start_cost_function_; }
    MinimizerStatus status()   const { return status_; }

  protected:

    // Specific minimization algorithms

    // The Limited-Memory Broyden-Fletcher-Goldfarb-Shanno algorithm
    MinimizerStatus 
    minimize_limited_memory_bfgs(Optimizable& optimizable, Vector x);
    MinimizerStatus
    minimize_limited_memory_bfgs_bounded(Optimizable& optimizable, Vector x,
					 const Vector& min_x,
					 const Vector& max_x);

    // The Conjugate-Gradient algorithm; Polak-Ribiere by default,
    // optionally Fletcher-Reeves
    MinimizerStatus
    minimize_conjugate_gradient(Optimizable& optimizable, Vector x,
				bool use_fletcher_reeves = false);
    MinimizerStatus
    minimize_conjugate_gradient_bounded(Optimizable& optimizable, Vector x,
					const Vector& min_x,
					const Vector& max_x,
					bool use_fletcher_reeves = false);

    // The Levenberg-Marquardt algorithm; if use_additive_damping is
    // true then the Levenberg algorithm is used instead
    MinimizerStatus
    minimize_levenberg_marquardt(Optimizable& optimizable, Vector x,
				 bool use_additive_damping = false);
    MinimizerStatus
    minimize_levenberg_marquardt_bounded(Optimizable& optimizable, Vector x,
					 const Vector& min_x,
					 const Vector& max_x,
					 bool use_additive_damping = false);

    // Perform line search starting at state vector "x" with gradient
    // vector "gradient", and initial step "step_size" in
    // un-normalized direction "direction". Successful minimization of
    // the function (according to Wolfe conditions) will lead to
    // MINIMIZER_STATUS_SUCCESS being returned, the new state stored
    // in "x", and if state_up_to_date >= 1 then the gradient stored
    // in "gradient". Other possible return values are
    // MINIMIZER_STATUS_FAILED_TO_CONVERGE and
    // MINIMIZER_STATUS_DIRECTION_UPHILL if the initial direction
    // points uphill, or MINIMIZER_STATUS_INVALID_COST_FUNCTION,
    // MINIMIZER_STATUS_INVALID_GRADIENT or
    // MINIMIZER_STATUS_BOUND_REACHED. First the minimum is bracketed,
    // then a cubic polynomial is fitted to the values and gradients
    // of the function at the two points in order to select the next
    // test point.
    MinimizerStatus
    line_search(Optimizable& optimizable, Vector x, const Vector& direction,
		Vector test_x, Real& abs_step_size,
		Vector gradient, int& state_up_to_date,
		Real curvature_coeff, Real bound_step_size = -1.0);

    // Compute the cost function "cf" and gradient vector "gradient",
    // along with the scalar gradient "grad" in the search direction
    // "direction" (normalized with "dir_scaling"), from the state
    // vector "x" plus a step "step_size" in the search direction. If
    // the resulting cost function and gradient satisfy the Wolfe
    // conditions for sufficient convergence, copy the new state
    // vector to "x" and the step size to "final_step_size", and
    // return MINIMIZER_STATUS_SUCCESS.  Otherwise, return
    // MINIMIZER_STATUS_NOT_YET_CONVERGED.  Error conditions
    // MINIMIZER_STATUS_INVALID_COST_FUNCTION and
    // MINIMIZER_STATUS_INVALID_GRADIENT are also possible.
    MinimizerStatus
    line_search_gradient_check(Optimizable& optimizable, Vector x, 
			       const Vector& direction,
			       Vector test_x, Real& final_step_size,
			       Vector gradient, int& state_up_to_date,
			       Real step_size, Real grad0, Real dir_scaling,
			       Real& cost_function, Real& grad,
			       Real curvature_coeff);

    // DATA

    // Minimizer type
    MinimizerAlgorithm algorithm_;

    // Variables controling the general behaviour of the minimizer,
    // used by all gradient-based algorithms
    int max_iterations_; // <=0 means no limit
    Real max_step_size_;
    Real converged_gradient_norm_;
    int ensure_updated_state_;

    // Variables controling the specific behaviour of the
    // Levenberg-Marquardt minimizer
    Real levenberg_damping_min_;
    Real levenberg_damping_max_;
    Real levenberg_damping_multiplier_;
    Real levenberg_damping_divider_;
    Real levenberg_damping_start_;
    Real levenberg_damping_restart_;

    // Variable used by the Conjugate-Gradient and L-BFGS minimizers
    int max_line_search_iterations_;
    // Armijo condition determined by this coefficient, the first of
    // the two Wolfe conditions
    Real armijo_coeff_;

    // Variables controlling the specific behaviour of the Conjugate
    // Gradient minimizer
    // Gradient in search direction must reduce by this amount
    Real cg_curvature_coeff_;

    // Variables controlling specific behaviour of L-BFGS minimizer
    // Gradient in search direction must reduce by this amount
    Real lbfgs_curvature_coeff_;
    // Number of prevous states to store
    int lbfgs_n_states_;

    // Variables set during the running of an algorithm and available
    // to the user afterwards

    // Number of iterations that successfully reduced the cost function
    int n_iterations_;

    // Number of calculations of the cost function
    int n_samples_;

    Real start_cost_function_;
    Real cost_function_;
    Real gradient_norm_;
    MinimizerStatus status_;
  };

  // Implement inline member functions

  // Functions to set parameters defining the behaviour of the
  // Levenberg and Levenberg-Marquardt algorithm
  inline void 
  Minimizer::set_levenberg_damping_limits(Real damp_min, Real damp_max) {
    // Both arguments are validated before either member is changed,
    // so a rejected call leaves the existing limits intact
    const bool min_not_positive = (damp_min <= 0.0);
    if (min_not_positive) {
      throw optimization_exception("Minimum damping factor in Levenberg-Marquardt algorithm must be positive");
    }
    const bool max_not_above_min = (damp_max <= damp_min);
    if (max_not_above_min) {
      throw optimization_exception("Maximum damping factor must be greater than minimum in Levenberg-Marquardt algorithm");
    }
    levenberg_damping_min_ = damp_min;
    levenberg_damping_max_ = damp_max;
  }
  inline void 
  Minimizer::set_levenberg_damping_start(Real damp_start) {
    // Zero is accepted; only strictly negative values are rejected
    const bool is_negative = (damp_start < 0.0);
    if (is_negative) {
      throw optimization_exception("Start damping factor in Levenberg-Marquardt algorithm must be positive or zero");
    }
    levenberg_damping_start_ = damp_start;
  }
  inline void 
  Minimizer::set_levenberg_damping_restart(Real damp_restart) {
    // Unlike the start value, the restart value may not be zero
    const bool not_positive = (damp_restart <= 0.0);
    if (not_positive) {
      throw optimization_exception("Restart damping factor in Levenberg-Marquardt algorithm must be positive");
    }
    levenberg_damping_restart_ = damp_restart;
  }
  inline void 
  Minimizer::set_levenberg_damping_multiplier(Real damp_multiply,
					      Real damp_divide) {
    // Each factor must exceed one; if either fails the test then
    // neither member is modified
    const bool multiplier_too_small = (damp_multiply <= 1.0);
    const bool divider_too_small    = (damp_divide <= 1.0);
    if (multiplier_too_small || divider_too_small) {
      throw optimization_exception("Damping multipliers in Levenberg-Marquardt algorithm must be greater than one");
    }
    levenberg_damping_multiplier_ = damp_multiply;
    levenberg_damping_divider_    = damp_divide;
  }

};

#endif


================================================
FILE: include/adept/Optimizable.h
================================================
/* Optimizable.h -- abstract base classes representing an optimization problem

    Copyright (C) 2020 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#ifndef AdeptOptimizable_H
#define AdeptOptimizable_H 1

#include <adept_arrays.h>

namespace adept {

  // A class representing an optimization problem that can be solved
  // by Adept's Minimizer class. The user should define their own
  // class that publicly inherits from Optimizable and overrides the
  // member functions calc_cost_function and provides_derivative.
  // This is the minimum requirement to use in gradient-free
  // minimization algorithms (e.g. Nelder-Mead). To use in
  // quasi-Newton and conjugate-gradient minimization algorithms, the
  // user should also override the member function
  // calc_cost_function_gradient. To use in Newton-type minimization
  // algorithms such as Gauss-Newton and Levenberg-Marquardt, the user
  // should also override the member function
  // calc_cost_function_gradient_hessian.  The user may optionally
  // override report_progress.
  class Optimizable {
  public:
    // Virtual destructor, so that an object of a child class can be
    // destroyed correctly through a pointer to Optimizable
    virtual ~Optimizable() { }

    // Return the cost function corresponding to the state vector x.
    // Pure virtual: every child class must implement it.
    virtual Real calc_cost_function(const adept::Vector& x) = 0;

    // Return the cost function corresponding to the state vector x,
    // and also set the "gradient" argument to the gradient of the
    // cost function with respect to each element of x.  This default
    // implementation always throws optimization_exception.
    // NOTE(review): "gradient" is an output yet is passed by value;
    // this presumably relies on a copied adept::Vector sharing the
    // caller's data - confirm against the Array class.
    virtual Real calc_cost_function_gradient(const adept::Vector& x,
					     adept::Vector gradient) {
      // If we get here then a gradient-based minimizer has been
      // applied to this class but the user has not implemented a
      // function to compute the gradient.
      throw optimization_exception("Gradient calculation has not been implemented");
    }
   
    // Return the cost function corresponding to the state vector x,
    // and set the "gradient" argument to the gradient of the cost
    // function with respect to each element of x, and "hessian" to
    // the second derivative of the cost function with respect to x.
    // This default implementation always throws
    // optimization_exception.
    virtual Real calc_cost_function_gradient_hessian(const adept::Vector& x,
		     adept::Vector gradient, adept::SymmMatrix& hessian) {
      // If we get here then a Newton-type minimizer has been applied
      // to this class but the user has not implemented a function to
      // compute the Hessian matrix.
      throw optimization_exception("Hessian calculation has not been implemented");
    }

    // This function is called at every iteration, and can be
    // overridden by child classes to report or store the progress at
    // each iteration, if required. By default it does nothing.
    // NOTE(review): the arguments are presumably the iteration
    // number, the current state vector, the current cost function
    // and the norm of its gradient - confirm against the callers.
    virtual void report_progress(int niter, const adept::Vector& x,
				 Real cost, Real gnorm) { }

    // Child classes should override this function to provide a
    // run-time mechanism to check which of the first and second
    // derivative (i.e. gradient and Hessian, respectively) are
    // available.  If only the gradient is available then it could be
    // implemented as: if (order == 0 || order == 1) { return true; }
    // else { return false; }
    // Pure virtual: every child class must implement it.
    virtual bool provides_derivative(int order) = 0;

  };

};

#endif


================================================
FILE: include/adept/Packet.h
================================================
/* Packet.h -- Vectorization support

    Copyright (C) 2016-2020 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

   A Packet contains a short vector of values, and when it is used in
   a limited set of arithmetic operations, the appropriate vector
   instructions will be used.  For example if your hardware and
   compiler support SSE2 then Packet<float> is a vector of 4x 4-byte
   floats while Packet<double> is a vector of 2x 8-byte floats. This
   header file also provides for allocating aligned data
*/

#ifndef AdeptPacket_H
#define AdeptPacket_H 1

#include <iostream>
#include <cstdlib>
#include <cmath>

// Headers needed for allocation of aligned memory
#include <new>

#ifdef __unix__
#include <unistd.h>  // Defines _POSIX_VERSION
#endif
#include <stdlib.h>
#ifdef _MSC_VER
#include <malloc.h> // Provides _aligned_malloc on Windows
#endif

#include <adept/quick_e.h>
#include <adept/base.h>

// -------------------------------------------------------------------
// Determine how many floating point values will be held in a packet
// -------------------------------------------------------------------

#ifndef ADEPT_FLOAT_PACKET_SIZE
#define ADEPT_FLOAT_PACKET_SIZE QE_LONGEST_FLOAT_PACKET
//static const int ADEPT_FLOAT_PACKET_SIZE = quick_e::longest_packet<float>::size;
#endif
#ifndef ADEPT_DOUBLE_PACKET_SIZE
#define ADEPT_DOUBLE_PACKET_SIZE QE_LONGEST_DOUBLE_PACKET
//static const int ADEPT_DOUBLE_PACKET_SIZE = quick_e::longest_packet<double>::size
#endif

// -------------------------------------------------------------------
// Determine how many floating point values will be held in packet of Real
// -------------------------------------------------------------------

#if ADEPT_REAL_TYPE_SIZE == 4
#define ADEPT_REAL_PACKET_SIZE ADEPT_FLOAT_PACKET_SIZE
#elif ADEPT_REAL_TYPE_SIZE == 8
#define ADEPT_REAL_PACKET_SIZE ADEPT_DOUBLE_PACKET_SIZE
#else
#define ADEPT_REAL_PACKET_SIZE 1
#endif

namespace adept {

  namespace internal {

    // Trait to define packet size
    // "size" is the number of values of type T held in one packet.
    // The primary template gives 1 for any type without a
    // specialization
    template <typename T> struct packet_traits
    { static const int size = 1; };
    // float and double take their sizes from the
    // ADEPT_FLOAT_PACKET_SIZE and ADEPT_DOUBLE_PACKET_SIZE macros
    template <> struct packet_traits<float>
    { static const int size = ADEPT_FLOAT_PACKET_SIZE; };
    template <> struct packet_traits<double>
    { static const int size = ADEPT_DOUBLE_PACKET_SIZE; };
    

    // -------------------------------------------------------------------
    // Define packet type
    // -------------------------------------------------------------------

    // Unfortunately, with C++98, unions cannot contain std::complex
    // because it has a constructor... therefore Packet inherits from
    // PacketData to contain the data in order that union is only used
    // for Packets of types that are actually vectorized (which are
    // floats and doubles).
    // Storage for a packet of more than one value: the intrinsic
    // vector "data" shares its memory, via an anonymous union, with
    // the array "value_" through which individual elements are
    // accessed.
    // NOTE(review): reading value_ after writing data relies on the
    // compiler supporting type punning through unions - confirm for
    // every supported compiler.
    template <typename T, class Enable = void>
    struct PacketData {
      // Static definitions
      static const int size = packet_traits<T>::size;
      typedef typename quick_e::packet<T,size>::type intrinsic_type;
      PacketData(intrinsic_type d) : data(d) { }
      union {
	intrinsic_type data;
	T value_[size];
      };
      // First element of the packet
      T value() const { return value_[0]; }
      // Element access; the index is not checked
      T& operator[](int i) { return value_[i]; }
      const T& operator[](int i) const { return value_[i]; }
    };
    // Storage used when packet_traits<T>::size is 1: a single value
    // of type T with no union
    template <typename T>
    struct PacketData<T, typename enable_if<packet_traits<T>::size == 1>::type>
    {
      // Static definitions
      static const int size = 1;
      typedef T intrinsic_type;
      PacketData(intrinsic_type d) : data(d) { }
      T data;
      T value() const { return data; }
      // There is only one element, so the index is ignored
      T& operator[](int i) { return data; }
      const T& operator[](int i) const { return data; }
    };
    
    // A short vector of packet_traits<T>::size values of type T,
    // stored in the PacketData<T> base class, whose arithmetic is
    // forwarded to functions in the quick_e namespace.
    template <typename T>
    struct Packet : public PacketData<T> {
      using PacketData<T>::data;
      static const int size = packet_traits<T>::size;
      typedef typename quick_e::packet<T,size>::type intrinsic_type;
      //      static const int intrinsic_size = 1; // What is this for?
      // The alignment is taken to be the size of the intrinsic type
      static const std::size_t alignment_bytes = sizeof(intrinsic_type);
      // All bits are set when the type is not vectorized (size == 1,
      // via conversion of -1 to std::size_t); otherwise the mask is
      // alignment_bytes-1
      static const std::size_t align_mask = (size == 1) ? -1 : alignment_bytes-1;
      static const bool        is_vectorized = (size > 1);
      // Constructors
      // The default constructor initializes from quick_e::set0
      Packet() : PacketData<T>(quick_e::set0<intrinsic_type>()) { }
      Packet(const Packet& d) : PacketData<T>(d.data) { }
      // Implicit construction from the intrinsic type; the enable_if
      // restricts this overload to arguments of exactly that type
      template <typename TT>
      Packet(TT d, typename enable_if<is_same<TT,intrinsic_type>::value,int>::type = 0)
	: PacketData<T>(d) { }
      // Construct from memory using quick_e::load
      // NOTE(review): presumably d must be suitably aligned, given
      // that put() has a separate put_unaligned() counterpart -
      // confirm against quick_e
      explicit Packet(const T* d) : PacketData<T>(quick_e::load<intrinsic_type>(d)) { }
      //      explicit Packet(T d) : PacketData<T>(quick_e::set1<intrinsic_type>(d)) { }
      // Explicit construction from a scalar of exactly type T using
      // quick_e::set1; only enabled for vectorized packets, since
      // otherwise T and intrinsic_type are the same type and the
      // constructor above applies
      template <typename TT>
      explicit Packet(TT d, typename enable_if<is_same<TT,T>::value&&is_vectorized,int>::type = 0)
	: PacketData<T>(quick_e::set1<intrinsic_type>(d)) { }
      // Member functions
      // Write the packet to memory using quick_e::store/storeu
      void put(T* __restrict d) const { quick_e::store(d, data); }
      void put_unaligned(T* __restrict d) const { quick_e::storeu(d, data); }
      //      void operator=(T d)              { data = quick_e::set1<intrinsic_type>(d); }
      // Assignment from any type other than Packet goes through
      // quick_e::set1.  Note that all assignment and compound
      // assignment operators return void, so cannot be chained.
      template <typename TT> //, typename enable_if<is_same<T,TT>::value||is_same<T,intrinsic_type>::value,int>::type = 0>
      void operator=(TT d)              { data = quick_e::set1<intrinsic_type>(d); }
      //      void operator=(intrinsic_type d) { data = d;       }
      void operator=(const Packet& d)  { data = d.data;  }
      void operator+=(const Packet& d) { data = quick_e::add(data, d.data); }
      void operator-=(const Packet& d) { data = quick_e::sub(data, d.data); }
      void operator*=(const Packet& d) { data = quick_e::mul(data, d.data); }
      void operator/=(const Packet& d) { data = quick_e::div(data, d.data); }
      // The intrinsic result of quick_e::neg is converted back to a
      // Packet by the implicit constructor above
      Packet operator-() const         { return quick_e::neg(data); }
      Packet operator+() const         { return *this; }
    };

    //#define QE_PACKET_ARG Packet<T>
    #define QE_PACKET_ARG const Packet<T>& __restrict
        
    // Default functions
    // Binary operations on two packets. Each forwards the "data"
    // members of its arguments to the quick_e function of the
    // corresponding name; the result is converted to Packet<T> on
    // return.
    template <typename T> Packet<T> operator+(QE_PACKET_ARG x, QE_PACKET_ARG y)
    { return quick_e::add(x.data,y.data); }
    template <typename T> Packet<T> operator-(QE_PACKET_ARG x, QE_PACKET_ARG y)
    { return quick_e::sub(x.data,y.data); }
    template <typename T> Packet<T> operator*(QE_PACKET_ARG x, QE_PACKET_ARG y)
    { return quick_e::mul(x.data,y.data); }
    template <typename T> Packet<T> operator/(QE_PACKET_ARG x, QE_PACKET_ARG y)
    { return quick_e::div(x.data,y.data); }
    template <typename T> Packet<T> fmin(QE_PACKET_ARG x, QE_PACKET_ARG y)
    { return quick_e::fmin(x.data,y.data); }
    template <typename T> Packet<T> fmax(QE_PACKET_ARG x, QE_PACKET_ARG y)
    { return quick_e::fmax(x.data,y.data); }
    // Square root of a packet. Both std::sqrt and quick_e::sqrt are
    // brought into scope so that overload resolution selects between
    // them according to the type of x.data.
    template <typename T> Packet<T> sqrt(QE_PACKET_ARG x) {
      using std::sqrt;
      using quick_e::sqrt;
      return sqrt(x.data);
    }
    // fastexp always uses the exponential from quick_e
    template <typename T> Packet<T> fastexp(QE_PACKET_ARG x) {
      return quick_e::exp(x.data);
    }
    // exp uses quick_e::exp only if ADEPT_FAST_EXPONENTIAL is
    // defined, otherwise std::exp
#ifdef ADEPT_FAST_EXPONENTIAL
    template <typename T> Packet<T> exp(QE_PACKET_ARG x) {
      return quick_e::exp(x.data);
    }
#else
    // NOTE(review): std::exp is applied to the intrinsic type, so it
    // looks like this version can only be instantiated for packets
    // whose intrinsic type is a scalar - confirm
    template <typename T> Packet<T> exp(QE_PACKET_ARG x) {
      return std::exp(x.data);
    }
#endif

    // Functions returning a single value of type T from a packet,
    // each forwarding to a quick_e function; note that hprod forwards
    // to quick_e::hmul.
    // NOTE(review): presumably the sum, product, minimum and maximum
    // of the elements of the packet - confirm against quick_e
    template <typename T> T hsum(QE_PACKET_ARG x)  { return quick_e::hsum(x.data); }
    template <typename T> T hprod(QE_PACKET_ARG x) { return quick_e::hmul(x.data); }
    template <typename T> T hmin(QE_PACKET_ARG x)  { return quick_e::hmin(x.data); }
    template <typename T> T hmax(QE_PACKET_ARG x)  { return quick_e::hmax(x.data); }

    // Write a packet to a stream in the form "{ v0 v1 ... vN}": each
    // element is preceded by a single space and nothing separates the
    // last element from the closing brace
    template <typename T>
    std::ostream& operator<<(std::ostream& os, QE_PACKET_ARG x) {
      const int n_elements = Packet<T>::size;
      os << "{";
      int k = 0;
      while (k < n_elements) {
	os << " " << x[k];
	++k;
      }
      return os << "}";
    }

    // -------------------------------------------------------------------
    // Aligned allocation and freeing of memory
    // -------------------------------------------------------------------
    // Allocate an array of n objects of type Type, aligned to
    // Packet<Type>::alignment_bytes where the platform provides a
    // means to do so. Memory obtained here must be released with
    // free_aligned, which mirrors the branches below. Throws
    // std::bad_alloc if an aligned allocation fails.
    template <typename Type>
    inline
    Type* alloc_aligned(Index n) {
      const std::size_t alignment = Packet<Type>::alignment_bytes;
      if (alignment < sizeof(void*)) {
	// Note that the requested byte alignment passed to
	// posix_memalign must be at least sizeof(void*), so use
	// ordinary array allocation in this case
	return new Type[n];
      }
#if defined(_POSIX_VERSION) && (_POSIX_VERSION >= 200112L)
      // POSIX systems recent enough to provide posix_memalign
      void* block = 0;
      if (posix_memalign(&block, alignment, n*sizeof(Type)) != 0) {
	throw std::bad_alloc();
      }
      return static_cast<Type*>(block);
#elif !defined(_POSIX_VERSION) && defined(_MSC_VER)
      // Microsoft compiler on a non-POSIX system
      void* block = _aligned_malloc(n*sizeof(Type), alignment);
      if (block == 0) {
	throw std::bad_alloc();
      }
      return static_cast<Type*>(block);
#else
      // No aligned allocator available (including older POSIX
      // systems): fall back to ordinary array allocation
      return new Type[n];
#endif
    }
    
    // Release memory obtained from alloc_aligned<Type>. The branches
    // must select exactly as in alloc_aligned in order that new[] is
    // followed by delete[], posix_memalign by free, and
    // _aligned_malloc by _aligned_free.
    template <typename Type>
    inline
    void free_aligned(Type* data) {
      const bool allocated_with_new
	= (Packet<Type>::alignment_bytes < sizeof(void*));
      if (allocated_with_new) {
	delete[] data;
	return;
      }
#if defined(_POSIX_VERSION) && (_POSIX_VERSION >= 200112L)
      free(data);
#elif !defined(_POSIX_VERSION) && defined(_MSC_VER)
      _aligned_free(data);
#else
      delete[] data;
#endif
    }


    // -------------------------------------------------------------------
    // Check if templated object is a packet: is_packet
    // -------------------------------------------------------------------
    // Primary template: "value" is false for any type...
    template <typename T>
    struct is_packet {
      static const bool value = false;
    };
    // ...unless the type is an instantiation of Packet, which matches
    // this partial specialization
    template <typename T>
    struct is_packet<Packet<T> > {
      static const bool value = true;
    };

  } // End namespace internal


  // -------------------------------------------------------------------
  // Fast exponential function
  // -------------------------------------------------------------------

#ifdef ADEPT_FAST_SCALAR_EXPONENTIAL
  // Bring scalar exp from quick_e into this namespace
  // (only if ADEPT_FAST_SCALAR_EXPONENTIAL has been defined)
  inline float  exp(float x)  { return quick_e::exp(x); }
  inline double exp(double x) { return quick_e::exp(x); }
#endif
  // fastexp is always available and always forwards to quick_e::exp,
  // irrespective of the preprocessor settings
  inline float  fastexp(float x)  { return quick_e::exp(x); }
  inline double fastexp(double x) { return quick_e::exp(x); }

  // This namespace is only for use in array operations
  namespace functions {
    // Scalar exp for float and double: forwards to quick_e::exp if
    // ADEPT_FAST_EXPONENTIAL is defined, otherwise to std::exp
#ifdef ADEPT_FAST_EXPONENTIAL
    // Bring scalar exp from quick_e into this namespace
    inline float  exp(float x)  { return quick_e::exp(x); }
    inline double exp(double x) { return quick_e::exp(x); }
#else
    inline float  exp(float x)  { return std::exp(x); }
    inline double exp(double x) { return std::exp(x); }
#endif
  }

} // End namespace adept

#endif


================================================
FILE: include/adept/RangeIndex.h
================================================
/* RangeIndex.h -- Helper classes to enable indexing of arrays

    Copyright (C) 2015-2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.


   If an Array is indexed via A(i,j,...) then there are three possible
   return values: (1) a scalar, if all indices are scalar integers
   (including 0-rank expressions such as "end"); (2) an Array that
   links to a subset of the data in the original Array, if one or more
   of the indices is a RangeIndex object and all the rest are scalar
   integers; and (3) an IndexedArray object, if one or more of the
   indices is a vector of integers.  All of these return values can be
   used on the left-hand-side of an expression.

   This file defines the RangeIndex class and associated helper types
   that facilitate the second case.  A RangeIndex object expresses a
   sequence of regularly spaced integers, which may have a separation
   greater than 1 or a negative separation.  Since an Array need not
   be contiguous in memory, when an Array is indexed by one or more
   RangeIndex objects the result is also a valid Array.  RangeIndex
   objects are created by the range(begin,end) and
   stride(begin,end,stride) functions.

   This file also includes the EndIndex class to enable the use of
   "end" to express the final element of an array dimension being
   indexed (as in Matlab), and the AllIndex class to enable the use of
   "__" to express all elements of a dimension (as ":" in Fortran 90
   and Matlab).

*/


#ifndef AdeptRangeIndex_H
#define AdeptRangeIndex_H 1

#include <adept/Expression.h>

namespace adept {

  namespace internal {
    // ---------------------------------------------------------------------
    // Section 1. EndIndex: enable Matlab-like "end" indexing
    // ---------------------------------------------------------------------

    // When an integer Expression is used as the index to another
    // expression, make "end" (or "adept::end") be interpreted as the
    // index of the final element of the array dimension being
    // referred to. If a whole multi-dimensional array is referred to
    // by a single integer Expression, then "end" is resolved to
    // len-1 ("len" being the length of the dimension being indexed).
    // "end" is actually an instantiation of the "EndIndex" class, a
    // rank-0 expression.
    // Rank-0, inactive expression of type Index whose value can only
    // be resolved when the length of the dimension being indexed is
    // supplied (see value_with_len_)
    struct EndIndex : public Expression<Index, EndIndex>
    {
      // Static definitions
      static const int  rank       = 0;
      static const bool is_active  = false;
      static const int  n_scratch  = 0;
      static const int  n_arrays   = 0;
      static const int  n_active   = 0;
      
      // Functions to implement Expression behaviour

      // Always reports success; there are no dimensions to write to
      // the rank-0 "dim"
      bool get_dimensions_(ExpressionSize<0>& dim) const
      { return true; }
      
      std::string expression_string_() const
      { return std::string("end"); }

      // Always returns false, whatever memory range is passed in
      bool is_aliased_(const Index* mem1, const Index* mem2) const
      { return false; }

      // Resolve "end" to the index of the final element of a
      // dimension of length "len"; the first argument is ignored
      Index value_with_len_(const Index& j, const Index& len) const
      { return len-1; }

      // Note that "end" can only be used as an index to an array or
      // expression: when used in any other context it will fail.
      // This function always throws array_exception.
      template <int Rank>
      Index value_at_location_(const ExpressionSize<Rank>&) const
      { throw array_exception("Cannot determine to which object the \"end\" index refers to"
			      ADEPT_EXCEPTION_LOCATION); }
    };
    
    // ---------------------------------------------------------------------
    // Section 2. get_index_with_len
    // ---------------------------------------------------------------------
    // We want range(x,y) and stride(x,y,z) to work for integer
    // arguments or for 0-rank expressions (including "end" and
    // constructs such as "end - 1"), so define the following helper
    // function. For an integer first argument, "get_index_with_len"
    // just returns the first argument, but for 0-rank expressions of
    // integer type, the second argument "len" is passed in and if the
    // expression contains an "end" then this resolves to len-1.

#ifndef ADEPT_BOUNDS_CHECKING
    inline Index get_index_with_len(Index j, Index) { return j; }

    template <typename T, class E>
    inline
    typename enable_if<std::numeric_limits<T>::is_integer
		       && E::rank == 0, Index>::type
    get_index_with_len(const Expression<T,E>& j, Index len) {
      return j.value_with_len(0, len);
    }
#else
    // Bounds-checking versions
    // Integer index with bounds checking: return j unchanged if it
    // lies in the range 0 to len-1, otherwise throw
    // index_out_of_bounds.  The exception carries a message and the
    // source location, in the same way as the overload for scalar
    // expressions.
    inline Index get_index_with_len(Index j, Index len) {
      if (j < 0 || j >= len) {
        throw index_out_of_bounds("Array index is out of bounds"
                                  ADEPT_EXCEPTION_LOCATION);
      }
      return j;
    }

    // Rank-0 integer expression with bounds checking: evaluate the
    // expression for a dimension of length "len" and return the
    // result only if it lies in the range 0 to len-1; otherwise throw
    // index_out_of_bounds
    template <typename T, class E>
    inline
    typename enable_if<std::numeric_limits<T>::is_integer
                       && E::rank == 0, Index>::type
    get_index_with_len(const Expression<T,E>& j, Index len) {
      const Index resolved = j.value_with_len(0, len);
      const bool in_bounds = (resolved >= 0 && resolved < len);
      if (!in_bounds) {
        throw index_out_of_bounds("Array index (probably generated from a scalar expression containing \"end\") is out of bounds"
                                  ADEPT_EXCEPTION_LOCATION);
      }
      return resolved;
    }
#endif

    // get_stride_with_len mirrors get_index_with_len, but a stride is
    // never bounds checked.  For an integer stride the value is
    // returned unchanged and the dimension length is ignored.
    inline Index get_stride_with_len(Index j, Index /* len */) {
      return j;
    }

    // Stride given as a rank-0 integer expression: evaluate it for a
    // dimension of length "len", with no bounds checking
    template <typename T, class E>
    inline
    typename enable_if<std::numeric_limits<T>::is_integer
                       && E::rank == 0, Index>::type
    get_stride_with_len(const Expression<T,E>& j, Index len) {
      const Index resolved = j.value_with_len(0, len);
      return resolved;
    }

    // ---------------------------------------------------------------------
    // Section 3. get_value
    // ---------------------------------------------------------------------
    // If a RangeIndex object is not to be used as an index to an
    // array, we may wish to access its elements without consideration
    // of the length of a dimension.

    // An integer is its own value
    inline Index get_value(Index j) {
      return j;
    }

    // Rank-0 integer expression: evaluate it as a scalar, with no
    // dimension length supplied
    template <typename T, class E>
    inline
    typename enable_if<std::numeric_limits<T>::is_integer
                       && E::rank == 0, Index>::type
    get_value(const Expression<T,E>& j) {
      const Index result = j.scalar_value();
      return result;
    }

    // ---------------------------------------------------------------------
    // Section 3. RangeIndex class
    // ---------------------------------------------------------------------
    // A class to store a range of integers, optionally with a fixed
    // stride, for simple indexing of arrays. 
    template<class BeginType, class EndType, class StrideType>
    class RangeIndex
      : public Expression<Index, RangeIndex<BeginType, EndType, StrideType> >
    {
    public:
      // Expression traits: an inactive rank-1 expression that uses
      // one location slot (n_arrays) to hold the current element
      // number during traversal
      static const int  rank       = 1;
      static const bool is_active  = false;
      static const int  n_scratch  = 0;
      static const int  n_arrays   = 1;
      static const int  n_active   = 0;
      
      // Construct with a specified stride
      RangeIndex(const BeginType& begin, const EndType& end, 
                 const StrideType& stride)
        : begin_(begin), end_(end), stride_(stride)
      { }

      // Construct without a specified stride: defaults to 1
      RangeIndex(const BeginType& begin, const EndType& end)
        : begin_(begin), end_(end), stride_(1)
      { }

      // Number of elements in the range, counting both end points,
      // when begin, end and stride are evaluated without reference to
      // the length of any dimension
      Index size() const 
      { return (end() - begin() + stride()) / stride(); }

      // As size(), but with begin, end and stride resolved against a
      // dimension of length "len"
      Index size_with_len_(const Index& len) const
      { return (end(len) - begin(len) + stride(len)) / stride(len); }

      // Report the single dimension of this expression
      bool get_dimensions_(ExpressionSize<1>& dim) const {
        dim[0] = size();
        return true;
      }

      // Write as "(begin:end)", or "(begin:end:stride)" if the stride
      // is not 1
      std::string expression_string_() const {
        std::stringstream s;
        s << "(" << begin() << ":" << end();
        Index str = stride();
        if (str != 1) {
          s << ":" << str;
        }
        s << ")";
        return s.str();
      }

      // Always reports no aliasing, whatever the two pointers are
      bool is_aliased_(const Index* mem1, const Index* mem2) const {
        return false;
      }

      bool all_arrays_contiguous_() const { return true; }

      // When this object is used as an index to another, the
      // following version of the function is called, in which the
      // "len" element is specified in order for the "end" index
      // specifier to work
      Index value_with_len_(const Index&j, const Index& len) const 
      { return begin(len) + stride(len)*j; }

      // Advance the location of each array in the expression
      template <int MyArrayNum, int NArrays>
      void advance_location_(ExpressionSize<NArrays>& loc) const {
        ++loc[MyArrayNum];
      }

      // Move to element i of the range.  The element number is kept
      // in this object's location slot, which is the value that
      // advance_location_ increments and value_at_location_ reads, so
      // it must be initialized here.
      template <int MyArrayNum, int NArrays>
      void set_location_(const ExpressionSize<1>& i, 
                         ExpressionSize<NArrays>& index) const {
        index[MyArrayNum] = i[0];
      }

      // Give the value at a particular offset
      template <int MyArrayNum, int NArrays>
      Index value_at_location_(const ExpressionSize<NArrays>& j) const 
      { return begin() + stride()*j[MyArrayNum]; }

      // Access the beginning, end and stride without reference to the
      // length of a dimension
      Index begin()  const { return get_value(begin_);  }
      Index end()    const { return get_value(end_);    }
      Index stride() const { return get_value(stride_); }
      // Access the beginning, end and stride, where the argument
      // gives the length of the dimension in case any of these is
      // expressed with respect to "end" (which resolves to length-1)
      Index begin(Index len) const
      { return get_index_with_len(begin_, len); }
      Index end(Index len) const 
      { return get_index_with_len(end_, len); }
      Index stride(Index len) const
      { return get_stride_with_len(stride_, len); }

    private:
      // Note that a copy rather than a reference to the Expression or
      // int is stored: this is because if range(i1, i2) is used as
      // the index to another object, then a temporary object will be
      // created that will be destroyed immediately after calling the
      // RangeIndex constructor (following ANSI C++ rules), so a
      // reference would then point to invalid data.
      // FIX!!!
      const BeginType begin_;
      const EndType end_;
      const StrideType stride_;
    };

    // ---------------------------------------------------------------------
    // Section 4. AllIndex class
    // ---------------------------------------------------------------------
    // A class to represent all elements along one dimension, for simple
    // indexing of arrays with "__" (equivalent to ":" in Fortran).
    class AllIndex : public Expression<Index, AllIndex>
    {
    public:
      // Expression traits.  "n_scratch" is the name used by the other
      // index classes in this file (EndIndex, RangeIndex);
      // "n_static_" is retained only so that any existing reference
      // to it continues to compile.
      static const int  rank      = 1;
      static const bool is_active = false;
      static const int  n_active  = 0;
      static const int  n_scratch = 0;
      static const int  n_static_ = 0;
      static const int  n_arrays  = 0;

      // Unknown!
      //      bool get_dimensions_(ExpressionSize<1>& dim) const { return true; }      

      // Text used to represent this index when an expression is
      // written out as a string
      std::string expression_string_() const { return std::string("__"); }

      // Always reports no aliasing, whatever the two pointers are
      bool is_aliased_(const Index* mem1, const Index* mem2) const { return false; }

      // Indexing a dimension of length "len" with "__" selects all
      // "len" elements...
      Index size_with_len_(const Index& len) const
      { return len; }

      // ...and element j of the selection is element j of the
      // dimension
      Index value_with_len_(const Index& j, const Index& len) const
      { return j; }

      Index value_at_location_(const ExpressionSize<1>& loc) const
      { return loc[0]; }
      
      // Begin, end and stride in the same form as provided by
      // RangeIndex.  Only end() depends on the length of the
      // dimension, so only end() requires it as an argument.
      Index begin(Index len = -1) const { return 0; }
      Index end(Index len) const { return len-1; }
      Index stride(Index len = -1) const { return 1; }
    };


    // is_range<T>::value is true if T is of type RangeIndex or
    // AllIndex
    template <typename T>
    struct is_range {
      // General case: T is not a range type, so it adds nothing to a
      // count of ranged dimensions
      static const int  count = 0;
      static const bool value = (count != 0);
    };
    // AllIndex ("__") is a range type and counts as one ranged
    // dimension
    template <>
    struct is_range<AllIndex> {
      static const int  count = 1;
      static const bool value = (count != 0);
    };
    // Any RangeIndex is a range type and counts as one ranged
    // dimension
    template <class B, class E, class S>
    struct is_range<RangeIndex<B,E,S> > {
      static const int  count = 1;
      static const bool value = (count != 0);
    };
    
    // is_regular_index<T>::value is true if T is a valid index to a
    // dimension of an Array such that the indexed object is also an
    // Array
    template <typename T>
    struct is_regular_index {
      // T is rejected only if it is none of: a scalar integer, the
      // null_type placeholder, or a range type
      static const bool value = !(   !is_scalar_int<T>::value
                                  && !is_null_type<T>::value
                                  && !is_range<T>::value);
    };

    // is_ranged<>::value is true if at least one of the template
    // arguments I0 to I[Rank-1] is of type RangeIndex, and all others
    // are of integer type
    template <int Rank, typename I0, typename I1 = null_type, 
	      typename I2 = null_type, typename I3 = null_type,
	      typename I4 = null_type, typename I5 = null_type,
	      typename I6 = null_type>
    struct is_ranged {
      static const bool value = (is_range<I0>::value || is_range<I1>::value
			      || is_range<I2>::value || is_range<I3>::value
			      || is_range<I4>::value || is_range<I5>::value
			      || is_range<I6>::value)
	&& Rank == 7 - (  is_null_type<I1>::count + is_null_type<I2>::count
			+ is_null_type<I3>::count + is_null_type<I4>::count
			+ is_null_type<I5>::count + is_null_type<I6>::count)
	&& (   is_regular_index<I0>::value && is_regular_index<I1>::value
	    && is_regular_index<I2>::value && is_regular_index<I3>::value
	    && is_regular_index<I4>::value && is_regular_index<I5>::value
	    && is_regular_index<I6>::value);
      static const int count = is_range<I0>::count + is_range<I1>::count
	+ is_range<I2>::count + is_range<I3>::count + is_range<I4>::count
	+ is_range<I5>::count + is_range<I6>::count;
    };


  } // End namespace internal

  // User-accessible functions and objects

  // Declaration of the user-visible "end" index object, of type
  // internal::EndIndex.  This is a declaration only: the object
  // itself is defined in a source file.
  extern ::adept::internal::EndIndex end;

  // Declaration of the user-visible "__" index object, of type
  // internal::AllIndex, which selects all elements along a
  // dimension.  The object itself is defined in a source file.
  // NOTE(review): identifiers containing a double underscore are
  // reserved for the implementation by the C++ standard; the name is
  // part of the public interface so it is left unchanged here.
  extern ::adept::internal::AllIndex __;

  // Build a RangeIndex from "begin" to "end" with an integer stride
  // of 1; each argument may be an integer or an Expression
  template<class BeginType, class EndType>
  inline
  adept::internal::RangeIndex<BeginType, EndType, int>
  range(const BeginType& begin, const EndType& end)
  {
    typedef adept::internal::RangeIndex<BeginType, EndType, int> range_type;
    return range_type(begin, end, 1);
  }

  // Build a RangeIndex from "begin" to "end" in which consecutive
  // elements are "stride" apart
  template<class BeginType, class EndType, class StrideType>
  inline
  adept::internal::RangeIndex<BeginType, EndType, StrideType>
  stride(const BeginType& begin, const EndType& end,
         const StrideType& stride)
  {
    typedef adept::internal::RangeIndex<BeginType, EndType,
                                        StrideType> range_type;
    return range_type(begin, end, stride);
  }

} // End namespace adept

#endif


================================================
FILE: include/adept/ScratchVector.h
================================================
/* ScratchVector.h -- Class for holding temporary real data

    Copyright (C) 2015-2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.


   The ScratchVector class is used to store a temporary vector of real
   numbers (by default the type "Real", but could also be
   Packet<Real>) for use in optimally evaluating an expression and
   computing its derivative.  Certain parts of the expression need to
   store their numerical value when first computed since it will be
   needed again in the derivative computation.  In Adept 1.x such data
   were stored in the expression objects themselves, e.g. in
   adept::Multiply, but now that it is not clear at the level of an
   individual operation whether vectorization will be possible
   (requiring Packet<Real>), the storage for such scratch data must be
   held externally.

*/

#ifndef AdeptScratchVector_H
#define AdeptScratchVector_H

#include <limits>

#include <adept/base.h>
#include <adept/traits.h>

namespace adept {

  namespace internal {

    // Definition of ScratchVector class
    // Fixed-size vector of "Size" elements of type "Type", held
    // directly in the object
    template <int Size, typename Type = Real>
    class ScratchVector {
    public:
      // Constructors

      // By default no initialization is done, unless ADEPT_INIT_REAL
      // is defined, in which case floating-point elements are set to
      // that value
      ScratchVector() {
#ifdef ADEPT_INIT_REAL
        initialize<Type>();
#endif
      }

#ifdef ADEPT_INIT_REAL
      // Selected when T is a floating-point type: set every element
      // to ADEPT_INIT_REAL
      template <typename T>
      typename internal::enable_if<internal::is_floating_point<T>::value, void>::type
      initialize() {
        for (int is = 0; is < Size; ++is) {
          val[is] = ADEPT_INIT_REAL;
        }
      }
      // Selected for all other types: leave the elements alone
      template <typename T>
      typename internal::enable_if<!internal::is_floating_point<T>::value, void>::type
      initialize() { }
#endif

      // Set all elements to the same value
      ScratchVector(Type x) {
        set_all(x);
      }

      // Specify the values of all elements: the first "Size" elements
      // pointed to by x are copied.  The source is only read, so it
      // is taken as pointer-to-const, which also permits construction
      // from constant data.
      ScratchVector(const Type x[Size]) {
        for (int i = 0; i < Size; ++i) {
          val[i] = x[i];
        }
      }

      // Assume copy constructor will copy elements of val
    
      // Set all to specified value
      void set_all(Type x) {
        for (int i = 0; i < Size; ++i) {
          val[i] = x;
        }
      }

      // Copy from a ScratchVector object of the same size and type
      void copy(const ScratchVector& d) {
        for (int i = 0; i < Size; ++i) {
          val[i] = d[i];
        }
      }
      // ...or pointer to raw data, from which "Size" elements are read
      void copy(const Type* d) {
        for (int i = 0; i < Size; ++i) {
          val[i] = d[i];
        }
      }

      // Write out contents for debugging, as a comma-separated list
      // in curly brackets followed by a newline.  The first element
      // is written unconditionally.
      std::ostream& write(std::ostream& os) const {
        os << "{" << val[0];
        for (int i = 1; i < Size; ++i) {
          os << "," << val[i];
        }
        return os << "}\n";
      }

      // Const and non-const access to elements; the index is not
      // checked
      Type& operator[](int i) { return val[i]; }

      const Type& operator[](int i) const { return val[i]; }

      // Data
    private:
      Type val[Size];
    };
  
    // Specialization for scalars (zero-rank arrays) known at compile
    // time
    // NOTE(review): this is a full specialization, so it applies only
    // to the default element type; a zero-size ScratchVector of any
    // other element type would presumably use the primary template -
    // confirm whether that case arises.
    template <>
    class ScratchVector<0> {
    public:
      // There are no elements, so there is nothing to initialize
      ScratchVector() { }
      // An initial value of any type is accepted and ignored
      template <typename T>
      ScratchVector(T) { }
      // Write out the (empty) contents for debugging
      std::ostream& write(std::ostream& os) const {
        os << "{}\n";
        return os;
      }
    };

    // Write out all elements for debugging
    // Stream a ScratchVector of any size and element type by calling
    // its write() member function.  The element type is a template
    // argument so that this works for more than the default type.
    template <int Size, typename Type>
    inline
    std::ostream& operator<<(std::ostream& os,
                             const ScratchVector<Size,Type>& s) {
      return s.write(os);
    }
   
 
  } // End namespace internal

} // End namespace adept

#endif // AdeptScratchVector_H


================================================
FILE: include/adept/SpecialMatrix.h
================================================
/* SpecialMatrix.h -- Active or inactive symmetric and band-diagonal matrices

    Copyright (C) 2015-2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.


   The SpecialMatrix is the basis for a wide range of matrix types
   such as SquareMatrix, DiagonalMatrix, TridiagonalMatrix,
   SymmetricMatrix etc.

*/

#ifndef AdeptSpecialMatrix_H
#define AdeptSpecialMatrix_H 1

#include <iostream>
#include <sstream>
#include <limits>

#include <adept/base.h>
#include <adept/Storage.h>
#include <adept/Expression.h>
#include <adept/RangeIndex.h>
#include <adept/ActiveReference.h>
#include <adept/Array.h>
#include <adept/FixedArray.h>

namespace adept {

  // -------------------------------------------------------------------
  // SpecialMatrix Engine helper classes
  // -------------------------------------------------------------------
  // Selects which of the two symmetric-matrix storage orientations
  // is used by the symmetric matrix engine
  enum SymmMatrixOrientation {
    ROW_LOWER_COL_UPPER = 0,
    ROW_UPPER_COL_LOWER = 1
  };

  namespace internal {

    // -------------------------------------------------------------------
    // Conventional matrix storage engine
    // -------------------------------------------------------------------

    // The SpecialMatrix class is assisted by data-free policy classes
    // that define the behaviour of different matrix types. The first
    // most basic one is for square matrices. Comments are provided
    // for the first one only to explain the meaning of each
    // function. The default here is ROW_MAJOR; the alternative
    // COL_MAJOR is provided as a specialization of this class.
    template <MatrixStorageOrder Order>
    struct SquareEngine {
      // Number of location variables a SpecialMatrix of this type
      // stores when it appears on the right-hand-side of an
      // expression
      static const int my_n_arrays = 1;

      // Short description of the matrix type, for
      // SpecialMatrix::expression_string()
      const char* name() const {
        return "SquareMatrix";
      }

      // Full description of the matrix type, for
      // SpecialMatrix::info_string()
      std::string long_name() const {
        return "SquareMatrix<ROW_MAJOR>";
      }

      // Offset (separation in memory of elements along the slowest
      // varying dimension) to use for "packed" data, i.e. for a
      // matrix created by SpecialMatrix::resize rather than one that
      // is a submatrix of something larger
      Index pack_offset(Index dim) const {
        return dim;
      }

      // Memory index of the element at row i, column j
      Index index(Index i, Index j, Index offset) const {
        const Index row_start = i*offset;
        return row_start + j;
      }

      // Separation in memory of consecutive elements when moving
      // along a row
      template <int MyArrayNum, int NArrays>
      Index row_offset(Index offset, const ExpressionSize<NArrays>& loc) const {
        return 1; 
      }

      // Used when a SpecialMatrix is on the left-hand-side of an
      // expression.  For row i, j_start and j_end_plus_1 receive the
      // range of columns holding unique elements, index_start the
      // memory location of the element at column j_start, and
      // index_stride the separation in memory of consecutive elements
      // in that range.
      void get_row_range(Index i, Index dim, Index offset,
                         Index& j_start, Index& j_end_plus_1,
                         Index& index_start, Index& index_stride) const {
        // All "dim" columns of the row are stored...
        j_start      = 0;
        j_end_plus_1 = dim;
        // ...next to each other, starting at the beginning of the row
        index_start  = i*offset;
        index_stride = 1;
      }

      // Value at row i, column j as an rvalue: inactive version
      template <bool IsActive, typename Type>
      typename internal::enable_if<!IsActive,Type>::type
      get_scalar(Index i, Index j, Index dim, Index offset, 
                 Index gradient_index, const Type* data) const {
        const Index ind = index(i,j,offset);
        return data[ind];
      }

      // Value at row i, column j as an rvalue: active version
      template <bool IsActive, typename Type>
      typename internal::enable_if<IsActive,Active<Type> >::type
      get_scalar(Index i, Index j, Index dim, Index offset, 
                 Index gradient_index, const Type* data) const {
        const Index ind = index(i,j,offset);
        return Active<Type>(data[ind]);
      }

      // Element at row i, column j as an lvalue: inactive version
      template <bool IsActive, typename Type>
      typename internal::enable_if<!IsActive,Type&>::type
      get_reference(Index i, Index j, Index dim, Index offset, 
                    Index gradient_index, Type* data) {
        const Index ind = index(i,j,offset);
        return data[ind];
      }

      // Element at row i, column j as an lvalue: active version, in
      // which the gradient index of the element is the gradient index
      // passed in plus the memory index of the element
      template <bool IsActive, typename Type>
      typename internal::enable_if<IsActive,ActiveReference<Type> >::type
      get_reference(Index i, Index j, Index dim, Index offset, 
                    Index gradient_index, Type* data) {
        const Index mem = index(i,j,offset);
        return ActiveReference<Type>(data[mem], gradient_index+mem);
      }

      // Number of elements stored for a dim x dim SpecialMatrix.
      // Used by SpecialMatrix::resize to decide how much memory to
      // allocate, and by SpecialMatrix::is_aliased to find the memory
      // range spanned by the object.
      Index data_size(Index dim, Index offset) const {
        const Index before_last_row = (dim-1)*offset;
        return before_last_row + dim;
      }

      // Memory offset of the start of a superdiagonal (offdiag > 0)
      Index upper_offset(Index dim, Index offset, Index offdiag) const {
        return offdiag;
      }

      // Memory offset of the start of a subdiagonal (offdiag < 0)
      Index lower_offset(Index dim, Index offset, Index offdiag) const {
        return -offdiag*offset;
      }

      // Check that a super- or sub-diagonal is in range; an exception
      // is thrown only by band matrices, so these do nothing
      void check_upper_diag(Index offdiag) const { }
      void check_lower_diag(Index offdiag) const { }

      // Engine type of the result of the transpose .T() member
      // function
      typedef SquareEngine<COL_MAJOR> transpose_engine;

      // Store any extra information needed to traverse a
      // SpecialMatrix on the right-hand-side of an expression: none
      // for this matrix type
      template <int MyArrayNum, int NArrays>
      void set_extras(Index i, Index offset,
                      ExpressionSize<NArrays>& index) const { }

      // Value at the memory location held in "loc"
      template <int MyArrayNum, int NArrays, typename Type>
      Type value_at_location(const Type* data, 
                             const ExpressionSize<NArrays>& loc) const {
        const Index here = loc[MyArrayNum];
        return data[here];
      }

      // Push an element of an active SpecialMatrix on to the stack
      template <int MyArrayNum, int NArrays, typename Type>
      void push_rhs(Stack& stack, Type multiplier, Index gradient_index,
                    const ExpressionSize<NArrays>& loc) const {
        const Index here = loc[MyArrayNum];
        stack.push_rhs(multiplier, gradient_index + here);
      }
    };

    // The engine for the SquareMatrix type using column-major
    // storage; note that this inherits from the row-major version in
    // order that functions that don't need to be changed can be
    // imported using "using".
    template <>
    struct SquareEngine<COL_MAJOR> : public SquareEngine<ROW_MAJOR> {
      // Functions taken unchanged from the row-major engine
      using SquareEngine<ROW_MAJOR>::data_size;
      using SquareEngine<ROW_MAJOR>::check_upper_diag;
      using SquareEngine<ROW_MAJOR>::check_lower_diag;
      using SquareEngine<ROW_MAJOR>::set_extras;
      using SquareEngine<ROW_MAJOR>::value_at_location;
      using SquareEngine<ROW_MAJOR>::push_rhs;

      // The transpose of a column-major matrix is treated as
      // row-major
      typedef SquareEngine<ROW_MAJOR> transpose_engine;

      static const int my_n_arrays = 1;

      const char* name() const {
        return "SquareMatrix";
      }

      std::string long_name() const {
        return "SquareMatrix<COL_MAJOR>";
      }

      Index pack_offset(Index dim) const {
        return dim;
      }

      // Memory index of the element at row i, column j: here it is
      // the column number that is multiplied by the offset
      Index index(Index i, Index j, Index offset) const {
        const Index column_start = j*offset;
        return i + column_start;
      }

      // Moving along a row steps from one column to the next
      template <int MyArrayNum, int NArrays>
      Index row_offset(Index offset, const ExpressionSize<NArrays>& loc) const {
        return offset; 
      }

      // All "dim" columns of row i are stored, starting at memory
      // location i and separated by "offset"
      void get_row_range(Index i, Index dim, Index offset,
                         Index& j_start, Index& j_end_plus_1,
                         Index& index_start, Index& index_stride) const {
        j_start      = 0;
        j_end_plus_1 = dim;
        index_start  = i;
        index_stride = offset;
      }

      // The element accessors are written out again, rather than
      // imported from the row-major engine, so that their calls to
      // index() use the column-major version defined above

      template <bool IsActive, typename Type>
      typename internal::enable_if<!IsActive,Type>::type
      get_scalar(Index i, Index j, Index dim, Index offset, 
                 Index gradient_index, const Type* data) const {
        const Index ind = index(i,j,offset);
        return data[ind];
      }

      template <bool IsActive, typename Type>
      typename internal::enable_if<IsActive,Active<Type> >::type
      get_scalar(Index i, Index j, Index dim, Index offset, 
                 Index gradient_index, const Type* data) const {
        const Index ind = index(i,j,offset);
        return Active<Type>(data[ind]);
      }

      template <bool IsActive, typename Type>
      typename internal::enable_if<!IsActive,Type&>::type
      get_reference(Index i, Index j, Index dim, Index offset, 
                    Index gradient_index, Type* data) {
        const Index ind = index(i,j,offset);
        return data[ind];
      }

      template <bool IsActive, typename Type>
      typename internal::enable_if<IsActive,ActiveReference<Type> >::type
      get_reference(Index i, Index j, Index dim, Index offset, 
                    Index gradient_index, Type* data) {
        const Index mem = index(i,j,offset);
        return ActiveReference<Type>(data[mem], gradient_index+mem);
      }

      // Memory offset of the start of a superdiagonal (offdiag > 0)
      Index upper_offset(Index dim, Index offset, Index offdiag) const {
        return offdiag*offset;
      }

      // Memory offset of the start of a subdiagonal (offdiag < 0)
      Index lower_offset(Index dim, Index offset, Index offdiag) const {
        return -offdiag;
      }
    };

    // -------------------------------------------------------------------
    // Band matrix storage engine
    // -------------------------------------------------------------------

    // A band matrix uses the BLAS packed storage to store LDiags
    // subdiagonals and UDiags superdiagonals; the default version
    // uses row-major storage
    // Supplies the short name of a band matrix from its number of
    // subdiagonals (LDiags) and superdiagonals (UDiags).  The general
    // case is simply a "BandMatrix"...
    template <Index LDiags, Index UDiags>
    struct BandEngineHelper {
      const char* name() const {
        return "BandMatrix";
      }
    };

    // ...but there are dedicated names when only the main diagonal is
    // stored...
    template <>
    struct BandEngineHelper<0,0> {
      const char* name() const {
        return "DiagMatrix";
      }
    };

    // ...when one diagonal either side is stored...
    template <>
    struct BandEngineHelper<1,1> {
      const char* name() const {
        return "TridiagMatrix";
      }
    };

    // ...and when two diagonals either side are stored
    template <>
    struct BandEngineHelper<2,2> {
      const char* name() const {
        return "PentadiagMatrix";
      }
    };

    template <MatrixStorageOrder Order, Index LDiags, Index UDiags>
    struct BandEngine {
      // Three location variables are used when traversing a band
      // matrix: the current memory location plus the two limits
      // written by set_extras
      static const int my_n_arrays = 3;
      // Number of diagonals stored: the main diagonal plus LDiags
      // subdiagonals and UDiags superdiagonals
      static const Index diagonals = 1+LDiags+UDiags;

      const char* name() const {
        BandEngineHelper<LDiags,UDiags> helper;
        return helper.name();
      }

      std::string long_name() const { 
        std::stringstream description;
        description << "BandMatrix<ROW_MAJOR,LDiags=" << LDiags;
        description << ",UDiags=" << UDiags << ">";
        return description.str();
      }

      // Offset to use for packed data; it does not depend on the
      // dimension of the matrix
      Index pack_offset(Index dim) const {
        return diagonals-1;
      }

      // Memory index of the element at row i, column j
      Index index(Index i, Index j, Index offset) const {
        //      return LDiags + i*offset + j;
        const Index row_start = i*offset;
        return row_start + j;
      }

      template <int MyArrayNum, int NArrays>
      Index row_offset(Index offset, const ExpressionSize<NArrays>& loc) const {
        return 1; 
      }

      // For row i, report the range of columns that lie within the
      // band (clipped to the range 0 to dim), the memory location of
      // the first of them and the separation in memory between them
      void get_row_range(Index i, Index dim, Index offset,
                         Index& j_start, Index& j_end_plus_1,
                         Index& index_start, Index& index_stride) const {
        if (i < LDiags) {
          j_start = 0;
        }
        else {
          j_start = i-LDiags;
        }
        const Index past_band = i+UDiags+1;
        j_end_plus_1 = past_band > dim ? dim : past_band;
        index_start = i*offset + j_start;
        index_stride = 1;
      }

      typedef BandEngine<COL_MAJOR,UDiags,LDiags> transpose_engine;

      // Value at row i, column j as an rvalue: inactive version.
      // Elements outside the band are zero.
      template <bool IsActive, typename Type>
      typename internal::enable_if<!IsActive,Type>::type
      get_scalar(Index i, Index j, Index dim, Index offset, 
                 Index gradient_index, const Type* data) const {
        const Index off = j-i;
        const bool in_band = !(off > UDiags || off < (-LDiags));
        Type val;
        if (in_band) {
          val = data[index(i,j,offset)]; 
        }
        else {
          val = 0;
        }
        return val;
      }

      // Value at row i, column j as an rvalue: active version
      template <bool IsActive, typename Type>
      typename internal::enable_if<IsActive,Active<Type> >::type
      get_scalar(Index i, Index j, Index dim, Index offset, 
                 Index gradient_index, const Type* data) const {
        const Index off = j-i;
        const bool in_band = !(off > UDiags || off < (-LDiags));
        if (!in_band) {
          return Active<Type>(0.0);
        }
        return Active<Type>(data[index(i,j,offset)]);
      }

      // Element at row i, column j as an lvalue: inactive version.
      // Elements outside the band are not stored, so a request for
      // one throws index_out_of_bounds.
      template <bool IsActive, typename Type>
      typename internal::enable_if<!IsActive,Type&>::type
      get_reference(Index i, Index j, Index dim, Index offset, 
                    Index gradient_index, Type* data) {
        const Index off = j-i;
        const bool in_band = !(off > UDiags || off < (-LDiags));
        if (!in_band) {
          throw index_out_of_bounds("Attempt to get lvalue to off-diagonal in BandMatrix"
                                    ADEPT_EXCEPTION_LOCATION);
        }
        return data[index(i,j,offset)]; 
      }

      // Element at row i, column j as an lvalue: active version
      template <bool IsActive, typename Type>
      typename internal::enable_if<IsActive,ActiveReference<Type> >::type
      get_reference(Index i, Index j, Index dim, Index offset, 
                    Index gradient_index, Type* data) {
        const Index off = j-i;
        const bool in_band = !(off > UDiags || off < (-LDiags));
        if (!in_band) {
          throw index_out_of_bounds("Attempt to get lvalue to off-diagonal in BandMatrix"
                                    ADEPT_EXCEPTION_LOCATION);
        }
        const Index mem = index(i,j,offset);
        return ActiveReference<Type>(data[mem], gradient_index+mem);
      }

      Index data_size(Index dim, Index offset) const {
        const Index diagonal_step = offset+1;
        return (dim-1)*diagonal_step + 1;// + dim; // - UDiags;
      }

      // Memory offset of the start of a superdiagonal (offdiag > 0)
      Index upper_offset(Index dim, Index offset, Index offdiag) const {
        return offdiag;
      }

      // Memory offset of the start of a subdiagonal (offdiag < 0)
      Index lower_offset(Index dim, Index offset, Index offdiag) const {
        return -offdiag*offset;
      }

      // Throw index_out_of_bounds if the requested superdiagonal is
      // not one of the UDiags that are stored
      void check_upper_diag(Index offdiag) const {
        if (offdiag <= UDiags) {
          return;
        }
        throw index_out_of_bounds("Attempt to get lvalue diagonal to off-diagonal in BandMatrix"
                                  ADEPT_EXCEPTION_LOCATION);
      }

      // Throw index_out_of_bounds if the requested subdiagonal is not
      // one of the LDiags that are stored
      void check_lower_diag(Index offdiag) const { 
        if (-offdiag <= LDiags) {
          return;
        }
        throw index_out_of_bounds("Attempt to get lvalue diagonal to off-diagonal in BandMatrix"
                                  ADEPT_EXCEPTION_LOCATION);
      }

      // For row i, store in the two extra location variables the
      // limits of the memory locations that lie within the band: the
      // first such location and one past the last
      template <int MyArrayNum, int NArrays>
      void set_extras(Index i, Index offset,
                      ExpressionSize<NArrays>& index) const {
        const Index first = i*(offset+1) - LDiags;
        index[MyArrayNum+1] = first;
        index[MyArrayNum+2] = first + diagonals;
      }

      // Value at the current location, or zero if the location is
      // outside the limits written by set_extras
      template <int MyArrayNum, int NArrays, typename Type>
      Type value_at_location(const Type* data, 
                             const ExpressionSize<NArrays>& loc) const {
        const bool stored = (loc[MyArrayNum] >= loc[MyArrayNum+1]
                             && loc[MyArrayNum] < loc[MyArrayNum+2]);
        if (!stored) {
          return 0;
        }
        return data[loc[MyArrayNum]];
      }

      // Push the element at the current location on to the stack;
      // locations outside the limits written by set_extras push
      // nothing
      template <int MyArrayNum, int NArrays, typename Type>
      void push_rhs(Stack& stack, Type multiplier, Index gradient_index,
                    const ExpressionSize<NArrays>& loc) const {
        const bool stored = (loc[MyArrayNum] >= loc[MyArrayNum+1]
                             && loc[MyArrayNum] < loc[MyArrayNum+2]);
        if (!stored) {
          return;
        }
        stack.push_rhs(multiplier, gradient_index + loc[MyArrayNum]);
      }
    };

    // The column-major version inherits from the row-major version in
    // order that some functionality can be imported
    template <Index LDiags, Index UDiags>
    struct BandEngine<COL_MAJOR, LDiags, UDiags>
      : public BandEngine<ROW_MAJOR, LDiags, UDiags> {
      static const int my_n_arrays = 3;
      static const Index diagonals = 1+LDiags+UDiags;
      const char* name() const { return BandEngineHelper<LDiags,UDiags>().name(); }
      std::string long_name() const { 
	std::stringstream s;
	s << "BandMatrix<COL_MAJOR,LDiags=" << LDiags
	  << ",UDiags=" << UDiags << ">";
	return s.str();
      }
      using BandEngine<ROW_MAJOR,LDiags,UDiags>::pack_offset;
      Index index(Index i, Index j, Index offset) const {
	//	return UDiags + i + j*offset;
	return i + j*offset;
      }
      template <int MyArrayNum, int NArrays>
      Index row_offset(Index offset, const ExpressionSize<NArrays>& loc) const {
	return offset;
      }
      void get_row_range(Index i, Index dim, Index offset,
			 Index& j_start, Index& j_end_plus_1,
			 Index& index_start, Index& index_stride) const {
	j_start = i<LDiags ? 0 : i-LDiags;
	j_end_plus_1 = i+UDiags+1>dim ? dim : i+UDiags+1;
	index_start = i + j_start*offset;
	index_stride = offset;
      }
      typedef BandEngine<ROW_MAJOR,UDiags,LDiags> transpose_engine;
      template <bool IsActive, typename Type>
      typename internal::enable_if<!IsActive,Type>::type
      get_scalar(Index i, Index j, Index dim, Index offset, 
		 Index gradient_index, const Type* data) const {
	Index off = j-i;
	Type val;
	if (off > UDiags || off < (-LDiags)) {
	  val = 0;
	}
	else {
	  val = data[index(i,j,offset)]; 
	}
	return val;
      }
      template <bool IsActive, typename Type>
      typename internal::enable_if<IsActive,Active<Type> >::type
      get_scalar(Index i, Index j, Index dim, Index offset, 
		    Index gradient_index, const Type* data) const {
	Index off = j-i;
	if (off > UDiags || off < (-LDiags)) {
	  return Active<Type>(0.0);
	}
	else {
	  return Active<Type>(data[index(i,j,offset)]);
	}
      }
      template <bool IsActive, typename Type>
      typename internal::enable_if<!IsActive,ActiveReference<Type> >::type
      get_reference(Index i, Index j, Index dim, Index offset, 
		    Index gradient_index, Type* data) {
	Index off = j-i;
	if (off > UDiags || off < (-LDiags)) {
	  throw index_out_of_bounds("Attempt to get lvalue to off-diagonal in BandMatrix"
				    ADEPT_EXCEPTION_LOCATION);
	}
	else {
	  return data[index(i,j,offset)]; 
	}
      }
      template <bool IsActive, typename Type>
      typename internal::enable_if<IsActive,ActiveReference<Type> >::type
      get_reference(Index i, Index j, Index dim, Index offset, 
		    Index gradient_index, Type* data) {
	Index off = j-i;
	if (off > UDiags || off < (-LDiags)) {
	  throw index_out_of_bounds("Attempt to get lvalue to off-diagonal in BandMatrix"
				    ADEPT_EXCEPTION_LOCATION);
	}
	else {
	  Index ind = index(i,j,offset);
	  return ActiveReference<Type>(data[ind], gradient_index+ind);
	}
      }
      using BandEngine<ROW_MAJOR,LDiags,UDiags>::data_size;

      // Data location at which the "offdiag"-th upper diagonal
      // starts, relative to the start of the data; "dim" is unused.
      // (An earlier form, LDiags + offdiag*offset, is no longer used.)
      Index upper_offset(Index dim, Index offset, Index offdiag) const {
        const Index start = offdiag*offset;
        return start;
      }
      // Data location at which a lower diagonal starts; "offdiag" is
      // negated, so a negative off-diagonal index yields a positive
      // location.  "dim" and "offset" are unused.  (An earlier form,
      // LDiags - offdiag, is no longer used.)
      Index lower_offset(Index dim, Index offset, Index offdiag) const {
        const Index start = -offdiag;
        return start;
      }
      // Fill the two extra location slots that follow this array's
      // own slot in "index" for row "i".
      // NOTE(review): the two values look like the data locations
      // bounding the stored band in row i - confirm against the code
      // that consumes them.
      template <int MyArrayNum, int NArrays>
      void set_extras(Index i, Index offset,
                      ExpressionSize<NArrays>& index) const {
        const Index first = (i-LDiags)*(offset+1) + LDiags;
        index[MyArrayNum+1] = first;
        index[MyArrayNum+2] = first + (diagonals-1)*offset+1;
      }
      using BandEngine<ROW_MAJOR,LDiags,UDiags>::check_upper_diag;
      using BandEngine<ROW_MAJOR,LDiags,UDiags>::check_lower_diag;
      using BandEngine<ROW_MAJOR,LDiags,UDiags>::value_at_location;
      using BandEngine<ROW_MAJOR,LDiags,UDiags>::push_rhs;
    };

    // -------------------------------------------------------------------
    // Symmetric matrix storage engine
    // -------------------------------------------------------------------

    // A symmetric matrix - the default version (template parameter
    // ROW_LOWER_COL_UPPER) should be considered to use row-major
    // storage with the data held on the lower triangle of the
    // matrix. This is equivalent to column-major upper-triangle
    // storage for most uses, except that when this kind of symmetric
    // matrix is used on the left-hand-side of a statement, it will
    // only read the lower triangle of the right-hand-side of the
    // statement (assuming the upper triangle to be a symmetric copy).
    template <SymmMatrixOrientation Orient>
    struct SymmEngine : public SquareEngine<ROW_MAJOR> {
      // Members reused unchanged from the row-major square engine
      using SquareEngine<ROW_MAJOR>::pack_offset;
      using SquareEngine<ROW_MAJOR>::data_size;
      using SquareEngine<ROW_MAJOR>::check_upper_diag;
      using SquareEngine<ROW_MAJOR>::check_lower_diag;
      using SquareEngine<ROW_MAJOR>::value_at_location;
      using SquareEngine<ROW_MAJOR>::push_rhs;

      static const int my_n_arrays = 2;

      // Transposing a symmetric matrix leaves the engine unchanged
      typedef SymmEngine<ROW_LOWER_COL_UPPER> transpose_engine;

      const char* name() const { return "SymmMatrix"; }
      std::string long_name() const {
        return "SymmMatrix<ROW_LOWER_COL_UPPER>";
      }

      // Data location of element (i,j): elements above the diagonal
      // are mapped onto their mirror image below it
      Index index(Index i, Index j, Index offset) const {
        if (i < j) {
          return i + j*offset;
        }
        return i*offset + j;
      }

      // Step in data location when moving one column along a row: 1
      // while the current location precedes the value held in the
      // extra slot (set by set_extras), "offset" afterwards
      template <int MyArrayNum, int NArrays>
      Index row_offset(Index offset, const ExpressionSize<NArrays>& loc) const {
        if (loc[MyArrayNum] < loc[MyArrayNum+1]) {
          return 1;
        }
        return offset;
      }

      // Range of stored columns in row "i" (columns 0 to i
      // inclusive), the data location of the first of them and the
      // stride between consecutive columns; "dim" is unused
      void get_row_range(Index i, Index dim, Index offset,
                         Index& j_start, Index& j_end_plus_1,
                         Index& index_start, Index& index_stride) const {
        index_stride = 1;
        index_start = i*offset;
        j_end_plus_1 = i+1;
        j_start = 0;
      }

      // Passive element read
      template <bool IsActive, typename Type>
      typename internal::enable_if<!IsActive,Type>::type
      get_scalar(Index i, Index j, Index dim, Index offset,
                 Index gradient_index, const Type* data) const {
        const Index loc = index(i,j,offset);
        return data[loc];
      }

      // Active element read: wraps the stored value in a new Active
      template <bool IsActive, typename Type>
      typename internal::enable_if<IsActive,Active<Type> >::type
      get_scalar(Index i, Index j, Index dim, Index offset,
                 Index gradient_index, const Type* data) const {
        const Index loc = index(i,j,offset);
        return Active<Type>(data[loc]);
      }

      // Passive element write access
      template <bool IsActive, typename Type>
      typename internal::enable_if<!IsActive,Type&>::type
      get_reference(Index i, Index j, Index dim, Index offset,
                    Index gradient_index, Type* data) {
        const Index loc = index(i,j,offset);
        return data[loc];
      }

      // Active element write access: the element's gradient index is
      // gradient_index plus its data location
      template <bool IsActive, typename Type>
      typename internal::enable_if<IsActive,ActiveReference<Type> >::type
      get_reference(Index i, Index j, Index dim, Index offset,
                    Index gradient_index, Type* data) {
        const Index loc = index(i,j,offset);
        return ActiveReference<Type>(data[loc], gradient_index+loc);
      }

      // Store i*(offset+1), which is index(i,i,offset), i.e. the
      // location of the diagonal element of row i, in the extra slot
      template <int MyArrayNum, int NArrays>
      void set_extras(Index i, Index offset,
                      ExpressionSize<NArrays>& index) const {
        const Index diag_loc = i*(offset+1);
        index[MyArrayNum+1] = diag_loc;
      }

      // Start locations of the "offdiag"-th diagonal; an upper
      // diagonal is read from its mirror image in the lower triangle.
      // "dim" is unused.
      Index upper_offset(Index dim, Index offset, Index offdiag) const {
        const Index start = offdiag*offset;
        return start;
      }
      Index lower_offset(Index dim, Index offset, Index offdiag) const {
        const Index start = -offdiag*offset;
        return start;
      }
    };

    // A symmetric matrix whose storage can be considered to be
    // row-major with the data stored on the upper triangle. This is
    // equivalent to column-major lower-triangular storage, except
    // that when this kind of symmetric matrix is on the LHS of a
    // statement, it will only read the upper triangle of the RHS of
    // the statement.
    template <>
    struct SymmEngine<ROW_UPPER_COL_LOWER> : public SquareEngine<ROW_MAJOR> {
      // Members reused unchanged from the row-major square engine
      using SquareEngine<ROW_MAJOR>::data_size;
      using SquareEngine<ROW_MAJOR>::check_upper_diag;
      using SquareEngine<ROW_MAJOR>::check_lower_diag;
      using SquareEngine<ROW_MAJOR>::value_at_location;
      using SquareEngine<ROW_MAJOR>::push_rhs;

      static const int my_n_arrays = 2;

      // Transposing a symmetric matrix leaves the engine unchanged
      typedef SymmEngine<ROW_UPPER_COL_LOWER> transpose_engine;

      const char* name() const { return "SymmMatrix"; }
      std::string long_name() const {
        return "SymmMatrix<ROW_UPPER_COL_LOWER>";
      }

      // Offset used for a packed matrix of dimension "dim"
      Index pack_offset(Index dim) const { return dim; }

      // Data location of element (i,j): elements below the diagonal
      // are mapped onto their mirror image above it
      Index index(Index i, Index j, Index offset) const {
        if (i > j) {
          return i + j*offset;
        }
        return i*offset + j;
      }

      // Step in data location when moving one column along a row:
      // "offset" while the current location precedes the value held
      // in the extra slot (set by set_extras), 1 afterwards
      template <int MyArrayNum, int NArrays>
      Index row_offset(Index offset, const ExpressionSize<NArrays>& loc) const {
        if (loc[MyArrayNum] < loc[MyArrayNum+1]) {
          return offset;
        }
        return 1;
      }

      // Range of stored columns in row "i" (columns i to dim-1
      // inclusive), the data location of the first of them and the
      // stride between consecutive columns
      void get_row_range(Index i, Index dim, Index offset,
                         Index& j_start, Index& j_end_plus_1,
                         Index& index_start, Index& index_stride) const {
        index_stride = 1;
        index_start = i*(1+offset);
        j_end_plus_1 = dim;
        j_start = i;
      }

      // Start locations of the "offdiag"-th diagonal; a lower
      // diagonal is read from its mirror image in the upper triangle.
      // "dim" and "offset" are unused.
      Index upper_offset(Index dim, Index offset, Index offdiag) const {
        const Index start = offdiag;
        return start;
      }
      Index lower_offset(Index dim, Index offset, Index offdiag) const {
        const Index start = -offdiag;
        return start;
      }

      // Passive element read
      template <bool IsActive, typename Type>
      typename internal::enable_if<!IsActive,Type>::type
      get_scalar(Index i, Index j, Index dim, Index offset,
                 Index gradient_index, const Type* data) const {
        const Index loc = index(i,j,offset);
        return data[loc];
      }

      // Active element read: wraps the stored value in a new Active
      template <bool IsActive, typename Type>
      typename internal::enable_if<IsActive,Active<Type> >::type
      get_scalar(Index i, Index j, Index dim, Index offset,
                 Index gradient_index, const Type* data) const {
        const Index loc = index(i,j,offset);
        return Active<Type>(data[loc]);
      }

      // Passive element write access
      template <bool IsActive, typename Type>
      typename internal::enable_if<!IsActive,Type&>::type
      get_reference(Index i, Index j, Index dim, Index offset,
                    Index gradient_index, Type* data) {
        const Index loc = index(i,j,offset);
        return data[loc];
      }

      // Active element write access: the element's gradient index is
      // gradient_index plus its data location
      template <bool IsActive, typename Type>
      typename internal::enable_if<IsActive,ActiveReference<Type> >::type
      get_reference(Index i, Index j, Index dim, Index offset,
                    Index gradient_index, Type* data) {
        const Index loc = index(i,j,offset);
        return ActiveReference<Type>(data[loc], gradient_index+loc);
      }

      // Store i*(offset+1), which is index(i,i,offset), i.e. the
      // location of the diagonal element of row i, in the extra slot
      template <int MyArrayNum, int NArrays>
      void set_extras(Index i, Index offset,
                      ExpressionSize<NArrays>& index) const {
        const Index diag_loc = i*(offset+1);
        index[MyArrayNum+1] = diag_loc;
      }
    };

    /*
    // -------------------------------------------------------------------
    // Symmetric band matrix storage engine
    // -------------------------------------------------------------------
    */

    // -------------------------------------------------------------------
    // Triangular matrix storage engines
    // -------------------------------------------------------------------

    // Forward declaration
    template <MatrixStorageOrder Order> struct UpperEngine;

    // Base class for common functions for row-major and column-major
    // storage
    template <MatrixStorageOrder Order>
    struct LowerBase : public SquareEngine<Order> {
      static const int my_n_arrays = 2;

      // Members reused unchanged from the square engine
      using SquareEngine<Order>::pack_offset;
      using SquareEngine<Order>::data_size;
      using SquareEngine<Order>::index;
      using SquareEngine<Order>::row_offset;
      using SquareEngine<Order>::check_lower_diag;
      using SquareEngine<Order>::upper_offset;
      using SquareEngine<Order>::lower_offset;

      const char* name() const { return "LowerMatrix"; }

      // Store i*(offset+1) in the slot following this array's own
      // location slot; value_at_location and push_rhs compare the
      // current location against it
      template <int MyArrayNum, int NArrays>
      void set_extras(Index i, Index offset,
                      ExpressionSize<NArrays>& index) const {
        const Index diag_loc = i*(offset+1);
        index[MyArrayNum+1] = diag_loc;
      }

      // Only the main diagonal (offdiag == 0) passes this check;
      // strictly upper diagonals are not stored
      void check_upper_diag(Index offdiag) const {
        if (offdiag <= 0) {
          return;
        }
        throw index_out_of_bounds("Attempt to get lvalue to an upper diagonal of a lower-triangular matrix"
                                  ADEPT_EXCEPTION_LOCATION);
      }

      // Passive element read: the upper triangle reads as zero
      template <bool IsActive, typename Type>
      typename internal::enable_if<!IsActive,Type>::type
      get_scalar(Index i, Index j, Index dim, Index offset,
                 Index gradient_index, const Type* data) const {
        if (i < j) {
          return 0;
        }
        return data[index(i,j,offset)];
      }

      // Active element read: the upper triangle reads as an Active
      // built from 0.0
      template <bool IsActive, typename Type>
      typename internal::enable_if<IsActive,Active<Type> >::type
      get_scalar(Index i, Index j, Index dim, Index offset,
                 Index gradient_index, const Type* data) const {
        if (i < j) {
          return Active<Type>(0.0);
        }
        return Active<Type>(data[index(i,j,offset)]);
      }

      // Passive element write access; throws index_out_of_bounds for
      // an element in the upper triangle
      template <bool IsActive, typename Type>
      typename internal::enable_if<!IsActive,Type&>::type
      get_reference(Index i, Index j, Index dim, Index offset,
                    Index gradient_index, Type* data) {
        if (i < j) {
          throw index_out_of_bounds("Attempt to get lvalue to upper part of lower-triangular matrix"
                                    ADEPT_EXCEPTION_LOCATION);
        }
        return data[index(i,j,offset)];
      }

      // Active element write access; throws index_out_of_bounds for
      // an element in the upper triangle
      template <bool IsActive, typename Type>
      typename internal::enable_if<IsActive,ActiveReference<Type> >::type
      get_reference(Index i, Index j, Index dim, Index offset,
                    Index gradient_index, Type* data) {
        if (i < j) {
          throw index_out_of_bounds("Attempt to get lvalue to upper part of lower-triangular matrix"
                                    ADEPT_EXCEPTION_LOCATION);
        }
        const Index loc = index(i,j,offset);
        return ActiveReference<Type>(data[loc], gradient_index+loc);
      }

      // Value at the current location, or zero once the location has
      // passed the value held in the extra slot
      template <int MyArrayNum, int NArrays, typename Type>
      Type value_at_location(const Type* data,
                             const ExpressionSize<NArrays>& loc) const {
        const Index here = loc[MyArrayNum];
        if (here > loc[MyArrayNum+1]) {
          return 0;
        }
        return data[here];
      }

      // Push the gradient contribution of the current location onto
      // the stack; nothing is pushed once the location has passed the
      // value held in the extra slot
      template <int MyArrayNum, int NArrays, typename Type>
      void push_rhs(Stack& stack, Type multiplier, Index gradient_index,
                    const ExpressionSize<NArrays>& loc) const {
        const Index here = loc[MyArrayNum];
        if (here > loc[MyArrayNum+1]) {
          return;
        }
        stack.push_rhs(multiplier, gradient_index + here);
      }
    };

    // Lower-triangular matrix using row-major storage
    template <MatrixStorageOrder Order>
    struct LowerEngine : public LowerBase<ROW_MAJOR> {
      // Transposing gives an upper-triangular, column-major matrix
      typedef UpperEngine<COL_MAJOR> transpose_engine;

      std::string long_name() const {
        return "LowerMatrix<ROW_MAJOR>";
      }

      // Stored columns of row "i" are 0 to i inclusive; they start at
      // data location i*offset and are adjacent in memory.  "dim" is
      // unused.
      void get_row_range(Index i, Index dim, Index offset,
                         Index& j_start, Index& j_end_plus_1,
                         Index& index_start, Index& index_stride) const {
        index_stride = 1;
        index_start = i*offset;
        j_end_plus_1 = i+1;
        j_start = 0;
      }
    };

    // Lower-triangular matrix using column-major storage
    template <>
    struct LowerEngine<COL_MAJOR> : public LowerBase<COL_MAJOR> {
      // Transposing gives an upper-triangular, row-major matrix
      typedef UpperEngine<ROW_MAJOR> transpose_engine;

      std::string long_name() const {
        return "LowerMatrix<COL_MAJOR>";
      }

      // Stored columns of row "i" are 0 to i inclusive; they start at
      // data location i and are separated by "offset".  "dim" is
      // unused.
      void get_row_range(Index i, Index dim, Index offset,
                         Index& j_start, Index& j_end_plus_1,
                         Index& index_start, Index& index_stride) const {
        index_stride = offset;
        index_start = i;
        j_end_plus_1 = i+1;
        j_start = 0;
      }
    };

    // Base class for common functions for row-major and column-major
    // storage
    template <MatrixStorageOrder Order>
    struct UpperBase : public SquareEngine<Order> {
      static const int my_n_arrays = 2;

      // Members reused from the square engine.
      // NOTE(review): check_lower_diag is redeclared further down, so
      // the using-declaration for it is hidden by that definition;
      // LowerBase instead redeclares check_upper_diag.
      using SquareEngine<Order>::pack_offset;
      using SquareEngine<Order>::data_size;
      using SquareEngine<Order>::index;
      using SquareEngine<Order>::row_offset;
      using SquareEngine<Order>::check_lower_diag;
      using SquareEngine<Order>::upper_offset;
      using SquareEngine<Order>::lower_offset;

      const char* name() const { return "UpperMatrix"; }

      // Store i*(offset+1) in the slot following this array's own
      // location slot; value_at_location and push_rhs compare the
      // current location against it
      template <int MyArrayNum, int NArrays>
      void set_extras(Index i, Index offset,
                      ExpressionSize<NArrays>& index) const {
        const Index diag_loc = i*(offset+1);
        index[MyArrayNum+1] = diag_loc;
      }

      // Only the main diagonal (offdiag == 0) passes this check;
      // strictly lower diagonals are not stored
      void check_lower_diag(Index offdiag) const {
        if (offdiag >= 0) {
          return;
        }
        throw index_out_of_bounds("Attempt to get lvalue to a lower diagonal of an upper-triangular matrix"
                                  ADEPT_EXCEPTION_LOCATION);
      }

      // Passive element read: the lower triangle reads as zero
      template <bool IsActive, typename Type>
      typename internal::enable_if<!IsActive,Type>::type
      get_scalar(Index i, Index j, Index dim, Index offset,
                 Index gradient_index, const Type* data) const {
        if (i > j) {
          return 0;
        }
        return data[index(i,j,offset)];
      }

      // Active element read: the lower triangle reads as an Active
      // built from 0.0
      template <bool IsActive, typename Type>
      typename internal::enable_if<IsActive,Active<Type> >::type
      get_scalar(Index i, Index j, Index dim, Index offset,
                 Index gradient_index, const Type* data) const {
        if (i > j) {
          return Active<Type>(0.0);
        }
        return Active<Type>(data[index(i,j,offset)]);
      }

      // Passive element write access; throws index_out_of_bounds for
      // an element in the lower triangle
      template <bool IsActive, typename Type>
      typename internal::enable_if<!IsActive,Type&>::type
      get_reference(Index i, Index j, Index dim, Index offset,
                    Index gradient_index, Type* data) {
        if (i > j) {
          throw index_out_of_bounds("Attempt to get lvalue to lower part of upper-triangular matrix"
                                    ADEPT_EXCEPTION_LOCATION);
        }
        return data[index(i,j,offset)];
      }

      // Active element write access; throws index_out_of_bounds for
      // an element in the lower triangle
      template <bool IsActive, typename Type>
      typename internal::enable_if<IsActive,ActiveReference<Type> >::type
      get_reference(Index i, Index j, Index dim, Index offset,
                    Index gradient_index, Type* data) {
        if (i > j) {
          throw index_out_of_bounds("Attempt to get lvalue to lower part of upper-triangular matrix"
                                    ADEPT_EXCEPTION_LOCATION);
        }
        const Index loc = index(i,j,offset);
        return ActiveReference<Type>(data[loc], gradient_index+loc);
      }

      // Value at the current location, or zero while the location
      // still precedes the value held in the extra slot
      template <int MyArrayNum, int NArrays, typename Type>
      Type value_at_location(const Type* data,
                             const ExpressionSize<NArrays>& loc) const {
        const Index here = loc[MyArrayNum];
        if (here < loc[MyArrayNum+1]) {
          return 0;
        }
        return data[here];
      }

      // Push the gradient contribution of the current location onto
      // the stack; nothing is pushed while the location still
      // precedes the value held in the extra slot
      template <int MyArrayNum, int NArrays, typename Type>
      void push_rhs(Stack& stack, Type multiplier, Index gradient_index,
                    const ExpressionSize<NArrays>& loc) const {
        const Index here = loc[MyArrayNum];
        if (here < loc[MyArrayNum+1]) {
          return;
        }
        stack.push_rhs(multiplier, gradient_index + here);
      }
    };

    // Upper-triangular matrix using row-major storage
    template <MatrixStorageOrder Order>
    struct UpperEngine : public UpperBase<ROW_MAJOR> {
      // Transposing gives a lower-triangular, column-major matrix
      typedef LowerEngine<COL_MAJOR> transpose_engine;

      std::string long_name() const {
        return "UpperMatrix<ROW_MAJOR>";
      }

      // Stored columns of row "i" are i to dim-1 inclusive; they
      // start at data location i*(offset+1) and are adjacent in
      // memory
      void get_row_range(Index i, Index dim, Index offset,
                         Index& j_start, Index& j_end_plus_1,
                         Index& index_start, Index& index_stride) const {
        index_stride = 1;
        index_start = i*(offset+1);
        j_end_plus_1 = dim;
        j_start = i;
      }
    };

    // Upper-triangular matrix using column-major storage
    template <>
    struct UpperEngine<COL_MAJOR> : public UpperBase<COL_MAJOR> {
      // Transposing gives a lower-triangular, row-major matrix
      typedef LowerEngine<ROW_MAJOR> transpose_engine;

      std::string long_name() const {
        return "UpperMatrix<COL_MAJOR>";
      }

      // Stored columns of row "i" are i to dim-1 inclusive; they
      // start at data location i*(offset+1) and are separated by
      // "offset"
      void get_row_range(Index i, Index dim, Index offset,
                         Index& j_start, Index& j_end_plus_1,
                         Index& index_start, Index& index_stride) const {
        index_stride = offset;
        index_start = i*(offset+1);
        j_end_plus_1 = dim;
        j_start = i;
      }
    };

  } // End namespace internal

  // -------------------------------------------------------------------
  // Definition of SpecialMatrix class
  // -------------------------------------------------------------------
  template <typename Type = Real, class Engine = internal::SquareEngine<ROW_MAJOR>,
    bool IsActive = false>
  class SpecialMatrix 
    : public Expression<Type,SpecialMatrix<Type,Engine,IsActive> >,
      protected Engine,
      protected internal::GradientIndex<IsActive> {
  public:
    // -------------------------------------------------------------------
    // SpecialMatrix: 1. Static Definitions
    // -------------------------------------------------------------------

    // Static definitions to enable the properties of this type of
    // expression to be discerned at compile time
    // Activeness is taken directly from the IsActive template argument
    static const bool is_active  = IsActive;
    // A SpecialMatrix may appear on the left-hand side of a statement
    static const bool is_lvalue  = true;
    // Always a two-dimensional object
    static const int  rank       = 2;
    // Zero when passive; otherwise 1 plus internal::is_complex<Type>::value
    static const int  n_active   = IsActive * (1 + internal::is_complex<Type>::value);
    static const int  n_scratch  = 0;
    // Number of location slots needed, as declared by the storage engine
    static const int  n_arrays   = Engine::my_n_arrays;
    static const bool is_vectorizable = false;

    // -------------------------------------------------------------------
    // SpecialMatrix: 2. Constructors
    // -------------------------------------------------------------------
    
    // Initialize an empty array
    // Construct an empty matrix: no data, no storage object and zero
    // dimension.  offset_ is also zeroed so that copying an empty
    // matrix (the copy constructors read rhs.offset()) never reads an
    // uninitialized value.  Active matrices of integer type are
    // rejected at compile time.
    SpecialMatrix() : data_(0), storage_(0), dimension_(0), offset_(0)
    { ADEPT_STATIC_ASSERT(!(std::numeric_limits<Type>::is_integer
			    && IsActive), CANNOT_CREATE_ACTIVE_ARRAY_OF_INTEGERS); }

    // Initialize an array with specified size
    // Construct with the size given as a pair of dimensions; the
    // sizing itself is delegated to resize()
    SpecialMatrix(const ExpressionSize<2>& dims)
      : storage_(0) {
      resize(dims[0], dims[1]);
    }
    // Construct with a single dimension, delegating to resize()
    SpecialMatrix(Index m0)
      : storage_(0) {
      resize(m0);
    }
    // Construct with both dimensions given, delegating to resize()
    SpecialMatrix(Index m0, Index m1)
      : storage_(0) {
      resize(m0,m1);
    }

    // A way to directly create arrays, needed when subsetting
    // other arrays
    // Construct directly from a data pointer, an optional Storage
    // object, a dimension and an offset.  With a Storage object a
    // link to it is added and the gradient index is derived from the
    // data pointer and the storage.  Without one, the object cannot
    // obtain a valid gradient index, so assert_inactive() is called.
    SpecialMatrix(Type* data, Storage<Type>* s, Index dim, Index offset)
      : data_(data), storage_(s), dimension_(dim), offset_(offset) {
      if (!storage_) {
        internal::GradientIndex<IsActive>::assert_inactive();
        return;
      }
      storage_->add_link();
      internal::GradientIndex<IsActive>::set(data_, storage_);
    }
    // Similar to the above, but with the gradient index supplied explicitly,
    // needed when an active FixedArray is being sliced
    // Construct from a base data pointer plus a data offset, with the
    // gradient index supplied explicitly by the caller.  No Storage
    // object is attached; const-ness of the data pointer is cast
    // away.
    SpecialMatrix(const Type* data0, Index data_offset, Index dim, Index offset,
                  Index gradient_index0)
      : internal::GradientIndex<IsActive>(gradient_index0, data_offset),
        data_(const_cast<Type*>(data0)+data_offset),
        storage_(0),
        dimension_(dim),
        offset_(offset)
    { }


    // Initialize an array pointing at existing data: the fact that
    // storage_ is a null pointer is used to convey the information
    // that it is not necessary to deallocate the data when this array
    // is destructed
    // Wrap existing data of dimension "dim" without taking ownership
    // (storage_ is null); the offset is whatever the engine's
    // pack_offset() reports.  Only available for passive matrices,
    // enforced at compile time.
    SpecialMatrix(Type* data, Index dim)
      : data_(data),
        storage_(0),
        dimension_(dim),
        offset_(Engine::pack_offset(dim))
    {
      ADEPT_STATIC_ASSERT(!IsActive, CANNOT_CONSTRUCT_ACTIVE_SQUARE_ARRAY_WITHOUT_GRADIENT_INDEX);
    }

    // Copy constructor: links to the source data rather than copying
    // it.  This is needed because we want a function returning an
    // SpecialMatrix not to make a deep copy, but rather to perform a
    // (computationally cheaper) shallow copy; when the SpecialMatrix within
    // the function is destructed, it will remove its link to the
    // data, and the responsibility for deallocating the data will
    // then pass to the SpecialMatrix in the calling function.
    // Shallow copy: share rhs's data, storage, dimension, offset and
    // gradient index, adding a link to the Storage object if there
    // is one
    SpecialMatrix(SpecialMatrix& rhs)
      : internal::GradientIndex<IsActive>(rhs.gradient_index()),
        data_(rhs.data()),
        storage_(rhs.storage()),
        dimension_(rhs.dimension()),
        offset_(rhs.offset())
    {
      if (storage_) {
        storage_->add_link();
      }
    }

    // Copy constructor with const argument does exactly the same
    // thing
    // Shallow copy from a const source: the data and storage
    // pointers are taken over by link_(), which requires const-ness
    // to be cast away
    SpecialMatrix(const SpecialMatrix& rhs)
      : internal::GradientIndex<IsActive>(rhs.gradient_index()),
        dimension_(rhs.dimension()),
        offset_(rhs.offset())
    {
      link_(const_cast<SpecialMatrix&>(rhs));
    }
  private:
    // Take over rhs's data and storage pointers and register a link
    // with the Storage object, if any.  Dimension, offset and
    // gradient index are not touched here.
    void link_(SpecialMatrix& rhs) {
      storage_ = const_cast<Storage<Type>*>(rhs.storage());
      data_ = const_cast<Type*>(rhs.data());
      if (storage_) {
        storage_->add_link();
      }
    }

  public:
    // Initialize with an expression on the right hand side by
    // evaluating the expression, requiring the ranks to be equal.
    // Without the "explicit" keyword this constructor would allow
    // expressions to be implicitly converted when passed as arguments
    // to functions that expect an array.
    // Construct from a rank-2 expression: start as an empty matrix
    // and then assign the expression, which sizes the matrix and
    // evaluates the expression into it.  offset_ is zeroed along with
    // the other members so that the object is fully initialized
    // before the assignment runs.
    template<typename EType, class E>
    explicit
    SpecialMatrix(const Expression<EType, E>& rhs,
	  typename internal::enable_if<E::rank == 2,int>::type = 0)
      : data_(0), storage_(0), dimension_(0), offset_(0)
    { *this = rhs; }

    // Destructor: if the data are stored in a Storage object then we
    // tell it that one fewer object is linking to it; if the number
    // of links to it drops to zero, it will destruct itself and
    // deallocate the memory.
    ~SpecialMatrix() {
      // Release this object's link to the shared Storage, if any
      if (storage_) {
        storage_->remove_link();
      }
    }

    // -------------------------------------------------------------------
    // SpecialMatrix: 3. Assignment operators
    // -------------------------------------------------------------------

    // Assignment to another matrix: copy the data...
    // Ideally we would like this to fall back to the operator=(const
    // Expression&) function, but if we don't define a copy assignment
    // operator then C++ will generate a default one :-(
    // Copy assignment: view the right-hand side as an Expression so
    // that the general expression-assignment operator copies the data
    SpecialMatrix& operator=(const SpecialMatrix& rhs) {
      const Expression<Type,SpecialMatrix>& rhs_as_expression = rhs;
      *this = rhs_as_expression;
      return *this;
    }

    // Assignment to an array expression of the same rank
    // Assign a rank-2 expression.  An empty matrix is first resized
    // to the dimensions of the expression; otherwise (unless
    // dimension checking is disabled) the dimensions must be
    // compatible or size_mismatch is thrown.  Unless alias checking
    // is disabled, an expression that reads from this matrix's own
    // data range is first evaluated into a temporary.
    template <typename EType, class E>
    typename internal::enable_if<E::rank == 2, SpecialMatrix&>::type
    operator=(const Expression<EType,E>& rhs) {
#ifndef ADEPT_NO_DIMENSION_CHECKING
      ExpressionSize<2> rhs_dims;
      if (!rhs.get_dimensions(rhs_dims)) {
        std::string msg = "Array size mismatch in "
          + rhs.expression_string() + ".";
        throw size_mismatch(msg ADEPT_EXCEPTION_LOCATION);
      }
      if (empty()) {
        resize(rhs_dims[0], rhs_dims[1]);
      }
      else if (!internal::compatible(rhs_dims, dimensions())) {
        std::string msg = "Expr";
        msg += rhs_dims.str() + " object assigned to " + expression_string_();
        throw size_mismatch(msg ADEPT_EXCEPTION_LOCATION);
      }
#else
      if (empty()) {
        ExpressionSize<2> rhs_dims;
        if (!rhs.get_dimensions(rhs_dims)) {
          std::string msg = "Array size mismatch in "
            + rhs.expression_string() + ".";
          throw size_mismatch(msg ADEPT_EXCEPTION_LOCATION);
        }
        resize(rhs_dims[0], rhs_dims[1]);
      }
#endif
      // Still empty after the resize: there is nothing to assign
      if (empty()) {
        return *this;
      }
#ifndef ADEPT_NO_ALIAS_CHECKING
      Type const * range_begin;
      Type const * range_end;
      data_range(range_begin, range_end);
      if (rhs.is_aliased(range_begin, range_end)) {
        // Wrapping rhs in noalias here would lead to infinite
        // template recursion, because "=" would call this function
        // again with a modified expression type; hence the explicit
        // temporary.
        SpecialMatrix temporary;
        temporary = rhs;
        assign_expression_<IsActive, E::is_active>(temporary);
        return *this;
      }
#endif
      // The active/passive variant is selected by the template
      // arguments of the protected helper
      assign_expression_<IsActive, E::is_active>(rhs);
      return *this;
    }
    
    // Assignment to an array expression of the same rank in which the
    // activeness of the right-hand-side is ignored
    // Assign a rank-2 expression while ignoring the activeness of
    // the right-hand side.  An empty matrix is resized to the
    // dimensions of the expression; otherwise the dimensions must be
    // compatible.  Throws size_mismatch if the expression has
    // inconsistent dimensions or they are incompatible with this
    // matrix.  An expression that reads from this matrix's own data
    // range is first evaluated into a temporary; this is done
    // silently, without writing anything to standard output.
    template <typename EType, class E>
    typename internal::enable_if<E::rank == 2, SpecialMatrix&>::type
    assign_inactive(const Expression<EType,E>& rhs) {
      ExpressionSize<2> dims;
      if (!rhs.get_dimensions(dims)) {
	std::string str = "Array size mismatch in "
	  + rhs.expression_string() + ".";
	throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
      }
      else if (empty()) {
	resize(dims[0], dims[1]);
      }
      else if (!internal::compatible(dims, dimensions())) {
	std::string str = "Expr";
	str += dims.str() + " object assigned to " + expression_string_();
	throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
      }

      if (!empty()) {
	// Check for aliasing first
	Type const * ptr_begin;
	Type const * ptr_end;
	data_range(ptr_begin, ptr_end);
	if (rhs.is_aliased(ptr_begin, ptr_end)) {
	  // Evaluate into a temporary so that the elements being read
	  // are not overwritten part-way through the assignment
	  SpecialMatrix copy;
	  copy.assign_inactive(rhs);
	  assign_expression_<IsActive, false>(copy);
	}
	else {
	  assign_expression_<IsActive, false>(rhs);
	}
      }
      return *this;
    }

    // Assignment to a single value copies to every element
    // Assign a non-expression value: handed to
    // assign_inactive_scalar; an empty matrix is left untouched
    template <typename RType>
    typename internal::enable_if<internal::is_not_expression<RType>::value, SpecialMatrix&>::type
    operator=(RType rhs) {
      if (empty()) {
        return *this;
      }
      assign_inactive_scalar<IsActive>(rhs);
      return *this;
    }

    // Assign active scalar expression to an active array by first
    // converting the RHS to an active scalar
    // Assign a rank-0, non-lvalue expression to an active matrix:
    // the expression is first evaluated into an Active scalar, which
    // is then assigned
    template <typename EType, class E>
    typename internal::enable_if<E::rank == 0 && IsActive && !E::is_lvalue,
      SpecialMatrix&>::type
      operator=(const Expression<EType,E>& rhs) {
      Active<EType> evaluated = rhs;
      *this = evaluated;
      return *this;
    }

  
    // An active array being assigned to an active scalar
    // Assign an active scalar to every stored element of an active
    // matrix.  For each stored element one statement is recorded on
    // the active stack: a right-hand-side entry with multiplier 1.0
    // and the scalar's gradient index, followed by a left-hand-side
    // entry with the element's gradient index.
    template <typename PType>
    typename internal::enable_if<!internal::is_active<PType>::value && IsActive, SpecialMatrix&>::type
    operator=(const Active<PType>& rhs) {
#ifdef ADEPT_RECORDING_PAUSABLE
      // When the stack is not recording, only the values are copied
      if (! ADEPT_ACTIVE_STACK->is_recording()) {
        assign_inactive_scalar<false>(rhs.scalar_value());
        return *this;
      }
#endif
      const Type fill_value = rhs.scalar_value();
      for (Index row = 0 ; row < dimension_; ++row) {
        Index col_begin, col_end, loc, loc_step;
        Engine::get_row_range(row, dimension_, offset_,
                              col_begin, col_end, loc, loc_step);
        for (Index col = col_begin; col < col_end; ++col) {
          data_[loc] = fill_value;
          ADEPT_ACTIVE_STACK->push_rhs(1.0, rhs.gradient_index());
          ADEPT_ACTIVE_STACK->push_lhs(gradient_index()+loc);
          loc += loc_step;
        }
      }
      return *this;
    }


    // All the compound assignment operators are unpacked, i.e. a+=b
    // becomes a=a+b; first for an Expression on the rhs.  We use
    // "noalias" since there is no need for the entirety of the
    // right-hand-side of the expression to be copied before
    // evaluation.
    // a += expr is evaluated as a = noalias(a) + expr
    template<typename EType, class E>
    SpecialMatrix& operator+=(const Expression<EType,E>& rhs) {
      *this = (noalias(*this) + rhs);
      return *this;
    }
    // a -= expr is evaluated as a = noalias(a) - expr
    template<typename EType, class E>
    SpecialMatrix& operator-=(const Expression<EType,E>& rhs) {
      *this = (noalias(*this) - rhs);
      return *this;
    }
    // a *= expr is evaluated as a = noalias(a) * expr
    template<typename EType, class E>
    SpecialMatrix& operator*=(const Expression<EType,E>& rhs) {
      *this = (noalias(*this) * rhs);
      return *this;
    }
    // a /= expr is evaluated as a = noalias(a) / expr
    template<typename EType, class E>
    SpecialMatrix& operator/=(const Expression<EType,E>& rhs) {
      *this = (noalias(*this) / rhs);
      return *this;
    }

    // And likewise for a passive scalar on the rhs
    // a += x for a non-expression x is evaluated as a = noalias(a) + x
    template <typename PType>
    typename internal::enable_if<internal::is_not_expression<PType>::value, SpecialMatrix&>::type
    operator+=(const PType& rhs) {
      *this = (noalias(*this) + rhs);
      return *this;
    }
    // a -= x for a non-expression x is evaluated as a = noalias(a) - x
    template <typename PType>
    typename internal::enable_if<internal::is_not_expression<PType>::value, SpecialMatrix&>::type
    operator-=(const PType& rhs) {
      *this = (noalias(*this) - rhs);
      return *this;
    }
    // a *= x for a non-expression x is evaluated as a = noalias(a) * x
    template <typename PType>
    typename internal::enable_if<internal::is_not_expression<PType>::value, SpecialMatrix&>::type
    operator*=(const PType& rhs) {
      *this = (noalias(*this) * rhs);
      return *this;
    }
    // a /= x for a non-expression x is evaluated as a = noalias(a) / x
    template <typename PType>
    typename internal::enable_if<internal::is_not_expression<PType>::value, SpecialMatrix&>::type
    operator/=(const PType& rhs) {
      *this = (noalias(*this) / rhs);
      return *this;
    }

  
    // -------------------------------------------------------------------
    // SpecialMatrix: 4. Access functions, particularly operator()
    // -------------------------------------------------------------------
  
    // Get l-value of the element at the specified coordinates
    // Translate the (row, column) pair into a data location with the
    // engine's index() and return the l-value found there.  No check
    // is made here that the element is one the engine stores.
    typename internal::active_reference<Type,IsActive>::type
    get_lvalue(const ExpressionSize<2>& i) {
      const Index loc = Engine::index(i[0],i[1],offset_);
      return get_lvalue_<IsActive>(loc);
    }
    
  protected:
    // Active variant: pairs the element at data location "loc" with
    // its gradient index
    template <bool MyIsActive>
    typename internal::enable_if<MyIsActive, ActiveReference<Type> >::type
    get_lvalue_(const Index& loc) {
      const Index element_gradient_index = gradient_index()+loc;
      return ActiveReference<Type>(data_[loc], element_gradient_index);
    }
    // Passive variant: a plain reference to the element at data
    // location "loc"
    template <bool MyIsActive>
    typename internal::enable_if<!MyIsActive, Type&>::type
    get_lvalue_(const Index& loc) {
      Type& element = data_[loc];
      return element;
    }

  public:
    // Access individual elements of the array.  Each argument must be
    // of integer type, or a rank-0 expression of integer type (such
    // as "end" or "end-3"). Inactive arrays return a reference to the
    // element, while active arrays return an ActiveReference<Type>
    // object.
    // Writable element access.  Each index is first resolved against
    // the matrix dimension by get_index_with_len; the engine then
    // supplies the reference (or throws if the element is not one it
    // stores).
    template <typename I0, typename I1>
    typename internal::enable_if<internal::all_scalar_ints<2,I0,I1>::value,
                       typename internal::active_reference<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1) {
      const Index row = internal::get_index_with_len(i0,dimension_);
      const Index col = internal::get_index_with_len(i1,dimension_);
      return Engine::template
        get_reference<IsActive>(row, col, dimension_, offset_,
                                gradient_index(), data_);
    }
    // Read-only element access.  Each index is first resolved against
    // the matrix dimension by get_index_with_len; the engine then
    // supplies the value.
    template <typename I0, typename I1>
    typename internal::enable_if<internal::all_scalar_ints<2,I0,I1>::value,
                                 typename internal::active_scalar<Type,IsActive>::type>::type
    operator()(I0 i0, I1 i1) const {
      const Index row = internal::get_index_with_len(i0,dimension_);
      const Index col = internal::get_index_with_len(i1,dimension_);
      return Engine::template
        get_scalar<IsActive>(row, col, dimension_, offset_,
                             gradient_index(), data_);
    }
    
    /*
    // If one or more of the indices is not guaranteed to be monotonic
    // at compile time then we must return an IndexedSpecialMatrix, now done
    // for all possible numbers of arguments
  
    template <typename I0, typename I1>
    typename internal::enable_if<internal::is_indexed<Rank,I0,I1>::value
                       && !internal::is_ranged<Rank,I0,I1>::value,
		       IndexedSpecialMatrix<internal::is_indexed<Rank,I0,I1>::count,
				    Type,IsActive,SpecialMatrix,I0,I1> >::type
    operator()(const I0& i0, const I1& i1) {
      static const int new_rank = internal::is_indexed<Rank,I0,I1>::count;
      return IndexedSpecialMatrix<new_rank,Type,IsActive,SpecialMatrix,I0,I1>(*this, i0, i1);
    }
    */

    // diag_vector(offdiag), where A is a 2D square band matrix (including
    // DiagMatrix, TridiagMatrix etc), returns a 1D array pointing to
    // the "offdiag"-th diagonal of the original data. It can be used
    // as an lvalue.
    // Returns a 1D array linked to the "offdiag"-th diagonal:
    // offdiag >= 0 selects the main or an upper diagonal, offdiag < 0
    // a lower one.  The engine's check function is called first to
    // validate the requested diagonal.  The returned array has
    // dimension_ - |offdiag| elements spaced offset_+1 apart.
    Array<1,Type,IsActive>
    diag_vector(Index offdiag = 0) {
      const bool is_upper = (offdiag >= 0);
      if (is_upper) {
        Engine::check_upper_diag(offdiag);
      }
      else {
        Engine::check_lower_diag(offdiag);
      }
      ExpressionSize<1> length(is_upper ? dimension_ - offdiag
                                        : dimension_ + offdiag);
      ExpressionSize<1> stride(offset_+1);
      const Index start = is_upper
        ? Engine::upper_offset(dimension_,offset_,offdiag)
        : Engine::lower_offset(dimension_,offset_,offdiag);
      return Array<1,Type,IsActive>(data_+start, storage_, length, stride);
    }

    // Extract a square sub-matrix on the diagonal
    // Returns a matrix linked to the square block whose rows and
    // columns run from istart to iend inclusive.  Throws
    // index_out_of_bounds unless 0 <= istart <= iend < dimension.
    SpecialMatrix
    submatrix_on_diagonal(Index istart, Index iend) {
      const bool in_range = (istart >= 0) && (istart <= iend)
        && (iend < dimension_);
      if (!in_range) {
        throw index_out_of_bounds("Dimensions out of range in submatrix_on_diagonal"
                                  ADEPT_EXCEPTION_LOCATION);
      }
      const Index sub_dimension = iend-istart+1;
      return SpecialMatrix(data_+(offset_+1)*istart,
                           storage_, sub_dimension, offset_);
    }

    // FIX - add an rvalue version returning const Array (?)

    // Transpose as an lvalue
    // Returns a matrix that shares this matrix's data, storage,
    // dimension and offset but interprets them with the engine's
    // transpose_engine
    SpecialMatrix<Type, typename Engine::transpose_engine, IsActive>
    T() {
      typedef SpecialMatrix<Type, typename Engine::transpose_engine,
        IsActive> transposed_type;
      return transposed_type(data_, storage_, dimension_, offset_);
    }

    // Return a SpecialMatrix that is a "soft" link to the data in the
    // present array; that is, it does not copy the Storage object and
    // increase the reference counter therein. This is useful in a
    // multi-threaded environment when multiple threads may wish to
    // subset the same array.
    // Non-const soft link: same data pointer, dimension, offset and
    // gradient index, but no Storage object attached
    SpecialMatrix soft_link() {
      const Index no_data_offset = 0;
      return SpecialMatrix(data_, no_data_offset, dimension_, offset_,
                           gradient_index());
    }
    // Const soft link: same data pointer, dimension, offset and
    // gradient index, but no Storage object attached
    const SpecialMatrix soft_link() const {
      const Index no_data_offset = 0;
      return SpecialMatrix(data_, no_data_offset, dimension_, offset_,
                           gradient_index());
    }
    

    // -------------------------------------------------------------------
    // SpecialMatrix: 5. Public member functions
    // -------------------------------------------------------------------
  
    // Link to an existing array of the same rank, type and activeness
    // Make this matrix share the data of "rhs".  Any existing
    // contents are released with clear(), then the data pointer,
    // Storage object, dimension and offset of rhs are adopted and a
    // link is added to the Storage object.  Throws empty_array if rhs
    // holds no data.  Returns *this.
    SpecialMatrix& link(SpecialMatrix& rhs) {
      if (!rhs.data()) {
	throw empty_array("Attempt to link to empty array"
			  ADEPT_EXCEPTION_LOCATION);
      }
      else {
	clear();
	data_ = rhs.data();
	storage_ = rhs.storage();
	dimension_ = rhs.dimension();
	offset_ = rhs.offset();
	if (storage_) {
	  storage_->add_link();
	  // Re-derive the gradient index from the adopted data and
	  // storage, exactly as the (data, storage, dim, offset)
	  // constructor does; without this an active matrix would keep
	  // the gradient index it had before linking.
	  internal::GradientIndex<IsActive>::set(data_, storage_);
	}
	// NOTE(review): when rhs has no Storage object the gradient
	// index is left unchanged here - confirm whether active
	// matrices are ever linked to storage-less data.
      }
      return *this;
    }
   

#ifndef ADEPT_MOVE_SEMANTICS
    // Linking to a subset of another SpecialMatrix is a common
    // pattern, e.g. vec1.link(vec2(range(2,4))), but there the
    // argument is a temporary, which cannot bind to SpecialMatrix&.
    // In C++98 the workaround is to accept const SpecialMatrix& and
    // cast the const-ness away, with the unfortunate side effect that
    // a non-const SpecialMatrix can be linked to a const
    // SpecialMatrix.
    SpecialMatrix& link(const SpecialMatrix& rhs) {
      SpecialMatrix& mutable_rhs = const_cast<SpecialMatrix&>(rhs);
      return link(mutable_rhs);
    }
#else
    // With C++11 move semantics we can instead bind only to a
    // temporary non-const SpecialMatrix
    SpecialMatrix& link(SpecialMatrix&& rhs) {
      // A named rvalue reference is itself an lvalue, so this selects
      // the SpecialMatrix& overload
      SpecialMatrix& target = rhs;
      return link(target);
    }
#endif

    // Fortran-like link syntax: "A >>= B" is equivalent to A.link(B)
    SpecialMatrix& operator>>=(SpecialMatrix& rhs) {
      return link(rhs);
    }
#ifndef ADEPT_MOVE_SEMANTICS
    // C++98: accept a const reference (so temporaries can bind) and
    // cast away the const-ness before linking
    SpecialMatrix& operator>>=(const SpecialMatrix& rhs) {
      SpecialMatrix& mutable_rhs = const_cast<SpecialMatrix&>(rhs);
      return link(mutable_rhs);
    }
#else
    // C++11: accept a temporary; the named rvalue reference is an
    // lvalue so it is passed to the SpecialMatrix& overload of link
    SpecialMatrix& operator>>=(SpecialMatrix&& rhs) {
      SpecialMatrix& target = rhs;
      return link(target);
    }
#endif

    // STL-like size(): the number of elements of the full square
    // matrix, i.e. the dimension squared
    Index size() const {
      const Index n = dimension_;
      return n*n;
    }

    // Return both dimensions; they are equal because the matrix is
    // square
    ExpressionSize<2> dimensions() const {
      ExpressionSize<2> dims(dimension_,dimension_);
      return dims;
    }

    // Write the dimensions of the matrix into "dim"; always returns
    // true
    bool get_dimensions_(ExpressionSize<2>& dim) const {
      dim[1] = dimension_;
      dim[0] = dim[1];
      return true;
    }

    // Return individual dimension. The argument j is not used: the
    // same value, dimension_, is returned whichever dimension is
    // requested.
    Index dimension(int j = 0) const {
      return dimension_;
    }

    
    // Return the stored offset_ (declared below as the memory offset
    // for the slowest-varying dimension)
    Index offset() const {
      return offset_;
    }
    

  /*
    // Get dimensions for matrix operations, treating 1D arrays as
    // column vectors
    void get_matrix_dimensions(ExpressionSize<2>& dim) const {
      dim[0] = dim[1] = dimension_;
    }
  */

    /*
    // Return constant reference to offsets
    const ExpressionSize<Rank>& offset() const {
      return offset_;
    }
    const Index& last_offset() const { return offset_[Rank-1]; }
    */

    // Return true if the array is empty, i.e. its dimension is zero
    bool empty() const {
      return dimension_ == 0;
    }

    // Return a one-line description of the array: the engine's long
    // name followed by the dimension, the offset and the address of
    // the data
    std::string info_string() const {
      std::stringstream description;
      description << Engine::long_name();
      description << ", dim=" << dimension_;
      description << ", offset=" << offset_;
      description << ", data_location=" << data_;
      return description.str();
    }

    // Return a pointer to the start of the data: a mutable pointer
    // from a non-const object, a pointer-to-const from a const object,
    // and const_data() for a pointer-to-const regardless
    Type* data() { return data_; }
    const Type* data() const { return data_; }
    const Type* const_data() const { return data_; }

    // Older style names for the accessors data() and const_data()
    Type* data_pointer() {
      return data();
    }
    const Type* data_pointer() const {
      return data();
    }
    const Type* const_data_pointer() const {
      return const_data();
    }

    // Return a pointer to the storage object; this may be null, for
    // example after clear() or for an object built by soft_link()
    Storage<Type>* storage() { return storage_; }

    // Return the array to its original empty state: the link to the
    // data is removed (which may deallocate the data if this was the
    // only link), the data pointer, dimension and offset are zeroed
    // and the gradient index is cleared
    void clear() {
      if (storage_) {
        storage_->remove_link();
      }
      storage_   = 0;
      data_      = 0;
      dimension_ = 0;
      offset_    = 0;
      internal::GradientIndex<IsActive>::clear();
    }

    // Resize the matrix to dim-by-dim.  The existing link to data is
    // released and, for dim > 0, a new Storage object is allocated
    // with the packed offset for this engine; existing values are not
    // carried over.  dim == 0 leaves the matrix empty.  Throws
    // invalid_dimension if dim is negative, in which case the matrix
    // is left unmodified.
    void resize(Index dim) {

      ADEPT_STATIC_ASSERT(!(std::numeric_limits<Type>::is_integer
            && IsActive), CANNOT_CREATE_ACTIVE_ARRAY_OF_INTEGERS);

      // Check the requested dimension *before* releasing the storage:
      // otherwise an exception would leave data_ and dimension_
      // referring to storage we no longer hold a link to
      if (dim < 0) {
        throw invalid_dimension("Negative array dimension requested"
                                ADEPT_EXCEPTION_LOCATION);
      }

      if (storage_) {
        storage_->remove_link();
        storage_ = 0;
      }

      if (dim == 0) {
        clear();
      }
      else {
        dimension_ = dim;
        offset_ = Engine::pack_offset(dim);
        storage_ = new Storage<Type>(Engine::data_size(dimension_,offset_), IsActive);
        data_ = storage_->data();
        internal::GradientIndex<IsActive>::set(data_, storage_);
      }
    }

    // Resize specifying both dimensions explicitly.  Since the matrix
    // is square the two must agree, otherwise invalid_dimension is
    // thrown; the work is then done by resize(Index).
    void resize(Index dim0, Index dim1) {
      const bool is_square = (dim0 == dim1);
      if (!is_square) {
        throw invalid_dimension("Square matrix must have the same x and y dimensions"
                                ADEPT_EXCEPTION_LOCATION);
      }
      resize(dim0);
    }

    bool is_aliased_(const Type* mem1, const Type* mem2) const {
      Type const * ptr_begin;
      Type const * ptr_end;
      data_range(ptr_begin, ptr_end);
      if (ptr_begin <= mem2 && ptr_end >= mem1) {
	return true;
      }
      else {
	return false;
      }
    }
  
    // Cannot traverse a full row just by incrementing an index by 1,
    // so always report false
    bool all_arrays_contiguous_() const { return false; }

    // Not supported for this class: the body consists of a static
    // assertion on "false" whose message name states that
    // value_with_len_ cannot be used on arrays of rank other than 1.
    // The return statement is only there to give the function a value.
    Type value_with_len_(const Index& j, const Index& len) const {
      ADEPT_STATIC_ASSERT(false, CANNOT_USE_VALUE_WITH_LEN_ON_ARRAY_OF_RANK_OTHER_THAN_1);
      return 0;
    }

    // Return a short string for this matrix when it appears in an
    // expression: the engine name followed by "[dim,dim]"
    std::string expression_string_() const {
      std::stringstream label;
      label << Engine::name();
      label << "[" << dimension_ << ",";
      label << dimension_ << "]";
      return label.str();
    }

    // Set every stored element to the inactive scalar x.  This is the
    // same as operator=(inactive scalar) except that nothing is put
    // on the stack.  An empty matrix is left untouched.
    template <typename RType>
    typename internal::enable_if<internal::is_not_expression<RType>::value, SpecialMatrix&>::type
    set_value(RType x) {
      if (empty()) {
        return *this;
      }
      assign_inactive_scalar<false>(x);
      return *this;
    }
  
    // Is the array contiguous in memory?  True when the offset equals
    // the packed offset that the engine reports for this dimension.
    bool is_contiguous() const {
      return offset_ == Engine::pack_offset(dimension_);
    }
  
    // Return the gradient index for the first element in the array,
    // or -1 if not active.  The value is read from the GradientIndex
    // base class.
    Index gradient_index() const {
      return internal::GradientIndex<IsActive>::get();
    }

    /*
    std::ostream& print(std::ostream& os) const {
      if (empty()) {
	os << "(empty " << Engine::name() << ")";
      }
      else if (adept::internal::array_print_curly_brackets) {
	os << "\n";
	for (int i = 0; i < dimension_; ++i) {
	  if (i == 0) {
	    os << "{{";
	  }
	  else {
	    os << " {";
	  }
	  for (int j = 0; j < dimension_; ++j) {
	    os << (*this)(i,j);
	    if (j < dimension_-1) { os << ", "; }
	  }
	  os << "}";
	  if (i < dimension_-1) { 
	    os << ",\n"; 
	  }
	  else {
	    //	    os << "}\n"; 
	    os << "}"; 
	  }
	}
      }
      else {
	for (int i = 0; i < dimension_; ++i) {
	  for (int j = 0; j < dimension_; ++j) {
	    os << (*this)(i,j);
	    if (j < dimension_-1) { os << " "; }
	  }
	  os << "\n"; 
	}
      }
      return os;
    }
    */

    std::ostream& print(std::ostream& os) const {
      const Array<rank,Type,IsActive> x(*this);
      x.print(os);
      return os;
    }    

    std::ostream& print_raw(std::ostream& os) const {
      if (empty()) {
	os << "(empty " << Engine::name() << ")\n";
      }
      else {
	for (Index i = 0; i < Engine::data_size(dimension_,offset_); ++i) {
	  os << " " << data_[i];
	}
	os << "\n";
      }
      return os;
    }

    // Get pointers to the first and last data members in memory; the
    // last is the first plus the engine's data size minus one
    void data_range(Type const * &data_begin, Type const * &data_end) const {
      data_begin = data_;
      data_end   = data_begin + Engine::data_size(dimension_, offset_) - 1;
    }

    // The Stack::independent(x) and Stack::dependent(y) functions add
    // the gradient_index of objects x and y to std::vector<uIndex>
    // objects in Stack. Since x and y may be scalars or arrays, this
    // is best done by delegating to the Active or Array classes.
    template <typename IndexType>
    void push_gradient_indices(std::vector<IndexType>& vec) {
      ADEPT_STATIC_ASSERT(IsActive,
	  CANNOT_PUSH_GRADIENT_INDICES_FOR_INACTIVE_SPECIAL_MATRIX); 
      Index j_start, j_end_plus_1, index, index_stride;
      Index gradient_ind = gradient_index();
      vec.reserve(vec.size() + Engine::data_size(dimension_, offset_));
      for (Index i; i < dimension_; ++i) {
	Engine::get_row_range(i, dimension_, offset_, 
			      j_start, j_end_plus_1, index, index_stride);
	for (Index j = j_start; j < j_end_plus_1; ++j, index += index_stride) {
	  vec.push_back(gradient_ind + index);
	}
      }
    }

    // Return an inactive matrix that refers to the same data, storage
    // object, dimension and offset as this one; a link is added to
    // the storage object if there is one
    SpecialMatrix<Type, Engine, false> inactive_link() {
      SpecialMatrix<Type, Engine, false> passive_view;
      passive_view.data_      = data_;
      passive_view.storage_   = storage_;
      passive_view.dimension_ = dimension_;
      passive_view.offset_    = offset_;
      if (storage_) {
        storage_->add_link();
      }
      return passive_view;
    }


    // -------------------------------------------------------------------
    // SpecialMatrix: 6. Member functions accessed by the Expression class
    // -------------------------------------------------------------------

    // Given the (row,column) coordinate "i", store in entry MyArrayNum
    // of "index" the memory location that the engine computes for that
    // element, then let the engine record any extra per-row
    // information it needs via set_extras
    template <int MyArrayNum, int NArrays>
    void set_location_(const ExpressionSize<2>& i, 
		       ExpressionSize<NArrays>& index) const {
      index[MyArrayNum] = Engine::index(i[0],i[1],offset_);
      Engine::template set_extras<MyArrayNum>(i[0],offset_,index);
    }
    
    // Return the value at the location held in "loc", as obtained
    // from the engine's value_at_location function
    template <int MyArrayNum, int NArrays>
    Type value_at_location_(const ExpressionSize<NArrays>& loc) const {
      return Engine::template value_at_location<MyArrayNum>(data_, loc);
    }

    // Return a modifiable reference to the element at position "loc"
    // in the data array; no bounds checking is performed here
    Type& lvalue_at_location(const Index& loc) {
      return data_[loc];
    }

    // Return the value at the location held in "loc" exactly as
    // value_at_location_ does; the "scratch" argument is not used by
    // this class
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    Type value_at_location_store_(const ExpressionSize<NArrays>& loc,
				  internal::ScratchVector<NScratch>& scratch) const {
      return Engine::template value_at_location<MyArrayNum>(data_, loc);

    }

    // Return the value at the location held in "loc"; nothing is read
    // from "scratch", the value is simply obtained from the engine
    // again
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    Type value_stored_(const ExpressionSize<NArrays>& loc,
		       const internal::ScratchVector<NScratch>& scratch) const {
      return Engine::template value_at_location<MyArrayNum>(data_, loc);
    }

    // Advance entry MyArrayNum of "loc" by the amount the engine's
    // row_offset function returns for the current offset and location
    template <int MyArrayNum, int NArrays>
    void advance_location_(ExpressionSize<NArrays>& loc) const {
      loc[MyArrayNum] += Engine::template row_offset<MyArrayNum>(offset_, loc);
    }

    // If an expression leads to calc_gradient being called on an
    // active object, we push the multiplier and the gradient index on
    // to the operation stack (or 1.0 if no multiplier is specified).
    // This overload is the one without a multiplier; the "scratch"
    // argument is not used.
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
    void calc_gradient_(Stack& stack, const ExpressionSize<NArrays>& loc,
			const internal::ScratchVector<NScratch>& scratch) const {
      Engine::template push_rhs<MyArrayNum>(stack, static_cast<Type>(1.0), 
					    gradient_index(), loc);
    }
    // As the calc_gradient_ overload without a multiplier, but the
    // caller-supplied "multiplier" is passed to the engine's push_rhs
    // in place of 1.0; the "scratch" argument is not used
    template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, typename MyType>
    void calc_gradient_(Stack& stack, const ExpressionSize<NArrays>& loc,
			const internal::ScratchVector<NScratch>& scratch,
			const MyType& multiplier) const {
      Engine::template push_rhs<MyArrayNum>(stack, multiplier, gradient_index(), loc);
    }
  

    // -------------------------------------------------------------------
    // SpecialMatrix: 7. Protected member functions
    // -------------------------------------------------------------------
  protected:

    // When assigning a scalar to a whole array, there may be
    // advantage in specialist behaviour depending on the rank of the
    // array. This generic version copies the number into every
    // element of each row range reported by the engine and treats the
    // present array as passive (nothing is recorded).
    template <bool LocalIsActive, typename X>
    typename internal::enable_if<!LocalIsActive,void>::type
    assign_inactive_scalar(X x) {
      Index col_begin, col_end, mem_index, mem_stride;
      for (Index row = 0; row < dimension_; ++row) {
        Engine::get_row_range(row, dimension_, offset_,
                              col_begin, col_end, mem_index, mem_stride);
        Index col = col_begin;
        while (col < col_end) {
          data_[mem_index] = x;
          ++col;
          mem_index += mem_stride;
        }
      }
    }

    // An active array being assigned the value of an inactive scalar:
    // for each row, the range of gradient indices about to be
    // overwritten is pushed on to the active stack as left-hand sides
    // and then the values are set
    template <bool LocalIsActive, typename X>
    typename internal::enable_if<LocalIsActive,void>::type
    assign_inactive_scalar(X x) {
#ifdef ADEPT_RECORDING_PAUSABLE
      // When recording is paused the inactive version does the job
      if (! ADEPT_ACTIVE_STACK->is_recording()) {
        assign_inactive_scalar<false, X>(x);
        return;
      }
#endif
      Index col_begin, col_end, mem_index, mem_stride;
      for (Index row = 0; row < dimension_; ++row) {
        Engine::get_row_range(row, dimension_, offset_,
                              col_begin, col_end, mem_index, mem_stride);
        ADEPT_ACTIVE_STACK->push_lhs_range(gradient_index()+mem_index,
                                           col_end-col_begin,
                                           mem_stride);
        Index col = col_begin;
        while (col < col_end) {
          data_[mem_index] = x;
          ++col;
          mem_index += mem_stride;
        }
      }
    }


    // When copying an expression to a whole array, there may be
    // advantage in specialist behaviour depending on the rank of the
    // array.  This is the version for an inactive array: for each
    // row, the expression is positioned at the start of the engine's
    // row range and its successive values are copied into the data.
    template<bool LocalIsActive, bool EIsActive, class E>
    typename internal::enable_if<!LocalIsActive,void>::type
    assign_expression_(const E& rhs) {
      ADEPT_STATIC_ASSERT(!EIsActive, CANNOT_ASSIGN_ACTIVE_EXPRESSION_TO_INACTIVE_ARRAY);
      ExpressionSize<2> coord(0);
      ExpressionSize<internal::expr_cast<E>::n_arrays> rhs_loc(0);
      Index col_begin, col_end, mem_index, mem_stride;
      while (coord[0] < dimension_) {
        Engine::get_row_range(coord[0], dimension_, offset_,
                              col_begin, col_end, mem_index, mem_stride);
        coord[1] = col_begin;
        rhs.set_location(coord, rhs_loc);
        while (coord[1] < col_end) {
          data_[mem_index] = rhs.next_value(rhs_loc);
          ++coord[1];
          mem_index += mem_stride;
        }
        ++coord[0];
      }
    }

    // Version of assign_expression_ for an active array: as well as
    // copying the values, the gradient information of the expression
    // is recorded on the active stack, followed by the gradient index
    // of each element written
    template<bool LocalIsActive, bool EIsActive, class E>
    typename internal::enable_if<LocalIsActive,void>::type
    assign_expression_(const E& rhs) {
#ifdef ADEPT_RECORDING_PAUSABLE
      // If recording has been paused then call the inactive version
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        assign_expression_<false,false>(rhs);
        return;
      }
#endif
      ExpressionSize<2> coord(0);
      ExpressionSize<internal::expr_cast<E>::n_arrays> rhs_loc(0);
      // Ask the stack for space before starting to push operations
      ADEPT_ACTIVE_STACK->check_space(internal::expr_cast<E>::n_active * size());
      Index col_begin, col_end, mem_index, mem_stride;
      while (coord[0] < dimension_) {
        Engine::get_row_range(coord[0], dimension_, offset_,
                              col_begin, col_end, mem_index, mem_stride);
        coord[1] = col_begin;
        rhs.set_location(coord, rhs_loc);
        while (coord[1] < col_end) {
          data_[mem_index] = rhs.next_value_and_gradient(*ADEPT_ACTIVE_STACK, rhs_loc);
          ADEPT_ACTIVE_STACK->push_lhs(gradient_index()+mem_index);
          ++coord[1];
          mem_index += mem_stride;
        }
        ++coord[0];
      }
    }


    // -------------------------------------------------------------------
    // SpecialMatrix: 8. Data
    // -------------------------------------------------------------------
  protected:
    Type* data_;                      // Pointer to values
    Storage<Type>* storage_;          // Pointer to Storage object
    Index dimension_;                 // Size of each dimension
    Index offset_;                    // Memory offset for
				      // slowest-varying dimension

  }; // End of SpecialMatrix class


  // -------------------------------------------------------------------
  // Helper functions
  // -------------------------------------------------------------------

  // Stream insertion for a SpecialMatrix: delegates to its print()
  // member function
  template <typename Type, class Engine, bool IsActive>
  inline
  std::ostream&
  operator<<(std::ostream& os, const SpecialMatrix<Type,Engine,IsActive>& A) {
    A.print(os);
    return os;
  }

  // Extract inactive part of array, working correctly depending on
  // whether argument is active or inactive.  For an inactive matrix
  // there is nothing to strip, so the argument itself is returned by
  // reference.
  template <typename Type, class Engine>
  inline
  SpecialMatrix<Type, Engine, false>&
  value(SpecialMatrix<Type, Engine, false>& expr) {
    return expr;
  }
  // Overload of value() for an active matrix: returns, by value, the
  // inactive matrix produced by the argument's inactive_link()
  template <typename Type, class Engine>
  inline
  SpecialMatrix<Type, Engine, false>
  value(SpecialMatrix<Type, Engine, true>& expr) {
    return expr.inactive_link();
  }

  // Array::diag_matrix(), where Array is a 1D array, returns a
  // DiagMatrix whose diagonal is the original data (it points to that
  // data rather than copying it), so it can be used as an lvalue.
  // Needs to be defined after DiagMatrix.
  template <int Rank, typename Type, bool IsActive>
  inline
  SpecialMatrix<Type, internal::BandEngine<ROW_MAJOR,0,0>, IsActive>
  Array<Rank,Type,IsActive>::diag_matrix() {
    typedef SpecialMatrix<Type, internal::BandEngine<ROW_MAJOR,0,0>,
                          IsActive> diag_type;
    return diag_type(data_, storage_, dimensions_[0], offset_[0]-1);
  }

  // FixedArray::diag_matrix(): as Array::diag_matrix() but for a
  // fixed-size array.  The result is built with a null storage
  // pointer and is given the gradient index of the array explicitly.
  template <typename Type, bool IsActive, Index J0, Index J1, Index J2,
            Index J3, Index J4, Index J5, Index J6>
  inline
  SpecialMatrix<Type, internal::BandEngine<ROW_MAJOR,0,0>, IsActive>
  FixedArray<Type,IsActive,J0,J1,J2,J3,J4,J5,J6>::diag_matrix() {
    typedef SpecialMatrix<Type, internal::BandEngine<ROW_MAJOR,0,0>,
                          IsActive> diag_type;
    return diag_type(data_, 0, dimension_<0>::value, offset_<0>::value-1,
                     internal::GradientIndex<IsActive>::get());
  }

} // End namespace adept


#endif


================================================
FILE: include/adept/Stack.h
================================================
/* Stack.h -- Storage of automatic differentiation information

    Copyright (C) 2012-2014 University of Reading
    Copyright (C) 2015-2020 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.


   The Stack class is where all the derivative information of an
   algorithm, from which the Jacobian matrix can be constructed, as
   well as tangent-linear and adjoint operations being carried out for
   suitable input derivatives.  When a Stack object is created it puts
   a pointer to itself in a global but thread-local variable that is
   then accessed whenever an active expression is evaluated.

*/

#ifndef AdeptStack_H
#define AdeptStack_H 1

#include <cmath>
#include <iostream>
#include <typeinfo>
#include <utility>
#include <string>
#include <vector>
#include <list>
#include <cstddef>
#include <limits>

#ifdef ADEPT_STACK_STORAGE_STL
#include <valarray>
#endif

#include <adept/base.h>
#include <adept/exception.h>
#include <adept/StackStorageOrig.h>
#include <adept/StackStorageOrigStl.h>
#include <adept/traits.h>

namespace adept {

  // ---------------------------------------------------------------------
  // Access to Stack object via global pointer
  // ---------------------------------------------------------------------

  // Declare a thread-safe and a thread-unsafe global pointer to the
  // current stack
  class Stack;
  extern ADEPT_THREAD_LOCAL Stack* _stack_current_thread;
  extern Stack* _stack_current_thread_unsafe;

  // Define ADEPT_ACTIVE_STACK to be the currently active version
  // regardless of whether we are in thread safe or unsafe mode
#ifdef ADEPT_STACK_THREAD_UNSAFE
#define ADEPT_ACTIVE_STACK adept::_stack_current_thread_unsafe
#else
#define ADEPT_ACTIVE_STACK adept::_stack_current_thread
#endif

  // ---------------------------------------------------------------------
  // Helper classes
  // ---------------------------------------------------------------------

  // Structure holding a fixed-size array of objects (intended for
  // double or float)
  template<int Size, class Type>
  struct Block {
    Block() { zero(); }
    const Type& operator[](uIndex i) const { return data[i]; }
    Type& operator[](uIndex i) { return data[i]; }
    void zero() { for (uIndex i = 0; i < Size; i++) data[i] = 0.0; }
    Type data[Size] ADEPT_SSE2_ALIGNED;
  };

  // Structure for describing a gap in the current list of gradients.
  // A gap is described by the indices "start" and "end"; the code in
  // Stack::register_gradient treats a gap as closed once start
  // exceeds end, i.e. "end" is inclusive.
  struct Gap {
    // A gap consisting of the single index "value"
    Gap(uIndex value) : start(value), end(value) {}
    // A gap running from start_ to end_
    Gap(uIndex start_, uIndex end_) : start(start_), end(end_) {}
    uIndex start;
    uIndex end;
  };

  // Forward declaration of Array, to enable Jacobian functions
  template<int Rank, typename Type, bool IsActive>
  class Array;

  // ---------------------------------------------------------------------
  // Definition of Stack class
  // ---------------------------------------------------------------------

  // "Stack" inherits from a class defining the storage of the stack
  // information, which is controlled by preprocessor
  // variables. Member functions not defined here are in Stack.cpp.
  class Stack 
#ifdef ADEPT_STACK_STORAGE_STL
    : public internal::StackStorageOrigStl
#else
    : public internal::StackStorageOrig
#endif
  {
  public:
    // -------------------------------------------------------------------
    // Stack: 1. Static Definitions
    // -------------------------------------------------------------------
    typedef std::list<Gap> GapList;
    typedef std::list<Gap>::iterator GapListIterator;

    // -------------------------------------------------------------------
    // Stack: 2. Constructor and destructor
    // -------------------------------------------------------------------

    // Only one constructor, which is normally called with no
    // arguments, but if "false" is provided as the argument it will
    // construct as normal but not attempt to make itself the active stack.
    // The members are set to their empty/initial values, then the
    // storage is initialized with ADEPT_INITIAL_STACK_LENGTH, a new
    // recording is started and (optionally) the stack is activated.
    Stack(bool activate_immediately = true) :
#ifndef ADEPT_STACK_STORAGE_STL
      gradient_(0),
#endif
      most_recent_gap_(gap_list_.end()),
      i_gradient_(0), n_allocated_gradients_(0), max_gradient_(0),
      n_gradients_registered_(0),
      gradients_initialized_(false), 
      // Record which global stack pointer this build uses
#ifdef ADEPT_STACK_THREAD_UNSAFE
      is_thread_unsafe_(true),
#else
      is_thread_unsafe_(false),
#endif
      is_recording_(true),
      // Since the library might be compiled with OpenMP support and
      // subsequent programs without, we need to tell the library via
      // the following variable
#ifdef _OPENMP
      have_openmp_(true),
#else
      have_openmp_(false),
#endif
      openmp_manually_disabled_(false)
    { 
      initialize(ADEPT_INITIAL_STACK_LENGTH);
      new_recording();
      if (activate_immediately) {
	activate();
      }
    }
  
    // Destructor
    ~Stack();

    // -------------------------------------------------------------------
    // Stack: 3. Public member functions
    // -------------------------------------------------------------------

    // This function is no longer available: it is retained only so
    // that a call to it throws feature_not_available with a message
    // pointing to Stack::new_recording().  The argument is ignored.
    void start(uIndex n = ADEPT_INITIAL_STACK_LENGTH) {
      throw feature_not_available("The Stack::start() function has been removed since Adept version 1.0: see the documentation about how to use Stack::new_recording()"
				  ADEPT_EXCEPTION_LOCATION);
    }

    // After a sequence of operation pushes, this function appends
    // them to the previous statement by moving that statement's
    // end_plus_one to the current number of operations.
    // gradient_index is the index of the gradient on the LHS of the
    // statement; it must match the LHS of the previous statement.
    // Returns true on success and false (changing nothing) if the
    // indices do not match.
    bool update_lhs(const uIndex& gradient_index) {
      if (statement_[n_statements_-1].index == gradient_index) {
        statement_[n_statements_-1].end_plus_one = n_operations_;
        return true;
      }
      return false;
    }

    // When an aReal object is created it is registered on the stack
    // and keeps a copy of its location, which is returned from this
    // function.  The location is taken from the first gap in the
    // gradient list if there is one, otherwise from the end of the
    // list.  If recording is pausable and currently paused, nothing
    // is registered and 0 is returned.
    uIndex register_gradient() {
#ifdef ADEPT_RECORDING_PAUSABLE
      if (!is_recording()) {
        return 0;
      }
#endif
      n_gradients_registered_++;

      if (gap_list_.empty()) {
        // No gaps: add to end of gradient vector, keeping track of
        // the largest extent reached
        i_gradient_++;
        if (i_gradient_ > max_gradient_) {
          max_gradient_ = i_gradient_;
        }
        return i_gradient_-1;
      }

      // Otherwise take the first element of the first gap
      Gap& first_gap = gap_list_.front();
      const uIndex index_in_gap = first_gap.start;
      first_gap.start++;
      if (first_gap.start > first_gap.end) {
        // Gap has closed: remove it from the list, after checking if
        // it had been stored as the gap that had most recently grown
        if (most_recent_gap_ == gap_list_.begin()) {
          most_recent_gap_ = gap_list_.end();
        }
        gap_list_.pop_front();
      }
      return index_in_gap;
    }

    // Register n gradients and return the index of the first one.
    // The work is done by do_register_gradients; if recording is
    // pausable and currently paused, nothing is registered and 0 is
    // returned.
    uIndex register_gradients(const uIndex& n)  {
#ifdef ADEPT_RECORDING_PAUSABLE
      if (!is_recording()) {
        return 0;
      }
#endif
      return do_register_gradients(n);
    }


    // When an aReal object is destroyed it is unregistered from the
    // stack. If it is at the top of the stack then the stack pointer
    // can be decremented so that the space can be used by another
    // object. A gap can appear in the stack if an active object (or
    // array of active objects) is returned from a function, so we
    // need to keep track of a "gap" appearing in the stack. If the
    // user uses new and delete without any regard for this "last-in
    // first-out" preference then the number of gradients that are
    // allocated in the reverse pass may be larger than needed.
    void unregister_gradient(const uIndex& gradient_index) {
      n_gradients_registered_--;
      if (gradient_index+1 == i_gradient_) {
        // Gradient to be unregistered is at the top of the stack
        i_gradient_--;
	if (!gap_list_.empty()) {
	  Gap& last_gap = gap_list_.back();
	  if (i_gradient_ == last_gap.end+1) {
	    // We have unregistered the elements between the "gap" of
	    // unregistered element and the top of the stack, so can
	    // set the variables indicating the presence of the gap to
	    // zero
	    i_gradient_ = last_gap.start;
	    GapListIterator it = gap_list_.end();
	    it--;
	    if (most_recent_gap_ == it) {
	      most_recent_gap_ = gap_list_.end();
	    }
	    gap_list_.pop_back();
	  }
	}
      }
      else { // Gradient to be unregistered not at top of stack.
	// In the less common situation that the gradient is not at
	// the top of the stack, the task of unregistering is a bit
	// more involved, so we carry it out in a non-inline function
	// to avoid code bloat
	unregister_gradient_not_top(gradient_index);
      }
    }

    // Unregister n gradients starting at gradient_index
    void unregister_gradients(const uIndex& gradient_index,
			      const uIndex& n);


  protected:
    uIndex do_register_gradients(const uIndex& n);

    // Unregister a gradient that is not at the top of the stack
    void unregister_gradient_not_top(const uIndex& gradient_index);
  public:

    // Set the gradients in the list with indices from start to
    // end_plus_one-1 to the consecutive values pointed to by
    // "gradient".  The gradient list is initialized first if that has
    // not already been done.  Throws gradient_out_of_range if
    // end_plus_one exceeds max_gradient_.
    template <typename MyReal>
    typename internal::enable_if<internal::is_floating_point<MyReal>::value,
                       void>::type
    set_gradients(uIndex start, uIndex end_plus_one,
                  const MyReal* gradient) {
      if (!gradients_are_initialized()) {
        initialize_gradients();
      }
      if (end_plus_one > max_gradient_) {
        throw gradient_out_of_range();
      }
      uIndex i_target = start;
      uIndex i_source = 0;
      while (i_target < end_plus_one) {
        gradient_[i_target] = gradient[i_source];
        i_target++;
        i_source++;
      }
    }
    // Strided version of set_gradients: successive values are read
    // from "gradient" at intervals of src_stride and written to the
    // gradient list at intervals of target_stride, starting at index
    // "start" and continuing while the index is below end_plus_one.
    // The gradient list is initialized first if necessary.  Throws
    // gradient_out_of_range if end_plus_one exceeds max_gradient_.
    template <typename MyReal>
    typename internal::enable_if<internal::is_floating_point<MyReal>::value,
                       void>::type
    set_gradients(uIndex start, uIndex end_plus_one,
                  const MyReal* gradient, Index src_stride, Index target_stride) {
      if (!gradients_are_initialized()) {
        initialize_gradients();
      }
      if (end_plus_one > max_gradient_) {
        throw gradient_out_of_range();
      }
      uIndex i_target = start;
      uIndex i_source = 0;
      while (i_target < end_plus_one) {
        gradient_[i_target] = gradient[i_source];
        i_target += target_stride;
        i_source += src_stride;
      }
    }

    // Copy the gradients in the list with indices from start to
    // end_plus_one-1 into consecutive elements of the array pointed
    // to by "gradient".  Throws gradients_not_initialized if the
    // gradient list has not been initialized, and
    // gradient_out_of_range if end_plus_one exceeds max_gradient_.
    template <typename MyReal>
    typename internal::enable_if<internal::is_floating_point<MyReal>::value,
                       void>::type
    get_gradients(uIndex start, uIndex end_plus_one,
                  MyReal* gradient) const {
      if (!gradients_are_initialized()) {
        throw gradients_not_initialized();
      }
      if (end_plus_one > max_gradient_) {
        throw gradient_out_of_range();
      }
      uIndex i_source = start;
      uIndex i_target = 0;
      while (i_source < end_plus_one) {
        gradient[i_target] = gradient_[i_source];
        i_source++;
        i_target++;
      }
    }
    // Strided version of get_gradients: successive values are read
    // from the gradient list at intervals of src_stride, starting at
    // index "start" and continuing while the index is below
    // end_plus_one, and written to "gradient" at intervals of
    // target_stride.  Throws gradients_not_initialized if the
    // gradient list has not been initialized, and
    // gradient_out_of_range if end_plus_one exceeds max_gradient_.
    template <typename MyReal>
    typename internal::enable_if<internal::is_floating_point<MyReal>::value,
                       void>::type
    get_gradients(uIndex start, uIndex end_plus_one,
                  MyReal* gradient, Index src_stride, Index target_stride) const {
      if (!gradients_are_initialized()) {
        throw gradients_not_initialized();
      }
      if (end_plus_one > max_gradient_) {
        throw gradient_out_of_range();
      }
      uIndex i_source = start;
      uIndex i_target = 0;
      while (i_source < end_plus_one) {
        gradient[i_target] = gradient_[i_source];
        i_source += src_stride;
        i_target += target_stride;
      }
    }

    // Run the tangent-linear algorithm on the gradient list; normally
    // this call is preceded by calls to set_gradient to load input
    // gradients and followed by calls to get_gradient to extract
    // gradients
    void compute_tangent_linear();
    // Shorter name for compute_tangent_linear()
    void forward() {
      compute_tangent_linear();
    }

    // Run the adjoint algorithm on the gradient list; normally this
    // call is preceded by calls to set_gradient to load input gradient
    // and followed by calls to get_gradient to extract gradient
    void compute_adjoint();
    // Shorter name for compute_adjoint()
    void reverse() {
      compute_adjoint();
    }

    // Return the number of independent and dependent variables that
    // have been identified, i.e. the lengths of the lists of
    // independent and dependent gradient indices
    uIndex n_independent() const { return static_cast<uIndex>(independent_index_.size()); }
    uIndex n_dependent()   const { return static_cast<uIndex>(dependent_index_.size()); }

    // Compute the Jacobian matrix and write it to jacobian_out, which
    // must already be allocated with size m*n, where m is the number
    // of dependent variables and n the number of independents. The
    // independents and dependents must have been identified
    // beforehand with the functions "independent" and "dependent",
    // otherwise a "dependents_or_independents_not_identified"
    // exception is thrown. The optional dep_offset and indep_offset
    // give the offsets in memory of the dependent and independent
    // variables, respectively, where 0 means use the size of the
    // other dimension.  The default is dep_offset=1, i.e. the
    // dependents vary contiguously in memory, which is equivalent to
    // the Jacobian being stored in column-major order.  Unfortunately
    // this is not the same as the convention for Adept arrays, but
    // this part of the interface was designed in Adept 1 before
    // arrays were added.
    void jacobian(Real* jacobian_out,
                  Index dep_offset = 1,
                  Index indep_offset = 0) const {
      // Use whichever of jacobian_forward and jacobian_reverse would
      // be faster: forward if there are no more independents than
      // dependents, reverse otherwise
      const bool use_forward = (n_independent() <= n_dependent());
      if (use_forward) {
        jacobian_forward(jacobian_out, dep_offset, indep_offset);
      }
      else {
        jacobian_reverse(jacobian_out, dep_offset, indep_offset);
      }
    }

    // Compute the Jacobian matrix, but explicitly specify whether
    // this is done with repeated forward or reverse passes.
    void jacobian_forward(Real* jacobian_out,
			  Index dep_offset = 1,
			  Index indep_offset = 0) const;
    void jacobian_reverse(Real* jacobian_out,
			  Index dep_offset = 1,
			  Index indep_offset = 0) const;

    // If the user included "adept_arrays.h" rather than "adept.h",
    // then allow the Jacobian to be returned in the form of an Adept
    // matrix.
    void jacobian(Array<2,Real,false> jac) const;
    void jacobian_forward(Array<2,Real,false> jac) const;
    void jacobian_reverse(Array<2,Real,false> jac) const;
    Array<2,Real,false> jacobian() const;
    Array<2,Real,false> jacobian_forward() const;
    Array<2,Real,false> jacobian_reverse() const;

    // Return maximum number of OpenMP threads to be used in Jacobian
    // calculation
    int max_jacobian_threads() const;

    // Set the maximum number of threads to be used in Jacobian
    // calculations, if possible. A value of 1 indicates that OpenMP
    // will not be used, while a value of 0 indicates that the number
    // will match the number of available processors. Returns the
    // maximum that will be used, which will be 1 if the Adept library
    // was compiled without OpenMP support. Note that a value of 1
    // will disable the use of OpenMP with Adept, so Adept will then
    // use no OpenMP directives or function calls. Note that if in
    // your program you use OpenMP with each thread performing
    // automatic differentiation with its own independent Adept stack,
    // then typically only one OpenMP thread is available for each
    // Jacobian calculation, regardless of whether you call this
    // function.
    int set_max_jacobian_threads(int n);

    // In order to compute the jacobian we need to first declare which
    // active variables are independent (x) and which are dependent
    // (y). First, the following two functions declare an individual
    // active variable and an array of active variables to be
    // independent. Note that we use templates here because aReal has
    // not been defined.
    template <class A>
    void independent(const A& x) {
      // The object itself is handed the list of independents via its
      // push_gradient_indices member, so A may be any type that
      // provides that function.
      x.push_gradient_indices(independent_index_);
    }
    template <class A>
    void independent(const A* x, uIndex n) {
      // Visit the n elements starting at x in order, handing each one
      // the list of independents via push_gradient_indices.
      const A* const x_end = x + n;
      for (const A* elem = x; elem != x_end; ++elem) {
        elem->push_gradient_indices(independent_index_);
      }
    }

    // Likewise, declare the dependent variables
    template <class A>
    void dependent(const A& x) {
      // The object itself is handed the list of dependents via its
      // push_gradient_indices member, so A may be any type that
      // provides that function.
      x.push_gradient_indices(dependent_index_);
    }
    template <class A>
    void dependent(const A* x, uIndex n) {
      // Visit the n elements starting at x in order, handing each one
      // the list of dependents via push_gradient_indices.
      const A* const x_end = x + n;
      for (const A* elem = x; elem != x_end; ++elem) {
        elem->push_gradient_indices(dependent_index_);
      }
    }

    // Print various bits of information about the Stack to the
    // specified stream (or standard output if not specified). The
    // same behaviour can be obtained by "<<"-ing the Stack to a
    // stream.
    void print_status(std::ostream& os = std::cout) const;

    // Print each derivative statement to the specified stream (or
    // standard output if not specified)
    void print_statements(std::ostream& os = std::cout) const;

    // Print the current gradient list to the specified stream (or
    // standard output if not specified); returns true on success or
    // false if no gradients have been initialized
    bool print_gradients(std::ostream& os = std::cout) const;

    // Print a list of the gaps in the gradient list
    void print_gaps(std::ostream& os = std::cout) const;

    // Clear the gradient list enabling a new adjoint or
    // tangent-linear computation to be performed with the same
    // recording
    void clear_gradients() {
      // Only the "initialized" flag is reset here; this function
      // does not itself touch any gradient storage
      gradients_initialized_ = false;
    }

    // Clear the list of independent variables, in order that a
    // different Jacobian can be computed from the same recording
    void clear_independents() {
      // Empties the list built up by calls to independent(); the
      // recorded statements and operations are left untouched
      independent_index_.clear();
    }

    // Clear the list of dependent variables, in order that a
    // different Jacobian can be computed from the same recording
    void clear_dependents() {
      // Empties the list built up by calls to dependent(); the
      // recorded statements and operations are left untouched
      dependent_index_.clear();
    }

    // Function now removed
    void clear() {
      // Retained only so that old code calling it fails loudly: the
      // body unconditionally throws feature_not_available with a
      // message directing the caller to Stack::new_recording().
      // ADEPT_EXCEPTION_LOCATION is a macro placed directly after the
      // string literal. NOTE(review): presumably it appends source
      // location text to the message -- confirm against its
      // definition.
      throw feature_not_available("The Stack::clear() function has been removed since Adept version 1.0: see the documentation about how to use Stack::new_recording()"
				  ADEPT_EXCEPTION_LOCATION);
    }
    // Function now removed
    void clear_statements() {
      // Retained only so that old code calling it fails loudly: the
      // body unconditionally throws feature_not_available with a
      // message directing the caller to Stack::new_recording().
      // ADEPT_EXCEPTION_LOCATION is a macro placed directly after the
      // string literal. NOTE(review): presumably it appends source
      // location text to the message -- confirm against its
      // definition.
      throw feature_not_available("The Stack::clear_statements() function has been removed since Adept version 1.0: see the documentation about how to use Stack::new_recording()"
				  ADEPT_EXCEPTION_LOCATION);
    }

    // Make this stack "active" by copying its "this" pointer to a
    // global variable; this makes it the stack that aReal objects
    // subsequently interact with when being created and participating
    // in mathematical expressions
    void activate();

    // This stack will stop being the one that aReal objects refer
    // to; this may be useful if the thread needs to use another stack
    // object for the next algorithm
    void deactivate() {
      // Leave the active-stack pointer alone unless it currently
      // refers to this object, so that deactivating an inactive
      // stack cannot unregister a different one
      if (!is_active()) {
        return;
      }
      ADEPT_ACTIVE_STACK = 0;
    }

    // Return true if the Stack is "active", false otherwise
    bool is_active() const {
      // Active means the active-stack pointer refers to this very
      // object
      const bool registered_as_active = (this == ADEPT_ACTIVE_STACK);
      return registered_as_active;
    }

    // Clear the contents of the various lists ready for a new
    // recording
    void new_recording() {
      // Reset the statement and operation counts (clear_stack is
      // defined in the storage class)
      clear_stack();

      // Drop everything that referred to the old recording: the
      // gradient-initialized flag and the lists of dependent and
      // independent variables
      clear_gradients();
      clear_dependents();
      clear_independents();

      // i_gradient_ is the maximum index of all currently
      // constructed aReal objects, whereas max_gradient_ is the
      // maximum index of all that were used in a recording. Now that
      // the recording has been discarded, max_gradient_ is reset to
      // just beyond i_gradient_.
      max_gradient_ = i_gradient_+1;

      // Begin the new recording with a null statement
      push_lhs(-1);
    }

    // Are gradients to be computed?  The default is "true", but if
    // ADEPT_RECORDING_PAUSABLE is defined then this may
    // be false
    bool is_recording() const {
      // Without ADEPT_RECORDING_PAUSABLE recording can never be
      // switched off, so the answer is a compile-time "true";
      // otherwise report the current state of the flag
#ifndef ADEPT_RECORDING_PAUSABLE
      return true;
#else
      return is_recording_;
#endif
    }

    // Stop recording gradient information, enabling a piece of active
    // code to be run without the stack information being stored. This
    // only works if ADEPT_RECORDING_PAUSABLE has been defined.
    bool pause_recording() {
      // The return value reports whether pausing is supported in this
      // build: true (after switching recording off) if
      // ADEPT_RECORDING_PAUSABLE is defined, false otherwise
#if defined(ADEPT_RECORDING_PAUSABLE)
      is_recording_ = false;
      const bool pausing_supported = true;
#else
      const bool pausing_supported = false;
#endif
      return pausing_supported;
    }
    // Continue recording gradient information after a previous
    // pause_recording() call. This only works if
    // ADEPT_RECORDING_PAUSABLE has been defined.
    bool continue_recording() {
      // The return value reports whether pausing is supported in this
      // build: true (after switching recording back on) if
      // ADEPT_RECORDING_PAUSABLE is defined, false otherwise
#if defined(ADEPT_RECORDING_PAUSABLE)
      is_recording_ = true;
      const bool pausing_supported = true;
#else
      const bool pausing_supported = false;
#endif
      return pausing_supported;
    }

    // For modular codes, some modules may have an existing Jacobian
    // code and possibly be unsuitable for automatic differentiation
    // using Adept (e.g. because they are written in Fortran).  In
    // this case, we can use the following two functions to "wrap" the
    // non-Adept code. These are actually normally called by functions
    // of the same name in the Active, ActiveReference and
    // ActiveConstReference classes.
    void add_derivative_dependence(uIndex lhs_index, uIndex rhs_index,
                                   Real multiplier) {
      // Record on THIS stack a new derivative statement whose
      // left-hand side is gradient lhs_index and whose right-hand
      // side is the single operation (multiplier, rhs_index).
      //
      //   lhs_index:  gradient index on the left-hand side
      //   rhs_index:  gradient index on the right-hand side
      //   multiplier: factor applied to the right-hand-side gradient
      //
      // If multiplier is zero no operation is pushed, but the
      // statement is still pushed, so the statement then has an
      // empty right-hand side.
      //
      // The recording-state test and the space check are made on
      // this object, i.e. the same stack that receives the pushes
      // below, rather than on whichever stack is currently active.
#ifdef ADEPT_RECORDING_PAUSABLE
      if (is_recording()) {
#endif
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
        // Check there is space in the operation stack for 1 entry
        check_space(1);
#endif
        if (multiplier != 0.0) {
          push_rhs(multiplier, rhs_index);
        }
        push_lhs(lhs_index);
#ifdef ADEPT_RECORDING_PAUSABLE
      }
#endif
    }

    void append_derivative_dependence(uIndex lhs_index, uIndex rhs_index,
                                      Real multiplier) {
      // Add one more operation (multiplier, rhs_index) to the most
      // recent statement on THIS stack, which must have lhs_index on
      // its left-hand side.
      //
      //   lhs_index:  gradient index expected on the left-hand side
      //               of the most recent statement
      //   rhs_index:  gradient index on the right-hand side
      //   multiplier: factor applied to the right-hand-side gradient;
      //               if zero, no operation is pushed
      //
      // Throws wrong_gradient if update_lhs(lhs_index) returns false.
      // Note that the operation has already been pushed by the time
      // that check is made.
      //
      // The recording-state test and the space check are made on
      // this object, i.e. the same stack that receives the push
      // below, rather than on whichever stack is currently active.
#ifdef ADEPT_RECORDING_PAUSABLE
      if (is_recording()) {
#endif
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
        // Check there is space in the operation stack for 1 entry
        check_space(1);
#endif
        if (multiplier != 0.0) {
          push_rhs(multiplier, rhs_index);
        }
        if (!update_lhs(lhs_index)) {
          throw wrong_gradient("Wrong gradient: append_derivative_dependence called on a different active number from the most recent add_derivative_dependence call"
                               ADEPT_EXCEPTION_LOCATION);
        }
#ifdef ADEPT_RECORDING_PAUSABLE
      }
#endif
    }

    // To enable the automatic differentiation of matrix
    // multiplication, this function performs a similar role to
    // aReal::add_derivative_dependence.  We add a derivative
    // expression of the form d[lhs_index] =
    // sum(multiplier[i*multiplier_stride]*d[rhs_index+i*index_stride]),
    // where the summation is from i = 0 to n-1. Multiple calls to
    // this function may be carried out but must be followed by
    // push_lhs(lhs_index) to specify the left-hand-side of the
    // statement.
    template <typename Type>
    void push_derivative_dependence(uIndex rhs_index,
                                    const Type* multiplier,
                                    int n = 1,
                                    int index_stride = 1,
                                    int multiplier_stride = 1) {
      // Push n operations on to the operation stack. Successive
      // multipliers are read multiplier_stride elements apart and
      // successive gradient indices are index_stride apart, starting
      // from *multiplier and rhs_index. No statement is pushed here.
#ifdef ADEPT_RECORDING_PAUSABLE
      if (is_recording()) {
#endif
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
        // Check there is space in the operation stack for n entries
        check_space(n);
#endif
        for (int remaining = n; remaining > 0; --remaining) {
          push_rhs(*multiplier, rhs_index);
          rhs_index += index_stride;
          multiplier += multiplier_stride;
        }
#ifdef ADEPT_RECORDING_PAUSABLE
      }
#endif
    }

    // Have the gradients been initialized?
    bool gradients_are_initialized() const {
      // The flag is reset to false by clear_gradients()
      return gradients_initialized_;
    }

    // Return the number of statements, operations, and how much
    // memory has been allocated for each
    // Statements currently recorded
    uIndex n_statements() const {
      return n_statements_;
    }
    // Statements for which space has been allocated
    uIndex n_allocated_statements() const {
      return n_allocated_statements_;
    }
    // Operations currently recorded
    uIndex n_operations() const {
      return n_operations_;
    }
    // Operations for which space has been allocated
    uIndex n_allocated_operations() const {
      return n_allocated_operations_;
    }

    // Return the size of the two dimensions of a Jacobian matrix
    // Length of the list of independent variables
    uIndex n_independents() const {
      const std::vector<uIndex>::size_type count = independent_index_.size();
      return static_cast<uIndex>(count);
    }
    // Length of the list of dependent variables
    uIndex n_dependents() const {
      const std::vector<uIndex>::size_type count = dependent_index_.size();
      return static_cast<uIndex>(count);
    }

    // Return the maximum number of gradients required to perform
    // adjoint calculation
    uIndex max_gradients() const {
      // new_recording() resets this value to i_gradient_+1
      return max_gradient_;
    }

    // Return the highest gradient index on the left-hand-side of any
    // of the statements currently on the stack
    uIndex max_gradient_index() const {
      // Linear scan over every statement currently on the stack,
      // keeping the largest left-hand-side gradient index seen.
      // Returns 0 if there are no statements.
      //
      // The loop counter has the same type as n_statements_ so that
      // the comparison is not between signed and unsigned values and
      // the counter cannot overflow for very long recordings.
      //
      // NOTE(review): new_recording() pushes a null statement with
      // index -1; if uIndex is an unsigned type that value wraps to
      // the largest representable index and would dominate this
      // maximum -- confirm whether such entries should be skipped.
      uIndex mg = 0;
      for (uIndex is = 0; is < n_statements_; ++is) {
        if (statement_[is].index > mg) {
          mg = statement_[is].index;
        }
      }
      return mg;
    }

    // Return the index to the current gradient
    uIndex i_gradient() const {
      return i_gradient_;
    }

    // Return the number of gradients memory has been allocated for
    uIndex n_allocated_gradients() const {
      return n_allocated_gradients_;
    }

    // Return the number of bytes used
    std::size_t memory() const {
      // Each statement is counted as two uIndex values and each
      // operation as one Real plus one uIndex
      const std::size_t statement_bytes = n_statements()*sizeof(uIndex)*2;
      const std::size_t operation_bytes
        = n_operations()*(sizeof(Real)+sizeof(uIndex));
      std::size_t total_bytes = statement_bytes + operation_bytes;
      // The gradient array is only included once the gradients have
      // been initialized
      if (gradients_are_initialized()) {
        total_bytes += max_gradients()*sizeof(Real);
      }
      return total_bytes;
    }

    // Return the number of gradients currently registered
    uIndex n_gradients_registered() const {
      return n_gradients_registered_;
    }

    // Return the fraction of multipliers equal to the specified
    // number (usually -1, 0 or 1)
    Real fraction_multipliers_equal_to(Real val) const {
      // Count the recorded operations whose multiplier compares
      // exactly equal to val and return that count divided by the
      // total number of operations.
      //
      // With no operations recorded the ratio would be 0/0, so a
      // fraction of zero is returned instead of dividing.
      if (n_operations_ == 0) {
        return static_cast<Real>(0);
      }
      uIndex n_matching = 0;
      for (uIndex i = 0; i < n_operations_; ++i) {
        if (multiplier_[i] == val) {
          ++n_matching;
        }
      }
      return static_cast<Real>(n_matching)/static_cast<Real>(n_operations_);
    }


    // Return the thread-unsafe flag stored in this stack
    bool is_thread_unsafe() const {
      return is_thread_unsafe_;
    }

    // Read-only access to the list of gaps in the gradient array
    const GapList& gap_list() const {
      return gap_list_;
    }

    // Memory to store statements and operations can be preallocated,
    // offering modest performance advantage if you define
    // ADEPT_MANUAL_MEMORY_ALLOCATION and know the maximum number of
    // statements and operations you will need
    void preallocate_statements(uIndex n) {
      // Nothing to do while n more statements plus one spare slot fit
      // strictly within the current allocation
      if (n_statements_+n+1 < n_allocated_statements_) {
        return;
      }
      grow_statement_stack(n);
    }
    void preallocate_operations(uIndex n) {
      // Nothing to do while n more operations plus one spare slot fit
      // within the current allocation
      if (n_operations_+n+1 <= n_allocated_operations_) {
        return;
      }
      grow_operation_stack(n);
    }

    // -------------------------------------------------------------------
    // Stack: 4. Protected member functions
    // -------------------------------------------------------------------
  protected:
    // Initialize the vector of gradients ready for the adjoint
    // calculation
    void initialize_gradients();

    // Set to zero the gradients required by a Jacobian calculation
    /*
    void zero_gradient_multipass() {
      for (std::size_t i = 0; i < gradient_multipass_.size(); i++) {
	gradient_multipass_[i].zero();
      }
    }
    */

    // OpenMP versions of the forward and reverse Jacobian functions,
    // which are called from the jacobian_forward and jacobian_reverse
    // if OpenMP is enabled
    void jacobian_forward_openmp(Real* jacobian_out,
		  Index dep_offset, Index indep_offset) const;
    void jacobian_reverse_openmp(Real* jacobian_out,
		  Index dep_offset, Index indep_offset) const;

    // The core code for computing Jacobians, used in both OpenMP and
    // non-OpenMP versions
    void jacobian_forward_kernel(Real* __restrict gradient_multipass_b) const;
    void jacobian_forward_kernel_packet(Real* __restrict gradient_multipass_b) const;
    void jacobian_forward_kernel_extra(Real* __restrict gradient_multipass_b, uIndex) const;
    void jacobian_reverse_kernel(Real* __restrict gradient_multipass_b) const;
    void jacobian_reverse_kernel_packet(Real* __restrict gradient_multipass_b) const;
    void jacobian_reverse_kernel_extra(Real* __restrict gradient_multipass_b, uIndex) const;

    // -------------------------------------------------------------------
    // Stack: 5. Data
    // -------------------------------------------------------------------
  protected:

#ifdef ADEPT_STACK_STORAGE_STL
    // Data are stored using standard template library containers
    //    std::valarray<Real> gradient_;
    std::vector<Real> gradient_;
#else
    // Data are stored as dynamically allocated arrays
    Real* __restrict gradient_;
#endif
    // For Jacobians we process multiple rows/columns at once so need
    // what is essentially a 2D array
    //    std::vector<Block<ADEPT_MULTIPASS_SIZE,Real> > gradient_multipass_;
    // Gradient indices of the independent and dependent variables
    std::vector<uIndex> independent_index_;
    std::vector<uIndex> dependent_index_;
    // Keep a record of gaps in the gradient array to ensure that gaps
    // are filled
    GapList gap_list_;
    //    Gap* most_recent_gap_;
    GapListIterator most_recent_gap_;

    uIndex i_gradient_;             // Current number of gradients
    uIndex n_allocated_gradients_;  // Number of allocated gradients
    uIndex max_gradient_;           // Max number of gradients to store
    uIndex n_gradients_registered_; // Number of gradients registered
    bool gradients_initialized_;    // Have the gradients been
				    // initialized?
    // Value returned by the is_thread_unsafe() member function
    bool is_thread_unsafe_;
    // Recording flag: returned by is_recording() and set by
    // pause_recording()/continue_recording(), but only when
    // ADEPT_RECORDING_PAUSABLE is defined
    bool is_recording_;
    bool have_openmp_;              // true if this header file
				    // compiled with -fopenmp
    bool openmp_manually_disabled_; // true if user called
				    // set_max_jacobian_threads(1)
  }; // End of Stack class


  // -------------------------------------------------------------------
  // Helper functions
  // -------------------------------------------------------------------

  // Sending a Stack object to a stream reports information about the
  // stack
  inline
  std::ostream& operator<<(std::ostream& os, const adept::Stack& stack) {
    // Streaming a Stack is the same as calling print_status on it
    // with the stream as argument; the stream is returned so that
    // insertions can be chained
    stack.print_status(os);
    return os;
  }

  // Memory to store statements and operations can be preallocated,
  // offering modest performance advantage if you define
  // ADEPT_MANUAL_MEMORY_ALLOCATION and know the maximum number of
  // statements and operations you will need. This version is useful
  // in functions that don't have visible access to the currently
  // active Adept stack. 
  inline
  void preallocate_statements(uIndex n) {
    // Forward the request to the currently active stack.
    // NOTE(review): the active-stack pointer is dereferenced without
    // a null check, so a stack must be active when this is called.
    Stack* const stack = ADEPT_ACTIVE_STACK;
    stack->preallocate_statements(n);
  }
  inline
  void preallocate_operations(uIndex n) {
    // Forward the request to the currently active stack.
    // NOTE(review): the active-stack pointer is dereferenced without
    // a null check, so a stack must be active when this is called.
    Stack* const stack = ADEPT_ACTIVE_STACK;
    stack->preallocate_operations(n);
  }

  // Returns a pointer to the currently active stack (or 0 if there is none)
  inline
  Stack* active_stack() {
    return ADEPT_ACTIVE_STACK;
  }

  // Return whether the active stack is stored in a global variable
  // (thread unsafe) rather than a thread-local global variable
  // (thread safe)
#if defined(ADEPT_STACK_THREAD_UNSAFE)
  inline bool is_thread_unsafe() {
    return true;
  }
#else
  inline bool is_thread_unsafe() {
    return false;
  }
#endif

  // Subsequent code should use adept::active_stack rather than this
  // preprocessor macro
  //#undef ADEPT_ACTIVE_STACK

} // End of namespace adept


#endif


================================================
FILE: include/adept/StackStorage.h
================================================
/* StackStorage.h -- Storage of statement & operation stacks

    Copyright (C) 2012-2014 University of Reading
    Copyright (C) 2015 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

   The Stack class inherits from a class providing the storage (and
   interface to the storage) for the derivative statements that are
   accumulated during the execution of an algorithm.  The derivative
   statements are held in two stacks described by Hogan (2014): the
   "statement stack" and the "operation stack".

   This file provides the stack storage engine: blocks of dynamically
   allocated arrays.

*/

#ifndef AdeptStackStorage_H
#define AdeptStackStorage_H 1

#include <adept/base.h>
#include <adept/exception.h>
#include <adept/Statement.h>

namespace adept {
  namespace internal {

    // Helper classes


    struct StatementBlock {
      // Allocate an array of n_ statements; the block starts out
      // with none of them in use (n == 0)
      StatementBlock(uIndex n_)
        : data(new Statement[n_]), n(0), n_allocated(n_) { }

      // Release the array allocated by the constructor.
      // NOTE(review): no copy constructor is declared, so a copy of
      // this struct shares the same "data" pointer and both objects
      // would delete it -- avoid copying instances.
      ~StatementBlock() { delete [] data; }

      Statement* data;          // Array of n_allocated statements
      uIndex n;                 // Set to zero on construction
      const uIndex n_allocated; // Length of the "data" array
    };

    struct OperationBlock {
      StatementBlock(uIndex n_) : n(0), n_allocated(n_) {
	multiplier = new Real[n_];
	index      = new uIndex[n_];
      }
      ~StatementBlock() {
	delete [] multiplier;
	delete [] index;
      }
      // Data
      Real* multiplier;
      uIndex* index;
      uIndex n;
      uIndex n_allocated
    };

    // Associates a statement block and an operation block with a
    // pair of statement positions. StackBlock is defined before the
    // vector below because std::vector<StackBlock> cannot be declared
    // until the type is known.
    // NOTE(review): presumably statement_start/statement_end delimit
    // the statements covered by this entry -- confirm once the code
    // that fills them in exists.
    struct StackBlock {
      StatementBlock* statement_list;
      OperationBlock* operation_list;
      uIndex statement_start;
      uIndex statement_end;
    };

    // NOTE(review): these are namespace-scope objects defined in a
    // header; including this header from more than one translation
    // unit would define them more than once. They may have been
    // intended as members of the storage class -- confirm.
    std::vector<StackBlock> stack_block_;

    std::vector<StatementBlock> statement_data_;
    std::vector<OperationBlock> operation_data_;


    // Storage for the statement and operation stacks, held as
    // dynamically allocated arrays that are grown on demand.
    class StackStorage {
    public:
      // Constructor: all array pointers start out null and all
      // counts zero
      StackStorage() : 
        statement_(0), multiplier_(0), index_(0),
        n_statements_(0), n_allocated_statements_(0),
        n_operations_(0), n_allocated_operations_(0) { }
      
      // Destructor
      ~StackStorage();

      // Push an operation (i.e. a multiplier-gradient pair) on to the
      // stack.  We assume here that check_space() has been called
      // before so there is enough space to hold these elements.
      void push_rhs(const Real& multiplier, const uIndex& gradient_index) {
#ifdef ADEPT_REMOVE_NULL_STATEMENTS
        // If multiplier==0 then the resulting statement would have no
        // effect so we can speed up the subsequent adjoint/jacobian
        // calculations (at the expense of making this critical part
        // of the code slower)
        if (multiplier != 0.0) {
#endif
          multiplier_[n_operations_] = multiplier;
          index_[n_operations_++] = gradient_index;
        
#ifdef ADEPT_TRACK_NON_FINITE_GRADIENTS
          // Note that the operation has already been stored when the
          // exception is thrown
          if (!std::isfinite(multiplier) || std::isinf(multiplier)) {
            throw non_finite_gradient();
          }
#endif
        
#ifdef ADEPT_REMOVE_NULL_STATEMENTS
        }
#endif
      }


      // Push a statement on to the stack: this is done after a
      // sequence of operation pushes; gradient_index is the index of
      // the gradient on the LHS of the expression, while the
      // "end_plus_one" element is simply the current length of the
      // operation list
      void push_lhs(const uIndex& gradient_index) {
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
        if (n_statements_ >= n_allocated_statements_) {
          grow_statement_stack();
        }
#endif
        statement_[n_statements_].index = gradient_index;
        statement_[n_statements_++].end_plus_one = n_operations_;
      }

      // Push n left-hand-sides of differential expressions on to the
      // stack with no corresponding right-hand-side, appropriate if
      // an array of active variables contiguous in memory (or
      // separated by a fixed stride) has been assigned to inactive
      // numbers. The second and third arguments are taken by value
      // rather than by reference so that compile-time constants can
      // be passed for them.
      void push_lhs_range(const uIndex& first, uIndex n, uIndex stride = 1) {
        uIndex last_plus_1 = first+n*stride;
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
        if (n_statements_+n > n_allocated_statements_) {
          grow_statement_stack(n);
        }
#endif
        for (uIndex i = first; i < last_plus_1; i += stride) {
          statement_[n_statements_].index = i;
          statement_[n_statements_++].end_plus_one = n_operations_;
        }
      }

      // Check whether the operation stack contains enough space for n
      // new operations (plus one spare); if not, grow it. The
      // argument is taken by value so that compile-time constants can
      // be passed.
      void check_space(uIndex n) {
        if (n_allocated_operations_ < n_operations_+n+1) {
          grow_operation_stack(n);
        }
      }
      template<uIndex n>
      void check_space_static() {
        check_space(n);
      }

    protected:
      // Called by new_recording()
      void clear_stack() { 
        // Set the recording indices to zero; the allocated memory is
        // kept
        n_operations_ = 0;
        n_statements_ = 0;
      }

      // This function is called by the constructor to initialize
      // memory, which can be grown subsequently
      void initialize(uIndex n) {
        multiplier_ = new Real[n];
        index_ = new uIndex[n];
        n_allocated_operations_ = n;
        statement_ = new Statement[n];
        n_allocated_statements_ = n;
      }

      // Grow the capacity of the operation or statement stacks to
      // hold a minimum of "min" elements. If min=0 then the stacks
      // are doubled in size.
      void grow_operation_stack(uIndex min = 0);
      void grow_statement_stack(uIndex min = 0);

    protected:
      // Data are stored as dynamically allocated arrays

      // The "statement stack" is held as a single array
      Statement* __restrict statement_ ;
      // The "operation stack" is held as two arrays
      Real*      __restrict multiplier_;
      uIndex*    __restrict index_;

      uIndex n_statements_;           // Number of statements
      uIndex n_allocated_statements_; // Space allocated for statements
      uIndex n_operations_;           // Number of operations
      uIndex n_allocated_operations_; // Space allocated for operations
    };

  } // End namespace internal
} // End namespace adept

#endif


================================================
FILE: include/adept/StackStorageOrig.h
================================================
/* StackStorageOrig.h -- Original method to store statement & operation stacks

    Copyright (C) 2014-2015 University of Reading

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

   The Stack class inherits from a class providing the storage (and
   interface to the storage) for the derivative statements that are
   accumulated during the execution of an algorithm.  The derivative
   statements are held in two stacks described by Hogan (2014): the
   "statement stack" and the "operation stack".

   This file provides the original storage engine: dynamically
   allocated arrays with the two stacks resulting from an entire
   algorithm being contiguous in memory.  This is not ideal for very
   large algorithms.

*/

#ifndef AdeptStackStorageOrig_H
#define AdeptStackStorageOrig_H 1

#include <adept/base.h>
#include <adept/exception.h>
#include <adept/Statement.h>

namespace adept {
  namespace internal {

    class StackStorageOrig {
    public:
      // Constructor
      StackStorageOrig() : 
	statement_(0), multiplier_(0), index_(0),
	n_statements_(0), n_allocated_statements_(0),
	n_operations_(0), n_allocated_operations_(0) { }
      
      // Destructor
      ~StackStorageOrig();

      // Push an operation (i.e. a multiplier-gradient pair) on to the
      // stack.  We assume here that check_space() as been called before
      // so there is enough space to hold these elements.
      void push_rhs(const Real& multiplier, const uIndex& gradient_index) {
#ifdef ADEPT_REMOVE_NULL_STATEMENTS
	// If multiplier==0 then the resulting statement would have no
	// effect so we can speed up the subsequent adjoint/jacobian
	// calculations (at the expense of making this critical part
	// of the code slower)
	if (multiplier != 0.0) {
#endif
	  multiplier_[n_operations_] = multiplier;
	  index_[n_operations_++] = gradient_index;
	
#ifdef ADEPT_TRACK_NON_FINITE_GRADIENTS
	  if (!std::isfinite(multiplier) || std::isinf(multiplier)) {
	    throw non_finite_gradient();
	  }
#endif
	
#ifdef ADEPT_REMOVE_NULL_STATEMENTS
	}
#endif
      }

      // Push the gradient indices of a vectorized operation on to the
      // stack.  We assume here that check_space() as been called
      // before so there is enough space to hold these elements. The
      // multipliers will be added later.
      template <Index Num, Index Stride>
      void push_rhs_indices(const uIndex& gradient_index) {
	for (Index i = 0; i < Num; ++i) {
	  index_[n_operations_+i*Stride] = gradient_index+i;
	}
	++n_operations_;
      }

      // Push a statement on to the stack: this is done after a
      // sequence of operation pushes; gradient_index is the index of
      // the gradient on the LHS of the expression, while the
      // "end_plus_one" element is simply the current length of the
      // operation list
      void push_lhs(const uIndex& gradient_index) {
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
	if (n_statements_ >= n_allocated_statements_) {
	  grow_statement_stack();
	}
#endif
	statement_[n_statements_].index = gradient_index;
	statement_[n_statements_++].end_plus_one = n_operations_;
      }

      // Push n left-hand-sides of differential expressions on to the
      // stack with no corresponding right-hand-side, appropriate if
      // an array of active variables contiguous in memory (or
      // separated by a fixed stride) has been assigned to inactive
      // numbers. Note that the second and third arguments must not be
      // references, since they may be compile-time constants for
      // FixedArray objects.
      void push_lhs_range(const uIndex& first, uIndex n, uIndex stride = 1) {
	uIndex last_plus_1 = first+n*stride;
#ifndef ADEPT_MANUAL_MEMORY_ALLOCATION
	if (n_statements_+n > n_allocated_statements_) {
	  grow_statement_stack(n);
	}
#endif
	for (uIndex i = first; i < last_plus_1; i += stride) {
	  statement_[n_statements_].index = i;
	  statement_[n_statements_++].end_plus_one = n_operations_;
	}
      }

      // Check whether the operation stack contains enough space for n
      // new operations; if not, grow it
      void check_space(uIndex n) {
	if (n_allocated_operations_ < n_operations_+n+1) {
	  grow_operation_stack(n);
	}
      }
      template<uIndex n>
      void check_space_static() {
	check_space(n);
      }

    protected:
      // Called by new_recording()
      void clear_stack() { 
	// Set the recording indices to zero
	n_operations_ = 0;
	n_statements_ = 0;
      }

      // This function is called by the constructor to initialize
      // memory, which can be grown subsequently
      void initialize(uIndex n) {
	multiplier_ = new Real[n];
	index_ = new uIndex[n];
	n_allocated_operations_ = n;
	statement_ = new Statement[n];
	n_allocated_statements_ = n;
      }

      // Grow the capacity of the operation or statement stacks to
      // hold a minimum of "min" elements. If min=0 then the stacks
      // are doubled in size.
      void grow_operation_stack(uIndex min = 0);
      void grow_statement_stack(uIndex min = 0);

    protected:
      // Data are stored as dynamically allocated arrays

      // The "statement stack" is held as a single array
      Statement* __restrict statement_ ;
      // The "operation stack" is held as two arrays
      Real*      __restrict multiplier_;
      uIndex*    __restrict index_;

      uIndex n_statements_;           // Number of statements
      uIndex n_allocated_statements_; // Space allocated for statements
      uIndex n_operations_;           // Number of operations
      uIndex n_allocated_operations_; // Space allocated for statements
    };

  } // End namespace internal
} // End namespace adept

#endif


================================================
FILE: include/adept/StackStorageOrigStl.h
================================================
/* StackStorageOrigStl.h -- Original storage of stacks using STL containers

    Copyright (C) 2014-2015 University of Reading

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

   The Stack class inherits from a class providing the storage (and
   interface to the storage) for the derivative statements that are
   accumulated during the execution of an algorithm.  The derivative
   statements are held in two stacks described by Hogan (2014): the
   "statement stack" and the "operation stack".

   This file provides one of the original storage engines, which used
   std::vector to hold the two stacks. Note that these stacks are
   contiguous in memory, which is not ideal for very large algorithms.

*/

#ifndef AdeptStackStorageOrigStl_H
#define AdeptStackStorageOrigStl_H 1

#include <adept/base.h>
#include <adept/exception.h>
#include <adept/Statement.h>

namespace adept {
  namespace internal {

    class StackStorageOrigStl {
    public:
      // Constructor
      StackStorageOrigStl() :
	n_statements_(0), n_allocated_statements_(0),
	n_operations_(0), n_allocated_operations_(0) { }
      
      // Destructor (does nothing)
      ~StackStorageOrigStl() { };

      // Push an operation (i.e. a multiplier-gradient pair) on to the
      // stack.  We assume here that check_space() has been called before
      // so there is enough space to hold these elements.
      void push_rhs(const Real& multiplier, const uIndex& gradient_index) {
#ifdef ADEPT_REMOVE_NULL_STATEMENTS
	// If multiplier==0 then the resulting statement would have no
	// effect so we can speed up the subsequent adjoint/jacobian
	// calculations (at the expense of making this critical part
	// of the code slower)
	if (multiplier != 0.0) {
#endif
	  multiplier_.push_back(multiplier);
	  index_.push_back(gradient_index);
	  n_operations_++;
	
#ifdef ADEPT_TRACK_NON_FINITE_GRADIENTS
	  if (!std::isfinite(multiplier) || std::isinf(multiplier)) {
	    throw non_finite_gradient();
	  }
#endif
	
#ifdef ADEPT_REMOVE_NULL_STATEMENTS
	}
#endif
      }


      // Push a statement on to the stack: this is done after a
      // sequence of operation pushes; gradient_index is the index of
      // the gradient on the LHS of the expression, while the
      // "end_plus_one" element is simply the current length of the
      // operation list
      void push_lhs(const uIndex& gradient_index) {
	statement_.push_back(Statement(gradient_index, n_operations_));
	n_statements_++;
      }

      // Push n left-hand-sides of differential expressions on to the
      // stack with no corresponding right-hand-side, appropriate if
      // an array of active variables contiguous in memory (or
      // separated by a fixed stride) has been assigned to inactive
      // numbers.
      void push_lhs_range(const uIndex& first, const uIndex& n, 
			  const uIndex& stride = 1) {
	uIndex last_plus_1 = first+n*stride;
	for (uIndex i = first; i < last_plus_1; i += stride) {
	  statement_.push_back(Statement(i, n_operations_));
	}
	n_statements_ += n;
      }

      // Check whether the operation stack contains enough space for n
      // new operations; for STL containers this does nothing
      void check_space(const uIndex& n) { }
      template<uIndex n> void check_space_static() { }

    protected:
      // Called by new_recording()
      void clear_stack() { 
	// If we use STL containers then the clear() function sets their
	// size to zero but leaves the memory allocated
	statement_.clear();
	multiplier_.clear();
	index_.clear();
	// Set the recording indices to zero
	n_operations_ = 0;
	n_statements_ = 0;
      }

      // This function is called by the constructor to initialize
      // memory, which can be grown subsequently
      void initialize(uIndex n) {
	statement_.reserve(n);
	multiplier_.reserve(n);
	index_.reserve(n);
      }

      // Grow the capacity of the operation or statement stacks to
      // hold a minimum of "min" elements. If min=0 then the stacks
      // are doubled in size.
      void grow_operation_stack(uIndex min = 0);
      void grow_statement_stack(uIndex min = 0);

    protected:
      // Data are stored using standard template library containers

      // The "statement stack" is held as a single array
      std::vector<Statement> statement_;
      // The "operation stack" is held as two arrays
      std::vector<Real> multiplier_;
      std::vector<uIndex> index_;

      uIndex n_statements_;           // Number of statements
      uIndex n_allocated_statements_; // Space allocated for statements
      uIndex n_operations_;           // Number of operations
      uIndex n_allocated_operations_; // Space allocated for statements
    };

  } // End namespace internal
} // End namespace adept

#endif


================================================
FILE: include/adept/Statement.h
================================================
/* Statement.h -- Original method to store statement & operation stacks

    Copyright (C) 2012-2014 University of Reading

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#ifndef AdeptStatement_H
#define AdeptStatement_H 1

#include <adept/base.h>

namespace adept {
  namespace internal {

    // Structure describing the LHS of a derivative expression.  For dx
    // = z dy + y dz, "index" would be the location of dx in the
    // gradient list, and "end_plus_one" would be one plus the location
    // of the final operation (multiplier-derivative pair) on the RHS,
    // in this case y dz.
    struct Statement {
      Statement() { }
      Statement(uIndex index_, uIndex end_plus_one_)
	: index(index_), end_plus_one(end_plus_one_) { }
      uIndex index;
      uIndex end_plus_one;
    };
 
  }
}

#endif


================================================
FILE: include/adept/Storage.h
================================================
/* Storage.h -- store array of active or inactive data

    Copyright (C) 2012-2014 University of Reading
    Copyright (C) 2015-2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.


   The Storage class manages the data underlying array objects, and
   uses a model of reference counting so that multiple objects can
   refer to the same data.  This enables arrays that are actually
   subsets of another array to be treated as normal array objects.

*/

#ifndef AdeptStorage_H
#define AdeptStorage_H 1

#include <string>
#include <sstream>
#include <limits>
#include <complex>

#include <adept/exception.h>
#include <adept/base.h>
#include <adept/Stack.h>
#include <adept/Packet.h>
#include <adept/traits.h>

#ifdef ADEPT_STORAGE_THREAD_SAFE
#include <atomic>
#endif


namespace adept {

  // -------------------------------------------------------------------
  // Global variables
  // -------------------------------------------------------------------
  namespace internal {
    // To check for memory leaks, we keep a running total of the number
    // of Storage objects that are created and destroyed
    extern Index n_storage_objects_created_;
    extern Index n_storage_objects_deleted_;
  }

  // -------------------------------------------------------------------
  // Definition of Storage class
  // -------------------------------------------------------------------
  // Reference-counted owner of an aligned block of n elements of
  // Type.  Objects are created with one link; further users call
  // add_link() and every user calls remove_link() when finished.  The
  // object deletes itself when the last link is removed, which is why
  // the destructor is not public.  For active data the constructor
  // also registers n gradients with the active stack and the
  // destructor unregisters them.
  template <typename Type>
  class Storage {
  public:
    // -------------------------------------------------------------------
    // Storage: 1. Constructors and destructor
    // -------------------------------------------------------------------

    // The only way to construct this object is by passing it an
    // integer indicating the size, and optionally for active objects,
    // a flag requesting that gradients be registered with the stack
    // (the index of the first one is then held in gradient_index_;
    // otherwise gradient_index_ stays at -1).
    Storage(Index n, bool IsActive = false)
      : n_(n), n_links_(1), gradient_index_(-1) {
      data_ = internal::alloc_aligned<Type>(n);
#ifdef ADEPT_INIT_REAL
      // Debugging aid: fill the new data with a known value
      initialize<Type>();
#endif
      internal::n_storage_objects_created_++; 
#ifndef ADEPT_NO_AUTOMATIC_DIFFERENTIATION
      if (IsActive) {
        gradient_index_ = ADEPT_ACTIVE_STACK->register_gradients(n);
      }
#endif
    }
    
  protected:
    // Only allow the class to destroy itself (via remove_link) by
    // making the destructor "protected".
    ~Storage() {
      internal::free_aligned(data_);
#ifndef ADEPT_NO_AUTOMATIC_DIFFERENTIATION
#ifdef ADEPT_RECORDING_PAUSABLE
      // With pausable recording, gradients are only unregistered
      // while the stack is recording
      if (ADEPT_ACTIVE_STACK->is_recording()) {
#endif
        // A negative gradient index marks inactive data.  FIX - would
        // be better to start valid gradient_index at 1, so 0 is
        // reserved for invalid values.
        if (gradient_index_ >= 0) {
          ADEPT_ACTIVE_STACK->unregister_gradients(gradient_index_, n_);
        }
#ifdef ADEPT_RECORDING_PAUSABLE
      }
#endif
#endif
      internal::n_storage_objects_deleted_++; 
    }

    // Null initialization, copy and assignment methods that are
    // "protected" to prevent them being used; they do nothing and
    // leave all members untouched
    Storage() { }
    Storage(Storage&) { }
    void operator=(Storage&) { }

#ifdef ADEPT_INIT_REAL

    // Initialize to zero, NaN or whatever for debugging.  The loop
    // counters have type Index to match n_, so no narrowing can occur
    // if Index is wider than int.
    template <typename T>
    typename internal::enable_if<internal::is_floating_point<T>::value, void>::type
    initialize() {
      for (Index i = 0; i < n_; ++i) {
        data_[i] = ADEPT_INIT_REAL;
      }
    }
    // Complex version: set both the real and imaginary parts
    template <typename T>
    typename internal::enable_if<internal::is_complex<T>::value, void>::type
    initialize() {
      for (Index i = 0; i < n_; ++i) {
#ifdef ADEPT_INIT_REAL_SNAN
        data_[i] = std::complex<typename Type::value_type>(
          std::numeric_limits<typename Type::value_type>::signaling_NaN(),
          std::numeric_limits<typename Type::value_type>::signaling_NaN());
#else
        data_[i] = std::complex<typename Type::value_type>(ADEPT_INIT_REAL, ADEPT_INIT_REAL);
#endif
      }
    }

    // Dummy initialize for non-floats
    template <typename T>
    typename internal::enable_if<!internal::is_floating_point<T>::value
                                 && !internal::is_complex<T>::value, void>::type
    initialize() { }

#endif


    // -------------------------------------------------------------------
    // Storage: 2. Public member functions
    // -------------------------------------------------------------------  
  public:
    // Add link to an existing storage object
    void add_link()
    { n_links_++; } 
    
    // Remove link as follows; this is only safe in a multi-threaded
    // environment if ADEPT_STORAGE_THREAD_SAFE is defined, making
    // n_links_ atomic.  Throws invalid_operation if there are no
    // links left to remove; deletes this object when the final link
    // is removed, so the caller must not touch it afterwards.
    void remove_link() {
      if (n_links_ == 0) {
        throw invalid_operation("Attempt to remove more links to a storage object than set"
                                ADEPT_EXCEPTION_LOCATION);
      }
      else if (--n_links_ == 0) {
        delete this;
      }
    }

    // Return the number of elements allocated
    Index n_allocated() const
    { return n_; }

    // Return the number of links to an object
    int n_links() const
    { return n_links_; }

    // Return the gradient index of the first element (-1 if no
    // gradients were registered at construction)
    Index gradient_index() const
    { return gradient_index_; }

    // Return pointer to the start of the data
    Type*
    data()
    { return data_; }
    const Type*
    data() const
    { return data_; }

    // Return a string of information: element count, element size in
    // bytes and current number of links
    std::string
    info_string() const {
      std::stringstream x;
      x << n_ << " " << sizeof(Type) << "-byte elements allocated with "
        << n_links_ << " links";
      return x.str();
    }

    // -------------------------------------------------------------------
    // Storage: 3. Data
    // -------------------------------------------------------------------  
  private:
    // Pointer to the start of the data
    Type* data_;
    // Number of elements allocated
    Index n_;
    // Number of links to the storage object allowing for arrays and
    // array slices to point to the same data. If this falls to zero
    // the Storage object will destruct itself
#ifdef ADEPT_STORAGE_THREAD_SAFE
    // If multiple threads are to simultaneously read subsets of this
    // array then accesses to the reference counter must be made
    // atomic
    std::atomic<int> n_links_;
#else
    int n_links_;
#endif
    // For active variables, this is the gradient index of the first
    // element.  It would be better to only store this if Type is
    // floating point.
    Index gradient_index_;

  }; // End of Storage class
  

  // -------------------------------------------------------------------
  // Helper functions
  // -------------------------------------------------------------------
  inline Index n_storage_objects()
  { return internal::n_storage_objects_created_
      - internal::n_storage_objects_deleted_; }

  inline Index n_storage_objects_created()
  { return internal::n_storage_objects_created_; }
  
  inline Index n_storage_objects_deleted()
  { return internal::n_storage_objects_deleted_; }
  
} // End namespace adept

#endif


================================================
FILE: include/adept/UnaryOperation.h
================================================
/* UnaryOperation.h -- Unary operations on Adept expressions

    Copyright (C) 2014-2020 European Centre for Medium-Range Weather Forecasts

    Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/

#ifndef AdeptUnaryOperation_H
#define AdeptUnaryOperation_H

#include <adept/Expression.h>

#include <adept/ArrayWrapper.h>

namespace adept {

  namespace internal {

    // ---------------------------------------------------------------------
    // SECTION 3.1: Unary operations: define UnaryOperation type
    // ---------------------------------------------------------------------

    // Unary operations derive from this class, where Op is a policy
    // class defining how to implement the operation, and R is the
    // type of the argument of the operation
    // Expression node representing Op applied element-wise to the
    // expression "arg" of type R.  The node derives from
    // Expression<Type,...> (the curiously recurring template pattern)
    // and, protectedly, from the policy class Op<Type>, from which it
    // takes operation(), operation_string() and derivative().  Most
    // member functions simply forward to the corresponding function
    // of arg, applying operation() to values on the way back.
    template <typename Type, template<class> class Op, class R>
    struct UnaryOperation
      : public Expression<Type, UnaryOperation<Type, Op, R> >,
	protected Op<Type> {
      
      // Static properties, mostly inherited from the argument type
      static const int  rank       = R::rank;
      // A bool-valued operation is never treated as active
      static const bool is_active  = R::is_active && !is_same<Type,bool>::value;
      static const int  n_active   = R::n_active;
      // This node uses one scratch element (to hold its result) in
      // addition to those of its argument.
      // FIX! Only store if active and if needed
      static const int  n_scratch  = 1 + R::n_scratch;
      static const int  n_arrays   = R::n_arrays;
      // Vectorizable only if both the operation and the argument are.
      // Will need to modify this for sqrt:
      static const bool is_vectorizable
	= Op<Type>::is_vectorized && R::is_vectorizable;

      // Bring the policy's functions into scope
      using Op<Type>::operation;
      using Op<Type>::operation_string;
      using Op<Type>::derivative;
      
      // The argument of the operation; whether it is held by value or
      // by reference is decided by nested_expression<R> (defined
      // elsewhere)
      //const R& arg;
      typename nested_expression<R>::type arg;

      // Construct from the argument expression
      UnaryOperation(const Expression<Type, R>& arg_)
	: arg(arg_.cast()) { }
      
      // The dimensions of the result are those of the argument
      template <int Rank>
      bool get_dimensions_(ExpressionSize<Rank>& dim) const {
	return arg.get_dimensions(dim);
      }

      // Return a string of the form "op(arg-string)"
      std::string expression_string_() const {
	std::string str;
	str = operation_string();
	str += "(" + arg.expression_string() + ")";
	return str;
      }

      // Aliasing, contiguity and alignment queries are passed straight
      // to the argument
      bool is_aliased_(const Type* mem1, const Type* mem2) const {
	return arg.is_aliased(mem1, mem2);
      }
      bool all_arrays_contiguous_() const {
	return arg.all_arrays_contiguous_();
      }
       bool is_aligned_() const {
	return arg.is_aligned_();
      }
      template <int n>
      int alignment_offset_() const { return arg.template alignment_offset_<n>(); }

      // Apply the operation to the argument's value_with_len(i, len).
      // NOTE(review): the template parameter Rank is not used and
      // cannot be deduced from the function arguments - confirm how
      // callers invoke this.
      template <int Rank>
      Type value_with_len_(Index i, Index len) const {
	return operation(arg.value_with_len(i, len));
      }
      
      // Advance the argument's location
      template <int MyArrayNum, int NArrays>
      void advance_location_(ExpressionSize<NArrays>& loc) const {
	arg.template advance_location_<MyArrayNum>(loc);
      }

      // Apply the operation to the argument's value at location loc
      template <int MyArrayNum, int NArrays>
      Type value_at_location_(const ExpressionSize<NArrays>& loc) const {
	return operation(arg.template value_at_location_<MyArrayNum>(loc));
      }

      // As value_at_location_ but operating on a packet of values
      template <int MyArrayNum, int NArrays>
      Packet<Type> packet_at_location_(const ExpressionSize<NArrays>& loc) const {
	return operation(arg.template packet_at_location_<MyArrayNum>(loc));
      }

      // Compute the value at loc, saving it in scratch[MyScratchNum];
      // the argument is given the scratch elements from
      // MyScratchNum+1 onwards
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_at_location_store_(const ExpressionSize<NArrays>& loc,
				    ScratchVector<NScratch>& scratch) const {
	return scratch[MyScratchNum] 
	  = operation(arg.template value_at_location_store_<MyArrayNum,MyScratchNum+1>(loc, scratch));
      }

      // Return the value previously saved in scratch[MyScratchNum]
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_stored_(const ExpressionSize<NArrays>& loc,
			 const ScratchVector<NScratch>& scratch) const {
	return scratch[MyScratchNum];
      }

      // Apply the operation to a packet (of type PacketType) obtained
      // from the argument
      template <bool IsAligned,	int MyArrayNum, typename PacketType,
	int NArrays>
      PacketType values_at_location_(const ExpressionSize<NArrays>& loc) const {
	return operation(arg.template values_at_location_<IsAligned,MyArrayNum,PacketType>(loc));
      }

      // Packet version of value_at_location_store_, selected when
      // UseStored is false: compute and save the result
      template <bool UseStored, bool IsAligned,	int MyArrayNum, int MyScratchNum,
		typename PacketType, int NArrays, int NScratch>
      typename enable_if<!UseStored,PacketType>::type
      values_at_location_store_(const ExpressionSize<NArrays>& loc,
				ScratchVector<NScratch,PacketType>& scratch) const {
	return scratch[MyScratchNum]
	  = operation(arg.template values_at_location_store_<UseStored,IsAligned,
		      MyArrayNum,MyScratchNum+1>(loc, scratch));
      }
      // Overload selected when UseStored is true: return the packet
      // already saved in scratch[MyScratchNum]
      template <bool UseStored, bool IsAligned,	int MyArrayNum, int MyScratchNum,
		typename PacketType, int NArrays, int NScratch>
      typename enable_if<UseStored,PacketType>::type
      values_at_location_store_(const ExpressionSize<NArrays>& loc,
				ScratchVector<NScratch,PacketType>& scratch) const {
	return scratch[MyScratchNum];
      }

      // Pass the derivative of this operation down to the argument's
      // calc_gradient_ as its multiplier.  derivative() is given the
      // argument's stored value and this node's stored result.
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      void calc_gradient_(Stack& stack, 
			  const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch) const {
	arg.template calc_gradient_<MyArrayNum, MyScratchNum+1>(stack, loc, scratch,
		derivative(arg.template value_stored_<MyArrayNum,MyScratchNum+1>(loc, scratch),
			   scratch[MyScratchNum]));
      }

      // As above, but the derivative is first scaled by the multiplier
      // received from the enclosing expression (chain rule)
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
		typename MyType>
      void calc_gradient_(Stack& stack, 
			  const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch,
			  MyType multiplier) const {
	arg.template calc_gradient_<MyArrayNum, MyScratchNum+1>(stack, loc, scratch,
		multiplier*derivative(arg.template value_stored_<MyArrayNum,MyScratchNum+1>(loc, scratch), 
				      scratch[MyScratchNum]));
      }

      // Packet version of calc_gradient_ without an incoming multiplier.
      // NOTE(review): the fifth template argument passed to
      // values_at_location_store_ below is the int MyActiveNum, whereas
      // this class's own values_at_location_store_ takes a typename in
      // that position, and scratch is const here but non-const there -
      // confirm against the signatures provided by the argument types.
      template <bool IsAligned, int MyArrayNum, int MyScratchNum, int MyActiveNum,
		int NArrays, int NScratch, int NActive>
      void calc_gradient_packet_(Stack& stack, 
				 const ExpressionSize<NArrays>& loc,
				 const ScratchVector<NScratch,Packet<Real> >& scratch,
				 ScratchVector<NActive,Packet<Real> >& gradients) const {
	arg.template calc_gradient_packet_<IsAligned,MyArrayNum,MyScratchNum+1,
					   MyActiveNum>(stack, loc, scratch, gradients,
		derivative(arg.template values_at_location_store_<true,IsAligned,MyArrayNum,MyScratchNum+1,
			   MyActiveNum>(loc, scratch), scratch[MyScratchNum]));
      }

      // Packet version of calc_gradient_ with an incoming multiplier
      // (the same review note as for the overload above applies)
      template <bool IsAligned, int MyArrayNum, int MyScratchNum, int MyActiveNum,
		int NArrays, int NScratch, int NActive, typename MyType>
      void calc_gradient_packet_(Stack& stack, 
				 const ExpressionSize<NArrays>& loc,
				 const ScratchVector<NScratch,Packet<Real> >& scratch,
				 ScratchVector<NActive,Packet<Real> >& gradients,
				 const MyType& multiplier) const {
	arg.template calc_gradient_packet_<IsAligned,MyArrayNum,MyScratchNum+1,
					   MyActiveNum>(stack, loc, scratch, gradients,
		multiplier*derivative(arg.template values_at_location_store_<true,IsAligned,MyArrayNum,MyScratchNum+1,
				      MyActiveNum>(loc, scratch), scratch[MyScratchNum]));
      }


      // Set the argument's location from the index i
      template <int MyArrayNum, int Rank, int NArrays>
      void set_location_(const ExpressionSize<Rank>& i, 
			 ExpressionSize<NArrays>& index) const {
	arg.template set_location_<MyArrayNum>(i, index);
      }

    }; // End UnaryOperation type
  
  } // End namespace internal

  // ---------------------------------------------------------------------
  // SECTION 3.2: Unary operations: define specific operations
  // ---------------------------------------------------------------------

  // We may place the overloaded mathematical functions in the global
  // namespace provided that a using declaration enables the std::
  // version of the function to be located
// ADEPT_DEF_UNARY_FUNC generates (a) a policy class internal::NAME
// describing a unary mathematical function and (b) an overload of
// FUNC taking an Adept expression and returning the corresponding
// UnaryOperation.  RAWFUNC is the qualified name of the underlying
// scalar function, STRING is the name used when printing expressions,
// DERIVATIVE is an expression for the derivative written in terms of
// "val" (the argument) and/or "result" (the function value), and
// ISVEC states whether the operation is treated as vectorized.
#define ADEPT_DEF_UNARY_FUNC(NAME, FUNC, RAWFUNC, STRING, DERIVATIVE,	\
			     ISVEC)					\
  namespace internal {							\
    template <typename Type>						\
    struct NAME {							\
      static const bool is_vectorized = ISVEC;				\
      static const bool is_operator   = false;				\
      /* Squaring helper that DERIVATIVE expressions may call */	\
      Type fast_sqr(Type x) const { return x*x; }			\
      /* Name of the function as it appears in expression strings */	\
      const char* operation_string() const { return STRING; }		\
      /* Evaluate the function; the using-declaration makes the raw	\
	 scalar function visible to the unqualified call */		\
      template <typename T>						\
      T operation(const T& x) const {					\
	using RAWFUNC;							\
	return FUNC(x);							\
      }									\
      /* Derivative of the function given its argument "val" and its	\
	 value "result"; the using-declarations expose the standard	\
	 functions that DERIVATIVE expressions rely on */		\
      Type derivative(const Type& val, const Type& result) const {	\
	using std::exp;							\
	using std::sqrt;						\
	using std::sin;							\
	using std::cos;							\
	using std::sinh;						\
	using std::cosh;						\
	return DERIVATIVE;						\
      }									\
    };									\
  } /* End namespace internal */					\
  template <class Type, class R>					\
  inline								\
  adept::internal::UnaryOperation<Type, adept::internal::NAME, R>	\
  FUNC(const adept::Expression<Type, R>& expr) {			\
    typedef adept::internal::UnaryOperation<Type,			\
      adept::internal::NAME, R> operation_type;				\
    return operation_type(expr.cast());					\
  }

  // Functions y(x) whose derivative depends on the argument of the
  // function, i.e. dy(x)/dx = f(x).  The constant used for log10 is
  // 1/ln(10).
  ADEPT_DEF_UNARY_FUNC(Log,   log,   std::log,   "log",   1.0/val, false)
  ADEPT_DEF_UNARY_FUNC(Log10, log10, std::log10, "log10", 0.43429448190325182765/val, false)
  ADEPT_DEF_UNARY_FUNC(Sin,   sin,   std::sin,   "sin",   cos(val), false)
  ADEPT_DEF_UNARY_FUNC(Cos,   cos,   std::cos,   "cos",   -sin(val), false)
  ADEPT_DEF_UNARY_FUNC(Tan,   tan,   std::tan,   "tan",   1.0/fast_sqr(cos(val)), false)
  ADEPT_DEF_UNARY_FUNC(Asin,  asin,  std::asin,  "asin",  1.0/sqrt(1.0-val*val), false)
  ADEPT_DEF_UNARY_FUNC(Acos,  acos,  std::acos,  "acos",  -1.0/sqrt(1.0-val*val), false)
  ADEPT_DEF_UNARY_FUNC(Atan,  atan,  std::atan,  "atan",  1.0/(1.0+val*val), false)
  ADEPT_DEF_UNARY_FUNC(Sinh,  sinh,  std::sinh,  "sinh",  cosh(val), false)
  ADEPT_DEF_UNARY_FUNC(Cosh,  cosh,  std::cosh,  "cosh",  sinh(val), false)
  // The derivative of abs/fabs is the sign of the argument (zero at
  // the origin)
  ADEPT_DEF_UNARY_FUNC(Abs,   abs,   std::abs,   "abs",   ((val>0.0)-(val<0.0)), false)
  ADEPT_DEF_UNARY_FUNC(Fabs,  fabs,  std::fabs,  "fabs",  ((val>0.0)-(val<0.0)), false)

  // Functions y(x) whose derivative depends on the result of the
  // function, i.e. dy(x)/dx = f(y)
  ADEPT_DEF_UNARY_FUNC(Sqrt,  sqrt,  std::sqrt,  "sqrt",  0.5/result, true)
  ADEPT_DEF_UNARY_FUNC(Tanh,  tanh,  std::tanh,  "tanh",  1.0 - result*result, false)

  // Adept's vectorizable exponential function
  ADEPT_DEF_UNARY_FUNC(Fastexp, fastexp, adept::fastexp, "fastexp", result, true)
#ifdef ADEPT_FAST_EXPONENTIAL
  ADEPT_DEF_UNARY_FUNC(Exp,   exp,   adept::functions::exp, "fastexp", result, true)
#else
  ADEPT_DEF_UNARY_FUNC(Exp,   exp,   std::exp,   "exp",   result, false)
#endif

  // Functions with zero derivative
  ADEPT_DEF_UNARY_FUNC(Ceil,  ceil,  std::ceil,  "ceil",  0.0, false)
  ADEPT_DEF_UNARY_FUNC(Floor, floor, std::floor, "floor", 0.0, false)
  
  // Functions defined in the std namespace in C++11 but only in the
  // global namespace before that.  Constants: 1/ln(2) for log2, ln(2)
  // for exp2 and 2/sqrt(pi) for erf and erfc, all written with enough
  // digits to be exact in double precision.
#ifdef ADEPT_CXX11_FEATURES
  ADEPT_DEF_UNARY_FUNC(Log2,  log2,  std::log2,  "log2",  1.44269504088896340737/val, false)
  ADEPT_DEF_UNARY_FUNC(Expm1, expm1, std::expm1, "expm1", exp(val), false)
  ADEPT_DEF_UNARY_FUNC(Exp2,  exp2,  std::exp2,  "exp2",  0.6931471805599453094172321214581766*result, false)
  ADEPT_DEF_UNARY_FUNC(Log1p, log1p, std::log1p, "log1p", 1.0/(1.0+val), false)
  ADEPT_DEF_UNARY_FUNC(Asinh, asinh, std::asinh, "asinh", 1.0/sqrt(val*val+1.0), false)
  ADEPT_DEF_UNARY_FUNC(Acosh, acosh, std::acosh, "acosh", 1.0/sqrt(val*val-1.0), false)
  ADEPT_DEF_UNARY_FUNC(Atanh, atanh, std::atanh, "atanh", 1.0/(1.0-val*val), false)
  ADEPT_DEF_UNARY_FUNC(Erf,   erf,   std::erf,   "erf",   1.12837916709551257390*exp(-val*val), false)
  ADEPT_DEF_UNARY_FUNC(Erfc,  erfc,  std::erfc,  "erfc",  -1.12837916709551257390*exp(-val*val), false)
  ADEPT_DEF_UNARY_FUNC(Cbrt,  cbrt,  std::cbrt,  "cbrt",  (1.0/3.0)/(result*result), false)
  ADEPT_DEF_UNARY_FUNC(Round, round, std::round, "round", 0.0, false)
  ADEPT_DEF_UNARY_FUNC(Trunc, trunc, std::trunc, "trunc", 0.0, false)
  ADEPT_DEF_UNARY_FUNC(Rint,  rint,  std::rint,  "rint",  0.0, false)
  ADEPT_DEF_UNARY_FUNC(Nearbyint,nearbyint,std::nearbyint,"nearbyint",0.0, false)
#else
  ADEPT_DEF_UNARY_FUNC(Log2,  log2,  ::log2,  "log2",  1.44269504088896340737/val, false)
  ADEPT_DEF_UNARY_FUNC(Expm1, expm1, ::expm1, "expm1", exp(val), false)
  ADEPT_DEF_UNARY_FUNC(Exp2,  exp2,  ::exp2,  "exp2",  0.6931471805599453094172321214581766*result, false)
  ADEPT_DEF_UNARY_FUNC(Log1p, log1p, ::log1p, "log1p", 1.0/(1.0+val), false)
  ADEPT_DEF_UNARY_FUNC(Asinh, asinh, ::asinh, "asinh", 1.0/sqrt(val*val+1.0), false)
  ADEPT_DEF_UNARY_FUNC(Acosh, acosh, ::acosh, "acosh", 1.0/sqrt(val*val-1.0), false)
  ADEPT_DEF_UNARY_FUNC(Atanh, atanh, ::atanh, "atanh", 1.0/(1.0-val*val), false)
  ADEPT_DEF_UNARY_FUNC(Erf,   erf,   ::erf,   "erf",   1.12837916709551257390*exp(-val*val), false)
  ADEPT_DEF_UNARY_FUNC(Erfc,  erfc,  ::erfc,  "erfc",  -1.12837916709551257390*exp(-val*val), false)
  ADEPT_DEF_UNARY_FUNC(Cbrt,  cbrt,  ::cbrt,  "cbrt",  (1.0/3.0)/(result*result), false)
  ADEPT_DEF_UNARY_FUNC(Round, round, ::round, "round", 0.0, false)
  ADEPT_DEF_UNARY_FUNC(Trunc, trunc, ::trunc, "trunc", 0.0, false)
  ADEPT_DEF_UNARY_FUNC(Rint,  rint,  ::rint,  "rint",  0.0, false)
  ADEPT_DEF_UNARY_FUNC(Nearbyint,nearbyint,::nearbyint,"nearbyint",0.0, false)
#endif

  //#undef ADEPT_DEF_UNARY_FUNC

// ADEPT_DEF_UNARY_OP generates a policy class internal::NAME for a
// unary operator together with the overloaded operator FUNC acting on
// Adept expressions.  RAWFUNC is the operator token applied to the
// underlying value, STRING is used when printing expressions,
// DERIVATIVE is the derivative in terms of "val" and/or "result", and
// ISVEC states whether the operation is treated as vectorized.
// NOTE(review): is_operator is false here even though these are
// operators - confirm whether anything elsewhere relies on it.
#define ADEPT_DEF_UNARY_OP(NAME, FUNC, RAWFUNC, STRING, DERIVATIVE,	\
			   ISVEC)					\
  namespace internal {							\
    template <typename Type>						\
    struct NAME  {							\
      static const bool is_operator = false;				\
      static const bool is_vectorized = ISVEC;				\
      const char* operation_string() const { return STRING; }		\
      template <typename T>						\
      T operation(const T& val) const {					\
	return RAWFUNC(val);						\
      }									\
      Type derivative(const Type& val, const Type& result) const {	\
	return DERIVATIVE;						\
      }									\
      /* const so that it may be called from derivative(), matching	\
	 the same helper in ADEPT_DEF_UNARY_FUNC */			\
      Type fast_sqr(Type val) const { return val*val; }			\
    };									\
  } /* End namespace internal */					\
  template <class Type, class R>					\
  inline								\
  adept::internal::UnaryOperation<Type, adept::internal::NAME, R>	\
  FUNC(const adept::Expression<Type, R>& r)	{			\
    return adept::internal::UnaryOperation<Type,			\
				   adept::internal::NAME, R>(r.cast()); \
  }
  
  // Operators: unary plus and minus have derivatives of +1 and -1,
  // while logical "not" has a zero derivative
  ADEPT_DEF_UNARY_OP(UnaryPlus,  operator+, +, "+", 1.0, true)
  ADEPT_DEF_UNARY_OP(UnaryMinus, operator-, -, "-", -1.0, true)
  ADEPT_DEF_UNARY_OP(Not,        operator!, !, "!", 0.0, false)


  // ---------------------------------------------------------------------
  // SECTION 3.4: Unary operations: transpose function [DELETED]
  // ---------------------------------------------------------------------

  // ---------------------------------------------------------------------
  // SECTION 3.5: Unary operations: returning boolean expression
  // ---------------------------------------------------------------------
  namespace internal {

    // Unary operations returning bool derive from this class, where
    // Op is a policy class defining how to implement the operation,
    // and R is the type of the argument of the operation.  The result
    // is never active, so this node contributes nothing to the
    // gradient and reserves no scratch space (n_scratch = 0).
    template <typename Type, template<class> class Op, class R>
    struct UnaryBoolOperation
      : public Expression<bool, UnaryBoolOperation<Type, Op, R> >,
        protected Op<Type> {
      
      static const int  rank       = R::rank;
      static const bool is_active  = false;
      static const int  n_active   = 0;
      static const int  n_scratch  = 0;
      static const int  n_arrays   = R::n_arrays;
      
      // Bring the policy's functions into scope
      using Op<Type>::operation;
      using Op<Type>::operation_string;
      
      // Reference to the argument of the operation
      const R& arg;

      // Construct from the argument expression
      UnaryBoolOperation(const Expression<Type, R>& arg_)
        : arg(arg_.cast()) { }
      
      // The dimensions of the result are those of the argument
      template <int Rank>
      bool get_dimensions_(ExpressionSize<Rank>& dim) const {
        return arg.get_dimensions(dim);
      }

      // Return a string of the form "func(arg-string)"
      std::string expression_string_() const {
        std::string str;
        str = operation_string();
        str += "(" + arg.expression_string() + ")";
        return str;
      }

      // This node always reports that it is not aliased with the
      // given memory range
      bool is_aliased_(const bool* mem1, const bool* mem2) const {
        return false;
      }
      bool all_arrays_contiguous_() const {
        return arg.all_arrays_contiguous_(); 
      }
      template <int n>
      int alignment_offset_() const { return arg.template alignment_offset_<n>(); }

      // Apply the operation to the argument's value_with_len(i, len)
      template <int Rank>
      Type value_with_len_(Index i, Index len) const {
        return operation(arg.value_with_len(i, len));
      }
      
      // Advance the argument's location
      template <int MyArrayNum, int NArrays>
      void advance_location_(ExpressionSize<NArrays>& loc) const {
        arg.template advance_location_<MyArrayNum>(loc);
      }

      // Apply the operation to the argument's value at location loc
      template <int MyArrayNum, int NArrays>
      Type value_at_location_(const ExpressionSize<NArrays>& loc) const {
        return operation(arg.template value_at_location_<MyArrayNum>(loc));
      }

      // Because n_scratch is zero this node owns no element of the
      // scratch vector, so nothing is stored: the value is simply
      // computed from the argument, in the same way as in
      // values_at_location_store_ below.
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_at_location_store_(const ExpressionSize<NArrays>& loc,
                                    ScratchVector<NScratch>& /*scratch*/) const {
        return operation(arg.template value_at_location_<MyArrayNum>(loc));
      }

      // Nothing was stored (see above), so recompute the value at loc
      // rather than reading a scratch element that belongs to another
      // node
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_stored_(const ExpressionSize<NArrays>& loc,
                         const ScratchVector<NScratch>& /*scratch*/) const {
        return operation(arg.template value_at_location_<MyArrayNum>(loc));
      }

      // Apply the operation to a packet obtained from the argument
      template <bool IsAligned, int MyArrayNum, typename PacketType,
        int NArrays>
      PacketType values_at_location_(const ExpressionSize<NArrays>& loc) const {
        return operation(arg.template values_at_location_<IsAligned,MyArrayNum,PacketType>(loc));
      }

      // Packet version of value_at_location_store_: nothing is stored
      template <bool UseStored, bool IsAligned, int MyArrayNum, int MyScratchNum,
                typename PacketType, int NArrays, int NScratch>
      PacketType values_at_location_store_(const ExpressionSize<NArrays>& loc,
                   ScratchVector<NScratch,PacketType>& scratch) const {
        return operation(arg.template values_at_location_<IsAligned,MyArrayNum,PacketType>(loc));
      }

      // The gradient functions do nothing since the result is inactive

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      void calc_gradient_(Stack& stack, 
                          const ExpressionSize<NArrays>& loc,
                          const ScratchVector<NScratch>& scratch) const { }

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                typename MyType>
      void calc_gradient_(Stack& stack, 
                          const ExpressionSize<NArrays>& loc,
                          const ScratchVector<NScratch>& scratch,
                          MyType multiplier) const { }

      template <bool IsAligned, int MyArrayNum, int MyScratchNum, int MyActiveNum,
                int NArrays, int NScratch, int NActive>
      void calc_gradient_packet_(Stack& stack, 
                                 const ExpressionSize<NArrays>& loc,
                                 const ScratchVector<NScratch,Packet<Real> >& scratch,
                                 ScratchVector<NActive,Packet<Real> >& gradients) const {}

      template <bool IsAligned, int MyArrayNum, int MyScratchNum, int MyActiveNum,
                int NArrays, int NScratch, int NActive, typename MyType>
      void calc_gradient_packet_(Stack& stack, 
                                 const ExpressionSize<NArrays>& loc,
                                 const ScratchVector<NScratch,Packet<Real> >& scratch,
                                 ScratchVector<NActive,Packet<Real> >& gradients,
                                 const MyType& multiplier) const {}

      // Set the argument's location from the index i
      template <int MyArrayNum, int Rank, int NArrays>
      void set_location_(const ExpressionSize<Rank>& i, 
                         ExpressionSize<NArrays>& index) const {
        arg.template set_location_<MyArrayNum>(i, index);
      }

    };
  
  } // End namespace internal

// ADEPT_DEF_UNARY_BOOL_FUNC generates a policy class internal::NAME
// for a unary function returning bool, together with an overload of
// FUNC that takes an Adept expression and returns the corresponding
// UnaryBoolOperation.  RAWFUNC is the qualified name of the scalar
// function, which is made visible to the unqualified call to FUNC.
#define ADEPT_DEF_UNARY_BOOL_FUNC(NAME, FUNC, RAWFUNC)			\
  namespace internal {							\
    template <typename Type>						\
    struct NAME {							\
      /* Evaluate the test on a single value */				\
      bool operation(const Type& x) const {				\
	using RAWFUNC;							\
	return FUNC(x);							\
      }									\
      /* The function name, as printed in expression strings */	\
      const char* operation_string() const { return #FUNC; }		\
    };									\
  } /* End namespace internal */					\
  template <class Type, class R>					\
  inline								\
  adept::internal::UnaryBoolOperation<Type, adept::internal::NAME, R>	\
  FUNC(const adept::Expression<Type, R>& expr) {			\
    typedef adept::internal::UnaryBoolOperation<Type,			\
      adept::internal::NAME, R> operation_type;				\
    return operation_type(expr.cast());					\
  }

  // Element-wise tests for NaN, infinite and finite values
  ADEPT_DEF_UNARY_BOOL_FUNC(IsNan,    isnan,    std::isnan)
  ADEPT_DEF_UNARY_BOOL_FUNC(IsInf,    isinf,    std::isinf)
  ADEPT_DEF_UNARY_BOOL_FUNC(IsFinite, isfinite, std::isfinite)

  //#undef ADEPT_DEF_UNARY_BOOL_FUNC

} /* End namespace adept */


#endif


================================================
FILE: include/adept/array_shortcuts.h
================================================
/* array_shortcuts.h -- Definitions of "shortcut" typedefs for array types

    Copyright (C) 2015-2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/

#ifndef AdeptArrayShortcuts_H
#define AdeptArrayShortcuts_H

#include <adept/Array.h>
#include <adept/SpecialMatrix.h>
#include <adept/FixedArray.h>

namespace adept {

  // ---------------------------------------------------------------------
  // Pretty typedefs to avoid the need for template arguments
  // ---------------------------------------------------------------------

  // Arrays of rank 1 to 7 using Array's default element type and
  // activeness (per the note further down in this file, these are the
  // inactive counterparts of aVector and friends)
  typedef Array<1> Vector;
  typedef Array<2> Matrix;
  typedef Array<3> Array3; // Deprecated
  typedef Array<3> Array3D;
  typedef Array<4> Array4D;
  typedef Array<5> Array5D;
  typedef Array<6> Array6D;
  typedef Array<7> Array7D;

  // Arrays of adept::Index (capital "I" prefix); only ranks 1 to 3
  // are provided
  typedef Array<1,Index> IntVector;
  typedef Array<2,Index> IntMatrix;
  typedef Array<3,Index> IntArray3; // Deprecated
  typedef Array<3,Index> IntArray3D;

  // Arrays of plain "int" (lower-case "i" prefix), ranks 1 to 7
  typedef Array<1,int> intVector;
  typedef Array<2,int> intMatrix;
  typedef Array<3,int> intArray3; // Deprecated
  typedef Array<3,int> intArray3D;
  typedef Array<4,int> intArray4D;
  typedef Array<5,int> intArray5D;
  typedef Array<6,int> intArray6D;
  typedef Array<7,int> intArray7D;

  // Arrays of "bool", ranks 1 to 7
  typedef Array<1,bool> boolVector;
  typedef Array<2,bool> boolMatrix;
  typedef Array<3,bool> boolArray3; // Deprecated
  typedef Array<3,bool> boolArray3D;
  typedef Array<4,bool> boolArray4D;
  typedef Array<5,bool> boolArray5D;
  typedef Array<6,bool> boolArray6D;
  typedef Array<7,bool> boolArray7D;

  // Arrays of "float" regardless of the precision chosen for Real,
  // ranks 1 to 7
  typedef Array<1,float> floatVector;
  typedef Array<2,float> floatMatrix;
  typedef Array<3,float> floatArray3; // Deprecated
  typedef Array<3,float> floatArray3D;
  typedef Array<4,float> floatArray4D;
  typedef Array<5,float> floatArray5D;
  typedef Array<6,float> floatArray6D;
  typedef Array<7,float> floatArray7D;

  // Inactive (third template argument "false") special matrices of
  // Real.  The engine type selects the storage scheme; the three band
  // matrices differ only in the two integer arguments of BandEngine
  // (0,0 for diagonal, 1,1 for tridiagonal, 2,2 for pentadiagonal).
  typedef SpecialMatrix<Real,internal::SquareEngine<ROW_MAJOR>,
    false> SquareMatrix;
  typedef SpecialMatrix<Real,internal::BandEngine<ROW_MAJOR,0,0>,
    false> DiagMatrix;
  typedef SpecialMatrix<Real,internal::BandEngine<ROW_MAJOR,1,1>,
    false> TridiagMatrix;
  typedef SpecialMatrix<Real,internal::BandEngine<ROW_MAJOR,2,2>,
    false> PentadiagMatrix;
  typedef SpecialMatrix<Real,internal::SymmEngine<ROW_LOWER_COL_UPPER>,
    false> SymmMatrix;
  typedef SpecialMatrix<Real,internal::LowerEngine<ROW_MAJOR>,
    false> LowerMatrix;
  typedef SpecialMatrix<Real,internal::UpperEngine<ROW_MAJOR>,
    false> UpperMatrix;

  // Inactive fixed-size vectors (2 to 4 elements) and square matrices
  // (2x2 to 4x4) of Real; the trailing template arguments are the
  // dimension lengths
  typedef FixedArray<Real,false,2> Vector2;
  typedef FixedArray<Real,false,3> Vector3;
  typedef FixedArray<Real,false,4> Vector4;
  typedef FixedArray<Real,false,2,2> Matrix22;
  typedef FixedArray<Real,false,3,3> Matrix33;
  typedef FixedArray<Real,false,4,4> Matrix44;

  // If automatic differentiation is turned off then aVector and
  // friends become identical to their inactive counterparts
#ifdef ADEPT_NO_AUTOMATIC_DIFFERENTIATION
#define ADEPT_IS_ACTIVE false
#else
#define ADEPT_IS_ACTIVE true
#endif

  // Active ("a" prefix) arrays of Real, ranks 1 to 7; ADEPT_IS_ACTIVE
  // is a helper macro local to this header and is undefined again
  // below
  typedef Array<1,Real,ADEPT_IS_ACTIVE> aVector;
  typedef Array<2,Real,ADEPT_IS_ACTIVE> aMatrix;
  typedef Array<3,Real,ADEPT_IS_ACTIVE> aArray3; // Deprecated
  typedef Array<3,Real,ADEPT_IS_ACTIVE> aArray3D;
  typedef Array<4,Real,ADEPT_IS_ACTIVE> aArray4D;
  typedef Array<5,Real,ADEPT_IS_ACTIVE> aArray5D;
  typedef Array<6,Real,ADEPT_IS_ACTIVE> aArray6D;
  typedef Array<7,Real,ADEPT_IS_ACTIVE> aArray7D;

  // Active versions of the special matrices, using the same engines
  // as their inactive counterparts
  typedef SpecialMatrix<Real,internal::SquareEngine<ROW_MAJOR>,
    ADEPT_IS_ACTIVE> aSquareMatrix;
  typedef SpecialMatrix<Real,internal::BandEngine<ROW_MAJOR,0,0>,
    ADEPT_IS_ACTIVE> aDiagMatrix;
  typedef SpecialMatrix<Real,internal::BandEngine<ROW_MAJOR,1,1>,
    ADEPT_IS_ACTIVE> aTridiagMatrix;
  typedef SpecialMatrix<Real,internal::BandEngine<ROW_MAJOR,2,2>,
    ADEPT_IS_ACTIVE> aPentadiagMatrix;
  typedef SpecialMatrix<Real,internal::SymmEngine<ROW_LOWER_COL_UPPER>,
    ADEPT_IS_ACTIVE> aSymmMatrix;
  typedef SpecialMatrix<Real,internal::LowerEngine<ROW_MAJOR>,
    ADEPT_IS_ACTIVE> aLowerMatrix;
  typedef SpecialMatrix<Real,internal::UpperEngine<ROW_MAJOR>,
    ADEPT_IS_ACTIVE> aUpperMatrix;

  // Active versions of the fixed-size vectors and matrices
  typedef FixedArray<Real,ADEPT_IS_ACTIVE,2>   aVector2;
  typedef FixedArray<Real,ADEPT_IS_ACTIVE,3>   aVector3;
  typedef FixedArray<Real,ADEPT_IS_ACTIVE,4>   aVector4;
  typedef FixedArray<Real,ADEPT_IS_ACTIVE,2,2> aMatrix22;
  typedef FixedArray<Real,ADEPT_IS_ACTIVE,3,3> aMatrix33;
  typedef FixedArray<Real,ADEPT_IS_ACTIVE,4,4> aMatrix44;


  // Do not leak the helper macro to code including this header
#undef ADEPT_IS_ACTIVE

} // End namespace adept

#endif


================================================
FILE: include/adept/base.h
================================================
/* base.h -- Basic definitions 

    Copyright (C) 2012-2014 University of Reading
    Copyright (C) 2015-2021 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/


#ifndef AdeptBase_H
#define AdeptBase_H 1

#include <cstddef>


// ---------------------------------------------------------------------
// 0: Adept version number
// ---------------------------------------------------------------------

// The version of the Adept library is specified both as a string and
// an integer
// NOTE(review): the integer form looks like major*10000 + minor*100
// (+ patch), giving 20100 for "2.1" -- confirm before relying on it
// in version comparisons.  Keep the two definitions in step.
#define ADEPT_VERSION      20100
#define ADEPT_VERSION_STR "2.1"


// ---------------------------------------------------------------------
// 1: Defines not requiring a library recompile
// ---------------------------------------------------------------------

// The following can either be changed here, or define them just
// before including this header file in your code, or define using the
// -Dxxx compiler option.  These options do not need the library to be
// recompiled.

// A globally accessible stack needs to be present for arithmetic
// statements to access; by default this is thread safe but if you
// know you are running a single-threaded application then slightly
// faster performance may be achieved by defining this. Note that in
// section 4 of this header file, ADEPT_STACK_THREAD_UNSAFE is
// explicitly defined on the Mac OS platform, since the executable
// format used typically does not support thread-local storage.
//#define ADEPT_STACK_THREAD_UNSAFE 1

// Define this to check whether the "multiplier" is zero before it is
// placed on the operation stack. This makes the forward pass slower
// and the reverse pass slightly faster, and is only worthwhile if
// many reverse passes will be carried out per forward pass (or if you
// have good reason to believe many variables in your code are zero).
// #define ADEPT_REMOVE_NULL_STATEMENTS 1

// If using the same code for both forward-only and
// forward-and-reverse calculations, then it is useful to be able to
// dynamically control whether or not gradient information is computed
// by expressions in the forward pass using the pause_recording() and
// continue_recording() functions. To enable this feature uncomment
// the following, but note that it slows down the forward pass a
// little.  
//#define ADEPT_RECORDING_PAUSABLE 1

// Initialize real types to signaling NaN or zero
//#define ADEPT_INIT_REAL_SNAN 1
//#define ADEPT_INIT_REAL_ZERO 1

// Often when you first convert a code for automatic differentiation
// the gradients computed contain NaNs or infinities: uncommenting the
// following will check for these and throw an error when they are
// found, so that by running the program in a debugger and looking at
// the backtrace, you can locate the source.
//#define ADEPT_TRACK_NON_FINITE_GRADIENTS 1

// If this is defined then each mathematical operation does not
// involve a check whether more memory needs to be allocated; rather
// the user first specifies how much memory to allocate to hold the
// entire algorithm via the preallocate_statements and
// preallocate_operations functions. This is a little faster, but is
// obviously risky if you don't anticipate correctly how much memory
// will be needed.
//#define ADEPT_MANUAL_MEMORY_ALLOCATION 1

// Do we check array bounds when indexing arrays?
//#define ADEPT_BOUNDS_CHECKING 1

// Do we disable dimension checking when assigning an array expression
// to another array?
//#define ADEPT_NO_DIMENSION_CHECKING 1

// Do we disable automatic alias checking in array operations?
//#define ADEPT_NO_ALIAS_CHECKING 1

// Does adept::exp when applied to Adept types such as arrays invoke a
// faster vectorizable exponential function?  This is not bit
// reproducible with "exp" in the standard library, but the faster
// function is always available as adept::fastexp (and this also works
// on scalars).  Note that when applied to an Adept type, a simple
// "exp" selects the function from the adept namespace.
//#define ADEPT_FAST_EXPONENTIAL 1

// The following will define the adept::exp function for the scalar
// types "float" and "double" to call the faster exponential function,
// bit reproducible with the vectorizable one above.  However, this
// can cause a namespace clash as some C header files import "exp"
// outside of any namespace.  Alternatively you can use adept::fastexp
// on scalars.
//#define ADEPT_FAST_SCALAR_EXPONENTIAL 1

// A shortcut for faster execution that does not change the behaviour
// of single-threaded bug-free code that uses the "eval" function in
// case of aliasing.  ADEPT_FAST_EXPONENTIAL changes results so is not
// activated with ADEPT_FAST.
#ifdef ADEPT_FAST
// ADEPT_FAST is shorthand for the following three options, each of
// which is described individually earlier in this section
#define ADEPT_STACK_THREAD_UNSAFE 1
#define ADEPT_NO_DIMENSION_CHECKING 1
#define ADEPT_NO_ALIAS_CHECKING 1
#endif

// The compiler option -ffast-math turns on __FAST_MATH__ and allows
// for optimizations that may not be bit-reproducible or do all the
// normal error checking - Adept's fast exponential falls into this
// category.
#ifdef __FAST_MATH__
#define ADEPT_FAST_EXPONENTIAL 1
#endif

// The initial size of the stacks, which can be grown if required
// (overridable: only defined here if not already set by the user)
#ifndef ADEPT_INITIAL_STACK_LENGTH
#define ADEPT_INITIAL_STACK_LENGTH 1048576
#endif

// The statement and operation stacks
// NOTE(review): presumably the length of the blocks in which those
// stacks are allocated or grown -- confirm in the stack storage
// classes.  Also overridable by the user.
#ifndef ADEPT_STACK_BLOCK_LENGTH
#define ADEPT_STACK_BLOCK_LENGTH 1048576
#endif

//#define ADEPT_SUPPORT_HUGE_ARRAYS 1

// Since subsetting an array causes a modification to the reference
// counter in the underlying storage object, multiple threads
// subsetting the same array can cause clashes unless the reference
// counter is protected by a mutex. This is possible on C++11 by
// making the reference counter of type std::atomic<int>, enabled by
// defining the following:
//#define ADEPT_STORAGE_THREAD_SAFE


// ---------------------------------------------------------------------
// 2: Defines requiring a library recompile
// ---------------------------------------------------------------------

// The "stack" containing derivative information can be implemented in
// two ways: if ADEPT_STACK_STORAGE_STL is defined then C++ STL
// containers are used, otherwise dynamically allocated arrays are
// used.  Experience says that dynamically allocated arrays are faster.
//#define ADEPT_STACK_STORAGE_STL 1

// The number of rows/columns of a Jacobian that are calculated at
// once. The optimum value depends on platform, the size of your
// Jacobian and the number of OpenMP threads available.
#ifndef ADEPT_MULTIPASS_SIZE
// Exactly one of the following should be active; the default is 4
//#define ADEPT_MULTIPASS_SIZE 1
//#define ADEPT_MULTIPASS_SIZE 2
#define ADEPT_MULTIPASS_SIZE 4
//#define ADEPT_MULTIPASS_SIZE 8
//#define ADEPT_MULTIPASS_SIZE 15
//#define ADEPT_MULTIPASS_SIZE 16
//#define ADEPT_MULTIPASS_SIZE 32
//#define ADEPT_MULTIPASS_SIZE 64
#endif

// If ADEPT_MULTIPASS_SIZE > ADEPT_MULTIPASS_SIZE_ZERO_CHECK then the
// Jacobian calculation will try to remove redundant loops involving
// zeros; note that this may inhibit auto-vectorization
// (with the defaults, 4 versus 64, the zero check is therefore not
// enabled)
#define ADEPT_MULTIPASS_SIZE_ZERO_CHECK 64
// NOTE(review): presumably the equivalent threshold for Packet-based
// code; unlike the other macros it lacks the ADEPT_ prefix -- confirm
// where it is used
#define PACKET_SIZE_ZERO_CHECK 64

// By default the precision of differentiated expressions is "double".
// To override this, define ADEPT_REAL_TYPE_SIZE to 4 (float), 8
// (double) or 16 (long double). Note that if you specify 16 but on
// your system "long double" is actually the same as double, then the
// code will fail to compile.
//#define ADEPT_REAL_TYPE_SIZE 8

// Thread-local storage is used for the global Stack pointer to ensure
// thread safety.  In pre-C++11 compilers, thread-local variables are
// declared in different ways by different compilers, the most common
// ones being detected in section 4 below.  Some platforms
// (particularly some Mac platforms) do not implement thread-local
// storage, and therefore on Mac thread-local storage is disabled. If
// you want to manually specify how thread-local storage is declared,
// you may do it here.  If thread-local storage is not available on
// your platform but is not detected in section 4, and consequently
// you cannot get the code to compile, then you can make an empty
// declaration here.
//#define ADEPT_THREAD_LOCAL thread_local

// Define the following if you wish to use OpenMP to accelerate array
// expressions
//#define ADEPT_OPENMP_ARRAY_OPERATIONS 1

// This cannot be changed without rewriting the Adept library
// (the shortcut typedefs in array_shortcuts.h likewise stop at rank
// 7, e.g. Array7D)
#define ADEPT_MAX_ARRAY_DIMENSIONS 7

// ---------------------------------------------------------------------
// 4: Miscellaneous
// ---------------------------------------------------------------------

// Various C++11 features
#if __cplusplus > 199711L
// __cplusplus reports a standard newer than C++98/03, so both of the
// following feature macros are enabled together.
// We can optimize the returning of Arrays from functions with move
// semantics:
#define ADEPT_MOVE_SEMANTICS 1
// Other C++11 features such as initializer lists, thread_local
// keyword, extra mathematical functions etc:
#define ADEPT_CXX11_FEATURES 1
#elif defined(_MSVC_LANG)
// Microsoft will only update __cplusplus when all C++11 features are
// included
// ...so _MSVC_LANG is consulted instead to detect the language level
#if _MSVC_LANG > 199711L
#define ADEPT_MOVE_SEMANTICS 1
#define ADEPT_CXX11_FEATURES 1
#endif
#endif

// Check C++11 is being used if thread-safe array storage is required
// (this must come after the detection of ADEPT_CXX11_FEATURES above)
#ifdef ADEPT_STORAGE_THREAD_SAFE
#ifndef ADEPT_CXX11_FEATURES
#error "Thread-safe array storage is only available with C++11"
#endif
#endif

// The following attempt to align the data to facilitate SSE2
// vectorization did not work so is disabled
// (ADEPT_SSE2_ALIGNED therefore expands to nothing in both branches)
#ifdef __GNUC__
//#define ADEPT_SSE2_ALIGNED __attribute__ ((aligned (16)))
#define ADEPT_SSE2_ALIGNED
#else
#define ADEPT_SSE2_ALIGNED
#endif

// The way thread-local variables are specified pre-C++11 is compiler
// specific.  You can specify this manually by defining the
// ADEPT_THREAD_LOCAL preprocessor variable in the previous section,
// otherwise it is defined here depending on your compiler
#ifndef ADEPT_THREAD_LOCAL
  #ifdef __APPLE__
    // NOTE(review): Clang normally defines __GNUC__ as well, in which
    // case the __has_feature branch below is never reached on Apple
    // Clang -- confirm whether that is intended
    #ifdef __GNUC__
      // GNU C++11 compiler on Mac should support thread_local
      #ifdef ADEPT_CXX11_FEATURES
        #define ADEPT_THREAD_LOCAL thread_local
      #endif
    #elif defined(__has_feature)
      // Clang supports "__has_feature": check if thread_local is
      // available
     #if __has_feature(cxx_thread_local)
        #define ADEPT_THREAD_LOCAL thread_local
      #endif
    #endif
    // When thread_local is unavailable we turn it off and provide a
    // blank definition of ADEPT_THREAD_LOCAL.
    #ifndef ADEPT_THREAD_LOCAL
      #define ADEPT_STACK_THREAD_UNSAFE 1
      #define ADEPT_THREAD_LOCAL
    #endif
  #elif defined(ADEPT_CXX11_FEATURES)
    // C++11 has thread_local as part of the language, and should be
    // supported on non-Mac C++11 platforms
    #define ADEPT_THREAD_LOCAL thread_local
  #elif defined(_MSC_VER)
    // Microsoft C++98 has a different way to specify thread-local
    // storage from the GCC/Intel/Sun/IBM compilers.
    #define ADEPT_THREAD_LOCAL __declspec(thread)
  #else
    // The following should work on GCC/Intel/Sun/IBM C++98 compilers
    #define ADEPT_THREAD_LOCAL __thread
  #endif
#endif

// If we use OpenMP to parallelize array expressions then some
// variables local to active operation structures (Multiply etc) need
// to be made thread-local
// (otherwise ADEPT_THREAD_LOCAL_IF_OPENMP expands to nothing)
#ifdef ADEPT_OPENMP_ARRAY_OPERATIONS
#define ADEPT_THREAD_LOCAL_IF_OPENMP ADEPT_THREAD_LOCAL
#else
#define ADEPT_THREAD_LOCAL_IF_OPENMP
#endif

// Currently the design of the stack means that automatic
// differentiation of matrix multiplication is very inefficient. A
// future version of Adept will redesign the stack to store directives
// enabling efficient implementation of the derivative of a matrix
// multiplication, and this will be applicable to different types of
// matrix (dense, symmetric, banded, upper and lower). But for now,
// only differentiation of dense active matrices
// (i.e. Array<2,Real,true>) is implemented.  Therefore other types
// of active matrix need to be converted to this type before they can
// be used in matrix multiplication.
#define ADEPT_ONLY_DIFFERENTIATE_DENSE_MATRIX_MULTIPLICATION 1

// To find bugs it can be useful to initialize arrays to signaling
// NaNs, in which case ADEPT_INIT_REAL is set and used internally
// Notes:
//  - ADEPT_INIT_REAL is left undefined when neither option is set
//  - the signaling-NaN form refers to a type named "T" and to
//    std::numeric_limits, so it can only be expanded where such a
//    type is in scope and <limits> has been included (this header
//    does not include it)
//  - if both options are defined, the signaling NaN takes precedence
#ifdef ADEPT_INIT_REAL_SNAN
#define ADEPT_INIT_REAL std::numeric_limits<T>::signaling_NaN()
#elif defined(ADEPT_INIT_REAL_ZERO)
#define ADEPT_INIT_REAL 0.0
#endif

// ---------------------------------------------------------------------
// 5: Define basic floating-point and integer types
// ---------------------------------------------------------------------
namespace adept {

  // An older version of Adept used ADEPT_FLOATING_POINT_TYPE to
  // define alternative underlying types for "Real", but unfortunately
  // the preprocessor cannot check if a preprocessor variable is of
  // type "long double", so a numerical value is used instead
#ifdef ADEPT_FLOATING_POINT_TYPE
#undef ADEPT_FLOATING_POINT_TYPE
#error ADEPT_FLOATING_POINT_TYPE is deprecated: use ADEPT_REAL_TYPE_SIZE instead
#endif

  // Default to double precision unless the user has chosen otherwise
#ifndef ADEPT_REAL_TYPE_SIZE
#define ADEPT_REAL_TYPE_SIZE 8
#endif

  // adept::Real is the floating-point type selected by
  // ADEPT_REAL_TYPE_SIZE; any value other than 4, 8 or 16 is a
  // compile-time error
#if ADEPT_REAL_TYPE_SIZE == 4
  typedef float Real;
#elif ADEPT_REAL_TYPE_SIZE == 8
  typedef double Real;
#elif ADEPT_REAL_TYPE_SIZE == 16
  typedef long double Real;
#else
#undef ADEPT_REAL_TYPE_SIZE
#error If defined, ADEPT_REAL_TYPE_SIZE must be 4 (float), 8 (double) or 16 (long double)
#endif

  // By default sizes of arrays, indices to them, and indices in the
  // automatic differentiation stack are stored as 4-byte integers,
  // but for very large arrays and algorithms, larger types may be
  // needed.  Remember that on 32-bit platforms this will have no
  // effect.
#ifdef ADEPT_SUPPORT_HUGE_ARRAYS
  typedef std::size_t  uIndex; // Unsigned
  typedef std::ptrdiff_t Index;  // Signed
#else
  // Note that in this default configuration uIndex is a signed "int"
  // despite its name; the unsigned alternative is commented out
  //  typedef unsigned int uIndex;
  typedef int uIndex;
  typedef int Index;
#endif

  // ---------------------------------------------------------------------
  // 6: Disable stupid warnings
  // ---------------------------------------------------------------------

  // These pragmas are issued for any translation unit that includes
  // this header and are not restored afterwards
#ifdef __INTEL_COMPILER
// "type qualifiers are meaningless here"
#pragma warning disable 2536
#elif defined(_MSC_VER)
// "multiple copy constructors specified"
#pragma warning( disable : 4521 )
#endif

} // End namespace adept

#endif


================================================
FILE: include/adept/contiguous_matrix.h
================================================
/* contiguous_matrix.h -- Return matrix with contiguous storage

    Copyright (C) 2015 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/

#ifndef AdeptContiguousMatrix_H
#define AdeptContiguousMatrix_H 1

#include <adept/Array.h>

namespace adept {
  namespace internal {
    
    // Provide a matrix suitable for input into BLAS or LAPACK, which
    // require one dimension to be contiguous and increasing in memory
    // (i.e. the separation of consecutive elements along it, as
    // reported by offset(), is exactly one).
    //
    //   in     - the input matrix; must not be empty
    //   out    - on return, linked to "in" if "in" already has a
    //            unit-offset dimension (so that changes made via "out"
    //            are seen by "in"); otherwise a newly allocated
    //            row-major copy of "in"
    //   stride - on return, the separation in memory of consecutive
    //            rows (ROW_MAJOR) or columns (COL_MAJOR) of "out",
    //            i.e. the leading dimension to pass on with "out"
    //
    // The returned value is ROW_MAJOR or COL_MAJOR stating the storage
    // type of the returned matrix "out".  An invalid_operation
    // exception is thrown if "in" is empty.
    template <typename T, bool IsActive>
    MatrixStorageOrder contiguous_matrix(Array<2,T,IsActive>& in, 
					 Array<2,T,IsActive>& out,
					 Index& stride) {
      if (in.empty()) {
	throw(invalid_operation("Input matrix must not be empty"));
      }
      MatrixStorageOrder order = ROW_MAJOR;
      // Contiguity is a property of the memory offsets, not of the
      // dimension lengths: test the offset of each dimension
      if (in.offset(1) == 1) {
	// Elements of a row are adjacent in memory: usable as row major
	out.link(in);
	stride = in.offset(0);
      }
      else if (in.offset(0) == 1) {
	// Elements of a column are adjacent in memory: usable as
	// column major
	order = COL_MAJOR;
	out.link(in);
	stride = in.offset(1);
      }
      else {
	// Neither dimension is contiguous: make a row-major copy
	out.resize_row_major(in.dimensions());
	out = in;
	// The stride must describe the copy that is handed back, whose
	// layout in general differs from that of the strided input
	stride = out.offset(0);
      }
      return order;
    }

    // Variant of contiguous_matrix for callers that require a square
    // matrix: an invalid_operation exception is thrown if the two
    // dimensions of "in" differ, otherwise the work (and the meaning
    // of "out", "stride" and the return value) is that of
    // contiguous_matrix
    template <typename T, bool IsActive>
    MatrixStorageOrder contiguous_square_matrix(Array<2,T,IsActive>& in, 
						Array<2,T,IsActive>& out,
						Index& stride) {
      const bool is_square = (in.dimension(0) == in.dimension(1));
      if (!is_square) {
	throw(invalid_operation("Square matrix required"));
      }
      return contiguous_matrix(in, out, stride);
    }

  }
}


#endif


================================================
FILE: include/adept/cppblas.h
================================================
/* cppblas.h -- C++ interface to BLAS functions

    Copyright (C) 2015-2016 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

   This file provides a C++ interface to selected Level-2 and -3 BLAS
   functions in which the precision of the arguments (float versus
   double) is inferred via overloading

*/

#ifndef AdeptCppBlas_H
#define AdeptCppBlas_H 1

namespace adept {

  namespace internal {

    // Types of the option arguments taken by the cppblas_* functions
    // declared below: the storage order is a bool, the other options
    // are single characters.
    // NOTE(review): the characters are presumably the flags passed on
    // to the underlying Fortran BLAS routines -- confirm in
    // cppblas.cpp
    typedef bool BLAS_ORDER;
    typedef char BLAS_TRANSPOSE;
    typedef char BLAS_UPLO;
    typedef char BLAS_SIDE;

    // Permitted values of the option arguments.  Being "static
    // const", each translation unit including this header gets its
    // own copy.
    static const BLAS_ORDER     BlasRowMajor  = false;
    static const BLAS_ORDER     BlasColMajor  = true;
    static const BLAS_TRANSPOSE BlasNoTrans   = 'N';
    static const BLAS_TRANSPOSE BlasTrans     = 'T';
    static const BLAS_TRANSPOSE BlasConjTrans = 'C';
    static const BLAS_UPLO      BlasUpper     = 'U';
    static const BLAS_UPLO      BlasLower     = 'L';
    static const BLAS_SIDE      BlasLeft      = 'L';
    static const BLAS_SIDE      BlasRight     = 'R';

    // Matrix-matrix multiplication for general dense matrices
    // The helper macro declares (but does not define) one overload of
    // cppblas_gemm for element type T; it is expanded for double and
    // float and then undefined again.
    // NOTE(review): argument meanings presumably follow the CBLAS
    // xGEMM convention (C = alpha*op(A)*op(B) + beta*C) -- confirm
    // against the definitions in cppblas.cpp
#define ADEPT_DEFINE_GEMM(T)					\
    void cppblas_gemm(const BLAS_ORDER Order,			\
		      const BLAS_TRANSPOSE TransA,		\
		      const BLAS_TRANSPOSE TransB,		\
		      const int M, const int N,			\
		      const int K, const T alpha, const T *A,	\
		      const int lda, const T *B, const int ldb,	\
		      const T beta, T *C, const int ldc);
    ADEPT_DEFINE_GEMM(double)
    ADEPT_DEFINE_GEMM(float)
#undef ADEPT_DEFINE_GEMM
    
    // Matrix-vector multiplication for a general dense matrix
    // Declares the double and float overloads of cppblas_gemv; the
    // vectors X and Y each come with an increment (incX, incY).
    // NOTE(review): presumably Y = alpha*op(A)*X + beta*Y as in CBLAS
    // xGEMV -- confirm in cppblas.cpp
#define ADEPT_DEFINE_GEMV(T)					\
    void cppblas_gemv(const BLAS_ORDER order,			\
		      const BLAS_TRANSPOSE TransA,		\
		      const int M, const int N,			\
		      const T alpha, const T *A, const int lda,	\
		      const T *X, const int incX, const T beta,	\
		      T *Y, const int incY);
    ADEPT_DEFINE_GEMV(double)
    ADEPT_DEFINE_GEMV(float)
#undef ADEPT_DEFINE_GEMV
    
    // Matrix-matrix multiplication where matrix A is symmetric
    // Declares the double and float overloads of cppblas_symm; Side
    // and Uplo take the BlasLeft/BlasRight and BlasUpper/BlasLower
    // constants respectively.
    // NOTE(review): presumably Side selects whether A multiplies B
    // from the left or right and Uplo which triangle of A is read, as
    // in CBLAS xSYMM -- confirm in cppblas.cpp
#define ADEPT_DEFINE_SYMM(T)					\
    void cppblas_symm(const BLAS_ORDER Order,			\
		      const BLAS_SIDE Side,			\
		      const BLAS_UPLO Uplo,			\
		      const int M, const int N,			\
		      const T alpha, const T *A, const int lda,	\
		      const T *B, const int ldb, const T beta,	\
		      T *C, const int ldc);
    ADEPT_DEFINE_SYMM(double)
    ADEPT_DEFINE_SYMM(float)
#undef ADEPT_DEFINE_SYMM
    
    // Matrix-vector multiplication where the matrix is symmetric
    // Declares the double and float overloads of cppblas_symv; only
    // one size argument (N) is taken, and no transpose option.
    // NOTE(review): presumably Y = alpha*A*X + beta*Y as in CBLAS
    // xSYMV -- confirm in cppblas.cpp
#define ADEPT_DEFINE_SYMV(T)					\
    void cppblas_symv(const BLAS_ORDER order,			\
		      const BLAS_UPLO Uplo,			\
		      const int N, const T alpha, const T *A,	\
		      const int lda, const T *X, const int incX,\
		      const T beta, T *Y, const int incY);
    ADEPT_DEFINE_SYMV(double)
    ADEPT_DEFINE_SYMV(float)
#undef ADEPT_DEFINE_SYMV
    
    // Matrix-vector multiplication for a general band matrix
    // Declares the double and float overloads of cppblas_gbmv.
    // NOTE(review): KL and KU are presumably the numbers of sub- and
    // super-diagonals of the band matrix, as in CBLAS xGBMV --
    // confirm in cppblas.cpp
#define ADEPT_DEFINE_GBMV(T)					\
    void cppblas_gbmv(const BLAS_ORDER order,			\
		      const BLAS_TRANSPOSE TransA,		\
		      const int M, const int N,			\
		      const int KL, const int KU, const T alpha,\
		      const T *A, const int lda, const T *X,	\
		      const int incX, const T beta, T *Y,	\
		      const int incY);
    ADEPT_DEFINE_GBMV(double)
    ADEPT_DEFINE_GBMV(float)
#undef ADEPT_DEFINE_GBMV

  } // End namespace internal

} // End namespace adept


#endif


================================================
FILE: include/adept/eval.h
================================================
/* eval.h -- Convert expression to array to avoid aliasing issues

    Copyright (C) 2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/

#ifndef AdeptEval_H
#define AdeptEval_H

#include <adept/Array.h>

namespace adept {

  // Evaluate an array expression (rank greater than zero) into a
  // newly created Array that has the rank, element type and
  // activeness of the expression, and return that Array by value
  template <typename EType, class E>
  typename internal::enable_if<(E::rank > 0), Array<E::rank,EType,E::is_active> >::type
  eval(const Expression<EType,E>& e) {
    typedef Array<E::rank,EType,E::is_active> result_type;
    result_type result;
    // Assigning to an empty Array leaves the sizing to the assignment
    result = e.cast();
    return result;
  }

  // Equivalent for scalar expressions; not really needed
  /*
  template <typename EType, class E>
  typename internal::enable_if<E::rank==0 && !E::is_active, EType>::type
  eval(const Expression<EType,E>& e) {
    return static_cast<EType>(e);
  }

  template <typename EType, class E>
  typename internal::enable_if<E::rank==0 && E::is_active, Active<EType> >::type
  eval(const Expression<EType,E>& e) {
    return static_cast<Active<EType> >(e);
  }
  */

} // End namespace adept

#endif


================================================
FILE: include/adept/exception.h
================================================
/* exception.h -- Exceptions thrown by Adept library

    Copyright (C) 2012-2014 University of Reading
    Copyright (C) 2015 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.


   Adept functions can throw exceptions that are all derived either
   from the adept::autodiff_exception or adept::array_exception types,
   themselves inherited from the adept::exception type.  All implement
   the "what()" function to return an error message.

*/

#ifndef AdeptException_H
#define AdeptException_H 1

#include <exception>
#include <string>
#include <sstream>


namespace adept {

  // -------------------------------------------------------------------
  // adept::exception: root of the Adept exception hierarchy
  // -------------------------------------------------------------------

  // Holds the error message reported by what(); the constructors of
  // the derived classes are responsible for filling in message_
  class exception : public std::exception {
  public:
    virtual ~exception() throw() { }
    virtual const char* what() const throw() {
      return message_.c_str();
    }
  protected:
    std::string message_;
  };

  // Thrown when a requested feature is not available
  class feature_not_available : public adept::exception {
  public:
    feature_not_available(const std::string& message
                          = "Feature not available") {
      message_ = message;
    }
  };

  // -------------------------------------------------------------------
  // Exceptions related to automatic differentiation
  // -------------------------------------------------------------------

  // Common base of the automatic-differentiation exceptions; it adds
  // nothing to adept::exception and so carries an empty message if
  // constructed directly
  class autodiff_exception : public adept::exception { };

  // The specific automatic-differentiation exceptions follow; each
  // takes an optional message that replaces the default text

  class gradient_out_of_range : public autodiff_exception {
  public:
    gradient_out_of_range(const std::string& message
                          = "Gradient index out of range: probably aReal objects have been created after a set_gradient(s) call") {
      message_ = message;
    }
  };

  class gradients_not_initialized : public autodiff_exception {
  public:
    gradients_not_initialized(const std::string& message
                              = "Gradients not initialized: at least one call to set_gradient(s) is needed before a forward or reverse pass") {
      message_ = message;
    }
  };

  class stack_already_active : public autodiff_exception {
  public:
    stack_already_active(const std::string& message
                         = "Attempt to activate an adept::Stack when one is already active in this thread") {
      message_ = message;
    }
  };

  class dependents_or_independents_not_identified : public autodiff_exception {
  public:
    dependents_or_independents_not_identified(const std::string& message
                                              = "Dependent or independent variables not identified before a Jacobian computation") {
      message_ = message;
    }
  };

  class wrong_gradient : public autodiff_exception {
  public:
    wrong_gradient(const std::string& message
                   = "Wrong gradient: append_derivative_dependence called on a different aReal object from the most recent add_derivative_dependence call") {
      message_ = message;
    }
  };

  class non_finite_gradient : public autodiff_exception {
  public:
    non_finite_gradient(const std::string& message
                        = "A non-finite gradient has been computed") {
      message_ = message;
    }
  };


  // -------------------------------------------------------------------
  // Exceptions related to arrays
  // -------------------------------------------------------------------

  // Base of the array exceptions; unlike autodiff_exception it can
  // be thrown in its own right with a message
  class array_exception : public adept::exception {
  public:
    array_exception(const std::string& message
                    = "A misuse of arrays occurred") {
      message_ = message;
    }
  };

  // The specific array exceptions hand their message to the
  // array_exception constructor, which stores it

  class size_mismatch : public array_exception {
  public:
    size_mismatch(const std::string& message
                  = "Array sizes do not match in array expression")
      : array_exception(message) { }
  };

  class inner_dimension_mismatch : public array_exception {
  public:
    inner_dimension_mismatch(const std::string& message
                             = "Inner dimensions don't agree in matrix multiplication")
      : array_exception(message) { }
  };

  class empty_array : public array_exception {
  public:
    empty_array(const std::string& message
                = "Use of empty array where non-empty array required")
      : array_exception(message) { }
  };

  class invalid_dimension : public array_exception {
  public:
    invalid_dimension(const std::string& message
                      = "Attempt to create array with invalid dimension")
      : array_exception(message) { }
  };

  class index_out_of_bounds : public array_exception {
  public:
    index_out_of_bounds(const std::string& message
                        = "Array index is out of bounds")
      : array_exception(message) { }
  };

  class invalid_operation : public array_exception {
  public:
    invalid_operation(const std::string& message
                      = "Operation not permitted for this type of array")
      : array_exception(message) { }
  };

  class matrix_ill_conditioned : public array_exception {
  public:
    matrix_ill_conditioned(const std::string& message
                           = "Matrix ill conditioned")
      : array_exception(message) { }
  };

  class fortran_interoperability_error : public array_exception {
  public:
    fortran_interoperability_error(const std::string& message
                                   = "Fortran interoperability error")
      : array_exception(message) { }
  };


  // -------------------------------------------------------------------
  // Exceptions related to optimization
  // -------------------------------------------------------------------

  // Thrown for errors in optimization/minimization
  class optimization_exception : public adept::exception {
  public:
    optimization_exception(const std::string& message
                           = "Optimization/minimization error") {
      message_ = message;
    }
  };


  // -------------------------------------------------------------------
  // Reporting where an exception was thrown
  // -------------------------------------------------------------------

  // Placing ADEPT_EXCEPTION_LOCATION directly after a message appends
  // the file name and line number of the throw site, e.g.
  //   throw array_exception("Bad matrix" ADEPT_EXCEPTION_LOCATION)
#define ADEPT_EXCEPTION_LOCATION \
  +adept::internal::exception_location(__FILE__,__LINE__)

  namespace internal {
    // Format a source location as the text " (in <file>:<line>)",
    // ready to be appended to an error message
    inline
    std::string exception_location(const char* file, int line) {
      std::ostringstream location;
      location << " (in " << file << ":" << line << ")";
      return location.str();
    }
  }

} // End namespace adept

#endif


================================================
FILE: include/adept/interp.h
================================================
/* interp.h -- 1D interpolation

    Copyright (C) 2015- European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#ifndef AdeptInterp_H
#define AdeptInterp_H

#include <adept/Array.h>

namespace adept {

  namespace internal {
    // Shorthand for the unsigned integer type that carries the
    // interpolation/extrapolation option bits
    typedef unsigned int uint;
  }
  
  // Interpolation settings are passed as a single "options" argument
  // formed by a bitwise OR of one interpolation scheme and one
  // extrapolation policy.  The extrapolation policy occupies the
  // lowest four bits and the interpolation scheme the bits above.

  // Interpolation schemes
  static const internal::uint ADEPT_INTERPOLATE_LINEAR  = 0u;      // Default
  static const internal::uint ADEPT_INTERPOLATE_NEAREST = 1u << 4;

  // Extrapolation policies
  static const internal::uint ADEPT_EXTRAPOLATE_DEFAULT  = 0u; // Replaced by the scheme's default
  static const internal::uint ADEPT_EXTRAPOLATE_LINEAR   = 1u; // Default for linear interpolation
  static const internal::uint ADEPT_EXTRAPOLATE_CLAMP    = 2u; // Default for nearest-neighbour
  // Out-of-bounds inputs yield a constant, which is NaN unless the
  // caller specifies it
  static const internal::uint ADEPT_EXTRAPOLATE_CONSTANT = 3u;

  // A bitwise AND of the "options" argument with one of the following
  // will extract the component associated with interpolation and
  // extrapolation
  namespace internal {
    // Lowest four bits select the extrapolation policy...
    static const internal::uint ADEPT_EXTRAPOLATE_MASK = 0xFu;
    // ...and all remaining bits select the interpolation scheme
    static const internal::uint ADEPT_INTERPOLATE_MASK = ~ADEPT_EXTRAPOLATE_MASK;

    // Split "options" into its interpolation scheme and extrapolation
    // policy, returned via the two reference arguments.  Throws
    // array_exception if either component is not recognised or if
    // linear extrapolation is requested with nearest-neighbour
    // interpolation.  ADEPT_EXTRAPOLATE_DEFAULT is replaced by the
    // default policy of the selected interpolation scheme.
    inline void extract_interp_extrap(uint options, uint& interp_scheme, uint& extrap_policy) {
      interp_scheme = options & ADEPT_INTERPOLATE_MASK;
      extrap_policy = options & ADEPT_EXTRAPOLATE_MASK;

      const bool is_linear  = (interp_scheme == ADEPT_INTERPOLATE_LINEAR);
      const bool is_nearest = (interp_scheme == ADEPT_INTERPOLATE_NEAREST);

      if (!is_linear && !is_nearest) {
        throw array_exception("Interpolation scheme not understood");
      }
      if (extrap_policy > ADEPT_EXTRAPOLATE_CONSTANT) {
        throw array_exception("Extrapolation policy not understood");
      }
      if (is_nearest && extrap_policy == ADEPT_EXTRAPOLATE_LINEAR) {
        throw array_exception("Linear extrapolation not available with nearest-neighbour interpolation");
      }

      // Resolve the default policy according to the scheme
      if (extrap_policy == ADEPT_EXTRAPOLATE_DEFAULT) {
        if (is_linear) {
          extrap_policy = ADEPT_EXTRAPOLATE_LINEAR;
        }
        else {
          extrap_policy = ADEPT_EXTRAPOLATE_CLAMP;
        }
      }
    }

    // Each dimension of the array holding the data to be interpolated
    // can be described either by a vector of real numbers or by a
    // regular range (for example a LinSpace object), although the
    // regular range has not yet been defined.  This primary template
    // covers every unsupported type: it flags itself as invalid and
    // provides no interpolation function, so using it will not
    // compile.
    template <typename DimensionType>
    struct InterpHelper {
      static const bool is_valid = false;
    };

    // Specialization for a coordinate described by an inactive vector
    // of real numbers
    template <typename XType>
    struct InterpHelper<Array<1,XType,false> > {
      static const bool is_valid = is_floating_point<XType>::value;

      // For each point xi(i) find the pair of adjacent elements of x
      // used for interpolation: ind0(i) receives the index of the
      // first element of the pair and weight0(i) the weight to apply
      // to that first element.  The ordering of x (increasing or
      // decreasing) is deduced from its first two elements, so x must
      // contain at least two elements.  For points outside the range
      // of x, linear extrapolation or clamping is applied according
      // to extrap_policy; with any other policy is_valid(i) is set to
      // false and weight0(i) is not assigned.  Entries of is_valid
      // are never set to true here.  With nearest-neighbour
      // interpolation the weights are finally rounded.
      template <typename XiType>
      static void interp_get_indices_weights(const Array<1,XType,false>& x,
                                             const Array<1,XiType,false>& xi,
                                             internal::uint interp_scheme,
                                             internal::uint extrap_policy,
                                             Array<1,Index>& ind0, Array<1,Real,false>& weight0,
                                             Array<1,bool>& is_valid) {
        const bool is_ascending = (x(1) > x(0));
        const Index last_pair = x.size()-2;
        const Index n_points = xi.size();

        for (Index ipoint = 0; ipoint < n_points; ++ipoint) {
          const XiType target = xi(ipoint);

          // Classify the point relative to the range spanned by x;
          // "beyond_first" means it lies outside the range on the
          // side of the first element of x
          bool in_range, beyond_first;
          if (is_ascending) {
            in_range     = (target >= x(0) && target <= x(end));
            beyond_first = (target < x(0));
          }
          else {
            in_range     = (target <= x(0) && target >= x(end));
            beyond_first = (target > x(0));
          }

          if (in_range) {
            // Linear search for the pair of elements bracketing the
            // point
            Index ipair;
            if (is_ascending) {
              ipair = 0;
              while (ipair < last_pair && x(ipair+1) < target) {
                ++ipair;
              }
            }
            else {
              ipair = last_pair;
              while (ipair > 0 && x(ipair) < target) {
                --ipair;
              }
            }
            ind0(ipoint) = ipair;
            weight0(ipoint) = (x(ipair+1)-target)/(x(ipair+1)-x(ipair));
          }
          else if (beyond_first) {
            // Point is off the low-index end of x
            ind0(ipoint) = 0;
            if (extrap_policy == ADEPT_EXTRAPOLATE_LINEAR) {
              weight0(ipoint) = (x(1)-target)/(x(1)-x(0));
            }
            else if (extrap_policy == ADEPT_EXTRAPOLATE_CLAMP) {
              weight0(ipoint) = 1.0;
            }
            else {
              is_valid(ipoint) = false;
            }
          }
          else {
            // Point is off the high-index end of x
            ind0(ipoint) = last_pair;
            if (extrap_policy == ADEPT_EXTRAPOLATE_LINEAR) {
              weight0(ipoint) = (x(end)-target)/(x(end)-x(end-1));
            }
            else if (extrap_policy == ADEPT_EXTRAPOLATE_CLAMP) {
              weight0(ipoint) = 0.0;
            }
            else {
              is_valid(ipoint) = false;
            }
          }
        }

        // Not very efficient implementation of nearest-neighbour
        // interpolation: round the weights from linear interpolation
        if (interp_scheme == ADEPT_INTERPOLATE_NEAREST) {
          weight0 = round(weight0);
        }
      }
    };
  }
  
  // 1D interpolation: interp(x,y,xi) interpolates y (whose first
  // dimension corresponds to the points in vector x) to the values
  // in vector xi.  If y has more than one dimension then multiple
  // values are interpolated for every point in xi, and the returned
  // array has the same size as y except that its first dimension has
  // the same length as xi.
  //
  // "options" is a bitwise OR of an interpolation scheme
  // (ADEPT_INTERPOLATE_*) and an extrapolation policy
  // (ADEPT_EXTRAPOLATE_*).  With ADEPT_EXTRAPOLATE_CLAMP, values
  // outside the range of x are clamped at the first or last point.
  // With ADEPT_EXTRAPOLATE_CONSTANT, points strictly outside the
  // range of x receive "extrap_value", which is a signaling NaN by
  // default; points lying exactly on the first or last element of x
  // are in bounds and receive the corresponding value of y.
  // Otherwise linear extrapolation is performed (the default for
  // linear interpolation).
  //
  // The ordering of x (increasing or decreasing) is deduced from its
  // first two elements.  If x contains a single point then the
  // corresponding value of y is copied to every output point
  // regardless of xi.  Note that x and xi must be inactive
  // variables, but y can be active in which case the returned array
  // will be too.
  //
  // Throws size_mismatch if x is empty or its length differs from
  // the first dimension of y, and array_exception (from
  // internal::extract_interp_extrap) if "options" is not valid.
  template <typename XType, typename YType, bool YIsActive, typename XiType, int YDims>
  Array<YDims,YType,YIsActive>
  interp(const Array<1,XType,false>& x,
         const Array<YDims,YType,YIsActive>& y,
         const Array<1,XiType,false>& xi,
         internal::uint options = ADEPT_INTERPOLATE_LINEAR | ADEPT_EXTRAPOLATE_DEFAULT,
         YType extrap_value = std::numeric_limits<YType>::signaling_NaN()) {

    ExpressionSize<YDims> ans_dims = y.dimensions();
    ans_dims[0] = xi.size();
    Array<YDims,YType,YIsActive> ans(ans_dims);
    if (x.size() != y.size(0)) {
      throw(size_mismatch("Interpolation vector x must have same length of first dimension of y in interp"));
    }
    else if (x.size() == 0) {
      throw(size_mismatch("Interpolation from empty vectors"));
    }
    else if (x.size() == 1) {
      // Input arrays are at a single point: copy this point into all
      // output points regardless of their x coordinate
      for (Index ii = 0; ii < xi.size(); ++ii) {
        ans[ii] = y[0];
      }
      return ans;
    }

    internal::uint interp_scheme, extrap_policy;
    internal::extract_interp_extrap(options, interp_scheme, extrap_policy);

    if (x(0) < x(1)) {
      // Normal ordering
      for (Index i = 0; i < xi.size(); i++) {
        Real xii = xi(i);
        Index jmin = 0;
        Index jmax = x.size()-1;
        if (xii <= x(0)) {
          if (extrap_policy == ADEPT_EXTRAPOLATE_LINEAR) {
            // Extrapolate leftwards
            jmax = 1;
          }
          else if (extrap_policy == ADEPT_EXTRAPOLATE_CLAMP || xii == x(0)) {
            // Clamp at first value; a point exactly on the first
            // element of x is in bounds so it also takes this value
            // rather than the out-of-bounds constant
            ans[i] = y[0];
            continue;
          }
          else {
            ans[i] = extrap_value;
            continue;
          }
        }
        else if (xii >= x(jmax)) {
          if (extrap_policy == ADEPT_EXTRAPOLATE_LINEAR) {
            // Extrapolate rightwards
            jmin = jmax-1;
          }
          else if (extrap_policy == ADEPT_EXTRAPOLATE_CLAMP || xii == x(jmax)) {
            // Clamp at final value; a point exactly on the last
            // element of x is in bounds so it also takes this value
            ans[i] = y[jmax];
            continue;
          }
          else {
            ans[i] = extrap_value;
            continue;
          }
        }
        else {
          // xii lies within x: bisect to find the pair of adjacent
          // elements in which it sits
          while (jmax > jmin+1) {
            Index jmid = jmin + (jmax-jmin)/2;
            if (xii > x(jmid)) {
              jmin = jmid;
            }
            else {
              jmax = jmid;
            }
          }
        }
        if (interp_scheme == ADEPT_INTERPOLATE_LINEAR) {
          // Found value: linearly interpolate. Note that we need
          // square brackets here because ans and y may have more than
          // one dimension in which case we want to slice them
          // returning a lower dimensional array
          ans[i] = ((xii-x(jmin))*y[jmax] + (x(jmax)-xii)*y[jmin])
            / (x(jmax)-x(jmin));
        }
        else if (xii-x(jmin) > x(jmax)-xii) {
          // Nearest neighbour is at next point
          ans[i] = y[jmax];
        }
        else {
          // Nearest neighbour is at previous point
          ans[i] = y[jmin];
        }
      }
    }
    else {
      // Reverse ordering
      for (Index i = 0; i < xi.size(); i++) {
        Real xii = xi(i);
        Index jmin = 0;
        Index jmax = x.size()-1;
        if (xii >= x(0)) {
          if (extrap_policy == ADEPT_EXTRAPOLATE_LINEAR) {
            // Extrapolate leftwards
            jmax = 1;
          }
          else if (extrap_policy == ADEPT_EXTRAPOLATE_CLAMP || xii == x(0)) {
            // Clamp at first value; a point exactly on the first
            // element of x is in bounds so it also takes this value
            ans[i] = y[0];
            continue;
          }
          else {
            ans[i] = extrap_value;
            continue;
          }
        }
        else if (xii <= x(jmax)) {
          if (extrap_policy == ADEPT_EXTRAPOLATE_LINEAR) {
            // Extrapolate rightwards
            jmin = jmax-1;
          }
          else if (extrap_policy == ADEPT_EXTRAPOLATE_CLAMP || xii == x(jmax)) {
            // Clamp at last value; a point exactly on the last
            // element of x is in bounds so it also takes this value
            ans[i] = y[jmax];
            continue;
          }
          else {
            ans[i] = extrap_value;
            continue;
          }
        }
        else {
          // xii lies within x: bisect to find the pair of adjacent
          // elements in which it sits
          while (jmax > jmin+1) {
            Index jmid = jmin + (jmax-jmin)/2;
            if (xii < x(jmid)) {
              jmin = jmid;
            }
            else {
              jmax = jmid;
            }
          }
        }
        if (interp_scheme == ADEPT_INTERPOLATE_LINEAR) {
          // Found value: linearly interpolate (numerator and
          // denominator are both negative here)
          ans[i] = ((xii-x(jmin))*y[jmax] + (x(jmax)-xii)*y[jmin])
            / (x(jmax)-x(jmin));
        }
        else if (xii-x(jmin) < x(jmax)-xii) {
          // Nearest neighbour is at next point
          ans[i] = y[jmax];
        }
        else {
          // Nearest neighbour is at previous point
          ans[i] = y[jmin];
        }
      }
    }
    return ans;
  }

  // Overload of 1D interpolation taking arbitrary expressions for
  // x, y and xi: each is first copied into a temporary array (x and
  // xi into inactive vectors, y into an array with the rank and
  // activeness of its expression), then the array version of interp
  // is called.
  template <typename XType, typename YType, typename XiType,
            class X, class Y, class Xi>
  Array<Y::rank,YType,Y::is_active>
  interp(const Expression<XType,X>& x,
         const Expression<YType,Y>& y,
         const Expression<XiType,Xi>& xi,
         internal::uint options = ADEPT_INTERPOLATE_LINEAR | ADEPT_EXTRAPOLATE_DEFAULT,
         YType extrap_value = std::numeric_limits<YType>::signaling_NaN()) {
    const Array<1,XType,false> x_array(x.cast());
    const Array<Y::rank,YType,Y::is_active> y_array(y.cast());
    const Array<1,XiType,false> xi_array(xi.cast());
    return interp(x_array, y_array, xi_array, options, extrap_value);
  }

  // 1D logarithmic interpolation: interpolate log(y) linearly in x
  // and then exponentiate the result.  Where either of the two
  // values of y being combined is not positive, so that its
  // logarithm cannot be used, plain linear interpolation is used
  // instead.  Points of xi outside the range of x are linearly
  // extrapolated (in log(y) or y as above) from the nearest pair of
  // points.  The ordering of x (increasing or decreasing) is deduced
  // from its first two elements.  If x contains a single point then
  // the corresponding value of y is copied to every output point.
  // Note that x and xi must be inactive, but y can be active in
  // which case the returned vector will be too.
  //
  // Throws size_mismatch if x and y differ in length or are empty.
  template <typename XType, typename YType, bool YIsActive, typename XiType>
  Array<1,YType,YIsActive>
  log_interp(const Array<1,XType,false>& x,
         const Array<1,YType,YIsActive>& y,
         const Array<1,XiType,false>& xi) {
    using std::exp;
    using std::log;

    int length = xi.size();
    Array<1,YType,YIsActive> ans(length);
    if (x.size() != y.size()) {
      throw(size_mismatch("Interpolation vectors must be the same length in log_interp"));
    }
    else if (x.size() == 0) {
      // Without this check x(0) and x(1) below would be read out of
      // bounds
      throw(size_mismatch("Interpolation from empty vectors"));
    }
    else if (x.size() == 1) {
      // Input vectors are at a single point: copy this point into
      // all output points regardless of their x coordinate, since
      // x(1) does not exist
      for (Index i = 0; i < length; i++) {
        ans(i) = y(0);
      }
      return ans;
    }

    if (x(0) < x(1)) {
      // Normal ordering
      for (Index i = 0; i < length; i++) {
        Real xii = xi(i);
        Index jmin = 0;
        Index jmax = x.size()-1;
        if (xii <= x(0)) {
          // Extrapolate leftwards
          jmax = 1;
        }
        else if (xii >= x(jmax)) {
          // Extrapolate rightwards
          jmin = jmax-1;
        }
        else {
          // xii lies within x: bisect to find the pair of adjacent
          // elements in which it sits
          while (jmax > jmin+1) {
            Index jmid = jmin + (jmax-jmin)/2;
            if (xii > x(jmid)) {
              jmin = jmid;
            }
            else {
              jmax = jmid;
            }
          }
        }
        // Found value: logarithmically interpolate
        if (y(jmax) > 0.0 && y(jmin) > 0.0) {
          YType log_y_jmax = log(y(jmax));
          YType log_y_jmin = log(y(jmin));
          ans(i) = exp(((xii-x(jmin))*log_y_jmax + (x(jmax)-xii)*log_y_jmin)
                       / (x(jmax)-x(jmin)));
        }
        else {
          // Interpolate linearly since one or both values is not
          // positive
          ans(i) = ((xii-x(jmin))*y(jmax) + (x(jmax)-xii)*y(jmin))
            / (x(jmax)-x(jmin));
        }
      }
    }
    else {
      // Reverse ordering
      for (Index i = 0; i < length; i++) {
        Real xii = xi(i);
        Index jmin = 0;
        Index jmax = x.size()-1;
        if (xii >= x(0)) {
          // Extrapolate leftwards
          jmax = 1;
        }
        else if (xii <= x(jmax)) {
          // Extrapolate rightwards
          jmin = jmax-1;
        }
        else {
          // xii lies within x: bisect to find the pair of adjacent
          // elements in which it sits
          while (jmax > jmin+1) {
            Index jmid = jmin + (jmax-jmin)/2;
            if (xii < x(jmid)) {
              jmin = jmid;
            }
            else {
              jmax = jmid;
            }
          }
        }
        // Found value: logarithmically interpolate
        if (y(jmax) > 0.0 && y(jmin) > 0.0) {
          YType log_y_jmax = log(y(jmax));
          YType log_y_jmin = log(y(jmin));
          ans(i) = exp(((xii-x(jmin))*log_y_jmax + (x(jmax)-xii)*log_y_jmin)
                       / (x(jmax)-x(jmin)));
        }
        else {
          // Interpolate linearly since one or both values is not
          // positive
          ans(i) = ((xii-x(jmin))*y(jmax) + (x(jmax)-xii)*y(jmin))
            / (x(jmax)-x(jmin));
        }
      }
    }
    return ans;
  }

  // 2D interpolation: as 1D interpolation but with two coordinate
  // descriptions, x and y, for the first two dimensions of the array
  // M to be interpolated, and two vectors, xi and yi, providing the
  // coordinates of the points at which interpolated values are
  // required.  Point i of the result is obtained by interpolating M
  // to (xi(i),yi(i)); any dimensions of M beyond the first two are
  // carried through to the result.  XType and YType must be types
  // for which internal::InterpHelper provides
  // interp_get_indices_weights.  Points flagged as invalid by that
  // helper receive "extrap_value".
  //
  // Throws size_mismatch if x or y do not match the first two
  // dimensions of M, if either has fewer than two elements, or if xi
  // and yi differ in shape; throws array_exception (from
  // internal::extract_interp_extrap) if "options" is not valid.
  template <typename XType, typename YType,
            int MDims, typename MType, bool MIsActive,
            typename XiType, typename YiType>
  Array<MDims-1,MType,MIsActive>
  interp2d(const XType& x,
           const YType& y,
           const Array<MDims,MType,MIsActive>& M,
           const Array<1,XiType,false>& xi,
           const Array<1,YiType,false>& yi,
           internal::uint options = ADEPT_INTERPOLATE_LINEAR | ADEPT_EXTRAPOLATE_DEFAULT,
           MType extrap_value = std::numeric_limits<MType>::signaling_NaN()) {

    ADEPT_STATIC_ASSERT(MDims >= 2, TWO_DIMENSIONAL_INTERPOLATION_REQUIRES_2D_ARRAY);

    // Validate the shapes of the arguments
    if (x.size() != M.size(0)) {
      throw(size_mismatch("Interpolation vector x must have same length as first dimension of M in interp2d"));
    }
    if (y.size() != M.size(1)) {
      throw(size_mismatch("Interpolation vector y must have same length as second dimension of M in interp2d"));
    }
    if (x.size() < 2 || y.size() < 2) {
      throw(size_mismatch("Interpolation array must have at least two elements in each direction in interp2d"));
    }
    if (xi.dimensions() != yi.dimensions()) {
      throw(size_mismatch("Indexing arrays must be the same shape in interp2d"));
    }

    internal::uint interp_scheme, extrap_policy;
    internal::extract_interp_extrap(options, interp_scheme, extrap_policy);

    // The result has one entry per requested point, followed by any
    // trailing dimensions of M
    const Index n_points = xi.size();
    ExpressionSize<MDims-1> ans_dims;
    ans_dims[0] = n_points;
    for (int idim = 2; idim < MDims; ++idim) {
      ans_dims[idim-1] = M.size(idim);
    }
    Array<MDims-1,MType,MIsActive> ans(ans_dims);

    // For each dimension: index of the first of the two elements
    // used, and the weight of that first element
    IntVector x_index(n_points);
    Vector x_weight(n_points);
    IntVector y_index(n_points);
    Vector y_weight(n_points);
    boolVector in_bounds(n_points);
    in_bounds = true;
    internal::InterpHelper<XType>::interp_get_indices_weights(x, xi, interp_scheme, extrap_policy,
                                                              x_index, x_weight, in_bounds);
    internal::InterpHelper<YType>::interp_get_indices_weights(y, yi, interp_scheme, extrap_policy,
                                                              y_index, y_weight, in_bounds);

    for (Index ipoint = 0; ipoint < n_points; ++ipoint) {
      if (!in_bounds(ipoint)) {
        ans[ipoint] = extrap_value;
        continue;
      }
      const Index ix = x_index(ipoint);
      const Index iy = y_index(ipoint);
      // Bi-linear interpolation
      ans[ipoint] = y_weight(ipoint) * (      x_weight(ipoint)  * M[ix][iy]
                                        +(1.0-x_weight(ipoint)) * M[ix+1][iy])
        + (1.0-y_weight(ipoint)) * (      x_weight(ipoint)  * M[ix][iy+1]
                                    +(1.0-x_weight(ipoint)) * M[ix+1][iy+1]);
    }
    return ans;
  }

  // Overload of 2D interpolation taking arbitrary expressions for
  // all array arguments: each is first copied into a temporary array
  // (the coordinate and point expressions into inactive vectors, m
  // into an array with the rank and activeness of its expression),
  // then the array version of interp2d is called.
  template <typename XType, typename YType, typename MType, typename XiType, class YiType,
            class X, class Y, class M, class Xi, class Yi>
  Array<M::rank-1,MType,M::is_active>
  interp2d(const Expression<XType,X>& x,
           const Expression<YType,Y>& y,
           const Expression<MType,M>& m,
           const Expression<XiType,Xi>& xi,
           const Expression<YiType,Yi>& yi,
           internal::uint options = ADEPT_INTERPOLATE_LINEAR | ADEPT_EXTRAPOLATE_DEFAULT,
           MType extrap_value = std::numeric_limits<MType>::signaling_NaN()) {
    const Array<1,XType,false> x_array(x.cast());
    const Array<1,YType,false> y_array(y.cast());
    const Array<M::rank,MType,M::is_active> m_array(m.cast());
    const Array<1,XiType,false> xi_array(xi.cast());
    const Array<1,YiType,false> yi_array(yi.cast());
    return interp2d(x_array, y_array, m_array, xi_array, yi_array, options, extrap_value);
  }
  
  // 3D interpolation: as 1D interpolation but with three coordinate
  // descriptions, x, y and z, for the first three dimensions of the
  // array M to be interpolated, and three vectors, xi, yi and zi,
  // providing the coordinates of the points at which interpolated
  // values are required.  Point i of the result is obtained by
  // interpolating M to (xi(i),yi(i),zi(i)); any dimensions of M
  // beyond the first three are carried through to the result.
  // XType, YType and ZType must be types for which
  // internal::InterpHelper provides interp_get_indices_weights.
  // Points flagged as invalid by that helper receive "extrap_value".
  //
  // Throws size_mismatch if x, y or z do not match the first three
  // dimensions of M, if any has fewer than two elements, or if xi,
  // yi and zi differ in shape; throws array_exception (from
  // internal::extract_interp_extrap) if "options" is not valid.
  template <typename XType, typename YType, typename ZType,
            int MDims, typename MType, bool MIsActive,
            typename XiType, typename YiType, typename ZiType>
  Array<MDims-2,MType,MIsActive>
  interp3d(const XType& x,
           const YType& y,
           const ZType& z,
           const Array<MDims,MType,MIsActive>& M,
           const Array<1,XiType,false>& xi,
           const Array<1,YiType,false>& yi,
           const Array<1,ZiType,false>& zi,
           internal::uint options = ADEPT_INTERPOLATE_LINEAR | ADEPT_EXTRAPOLATE_DEFAULT,
           MType extrap_value = std::numeric_limits<MType>::signaling_NaN()) {

    ADEPT_STATIC_ASSERT(MDims >= 3, THREE_DIMENSIONAL_INTERPOLATION_REQUIRES_3D_ARRAY);

    // Validate the shapes of the arguments
    if (x.size() != M.size(0)) {
      throw(size_mismatch("Interpolation vector x must have same length as first dimension of M in interp3d"));
    }
    if (y.size() != M.size(1)) {
      throw(size_mismatch("Interpolation vector y must have same length as second dimension of M in interp3d"));
    }
    if (z.size() != M.size(2)) {
      throw(size_mismatch("Interpolation vector z must have same length as third dimension of M in interp3d"));
    }
    if (x.size() < 2 || y.size() < 2 || z.size() < 2) {
      throw(size_mismatch("Interpolation array must have at least two elements in each direction in interp3d"));
    }
    if (xi.dimensions() != yi.dimensions() || xi.dimensions() != zi.dimensions()) {
      throw(size_mismatch("Indexing arrays must be the same shape in interp3d"));
    }

    internal::uint interp_scheme, extrap_policy;
    internal::extract_interp_extrap(options, interp_scheme, extrap_policy);

    // The result has one entry per requested point, followed by any
    // trailing dimensions of M
    const Index n_points = xi.size();
    ExpressionSize<MDims-2> ans_dims;
    ans_dims[0] = n_points;
    for (int idim = 3; idim < MDims; ++idim) {
      ans_dims[idim-2] = M.size(idim);
    }
    Array<MDims-2,MType,MIsActive> ans(ans_dims);

    // For each dimension: index of the first of the two elements
    // used, and the weight of that first element
    IntVector x_index(n_points);
    Vector x_weight(n_points);
    IntVector y_index(n_points);
    Vector y_weight(n_points);
    IntVector z_index(n_points);
    Vector z_weight(n_points);
    boolVector in_bounds(n_points);
    in_bounds = true;
    internal::InterpHelper<XType>::interp_get_indices_weights(x, xi, interp_scheme, extrap_policy,
                                                              x_index, x_weight, in_bounds);
    internal::InterpHelper<YType>::interp_get_indices_weights(y, yi, interp_scheme, extrap_policy,
                                                              y_index, y_weight, in_bounds);
    internal::InterpHelper<ZType>::interp_get_indices_weights(z, zi, interp_scheme, extrap_policy,
                                                              z_index, z_weight, in_bounds);

    for (Index ipoint = 0; ipoint < n_points; ++ipoint) {
      if (!in_bounds(ipoint)) {
        ans[ipoint] = extrap_value;
        continue;
      }
      const Index ix = x_index(ipoint);
      const Index iy = y_index(ipoint);
      const Index iz = z_index(ipoint);
      // Tri-linear interpolation
      ans[ipoint] = x_weight(ipoint) *
        (y_weight(ipoint) * (z_weight(ipoint) * M[ix][iy][iz]
                             +(1.0-z_weight(ipoint)) * M[ix][iy][iz+1])
         + (1.0-y_weight(ipoint)) * (z_weight(ipoint)  * M[ix][iy+1][iz]
                                     +(1.0-z_weight(ipoint)) * M[ix][iy+1][iz+1]))
        + (1.0 - x_weight(ipoint)) *
        (y_weight(ipoint) * (z_weight(ipoint) * M[ix+1][iy][iz]
                             +(1.0-z_weight(ipoint)) * M[ix+1][iy][iz+1])
         + (1.0-y_weight(ipoint)) * (z_weight(ipoint)  * M[ix+1][iy+1][iz]
                                     +(1.0-z_weight(ipoint)) * M[ix+1][iy+1][iz+1]));
    }
    return ans;
  }

  // Overload of 3D interpolation taking arbitrary expressions for
  // all array arguments: each is first copied into a temporary array
  // (the coordinate and point expressions into inactive vectors, m
  // into an array with the rank and activeness of its expression),
  // then the array version of interp3d is called.
  template <typename XType, typename YType, typename ZType, typename MType,
            typename XiType, class YiType, class ZiType,
            class X, class Y, class Z, class M, class Xi, class Yi, class Zi>
  Array<M::rank-2,MType,M::is_active>
  interp3d(const Expression<XType,X>& x,
           const Expression<YType,Y>& y,
           const Expression<ZType,Z>& z,
           const Expression<MType,M>& m,
           const Expression<XiType,Xi>& xi,
           const Expression<YiType,Yi>& yi,
           const Expression<ZiType,Zi>& zi,
           internal::uint options = ADEPT_INTERPOLATE_LINEAR | ADEPT_EXTRAPOLATE_DEFAULT,
           MType extrap_value = std::numeric_limits<MType>::signaling_NaN()) {
    const Array<1,XType,false> x_array(x.cast());
    const Array<1,YType,false> y_array(y.cast());
    const Array<1,ZType,false> z_array(z.cast());
    const Array<M::rank,MType,M::is_active> m_array(m.cast());
    const Array<1,XiType,false> xi_array(xi.cast());
    const Array<1,YiType,false> yi_array(yi.cast());
    const Array<1,ZiType,false> zi_array(zi.cast());
    return interp3d(x_array, y_array, z_array, m_array, xi_array, yi_array, zi_array,
                    options, extrap_value);
  }
  
} // End namespace adept

#endif


================================================
FILE: include/adept/inv.h
================================================
/* inv.h -- Invert matrices

    Copyright (C) 2015 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/
                             
#ifndef AdeptInv_H
#define AdeptInv_H 1

#include <vector>

#include <adept/Array.h>
#include <adept/SpecialMatrix.h>

namespace adept {

  // -------------------------------------------------------------------
  // Invert general square matrix A, returning the inverse as a new
  // inactive matrix (declaration only: no definition appears in this
  // header)
  // -------------------------------------------------------------------
  template <typename Type>
  Array<2,Type,false> inv(const Array<2,Type,false>& A);

  // -------------------------------------------------------------------
  // Invert symmetric matrix A, returning the inverse as a symmetric
  // matrix of the same orientation (declaration only: no definition
  // appears in this header)
  // -------------------------------------------------------------------
  template <typename Type, SymmMatrixOrientation Orient>
  SpecialMatrix<Type,internal::SymmEngine<Orient>,false>
  inv(const SpecialMatrix<Type,internal::SymmEngine<Orient>,false>& A);
 
  // -------------------------------------------------------------------
  // Invert arbitrary expression: available only for inactive rank-2
  // expressions whose element type has matrix operations defined.
  // The expression is first copied into a dense matrix, which is
  // then passed to inv.
  // -------------------------------------------------------------------
  template <typename Type, class E>
  typename internal::enable_if<E::rank==2 && !E::is_active
                               && internal::matrix_op_defined<Type>::value,
                               Array<2,Type,false> >::type
  inv(const Expression<Type,E>& A) {
    Array<2,Type,false> dense_matrix = A.cast();
    return inv(dense_matrix);
  }
 
}

#endif


================================================
FILE: include/adept/matmul.h
================================================
/* matmul.h -- Matrix multiplication capability

    Copyright (C) 2015-2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/
                             
#ifndef AdeptMatmul_H
#define AdeptMatmul_H

#include <cmath>

#include <adept/Array.h>
#include <adept/SpecialMatrix.h>
#include <adept/cppblas.h>

namespace adept {

  namespace internal {

    // ---------------------------------------------------------------------
    // Helper functions for checking dimensions
    // ---------------------------------------------------------------------
    // Check that "left" and "right" can be multiplied together:
    // throws empty_array if either is empty, otherwise throws
    // inner_dimension_mismatch if the second dimension of "left"
    // differs from the first dimension of "right"
    template <class L, class R>
    inline void check_inner_dimensions(const L& left, const R& right) {
      const bool any_empty = left.empty() || right.empty();
      if (any_empty) {
        throw empty_array("Attempt to perform matrix multiplication with empty array(s)"
                          ADEPT_EXCEPTION_LOCATION);
      }
      const bool inner_mismatch = (left.dimension(1) != right.dimension(0));
      if (inner_mismatch) {
        throw inner_dimension_mismatch("Inner dimension mismatch in array multiplication"
                                       ADEPT_EXCEPTION_LOCATION);
      }
    }

    // As check_inner_dimensions, but for a left-hand operand
    // described only by its dimension "left_dim": throws empty_array
    // if left_dim is zero or "right" is empty, otherwise throws
    // inner_dimension_mismatch if left_dim differs from the first
    // dimension of "right"
    template <class R>
    inline void check_inner_dimensions_sqr(Index left_dim, const R& right) {
      const bool any_empty = (left_dim == 0) || right.empty();
      if (any_empty) {
        throw empty_array("Attempt to perform matrix multiplication with empty array(s)"
                          ADEPT_EXCEPTION_LOCATION);
      }
      const bool inner_mismatch = (left_dim != right.dimension(0));
      if (inner_mismatch) {
        throw inner_dimension_mismatch("Inner dimension mismatch in array multiplication"
                                       ADEPT_EXCEPTION_LOCATION);
      }
    }

    // ---------------------------------------------------------------------
    // Underlying functions
    // ---------------------------------------------------------------------

    // Dense matrix-vector multiplication
    //
    // Returns a vector of length left.dimension(0) computed from
    // matrix "left" and vector "right" by a call to cppblas_gemv; the
    // result is active if either argument is active.  The arguments
    // are first validated by check_inner_dimensions, which throws if
    // either is empty or their inner dimensions differ.  If the
    // result is active (and, when ADEPT_RECORDING_PAUSABLE is
    // defined, the active stack is recording) then the derivative
    // information for each element of the result is pushed on to the
    // active stack.
    template <typename T, bool LIsActive, bool RIsActive>
    inline
    Array<1,T,(LIsActive||RIsActive)>
    matmul_(const Array<2,T,LIsActive>& left, const Array<1,T,RIsActive>& right) {      
      static const bool is_active = LIsActive || RIsActive;

      // Throws if either array is empty or the inner dimensions differ
      check_inner_dimensions(left, right);

      Array<1,T,is_active> ans(left.dimension(0));

      // Select the storage order and stride to pass to cppblas_gemv
      // according to which dimension of "left" is contiguous
      Index stride;
      BLAS_ORDER order;
      if (!left.is_row_contiguous() && !left.is_column_contiguous()) {
	// Matrix is strided in both directions so needs to be copied
	// first
	Array<2,T,LIsActive> left_;
	left_ = left;
	// Recurse with the copy in place of "left"
	return matmul_(left_, right);
      }
      else if (left.is_row_contiguous()) {
	order = BlasRowMajor;
	stride = left.offset(0);
      }
      else {
	order = BlasColMajor;
	stride = left.offset(1);
      }
      // The scalars 1.0 and 0.0 are presumably the usual BLAS "alpha"
      // and "beta" so that ans is overwritten with left*right -
      // confirm against cppblas.h
      cppblas_gemv(order, BlasNoTrans, left.dimension(0), left.dimension(1), 
		   1.0, left.const_data(), stride, 
		   right.const_data(), right.offset(0), 
		   0.0, ans.data(), ans.offset(0));
      if (is_active
#ifdef ADEPT_RECORDING_PAUSABLE
	  && ADEPT_ACTIVE_STACK->is_recording()
#endif
	  ) {

	// Record the differential statement for each element of the
	// result in turn
	uIndex left_index = left.gradient_index();
	uIndex right_index = right.gradient_index();
	uIndex ans_index = ans.gradient_index();
	Index n = right.dimension(0);
	const ExpressionSize<2>& left_offset = left.offset();
	const ExpressionSize<1>& right_offset = right.offset();
	for (Index i = 0; i < ans.dimension(0); ++i) {
	  if (LIsActive) {
	    // Dependence of ans(i) on row i of "left": gradient indices
	    // start at the first element of that row and the multipliers
	    // are read from "right" (presumably n entries with the
	    // given index and data strides - confirm against Stack.h)
	    active_stack()->push_derivative_dependence(left_index+i*left_offset[0], 
						       right.const_data(), n, left_offset[1], right_offset[0]);
	  }
	  if (RIsActive) {
	    // Dependence of ans(i) on "right": the multipliers are read
	    // from row i of "left"
	    active_stack()->push_derivative_dependence(right_index, 
						       left.const_data()+i*left_offset[0], 
						       n, right_offset[0], left_offset[1]);
	  }
	  // The left-hand side is pushed after its dependences
	  active_stack()->push_lhs(ans_index + i*ans.offset(0));
	}
      }

      return ans;
    }


    // Dense matrix-matrix multiplication
    //
    // Returns a matrix of size left.dimension(0) by
    // right.dimension(1) computed from matrices "left" and "right" by
    // a call to cppblas_gemm; the result is active if either
    // argument is active.  The arguments are first validated by
    // check_inner_dimensions, which throws if either is empty or
    // their inner dimensions differ.  Any argument that is neither
    // row nor column contiguous is copied to a temporary matrix and
    // the function calls itself with the copy.  If the result is
    // active (and, when ADEPT_RECORDING_PAUSABLE is defined, the
    // active stack is recording) then the derivative information for
    // each element of the result is pushed on to the active stack.
    template <typename T, bool LIsActive, bool RIsActive>
    inline
    Array<2,T,(LIsActive||RIsActive)>
    matmul_(const Array<2,T,LIsActive>& left, const Array<2,T,RIsActive>& right) {
      static const bool is_active = LIsActive || RIsActive;

      // Throws if either array is empty or the inner dimensions differ
      check_inner_dimensions(left, right);

      if (!left.is_row_contiguous() && !left.is_column_contiguous()) {
	// "left" is strided in both directions so is copied first
	Array<2,T,LIsActive> left_;
	left_ = left;
	if (!right.is_row_contiguous() && !right.is_column_contiguous()) {
	  // "right" is also strided in both directions: copy it too
	  Array<2,T,RIsActive> right_;
	  right_ = right;
	  return matmul_(left_, right_);
	}
	else {
	  return matmul_(left_, right);
	}
      }
      else if (!right.is_row_contiguous() && !right.is_column_contiguous()) {
	// Only "right" is strided in both directions: copy it
	Array<2,T,RIsActive> right_;
	right_ = right;
	return matmul_(left, right_);
      }
      else {
	// Both arguments are contiguous in at least one dimension
	Index left_stride, right_stride, ans_stride;
	BLAS_TRANSPOSE left_trans, right_trans;
	BLAS_ORDER order;
	Array<2,T,is_active> ans(left.dimension(0),right.dimension(1));

	// The storage order passed to cppblas_gemm is taken from the
	// layout of the result
	if (ans.is_row_contiguous()) {
	  order = BlasRowMajor;
	  ans_stride = ans.offset(0);
	}
	else {
	  order = BlasColMajor;
	  ans_stride = ans.offset(1);
	}
	// An argument whose contiguous dimension does not match the
	// chosen order is flagged as transposed
	if (left.is_row_contiguous()) {
	  left_trans = order == BlasRowMajor ? BlasNoTrans : BlasTrans;
	  left_stride = left.offset(0);
	}
	else {
	  left_trans = order == BlasColMajor ? BlasNoTrans : BlasTrans;
	  left_stride = left.offset(1);
	}
	if (right.is_row_contiguous()) {
	  right_trans = order == BlasRowMajor ? BlasNoTrans : BlasTrans;
	  right_stride = right.offset(0);
	}
	else {
	  right_trans = order == BlasColMajor ? BlasNoTrans : BlasTrans;
	  right_stride = right.offset(1);
	}
	// The scalars 1.0 and 0.0 are presumably the usual BLAS
	// "alpha" and "beta" so that ans is overwritten with
	// left*right - confirm against cppblas.h
	cppblas_gemm(order, left_trans, right_trans,
		    left.dimension(0), right.dimension(1), left.dimension(1),
		    1.0, left.const_data(), left_stride,
		    right.const_data(), right_stride,
		    0.0, ans.data(), ans_stride);
	if ( (LIsActive || RIsActive)
#ifdef ADEPT_RECORDING_PAUSABLE
	    && ADEPT_ACTIVE_STACK->is_recording()
#endif
	    ) {
	  // Record the differential statement for each element of the
	  // result in turn
	  uIndex left_index = left.gradient_index();
	  uIndex right_index = right.gradient_index();
	  uIndex ans_index = ans.gradient_index();
	  Index n = right.dimension(0);
	  const ExpressionSize<2>& left_offset = left.offset();
	  const ExpressionSize<2>& right_offset = right.offset();

	  for (Index i = 0; i < ans.dimension(0); ++i) {
	    for (Index j = 0; j < ans.dimension(1); ++j) {
	      if (LIsActive) {
		// Dependence of ans(i,j) on row i of "left": the
		// multipliers are read from column j of "right"
		active_stack()->push_derivative_dependence(left_index+i*left_offset[0], 
			   right.const_data()+j*right_offset[1], n, 
			   left_offset[1], right_offset[0]);
	      }
	      if (RIsActive) {
		// Dependence of ans(i,j) on column j of "right": the
		// multipliers are read from row i of "left"
		active_stack()->push_derivative_dependence(right_index+j*right_offset[1], 
			   left.const_data()+i*left_offset[0], n, 
			   right_offset[0], left_offset[1]);
	      }
	      // The left-hand side is pushed after its dependences
	      active_stack()->push_lhs(ans_index + i*ans.offset(0) + j*ans.offset(1));
	    }
	  }

	}
	return ans;
      }
    }

    // Symmetric matrix-vector multiplication via cppblas_symv.  The
    // symmetric matrix is described by a pointer to its data, its
    // orientation, its dimension and its offset.  Throws
    // invalid_operation if either argument is active, so
    // left_gradient_index is not used.
    template <bool LIsActive, typename T, bool RIsActive>
    inline
    Array<1,T,(LIsActive||RIsActive)>
    matmul_symmetric(const T* left_ptr, SymmMatrixOrientation left_orient, Index left_dim,
                     Index left_offset, uIndex left_gradient_index,
                     const Array<1,T,RIsActive>& right) {
      static const bool is_active = LIsActive || RIsActive;

      check_inner_dimensions_sqr(left_dim, right);

      if (is_active) {
        throw(invalid_operation("Cannot yet do matmul(SymmMatrix,Vector) when either are active"));
      }

      // The matrix is always presented to BLAS as row major, so the
      // orientation alone selects the triangle holding the data
      BLAS_UPLO uplo = (left_orient == ROW_LOWER_COL_UPPER) ? BlasLower : BlasUpper;

      Array<1,T,is_active> ans(right.dimension(0));
      cppblas_symv(BlasRowMajor, uplo, right.dimension(0),
                   1.0, left_ptr, left_offset,
                   right.const_data(), right.offset(0),
                   0.0, ans.data(), ans.offset(0));
      return ans;
    }

    // Symmetric matrix-matrix multiplication via cppblas_symm, with
    // the symmetric matrix on the left.  Throws invalid_operation if
    // either argument is active.  The result is given the same
    // storage order (row or column major) as the right-hand matrix.
    template <bool LIsActive, typename T, bool RIsActive>
    inline
    Array<2,T,(LIsActive||RIsActive)>
    matmul_symmetric(const T* left_ptr, SymmMatrixOrientation left_orient, Index left_dim,
                     Index left_offset, uIndex left_gradient_index,
                     const Array<2,T,RIsActive>& right) {

      check_inner_dimensions_sqr(left_dim, right);

      if (LIsActive || RIsActive) {
        throw(invalid_operation("Cannot yet do matmul(SymmMatrix,Matrix) when either are active"));
      }

      const bool right_row_major = right.is_row_contiguous();
      if (!(right_row_major || right.is_column_contiguous())) {
        // Right-hand matrix is contiguous in neither dimension: copy
        // it and call this function again with the copy
        Array<2,T,RIsActive> right_copy;
        right_copy = right;
        return matmul_symmetric<LIsActive>(left_ptr, left_orient, left_dim, left_offset,
                                           left_gradient_index, right_copy);
      }

      const bool lower_by_row = (left_orient == ROW_LOWER_COL_UPPER);
      Array<2,T,(LIsActive||RIsActive)> ans;
      BLAS_ORDER order;
      BLAS_UPLO uplo;
      Index right_stride;
      Index ans_stride;

      if (right_row_major) {
        ans.resize_row_major(right.dimensions());
        order = BlasRowMajor;
        uplo = lower_by_row ? BlasLower : BlasUpper;
        right_stride = right.offset(0);
        ans_stride = ans.offset(0);
      }
      else {
        // Viewing the same symmetric data as column major swaps
        // which triangle is reported to BLAS
        ans.resize_column_major(right.dimensions());
        order = BlasColMajor;
        uplo = lower_by_row ? BlasUpper : BlasLower;
        right_stride = right.offset(1);
        ans_stride = ans.offset(1);
      }

      cppblas_symm(order, BlasLeft, uplo, right.dimension(0), right.dimension(1),
                   1.0, left_ptr, left_offset,
                   right.const_data(), right_stride, 0.0,
                   ans.data(), ans_stride);
      return ans;
    }


    // Band matrix-vector multiplication via cppblas_gbmv.  The band
    // matrix is described by a pointer to its top-left element, its
    // storage order, its number of sub-diagonals (LDiags) and
    // super-diagonals (UDiags), its dimension and its offset.
    // Throws invalid_operation if the band matrix is active, so
    // left_gradient_index is not used.  If the vector is active
    // (and, when ADEPT_RECORDING_PAUSABLE is defined, the active
    // stack is recording) then one differential statement per
    // element of the result is pushed on to the active stack.
    template <bool LIsActive, typename T, bool RIsActive>
    inline
    Array<1,T,(LIsActive||RIsActive)>
    matmul_band(const T* left_ptr, MatrixStorageOrder left_order, 
                Index LDiags, Index UDiags, Index left_dim, Index left_offset,
                uIndex left_gradient_index, const Array<1,T,RIsActive>& right) {
      check_inner_dimensions_sqr(left_dim, right);

      if (LIsActive) {
        throw(invalid_operation("Cannot yet do matmul(BandMatrix,Vector) for active BandMatrix"));
      }

      BLAS_ORDER order;
      // BLAS declares the start pointer to be in the "missing data"
      // zone, so we need to subtract from the address of the top-left
      // corner of the matrix
      const T* left_start;
      if (left_order == ROW_MAJOR) {
        order = BlasRowMajor;
        left_start = left_ptr-UDiags;
      }
      else {
        order = BlasColMajor;
        left_start = left_ptr-LDiags;
      }
      Array<1,T,(LIsActive||RIsActive)> ans(right.dimension(0));
      cppblas_gbmv(order, BlasNoTrans, left_dim, left_dim, LDiags, UDiags,
                   1.0, left_start, left_offset+1,
                   right.const_data(), right.offset(0), 
                   0.0, ans.data(), ans.offset(0));

      // Same recording guard as the dense matmul_ in this file, so
      // that nothing is pushed while recording is paused
      if (RIsActive
#ifdef ADEPT_RECORDING_PAUSABLE
          && ADEPT_ACTIVE_STACK->is_recording()
#endif
          ) {
        const uIndex right_index = right.gradient_index();
        const uIndex ans_index = ans.gradient_index();
        const Index right_stride = right.offset(0);
        const Index ans_stride = ans.offset(0);

        // Memory strides (relative to left_ptr) for moving one row
        // down and one column across in the band matrix
        const bool row_major = (left_order == ROW_MAJOR);
        const Index row_stride = row_major ? left_offset : 1;
        const Index col_stride = row_major ? 1 : left_offset;

        for (Index i = 0; i < ans.dimension(0); ++i) {
          // Range of columns with stored data in row i, using info
          // from BandEngine::get_row_range in SpecialMatrix.h
          Index j_start = i<LDiags ? 0 : i-LDiags;
          Index j_end_plus_1 = i+UDiags+1>left_dim ? left_dim : i+UDiags+1;
          Index n = j_end_plus_1 - j_start;
          // The gradient index of element j of "right" is
          // right_index + j*right_stride, so the first index must be
          // scaled by the stride just like the increment passed as
          // the fourth argument
          active_stack()->push_derivative_dependence(right_index + j_start*right_stride,
                                                     left_ptr + i*row_stride + j_start*col_stride,
                                                     n, right_stride, col_stride);
          active_stack()->push_lhs(ans_index + i*ans_stride);
        }
      }
      return ans;
    }


    // Matrix-matrix multiplication with a band matrix on the left:
    // each column of the result is computed by a separate call to
    // cppblas_gbmv on the corresponding column of the right-hand
    // matrix.  Throws invalid_operation if either argument is
    // active.
    template <bool LIsActive, typename T, bool RIsActive>
    inline
    Array<2,T,(LIsActive||RIsActive)>
    matmul_band(const T* left_ptr, MatrixStorageOrder left_order, 
                Index LDiags, Index UDiags, Index left_dim, Index left_offset,
                uIndex left_gradient_index, const Array<2,T,RIsActive>& right) {
      check_inner_dimensions_sqr(left_dim, right);
      if (LIsActive || RIsActive) {
        throw(invalid_operation("Cannot yet do matmul(BandMatrix,Matrix) when either are active"));
      }

      const bool row_major = (left_order == ROW_MAJOR);
      BLAS_ORDER order = row_major ? BlasRowMajor : BlasColMajor;
      // BLAS takes the start pointer to lie in the "missing data"
      // zone, so step back from the address of the top-left corner
      // of the matrix
      const T* left_start = left_ptr - (row_major ? UDiags : LDiags);

      Array<2,T,(LIsActive||RIsActive)> ans(right.dimension(0),right.dimension(1));
      const Index n_columns = right.dimension(1);
      for (Index col = 0; col < n_columns; ++col) {
        // Column "col" of right is the input vector and column "col"
        // of ans is the output vector
        cppblas_gbmv(order, BlasNoTrans, left_dim, left_dim, LDiags, UDiags,
                     1.0, left_start, left_offset+1,
                     right.const_data()+col*right.offset(1), right.offset(0),
                     0.0, ans.data()+col*ans.offset(1), ans.offset(0));
      }
      return ans;
    }
    

    // ---------------------------------------------------------------------
    // Versions of matmul_ implemented in terms of the underlying functions
    // ---------------------------------------------------------------------

    // Dense vector-matrix multiplication is evaluated by swapping and
    // transposing the arguments: the product of vector "left" and
    // matrix "right" is computed as matmul_(right.T(), left).  The
    // result is a vector that is active if either argument is
    // active.
    template <typename T, bool LIsActive, bool RIsActive>
    inline
    Array<1,T,(LIsActive||RIsActive)>
    matmul_(const Array<1,T,LIsActive>& left,
	    const Array<2,T,RIsActive>& right) {
      // NOTE(review): presumably resolves to a dense matrix-vector
      // overload of matmul_ defined earlier in this file
      return matmul_(right.T(), left);
    }

    // Symmetric matrix-vector and matrix-matrix multiplication:
    // unpacks the symmetric matrix "left" into its data pointer,
    // orientation, dimension, offset and gradient index, and
    // forwards to matmul_symmetric.  The rank of "right" (RRank)
    // selects the vector or matrix overload of matmul_symmetric.
    template <typename T, SymmMatrixOrientation LOrient, bool LIsActive, bool RIsActive, int RRank>
    inline
    Array<RRank,T,(LIsActive||RIsActive)>
    matmul_(const SpecialMatrix<T,internal::SymmEngine<LOrient>,LIsActive>& left,
	    const Array<RRank,T,RIsActive>& right) {
      return matmul_symmetric<LIsActive>(left.const_data(), LOrient, left.dimension(0),
					 left.offset(), left.gradient_index(), right);
    }

    // Vector multiplied by symmetric matrix: swap and transpose the
    // arguments.  Since the matrix is symmetric its transpose is
    // itself, so the symmetric matrix is simply passed as the
    // left-hand argument of matmul_symmetric with the vector on the
    // right.
    template <typename T, bool LIsActive, SymmMatrixOrientation ROrient, bool RIsActive>
    inline
    Array<1,T,(LIsActive||RIsActive)>
    matmul_(const Array<1,T,LIsActive>& left,
	    const SpecialMatrix<T,internal::SymmEngine<ROrient>,RIsActive>& right) {
      // The explicit template argument is the activeness of the
      // matrix, which is the "left" argument of matmul_symmetric
      return matmul_symmetric<RIsActive>(right.const_data(), ROrient, 
					 right.dimension(0), right.offset(),
					 right.gradient_index(), left);
    }

    // Dense matrix multiplied by symmetric matrix: swap and transpose
    // the arguments, then transpose the result.  That is, with S
    // symmetric, the product A*S is evaluated as the transpose of
    // S*transpose(A).
    template <typename T, bool LIsActive, SymmMatrixOrientation ROrient, bool RIsActive>
    inline
    Array<2,T,(LIsActive||RIsActive)>
    matmul_(const Array<2,T,LIsActive>& left,
	    const SpecialMatrix<T,internal::SymmEngine<ROrient>,RIsActive>& right) {
      // The explicit template argument is the activeness of the
      // symmetric matrix, which is the "left" argument of
      // matmul_symmetric
      return matmul_symmetric<RIsActive>(right.const_data(), ROrient,
					 right.dimension(0), right.offset(),
					 right.gradient_index(), left.T()).T();
    }

    // Band matrix-vector and matrix-matrix multiplication: unpacks
    // the band matrix "left" into its data pointer, storage order,
    // numbers of sub- and super-diagonals, dimension, offset and
    // gradient index, and forwards to matmul_band.  The rank of
    // "right" (RRank) selects the vector or matrix overload of
    // matmul_band.
    template <typename T, MatrixStorageOrder LOrder, Index LDiags, Index UDiags, 
	      bool LIsActive, bool RIsActive, int RRank>
    inline
    Array<RRank,T,(LIsActive||RIsActive)>
    matmul_(const SpecialMatrix<T,internal::BandEngine<LOrder,LDiags,UDiags>,LIsActive>& left,
	    const Array<RRank,T,RIsActive>& right) {
      return matmul_band<LIsActive>(left.const_data(), LOrder, LDiags, UDiags,
				    left.dimension(0), left.offset(), left.gradient_index(), right);
    }

    // Vector multiplied by band matrix: swap and transpose the
    // arguments.  The transpose of the band matrix is obtained
    // without copying, by passing the same data to matmul_band with
    // the opposite storage order and with the numbers of sub- and
    // super-diagonals exchanged.
    template <typename T, bool LIsActive, MatrixStorageOrder ROrder, Index LDiags, Index UDiags,
              bool RIsActive>
    inline
    Array<1,T,(LIsActive||RIsActive)>
    matmul_(const Array<1,T,LIsActive>& left,
            const SpecialMatrix<T,internal::BandEngine<ROrder,LDiags,UDiags>,RIsActive>& right) {
      // Storage order of the transposed band matrix
      static const MatrixStorageOrder transposed_order
        = (ROrder == ROW_MAJOR) ? COL_MAJOR : ROW_MAJOR;
      return matmul_band<RIsActive>(right.const_data(), transposed_order,
                                    UDiags, LDiags,
                                    right.dimension(0), right.offset(),
                                    right.gradient_index(), left);
    }

    // Dense matrix multiplied by band matrix: swap and transpose the
    // arguments, then transpose the result.  The transpose of the
    // band matrix is obtained without copying, by passing the same
    // data to matmul_band with the opposite storage order and with
    // the numbers of sub- and super-diagonals exchanged.
    template <typename T, bool LIsActive, MatrixStorageOrder ROrder, Index LDiags, Index UDiags,
              bool RIsActive>
    inline
    Array<2,T,(LIsActive||RIsActive)>
    matmul_(const Array<2,T,LIsActive>& left,
            const SpecialMatrix<T,internal::BandEngine<ROrder,LDiags,UDiags>,RIsActive>& right) {
      // Storage order of the transposed band matrix
      static const MatrixStorageOrder transposed_order
        = (ROrder == ROW_MAJOR) ? COL_MAJOR : ROW_MAJOR;
      return matmul_band<RIsActive>(right.const_data(), transposed_order,
                                    UDiags, LDiags,
                                    right.dimension(0), right.offset(),
                                    right.gradient_index(), left.T()).T();
    }


    // ---------------------------------------------------------------------
    // promote_array: helper function to change type of array and
    // convert expressions to arrays
    // ---------------------------------------------------------------------

    // Overload for expressions that are not l-values: the expression
    // is converted to a dense array of the same rank and activeness
    // with element type NewType
    template <typename NewType, typename OldType, class A>
    inline
    typename internal::enable_if<!A::is_lvalue,Array<A::rank,NewType,A::is_active> >::type
    promote_array(const Expression<OldType,A>& arg) {
      typedef Array<A::rank,NewType,A::is_active> dense_type;
      return dense_type(arg);
    }

    // Overload for dense arrays: construct an array of element type
    // NewType from the argument.  This will only involve a copy of
    // the raw data if the type is changed, otherwise the new array
    // will simply link to the old one.  The const qualifier is cast
    // away so that the constructor taking a non-const array is used.
    template <typename NewType, int Rank, typename OldType, bool IsActive>
    inline
    Array<Rank,NewType,IsActive>
    promote_array(const Array<Rank,OldType,IsActive>& arg) {
      typedef Array<Rank,OldType,IsActive> source_type;
      typedef Array<Rank,NewType,IsActive> target_type;
      return target_type(const_cast<source_type&>(arg));
    }

#ifdef ADEPT_ONLY_DIFFERENTIATE_DENSE_MATRIX_MULTIPLICATION
    // Active special matrices of any kind are copied to a dense
    // "Array", because differentiation of the various types of
    // special matrix (symmetric, band, upper, lower etc) is not yet
    // implemented.
    template <typename NewType, typename OldType, class Engine>
    inline
    Array<2,NewType,true>
    promote_array(const SpecialMatrix<OldType,Engine,true>& arg) {
      typedef SpecialMatrix<OldType,Engine,true> source_type;
      typedef Array<2,NewType,true> dense_type;
      return dense_type(const_cast<source_type&>(arg));
    }

    // Inactive symmetric matrices keep their special-matrix type and
    // only have their element type converted; this will only involve
    // a copy of the raw data if the type is changed, otherwise the
    // new matrix will simply link to the old
    template <typename NewType, typename OldType, SymmMatrixOrientation Orient>
    inline
    SpecialMatrix<NewType,internal::SymmEngine<Orient>,false>
    promote_array(const SpecialMatrix<OldType,internal::SymmEngine<Orient>,false>& arg) {
      typedef internal::SymmEngine<Orient> engine_type;
      typedef SpecialMatrix<OldType,engine_type,false> source_type;
      typedef SpecialMatrix<NewType,engine_type,false> target_type;
      return target_type(const_cast<source_type&>(arg));
    }

    // Inactive band matrices are treated in the same way as inactive
    // symmetric matrices
    template <typename NewType, typename OldType, 
      MatrixStorageOrder Order, Index LDiags, Index UDiags>
    inline
    SpecialMatrix<NewType,internal::BandEngine<Order,LDiags,UDiags>,false>
    promote_array(const SpecialMatrix<OldType,internal::BandEngine<Order,LDiags,UDiags>,false>& arg) {
      typedef internal::BandEngine<Order,LDiags,UDiags> engine_type;
      typedef SpecialMatrix<OldType,engine_type,false> source_type;
      typedef SpecialMatrix<NewType,engine_type,false> target_type;
      return target_type(const_cast<source_type&>(arg));
    }

    // For other inactive special matrices (square and triangular),
    // specific matrix multiplication functions have not yet been
    // added, so they are converted to a dense array first
    template <typename NewType, typename OldType, class Engine>
    inline
    Array<2,NewType,false>
    promote_array(const SpecialMatrix<OldType,Engine,false>& arg) {
      typedef SpecialMatrix<OldType,Engine,false> source_type;
      typedef Array<2,NewType,false> dense_type;
      return dense_type(const_cast<source_type&>(arg));
    }

#else
    // The following assumes that the Adept library knows how to
    // differentiate special matrices: currently it doesn't, so this
    // path is likely to throw a run-time exception.  Every special
    // matrix keeps its engine and activeness and only has its
    // element type converted.
    template <typename NewType, typename OldType, class Engine, bool IsActive>
    inline
    SpecialMatrix<NewType,Engine,IsActive>
    promote_array(const SpecialMatrix<OldType,Engine,IsActive>& arg) {
      typedef SpecialMatrix<OldType,Engine,IsActive> source_type;
      typedef SpecialMatrix<NewType,Engine,IsActive> target_type;
      return target_type(const_cast<source_type&>(arg));
    }
#endif

    // Overload for fixed arrays whose element type differs from
    // NewType: the fixed array is copied into a dense array of the
    // same rank and activeness
    template <typename NewType, typename OldType, bool IsActive, Index J0,
              Index J1, Index J2, Index J3, Index J4, Index J5, Index J6>
    inline
    typename enable_if<!is_same<NewType,OldType>::value,
                       Array<fixed_array<J0,J1,J2,J3,J4,J5,J6>::rank,
                             NewType,IsActive> >::type
    promote_array(const FixedArray<OldType,IsActive,J0,J1,J2,J3,J4,J5,J6>& arg) {
      typedef FixedArray<OldType,IsActive,J0,J1,J2,J3,J4,J5,J6> source_type;
      typedef Array<fixed_array<J0,J1,J2,J3,J4,J5,J6>::rank,
                    NewType,IsActive> target_type;
      return target_type(const_cast<source_type&>(arg));
    }

    // Overload for fixed arrays whose element type is already
    // NewType: rather than copying, return a dense array built
    // directly from the fixed array's data pointer, dimensions,
    // offsets and gradient index, so that it links to the same data
    template <typename NewType, typename OldType, bool IsActive, Index J0, 
              Index J1, Index J2, Index J3, Index J4, Index J5, Index J6>
    inline
    typename enable_if<is_same<NewType,OldType>::value,
                       Array<fixed_array<J0,J1,J2,J3,J4,J5,J6>::rank,
                             NewType,IsActive> >::type
    promote_array(const FixedArray<OldType,IsActive,J0,J1,J2,J3,J4,J5,J6>& arg) {
      typedef FixedArray<OldType,IsActive,J0,J1,J2,J3,J4,J5,J6> source_type;
      typedef Array<fixed_array<J0,J1,J2,J3,J4,J5,J6>::rank,
                    NewType,IsActive> target_type;
      // The second argument is passed as a null pointer
      return target_type(const_cast<source_type&>(arg).data(), 0,
                         arg.dimensions(), arg.offset(), arg.gradient_index());
    }

  } // End namespace internal

  // ---------------------------------------------------------------------
  // matmul function: replicates Fortran-90 equivalent
  // ---------------------------------------------------------------------

  // Matrix multiplication of two expressions of rank 1 or 2, at
  // least one of which must be of rank 2.  Both arguments are passed
  // through promote_array, which converts them to a common element
  // type and converts any argument that is not an lvalue (i.e. is an
  // array expression rather than an array) into a dense array; the
  // results are then passed to the appropriate overload of
  // internal::matmul_.
  template <typename LType, class L, typename RType, class R>
  inline
  typename internal::enable_if<(L::rank == 1 || L::rank == 2) && (R::rank == 1 || R::rank == 2)
                      && (L::rank+R::rank > 2),
    Array<L::rank+R::rank-2,typename internal::promote<LType,RType>::type,
    L::is_active||R::is_active> >::type
  matmul(const Expression<LType,L>& left, const Expression<RType,R>& right) {
    // Common element type to which both arguments are promoted
    typedef typename internal::promote<typename L::type,
                                       typename R::type>::type promoted_type;
    return internal::matmul_(internal::promote_array<promoted_type>(left.cast()),
                             internal::promote_array<promoted_type>(right.cast()));
  }
  

  // ---------------------------------------------------------------------
  // Implement "**" pseudo-operator for matrix multiplication
  // ---------------------------------------------------------------------

  // In order for A**B to lead to matrix multiplication, *B will
  // return a MatmulRHS object, and A*[a MatmulRHS object] will send
  // the two arguments to the matmul function

  namespace internal {

    // The MatmulRHS class simply contains a reference to an array.
    // It is the object returned by the unary "*" operator in the
    // "A ** B" pseudo-operator; it does not own the array, so it must
    // not outlive the expression it was created from.
    template <class A>
    struct MatmulRHS {
      // The following are not used but enable
      // expr_cast<MatmulRHS>::... to work
      static const int  rank      = A::rank;
      static const bool is_active = A::is_active;
      static const int  n_arrays  = 0;
      // n_active is a count, so it is an int here as in the other
      // expression classes (it was previously declared bool)
      static const int  n_active  = 0;
      static const bool is_lvalue = false;
      static const bool is_vectorizable = false;
      static const int  n_scratch = 0;
      // The following are necessary in order that other binary
      // operator* functions can compile, even if they are rejected
      // for a particular multiplication
      typedef typename A::type type;
      typedef bool _adept_expression_flag;
      // Constructor simply saves a reference to the expression
      // argument
      MatmulRHS(const A& a) : array(a) { }
      const A& array;
    };
  }

  // Unary "*" (dereference) operator applied to an expression of
  // rank 1 or 2: wraps the expression in a MatmulRHS object, which
  // is the first step in evaluating the "A ** B" pseudo-operator
  template <typename Type, class A>
  inline
  typename internal::enable_if<(A::rank == 1 || A::rank == 2),
                               internal::MatmulRHS<A> >::type
  operator*(const Expression<Type,A>& a) {
    typedef internal::MatmulRHS<A> rhs_type;
    return rhs_type(a.cast());
  }

  // Multiply operator with a MatmulRHS object on the right-hand-side
  // will call the matmul function, passing the left-hand expression
  // and the expression referenced by the MatmulRHS object.  This is
  // the second step in evaluating the "A ** B" pseudo-operator.  The
  // rank of the result is the sum of the ranks of the arguments
  // minus two, and the result is active if either argument is.
  template <typename LType, class L, class R>
  inline
  Array<L::rank+R::rank-2,typename internal::promote<LType,typename R::type>::type,
	(L::is_active||R::is_active)>
  operator*(const Expression<LType,L>& left, const internal::MatmulRHS<R>& right) {
    return matmul(left.cast(),right.array.cast());
  }


} // End namespace adept

#endif


================================================
FILE: include/adept/noalias.h
================================================
/* noalias.h -- Wrap an expression so that alias checking is not performed

    Copyright (C) 2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/

#ifndef AdeptNoalias_H
#define AdeptNoalias_H

#include <adept/Expression.h>

namespace adept {

  namespace internal {

    // No-alias wrapper for enabling noalias(): an expression that
    // forwards every request to the expression it wraps, except that
    // is_aliased_ always returns false so that alias checking is
    // skipped.  Only a reference to the wrapped expression is held.
    // The wrapper adds no scratch storage of its own (n_scratch is
    // that of the wrapped expression), so array and scratch numbers
    // are always passed on to the wrapped expression unchanged.
    template <typename Type, class R>
    struct NoAlias
      : public Expression<Type, NoAlias<Type, R> > 
    {
      // Static properties are those of the wrapped expression
      static const int  rank       = R::rank;
      static const bool is_active  = R::is_active;
      static const int  n_active   = R::n_active;
      static const int  n_scratch  = R::n_scratch;
      static const int  n_arrays   = R::n_arrays;
      static const bool is_vectorizable = R::is_vectorizable;

      // The wrapped expression
      const R& arg;

      NoAlias(const Expression<Type, R>& arg_)
        : arg(arg_.cast()) { }
      
      template <int Rank>
      bool get_dimensions_(ExpressionSize<Rank>& dim) const {
        return arg.get_dimensions(dim);
      }

//       Index get_dimension_with_len(Index len) const {
// 	return arg.get_dimension_with_len_(len);
//       }

      std::string expression_string_() const {
        std::string str = "noalias(";
        str += arg.expression_string() + ")";
        return str;
      }

      // The whole point of this class: never report aliasing
      bool is_aliased_(const Type* mem1, const Type* mem2) const {
        return false;
      }

      bool all_arrays_contiguous_() const {
        return arg.all_arrays_contiguous_(); 
      }
 
      bool is_aligned_() const {
        return arg.is_aligned_();
      } 
     
      template <int n>
      int alignment_offset_() const { 
        return arg.template alignment_offset_<n>();
      }

      // Value of the wrapped expression at index i.  This previously
      // passed the value through an "operation" function that does
      // not exist in this class, so it could not be instantiated.
      Type value_with_len_(Index i, Index len) const {
        return arg.value_with_len(i, len);
      }

      // As above, retained for any caller that supplies the template
      // argument explicitly; Rank cannot be deduced from the
      // function arguments
      template <int Rank>
      Type value_with_len_(Index i, Index len) const {
        return arg.value_with_len(i, len);
      }
      
      template <int MyArrayNum, int NArrays>
      void advance_location_(ExpressionSize<NArrays>& loc) const {
        arg.template advance_location_<MyArrayNum>(loc);
      }

      template <int MyArrayNum, int NArrays>
      Type value_at_location_(const ExpressionSize<NArrays>& loc) const {
        return arg.template value_at_location_<MyArrayNum>(loc);
      }
      template <int MyArrayNum, int NArrays>
      Packet<Type> packet_at_location_(const ExpressionSize<NArrays>& loc) const {
        return arg.template packet_at_location_<MyArrayNum>(loc);
      }

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_at_location_store_(const ExpressionSize<NArrays>& loc,
                                    ScratchVector<NScratch>& scratch) const {
        return arg.template value_at_location_store_<MyArrayNum,MyScratchNum>(loc, 
                                                                     scratch);
      }

      // This class writes nothing to the scratch vector itself, so
      // the stored value must be requested from the wrapped
      // expression rather than read from scratch[MyScratchNum]
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_stored_(const ExpressionSize<NArrays>& loc,
                         const ScratchVector<NScratch>& scratch) const {
        return arg.template value_stored_<MyArrayNum,MyScratchNum>(loc, scratch);
      }

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      void calc_gradient_(Stack& stack, 
                          const ExpressionSize<NArrays>& loc,
                          const ScratchVector<NScratch>& scratch) const {
        arg.template calc_gradient_<MyArrayNum, MyScratchNum>(stack, loc, 
                                                              scratch);
      }

      // As the previous but multiplying the gradient by
      // "multiplier".  The scratch number is passed on unchanged, as
      // in the overload above, because this class occupies no
      // scratch slot (previously MyScratchNum+1 was passed).
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
                typename MyType>
      void calc_gradient_(Stack& stack, 
                          const ExpressionSize<NArrays>& loc,
                          const ScratchVector<NScratch>& scratch,
                          MyType multiplier) const {
        arg.template calc_gradient_<MyArrayNum, MyScratchNum>(stack, loc, 
                                                              scratch,
                                                              multiplier);
      }

      template <int MyArrayNum, int Rank, int NArrays>
      void set_location_(const ExpressionSize<Rank>& i, 
                         ExpressionSize<NArrays>& index) const {
        arg.template set_location_<MyArrayNum>(i, index);
      }

    }; // End struct NoAlias

  }

  // Wrap an expression in a NoAlias object, whose is_aliased_
  // function always returns false.  The returned object refers to
  // the expression passed in rather than copying it.
  template <typename Type, class R>
  inline
  adept::internal::NoAlias<Type, R>
  noalias(const Expression<Type, R>& r) {
    typedef adept::internal::NoAlias<Type, R> wrapper_type;
    return wrapper_type(r.cast());
  }

  // Overload of noalias for arguments that are not Adept expressions
  // (selected when internal::is_not_expression<Type>::value is
  // true): a copy of the argument is returned unchanged
  template <typename Type>
  inline
  typename internal::enable_if<internal::is_not_expression<Type>::value, Type>::type
  noalias(const Type& r) {
    return r;
  }

}

#endif


================================================
FILE: include/adept/outer_product.h
================================================
/* outer_product.h -- Compute the outer product of two vectors

    Copyright (C) 2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/
                             
#ifndef AdeptOuterProduct_H
#define AdeptOuterProduct_H

#include <adept/BinaryOperation.h>
#include <adept/Array.h>

namespace adept {

  namespace internal {

    // Expression representing an outer product: a rank-2 expression
    // whose element (i,j) is the product of element i of the left
    // vector and element j of the right vector.  Type is the element
    // type of the result; LType/L and RType/R are the element types
    // and expression classes of the two arguments.
    template <typename Type, typename LType, class L, typename RType, class R>
    class OuterProduct
      : public Expression<Type, OuterProduct<Type,LType,L,RType,R> > {

      // The arguments are held as rank-1 arrays with the element type
      // and activeness of the original expressions
      typedef Array<1,LType,L::is_active> LArray;
      typedef Array<1,RType,R::is_active> RArray;

    public:
      // Static data
      static const int rank  = 2;
      static const bool is_active  = L::is_active || R::is_active;
      // The product is stored in the scratch vector only if the
      // expression is active, in which case one local scratch slot is
      // used
      static const int  store_result = is_active;
      static const int  n_active  = LArray::n_active + RArray::n_active;
      static const int  n_local_scratch = store_result; 
      static const int  n_scratch 
        = n_local_scratch + LArray::n_scratch + RArray::n_scratch;
      static const int  n_arrays  = LArray::n_arrays + RArray::n_arrays;
      // Currently not vectorizable because the current design always
      // has the array index increasing
      //      static const bool is_vectorizable = is_same<LType,RType>::value;
      static const bool is_vectorizable = false;//is_same<LType,RType>::value;

    protected:

      // DATA: need to store actual arrays to avoid temporaries going
      // out of scope before they're used; note that if an array is
      // passed in then a shallow copy is made.
      const LArray left;
      const RArray right;
 
    public:

      // Construct from the two argument expressions, each of which is
      // converted to a rank-1 array
      OuterProduct(const Expression<LType,L>& left_,
		   const Expression<RType,R>& right_) 
	: left(left_.cast()), right(right_.cast()) { }

      // The dimensions of the result are the lengths of the left and
      // right vectors; returns false if either is empty
      bool get_dimensions_(ExpressionSize<2>& dim) const {
	dim[0] = left.size();
	dim[1] = right.size();

	return dim[0] > 0 && dim[1] > 0;
      }

      std::string expression_string_() const {
	return "outer_product(" + left.expression_string() + ","
	  + right.expression_string() + ")";
      }

      // Aliasing is never reported by this expression
      bool is_aliased_(const Type* mem1, const Type* mem2) const {
	return false;
      }

      // Contiguity and alignment are reported for the right vector
      // only, which is the only one advanced by advance_location_
      bool all_arrays_contiguous_() const {
	return right.all_arrays_contiguous_();
      }
 
      bool is_aligned_() const {
	return right.is_aligned_();
      }
      
      template <int n>
      int alignment_offset_() const {
	return right.template alignment_offset_<n>();
      }

      // Do not implement value_with_len_

      // Advance along a row of the result only: the location in the
      // right vector is advanced while the location in the left
      // vector is left unchanged
      template <int MyArrayNum, int NArrays>
      void advance_location_(ExpressionSize<NArrays>& loc) const {
	right.template advance_location_<MyArrayNum+LArray::n_arrays>(loc);
      }

      // Value of the outer product at the current location: the
      // product of the current elements of the left and right vectors
      template <int MyArrayNum, int NArrays>
      Type value_at_location_(const ExpressionSize<NArrays>& loc) const {
	return left.template value_at_location_<MyArrayNum>(loc)
  	    * right.template value_at_location_<MyArrayNum+LArray::n_arrays>(loc);
      }

      // This does not work because the array index is always
      // increased which it shouldn't be for the left vector. For this
      // reason, vectorization is turned off (see is_vectorizable
      // above)
      template <int MyArrayNum, int NArrays>
      Packet<Type> packet_at_location_(const ExpressionSize<NArrays>& loc) const {
	// The LHS of the following multiplication returns a packet
	// containing repeated values of the left vector at one
	// location
	return Packet<Type>(left.template value_at_location_<MyArrayNum>(loc)) // <- fix!
	  * right.template packet_at_location_<MyArrayNum+LArray::n_arrays>(loc);
      }

      // As value_at_location_, but the product is also saved in
      // scratch[MyScratchNum]; the scratch slots of the left argument
      // follow the local slot, and those of the right argument follow
      // those of the left
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_at_location_store_(const ExpressionSize<NArrays>& loc,
				    ScratchVector<NScratch>& scratch) const {
	return scratch[MyScratchNum] = 
	  left.template value_at_location_store_<MyArrayNum,MyScratchNum+n_local_scratch>(loc, scratch)
	  * right.template value_at_location_store_<MyArrayNum+LArray::n_arrays,
					   MyScratchNum+LArray::n_scratch+n_local_scratch>(loc, scratch);
      }

      // Return the value saved in scratch[MyScratchNum]
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_stored_(const ExpressionSize<NArrays>& loc,
			 const ScratchVector<NScratch>& scratch) const {
	return scratch[MyScratchNum];
      }
      
      // Set the location in each vector from the rank-2 index i: the
      // row index i[0] locates the element of the left vector and the
      // column index i[1] locates the element of the right vector
      template <int MyArrayNum, int NArrays>
      void set_location_(const ExpressionSize<2>& i, 
			 ExpressionSize<NArrays>& index) const {
	left.template  set_location_<MyArrayNum>(ExpressionSize<1>(i[0]), index);
	right.template set_location_<MyArrayNum+LArray::n_arrays>(ExpressionSize<1>(i[1]), index);
      }

      // Compute the gradient contributions of the left and right
      // arguments at the current location; each of the helper
      // functions below does nothing if its argument is inactive
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      void calc_gradient_(Stack& stack, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch) const {
        calc_left_ <MyArrayNum, MyScratchNum>(stack, left,  loc, scratch);
        calc_right_<MyArrayNum, MyScratchNum>(stack, right, loc, scratch);
      }

      // As the previous but multiplying the gradient by "multiplier"
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, typename MyType>
      void calc_gradient_(Stack& stack, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch,
			  MyType multiplier) const {
        calc_left_ <MyArrayNum, MyScratchNum>(stack, left,  loc, scratch, multiplier);
        calc_right_<MyArrayNum, MyScratchNum>(stack, right, loc, scratch, multiplier);
      }

    protected:
      // Only calculate gradients for left and right arguments if they
      // are active; otherwise do nothing.  The active versions
      // delegate to the calc_left and calc_right functions of
      // Multiply.  Note that in each helper the function argument
      // hides the data member of the same name, while the other
      // vector is taken from the data member.
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class MyLType>
      typename enable_if<MyLType::is_active,void>::type
      calc_left_(Stack& stack, const MyLType& left, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch) const {
	Multiply::template calc_left<MyArrayNum, MyScratchNum>(stack, left, right, loc, scratch);
      }

      // Inactive left argument: nothing to do
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class MyLType>
      typename enable_if<!MyLType::is_active,void>::type
      calc_left_(Stack& stack, const MyLType& left, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch) const { }

      // Active right argument
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class MyRType>
      typename enable_if<MyRType::is_active,void>::type
      calc_right_(Stack& stack, const MyRType& right, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch) const {
	Multiply::template calc_right<MyArrayNum, MyScratchNum>(stack, left, right, loc, scratch);
      }

      // Inactive right argument: nothing to do
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class MyRType>
      typename enable_if<!MyRType::is_active,void>::type
      calc_right_(Stack& stack, const MyRType& right, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch) const { }

      // The remaining helpers are the same as the four above but with
      // the gradient multiplied by "multiplier"
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class MyLType, typename MyType>
      typename enable_if<MyLType::is_active,void>::type
      calc_left_(Stack& stack, const MyLType& left, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch, MyType multiplier) const {
	Multiply::template calc_left<MyArrayNum, MyScratchNum>(stack, left, right, loc, scratch, multiplier);
      }

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class MyLType, typename MyType>
      typename enable_if<!MyLType::is_active,void>::type
      calc_left_(Stack& stack, const MyLType& left, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch, MyType multiplier) const { }


      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class MyRType, typename MyType>
      typename enable_if<MyRType::is_active,void>::type
      calc_right_(Stack& stack, const MyRType& right, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch, MyType multiplier) const {
	Multiply::template calc_right<MyArrayNum, MyScratchNum>(stack, left, right, loc, scratch, multiplier);
      }

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch, class MyRType, typename MyType>
      typename enable_if<!MyRType::is_active,void>::type
      calc_right_(Stack& stack, const MyRType& right, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch, MyType multiplier) const { }
    };
   
  }

  // The outer_product function: returns an OuterProduct expression
  // built from the two arguments, whose element type is the
  // promotion of the element types of the arguments
  template <typename LType, class L, typename RType, class R>
  internal::OuterProduct<typename internal::promote<LType,RType>::type,LType,L,RType,R>
  outer_product(const Expression<LType,L>& l, const Expression<RType,R>& r) {
    typedef typename internal::promote<LType,RType>::type promoted_type;
    typedef internal::OuterProduct<promoted_type,LType,L,RType,R> product_type;
    return product_type(l, r);
  }

}


#endif


================================================
FILE: include/adept/quick_e.h
================================================
/* quick_e.h -- Fast exponential function for Intel and ARM intrinsics

   Copyright (C) 2020 European Centre for Medium-Range Weather Forecasts

   Author: Robin Hogan <r.j.hogan@ecmwf.int>

   This file is part of the Adept library, although it can also be
   used stand-alone.

   The exponential function for real arguments is used in many areas
   of physics, yet is not vectorized by many compilers.  This C++
   header file provides a fast exponential function (quick_e::exp) for
   single and double precision floating point numbers, Intel
   intrinsics representing packets of 2, 4, 8 and 16 such numbers, and
   ARM NEON intrinsics representing 2 doubles or 4 floats.  The
   algorithm has been taken from Agner Fog's Vector Class Library. It
   is designed to be used in other libraries that make use of Intel or
   ARM intrinsics.  Since such libraries often define their own
   classes for representing vectors of numbers, this file does not
   define any such classes itself.

   Also in the namespace quick_e, this file defines the following
   inline functions that work on intrinsics of type "Vec" and the
   corresponding scalar type "Sca":

     Vec add(Vec x, Vec y)   Add the elements of x and y
     Vec sub(Vec x, Vec y)   Subtract the elements of x and y
     Vec mul(Vec x, Vec y)   Multiply the elements of x and y
     Vec div(Vec x, Vec y)   Divide the elements of x and y
     Vec set0<Vec>()         Returns zero in all elements
     Vec set1<Vec>(Sca a)    Returns all elements set to a
     Vec sqrt(Vec x)         Square root of all elements
     Vec fmin(Vec x, Vec y)  Minimum of elements of x and y
     Vec fmax(Vec x, Vec y)  Maximum of elements of x and y
     Vec load(const Sca* d)  Aligned load from memory location d
     Vec loadu(const Sca* d) Unaligned load from memory location d
     void store(Sca* d, Vec x)  Aligned store of x to d
     void storeu(Sca* d, Vec x) Unaligned store of x to d
     Sca hsum(Vec x)         Horizontal sum of elements of x
     Sca hmul(Vec x)         Horizontal product of elements of x
     Sca hmin(Vec x)         Horizontal minimum of elements of x
     Sca hmax(Vec x)         Horizontal maximum of elements of x
     Vec fma(Vec x, Vec y, Vec z)  Fused multiply-add: (x*y)+z
     Vec fnma(Vec x, Vec y, Vec z) Returns z-(x*y)
     Vec pow2n(Vec x)        Returns 2 to the power of x
     Vec exp(Vec x)          Returns exponential of x
   
 */

#ifndef QuickE_H
#define QuickE_H 1

#include <stdint.h>

#include <algorithm>
#include <cmath>
#include <limits>
#include <ostream>

// Microsoft compiler doesn't define __SSE2__ even if __AVX__ is
// defined
#ifdef __AVX__
#ifndef __SSE2__
#define __SSE2__ 1
#endif
#endif

// Headers needed for x86 vector intrinsics
#ifdef __SSE2__
  #include <xmmintrin.h> // SSE
  #include <emmintrin.h> // SSE2
  // Numerous platforms don't define _mm_undefined_ps in xmmintrin.h,
  // so we assume none do, except GCC >= 4.9.1 and CLANG >= 3.8.0.
  // Those that don't use an equivalent function that sets the
  // elements to zero.
  #define QE_MM_UNDEFINED_PS _mm_setzero_ps
  #ifdef __clang__
    #if __has_builtin(__builtin_ia32_undef128)
      #undef QE_MM_UNDEFINED_PS
      #define QE_MM_UNDEFINED_PS _mm_undefined_ps
    #endif
  #elif defined(__GNUC__)
    #define GCC_VERSION (__GNUC__ * 10000 \
			 + __GNUC_MINOR__ * 100	\
			 + __GNUC_PATCHLEVEL__)
    #if GCC_VERSION >= 40901
      #undef QE_MM_UNDEFINED_PS
      #define QE_MM_UNDEFINED_PS _mm_undefined_ps
    #endif
    #undef GCC_VERSION
  #endif // __clang__/__GNUC__
#endif // __SSE2__

#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#ifdef __AVX__
  #include <tmmintrin.h> // SSSE3
  #include <immintrin.h> // AVX
#endif

#ifdef __AVX512F__
  #include <immintrin.h>
#endif

#ifdef __ARM_NEON
  // We only have sufficient floating-point intrinsics to vectorize on
  // 64-bit ARM targets
  #if defined(__aarch64__) || defined(_M_ARM64)
    #define QE_HAVE_ARM64_NEON 1
    #include "arm_neon.h"
  #endif
#endif

namespace quick_e {

  // -------------------------------------------------------------------
  // Traits
  // -------------------------------------------------------------------

  // Primary packet trait, used for any (Type, Size) combination that
  // has not been specialized: reports that no packet is available
  // and falls back to the scalar type with a size of one
  template <typename Type, int Size>
  struct packet {
    typedef Type type;
    static const int  size         = 1;
    static const bool is_available = false;
  };
  // Primary longest_packet trait: unless specialized, the longest
  // packet of Type is just a single scalar
  template <typename Type>
  struct longest_packet {
    static const int size = 1;
    typedef Type type;
  };

  // Specialize packet<TYPE,SIZE> to describe an available SIMD
  // packet: "type" is the intrinsic VEC and "half_type" is HALF_TYPE.
  // Note that g++ issues ugly warnings if VEC is an Intel intrinsic;
  // they can be disabled with -Wno-ignored-attributes
#define QE_DEFINE_TRAITS(TYPE, SIZE, VEC, HALF_TYPE) \
  template <>                                        \
  struct packet<TYPE,SIZE> {                         \
    typedef VEC type;                                \
    typedef HALF_TYPE half_type;                     \
    static const int  size = SIZE;                   \
    static const bool is_available = true;           \
  };

// Specialize longest_packet for float (packet type VECS) and double
// (packet type VECD); the number of elements is derived from the
// size of the packet type
#define QE_DEFINE_LONGEST(VECS, VECD)                     \
  template <>                                             \
  struct longest_packet<float> {                          \
    static const int size = sizeof(VECS)/sizeof(float);   \
    typedef VECS type;                                    \
  };                                                      \
  template <>                                             \
  struct longest_packet<double> {                         \
    static const int size = sizeof(VECD)/sizeof(double);  \
    typedef VECD type;                                    \
  };
  
#ifdef __SSE2__
  // x86: packets of 4 floats and 2 doubles come with SSE2; AVX and
  // AVX512F each add a longer pair of packets
  #define QE_HAVE_FAST_EXP 1
  QE_DEFINE_TRAITS(float, 4, __m128, __m128)
  QE_DEFINE_TRAITS(double, 2, __m128d, double)
  #ifdef __AVX__
    QE_DEFINE_TRAITS(float, 8, __m256, __m128)
    QE_DEFINE_TRAITS(double, 4, __m256d, __m128d)
    #ifdef __AVX512F__
      QE_DEFINE_TRAITS(float, 16, __m512, __m256)
      QE_DEFINE_TRAITS(double, 8, __m512d, __m256d)
      QE_DEFINE_LONGEST(__m512, __m512d)
      #define QE_LONGEST_FLOAT_PACKET 16
      #define QE_LONGEST_DOUBLE_PACKET 8
    #else
      QE_DEFINE_LONGEST(__m256, __m256d)
      #define QE_LONGEST_FLOAT_PACKET 8
      #define QE_LONGEST_DOUBLE_PACKET 4
    #endif
  #else
    QE_DEFINE_LONGEST(__m128, __m128d)
    #define QE_LONGEST_FLOAT_PACKET 4
    #define QE_LONGEST_DOUBLE_PACKET 2
  #endif
  // If QE_AVAILABLE is defined then we can use the fast exponential.
  // NOTE(review): QE_AVAILABLE is only defined in this x86 branch,
  // not in the ARM NEON branch below; confirm that this is intended.
  #define QE_AVAILABLE
#elif defined(QE_HAVE_ARM64_NEON)
  // 64-bit ARM NEON: packets of 4 floats and 2 doubles
  #define QE_HAVE_FAST_EXP 1
  QE_DEFINE_TRAITS(float, 4, float32x4_t, float32x4_t)
  QE_DEFINE_TRAITS(double, 2, float64x2_t, double)
  QE_DEFINE_LONGEST(float32x4_t, float64x2_t)
  #define QE_LONGEST_FLOAT_PACKET 4
  #define QE_LONGEST_DOUBLE_PACKET 2
#else
  // No vectorization available: longest packet is of size 1.  There
  // is deliberately no semicolon after the macro call: it already
  // expands to complete struct definitions, and a stray ";" at
  // namespace scope is rejected by pedantic C++98 compilers.
  QE_DEFINE_LONGEST(float, double)
  #define QE_LONGEST_FLOAT_PACKET 1
  #define QE_LONGEST_DOUBLE_PACKET 1
#endif
  
  
  // -------------------------------------------------------------------
  // Scalars
  // -------------------------------------------------------------------
  
  // Scalar counterparts of the packet operations, so that the same
  // implementation of "exp" can be used for both scalars and SIMD
  // vectors.  All are marked inline for consistency with the packet
  // versions defined later in this file.

  // Element-wise arithmetic
  template <typename T> inline T add(T x, T y) { return x+y; }
  template <typename T> inline T sub(T x, T y) { return x-y; }
  template <typename T> inline T mul(T x, T y) { return x*y; }
  template <typename T> inline T div(T x, T y) { return x/y; }
  template <typename T> inline T neg(T x)      { return -x;  }

  // Loads and stores: for a scalar there is no distinction between
  // the aligned and unaligned forms
  template <typename T, typename V> inline void store(T* d, V x) { *d = x;  }
  template <typename T, typename V> inline void storeu(T* d, V x){ *d = x;  }
  template <typename V, typename T> inline V load(const T* d) { return *d;  }
  template <typename V, typename T> inline V loadu(const T* d){ return *d;  }

  // Broadcast a value / return zero; the result type V must be given
  // explicitly by the caller
  template <typename V, typename T> inline V set1(T x) { return x;   }
  template <typename V> inline V set0() { return 0.0; }

  template <typename T> inline T sqrt(T x) { return std::sqrt(x); }

  // "Horizontal" reductions of a single value are the value itself
  template <typename T> inline T hsum(T x) { return x; }
  template <typename T> inline T hmul(T x) { return x; }
  template <typename T> inline T hmin(T x) { return x; }
  template <typename T> inline T hmax(T x) { return x; }

  // Multiply-add written as separate operations: fma returns
  // (x*y)+z and fnma returns z-(x*y)
  template <typename T> inline T fma(T x, T y, T z)  { return (x*y)+z; }
  template <typename T> inline T fnma(T x, T y, T z) { return z-(x*y); }

  // Generic minimum and maximum
  template <typename T> inline T fmin(T x, T y)  { return std::min(x,y); }
  template <typename T> inline T fmax(T x, T y)  { return std::max(x,y); }

#if __cplusplus > 199711L
  // From C++11, use the standard-library fmin/fmax for float and double
  template <> inline float  fmin(float x, float y)   { return std::fmin(x,y); }
  template <> inline double fmin(double x, double y) { return std::fmin(x,y); }
  template <> inline float  fmax(float x, float y)   { return std::fmax(x,y); }
  template <> inline double fmax(double x, double y) { return std::fmax(x,y); }
#endif

  // Scalar select: returns y1 if x1 > x2, and y2 otherwise (which
  // includes the case where the comparison involves a NaN)
  inline float select_gt(float x1, float x2, float y1, float y2) {
    return (x1 > x2) ? y1 : y2;
  }
  inline double select_gt(double x1, double x2, double y1, double y2) {
    return (x1 > x2) ? y1 : y2;
  }
  
  // Scalar range test: true if low_bound <= x <= high_bound; a NaN
  // fails both comparisons and so yields false
  inline bool all_in_range(float x, float low_bound, float high_bound) {
    const bool not_below = (x >= low_bound);
    const bool not_above = (x <= high_bound);
    return not_below && not_above;
  }
  inline bool all_in_range(double x, double low_bound, double high_bound) {
    const bool not_below = (x >= low_bound);
    const bool not_above = (x <= high_bound);
    return not_below && not_above;
  }
  
  // -------------------------------------------------------------------
  // Macros to define mathematical operations
  // -------------------------------------------------------------------

  // Basic load, store, arithmetic, sqrt, min and max for the packet
  // type VEC with elements of type TYPE.  The remaining arguments
  // name the intrinsics that implement each operation.  Also defines
  // a stream-output operator that writes the elements between braces.
#define QE_DEFINE_BASIC(TYPE, VEC, LOAD, LOADU, SET0, SET1,        \
                        STORE, STOREU, ADD, SUB, MUL, DIV,         \
                        SQRT, FMIN, FMAX)                          \
  inline VEC add(VEC x, VEC y) { return ADD(x, y); }               \
  inline VEC sub(VEC x, VEC y) { return SUB(x, y); }               \
  inline VEC mul(VEC x, VEC y) { return MUL(x, y); }               \
  inline VEC div(VEC x, VEC y) { return DIV(x, y); }               \
  inline VEC neg(VEC x) { return SUB(SET0(), x); }                 \
  template <> inline VEC set0<VEC>() { return SET0(); }            \
  template <> inline VEC set1<VEC>(TYPE x) { return SET1(x); }     \
  inline VEC sqrt(VEC x) { return SQRT(x); }                       \
  inline VEC fmin(VEC x, VEC y) { return FMIN(x,y); }              \
  inline VEC fmax(VEC x, VEC y) { return FMAX(x,y); }              \
  template <> inline VEC load<VEC,TYPE>(const TYPE* d) {           \
    return LOAD(d);                                                \
  }                                                                \
  template <> inline VEC loadu<VEC,TYPE>(const TYPE* d) {          \
    return LOADU(d);                                               \
  }                                                                \
  inline void store(TYPE* d, VEC x) { STORE(d, x); }               \
  inline void storeu(TYPE* d, VEC x) { STOREU(d, x); }             \
  inline std::ostream& operator<<(std::ostream& os, VEC x) {       \
    static const int n_elem = sizeof(VEC)/sizeof(TYPE);            \
    union { VEC packed; TYPE elem[n_elem]; };                      \
    packed = x;                                                    \
    os << "{";                                                     \
    for (int i = 0; i < n_elem; ++i) {                             \
      os << " " << elem[i];                                        \
    }                                                              \
    os << "}";                                                     \
    return os;                                                     \
  }
  
// Define low(), high() and pack() for packet type VEC and its
// half-length type HALF_TYPE.  LOW and HIGH are expressions written
// in terms of the argument "x", and PACK in terms of "x" and "y",
// so those argument names must not be changed.
#define QE_DEFINE_CHOP(VEC, HALF_TYPE, LOW, HIGH, PACK) \
  inline HALF_TYPE low(VEC x) {                         \
    return LOW;                                         \
  }                                                     \
  inline HALF_TYPE high(VEC x) {                        \
    return HIGH;                                        \
  }                                                     \
  inline VEC pack(HALF_TYPE x, HALF_TYPE y) {           \
    return PACK;                                        \
  }
  
  // Reduction operations: horizontal sum, product, minimum and
  // maximum of packet type VEC, each forwarding to the named
  // intrinsic and returning a scalar of type TYPE
#define QE_DEFINE_HORIZ(TYPE, VEC, HSUM, HMUL, HMIN, HMAX) \
  inline TYPE hsum(VEC x) { return HSUM(x); }              \
  inline TYPE hmul(VEC x) { return HMUL(x); }              \
  inline TYPE hmin(VEC x) { return HMIN(x); }              \
  inline TYPE hmax(VEC x) { return HMAX(x); }

  // Define fused multiply-add functions for intrinsics that take
  // their arguments in the order (x, y, z) to compute (x*y)+z, or
  // z-(x*y) in the case of FNMA.  Overloads taking scalars broadcast
  // them with set1 first.
#define QE_DEFINE_FMA(TYPE, VEC, FMA, FNMA)        \
  inline VEC fma(VEC x, VEC y, VEC z) {            \
    return FMA(x,y,z);                             \
  }                                                \
  inline VEC fma(VEC x, TYPE y, VEC z) {           \
    return FMA(x,set1<VEC>(y),z);                  \
  }                                                \
  inline VEC fma(TYPE x, VEC y, TYPE z) {          \
    return FMA(set1<VEC>(x),y,set1<VEC>(z));       \
  }                                                \
  inline VEC fma(VEC x, VEC y, TYPE z) {           \
    return FMA(x,y,set1<VEC>(z));                  \
  }                                                \
  inline VEC fnma(VEC x, VEC y, VEC z) {           \
    return FNMA(x,y,z);                            \
  }

  // Alternative order of arguments for ARM NEON: the intrinsics are
  // called with the addend first, i.e. FMA(z, x, y) for fma(x, y, z)
#define QE_DEFINE_FMA_ALT(TYPE, VEC, FMA, FNMA)    \
  inline VEC fma(VEC x, VEC y, VEC z) {            \
    return FMA(z,x,y);                             \
  }                                                \
  inline VEC fma(VEC x, TYPE y, VEC z) {           \
    return FMA(z,x,set1<VEC>(y));                  \
  }                                                \
  inline VEC fma(TYPE x, VEC y, TYPE z) {          \
    return FMA(set1<VEC>(z),set1<VEC>(x),y);       \
  }                                                \
  inline VEC fma(VEC x, VEC y, TYPE z) {           \
    return FMA(set1<VEC>(z),x,y);                  \
  }                                                \
  inline VEC fnma(VEC x, VEC y, VEC z) {           \
    return FNMA(z,x,y);                            \
  }
  
  // Emulate fused multiply-add if instruction not available, using
  // a separate multiply followed by an add (or subtract for fnma)
#define QE_EMULATE_FMA(TYPE, VEC)                      \
  inline VEC fma(VEC x, VEC y, VEC z) {                \
    return add(mul(x,y),z);                            \
  }                                                    \
  inline VEC fma(VEC x, TYPE y, VEC z) {               \
    return add(mul(x,set1<VEC>(y)),z);                 \
  }                                                    \
  inline VEC fma(TYPE x, VEC y, TYPE z) {              \
    return add(mul(set1<VEC>(x),y),set1<VEC>(z));      \
  }                                                    \
  inline VEC fma(VEC x, VEC y, TYPE z) {               \
    return add(mul(x,y),set1<VEC>(z));                 \
  }                                                    \
  inline VEC fnma(VEC x, VEC y, VEC z) {               \
    return sub(z,mul(x,y));                            \
  }

// Define pow2n for a single-precision packet VEC: add bias+2^23 to
// n, reinterpret the result as the integer packet VECI, shift left
// by 23 bits and reinterpret back as floating point
#define QE_DEFINE_POW2N_S(VEC, VECI, CASTTO, CASTBACK, SHIFTL,  \
                          SETELEM)                              \
  inline VEC pow2n(VEC n) {                                     \
    const float bias = 127.0;                                   \
    const float pow2_23 = 8388608.0;                            \
    VEC  biased = add(n, set1<VEC>(bias+pow2_23));              \
    VECI bits = CASTTO(biased);                                 \
    VECI shifted = SHIFTL(bits, SETELEM(23));                   \
    return CASTBACK(shifted);                                   \
  }
// Define pow2n for a double-precision packet VEC: add bias+2^52 to
// n, reinterpret the result as the integer packet VECI, shift left
// by 52 bits and reinterpret back as floating point
#define QE_DEFINE_POW2N_D(VEC, VECI, CASTTO, CASTBACK, SHIFTL,  \
                          SETELEM)                              \
  inline VEC pow2n(VEC n) {                                     \
    const double bias = 1023.0;                                 \
    const double pow2_52 = 4503599627370496.0;                  \
    VEC  biased = add(n, set1<VEC>(bias+pow2_52));              \
    VECI bits = CASTTO(biased);                                 \
    VECI shifted = SHIFTL(bits, SETELEM(52));                   \
    return CASTBACK(shifted);                                   \
  }

  // -------------------------------------------------------------------
  // Define operations for SSE2: vector of 4 floats or 2 doubles
  // -------------------------------------------------------------------
  

#ifdef __SSE2__
  // Basic operations on packets of 4 floats
  QE_DEFINE_BASIC(float, __m128,
                  _mm_load_ps, _mm_loadu_ps,
                  _mm_setzero_ps, _mm_set1_ps,
                  _mm_store_ps, _mm_storeu_ps,
                  _mm_add_ps, _mm_sub_ps, _mm_mul_ps, _mm_div_ps,
                  _mm_sqrt_ps, _mm_min_ps, _mm_max_ps)
  // Basic operations on packets of 2 doubles
  QE_DEFINE_BASIC(double, __m128d,
                  _mm_load_pd, _mm_loadu_pd,
                  _mm_setzero_pd, _mm_set1_pd,
                  _mm_store_pd, _mm_storeu_pd,
                  _mm_add_pd, _mm_sub_pd, _mm_mul_pd, _mm_div_pd,
                  _mm_sqrt_pd, _mm_min_pd, _mm_max_pd)
  // Don't define chop operations for __m128 because we don't have a
  // container for two floats.  For __m128d the two halves are plain
  // doubles.
  QE_DEFINE_CHOP(__m128d, double,
                 _mm_cvtsd_f64(x),
                 _mm_cvtsd_f64(_mm_unpackhi_pd(x,x)),
                 _mm_set_pd(y,x))
		 
  // No built-in horizontal operations for SSE2, so need to implement
  // by hand.  FUNC is the name of the reduction to define; OP_PS,
  // OP_SS and OP_PD are the packed-single, scalar-single and
  // packed-double intrinsics that combine two values.
#define QE_DEFINE_HORIZ_SSE2(FUNC, OP_PS, OP_SS, OP_PD)                   \
  inline float FUNC(__m128 x) {                                           \
    __m128 swapped = _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1));       \
    __m128 pairs   = OP_PS(x, swapped);                                   \
    __m128 upper   = _mm_movehl_ps(swapped, pairs);                       \
    return _mm_cvtss_f32(OP_SS(pairs, upper));                            \
  }                                                                       \
  inline double FUNC(__m128d x) {                                         \
    __m128  upper_ps = _mm_movehl_ps(QE_MM_UNDEFINED_PS(),                \
                                     _mm_castpd_ps(x));                   \
    __m128d upper    = _mm_castps_pd(upper_ps);                           \
    return _mm_cvtsd_f64(OP_PD(x, upper));                                \
  }
  QE_DEFINE_HORIZ_SSE2(hsum, _mm_add_ps, _mm_add_ss, _mm_add_pd)
  QE_DEFINE_HORIZ_SSE2(hmul, _mm_mul_ps, _mm_mul_ss, _mm_mul_pd)
  QE_DEFINE_HORIZ_SSE2(hmin, _mm_min_ps, _mm_min_ss, _mm_min_pd)
  QE_DEFINE_HORIZ_SSE2(hmax, _mm_max_ps, _mm_max_ss, _mm_max_pd)

  // The helper macros are not needed beyond this point
#undef QE_MM_UNDEFINED_PS
#undef QE_DEFINE_HORIZ_SSE2
  
#ifndef __FMA__
  // No FMA instructions: emulate with a multiply followed by an add
  QE_EMULATE_FMA(float, __m128)
  QE_EMULATE_FMA(double, __m128d)
#else
  // Use the hardware fused multiply-add instructions
  QE_DEFINE_FMA(float, __m128, _mm_fmadd_ps, _mm_fnmadd_ps)
  QE_DEFINE_FMA(double, __m128d, _mm_fmadd_pd, _mm_fnmadd_pd)
#endif
#ifdef __SSE4_1__
  // SSE4.1 has a native rounding instruction: round to the nearest
  // integer with the _MM_FROUND_NO_EXC flag set
  inline __m128 unchecked_round(__m128 x) {
    return _mm_round_ps(x, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
  }
  inline __m128d unchecked_round(__m128d x) {
    return _mm_round_pd(x, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
  }
#else
  // No native function available, but since the arguments are limited
  // to +/- 700, we don't need to check for going out of bounds:
  // convert to 32-bit integers and back again
  inline __m128 unchecked_round(__m128 x) {
    __m128i as_int = _mm_cvtps_epi32(x);
    return _mm_cvtepi32_ps(as_int);
  }
  inline __m128d unchecked_round(__m128d x) {
    __m128i as_int = _mm_cvtpd_epi32(x);
    return _mm_cvtepi32_pd(as_int);
  }

#endif
  inline float unchecked_round(float x)
  { return _mm_cvtss_f32(unchecked_round(_mm_set_ss(x))); }
  inline double unchecked_round(double x)
  { return low(unchecked_round(_mm_set_sd(x))); }

  QE_DEFINE_POW2N_S(__m128, __m128i, _mm_castps_si128,
		    _mm_castsi128_ps, _mm_sll_epi32, _mm_cvtsi32_si128)
  QE_DEFINE_POW2N_D(__m128d, __m128i, _mm_castpd_si128,
		    _mm_castsi128_pd, _mm_sll_epi64, _mm_cvtsi32_si128)
  inline float pow2n(float x)
  { return _mm_cvtss_f32(pow2n(quick_e::set1<__m128>(x))); }
  inline double pow2n(double x)
  { return low(pow2n(quick_e::set1<__m128d>(x))); }

  
  // Return true only if all 128 bits of "a" are set
  inline bool horiz_and(__m128i a) {
#ifdef __SSE4_1__
    // Single instruction: test "a" against an all-ones packet
    const __m128i all_ones = _mm_set1_epi32(-1);
    return _mm_testc_si128(a, all_ones) != 0;
#else
    // AND the upper 64 bits onto the lower 64 bits
    __m128i upper64 = _mm_unpackhi_epi64(a, a);
    __m128i folded64 = _mm_and_si128(a, upper64);
#ifdef __x86_64__
    // 64-bit target: the lower 64 bits can be tested directly
    int64_t lowest = _mm_cvtsi128_si64(folded64);
    return lowest == int64_t(-1);
#else
    // Otherwise fold once more, down to 32 bits, before testing
    __m128i upper32 = _mm_srli_epi64(folded64, 32);
    __m128i folded32 = _mm_and_si128(folded64, upper32);
    int lowest = _mm_cvtsi128_si32(folded32);
    return lowest == -1;
#endif  // __x86_64__
#endif  // SSE 4.1
  }
  // Return true if every element of x satisfies
  // low_bound <= x <= high_bound
  inline bool all_in_range(__m128 x, float low_bound, float high_bound) {
    __m128 not_below = _mm_cmpge_ps(x, set1<__m128>(low_bound));
    __m128 not_above = _mm_cmple_ps(x, set1<__m128>(high_bound));
    __m128 in_range  = _mm_and_ps(not_below, not_above);
    return horiz_and(_mm_castps_si128(in_range));
  }
  inline bool all_in_range(__m128d x, double low_bound, double high_bound) {
    __m128d not_below = _mm_cmpge_pd(x, set1<__m128d>(low_bound));
    __m128d not_above = _mm_cmple_pd(x, set1<__m128d>(high_bound));
    __m128d in_range  = _mm_and_pd(not_below, not_above);
    return horiz_and(_mm_castpd_si128(in_range));
  }

  // Element-wise select: where x1 > x2 take the element of y1,
  // otherwise the element of y2
  inline __m128 select_gt(__m128 x1, __m128 x2,
                          __m128 y1, __m128 y2) {
    __m128 take_y1 = _mm_cmpgt_ps(x1,x2);
#ifdef __SSE4_1__
    return _mm_blendv_ps(y2, y1, take_y1);
#else
    // Without the blend instruction, combine the two masked halves
    __m128 from_y1 = _mm_and_ps(take_y1, y1);
    __m128 from_y2 = _mm_andnot_ps(take_y1, y2);
    return _mm_or_ps(from_y1, from_y2);
#endif
  }
  inline __m128d select_gt(__m128d x1, __m128d x2,
                           __m128d y1, __m128d y2) {
    __m128d take_y1 = _mm_cmpgt_pd(x1,x2);
#ifdef __SSE4_1__
    return _mm_blendv_pd(y2, y1, take_y1);
#else
    // Without the blend instruction, combine the two masked halves
    __m128d from_y1 = _mm_and_pd(take_y1, y1);
    __m128d from_y2 = _mm_andnot_pd(take_y1, y2);
    return _mm_or_pd(from_y1, from_y2);
#endif
  }
#endif

  // -------------------------------------------------------------------
  // Define operations for AVX: vector of 8 floats or 4 doubles
  // -------------------------------------------------------------------
#ifdef __AVX__
  // Basic operations on packets of 8 floats
  QE_DEFINE_BASIC(float, __m256, _mm256_load_ps, _mm256_loadu_ps,
                  _mm256_setzero_ps, _mm256_set1_ps,
                  _mm256_store_ps, _mm256_storeu_ps,
                  _mm256_add_ps, _mm256_sub_ps,
                  _mm256_mul_ps, _mm256_div_ps, _mm256_sqrt_ps,
                  _mm256_min_ps, _mm256_max_ps)
  // Basic operations on packets of 4 doubles
  QE_DEFINE_BASIC(double, __m256d, _mm256_load_pd, _mm256_loadu_pd,
                  _mm256_setzero_pd, _mm256_set1_pd,
                  _mm256_store_pd, _mm256_storeu_pd,
                  _mm256_add_pd, _mm256_sub_pd,
                  _mm256_mul_pd, _mm256_div_pd, _mm256_sqrt_pd,
                  _mm256_min_pd, _mm256_max_pd)
  // Split a 256-bit packet into its 128-bit halves, and pack two
  // 128-bit halves (x low, y high) into a 256-bit packet
  QE_DEFINE_CHOP(__m256, __m128,
                 _mm256_castps256_ps128(x), _mm256_extractf128_ps(x,1),
                 _mm256_permute2f128_ps(_mm256_castps128_ps256(x),
                                        _mm256_castps128_ps256(y), 0x20))
  // No semicolon after the macro call: it expands to complete
  // function definitions, and a stray ";" at namespace scope is
  // rejected by pedantic C++98 compilers
  QE_DEFINE_CHOP(__m256d, __m128d, _mm256_castpd256_pd128(x),
                 _mm256_extractf128_pd(x,1),
                 _mm256_permute2f128_pd(_mm256_castpd128_pd256(x),
                                        _mm256_castpd128_pd256(y), 0x20))

  // Horizontal reductions of 256-bit packets: combine the lower and
  // upper halves element-wise, then call the SSE2 h* function on the
  // resulting 128-bit packet
  inline float hsum(__m256 x) {
    __m128 halves = add(low(x), high(x));
    return hsum(halves);
  }
  inline float hmul(__m256 x) {
    __m128 halves = mul(low(x), high(x));
    return hmul(halves);
  }
  inline float hmin(__m256 x) {
    __m128 halves = fmin(low(x), high(x));
    return hmin(halves);
  }
  inline float hmax(__m256 x) {
    __m128 halves = fmax(low(x), high(x));
    return hmax(halves);
  }
  // Alternative for the double sum would be to use _mm_hadd_pd
  inline double hsum(__m256d x) {
    __m128d halves = add(low(x), high(x));
    return hsum(halves);
  }
  inline double hmul(__m256d x) {
    __m128d halves = mul(low(x), high(x));
    return hmul(halves);
  }
  inline double hmin(__m256d x) {
    __m128d halves = fmin(low(x), high(x));
    return hmin(halves);
  }
  inline double hmax(__m256d x) {
    __m128d halves = fmax(low(x), high(x));
    return hmax(halves);
  }
  
  // Define extras: fused multiply-add, either emulated or using the
  // hardware instructions
#ifndef __FMA__
  QE_EMULATE_FMA(float, __m256)
  QE_EMULATE_FMA(double, __m256d)
#else
  QE_DEFINE_FMA(float, __m256,  _mm256_fmadd_ps, _mm256_fnmadd_ps)
  QE_DEFINE_FMA(double, __m256d, _mm256_fmadd_pd, _mm256_fnmadd_pd)
#endif
  
  // Round each element to the nearest integer using the native AVX
  // instruction, with the _MM_FROUND_NO_EXC flag set
  inline __m256 unchecked_round(__m256 x) {
    return _mm256_round_ps(x, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
  }
  inline __m256d unchecked_round(__m256d x) {
    return _mm256_round_pd(x, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
  }
  #ifdef __AVX2__
    // With AVX2 the generic implementation can be instantiated
    // directly on 256-bit packets
    QE_DEFINE_POW2N_S(__m256, __m256i,
                      _mm256_castps_si256, _mm256_castsi256_ps,
                      _mm256_sll_epi32, _mm_cvtsi32_si128)
    QE_DEFINE_POW2N_D(__m256d, __m256i,
                      _mm256_castpd_si256, _mm256_castsi256_pd,
                      _mm256_sll_epi64, _mm_cvtsi32_si128)
  #else
    // Suboptimized versions call the SSE2 functions on the upper and
    // lower parts
    inline __m256 pow2n(__m256 n) {
      __m128 lower = pow2n(low(n));
      __m128 upper = pow2n(high(n));
      return pack(lower, upper);
    }
    inline __m256d pow2n(__m256d n) {
      __m128d lower = pow2n(low(n));
      __m128d upper = pow2n(high(n));
      return pack(lower, upper);
    }
  #endif
 
  // Return true if all elements of x are in the range (inclusive) of
  // low_bound to high_bound.  If so the exp call can exit before the
  // more costly case of working out what to do with inputs out of
  // bounds.  Note that _CMP_GE_OS means compare
  // greater-than-or-equal-to, ordered, signaling, where "ordered"
  // means that if either operand is NaN, the result is false.
  inline bool all_in_range(__m256 x, float low_bound, float high_bound) {
    __m256 not_below = _mm256_cmp_ps(x, set1<__m256>(low_bound), _CMP_GE_OS);
    __m256 not_above = _mm256_cmp_ps(x, set1<__m256>(high_bound), _CMP_LE_OS);
    __m256i in_range = _mm256_castps_si256(_mm256_and_ps(not_below, not_above));
    // True only if every bit of the combined mask is set
    return _mm256_testc_si256(in_range, _mm256_set1_epi32(-1)) != 0;
  }
  inline bool all_in_range(__m256d x, double low_bound, double high_bound) {
    __m256d not_below = _mm256_cmp_pd(x, set1<__m256d>(low_bound), _CMP_GE_OS);
    __m256d not_above = _mm256_cmp_pd(x, set1<__m256d>(high_bound), _CMP_LE_OS);
    __m256i in_range = _mm256_castpd_si256(_mm256_and_pd(not_below, not_above));
    // True only if every bit of the combined mask is set
    return _mm256_testc_si256(in_range, _mm256_set1_epi32(-1)) != 0;
  }
  // Element-wise select: where x1 > x2 take the element of y1,
  // otherwise the element of y2
  inline __m256 select_gt(__m256 x1, __m256 x2,
                          __m256 y1, __m256 y2) {
    __m256 take_y1 = _mm256_cmp_ps(x1, x2, _CMP_GT_OS);
    return _mm256_blendv_ps(y2, y1, take_y1);
  }
  inline __m256d select_gt(__m256d x1, __m256d x2,
                           __m256d y1, __m256d y2) {
    __m256d take_y1 = _mm256_cmp_pd(x1, x2, _CMP_GT_OS);
    return _mm256_blendv_pd(y2, y1, take_y1);
  }

#endif
  

  // -------------------------------------------------------------------
  // Define operations for AVX512: vector of 16 floats or 8 doubles
  // -------------------------------------------------------------------
#ifdef __AVX512F__
  // Basic and horizontal operations on packets of 16 floats
  QE_DEFINE_BASIC(float, __m512,
                  _mm512_load_ps, _mm512_loadu_ps,
                  _mm512_setzero_ps, _mm512_set1_ps,
                  _mm512_store_ps, _mm512_storeu_ps,
                  _mm512_add_ps, _mm512_sub_ps,
                  _mm512_mul_ps, _mm512_div_ps,
                  _mm512_sqrt_ps, _mm512_min_ps, _mm512_max_ps)
  QE_DEFINE_HORIZ(float, __m512,
                  _mm512_reduce_add_ps, _mm512_reduce_mul_ps,
                  _mm512_reduce_min_ps, _mm512_reduce_max_ps)
  // Basic and horizontal operations on packets of 8 doubles
  QE_DEFINE_BASIC(double, __m512d,
                  _mm512_load_pd, _mm512_loadu_pd,
                  _mm512_setzero_pd, _mm512_set1_pd,
                  _mm512_store_pd, _mm512_storeu_pd,
                  _mm512_add_pd, _mm512_sub_pd,
                  _mm512_mul_pd, _mm512_div_pd,
                  _mm512_sqrt_pd, _mm512_min_pd, _mm512_max_pd)
  QE_DEFINE_HORIZ(double, __m512d,
                  _mm512_reduce_add_pd, _mm512_reduce_mul_pd,
                  _mm512_reduce_min_pd, _mm512_reduce_max_pd)

  // Rounding via the roundscale instruction with a control value of 0
  inline __m512 unchecked_round(__m512 x) {
    return _mm512_roundscale_ps(x, 0);
  }
  inline __m512d unchecked_round(__m512d x) {
    return _mm512_roundscale_pd(x, 0);
  }

  // Fused multiply-add is always taken from the hardware instructions
  QE_DEFINE_FMA(float, __m512,  _mm512_fmadd_ps, _mm512_fnmadd_ps)
  QE_DEFINE_FMA(double, __m512d, _mm512_fmadd_pd, _mm512_fnmadd_pd)

  QE_DEFINE_POW2N_S(__m512, __m512i,
                    _mm512_castps_si512, _mm512_castsi512_ps,
                    _mm512_sll_epi32, _mm_cvtsi32_si128)
  QE_DEFINE_POW2N_D(__m512d, __m512i,
                    _mm512_castpd_si512, _mm512_castsi512_pd,
                    _mm512_sll_epi64, _mm_cvtsi32_si128)

  // Return true if every element of x satisfies
  // low_bound <= x <= high_bound.  The comparisons produce one mask
  // bit per element, so all elements are in range when the combined
  // mask equals 65535 (16 floats) or 255 (8 doubles).
  inline bool all_in_range(__m512 x, float low_bound, float high_bound) {
    __mmask16 not_below
      = _mm512_cmp_ps_mask(x, set1<__m512>(low_bound), _CMP_GE_OS);
    __mmask16 not_above
      = _mm512_cmp_ps_mask(x, set1<__m512>(high_bound), _CMP_LE_OS);
    return static_cast<unsigned short int>(_mm512_kand(not_below, not_above))
      == static_cast<unsigned short int>(65535);
  }
  inline bool all_in_range(__m512d x, double low_bound, double high_bound) {
    __mmask8 not_below
      = _mm512_cmp_pd_mask(x, set1<__m512d>(low_bound), _CMP_GE_OS);
    __mmask8 not_above
      = _mm512_cmp_pd_mask(x, set1<__m512d>(high_bound), _CMP_LE_OS);
    return static_cast<unsigned short int>(_mm512_kand(not_below, not_above))
      == static_cast<unsigned short int>(255);
  }
  // Element-wise select: where x1 > x2 take the element of y1,
  // otherwise the element of y2
  inline __m512 select_gt(__m512 x1, __m512 x2,
                          __m512 y1, __m512 y2) {
    __mmask16 take_y1 = _mm512_cmp_ps_mask(x1, x2, _CMP_GT_OS);
    return _mm512_mask_mov_ps(y2, take_y1, y1);
  }
  inline __m512d select_gt(__m512d x1, __m512d x2,
                           __m512d y1, __m512d y2) {
    __mmask8 take_y1 = _mm512_cmp_pd_mask(x1, x2, _CMP_GT_OS);
    return _mm512_mask_mov_pd(y2, take_y1, y1);
  }

#endif

  
#ifdef QE_HAVE_ARM64_NEON

  // Implement ARM version of x86 setzero: a packet with every
  // element equal to zero
  inline float32x4_t vzeroq_f32() {
    return vdupq_n_f32(0.0);
  }
  inline float64x2_t vzeroq_f64() {
    return vdupq_n_f64(0.0);
  }
  // Horizontal multiply across vector.  For four floats, the lower
  // and upper halves are multiplied element-wise first, then the two
  // remaining elements are multiplied together.
  inline float vmulvq_f32(float32x4_t x) {
    union {
      float32x2_t half_product;
      float elem[2];
    };
    half_product = vmul_f32(vget_low_f32(x), vget_high_f32(x));
    return elem[0] * elem[1];
  }
  // For two doubles, simply multiply the two elements
  inline double vmulvq_f64(float64x2_t x) {
    union {
      float64x2_t packed;
      double elem[2];
    };
    packed = x;
    return elem[0] * elem[1];
  }
  
  // Packets of 4 floats.  The same intrinsic is passed for the
  // aligned and unaligned variants of load and store.
  QE_DEFINE_BASIC(float, float32x4_t,
                  vld1q_f32, vld1q_f32,
                  vzeroq_f32, vdupq_n_f32,
                  vst1q_f32, vst1q_f32,
                  vaddq_f32, vsubq_f32, vmulq_f32, vdivq_f32,
                  vsqrtq_f32, vminq_f32, vmaxq_f32)
  QE_DEFINE_HORIZ(float, float32x4_t,
                  vaddvq_f32, vmulvq_f32,
                  vminvq_f32, vmaxvq_f32)
  // Packets of 2 doubles
  QE_DEFINE_BASIC(double, float64x2_t,
                  vld1q_f64, vld1q_f64,
                  vzeroq_f64, vdupq_n_f64,
                  vst1q_f64, vst1q_f64,
                  vaddq_f64, vsubq_f64, vmulq_f64, vdivq_f64,
                  vsqrtq_f64, vminq_f64, vmaxq_f64)
  QE_DEFINE_HORIZ(double, float64x2_t,
                  vaddvq_f64, vmulvq_f64,
                  vminvq_f64, vmaxvq_f64)
  QE_DEFINE_POW2N_S(float32x4_t, int32x4_t,
                    vreinterpretq_s32_f32, vreinterpretq_f32_s32,
                    vshlq_s32, vdupq_n_s32)
  QE_DEFINE_POW2N_D(float64x2_t, int64x2_t,
                    vreinterpretq_s64_f64, vreinterpretq_f64_s64,
                    vshlq_s64, vdupq_n_s64)
  // NEON multiply-add intrinsics take the addend first, hence the
  // _ALT form of the macro
  QE_DEFINE_FMA_ALT(float, float32x4_t, vfmaq_f32, vfmsq_f32)
  QE_DEFINE_FMA_ALT(double, float64x2_t, vfmaq_f64, vfmsq_f64)
  inline bool all_in_range(float32x4_t x, double low_bound, double high_bound) {
    union {
      uint32x2_t v;
      uint32_t data[2];
    };
    uint32x4_t tmp = vandq_u32(vcgeq_f32(x,vdupq_n_f32(low_bound)),
			       vcleq_f32(x,vdupq_n_f32(high_bound)));
    v = vand_u32(vget_low_u32(tmp), vget_high_u32(tmp));
    return data[0] && data[1];
  }
  inline bool all_in_range(float64x2_t x, double low_bound, double high_bound) {
    union {
      uint64x2_t v;
      uint64_t data[2];
    };
    v = vandq_u64(vcgeq_f64(x,vdupq_n_f64(low_bound)),
		  vcleq_f64(x,vdupq_n_f64(high_bound)));
    return data[0] && data[1];
  }

  // Round by converting to signed integers with the vcvtaq
  // intrinsics and then converting back to floating point
  inline float32x4_t unchecked_round(float32x4_t x) {
    int32x4_t as_int = vcvtaq_s32_f32(x);
    return vcvtq_f32_s32(as_int);
  }
  inline float64x2_t unchecked_round(float64x2_t x) {
    int64x2_t as_int = vcvtaq_s64_f64(x);
    return vcvtq_f64_s64(as_int);
  }
  // Element-wise select: where x1 > x2 take the element of y1,
  // otherwise the element of y2
  inline float32x4_t select_gt(float32x4_t x1, float32x4_t x2,
                               float32x4_t y1, float32x4_t y2) {
    uint32x4_t take_y1 = vcgtq_f32(x1,x2);
    return vbslq_f32(take_y1, y1, y2);
  }
  inline float64x2_t select_gt(float64x2_t x1, float64x2_t x2,
                               float64x2_t y1, float64x2_t y2) {
    uint64x2_t take_y1 = vcgtq_f64(x1,x2);
    return vbslq_f64(take_y1, y1, y2);
  }

  inline float unchecked_round(float x)
  { return vgetq_lane_f32(unchecked_round(vdupq_n_f32(x)), 0); }
  inline double unchecked_round(double x)
  { return vgetq_lane_f64(unchecked_round(vdupq_n_f64(x)), 0); }

  inline float pow2n(float x) {
    return vgetq_lane_f32(pow2n(vdupq_n_f32(x)),0);
  }
  inline double pow2n(double x) {
    return vgetq_lane_f64(pow2n(vdupq_n_f64(x)),0);
  }

#endif
 
  
#ifdef QE_HAVE_FAST_EXP
  
  // -------------------------------------------------------------------
  // Implementation of fast exponential
  // -------------------------------------------------------------------

  // Evaluate a fifth-order polynomial in x with scalar coefficients
  // c0 to c5.  Rather than a single chain of multiply-adds, the terms
  // are grouped in pairs that are then combined using x^2 and x^4.
  template<typename Type, typename Vec>
  static inline
  Vec polynomial_5(Vec const x, Type c0, Type c1, Type c2, Type c3, Type c4, Type c5) {
    // calculates polynomial c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
    using quick_e::fma;
    Vec x2 = mul(x, x);
    Vec x4 = mul(x2, x2);
    // (c3*x+c2)*x^2 + ((c5*x+c4)*x^4 + (c1*x+c0))
    return fma(fma(c3, x, c2), x2, fma(fma(c5, x, c4), x4, fma(c1, x, c0)));
  }

  // Template function implementing the fast exponential in single
  // precision, instantiated in this file for float and for the
  // packets of floats.  The argument is split as x = r*ln(2) + x'
  // with r an integer, exp(x') is approximated by a polynomial, and
  // the result is scaled by 2^r.  Unless __FAST_MATH__ is defined,
  // elements below min_x return 0 and elements above max_x return
  // +Inf.
  template<typename Vec>
  inline
  Vec fastexp_float(Vec const initial_x) {
    using namespace quick_e;
    using quick_e::unchecked_round;
    using quick_e::fma;
    
    // Taylor coefficients
    const float P0expf   =  1.f/2.f;
    const float P1expf   =  1.f/6.f;
    const float P2expf   =  1.f/24.f;
    const float P3expf   =  1.f/120.f; 
    const float P4expf   =  1.f/720.f; 
    const float P5expf   =  1.f/5040.f; 
    const float VM_LOG2E = 1.44269504088896340736;  // 1/log(2)
    // ln(2) split into two parts that are subtracted in separate steps
    const float ln2f_hi  =  0.693359375f;
    const float ln2f_lo  = -2.12194440e-4f;
#ifndef __FAST_MATH__
    const float min_x    = -87.3f;
    const float max_x    = +89.0f;
#endif

    // r is x/ln(2) rounded to an integer
    Vec r = unchecked_round(mul(initial_x,set1<Vec>(VM_LOG2E)));
    Vec x = fnma(r, set1<Vec>(ln2f_hi), initial_x); //  x -= r * ln2f_hi;
    x = fnma(r, set1<Vec>(ln2f_lo), x);             //  x -= r * ln2f_lo;
 
    // z = 1/2 + x/6 + ... + x^5/5040
    Vec z = polynomial_5(x,P0expf,P1expf,P2expf,P3expf,P4expf,P5expf);

    Vec x2 = mul(x, x);
    z = fma(z, x2, x);                       // z *= x2;  z += x;

    // multiply by power of 2 
    Vec n2 = pow2n(r);

    z = fma(z,n2,n2);                        // z = (z + 1) * n2
    
#ifdef __FAST_MATH__
    return z;
#else
    if (all_in_range(initial_x, min_x, max_x)) {
      // Fast normal path
      return z;
    }
    else {
      // When initial_x<-87.3, set exp(x) to 0.0
      z = select_gt(set1<Vec>(min_x), initial_x, set0<Vec>(), z);
      // When initial_x>+89.0, set exp(x) to +Inf
      z = select_gt(initial_x, set1<Vec>(max_x),
		    set1<Vec>(std::numeric_limits<float>::infinity()),
		    z);
      return z;
    }
#endif
  }


  // Evaluate a 13th-order polynomial in x with scalar coefficients
  // c2 to c13, an implied linear coefficient of one and no constant
  // term.  The terms are grouped in pairs that are then combined
  // using x^2, x^4 and x^8.
  template <typename Type, typename Vec>
  Vec polynomial_13m(Vec const x,
		     Type c2, Type c3, Type c4, Type c5, Type c6, Type c7,
		     Type c8, Type c9, Type c10, Type c11, Type c12, Type c13) {
    // calculates polynomial c13*x^13 + c12*x^12 + ... + x + 0
    using quick_e::fma;
    
    Vec x2 = mul(x, x);
    Vec x4 = mul(x2, x2);
    //    Vec x8 = mul(x4, x4);
    // The first argument holds the terms c8..c13 (multiplied by x^8
    // via mul(x4,x4)); the last argument holds the terms up to c7
    return fma(fma(fma(c13, x, c12), x4,
		   fma(fma(c11, x, c10), x2, fma(c9, x, c8))), mul(x4, x4),
	       fma(fma(fma(c7, x, c6), x2, fma(c5, x, c4)), x4,
		   fma(fma(c3, x, c2), x2, x)));
    // Alternative single chain of multiply-adds, not used:
    //return fma(fma(fma(fma(fma(fma(fma(fma(fma(fma(fma(fma(c13, x, c12), x, c11), x, c10), x, c9), x, c8), x, c7), x, c6), x, c5), x, c4), x, c3), x, c2), mul(x,x), x);
    
  }

  
  // Template function implementing the fast exponential, where Vec
  // can be double, __m128d, __m256d, __m512d or float64x2_t.  The
  // argument is split as x = r*ln(2) + x' with r an integer, exp(x')
  // is approximated by a polynomial, and the result is scaled by 2^r.
  // Unless __FAST_MATH__ is defined, elements below min_x return 0
  // and elements above max_x return +Inf.
  template <typename Vec>
  inline
  Vec fastexp_double(Vec const initial_x) {
    using namespace quick_e;
    using quick_e::unchecked_round;
    using quick_e::fma;
    
    // Taylor coefficients 1/n! for n = 2 to 13
    const double p2  = 1./2.;
    const double p3  = 1./6.;
    const double p4  = 1./24.;
    const double p5  = 1./120.; 
    const double p6  = 1./720.; 
    const double p7  = 1./5040.; 
    const double p8  = 1./40320.; 
    const double p9  = 1./362880.; 
    const double p10 = 1./3628800.; 
    const double p11 = 1./39916800.; 
    const double p12 = 1./479001600.; 
    const double p13 = 1./6227020800.; 
    const double VM_LOG2E = 1.44269504088896340736;  // 1/log(2)
    // ln(2) split into two parts that are subtracted in separate steps
    const double ln2d_hi = 0.693145751953125;
    const double ln2d_lo = 1.42860682030941723212E-6;
#ifndef __FAST_MATH__
    const double min_x = -708.39;
    const double max_x = +709.70;
#endif

    // r is x/ln(2) rounded to an integer
    Vec r = unchecked_round(mul(initial_x,set1<Vec>(VM_LOG2E)));
    // subtraction in two steps for higher precision
    Vec x = fnma(r, set1<Vec>(ln2d_hi), initial_x);   //  x -= r * ln2d_hi;
    x = fnma(r, set1<Vec>(ln2d_lo), x);               //  x -= r * ln2d_lo;

    // multiply by power of 2 
    Vec n2 = pow2n(r);
    
    // z = x + x^2/2 + ... + x^13/13!
    Vec z = polynomial_13m(x, p2, p3, p4, p5, p6, p7,
			   p8, p9, p10, p11, p12, p13);
    z = fma(z,n2,n2);                                 // z = (z + 1) * n2
#ifdef __FAST_MATH__
    return z;
#else
    if (all_in_range(initial_x, min_x, max_x)) {
      // Fast normal path
      return z;
    }
    else {
      // When initial_x<-708.39, set exp(x) to 0.0
      z = select_gt(set1<Vec>(min_x), initial_x, set0<Vec>(), z);
      // When initial_x>+709.70, set exp(x) to +Inf
      z = select_gt(initial_x, set1<Vec>(max_x),
		    set1<Vec>(std::numeric_limits<double>::infinity()),
		    z);
      return z;
    }
#endif
  }
#endif
  

  // quick_e::exp overloads for SIMD vector arguments.  Each one
  // forwards to the templated single- or double-precision
  // implementation; a group is only compiled when the corresponding
  // instruction set is available.

#ifdef __SSE2__
  inline __m128 exp(__m128 x) {
    return fastexp_float(x);
  }
  inline __m128d exp(__m128d x) {
    return fastexp_double(x);
  }
#endif

#ifdef __AVX__
  inline __m256 exp(__m256 x) {
    return fastexp_float(x);
  }
  inline __m256d exp(__m256d x) {
    return fastexp_double(x);
  }
#endif

#ifdef __AVX512F__
  inline __m512 exp(__m512 x) {
    return fastexp_float(x);
  }
  inline __m512d exp(__m512d x) {
    return fastexp_double(x);
  }
#endif

#ifdef QE_HAVE_ARM64_NEON
  inline float32x4_t exp(float32x4_t x) {
    return fastexp_float(x);
  }
  inline float64x2_t exp(float64x2_t x) {
    return fastexp_double(x);
  }
#endif

  // quick_e::exp for scalar arguments: use the fast implementation
  // when QE_HAVE_FAST_EXP is defined, otherwise fall back to the
  // standard library.
#ifdef QE_HAVE_FAST_EXP
  inline float exp(float x) {
    return quick_e::fastexp_float(x);
  }
  inline double exp(double x) {
    return quick_e::fastexp_double(x);
  }
#else
  inline float exp(float x) {
    return std::exp(x);
  }
  inline double exp(double x) {
    return std::exp(x);
  }
#endif

#undef QE_DEFINE_TRAITS
#undef QE_DEFINE_LONGEST
#undef QE_DEFINE_BASIC
#undef QE_DEFINE_CHOP
#undef QE_DEFINE_HORIZ
#undef QE_DEFINE_FMA
#undef QE_DEFINE_FMA_ALT
#undef QE_EMULATE_FMA
#undef QE_DEFINE_POW2N_S
#undef QE_DEFINE_POW2N_D
#undef QE_HAVE_FAST_EXP
#undef QE_HAVE_ARM64_NEON
}

#endif


================================================
FILE: include/adept/reduce.h
================================================
/* reduce.h -- "Reduce" functions such as find, all, sum etc.

    Copyright (C) 2015-2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.


   This file implements a number of array functions whose return
   values are reduced in either rank or size compared to their
   arguments.

   The first is the "find" function that takes a rank-1 bool
   Expression, and returns an IntVector of indices to the "true"
   values.  This is modelled on Matlab's "find" function.

   A number of further reduce functions are implemented using the same
   calling style as the equivalent Fortran-90 functions.  They fall
   into two types:
     1. sum, mean, product, minval, maxval, norm2
     2. all, any
   The first take active or inactive Expression arguments of real or
   (sometimes) integer type, while the second only take inactive
   Expressions of bool type.  If called with one Expression argument
   of any rank, a single value is returned containing the result of
   the reduce operation on all the elements of the Expression.  If a
   second integer argument is provided then the operation is carried
   out along that dimension and an Expression of rank one less than
   the first argument is returned. These functions are implemented by
   delegating to a generic "Reduce" function that uses policy classes
   to implement the elemental operations.

*/

#ifndef AdeptReduce_H
#define AdeptReduce_H

#include <limits>
#include <algorithm>

#include <adept/Array.h>
#include <adept/Active.h>
#include <adept/SpecialMatrix.h>
#include <adept/array_shortcuts.h>

namespace adept {

  // -------------------------------------------------------------------
  // Section 1. "find"
  // -------------------------------------------------------------------
  // find(rhs): given a rank-1 bool Expression, return an IntVector
  // holding the (zero-based) positions of its "true" elements, in
  // increasing order.  Throws size_mismatch if the expression does
  // not have consistent dimensions.
  template <class E>
  inline
  typename internal::enable_if<E::rank == 1,IntVector>::type
  find(const Expression<bool, E>& rhs)
  {
    // Validate the expression and obtain its length
    ExpressionSize<1> length;
    if (!rhs.get_dimensions(length)) {
      std::string msg = "Array size mismatch in "
        + rhs.expression_string() + ".";
      throw size_mismatch(msg ADEPT_EXCEPTION_LOCATION);
    }
    Index& n_elem = length[0];

    // Worst case is that every element is true, so reserve room for
    // n_elem indices
    IntVector indices(n_elem);
    Index n_found = 0;

    // Point "loc" at the first element, then walk through the
    // expression recording the position of each true value
    ExpressionSize<1> start(0);
    ExpressionSize<E::n_arrays> loc;
    rhs.set_location(start, loc);
    for (int pos = 0; pos < n_elem; pos++) {
      if (rhs.next_value(loc)) {
        indices(n_found) = pos;
        ++n_found;
      }
    }

    if (n_found == 0) {
      // Nothing was true: empty vector
      return IntVector();
    }
    if (n_found < n_elem) {
      // Only the leading n_found entries are meaningful; the
      // subsetting operation links to the original data rather than
      // copying it
      return indices(range(0,n_found-1));
    }
    // Every element was true
    return indices;
  }

  namespace internal {

    // Starting values for the minval/maxval accumulations:
    // min_inf() is a value no element can be less than, max_inf() a
    // value no element can exceed.  The primary template is empty;
    // the two partial specializations below are selected according
    // to whether std::numeric_limits<T>::has_infinity is true.
    template <typename T, class Enable = void>
    struct numeric_limits { };

    // Types with an infinity: use -Inf and +Inf
    template <typename T>
    struct numeric_limits<T, typename internal::enable_if<std::numeric_limits<T>::has_infinity>::type> {
      typedef std::numeric_limits<T> std_limits;
      static T min_inf() { return -std_limits::infinity(); }
      static T max_inf() { return  std_limits::infinity(); }
    };

    // Types without an infinity: use the values reported by
    // std::numeric_limits<T>::min() and max()
    template <typename T>
    struct numeric_limits<T, typename internal::enable_if<!std::numeric_limits<T>::has_infinity>::type> {
      typedef std::numeric_limits<T> std_limits;
      static T min_inf() { return std_limits::min(); }
      static T max_inf() { return std_limits::max(); }
    };


    // -------------------------------------------------------------------
    // Section 2. Policy classes to enable the generic "reduce" function
    // -------------------------------------------------------------------

    // Sum enables the "sum" function that sums its arguments.
    template <typename T>
    struct Sum {
      // What is the type of the running total?
      typedef T total_type;
      // Number of extra operations per element, needed for reserving
      // space in active calculations
      static const int extra_element_cost = 0;
      // Do we need to do anything to the final summed value(s)?
      static const bool finish_needed = false;
      // Do we need to do anything to the final summed value(s) in the
      // case that we are doing automatic differentiation?
      static const bool active_finish_needed = true;
      // Used by "expression_string()"
      const char* name() { return "sum"; }
      // Start the accumulation with zero
      T first_value() { return 0; }
      // Accumulation consists of incrementing "total" by the value on
      // the right hand side; note that the arguments are either of
      // type T or type Packet<T>
      template <typename E>
      void accumulate(E& total, const E& rhs) { total += rhs; }
      // When the reduce operation is vectorized, packets of data are
      // accumulated, requiring the ability to horizontally accumulate
      // each element of the packet, but only the packet2 version is
      // needed (the original accumulate_packet had problems with the
      // norm2 function, and is no longer used - can be removed)
      //T accumulate_packet(const Packet<T>& ptotal) {
      //  return hsum(ptotal);
      //}
      template <typename E>
      void accumulate_packet2(E& total, const Packet<T>& ptotal) {
	total += hsum(ptotal);
      }
      // In the case of active arguments, the next_value_and_gradient
      // function pushes the right hand side onto the operation stack,
      // but does not push the "total" object onto the statement
      // stack.  This is done right at the end of the summation
      // operations.
      template <class E, int NArrays>
      void accumulate_active(Active<T>& total, const E& rhs, 
			     ExpressionSize<NArrays>& loc) {
	total.lvalue() += rhs.next_value_and_gradient(*ADEPT_ACTIVE_STACK, loc);
      }
      // No need to do anything to the final value
      template <class X>
      void finish(X& total, const Index& n) { }
      // In the active case, the final action is to complete the
      // storage of the differential statement by pushing the left
      // hand side onto the statement stack.
      void finish_active(Active<T>& total, const Index& n) { 
	ADEPT_ACTIVE_STACK->push_lhs(total.gradient_index());
      }
    };

    // Mean enables the "mean" function - the same as "sum" but
    // dividing the final result by the number of elements averaged.
    template <typename T>
    struct Mean {
      typedef T total_type;
      static const int extra_element_cost = 0;
      static const bool finish_needed = true;
      static const bool active_finish_needed = true;
      const char* name() { return "mean"; }
      T first_value() { return 0; }
      template <typename E>
      void accumulate(E& total, const E& rhs) { total += rhs; }
      //T accumulate_packet(const Packet<T>& ptotal) {
      //  return hsum(ptotal);
      //}
      template <typename E>
      void accumulate_packet2(E& total, const Packet<T>& ptotal) {
	total += hsum(ptotal);
      }
      template <class E, int NArrays>
      void accumulate_active(Active<T>& total, const E& rhs, 
			     ExpressionSize<NArrays>& loc) {
	total.lvalue() += rhs.next_value_and_gradient(*ADEPT_ACTIVE_STACK, loc);
      }
      template <class X>
      // Divide by the total number of elements
      void finish(X& total, const Index& n) { total /= n; }
      void finish_active(Active<T>& total, const Index& n) { 
	ADEPT_ACTIVE_STACK->push_lhs(total.gradient_index());
	total /= n;
      }
    };

    // Product enables the "product" function that multiplies all its
    // arguments together.
    template <typename T>
    struct Product {
      typedef T total_type;
      static const int extra_element_cost = 1;
      static const bool finish_needed = false;
      static const bool active_finish_needed = false;
      const char* name() { return "product"; }
      T first_value() { return 1; }
      template <typename E>
      void accumulate(E& total, const E& rhs) { total *= rhs; }
      //T accumulate_packet(const Packet<T>& ptotal) {
      //  return hprod(ptotal);
      //}
      template <typename E>
      void accumulate_packet2(E& total, const Packet<T>& ptotal) {
	total *= hprod(ptotal);
      }
      template <class E, int NArrays>
      void accumulate_active(Active<T>& total, const E& rhs, 
			     ExpressionSize<NArrays>& loc) {
	// Differentiate t = t*x -> dt = t*dx + x*dt.  First compute
	// x, while passing t as the last argument so that t*dx is put
	// on the operation stack.
	T xval = rhs.next_value_and_gradient_special(*ADEPT_ACTIVE_STACK, loc,
						     total.value());
	// Now treat x as inactive and Active<T> will do the rest
	total *= xval;
      }
      template <class X>
      void finish(X& total, const Index& n) { }
      void finish_active(Active<T>& total, const Index& n) { }
    };

    // MaxVal enables the "maxval" function that returns the maximum value
    template <typename T>
    struct MaxVal {
      typedef T total_type;
      static const int extra_element_cost = 0;
      static const bool finish_needed = false;
      static const bool active_finish_needed = false;
      const char* name() { return "maxval"; }
      // Initiate the total with the minimum possible value
      T first_value() { return internal::numeric_limits<T>::min_inf(); }
#ifdef ADEPT_CXX11_FEATURES
      void accumulate(T& total, const T& rhs) { 
	using std::fmax;
	total = fmax(total,rhs);
      }
      template <typename E>
      void accumulate_packet2(E& total, const Packet<T>& ptotal) {
	using std::fmax;
	total = fmax(total,hmax(ptotal));
      }
#else
      void accumulate(T& total, const T& rhs) {
	using std::max;
	total = max(total,rhs);
      }
      template <typename E>
      void accumulate_packet2(E& total, const Packet<T>& ptotal) {
	using std::max;
	total = max(total,hmax(ptotal));
      }
#endif
      void accumulate(Packet<T>& total, const Packet<T>& rhs) { total = fmax(total,rhs); }
      //T accumulate_packet(const Packet<T>& ptotal) {
      //  return hmax(ptotal);
      //}
      template <class E, int NArrays>
      void accumulate_active(Active<T>& total, const E& rhs, 
			     ExpressionSize<NArrays>& loc) {
	// The following is not optimal since if a maximum is found
	// then the value is evaluated twice. Better would be to
	// locate the maximum in the entire array, then do the active
	// stuff just for that element.
	if (rhs.value_at_location(loc) > total.value()) {
	  // The right hand side puts itself on the operation stack,
	  // while operator= puts the left hand side on the statement
	  // stack.
	  total = rhs.next_value_and_gradient(*ADEPT_ACTIVE_STACK, loc);
	}
	else {
	  rhs.advance_location(loc);
	}
      }
      template <class X>
      void finish(X& total, const Index& n) { }
      void finish_active(Active<T>& total, const Index& n) { }
    };

    // MinVal enables the "minval" function that returns the minimum value
    template <typename T>
    struct MinVal {
      typedef T total_type;
      static const int extra_element_cost = 0;
      static const bool finish_needed = false;
      static const bool active_finish_needed = false;
      const char* name() { return "minval"; }
      T first_value() { return internal::numeric_limits<T>::max_inf(); }
#ifdef ADEPT_CXX11_FEATURES
      void accumulate(T& total, const T& rhs) {
	using std::fmin;
	total = fmin(total,rhs);
      }
      void accumulate_packet2(T& total, const Packet<T>& ptotal) {
	using std::fmin;
	total = fmin(total,hmin(ptotal));
      }
#else
      void accumulate(T& total, const T& rhs) {
	using std::min;
	total = min(total,rhs);
      }
      void accumulate_packet2(T& total, const Packet<T>& ptotal) {
	using std::min;
	total = min(total,hmin(ptotal));
      }
#endif
      void accumulate(Packet<T>& total, const Packet<T>& rhs) { total = fmin(total,rhs); }
      //T accumulate_packet(const Packet<T>& ptotal) {
      //  return hmin(ptotal);
      //}
      template <class E, int NArrays>
      void accumulate_active(Active<T>& total, const E& rhs, 
			     ExpressionSize<NArrays>& loc) {
	// The following is not optimal since if a maximum is found
	// then the value is evaluated twice
	if (rhs.value_at_location(loc) < total.value()) {
	  // The right hand side puts itself on the operation stack,
	  // while operator= puts the left hand side on the statement
	  // stack.
	  total = rhs.next_value_and_gradient(*ADEPT_ACTIVE_STACK, loc);
	}
	else {
	  rhs.advance_location(loc);
	}
      }
      template <class X>
      void finish(X& total, const Index& n) { }
      void finish_active(Active<T>& total, const Index& n) { }
    };
  
    // Norm2 enables the "norm2" function that returns the L-2 norm of
    // its arguments, equal to sqrt(sum(rhs*rhs))
    template <typename T>
    struct Norm2 {
      typedef T total_type;
      static const int extra_element_cost = 0;
      static const bool finish_needed = true;
      static const bool active_finish_needed = true;
      const char* name() { return "norm2"; }
      T first_value() { return 0; }
      template <typename E>
      void accumulate(E& total, const E& rhs) { total += rhs*rhs; }
      //T accumulate_packet(const Packet<T>& ptotal) {
      //  return hsum(ptotal);
      //}
      // Note that ptotal is already an accumulation of squared
      // values, so does not need to be squared again
      template <typename E>
      void accumulate_packet2(E& total, const Packet<T>& ptotal) {
	total += hsum(ptotal);
      }
      template <class E, int NArrays>
      void accumulate_active(Active<T>& total, const E& rhs, 
			     ExpressionSize<NArrays>& loc) {
	// Differentiate t += x*x -> dt += 2*x*dx.  Use the "special2"
	// version of the following function, where multiplier*x*dx is
	// put on the operation stack.
	T xval = rhs.next_value_and_gradient_special2(*ADEPT_ACTIVE_STACK,
						      loc, 2.0);
	// Now do a purely inactive operation since we will put
	// "total" on the statement stack only right at the end
	total.lvalue() += xval*xval;
      }
      template <class X>
      void finish(X& total, const Index& n) {
	using std::sqrt;
	total = noalias(sqrt(total));
      }
      void finish_active(Active<T>& total, const Index& n) {
	using std::sqrt;
	// The operation stack now contains the derivatives of all the
	// squared elements on the right hand side.  Here we complete
	// the differential statement by pushing the left hand side
	// onto the statement stack.
	ADEPT_ACTIVE_STACK->push_lhs(total.gradient_index());
	// Since total is active it will do the right thing in the
	// final operation.
	total = noalias(sqrt(total));
      }
    };

    // Policy class for "all": the result is true only if every bool
    // element of the right hand side is true.  It would be faster if
    // the reduction could stop at the first "false", but every
    // element is visited.
    struct All {
      typedef bool total_type;
      static const bool finish_needed = false;
      const char* name() { return "all"; }
      // Start from true; one false element is enough to clear it
      bool first_value() { return true; }
      void accumulate(bool& so_far, const bool& value) {
        if (!value) {
          so_far = false;
        }
      }
      // No post-processing required
      template <class X>
      void finish(X&, const Index&) { }
    };

    // Policy class for "any": the result is true if at least one
    // bool element of the right hand side is true.  It would be
    // faster if the reduction could quit at the first "true", but
    // every element is visited.
    struct Any {
      typedef bool total_type;
      static const bool finish_needed = false;
      const char* name() { return "any"; }
      // Start from false; one true element is enough to set it
      bool first_value() { return false; }
      void accumulate(bool& so_far, const bool& value) {
        if (value) {
          so_far = true;
        }
      }
      // No post-processing required
      template <class X>
      void finish(X&, const Index&) { }
    };

    // Policy class for "count": the result is the number of "true"
    // elements in a bool array.
    struct Count {
      typedef Index total_type;
      static const bool finish_needed = false;
      const char* name() { return "count"; }
      Index first_value() { return 0; }
      // Each true element adds one to the tally; false adds nothing
      void accumulate(Index& tally, const bool& value) {
        if (value) {
          ++tally;
        }
      }
      // No post-processing required
      template <class X>
      void finish(X&, const Index&) { }
    };

    // -------------------------------------------------------------------
    // Section 3. Various versions of the "reduce" function
    // -------------------------------------------------------------------

    // Reduce all elements of an inactive expression to a single
    // value using policy class Func.  This is the scalar overload,
    // selected whenever the conditions for the vectorized overload
    // (vectorizable expression, vectorized Packet<Type>, and Type
    // identical to Func::total_type) are not all met.  Throws
    // size_mismatch if the expression has inconsistent dimensions;
    // returns zero for an empty array.
    template <class Func, typename Type, class E>
    inline
    typename internal::enable_if<!(E::is_vectorizable
                                   &&Packet<Type>::is_vectorized
                                   &&is_same<Type,typename Func::total_type>::value),
                                 typename Func::total_type>::type
    reduce_inactive(const Expression<Type, E>& rhs) {
      typename Func::total_type result;
      Func policy;
      ExpressionSize<E::rank> shape;

      // The right hand side must be a valid expression
      if (!rhs.get_dimensions(shape)) {
        std::string msg = "Array size mismatch in "
          + rhs.expression_string() + ".";
        throw size_mismatch(msg ADEPT_EXCEPTION_LOCATION);
      }
      if (shape[0] == 0) {
        // All of these functions return zero for an empty array
        result = 0;
        return result;
      }

      result = policy.first_value();
      Index n_elem = shape.size();
      ExpressionSize<E::rank> idx(0);
      ExpressionSize<E::n_arrays> loc(0);
      static const int inner = E::rank-1;
      int dim;
      do {
        // Sweep along the innermost dimension
        idx[inner] = 0;
        rhs.set_location(idx, loc);
        for ( ; idx[inner] < shape[inner]; ++idx[inner]) {
          policy.accumulate(result, rhs.next_value(loc));
        }
        // Advance the outer indices like an odometer; "dim" ends up
        // negative once every outer index has wrapped round
        for (dim = inner-1; dim >= 0; --dim) {
          if (++idx[dim] < shape[dim]) {
            break;
          }
          idx[dim] = 0;
        }
      } while (dim >= 0);

      policy.finish(result, n_elem);
      return result;
    }

    // Reduce an entire inactive array, vectorized.  This overload is
    // selected when the expression is vectorizable, Packet<Type> is
    // vectorized and Type is the same as Func::total_type.  Whether
    // packets are actually used is decided at run time: the innermost
    // dimension must hold at least two packets' worth of elements and
    // all arrays in the expression must be contiguous; otherwise the
    // scalar loop at the end of the function is used.  Throws
    // size_mismatch if the expression has inconsistent dimensions and
    // returns zero for an empty array.
    template <class Func, typename Type, class E>
    inline
    typename internal::enable_if<E::is_vectorizable
                       &&Packet<Type>::is_vectorized
                       &&is_same<Type,typename Func::total_type>::value,
		       typename Func::total_type>::type
    reduce_inactive(const Expression<Type, E>& rhs) {
      typename Func::total_type total;
      Func f;
      ExpressionSize<E::rank> dims;
      // Check right hand side is a valid expression
      if (!rhs.get_dimensions(dims)) {
	std::string str = "Array size mismatch in "
	  + rhs.expression_string() + ".";
	throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
      }
      else if (dims[0] == 0) {
	// Return zero if any of these functions applied to an empty
	// array
	total = 0;
      }
      else if (dims[E::rank-1] >= Packet<Type>::size*2
	       && rhs.all_arrays_contiguous()) {
	// Vectorization is possible
	// Two accumulators are kept: "ptotal" for whole packets and
	// "total" for the scalar elements before and after the
	// packets; they are merged after the loop
	Packet<Type> ptotal(f.first_value());
	Index n = dims.size();
	ExpressionSize<E::rank> i(0);
	ExpressionSize<E::n_arrays> loc(0);
	int my_rank;
	static const int last = E::rank-1;
	// Each row of the innermost dimension is processed as scalar
	// elements [0,istartvec), packets [istartvec,iendvec) and
	// scalar elements [iendvec,dims[last])
	// NOTE(review): istartvec is obtained once from
	// alignment_offset() and reused for every row - confirm this
	// is valid for rows after the first when rank > 1
	int iendvec;
	int istartvec = rhs.alignment_offset();
	total = f.first_value();
	if (istartvec < 0) {
	  // A negative offset results in an empty packet range, so
	  // every element is handled by the final scalar loop
	  istartvec = iendvec = 0;
	}
	else {
	  // Adjust iendvec such that iendvec-istartvec is a multiple
	  // of the packet size
	  iendvec = (dims[last]-istartvec);
	  iendvec -= (iendvec % Packet<Type>::size);
	  iendvec += istartvec;
	}
	do {
	  i[last] = 0;
	  rhs.set_location(i, loc);
	  // Innermost loop
	  // Scalar elements before the first packet
	  for ( ; i[last] < istartvec; ++i[last]) {
	    f.accumulate(total, rhs.next_value_contiguous(loc));
	  }
	  // Whole packets
	  for ( ; i[last] < iendvec; i[last] += Packet<Type>::size) {
	    f.accumulate(ptotal, rhs.next_packet(loc));
	  }
	  // Scalar elements after the last packet
	  for ( ; i[last] < dims[last]; ++i[last]) {
	    f.accumulate(total, rhs.next_value_contiguous(loc));
	  }
	  // Advance the outer indices like an odometer; my_rank
	  // becomes negative once all of them have wrapped round
	  my_rank = E::rank-1;
	  while (--my_rank >= 0) {
	    if (++i[my_rank] >= dims[my_rank]) {
	      i[my_rank] = 0;
	    }
	    else {
	      break;
	    }
	  }
	} while (my_rank >= 0);
	// norm2 cannot use accumulate here or elements will be squared twice
	//f.accumulate(total, f.accumulate_packet(ptotal));
	// Merge the packet accumulator into the scalar total
	f.accumulate_packet2(total, ptotal);
	f.finish(total, n);
      }
      else {
	// Back to unvectorized version
	total = f.first_value();
	Index n = dims.size();
	ExpressionSize<E::rank> i(0);
	ExpressionSize<E::n_arrays> loc(0);
	int my_rank;
	static const int last = E::rank-1;
	do {
	  i[last] = 0;
	  rhs.set_location(i, loc);
	  // Innermost loop
	  for ( ; i[last] < dims[last]; ++i[last]) {
	    f.accumulate(total, rhs.next_value(loc));
	  }
	  // Advance the outer indices like an odometer
	  my_rank = E::rank-1;
	  while (--my_rank >= 0) {
	    if (++i[my_rank] >= dims[my_rank]) {
	      i[my_rank] = 0;
	    }
	    else {
	      break;
	    }
	  }
	} while (my_rank >= 0);
	f.finish(total, n);
      }
      return total;
    }


    // Reduce the specified dimension of an inactive array of rank > 1.
    //
    //   rhs         expression to be reduced
    //   reduce_dim  zero-based dimension to reduce along; must satisfy
    //               0 <= reduce_dim < E::rank
    //   total       output array of rank E::rank-1; it is resized to
    //               the dimensions of rhs with reduce_dim removed, or
    //               cleared if rhs is empty
    //
    // Throws size_mismatch if rhs has inconsistent dimensions, and
    // invalid_dimension if reduce_dim is out of range.  A negative
    // reduce_dim must be rejected explicitly: it would never match
    // any dimension in the loop that fills new_dims, so that loop
    // would write E::rank entries into an object holding only
    // E::rank-1.
    template <class Func, typename Type, class E>
    inline
    void reduce_dimension(const Expression<Type, E>& rhs, int reduce_dim,
                    Array<E::rank-1,typename Func::total_type,false>& total) {
      Func f;
      ExpressionSize<E::rank> dims;
      if (!rhs.get_dimensions(dims)) {
        std::string str = "Array size mismatch in "
          + rhs.expression_string() + ".";
        throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
      }
      else if (dims[0] == 0) {
        // Return empty array if any of these functions applied to an
        // empty array
        total.clear();
      }
      else if (reduce_dim < 0 || reduce_dim >= E::rank) {
        std::stringstream s;
        s << "In " << f.name() << "(Expression<rank="
          << E::rank << ">,dim=" << reduce_dim
          << "), dim must be non-negative and less than rank.";
        throw invalid_dimension(s.str() ADEPT_EXCEPTION_LOCATION);
      }
      else {
        // New array has the same dimensions as the input but with one
        // of the dimensions removed
        ExpressionSize<E::rank-1> new_dims;
        int jnew = 0;
        for (int j = 0; j < E::rank; ++j) {
          if (j != reduce_dim) {
            new_dims[jnew++] = dims[j];
          }
        }
        total.resize(new_dims);
        // Every element of the output starts from the policy's
        // initial value
        total = f.first_value();
        // "i" indexes the input and "inew" the output; inew tracks i
        // with the reduced dimension left out
        ExpressionSize<E::rank> i(0);
        ExpressionSize<E::rank-1> inew(0);
        ExpressionSize<E::n_arrays> loc(0);
        int my_rank;
        static const int last = E::rank-1;
        do {
          i[last] = 0;
          rhs.set_location(i, loc);
          // Innermost loop. Note that indexing of total with inew is
          // not very efficient for high-rank arrays since the
          // location must be computed from all dimensions each time.
          if (reduce_dim == last) {
            // Reducing along the innermost dimension: the whole row
            // accumulates into a single output element
            for ( ; i[last] < dims[last]; ++i[last]) {
              f.accumulate(total.get_lvalue(inew), rhs.next_value(loc));
            }
          }
          else {
            // Otherwise each element of the row belongs to a
            // different output element
            for ( inew[last-1] = 0; i[last] < dims[last];
                 ++i[last], ++inew[last-1]) {
              f.accumulate(total.get_lvalue(inew), rhs.next_value(loc));
            }
          }
          // Advancing to next innermost loop is somewhat involved
          // since we have to do something different when we reach the
          // dimension that is being reduced
          my_rank = E::rank-1;
          while (--my_rank >= 0) {
            ++i[my_rank];
            if (my_rank < reduce_dim) {
              // Dimensions before the reduced one keep the same
              // position in the output
              ++inew[my_rank];
              if (i[my_rank] >= dims[my_rank]) {
                i[my_rank] = 0;
                inew[my_rank] = 0;
              }
              else {
                break;
              }
            }
            else if (my_rank == reduce_dim) {
              // The reduced dimension has no counterpart in the output
              if (i[my_rank] >= dims[my_rank]) {
                i[my_rank] = 0;
              }
              else {
                break;
              }
            }
            // The following could be a simple "else", but sometimes
            // the compiler optimizes to the extent that it thinks
            // inew[-1] will be accessed (even though it won't),
            // leading to a warning about the array subscript being
            // out of bounds. Here the compiler knows the index must
            // be zero or positive.
            else if (my_rank > 0) {
              // Dimensions after the reduced one are shifted down by
              // one in the output
              ++inew[my_rank-1];
              if (i[my_rank] >= dims[my_rank]) {
                i[my_rank] = 0;
                inew[my_rank-1] = 0;
              }
              else {
                break;
              }
            }
          }
        } while (my_rank >= 0);

        // Apply any final operation (e.g. the division for "mean")
        // to the whole output array, passing the number of elements
        // that were reduced into each output element
        if (f.finish_needed) {
          f.finish(total, dims[reduce_dim]);
        }
      }
    }

    // Reduce the entirety of an active array
    template <class Func, typename Type, class E>
    inline
    void reduce_active(const Expression<Type, E>& rhs, Active<Type>& total) {
#ifdef ADEPT_RECORDING_PAUSABLE
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
	total.lvalue() = reduce_inactive<Func>(rhs);
	return;
      }
#endif

      Func f;
      ExpressionSize<E::rank> dims;
      if (!rhs.get_dimensions(dims)) {
	std::string str = "Array size mismatch in "
	  + rhs.expression_string() + ".";
	throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
      }
      else if (dims[0] == 0) {
	// Return zero if any of these functions applied to an empty
	// array
	total = 0;
      }
      else {
	total.set_value(f.first_value());
	Index n = dims.size();
	ExpressionSize<E::rank> i(0);
	ExpressionSize<E::n_arrays> loc(0);
	int my_rank;
	static const int last = E::rank-1;
	// Check there is enough space on the operation stack by
	// working out the cost of all the elements of the array. Note
	// that the final operation to compute the total at the end is
	// dealt with separately.
	ADEPT_ACTIVE_STACK->check_space((E::n_active + Func::extra_element_cost) * n);
	do {
	  i[last] = 0;
	  rhs.set_location(i, loc);
	  // Innermost loop
	  for ( ; i[last] < dims[last]; ++i[last]) {
	    f.accumulate_active(total, rhs, loc);
	  }
	  my_rank = E::rank-1;
	  while (--my_rank >= 0) {
	    if (++i[my_rank] >= dims[my_rank]) {
	      i[my_rank] = 0;
	    }
	    else {
	      break;
	    }
	  }
	} while (my_rank >= 0);
	if (f.active_finish_needed) {
	  f.finish_active(total, n);
	}
      }
    }

    // Reduce the specified dimension of an active array of rank > 1.
    //
    //   rhs         active expression to be reduced
    //   reduce_dim  zero-based dimension to reduce along; must satisfy
    //               0 <= reduce_dim < E::rank
    //   result      active output array of rank E::rank-1; it is
    //               resized to the dimensions of rhs with reduce_dim
    //               removed, or cleared if rhs is empty
    //
    // Throws size_mismatch if rhs has inconsistent dimensions, and
    // invalid_dimension if reduce_dim is out of range.  A negative
    // reduce_dim must be rejected explicitly since it is used
    // directly as a subscript of the index and dimension objects.
    template <class Func, typename Type, class E>
    inline
    void reduce_dimension(const Expression<Type, E>& rhs, int reduce_dim,
                Array<E::rank-1,Type,true>& result) {
#ifdef ADEPT_RECORDING_PAUSABLE
      if (!ADEPT_ACTIVE_STACK->is_recording()) {
        // This solution requires more shallow copies than are really
        // needed; could be made more efficient if Array had a member
        // function to link an pre-constructed active Array to
        // inactive data.
        Array<E::rank-1,Type,false> result_inactive;
        reduce_dimension<Func>(rhs, reduce_dim, result_inactive);
        Array<E::rank-1,Type,true> result_active(result_inactive.data(),
                                                 result_inactive.storage(),
                                                 result_inactive.dimensions(),
                                                 result_inactive.offset());
        result >>= result_active;
        return;
      }
#endif

      Func f;
      ExpressionSize<E::rank> dims;
      if (!rhs.get_dimensions(dims)) {
        std::string str = "Array size mismatch in "
          + rhs.expression_string() + ".";
        throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
      }
      else if (dims[0] == 0) {
        // Return empty array if any of these functions applied to an
        // empty array
        result.clear();
      }
      else if (reduce_dim < 0 || reduce_dim >= E::rank) {
        std::stringstream s;
        s << "In " << f.name() << "(Expression<rank="
          << E::rank << ">,dim=" << reduce_dim
          << "), dim must be non-negative and less than rank.";
        throw invalid_dimension(s.str() ADEPT_EXCEPTION_LOCATION);
      }
      else {
        // New array has the same dimensions as the input but with one
        // of the dimensions removed
        ExpressionSize<E::rank-1> new_dims;
        int jnew = 0;
        for (int j = 0; j < E::rank; ++j) {
          if (j != reduce_dim) {
            new_dims[jnew++] = dims[j];
          }
        }
        result.resize(new_dims);
        // "i" indexes the input and "inew" the output; inew tracks i
        // with the reduced dimension left out
        ExpressionSize<E::rank> i(0);
        ExpressionSize<E::rank-1> inew(0);
        ExpressionSize<E::n_arrays> loc(0);
        int my_rank;
        Active<Type> total;
        Index n = dims.size();
        // Check there is enough space on the operation stack,
        // including the per-element cost, and an additional cost to
        // finalize each individual strip of the array. Even though an
        // additional check is performed at the end of each completed
        // strip, the total number needs to be anticipated beforehand
        // (omitting this can cause memory corruption).
        ADEPT_ACTIVE_STACK->check_space((E::n_active + Func::extra_element_cost) * n + new_dims.size());
        do {
          // Each pass of this loop reduces one strip along
          // reduce_dim into a single active scalar
          i[reduce_dim] = 0;
          total = f.first_value();

          // Innermost loop.  The location is recomputed from all
          // the indices for every element, which is not very
          // efficient for high-rank arrays.
          for ( ; i[reduce_dim] < dims[reduce_dim]; ++i[reduce_dim]) {
            rhs.set_location(i, loc);
            f.accumulate_active(total, rhs, loc);
          }
          if (f.active_finish_needed) {
            f.finish_active(total, dims[reduce_dim]);
          }
          result.get_lvalue(inew) = total;

          // Advance to the next strip: step through every dimension
          // except the reduced one like an odometer, keeping inew in
          // step with i.  my_rank becomes negative once all of them
          // have wrapped round.
          my_rank = E::rank;
          while (--my_rank >= 0) {
            if (my_rank == reduce_dim) {
              // The reduced dimension is handled by the loop above
              continue;
            }
            ++i[my_rank];
            if (my_rank < reduce_dim) {
              // Dimensions before the reduced one keep the same
              // position in the output
              ++inew[my_rank];
              if (i[my_rank] >= dims[my_rank]) {
                i[my_rank] = 0;
                inew[my_rank] = 0;
              }
              else {
                break;
              }
            }
            // At this point my_rank > reduce_dim >= 0, so the test
            // below always succeeds; it is written explicitly so the
            // compiler can see that inew[my_rank-1] never has a
            // negative subscript.
            else if (my_rank > 0) {
              // Dimensions after the reduced one are shifted down by
              // one in the output
              ++inew[my_rank-1];
              if (i[my_rank] >= dims[my_rank]) {
                i[my_rank] = 0;
                inew[my_rank-1] = 0;
              }
              else {
                break;
              }
            }
          }
        } while (my_rank >= 0);
      }
    }

  }


  // -------------------------------------------------------------------
  // Section 4. Implement the functions
  // -------------------------------------------------------------------

  // Implement sum(x), sum(x,dim), mean(x), mean(x,dim) etc.
  // Different versions of the "reduce" function are called depending
  // on whether "x" is active and whether "dim" is present.

// DEFINE_REDUCE_FUNCTION(NAME, CLASSNAME) defines the overloads of the
// reduction function NAME. Each overload forwards to
// internal::reduce_inactive, internal::reduce_active or
// internal::reduce_dimension, instantiated with the functor
// internal::CLASSNAME<Type>.
#define DEFINE_REDUCE_FUNCTION(NAME, CLASSNAME)		\
  /* function(inactive): returns a plain Type */	\
  template <typename Type, class E>			\
  inline						\
  typename internal::enable_if<!E::is_active && E::rank != 0,	\
		     Type>::type			\
  NAME(const Expression<Type, E>& rhs) {		\
    return internal::reduce_inactive<internal:: CLASSNAME<Type> >(rhs);	\
  }							\
  							\
  /* function(active): returns an Active<Type> */	\
  template <typename Type, class E>			\
  inline						\
  typename internal::enable_if<E::is_active && E::rank != 0,	\
		     Active<Type> >::type		\
  NAME(const Expression<Type, E>& rhs) {		\
    Active<Type> result;				\
    internal::reduce_active<internal:: CLASSNAME<Type> >(rhs, result);	\
    return result;					\
  }							\
							\
  /* function(inactive[rank=1], dim): dim must be 0, */	\
  /* otherwise invalid_dimension is thrown */		\
  template <typename Type, class E>			\
  inline						\
  typename internal::enable_if<!E::is_active && E::rank == 1,	\
				     Type>::type	\
  NAME(const Expression<Type, E>& rhs, int dim) {	\
    if (dim != 0) {					\
      throw invalid_dimension("Two-argument reduce function applied to vector must have zero as second argument" \
			      ADEPT_EXCEPTION_LOCATION);		\
    }							\
    return internal::reduce_inactive<internal:: CLASSNAME<Type> >(rhs);	\
  }							\
  							\
  /* function(active[rank=1], dim): dim must be 0, */	\
  /* otherwise invalid_dimension is thrown */		\
  template <typename Type, class E>			\
  inline						\
  typename internal::enable_if<E::is_active && E::rank == 1,	\
		     Active<Type> >::type		\
  NAME(const Expression<Type, E>& rhs, int dim) {	\
    if (dim != 0) {					\
      throw invalid_dimension("Two-argument reduce function applied to vector must have zero as second argument" \
			    ADEPT_EXCEPTION_LOCATION);			\
    }							\
    Active<Type> result;				\
    internal::reduce_active<internal:: CLASSNAME<Type> >(rhs, result);	\
    return result;					\
  }							\
							\
  /* function(inactive[rank>1], dim) */			\
  /* function(active[rank>1], dim) */			\
  /* Both cases return an array of rank one less than the argument */ \
  template <typename Type, class E>			\
  inline						\
  typename internal::enable_if<(E::rank > 1),		\
	     Array<E::rank-1,Type,E::is_active> >::type	\
  NAME(const Expression<Type, E>& rhs, int dim) {	\
    Array<E::rank-1,Type,E::is_active> result;		\
    internal::reduce_dimension<internal:: CLASSNAME<Type> >(rhs, dim, result); \
    return result;					\
  }

  // Instantiate the overloads for each reduction: the second argument
  // names the functor template in the "internal" namespace
  DEFINE_REDUCE_FUNCTION(sum, Sum)
  DEFINE_REDUCE_FUNCTION(mean, Mean)
  DEFINE_REDUCE_FUNCTION(product, Product)
  DEFINE_REDUCE_FUNCTION(maxval, MaxVal)
  DEFINE_REDUCE_FUNCTION(minval, MinVal)
  DEFINE_REDUCE_FUNCTION(norm2, Norm2)

#undef DEFINE_REDUCE_FUNCTION


  // Implement all(x), all(x,dim), any(x) and any(x,dim).  Fewer
  // possibilities this time as no active versions.

// DEFINE_BOOL_REDUCE_FUNCTION(NAME, CLASSNAME) defines NAME(x) and
// NAME(x,dim) for boolean expressions, forwarding to
// internal::reduce_inactive and internal::reduce_dimension with the
// (non-template) functor internal::CLASSNAME.
#define DEFINE_BOOL_REDUCE_FUNCTION(NAME, CLASSNAME)	 \
  /* function(x): reduce the whole expression to one bool */ \
  template <class E>					 \
  inline bool NAME(const Expression<bool, E>& rhs)	 \
  { return internal::reduce_inactive<internal:: CLASSNAME>(rhs); }	\
  							 \
  /* function(x, dim): reduce along dimension "dim", returning */ \
  /* an inactive bool array of rank one less than x */	 \
  template <class E>					 \
  inline						 \
  Array<E::rank-1,bool,false>				 \
  NAME(const Expression<bool, E>& rhs, int dim) {	 \
    Array<E::rank-1,bool,false> result;			 \
    internal::reduce_dimension<internal:: CLASSNAME>(rhs, dim, result);	\
    return result;					 \
  }

  DEFINE_BOOL_REDUCE_FUNCTION(all, All)
  DEFINE_BOOL_REDUCE_FUNCTION(any, Any)
#undef DEFINE_BOOL_REDUCE_FUNCTION

  // count(x) differs from all(x) and any(x) in that its result is an
  // Index rather than a bool: the whole boolean expression is reduced
  // with the internal::Count functor.
  template <class E>
  inline
  Index
  count(const Expression<bool, E>& rhs) {
    const Index n_counted = internal::reduce_inactive<internal::Count>(rhs);
    return n_counted;
  }

  // count(x,dim) reduces the boolean expression x along dimension
  // "dim" with the internal::Count functor, returning an inactive
  // Index array of rank one less than x.
  template <class E>
  inline
  Array<E::rank-1,Index,false>
  count(const Expression<bool, E>& rhs, int dim) {
    typedef Array<E::rank-1,Index,false> CountArray;
    CountArray counts;
    internal::reduce_dimension<internal::Count>(rhs, dim, counts);
    return counts;
  }


  // -------------------------------------------------------------------
  // Section 5. diag_vector
  // -------------------------------------------------------------------

  // diag_vector(A,offdiag), where A is a 2D array, returns a 1D array
  // built from A's data pointer and storage object, so it refers to
  // the original data and can be used as an lvalue.  "offdiag"
  // selects the diagonal: zero (the default) is the main diagonal,
  // positive values start offdiag columns along the first row and
  // negative values start -offdiag rows down the first column.
  template <typename Type, bool IsActive>
  Array<1,Type,IsActive>
  diag_vector(Array<2,Type,IsActive>& A, Index offdiag = 0) {
    const ExpressionSize<2> shape  = A.dimensions();
    const ExpressionSize<2> stride = A.offset();
    ExpressionSize<1> diag_len, diag_stride;
    // Moving one element along a diagonal advances one row and one
    // column
    diag_stride[0] = stride[0]+stride[1];
    if (offdiag < 0) {
      // Sub-diagonal: skip the first -offdiag rows
      diag_len[0] = std::min(shape[0]+offdiag, shape[1]);
      return Array<1,Type,IsActive>(A.data()-offdiag*stride[0],
				    A.storage(), diag_len, diag_stride);
    }
    // Main or super-diagonal: skip the first offdiag columns
    diag_len[0] = std::min(shape[0], shape[1]-offdiag);
    return Array<1,Type,IsActive>(A.data()+offdiag*stride[1],
				  A.storage(), diag_len, diag_stride);
  }

  // diag_vector(A,offdiag), where A is an inactive 2D expression,
  // returns a newly allocated 1D array holding a copy of the diagonal
  // indexed by "offdiag", or of the main diagonal if offdiag is
  // missing. Cannot be used as an lvalue.  Positive offdiag selects a
  // super-diagonal whose element j is A(j,j+offdiag); negative
  // offdiag selects a sub-diagonal whose element j is A(j-offdiag,j).
  // Throws size_mismatch if the dimensions of the expression cannot
  // be determined.
  template <typename Type, class E>
  typename internal::enable_if<E::rank == 2 && !E::is_active,
			       Array<1,Type,E::is_active> >::type
  diag_vector(const Expression<Type,E>& arg, Index offdiag = 0) {
    ExpressionSize<2> dims;
    if (!arg.get_dimensions(dims)) {
      std::string str;
      str += "Array size mismatch in ";
      str += arg.expression_string();
      throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
    }

    // Row and column of the first element of the requested diagonal:
    // a sub-diagonal starts further down the first column, a
    // super-diagonal starts further along the first row.  (Previously
    // the sub-diagonal case used a negative column index.)
    const Index row0 = (offdiag < 0) ? -offdiag : 0;
    const Index col0 = (offdiag > 0) ?  offdiag : 0;
    Index new_dim = std::min(dims[0]-row0, dims[1]-col0);

    ExpressionSize<2> i;
    ExpressionSize<E::n_arrays> ind;
    Array<1,Type,E::is_active> v(new_dim);
    for (Index j = 0; j < new_dim; ++j) {
      i[0] = row0+j;
      i[1] = col0+j;
      // The location must be recomputed for every element since
      // consecutive diagonal elements are not adjacent in the
      // expression
      arg.set_location(i, ind);
      v(j) = arg.next_value(ind);
    }
    return v;
  }
  // diag_vector(A,offdiag), where A is an active 2D expression,
  // returns a newly allocated active 1D array holding a copy of the
  // diagonal indexed by "offdiag" (main diagonal by default), and
  // records each copied element on the active stack. Cannot be used
  // as an lvalue.  Positive offdiag selects a super-diagonal whose
  // element j is A(j,j+offdiag); negative offdiag selects a
  // sub-diagonal whose element j is A(j-offdiag,j).  Throws
  // size_mismatch if the dimensions of the expression cannot be
  // determined.
  template <typename Type, class E>
  typename internal::enable_if<E::rank == 2 && E::is_active,
			       Array<1,Type,E::is_active> >::type
  diag_vector(const Expression<Type,E>& arg, Index offdiag = 0) {
    ExpressionSize<2> dims;
    if (!arg.get_dimensions(dims)) {
      std::string str;
      str += "Array size mismatch in ";
      str += arg.expression_string();
      throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
    }

    // Row and column of the first element of the requested diagonal:
    // a sub-diagonal starts further down the first column, a
    // super-diagonal starts further along the first row.  (Previously
    // the sub-diagonal case used a negative column index.)
    const Index row0 = (offdiag < 0) ? -offdiag : 0;
    const Index col0 = (offdiag > 0) ?  offdiag : 0;
    Index new_dim = std::min(dims[0]-row0, dims[1]-col0);

    ExpressionSize<2> i;
    ExpressionSize<E::n_arrays> ind;
    Array<1,Type,E::is_active> v(new_dim);

    // Anticipate the number of operations that the loop will push on
    // to the stack, as the reduce functions in this file do before
    // pushing gradients
    if (new_dim > 0) {
      ADEPT_ACTIVE_STACK->check_space(E::n_active * new_dim);
    }

    for (Index j = 0; j < new_dim; ++j) {
      i[0] = row0+j;
      i[1] = col0+j;
      arg.set_location(i, ind);
      // Store the value and push the right-hand-side gradients, then
      // complete the statement with the gradient index of v(j)
      v.data()[j] = arg.next_value_and_gradient(*ADEPT_ACTIVE_STACK,ind);
      ADEPT_ACTIVE_STACK->push_lhs(v.gradient_index()+j);
    }
    return v;
  }

  // diag_matrix(v), where v is a 1D expression, first evaluates v
  // into a 1D array and then returns the result of calling
  // diag_matrix() on that array: a SpecialMatrix with a
  // BandEngine<ROW_MAJOR,0,0> engine. Cannot be used as an lvalue.
  template <typename Type, class E>
  typename internal::enable_if<E::rank == 1,
       SpecialMatrix<Type, internal::BandEngine<ROW_MAJOR,0,0>,
		    E::is_active> >::type
  diag_matrix(const Expression<Type,E>& arg) {
    typedef Array<1,Type,E::is_active> VectorType;
    VectorType diagonal = arg;
    return diagonal.diag_matrix();
  }

  // -------------------------------------------------------------------
  // Section 6. dot_product
  // -------------------------------------------------------------------
  // dot_product(l,r), where l and r are rank-1 expressions, is
  // implemented as the sum of their element-wise product.  The
  // underlying type of the result is the promotion of LType and
  // RType, passed through internal::active_scalar with a flag that is
  // true if either argument is active.
  template <typename LType, typename RType, class L, class R>
  typename internal::enable_if<L::rank == 1 && R::rank == 1,
	     typename internal::active_scalar<typename internal::promote<LType,RType>::type,
				     L::is_active || R::is_active>::type>::type
  dot_product(const Expression<LType,L>& l,
	      const Expression<RType,R>& r) {
    return sum(l*r);
  }

  // -------------------------------------------------------------------
  // Section 7. minloc
  // -------------------------------------------------------------------

  // minloc(x), where x is a rank-1 expression, returns the zero-based
  // index of the first element that is smaller than all those before
  // it, i.e. the first occurrence of the minimum.  Zero is returned
  // if no element compares less than numeric_limits<Type>::max_inf(),
  // which includes the case of an empty expression.  Throws
  // size_mismatch if the dimensions of x cannot be determined.
  template <typename Type, class E>
  inline
  typename internal::enable_if<E::rank == 1, Index>::type
  minloc(const Expression<Type, E>& rhs) {
    // Validate the expression and obtain its length
    ExpressionSize<1> extent;
    if (!rhs.get_dimensions(extent)) {
      std::string str = "Array size mismatch in "
	+ rhs.expression_string() + ".";
      throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
    }
    const Index n_elements = extent[0];

    // Best candidate found so far
    Type  best_value = internal::numeric_limits<Type>::max_inf();
    Index best_index = 0;

    // Start reading from the first element
    ExpressionSize<1> start(0);
    ExpressionSize<E::n_arrays> cursor;
    rhs.set_location(start, cursor);

    Index k = 0;
    while (k < n_elements) {
      // next_value also advances the cursor to the following element
      Type candidate = rhs.next_value(cursor);
      if (candidate < best_value) {
	best_value = candidate;
	best_index = k;
      }
      ++k;
    }
    return best_index;
  }

  // -------------------------------------------------------------------
  // Section 8. maxloc
  // -------------------------------------------------------------------

  // maxloc(x), where x is a rank-1 expression, returns the zero-based
  // index of the first element that is larger than all those before
  // it, i.e. the first occurrence of the maximum.  Zero is returned
  // if no element compares greater than
  // numeric_limits<Type>::min_inf(), which includes the case of an
  // empty expression.  Throws size_mismatch if the dimensions of x
  // cannot be determined.
  template <typename Type, class E>
  inline
  typename internal::enable_if<E::rank == 1, Index>::type
  maxloc(const Expression<Type, E>& rhs) {
    // Validate the expression and obtain its length
    ExpressionSize<1> extent;
    if (!rhs.get_dimensions(extent)) {
      std::string str = "Array size mismatch in "
	+ rhs.expression_string() + ".";
      throw size_mismatch(str ADEPT_EXCEPTION_LOCATION);
    }
    const Index n_elements = extent[0];

    // Best candidate found so far
    Type  best_value = internal::numeric_limits<Type>::min_inf();
    Index best_index = 0;

    // Start reading from the first element
    ExpressionSize<1> start(0);
    ExpressionSize<E::n_arrays> cursor;
    rhs.set_location(start, cursor);

    Index k = 0;
    while (k < n_elements) {
      // next_value also advances the cursor to the following element
      Type candidate = rhs.next_value(cursor);
      if (candidate > best_value) {
	best_value = candidate;
	best_index = k;
      }
      ++k;
    }
    return best_index;
  }

} // End namespace adept

#endif


================================================
FILE: include/adept/scalar_shortcuts.h
================================================
/* scalar_shortcuts.h -- Definitions of "shortcut" typedefs for scalar types

    Copyright (C) 2015 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/

#ifndef AdeptScalarShortcuts_H
#define AdeptScalarShortcuts_H

#include <complex>

#ifndef ADEPT_NO_AUTOMATIC_DIFFERENTIATION
// First the case when automatic differentiation is ON

#include <adept/Active.h>

namespace adept {

  // Active real scalars: aReal uses the library's default precision
  // "Real", while afloat and adouble fix the precision explicitly
  typedef Active<Real> aReal;
  typedef Active<float> afloat;
  typedef Active<double> adouble;

  // Active complex scalars, following the same naming pattern
  typedef Active<std::complex<Real> > aComplex;
  typedef Active<std::complex<float> > aComplexFloat;
  typedef Active<std::complex<double> > aComplexDouble;

  // value(x) applied to an inactive Real simply returns x
  inline Real value(Real x) { return x; }

} // End namespace adept


#else
// Second the case when automatic differentiation is OFF

#include <adept/base.h>

namespace adept {

  // With automatic differentiation off, the "active" real scalar
  // names are plain aliases of the corresponding inactive types
  typedef Real aReal;
  typedef float afloat;
  typedef double adouble;

  // Likewise for the "active" complex scalar names
  typedef std::complex<Real> aComplex;
  typedef std::complex<float> aComplexFloat;
  typedef std::complex<double> aComplexDouble;

  // Normally value(x) returns the inactive part of x, so if x is
  // inactive we simply return a constant reference to x
  template <typename T>
  inline const T& value(const T& x) { return x; }

  // Non-template overload for Real, returning x by value
  inline Real value(Real x) { return x; }

} // End namespace adept

#endif

#endif


================================================
FILE: include/adept/settings.h
================================================
/* settings.h -- View/change the overall Adept settings

    Copyright (C) 2016-2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#ifndef AdeptSettings_H
#define AdeptSettings_H 1

#include <string>

namespace adept {

  // All functions in this header are declarations only; their
  // definitions are not visible here (presumably in
  // adept/settings.cpp - TODO confirm).

  // -------------------------------------------------------------------
  // Get compiler settings
  // -------------------------------------------------------------------

  // Return the version of Adept at compile time
  std::string version();

  // Return the compiler used to compile the Adept library (e.g. "g++ 4.3.2")
  std::string compiler_version();

  // Return the compiler flags used when compiling the Adept library
  // (e.g. "-Wall -g -O3")
  std::string compiler_flags();
  
  // Return a multi-line string listing numerous aspects of the way
  // Adept has been configured.
  std::string configuration();

  // Was the library compiled with matrix multiplication support (from
  // BLAS)?
  bool have_matrix_multiplication();

  // Was the library compiled with linear algebra support (e.g. inv
  // and solve from LAPACK)?
  bool have_linear_algebra();

  // -------------------------------------------------------------------
  // Get/set number of threads for array operations
  // -------------------------------------------------------------------

  // Get the maximum number of threads available for BLAS operations
  int max_blas_threads();

  // Set the maximum number of threads available for BLAS operations
  // (zero means use the maximum sensible number on the current
  // system), and return the number actually set.  Note that OpenBLAS
  // uses pthreads and the Jacobian calculation uses OpenMP - this can
  // lead to inefficient behaviour so if you are computing Jacobians
  // then you may get better performance by setting the number of
  // BLAS threads to one.
  int set_max_blas_threads(int n);

} // End namespace adept

#endif


================================================
FILE: include/adept/solve.h
================================================
/* solve.h -- Solve systems of linear equations

    Copyright (C) 2015 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/
                             
#ifndef AdeptSolve_H
#define AdeptSolve_H 1

#include <vector>

#include <adept/Array.h>
#include <adept/SpecialMatrix.h>

namespace adept {

  // The four overloads below are declared but not defined in this
  // header; the definitions are presumably in adept/solve.cpp - TODO
  // confirm.  All take and return inactive arrays of a single
  // element type T.

  // -------------------------------------------------------------------
  // Solve Ax = b for general square matrix A
  // -------------------------------------------------------------------
  // Returns the vector x
  template <typename T>
  Array<1,T,false> 
  solve(const Array<2,T,false>& A, const Array<1,T,false>& b);

  // -------------------------------------------------------------------
  // Solve AX = B for general square matrix A and rectangular matrix B
  // -------------------------------------------------------------------
  // Returns the matrix X
  template <typename T>
  Array<2,T,false> 
  solve(const Array<2,T,false>& A, const Array<2,T,false>& B);

  // -------------------------------------------------------------------
  // Solve Ax = b for symmetric square matrix A
  // -------------------------------------------------------------------
  // A is a SpecialMatrix with a SymmEngine of either orientation
  template <typename T, SymmMatrixOrientation Orient>
  Array<1,T,false>
  solve(const SpecialMatrix<T,internal::SymmEngine<Orient>,false>& A,
	const Array<1,T,false>& b);

  // -------------------------------------------------------------------
  // Solve AX = B for symmetric square matrix A
  // -------------------------------------------------------------------
  // A is a SpecialMatrix with a SymmEngine of either orientation; B
  // is a general dense matrix
  template <typename T, SymmMatrixOrientation Orient>
  Array<2,T,false>
  solve(const SpecialMatrix<T,internal::SymmEngine<Orient>,false>& A,
	const Array<2,T,false>& B);

  // -------------------------------------------------------------------
  // Solve AX = B for symmetric square matrices A and B
  // -------------------------------------------------------------------
  // The right-hand side is first copied into a general dense matrix,
  // after which the solve(symmetric A, dense B) overload does the
  // work
  template <typename T, SymmMatrixOrientation LOrient,
    SymmMatrixOrientation ROrient>
  inline
  Array<2,T,false>
  solve(const SpecialMatrix<T,internal::SymmEngine<LOrient>,false>& A,
	const SpecialMatrix<T,internal::SymmEngine<ROrient>,false>& B) {
    typedef Array<2,T,false> DenseMatrix;
    DenseMatrix dense_rhs = B;
    return solve(A, dense_rhs);
  }

  // -------------------------------------------------------------------
  // Solve Ax = b for general expressions
  // -------------------------------------------------------------------
  // Enabled only for an inactive rank-2 left argument and an inactive
  // rank-1 right argument whose element types both satisfy
  // matrix_op_defined.  Both expressions are evaluated into arrays of
  // the promoted element type, which are then passed to the
  // array-based solve.
  template <typename LType, class L, typename RType, class R>
  typename internal::enable_if<L::rank==2 && R::rank==1
			       && !L::is_active && !R::is_active
			       && internal::matrix_op_defined<LType>::value
			       && internal::matrix_op_defined<RType>::value,
			       Array<1,typename internal::promote<LType,RType>::type,false> >::type
  solve(const Expression<LType,L>& l, const Expression<RType,R>& r) {
    typedef typename internal::promote<LType,RType>::type PromotedType;
    typedef Array<2,PromotedType,false> MatrixType;
    typedef Array<1,PromotedType,false> VectorType;
    MatrixType lhs_matrix = l.cast();
    VectorType rhs_vector = r.cast();
    return solve(lhs_matrix, rhs_vector);
  }

  // -------------------------------------------------------------------
  // Solve AX = B for general expressions
  // -------------------------------------------------------------------
  // Enabled only for inactive rank-2 arguments whose element types
  // both satisfy matrix_op_defined.  Both expressions are evaluated
  // into matrices of the promoted element type, which are then passed
  // to the array-based solve.
  template <typename LType, class L, typename RType, class R>
  typename internal::enable_if<L::rank==2 && R::rank==2
			       && !L::is_active && !R::is_active
			       && internal::matrix_op_defined<LType>::value
			       && internal::matrix_op_defined<RType>::value,
			       Array<2,typename internal::promote<LType,RType>::type,false> >::type
  solve(const Expression<LType,L>& l, const Expression<RType,R>& r) {
    typedef typename internal::promote<LType,RType>::type PromotedType;
    typedef Array<2,PromotedType,false> MatrixType;
    MatrixType lhs_matrix = l.cast();
    MatrixType rhs_matrix = r.cast();
    return solve(lhs_matrix, rhs_matrix);
  }
}

#endif


================================================
FILE: include/adept/spread.h
================================================
/* spread.h -- Spread an array into an additional dimension

    Copyright (C) 2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.
*/
                   
#ifndef AdeptSpread_H
#define AdeptSpread_H

#include <adept/Array.h>

namespace adept {

  namespace internal {
    
    // Expression representing the spread of an array into an
    // additional dimension.  The wrapped expression is converted to
    // an Array on construction, and the new dimension of length n is
    // inserted at position SpreadDim; every index along that
    // dimension maps to the same element of the wrapped array.
    template <int SpreadDim, typename Type, class E>
    class Spread : public Expression<Type, Spread<SpreadDim,Type,E> > {
      // Type of the array that holds the evaluated argument
      typedef Array<E::rank,Type,E::is_active> ArrayType;

    public:
      // Static data
      static const int  rank       = E::rank+1;
      static const bool is_active  = E::is_active;
      static const int  n_active   = ArrayType::n_active;
      static const int  n_scratch  = 0;
      static const int  n_arrays   = ArrayType::n_arrays;
      // Currently not vectorizable if the final dimension is the
      // spread dimension because the current design always has the
      // array index increasing
      static const bool is_vectorizable = (SpreadDim != E::rank);

    protected:
      // The argument of spread, stored as an array
      const ArrayType array;
      // Dimensions of the spread expression
      ExpressionSize<rank> dims;
      // Length of the inserted dimension
      Index n;

    public:
      // Construct from expression "e" to be repeated n_ times along
      // dimension SpreadDim
      Spread(const Expression<Type,E>& e, Index n_)
	: array(e.cast()), n(n_) {
	// Dimensions before the spread dimension are unchanged
	for (int i = 0; i < SpreadDim; ++i) {
	  dims[i] = array.dimension(i);
	}
	dims[SpreadDim] = n_;
	// Dimensions after the spread dimension are shifted up by one
	for (int i = SpreadDim+1; i < rank; ++i) {
	  dims[i] = array.dimension(i-1);
	}
	// Communicate empty array if n == 0
	if (n_ == 0) {
	  dims[0] = 0;
	}
      }

      // Copy the dimensions computed in the constructor into "dim";
      // always reports success
      bool get_dimensions_(ExpressionSize<rank>& dim) const {
	dim = dims;
	return true;
      }

      // Return a string of the form "spread<SpreadDim>(array,n)"
      std::string expression_string_() const {
	std::stringstream s;
	s << "spread<" << SpreadDim << ">(" << array.expression_string()
	  << "," << n << ")";
	return s.str();
      }

      // Always reports no aliasing with the memory range provided.
      // NOTE(review): this is only safe if "array" never shares data
      // with the argument of spread - confirm against the Array
      // constructor used above.
      bool is_aliased_(const Type* mem1, const Type* mem2) const {
	return false;
      }

      // The following three functions forward to the stored array

      bool all_arrays_contiguous_() const {
	return array.all_arrays_contiguous_();
      }

      bool is_aligned_() const {
	return array.is_aligned_();
      }
     
      template <int N>
      int alignment_offset_() const {
	return array.template alignment_offset_<N>();
      }

      // Do not implement value_with_len_

      // Advance only if the spread dimension is not the last
      template <int MyArrayNum, int NArrays>
      void advance_location_(ExpressionSize<NArrays>& loc) const {
	// If false this if statement should be optimized away
	if (SpreadDim < rank-1) {
	  array.template advance_location_<MyArrayNum>(loc);
	}
      }

      // The three value functions all read the element of the stored
      // array at "loc"; the scratch vector is not used since
      // n_scratch is zero
      template <int MyArrayNum, int NArrays>
      Type value_at_location_(const ExpressionSize<NArrays>& loc) const {
	return array.template value_at_location_<MyArrayNum>(loc);
      }
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_at_location_store_(const ExpressionSize<NArrays>& loc,
				    ScratchVector<NScratch>& scratch) const {
	return array.template value_at_location_<MyArrayNum>(loc);
      }
      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      Type value_stored_(const ExpressionSize<NArrays>& loc,
			 const ScratchVector<NScratch>& scratch) const {
	return array.template value_at_location_<MyArrayNum>(loc);
      }

      // Return a packet at "loc", selecting at compile time between
      // the two implementations below according to whether the spread
      // dimension is the final one
      template <int MyArrayNum, int NArrays>
      Packet<Type> 
      packet_at_location_(const ExpressionSize<NArrays>& loc) const {
	return packet_at_location_local_<SpreadDim==rank-1,MyArrayNum>(loc);

      }

    protected:

      // Version for the case when the final dimension of the spread
      // expression is the final dimension of the wrapped array: the
      // packet is read directly from the array
      template <bool IsDuplicate, int MyArrayNum, int NArrays>
      typename enable_if<!IsDuplicate, Packet<Type> >::type
      packet_at_location_local_(const ExpressionSize<NArrays>& loc) const {
	return array.template packet_at_location_<MyArrayNum>(loc);
      }

      // Version for the case when the final dimension is to be
      // "spread": a packet is constructed from a single value.  The
      // following does not work because the array location is
      // incremented for packets when we really want it to always
      // point to the start of a row.  It is deactivated by
      // is_vectorizable (above).
      template <bool IsDuplicate, int MyArrayNum, int NArrays>
      typename enable_if<IsDuplicate, Packet<Type> >::type
      packet_at_location_local_(const ExpressionSize<NArrays>& loc) const {
	return Packet<Type>(array.template value_at_location_<MyArrayNum>(loc));
      }
      
    public:

      // Set the location in the stored array corresponding to index
      // "i" of the spread expression, by dropping the element of "i"
      // for the spread dimension
      template <int MyArrayNum, int NArrays>
      void set_location_(const ExpressionSize<rank>& i, 
			 ExpressionSize<NArrays>& index) const {
	ExpressionSize<rank-1> i_array(0);
	int j = 0;
	// Indices before the spread dimension are copied directly
	for ( ; j < SpreadDim; ++j) {
	  i_array[j] = i[j];
	}
	// Indices after the spread dimension are shifted down by one
	for ( ; j < rank-1; ++j) {
	  i_array[j] = i[j+1];
	}
	array.template set_location_<MyArrayNum>(i_array, index);
      }

      // Both gradient functions forward to the stored array

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch>
      void calc_gradient_(Stack& stack, const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch) const {
	array.template calc_gradient_<MyArrayNum,MyScratchNum>(stack,loc,scratch);
      }

      template <int MyArrayNum, int MyScratchNum, int NArrays, int NScratch,
		typename MyType>
      void calc_gradient_(Stack& stack, 
			  const ExpressionSize<NArrays>& loc,
			  const ScratchVector<NScratch>& scratch,
			  MyType multiplier) const {
	array.template calc_gradient_<MyArrayNum,MyScratchNum>(stack,loc,
						      scratch,multiplier);
      }


    };
    
      
  }

  // spread<SpreadDim>(e,n) returns an expression of rank one higher
  // than "e" in which "e" is repeated n times along the new dimension
  // SpreadDim.  Only enabled for SpreadDim between 0 and the rank of
  // "e" inclusive.
  template <int SpreadDim, typename Type, class E>
  typename internal::enable_if<(SpreadDim >= 0 && SpreadDim <= E::rank),
	       internal::Spread<SpreadDim,Type,E> >::type
  spread(const Expression<Type,E>& e, Index n) {
    typedef internal::Spread<SpreadDim,Type,E> SpreadExpression;
    return SpreadExpression(e, n);
  }

  /*
  // If "spread" is applied to a scalar, we expand it to a Vector of
  // the same type
  template <int SpreadDim, typename Type>
  typename internal::enable_if<internal::is_not_expression<Type>::value,
			       Array<1,Type,false> >::type
  spread(const Type& e, Index n) {
    Array<1,Type,false> arr(n);
    arr = e;
    return arr;
  }
  */

}


#endif


================================================
FILE: include/adept/store_transpose.h
================================================
/* store_transpose.h -- Store the transpose of a vector of Packets

    Copyright (C) 2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

   Vectorization of active expressions involves storage of the
   gradients in an object of type ScratchVector<N,Packet<Real>>, which
   we need to transpose when placing on the stack.

*/

#ifndef StoreTranspose_H
#define StoreTranspose_H 1

#include <adept/Packet.h>
#include <adept/ScratchVector.h>


namespace adept {

  namespace internal {

    // Unvectorized version
    template <int Len, typename Type>
    store_transpose(ScratchVector<Len,Packet<Type> >& src, Type* dest) {
      for (int i = 0; i < Len; ++i) {
	union {
	  Packet<Type>::intrinsic_type packet;
	  Type[Packet<Type>::size]     array;
	}
	packet = src[i];
	for (int j = 0; j < Packet<Type>::size; ++j) {
	  dest[j*Len] = array[j];
	}
	++dest;
      }
    }

  }
}


#endif


================================================
FILE: include/adept/traits.h
================================================
/* traits.h -- Traits used to support array/automatic differentiation expressions

    Copyright (C) 2012-2014 University of Reading
    Copyright (C) 2015-2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#ifndef AdeptTraits_H
#define AdeptTraits_H 1

#include <complex>
#include <limits>
#include <iostream>

#include <adept/base.h>

#ifdef ADEPT_CXX11_FEATURES
#include <initializer_list>
#endif

namespace adept {

  // Forward declaration of "Active"
  template <typename T> class Active;


  // All traits are in the adept::internal namespace.  Note that many
  // of these are part of the STL in C++11 but are needed so that
  // Adept can be used with C++98 compilers.
  namespace internal {

    // ----- CONTENTS -----
    // 1. ADEPT_STATIC_ASSERT
    // 2. enable_if
    // 3. if_then_else
    // 4. is_not_expression
    // 5. is_complex
    // 6. is_active
    // 7. is_array
    // 8. is_scalar_int
    // 9. all_scalar_ints
    // 10. underlying_real
    // 11. underlying_passive
    // 12. promote
    // 13. rank_compatible
    // 14. is_same
    // 15. remove_reference
    // 16. initializer_list_rank
    // 17. matrix_op_defined
    // 18. is_floating_point
    // --------------------

    // ---------------------------------------------------------------------
    // 1. ADEPT_STATIC_ASSERT
    // ---------------------------------------------------------------------

    // Heavily templated C++ code as in the Adept library can produce
    // very long and cryptic compiler error messages. This macro is
    // useful to check for conditions that should not happen. It checks
    // that a bool known at compile time is true, and otherwise fails to
    // compile with a message that is hopefully understandable.
    // E.g. ADEPT_STATIC_ASSERT(0 > 1, ZERO_IS_NOT_GREATER_THAN_ONE)
    // would fail at compile time with a message containing
    // ERROR_ZERO_IS_NOT_GREATER_THAN_ONE, which should hopefully
    // stand out even in a long error message.

    // Helper class: the primary template, used when the condition is
    // true, provides the typedef STATIC_ASSERTION_HAS_FAILED, while
    // the specialization for false is empty so that any reference to
    // the typedef fails to compile
    template <bool Condition>
    struct compile_time_check {
      typedef int STATIC_ASSERTION_HAS_FAILED;
    };

    template <>
    struct compile_time_check<false> {
    };

    // Define the macro in which a struct is defined that inherits
    // from compile_time_check
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
    // The macro below introduces a local typedef that is never used,
    // so the corresponding GCC warning is disabled.  Warnings about
    // pragmas are switched off around that line, presumably so that
    // compilers that do not recognise "-Wunused-local-typedefs" stay
    // silent - TODO confirm.
#pragma GCC diagnostic ignored "-Wpragmas"
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
#pragma GCC diagnostic warning "-Wpragmas"
#endif
    // ADEPT_STATIC_ASSERT(condition, msg) defines a local struct
    // ERROR_msg derived from compile_time_check<condition> and then
    // refers to its STATIC_ASSERTION_HAS_FAILED typedef, which only
    // exists when the condition is true.  The do-while(0) wrapper
    // makes the macro behave as a single statement.
#define ADEPT_STATIC_ASSERT(condition, msg)				\
    do { struct ERROR_##msg : public ::adept::internal::compile_time_check<(condition)> { }; \
	typedef typename ERROR_##msg ::STATIC_ASSERTION_HAS_FAILED type; \
    } while (0)

    // ---------------------------------------------------------------------
    // 2. enable_if
    // ---------------------------------------------------------------------

    // SFINAE helper. To make a function "Type function()" available
    // only if CONDITION is true, replace "Type" in its declaration
    // with "typename enable_if<CONDITION,Type>::type". If the second
    // template argument is omitted, "type" is void.
    template <bool Condition, typename Type = void>
    struct enable_if {
      // Deliberately empty: no "type" member when Condition is false
    };

    // Partial specialization for a true condition: expose "type"
    template <typename Type>
    struct enable_if<true, Type> {
      typedef Type type;
    };


    // ---------------------------------------------------------------------
    // 3. if_then_else
    // ---------------------------------------------------------------------

    // Compile-time type selection: "if_then_else<CONDITION, YES,
    // NO>::type" is YES when CONDITION is true and NO when it is
    // false. Both YES and NO must be valid types, even the one that
    // is not selected.

    // General case, reached only for a false condition since the
    // true case is specialized below
    template <bool Condition, typename Yes, typename No>
    struct if_then_else {
      typedef No type;
    };

    // True condition: select the first type
    template <typename Yes, typename No>
    struct if_then_else<true, Yes, No> {
      typedef Yes type;
    };


    // ---------------------------------------------------------------------
    // 4. is_not_expression
    // ---------------------------------------------------------------------

    // The following enables us to provide functions that work only on
    // types *not* derived from the Expression struct:
    // "is_not_expression<E>::value" is "true" if E is not an
    // expression (i.e. it has no nested "_adept_expression_flag"
    // type) and "false" otherwise
    template <typename T>
    struct is_not_expression
    {
    private:
      // Two return types of different size, so that sizeof can tell
      // which overload of "probe" was selected
      typedef char has_flag;
      typedef struct { char array[2]; } lacks_flag;
      // Viable only if U has a nested type "_adept_expression_flag"
      template <typename U>
      static has_flag probe(typename U::_adept_expression_flag*);
      // Fallback chosen for every other type
      template <typename U>
      static lacks_flag probe(...);
    public:
      // True when the fallback overload was selected, i.e. when T
      // does not declare "_adept_expression_flag"
      static const bool value = (sizeof(probe<T>(0)) == sizeof(lacks_flag));
    };


    // ---------------------------------------------------------------------
    // 5. is_complex
    // ---------------------------------------------------------------------

    // Detect complex numbers: "is_complex<S>::value" is "true" only
    // if S is std::complex<float>, std::complex<double> or
    // std::complex<long double>, and "false" for every other type
    template <typename S>
    struct is_complex {
      static const bool value = false;
    };

    template <>
    struct is_complex<std::complex<float> > {
      static const bool value = true;
    };

    template <>
    struct is_complex<std::complex<double> > {
      static const bool value = true;
    };

    template <>
    struct is_complex<std::complex<long double> > {
      static const bool value = true;
    };


    // ---------------------------------------------------------------------
    // 6. is_active
    // ---------------------------------------------------------------------

    // Test for active types: "is_active<S>::value" is "false" for any
    // type that is not an expression, and for an expression it takes
    // the value of the static constant "expr_cast<S>::is_active".

    template <typename T> struct expr_cast; // Forward declaration

    // The primary template is intentionally empty: exactly one of
    // the two specializations below is enabled for any given T
    template <typename T, class Sfinae = void>
    struct is_active {
    };

    // Types that are not expressions
    template <typename T>
    struct is_active<T,
                     typename enable_if<is_not_expression<T>::value>::type> {
      static const bool value = false;
    };

    // Expressions: use the "is_active" constant exposed by expr_cast
    template <typename T>
    struct is_active<T,
                     typename enable_if<!is_not_expression<T>::value>::type> {
      static const bool value = expr_cast<T>::is_active;
    };
    

    // ---------------------------------------------------------------------
    // 7. is_array
    // ---------------------------------------------------------------------
    
    /*
    // "is_array<E>::value" is "true" if E is an array expression and
    // "false" otherwise.  The default case for non-expressions
    // returns false
    template <typename T, class Enable = void>
    struct is_array { };
    template <typename T>
    struct is_array<T, typename enable_if<is_not_expression<T>::value>::type>
    { static const bool value = false; };
    // Expressions define a static const bool called "is_array"
    template <typename T>
    struct is_array<T, typename enable_if<!is_not_expression<T>::value>::type>
    { static const bool value = T::is_array; };
    */

    // ---------------------------------------------------------------------
    // 8. is_scalar_int
    // ---------------------------------------------------------------------

    // "is_scalar_int<T>::value" is true if T is of integer type, or
    // is an expression of integer type with rank 0. "count" holds the
    // same information as an int (1 or 0) so that several results
    // can be summed.
    template <typename T, class Sfinae = void>
    struct is_scalar_int {
    };

    // Non-expression types: ask std::numeric_limits
    template <typename T>
    struct is_scalar_int<T,
                         typename enable_if<is_not_expression<T>::value>::type>
    {
      static const bool value = std::numeric_limits<T>::is_integer;
      static const int  count = value;
    };

    // Expressions: the element type must be an integer and the rank
    // reported by expr_cast must be zero
    template <typename T>
    struct is_scalar_int<T,
                         typename enable_if<!is_not_expression<T>::value>::type>
    {
      static const bool value
        = (std::numeric_limits<typename T::type>::is_integer
           && expr_cast<T>::rank == 0);
      static const int  count = value;
    };


    // ---------------------------------------------------------------------
    // 9. all_scalar_ints
    // ---------------------------------------------------------------------

    // all_scalar_ints<Rank,I0,I1...>::value returns true if I[0] to
    // I[Rank-1] are all scalar integers

    // Placeholder type used as the default for unused template
    // arguments
    struct null_type {
    };

    // "is_null_type<T>::value" is true (and "count" is 1) only when T
    // is null_type; for any other type they are false and 0
    template <typename T>
    struct is_null_type {
      static const bool value = false;
      static const int  count = 0;
    };

    template <>
    struct is_null_type<null_type> {
      static const bool value = true;
      static const int  count = 1;
    };

    // "value" is true when the number of scalar-integer types among
    // I0..I6 (summing is_scalar_int<I>::count) equals Rank
    template <int Rank,
              typename I0,
              typename I1 = null_type,
              typename I2 = null_type,
              typename I3 = null_type,
              typename I4 = null_type,
              typename I5 = null_type,
              typename I6 = null_type>
    struct all_scalar_ints {
      static const bool value
        = (Rank == (is_scalar_int<I0>::count
                    + is_scalar_int<I1>::count
                    + is_scalar_int<I2>::count
                    + is_scalar_int<I3>::count
                    + is_scalar_int<I4>::count
                    + is_scalar_int<I5>::count
                    + is_scalar_int<I6>::count));
    };


    // ---------------------------------------------------------------------
    // 10. underlying_real
    // ---------------------------------------------------------------------
  
    // Return the underlying real type for a complex argument:
    // "underlying_real<S>::type" returns T if S is of type
    // std::complex<T>, or returns S if it is not complex
    /*
    template <typename T>
    struct underlying_real
    {
    private:
      template <bool, typename S>
      struct _underlying_real
      { typedef S type; };
      template <typename S>
      struct _underlying_real<true, S>
      { typedef typename S::type type; };
    public:
      typedef typename _underlying_real<is_complex<T>::value,
					T>::type type;
    };
    */
    // General case: the type is returned unchanged
    template <typename S>
    struct underlying_real
    {
      typedef S type;
    };

    // std::complex<Real>: strip the complex wrapper
    template <typename Real>
    struct underlying_real<std::complex<Real> >
    {
      typedef Real type;
    };
	
    // ---------------------------------------------------------------------
    // 11. underlying_passive
    // ---------------------------------------------------------------------
  
    // Return the underlying passive type for an active argument:
    // "underlying_passive<S>::type" returns T if S is of type
    // adept::Active<T>, or returns S if it is not active.
    template <typename T>
    struct underlying_passive
    {
    private:
      // Selected when is_active<T>::value is false: keep the type
      template <bool IsActive, typename S>
      struct select {
        typedef S type;
      };
      // Selected when is_active<T>::value is true: use the nested
      // "type" of the active type
      template <typename S>
      struct select<true, S> {
        typedef typename S::type type;
      };
    public:
      typedef typename select<is_active<T>::value, T>::type type;
    };
    

    // ---------------------------------------------------------------------
    // 12. promote
    // ---------------------------------------------------------------------
  
    // "promote<L,R>::type" returns the type that a binary operation
    // (e.g. multiplication) between types L and R should result in.
    // Note that "complexity" and "precision" are promoted separately,
    // so double + std::complex<float> will result in an object of
    // type std::complex<double>.
    template <typename L, typename R>
    struct promote {
    private:
      template <typename A, typename B>
      struct promote_primitive {
	static const bool A_bigger_than_B = (sizeof(A) > sizeof(B));
	static const bool A_float_B_int = (!std::numeric_limits<A>::is_integer) 
	  && std::numeric_limits<B>::is_integer;
	static const bool A_int_B_float = std::numeric_limits<A>::is_integer
	  && (!std::numeric_limits<B>::is_integer);
	static const bool prefer_float = A_float_B_int || A_int_B_float;
	typedef typename if_then_else<A_float_B_int, A, B>::type float_type;
	typedef typename if_then_else<A_bigger_than_B, A, B>::type biggest_type;
	typedef typename if_then_else<prefer_float, float_type, biggest_type>::type type;
      };
      
      typedef typename promote_primitive<
        typename underlying_real<typename underlying_passive<L>::type>::type,
	typename underlying_real<typename underlying_passive<R>::type>::type>::type real;
      typedef typename if_then_else<is_complex<L>::value
				    || is_complex<R>::value,
				    std::complex<real>,
				    real>::type complex_type;
    public: 
      typedef typename if_then_else<is_active<L>::value || is_active<R>::value,
				    adept::Active<complex_type>, 
				    complex_type>::type type;
    };

    // If ever the template arguments are the same
    // (e.g. Packet<double>), we simply return this type
    template <typename T>
    struct promote<T,T> {
      typedef T type;
    };

  
    // ---------------------------------------------------------------------
    // 13. rank_compatible
    // ---------------------------------------------------------------------

    // "rank_compatible<LRank,RRank>::value" is true if an array of
    // rank LRank may enter an operation (e.g. addition) with an array
    // of rank RRank: either one of the ranks is zero, or the two
    // ranks are equal
    template <int LRank, int RRank>
    struct rank_compatible {
      static const bool value
        = (LRank == 0) || (RRank == 0) || (LRank == RRank);
    };


    // ---------------------------------------------------------------------
    // 14. is_same
    // ---------------------------------------------------------------------

    // "is_same<T,U>::value" is true only when T and U are exactly
    // the same type

    // General case: the types differ
    template <typename First, typename Second>
    struct is_same {
      static const bool value = false;
    };

    // Both arguments name the same type
    template <typename Type>
    struct is_same<Type, Type> {
      static const bool value = true;
    };
    

    // ---------------------------------------------------------------------
    // 15. remove_reference
    // ---------------------------------------------------------------------

    // "remove_reference<T>::type" is T with an lvalue reference
    // removed if one is present; other types are returned unchanged
    template <typename Type>
    struct remove_reference {
      typedef Type type;
    };

    template <typename Type>
    struct remove_reference<Type&> {
      typedef Type type;
    };


    // ---------------------------------------------------------------------
    // 16. initializer_list_rank
    // ---------------------------------------------------------------------
#ifdef ADEPT_CXX11_FEATURES

    // initializer_list_rank<T>::value returns 0 if T is not a
    // std::initializer_list, otherwise it returns the number of nested
    // std::initializer_list's
    // "is_initializer_list<T>::value" is true only if T is a
    // std::initializer_list of some element type
    template <typename T>
    struct is_initializer_list {
      static const bool value = false;
    };

    template <typename Element>
    struct is_initializer_list<std::initializer_list<Element> > {
      static const bool value = true;
    };

    // Primary template is empty; one of the three specializations
    // below is selected via enable_if
    template <typename T, class Sfinae = void>
    struct initializer_list_rank {
    };

    // T is not an initializer list: rank 0, element type T
    template <typename T>
    struct initializer_list_rank<
      T,
      typename enable_if<!is_initializer_list<T>::value>::type>
    {
      typedef T type;
      static const int value = 0;
    };

    // A list whose elements are not themselves lists: rank 1
    template <typename T>
    struct initializer_list_rank<
      std::initializer_list<T>,
      typename enable_if<!is_initializer_list<T>::value>::type>
    {
      typedef T type;
      static const int value = 1;
    };

    // A list of lists: one more than the rank of the element list,
    // with the innermost element type passed through as "type"
    template <typename T>
    struct initializer_list_rank<
      std::initializer_list<T>,
      typename enable_if<is_initializer_list<T>::value>::type>
    {
      typedef typename initializer_list_rank<T>::type type;
      static const int value = initializer_list_rank<T>::value + 1;
    };

#endif

    // ---------------------------------------------------------------------
    // 17. matrix_op_defined
    // ---------------------------------------------------------------------

    // "matrix_op_defined<T>::value" is true if T is float or double,
    // and false for every other type
    template <typename T>
    struct matrix_op_defined {
      static const bool value = false;
    };

    template <>
    struct matrix_op_defined<float> {
      static const bool value = true;
    };

    template <>
    struct matrix_op_defined<double> {
      static const bool value = true;
    };
 
    // ---------------------------------------------------------------------
    // 18. is_floating_point
    // ---------------------------------------------------------------------

    // "is_floating_point<T>::value" is true if T is float, double or
    // long double, and false for every other type
    template <typename T>
    struct is_floating_point {
      static const bool value = false;
    };

    template <>
    struct is_floating_point<float> {
      static const bool value = true;
    };

    template <>
    struct is_floating_point<double> {
      static const bool value = true;
    };

    template <>
    struct is_floating_point<long double> {
      static const bool value = true;
    };

  } // End namespace internal

} // End namespace adept


#endif


================================================
FILE: include/adept/vector_utilities.h
================================================
/* vector_utilities.h -- Vector utility functions

    Copyright (C) 2016 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#ifndef AdeptVectorUtilities_H
#define AdeptVectorUtilities_H

#include <adept/Array.h>

namespace adept {

  // Declaration only; the definition is not in this header.
  // Returns a rank-1 array of Real (third template argument false)
  // built from two Real values x1 and x2 and an Index n.
  // NOTE(review): by analogy with "linspace" in other numerical
  // libraries this presumably returns n values evenly spaced from x1
  // to x2 -- confirm against the definition.
  Array<1,Real,false> linspace(Real x1, Real x2, Index n);

}

#endif


================================================
FILE: include/adept/where.h
================================================
/* where.h -- Support for Fortran-90-like "where" construct

    Copyright (C) 2015 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

  
   Consider the following:

     A.where(B) = C;
     A.where(B) = either_or(C, D);

   where A is an Array, B is a boolean expression, and C and D are
   expressions, and the arrays and expressions have the same rank and
   size, except that C and/or D may have rank zero. The first line has
   the effect of setting every element of A for which B is true to the
   corresponding value in C. The second line does this but for
   elements where B is false it sets A instead to D.

   
*/


#ifndef AdeptWhere_H
#define AdeptWhere_H 1

#include <vector>

#include <adept/Expression.h>

namespace adept {

  namespace internal {


    // ---------------------------------------------------------------------
    // Section 1. EitherOr object returned by either_or function
    // ---------------------------------------------------------------------
    // Pairs the two alternatives passed to either_or(). Only
    // references are stored, so both arguments must outlive this
    // object.
    template <class C, class D>
    class EitherOr {
    public:
      // Tag type detected by is_not_either_or
      typedef bool _adept_either_or_flag;

      EitherOr(const C& if_true, const D& if_false)
        : either_(if_true), or_(if_false)
      { }

      // Value to use where the condition is true
      const C& value_if_true() const {
        return either_;
      }

      // Value to use where the condition is false
      const D& value_if_false() const {
        return or_;
      }

    protected:
      const C& either_;
      const D& or_;
    };


    // "is_not_either_or<T>::value" is false if T declares a nested
    // type "_adept_either_or_flag" (as EitherOr does) and true
    // otherwise
    template <typename T>
    struct is_not_either_or
    {
    private:
      // Two return types of different size, so that sizeof can tell
      // which overload of "probe" was selected
      typedef char has_flag;
      typedef struct { char array[2]; } lacks_flag;
      template <typename U>
      static has_flag probe(typename U::_adept_either_or_flag*);
      template <typename U>
      static lacks_flag probe(...);
    public:
      static const bool value = (sizeof(probe<T>(0)) == sizeof(lacks_flag));
    };


    // ---------------------------------------------------------------------
    // Section 2. Where class returned by A.where(B)
    // ---------------------------------------------------------------------
    // Proxy object returned by A.where(B): it holds references to
    // the array A and to the boolean expression B, and forwards
    // assignments to A.assign_conditional so that only the elements
    // selected by B are modified. Because only references are
    // stored, the object must not outlive A or B.
    template <class A, class B>
    class Where {
    public:
      Where(A& a, const B& b) : array_(a), bool_expr_(b) { }

      // A.where(B) = C: assign C to the elements of A where B is true
      template <class C>
      typename enable_if<is_not_either_or<C>::value, Where&>::type
      operator=(const C& c) {
        array_.assign_conditional(bool_expr_, c);
        return *this;
      }

      // A.where(B) = either_or(C, D): first assign D where B is
      // false, then C where B is true. This implementation could be
      // faster if bool_expr_ was not evaluated twice.
      template <class C>
      typename enable_if<!is_not_either_or<C>::value, Where&>::type
      operator=(const C& c) {
        array_.assign_conditional(!const_cast<B&>(bool_expr_), c.value_if_false());
        array_.assign_conditional(bool_expr_,  c.value_if_true());
        return *this;
      }

      // Compound assignment, e.g. A.where(B) += C, is implemented as
      // a conditional assignment of "A OP C". The left operand of OP
      // must be the wrapped array (array_), not this proxy object,
      // which provides no arithmetic operators. The array is wrapped
      // in noalias() as in the original code.
#define ADEPT_WHERE_OPERATOR(EQ_OP, OP)					\
      template <class C>						\
      typename enable_if<is_not_either_or<C>::value, Where&>::type	\
      EQ_OP(const C& c) {						\
	array_.assign_conditional(bool_expr_, noalias(array_) OP c);	\
        return *this;							\
      }									\
      template <class C>						\
      typename enable_if<!is_not_either_or<C>::value, Where&>::type	\
      EQ_OP(const C& c) {						\
	array_.assign_conditional(!const_cast<B&>(bool_expr_),		\
				  noalias(array_) OP c.value_if_false()); \
	array_.assign_conditional(bool_expr_,				\
				  noalias(array_) OP c.value_if_true()); \
	return *this;							\
      }									
      ADEPT_WHERE_OPERATOR(operator+=, +)
      ADEPT_WHERE_OPERATOR(operator-=, -)
      ADEPT_WHERE_OPERATOR(operator*=, *)
      ADEPT_WHERE_OPERATOR(operator/=, /)
#undef ADEPT_WHERE_OPERATOR

    protected:
      A& array_;            // Array whose elements are modified
      const B& bool_expr_;  // Condition selecting the elements

    };

  } // end namespace internal


  // Bundle the value to use where a condition is true (c) with the
  // value to use where it is false (d), for use on the right-hand
  // side of "A.where(B) = either_or(C, D)". The returned object
  // refers to c and d rather than copying them.
  template <class C, class D>
  internal::EitherOr<C,D> either_or(const C& c, const D& d) {
    typedef internal::EitherOr<C,D> result_type;
    return result_type(c, d);
  }

} // end namespace adept

#endif 


================================================
FILE: include/adept.h
================================================
/* adept.h -- Header file for basic scalar functionality of Adept automatic differentiation library

    Copyright (C) 2015-2016 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#ifndef Adept_H
#define Adept_H 1

#include <adept/settings.h>
#include <adept/UnaryOperation.h>
#include <adept/BinaryOperation.h>
#include <adept/Active.h>
#include <adept/scalar_shortcuts.h>

#endif


================================================
FILE: include/adept_arrays.h
================================================
/* adept_arrays.h -- Header file for array functionality of Adept automatic differentiation library

    Copyright (C) 2014-2015 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#ifndef AdeptArrays_H
#define AdeptArrays_H 1

#include <adept.h>

#include <adept/Array.h>
#include <adept/FixedArray.h>
#include <adept/reduce.h>
#include <adept/matmul.h>
#include <adept/solve.h>
#include <adept/inv.h>
#include <adept/Allocator.h>
#include <adept/interp.h>
#include <adept/spread.h>
#include <adept/outer_product.h>
#include <adept/eval.h>
#include <adept/array_shortcuts.h>
#include <adept/vector_utilities.h>

#endif


================================================
FILE: include/adept_fortran.h
================================================
/* adept_fortran.h -- Interoperability between Adept and Fortran-90 arrays

    Copyright (C) 2020 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.


   Fortran-90 introduced multi-dimensional arrays with essentially the
   same basic capabilities as passive Adept arrays, including the
   ability to index strided data in memory. The improved
   interoperability features of Fortran 2018 enable Fortran array data
   to be passed to and from C/C++. This header file enables passive
   Adept arrays to be passed to and from Fortran.

   PASSING ARRAYS FROM FORTRAN TO C++

   A C++ subroutine callable from Fortran could be declared in C++ as:

     extern "C"
     void adept_subroutine(adept::FortranArray* int_arr,
                           adept::FortranArray* dbl_arr);

   where FortranArray is a C++ class wrapping the CFI_cdesc_t type
   that contains the Fortran array descriptor. Within the definition
   of this function, Adept arrays may be associated with the Fortran
   data as follows:

     adept::intMatrix imat;
     adept::associate(imat, int_arr);
     imat >>= int_arr; // Alternative form

   In this example, the matrix of integers "imat" shares its data with
   the Fortran array int_arr. An exception will be thrown if the
   Fortran array is not of type integer and rank 2. Note that the
   array indexing of imat will be in the standard C/C++ convention,
   zero-based and with the final index varying fastest as memory is
   traversed. This is opposite to the way the array is accessed in
   Fortran.  The ">>=" provides a more succinct way to do the same
   thing.

   Consider the following: 

     adept::Matrix dmat;
     adept::associate(dmat, dbl_arr, true);

   Here, the third argument "true" indicates that the array strides of
   dmat are to be configured so that the array indices are the same as
   in Fortran (although still zero based). This will impede
   optimization of some array expressions using dmat, since the second
   dimension of dmat will not be contiguous in memory, and this is the
   dimension that Adept attempts to vectorize.

   PASSING ARRAYS FROM ADEPT TO FORTRAN

   A Fortran-implemented subroutine could be declared in C++ as
   follows:

     extern "C"
     void fort_subroutine(adept::FortranArray* int_arr,
                          adept::FortranArray* dbl_arr);

   To call this routine from C++, passing Adept arrays "imat" and
   "dmat" as the arguments, we can do simply:

     fort_subroutine(FortranArray(imat), FortranArray(dmat));

*/


#ifndef AdeptFortran_H
#define AdeptFortran_H 1

#include <complex>
#include <adept_arrays.h>

// GNU defines CFI_type_Bool as "_Bool", but this is only available in
// C99, not C++, so we make it an alias for C++'s "bool"
#ifdef __GNUC__
#include <stdbool.h>
#endif

// Load the Fortran array interface into the global namespace
#include <ISO_Fortran_binding.h>

namespace adept {

  namespace internal {
    // Helper types such that cfi_type<X>::type is the integer type
    // code (CFI_type_t) from ISO_Fortran_binding.h corresponding to
    // the C++ type "X". The primary template has no "type" member,
    // so attempting to send an array of an unsupported type to
    // Fortran fails to compile.
    template <typename Type> struct cfi_type
    { }; // Fails to compile if attempt to access "type"
    template <> struct cfi_type<char>
    { static const CFI_type_t type = CFI_type_signed_char; };
    template <> struct cfi_type<short>
    { static const CFI_type_t type = CFI_type_short; };
    template <> struct cfi_type<int>
    { static const CFI_type_t type = CFI_type_int; };
    template <> struct cfi_type<long>
    { static const CFI_type_t type = CFI_type_long; };
    template <> struct cfi_type<long long>
    { static const CFI_type_t type = CFI_type_long_long; };
    template <> struct cfi_type<bool>
    { static const CFI_type_t type = CFI_type_Bool; };
    template <> struct cfi_type<float>
    { static const CFI_type_t type = CFI_type_float; };
    template <> struct cfi_type<double>
    { static const CFI_type_t type = CFI_type_double; };
    template <> struct cfi_type<long double>
    { static const CFI_type_t type = CFI_type_long_double; };
    template <> struct cfi_type<std::complex<float> >
    { static const CFI_type_t type = CFI_type_float_Complex; };
    // Double-precision complex must use the double (not long double)
    // complex type code, otherwise the type check against a Fortran
    // complex(c_double_complex) array cannot succeed
    template <> struct cfi_type<std::complex<double> >
    { static const CFI_type_t type = CFI_type_double_Complex; };
    template <> struct cfi_type<std::complex<long double> >
    { static const CFI_type_t type = CFI_type_long_double_Complex; };
  }

  // This class is essentially a wrapper around the CFI_cdesc_t type
  // which stores a Fortran array descriptor which could be for an
  // array of any rank or type
  // Thin wrapper around a Fortran array descriptor (CFI_cdesc_t),
  // which may describe an array of any rank or type
  class FortranArray {

  protected:
    // The descriptor itself, in the variant sized for the maximum
    // rank that Fortran allows
    CFI_CDESC_T(CFI_MAX_RANK) ad;

  public:
    // An object of this class is either reached through a pointer to
    // a descriptor passed in from a Fortran routine, or is built
    // from an Adept array that is about to be passed to a Fortran
    // routine; hence there is no default constructor.
    FortranArray() = delete;

    // Describe the Adept array "a". By default the dimensions are
    // listed in reverse order, reflecting column-major access in
    // Fortran versus row-major access in C++/Adept; pass
    // preserve_dim_order=true to keep the same dimension order.
    template <int Rank, typename Type>
    FortranArray(adept::Array<Rank,Type>& a,
                 bool preserve_dim_order = false) {
      init(a, preserve_dim_order);
    }

    // Same for a const array: the descriptor cannot express
    // const-ness, so it is cast away
    template <int Rank, typename Type>
    FortranArray(const adept::Array<Rank,Type>& a,
                 bool preserve_dim_order = false) {
      adept::Array<Rank,Type>& writable
        = const_cast<adept::Array<Rank,Type>&>(a);
      init(writable, preserve_dim_order);
    }

  protected:
    // Shared constructor implementation: fill in the descriptor
    // fields from the Adept array
    template <int Rank, typename Type>
    void init(adept::Array<Rank,Type>& a, bool preserve_dim_order) {
      ADEPT_STATIC_ASSERT(Rank <= CFI_MAX_RANK, ARRAY_RANK_EXCEEDS_FORTRAN_MAXIMUM);
      ad.base_addr = static_cast<void*>(a.data());
      ad.elem_len  = sizeof(Type);
      ad.version   = CFI_VERSION;
      ad.rank      = Rank;
      ad.attribute = CFI_attribute_other;
      ad.type      = internal::cfi_type<Type>::type;
      for (int ifort = 0; ifort < Rank; ++ifort) {
        // Adept dimension described by Fortran dimension "ifort"
        const int iadept = preserve_dim_order ? ifort : Rank-ifort-1;
        ad.dim[ifort].lower_bound = 0;
        ad.dim[ifort].extent = a.dimension(iadept);
        // Adept offsets are in elements; the descriptor wants bytes
        ad.dim[ifort].sm = a.offset(iadept)*sizeof(Type);
      }
    }

  public:
    // Rank and type code stored in the descriptor
    int rank() const { return ad.rank; }
    int type_code() const { return ad.type; }

    // True if the rank in the descriptor equals Rank
    template <int Rank>
    bool is_rank() const {
      return (ad.rank == Rank);
    }

    // True if both the type code and the element length in the
    // descriptor match Type
    template <typename Type>
    bool is_type() const {
      const bool same_code = (internal::cfi_type<Type>::type == ad.type);
      return same_code && (sizeof(Type) == ad.elem_len);
    }

    // Length of dimension "idim", and its stride in memory expressed
    // in elements (the descriptor's byte stride divided by the
    // element length)
    CFI_index_t dimension(int idim) const { return ad.dim[idim].extent; }
    CFI_index_t offset(int idim) const { return ad.dim[idim].sm/ad.elem_len; }

    // Throw fortran_interoperability_error if the rank, or otherwise
    // the type, differs from the template parameters
    template <int Rank, typename Type>
    void verify() const {
      if (!is_rank<Rank>()) {
        throw fortran_interoperability_error(
           "Rank of Fortran array does not match expected rank");
      }
      if (!is_type<Type>()) {
        throw fortran_interoperability_error(
           "Type of Fortran array does not match expected type");
      }
    }

    // Pointer to the underlying data, cast to the requested Type
    template <typename Type>
    Type* data() {
      return static_cast<Type*>(ad.base_addr);
    }

    // Conversions allowing this object to be passed to a function
    // that expects a pointer to a descriptor or to a FortranArray
    operator CFI_cdesc_t*() { return reinterpret_cast<CFI_cdesc_t*>(&ad); }
    operator FortranArray*() { return this; }

  };

  // Associate Adept array "a" with Fortran array "fa" so that
  // subsequent changes to the elements of "a" will be seen within
  // Fortran when the C++ routine returns.
  // Make "a" describe the data of the Fortran array "fa". The rank
  // and type of "fa" are verified first (an exception is thrown on a
  // mismatch). Unless preserve_dim_order is true, the order of the
  // dimensions is reversed between the two.
  template <int Rank, typename Type>
  void associate(Array<Rank,Type>& a, FortranArray* fa,
                 bool preserve_dim_order = false) {
    fa->verify<Rank,Type>(); // Verify rank and type
    ExpressionSize<Rank> dims, offs;
    for (int ifort = 0; ifort < Rank; ++ifort) {
      // Adept dimension that receives Fortran dimension "ifort"
      const int iadept = preserve_dim_order ? ifort : Rank-ifort-1;
      dims[iadept] = fa->dimension(ifort);
      offs[iadept] = fa->offset(ifort);
    }
    a.clear();
    Type* fortran_data = fa->data<Type>();
    a = Array<Rank,Type>(fortran_data, 0, dims, offs);
  }

  // Associate Adept array "a" with a general Fortran array descriptor
  // "cd", noting that we only verify that the rank and type match
  // when the "associate" function above is called.
  // Overload taking a raw descriptor pointer: it is reinterpreted as
  // a FortranArray pointer and passed to the FortranArray overload,
  // which is where rank and type are verified
  template <int Rank, typename Type>
  void associate(Array<Rank,Type>& a, CFI_cdesc_t* cd,
                 bool preserve_dim_order = false) {
    associate(a, reinterpret_cast<FortranArray*>(cd), preserve_dim_order);
  }

  // Enable link of an Adept array to a Fortran array using the >>=
  // operator
  // "a >>= fa" is shorthand for associate(a, fa) with the default
  // (reversed) dimension order
  template <int Rank, typename Type>
  void operator>>=(adept::Array<Rank,Type>& a, FortranArray* fa) {
    associate(a, fa, false);
  }

  // "a >>= cd" is shorthand for associate(a, cd) with the default
  // (reversed) dimension order
  template <int Rank, typename Type>
  void operator>>=(adept::Array<Rank,Type>& a, CFI_cdesc_t* cd) {
    associate(a, cd, false);
  }

} // End namespace adept

#endif


================================================
FILE: include/adept_optimize.h
================================================
/* adept_optimize.h -- Header file for optimization algorithms of Adept library

    Copyright (C) 2020 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

    This file is part of the Adept library.

*/

#ifndef AdeptOptimize_H
#define AdeptOptimize_H 1

#include <adept_arrays.h>

#include <adept/Optimizable.h>
#include <adept/Minimizer.h>

#endif


================================================
FILE: include/create_adept_source_header
================================================
#!/bin/sh
# This script creates a header file "adept_source.h" containing the
# ../adept/*.h ../adept/*.cpp source files; why this is useful is explained below.

# Name of the header to generate; any previous copy is removed first
ADEPT_SOURCE_HEADER="adept_source.h"
rm -f "$ADEPT_SOURCE_HEADER"

echo "Creating $ADEPT_SOURCE_HEADER"

# Write the fixed preamble of the generated header, creating the file
# with ">". The text is one double-quoted string, so
# $ADEPT_SOURCE_HEADER, $0 and $(date) are expanded by the shell and
# literal double quotes are escaped as \". It ends by opening the
# AdeptSource_H include guard.
echo "/* $ADEPT_SOURCE_HEADER - Source code for the Adept library

  Copyright (C) 2012-2015 The University of Reading
  Copyright (C) 2015-     European Centre for Medium-Range Weather Forecasts

  Licensed under the Apache License, Version 2.0 (the \"License\"); you
  may not use this file except in compliance with the License.  You
  may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an \"AS IS\" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  implied.  See the License for the specific language governing
  permissions and limitations under the License.


  This file was created automatically by script $0 
  on "$(date)"

  It contains a concatenation of the source files from the Adept
  library. The idea is that a program may #include this file in one of
  its source files (typically the one containing the main function),
  and then the Adept library will be built into the executable without
  the need to link to an external library. All other source files
  should just #include <adept.h> or <adept_arrays.h>. The ability to
  use Adept in this way makes it easier to distribute an Adept package
  that is usable on non-Unix platforms that are unable to use the
  autoconf configure script to build external libraries.

  If HAVE_BLAS is defined below then matrix multiplication will be
  enabled; the BLAS library should be provided at the link stage
  although no header file is required.  If HAVE_LAPACK is defined
  below then linear algebra routines will be enabled (matrix inverse
  and solving linear systems of equations); again, the LAPACK library
  should be provided at the link stage although no header file is
  required.

*/

/* Feel free to delete this warning: */
#ifdef _MSC_FULL_VER 
#pragma message(\"warning: the adept_source.h header file has not been edited so BLAS matrix multiplication and LAPACK linear-algebra support have been disabled\")
#else
#warning \"The adept_source.h header file has not been edited so BLAS matrix multiplication and LAPACK linear-algebra support have been disabled\"
#endif

/* Uncomment this if you are linking to the BLAS library (header file
   not required) to enable matrix multiplication */
//#define HAVE_BLAS 1

/* Uncomment this if you are linking to the LAPACK library (header
   file not required) */
//#define HAVE_LAPACK 1

/* Uncomment this if you have the cblas.h header from OpenBLAS */
//#define HAVE_OPENBLAS_CBLAS_HEADER

/*

  The individual source files now follow.

*/

#ifndef AdeptSource_H
#define AdeptSource_H 1

" > $ADEPT_SOURCE_HEADER

# Append each library source file to the generated header, preceded by
# a banner naming the file it came from.  All expansions are quoted so
# that a path containing spaces or glob characters is passed through
# as a single word.
for FILE in ../config_platform_independent.h ../adept/*.h ../adept/*.cpp
do
    # A glob that matches nothing is left by the shell as the literal
    # pattern; skip it (and anything else that is not a regular file)
    # instead of writing a banner followed by a failed cat
    if [ ! -f "$FILE" ]
    then
        echo "   Skipping $FILE (not found)" >&2
        continue
    fi
    echo "   Adding $FILE"
    echo "

// =================================================================
// Contents of $(basename "$FILE")
// =================================================================
" >> "$ADEPT_SOURCE_HEADER"
    cat "$FILE" >> "$ADEPT_SOURCE_HEADER"
done

# Close the AdeptSource_H include guard opened in the preamble
echo "

#endif
" >> "$ADEPT_SOURCE_HEADER"
echo "Done"


================================================
FILE: m4/adept.m4
================================================
# ---------------------------------------------------------------------------
# FILE         : adept.m4
# COPYRIGHT    : 2018- ECMWF
# AUTHOR       : Alessio Bozzo
# LICENSE      : Apache License Version 2.0
# ----------------------------------------------------------------------------
#
# This software is licensed under the terms of the Apache Licence
# Version 2.0 which can be obtained at
# http://www.apache.org/licenses/LICENSE-2.0. In applying this
# licence, ECMWF does not waive the privileges and immunities granted
# to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.
#
# ----------------------------------------------------------------------------
#
# This file contains an Autoconf (m4) macro that enables autotools
# to locate the Adept C++ library (version 2.0.4 or greater).  The file
# should be placed in the m4 directory of your package. If you have
# aclocal.m4 in your top-level directory then it will be found
# automatically; otherwise you will need the following in your
# configure.ac file:
#
#   m4_include([m4/adept.m4])
#
# Usage is then as follows in the configure.ac file
#
#   AX_CHECK_ADEPT([ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
#
# for example:
#
#   AX_CHECK_ADEPT([have_adept=yes], [have_adept=no])
#
# This creates variables ADEPT_LDFLAGS and ADEPT_CPPFLAGS, and adds
# them to LDFLAGS and CPPFLAGS.
#
# The macro looks for the Adept library in system directories, but the
# user can specify another location by passing an argument to the
# configure script as follows:
#
#   ./configure --with-adept=/home/me/apps/adept-2.1
#
# ----------------------------------------------------------------------------

dnl AX_CHECK_ADEPT([ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
dnl --------------------------------------------------------
dnl Locate the Adept library, optionally under the directory given with
dnl --with-adept=DIR, by compiling and linking a small C++ program.
dnl
dnl On success ADEPT_LDFLAGS and ADEPT_CPPFLAGS stay prepended to LDFLAGS
dnl and CPPFLAGS and ACTION-IF-FOUND is run.  On failure LDFLAGS and
dnl CPPFLAGS are restored and ACTION-IF-NOT-FOUND is run; if no
dnl ACTION-IF-NOT-FOUND was supplied, configure stops with an error as it
dnl always did before.  ADEPT_LDFLAGS and ADEPT_CPPFLAGS are substituted
dnl in either case.
AC_DEFUN([AX_CHECK_ADEPT], [

      dnl Handle the --with-adept=DIR option passed to configure
      AC_ARG_WITH([adept],
            [AS_HELP_STRING([--with-adept=DIR], [use Adept Library from directory DIR])],
            [adept_prefix="$with_adept"],
            [])

      dnl The prefix is quoted so that a value containing spaces cannot
      dnl break the test command
      AS_IF([test "x$adept_prefix" != x],
            [AS_IF([test -d "$adept_prefix/lib"],
                  [ADEPT_LDFLAGS="-L$adept_prefix/lib -Wl,-rpath,$adept_prefix/lib -ladept"
                  ADEPT_CPPFLAGS="-I$adept_prefix/include"],
                  [test -d "$adept_prefix/lib64"],
                  [ADEPT_LDFLAGS="-L$adept_prefix/lib64 -Wl,-rpath,$adept_prefix/lib64 -ladept"
                  ADEPT_CPPFLAGS="-I$adept_prefix/include"],
                  [AC_MSG_ERROR([
  -----------------------------------------------------------------------------
     --with-adept=$adept_prefix is not a valid directory
  -----------------------------------------------------------------------------])])],
            [AC_MSG_WARN([
  -----------------------------------------------------------------------------
   Missing option `--with-adept=DIR`. Looking for Adept Library
   into Linux default library search paths
  -----------------------------------------------------------------------------])
            dnl Nothing else adds -ladept on this path so without it the
            dnl link test below could never succeed.  A value already set
            dnl by the user is left alone.
            AS_IF([test "x$ADEPT_LDFLAGS" = x], [ADEPT_LDFLAGS="-ladept"])])

      dnl Remember the flags of the caller so that they can be restored
      dnl if Adept turns out not to be usable
      ax_adept_save_LDFLAGS="$LDFLAGS"
      ax_adept_save_CPPFLAGS="$CPPFLAGS"
      LDFLAGS="$ADEPT_LDFLAGS $LDFLAGS"
      CPPFLAGS="$ADEPT_CPPFLAGS $CPPFLAGS"

      dnl The test program includes a C++ header so it has to be built
      dnl with the C++ compiler whatever language is currently selected
      AC_LANG_PUSH([C++])
      AC_MSG_CHECKING([for Adept >= 2.0.4: including adept_arrays.h and linking via -ladept])
      AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <adept_arrays.h>
      #include <string>
      #if ADEPT_VERSION < 20004
      #error "Adept version >= 2.0.4 required"
      #endif],[std::string test = adept::compiler_version()])],
            [ax_have_adept=yes],
            [ax_have_adept=no])
      AC_MSG_RESULT([$ax_have_adept])
      AC_LANG_POP([C++])

      dnl Export the flags to the generated Makefiles
      AC_SUBST([ADEPT_LDFLAGS])
      AC_SUBST([ADEPT_CPPFLAGS])

      AS_IF([test "x$ax_have_adept" = xyes],
            [$1],
            [LDFLAGS="$ax_adept_save_LDFLAGS"
            CPPFLAGS="$ax_adept_save_CPPFLAGS"
            m4_default([$2],
                  [AC_MSG_ERROR([Unable to find Adept library version >= 2.0.4])])])
      ]
)
dnl vim:set softtabstop=4 shiftwidth=4 expandtab:


================================================
FILE: m4/ax_blas.m4
================================================
# ===========================================================================
#          http://www.gnu.org/software/autoconf-archive/ax_blas.html
# ===========================================================================
#
# SYNOPSIS
#
#   AX_BLAS([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
#
# DESCRIPTION
#
#   This macro looks for a library that implements the BLAS linear-algebra
#   interface (see http://www.netlib.org/blas/). On success, it sets the
#   BLAS_LIBS output variable to hold the requisite library linkages.
#
#   To link with BLAS, you should link with:
#
#     $BLAS_LIBS $LIBS $FLIBS
#
#   in that order. FLIBS is the output variable of the
#   AC_F77_LIBRARY_LDFLAGS macro (called if necessary by AX_BLAS), and is
#   sometimes necessary in order to link with F77 libraries. Users will also
#   need to use AC_F77_DUMMY_MAIN (see the autoconf manual), for the same
#   reason.
#
#   Many libraries are searched for, from ATLAS to CXML to ESSL. The user
#   may also use --with-blas=<lib> in order to use some specific BLAS
#   library <lib>. In order to link successfully, however, be aware that you
#   will probably need to use the same Fortran compiler (which can be set
#   via the F77 env. var.) as was used to compile the BLAS library.
#
#   ACTION-IF-FOUND is a list of shell commands to run if a BLAS library is
#   found, and ACTION-IF-NOT-FOUND is a list of commands to run it if it is
#   not found. If ACTION-IF-FOUND is not specified, the default action will
#   define HAVE_BLAS.
#
# LICENSE
#
#   Copyright (c) 2008 Steven G. Johnson <stevenj@alum.mit.edu>
#
#   This program is free software: you can redistribute it and/or modify it
#   under the terms of the GNU General Public License as published by the
#   Free Software Foundation, either version 3 of the License, or (at your
#   option) any later version.
#
#   This program is distributed in the hope that it will be useful, but
#   WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
#   Public License for more details.
#
#   You should have received a copy of the GNU General Public License along
#   with this program. If not, see <http://www.gnu.org/licenses/>.
#
#   As a special exception, the respective Autoconf Macro's copyright owner
#   gives unlimited permission to copy, distribute and modify the configure
#   scripts that are the output of Autoconf when processing the Macro. You
#   need not follow the terms of the GNU General Public License when using
#   or distributing such scripts, even though portions of the text of the
#   Macro appear in them. The GNU General Public License (GPL) does govern
#   all other use of the material that constitutes the Autoconf Macro.
#
#   This special exception to the GPL applies to versions of the Autoconf
#   Macro released by the Autoconf Archive. When you make and distribute a
#   modified version of the Autoconf Macro, you may extend this special
#   exception to the GPL to apply to your modified version as well.

#serial 14

AU_ALIAS([ACX_BLAS], [AX_BLAS])
AC_DEFUN([AX_BLAS], [
AC_PREREQ(2.50)
AC_REQUIRE([AC_F77_LIBRARY_LDFLAGS])
AC_REQUIRE([AC_CANONICAL_HOST])
# Search state: "no" while still looking, "yes" once a library has been
# found, "disable" if the user passed --with-blas=no.  Every probe
# below runs only while the state is "no".
ax_blas_ok=no

AC_ARG_WITH(blas,
	[AS_HELP_STRING([--with-blas=<lib>], [use BLAS library <lib>])])
# A linker flag, a path or a library file name is used verbatim; any
# other value NAME is turned into -lNAME
case $with_blas in
	yes | "") ;;
	no) ax_blas_ok=disable ;;
	-* | */* | *.a | *.so | *.so.* | *.o) BLAS_LIBS="$with_blas" ;;
	*) BLAS_LIBS="-l$with_blas" ;;
esac

# Get fortran linker names of BLAS functions to check for.
AC_F77_FUNC(sgemm)
AC_F77_FUNC(dgemm)

# LIBS is extended with FLIBS for the duration of the probes and put
# back to this saved value afterwards
ax_blas_save_LIBS="$LIBS"
LIBS="$LIBS $FLIBS"

# First, check BLAS_LIBS environment variable
if test $ax_blas_ok = no; then
if test "x$BLAS_LIBS" != x; then
	save_LIBS="$LIBS"; LIBS="$BLAS_LIBS $LIBS"
	AC_MSG_CHECKING([for $sgemm in $BLAS_LIBS])
	AC_TRY_LINK_FUNC($sgemm, [ax_blas_ok=yes], [BLAS_LIBS=""])
	AC_MSG_RESULT($ax_blas_ok)
	LIBS="$save_LIBS"
fi
fi

# BLAS linked to by default?  (happens on some supercomputers)
if test $ax_blas_ok = no; then
	# LIBS is assigned to itself here, so this probe links with exactly
	# the libraries already in use
	save_LIBS="$LIBS"; LIBS="$LIBS"
	AC_MSG_CHECKING([if $sgemm is being linked in already])
	AC_TRY_LINK_FUNC($sgemm, [ax_blas_ok=yes])
	AC_MSG_RESULT($ax_blas_ok)
	LIBS="$save_LIBS"
fi

# BLAS in OpenBLAS library? (http://xianyi.github.com/OpenBLAS/)
if test $ax_blas_ok = no; then
	AC_CHECK_LIB(openblas, $sgemm, [ax_blas_ok=yes
			                BLAS_LIBS="-lopenblas"])
fi

# BLAS in ATLAS library? (http://math-atlas.sourceforge.net/)
if test $ax_blas_ok = no; then
	AC_CHECK_LIB(atlas, ATL_xerbla,
		[AC_CHECK_LIB(f77blas, $sgemm,
		[AC_CHECK_LIB(cblas, cblas_dgemm,
			[ax_blas_ok=yes
			 BLAS_LIBS="-lcblas -lf77blas -latlas"],
			[], [-lf77blas -latlas])],
			[], [-latlas])])
fi

# BLAS in PhiPACK libraries? (requires generic BLAS lib, too)
if test $ax_blas_ok = no; then
	AC_CHECK_LIB(blas, $sgemm,
		[AC_CHECK_LIB(dgemm, $dgemm,
		[AC_CHECK_LIB(sgemm, $sgemm,
			[ax_blas_ok=yes; BLAS_LIBS="-lsgemm -ldgemm -lblas"],
			[], [-lblas])],
			[], [-lblas])])
fi

# BLAS in Intel MKL library?
if test $ax_blas_ok = no; then
	# MKL for gfortran
	if test x"$ac_cv_fc_compiler_gnu" = xyes; then
		# 64 bit
		if test $host_cpu = x86_64; then
			AC_CHECK_LIB(mkl_gf_lp64, $sgemm,
			[ax_blas_ok=yes;BLAS_LIBS="-lmkl_gf_lp64 -lmkl_sequential -lmkl_core -lpthread"],,
			[-lmkl_gf_lp64 -lmkl_sequential -lmkl_core -lpthread])
		# 32 bit
		elif test $host_cpu = i686; then
			AC_CHECK_LIB(mkl_gf, $sgemm,
				[ax_blas_ok=yes;BLAS_LIBS="-lmkl_gf -lmkl_sequential -lmkl_core -lpthread"],,
				[-lmkl_gf -lmkl_sequential -lmkl_core -lpthread])
		fi
	# MKL for other compilers (Intel, PGI, ...?)
	else
		# 64-bit
		if test $host_cpu = x86_64; then
			AC_CHECK_LIB(mkl_intel_lp64, $sgemm,
				[ax_blas_ok=yes;BLAS_LIBS="-lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread"],,
				[-lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread])
		# 32-bit
		elif test $host_cpu = i686; then
			AC_CHECK_LIB(mkl_intel, $sgemm,
				[ax_blas_ok=yes;BLAS_LIBS="-lmkl_intel -lmkl_sequential -lmkl_core -lpthread"],,
				[-lmkl_intel -lmkl_sequential -lmkl_core -lpthread])
		fi
	fi
fi
# Old versions of MKL
if test $ax_blas_ok = no; then
	AC_CHECK_LIB(mkl, $sgemm, [ax_blas_ok=yes;BLAS_LIBS="-lmkl -lguide -lpthread"],,[-lguide -lpthread])
fi

# BLAS in Apple vecLib library?
if test $ax_blas_ok = no; then
	save_LIBS="$LIBS"; LIBS="-framework vecLib $LIBS"
	AC_MSG_CHECKING([for $sgemm in -framework vecLib])
	AC_TRY_LINK_FUNC($sgemm, [ax_blas_ok=yes;BLAS_LIBS="-framework vecLib"])
	AC_MSG_RESULT($ax_blas_ok)
	LIBS="$save_LIBS"
fi

# BLAS in Alpha CXML library?
if test $ax_blas_ok = no; then
	AC_CHECK_LIB(cxml, $sgemm, [ax_blas_ok=yes;BLAS_LIBS="-lcxml"])
fi

# BLAS in Alpha DXML library? (now called CXML, see above)
if test $ax_blas_ok = no; then
	AC_CHECK_LIB(dxml, $sgemm, [ax_blas_ok=yes;BLAS_LIBS="-ldxml"])
fi

# BLAS in Sun Performance library?
if test $ax_blas_ok = no; then
	if test "x$GCC" != xyes; then # only works with Sun CC
		AC_CHECK_LIB(sunmath, acosp,
			[AC_CHECK_LIB(sunperf, $sgemm,
				[BLAS_LIBS="-xlic_lib=sunperf -lsunmath"
                                 ax_blas_ok=yes],[],[-lsunmath])])
	fi
fi

# BLAS in SCSL library?  (SGI/Cray Scientific Library)
if test $ax_blas_ok = no; then
	AC_CHECK_LIB(scs, $sgemm, [ax_blas_ok=yes; BLAS_LIBS="-lscs"])
fi

# BLAS in SGIMATH library?
if test $ax_blas_ok = no; then
	AC_CHECK_LIB(complib.sgimath, $sgemm,
		     [ax_blas_ok=yes; BLAS_LIBS="-lcomplib.sgimath"])
fi

# BLAS in IBM ESSL library? (requires generic BLAS lib, too)
if test $ax_blas_ok = no; then
	AC_CHECK_LIB(blas, $sgemm,
		[AC_CHECK_LIB(essl, $sgemm,
			[ax_blas_ok=yes; BLAS_LIBS="-lessl -lblas"],
			[], [-lblas $FLIBS])])
fi

# Generic BLAS library?
if test $ax_blas_ok = no; then
	AC_CHECK_LIB(blas, $sgemm, [ax_blas_ok=yes; BLAS_LIBS="-lblas"])
fi

AC_SUBST(BLAS_LIBS)

# Put LIBS back as it was on entry: the result is reported only through
# BLAS_LIBS, which is not added to LIBS here
LIBS="$ax_blas_save_LIBS"

# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
if test x"$ax_blas_ok" = xyes; then
        ifelse([$1],,AC_DEFINE(HAVE_BLAS,1,[Define if you have a BLAS library.]),[$1])
        # The no-op below keeps this branch valid shell even when the
        # line above produces no shell code
        :
else
        # Any state other than "yes" (including "disable") is reported
        # to later code as a plain "no"
        ax_blas_ok=no
        $2
fi
])dnl AX_BLAS


================================================
FILE: m4/ax_lapack.m4
================================================
# ===========================================================================
#         http://www.gnu.org/software/autoconf-archive/ax_lapack.html
# ===========================================================================
#
# SYNOPSIS
#
#   AX_LAPACK([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
#
# DESCRIPTION
#
#   This macro looks for a library that implements the LAPACK linear-algebra
#   interface (see http://www.netlib.org/lapack/). On success, it sets the
#   LAPACK_LIBS output variable to hold the requisite library linkages.
#
#   To link with LAPACK, you should link with:
#
#     $LAPACK_LIBS $BLAS_LIBS $LIBS $FLIBS
#
#   in that order. BLAS_LIBS is the output variable of the AX_BLAS macro,
#   called automatically. FLIBS is the output variable of the
#   AC_F77_LIBRARY_LDFLAGS macro (called if necessary by AX_BLAS), and is
#   sometimes necessary in order to link with F77 libraries. Users will also
#   need to use AC_F77_DUMMY_MAIN (see the autoconf manual), for the same
#   reason.
#
#   The user may also use --with-lapack=<lib> in order to use some specific
#   LAPACK library <lib>. In order to link successfully, however, be aware
#   that you will probably need to use the same Fortran compiler (which can
#   be set via the F77 env. var.) as was used to compile the LAPACK and BLAS
#   libraries.
#
#   ACTION-IF-FOUND is a list of shell commands to run if a LAPACK library
#   is found, and ACTION-IF-NOT-FOUND is a list of commands to run it if it
#   is not found. If ACTION-IF-FOUND is not specified, the default action
#   will define HAVE_LAPACK.
#
# LICENSE
#
#   Copyright (c) 2009 Steven G. Johnson <stevenj@alum.mit.edu>
#
#   This program is free software: you can redistribute it and/or modify it
#   under the terms of the GNU General Public License as published by the
#   Free Software Foundation, either version 3 of the License, or (at your
#   option) any later version.
#
#   This program is distributed in the hope that it will be useful, but
#   WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
#   Public License for more details.
#
#   You should have received a copy of the GNU General Public License along
#   with this program. If not, see <http://www.gnu.org/licenses/>.
#
#   As a special exception, the respective Autoconf Macro's copyright owner
#   gives unlimited permission to copy, distribute and modify the configure
#   scripts that are the output of Autoconf when processing the Macro. You
#   need not follow the terms of the GNU General Public License when using
#   or distributing such scripts, even though portions of the text of the
#   Macro appear in them. The GNU General Public License (GPL) does govern
#   all other use of the material that constitutes the Autoconf Macro.
#
#   This special exception to the GPL applies to versions of the Autoconf
#   Macro released by the Autoconf Archive. When you make and distribute a
#   modified version of the Autoconf Macro, you may extend this special
#   exception to the GPL to apply to your modified version as well.

#serial 7

AU_ALIAS([ACX_LAPACK], [AX_LAPACK])
AC_DEFUN([AX_LAPACK], [
AC_REQUIRE([AX_BLAS])
# Search state: "no" while still looking and "yes" once found.  The
# other values set below ("disable", "noblas") make the default and
# generic searches skip, because those run only while the state is "no".
ax_lapack_ok=no

AC_ARG_WITH(lapack,
        [AS_HELP_STRING([--with-lapack=<lib>], [use LAPACK library <lib>])])
# A linker flag, a path or a library file name is used verbatim; any
# other value NAME is turned into -lNAME
case $with_lapack in
        yes | "") ;;
        no) ax_lapack_ok=disable ;;
        -* | */* | *.a | *.so | *.so.* | *.o) LAPACK_LIBS="$with_lapack" ;;
        *) LAPACK_LIBS="-l$with_lapack" ;;
esac

# Get fortran linker name of LAPACK function to check for.
AC_F77_FUNC(cheev)

# We cannot use LAPACK if BLAS is not found
if test "x$ax_blas_ok" != xyes; then
        ax_lapack_ok=noblas
        LAPACK_LIBS=""
fi

# First, check LAPACK_LIBS environment variable
# NOTE(review): unlike the searches further down, this check is not
# guarded by the search state, so it still runs after --with-lapack=no
# if LAPACK_LIBS is set in the environment - confirm this is intended
if test "x$LAPACK_LIBS" != x; then
        save_LIBS="$LIBS"; LIBS="$LAPACK_LIBS $BLAS_LIBS $LIBS $FLIBS"
        AC_MSG_CHECKING([for $cheev in $LAPACK_LIBS])
        AC_TRY_LINK_FUNC($cheev, [ax_lapack_ok=yes], [LAPACK_LIBS=""])
        AC_MSG_RESULT($ax_lapack_ok)
        LIBS="$save_LIBS"
        if test $ax_lapack_ok = no; then
                LAPACK_LIBS=""
        fi
fi

# LAPACK linked to by default?  (is sometimes included in BLAS lib)
if test $ax_lapack_ok = no; then
        save_LIBS="$LIBS"; LIBS="$LIBS $BLAS_LIBS $FLIBS"
        AC_CHECK_FUNC($cheev, [ax_lapack_ok=yes])
        LIBS="$save_LIBS"
fi

# Generic LAPACK library?
# Each candidate name is tried in turn; once one has been found the
# remaining iterations do nothing
for lapack in lapack lapack_rs6k; do
        if test $ax_lapack_ok = no; then
                save_LIBS="$LIBS"; LIBS="$BLAS_LIBS $LIBS"
                AC_CHECK_LIB($lapack, $cheev,
                    [ax_lapack_ok=yes; LAPACK_LIBS="-l$lapack"], [], [$FLIBS])
                LIBS="$save_LIBS"
        fi
done

AC_SUBST(LAPACK_LIBS)

# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
if test x"$ax_lapack_ok" = xyes; then
        ifelse([$1],,AC_DEFINE(HAVE_LAPACK,1,[Define if you have LAPACK library.]),[$1])
        # The no-op below keeps this branch valid shell even when the
        # line above produces no shell code
        :
else
        # Any state other than "yes" (including "disable" and "noblas")
        # is reported to later code as a plain "no"
        ax_lapack_ok=no
        $2
fi
])dnl AX_LAPACK


================================================
FILE: m4/ltsugar.m4
================================================
# ltsugar.m4 -- libtool m4 base layer.                         -*-Autoconf-*-
#
# Copyright (C) 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
# Written by Gary V. Vaughan, 2004
#
# This file is free software; the Free Software Foundation gives
# unlimited permission to copy and/or distribute it, with or without
# modifications, as long as this notice is preserved.

# serial 6 ltsugar.m4

# This is to help aclocal find these macros, as it can't see m4_define.
AC_DEFUN([LTSUGAR_VERSION], [m4_if([0.1])])


# lt_join(SEP, ARG1, [ARG2...])
# -----------------------------
# Produce ARG1SEPARG2...SEPARGn, omitting [] arguments and their
# associated separator.
# Needed until we can rely on m4_join from Autoconf 2.62, since all earlier
# versions in m4sugar had bugs.
#
# lt_join skips leading empty arguments by calling itself, and emits the
# first non-empty argument without a separator.  The underscore it emits
# directly after that argument joins with the macro's own name that
# follows, so the rest of the list is handed to the helper _lt_join,
# which emits every remaining non-empty argument preceded by SEP.
m4_define([lt_join],
[m4_if([$#], [1], [],
       [$#], [2], [[$2]],
       [m4_if([$2], [], [], [[$2]_])$0([$1], m4_shift(m4_shift($@)))])])
m4_define([_lt_join],
[m4_if([$#$2], [2], [],
       [m4_if([$2], [], [], [[$1$2]])$0([$1], m4_shift(m4_shift($@)))])])


# lt_car(LIST)
# lt_cdr(LIST)
# ------------
# Manipulate m4 lists.
# These macros are necessary as long as we still need to support
# Autoconf-2.59 which quotes differently.
#
# lt_car expands to its first argument, quoted.
# lt_cdr expands to the quoted list of all arguments after the first; it
# expands to nothing when given exactly one argument and raises a fatal
# error when called without arguments.
# lt_unquote expands to its first argument without quoting it again.
m4_define([lt_car], [[$1]])
m4_define([lt_cdr],
[m4_if([$#], 0, [m4_fatal([$0: cannot be called without arguments])],
       [$#], 1, [],
       [m4_dquote(m4_shift($@))])])
m4_define([lt_unquote], $1)


# lt_append(MACRO-NAME, STRING, [SEPARATOR])
# ------------------------------------------
# Redefine MACRO-NAME to hold its former content plus `SEPARATOR'`STRING'.
# Note that neither SEPARATOR nor STRING are expanded; they are appended
# to MACRO-NAME as is (leaving the expansion for when MACRO-NAME is invoked).
# No SEPARATOR is output if MACRO-NAME was previously undefined (different
# than defined and empty).
#
# This macro is needed until we can rely on Autoconf 2.62, since earlier
# versions of m4sugar mistakenly expanded SEPARATOR but not STRING.
#
# The former content is fetched with m4_defn, so it is copied into the
# new definition as it stands rather than being expanded.
m4_define([lt_append],
[m4_define([$1],
	   m4_ifdef([$1], [m4_defn([$1])[$3]])[$2])])


# lt_combine(SEP, PREFIX-LIST, INFIX, SUFFIX1, [SUFFIX2...])
# ----------------------------------------------------------
# Produce a SEP delimited list of all paired combinations of elements of
# PREFIX-LIST with SUFFIX1 through SUFFIXn.  Each element of the list
# has the form PREFIXmINFIXSUFFIXn.
# Needed until we can rely on m4_combine added in Autoconf 2.62.
#
# Nothing is produced unless more than three arguments are given.
# _Lt_sep is pushed as a macro that emits nothing the first time it is
# used and redefines itself as lt_car, so that SEP is output in front of
# every element except the first.
m4_define([lt_combine],
[m4_if(m4_eval([$# > 3]), [1],
       [m4_pushdef([_Lt_sep], [m4_define([_Lt_sep], m4_defn([lt_car]))])]]dnl
[[m4_foreach([_Lt_prefix], [$2],
	     [m4_foreach([_Lt_suffix],
		]m4_dquote(m4_dquote(m4_shift(m4_shift(m4_shift($@)))))[,
	[_Lt_sep([$1])[]m4_defn([_Lt_prefix])[$3]m4_defn([_Lt_suffix])])])])])


# lt_if_append_uniq(MACRO-NAME, VARNAME, [SEPARATOR], [UNIQ], [NOT-UNIQ])
# -----------------------------------------------------------------------
# Iff MACRO-NAME does not yet contain VARNAME, then append it (delimited
# by SEPARATOR if supplied) and expand UNIQ, else NOT-UNIQ.
#
# If MACRO-NAME is not defined yet, VARNAME is appended and UNIQ is
# expanded.  Otherwise both the current content and VARNAME are wrapped
# in SEPARATOR before searching, so that only a complete element of the
# list counts as a match and not a substring of one.
m4_define([lt_if_append_uniq],
[m4_ifdef([$1],
	  [m4_if(m4_index([$3]m4_defn([$1])[$3], [$3$2$3]), [-1],
		 [lt_append([$1], [$2], [$3])$4],
		 [$5])],
	  [lt_append([$1], [$2], [$3])$4])])


# lt_dict_add(DICT, KEY, VALUE)
# -----------------------------
# Store VALUE under KEY in DICT.  The entry is kept as an m4 macro whose
# name is DICT followed by KEY in parentheses.
m4_define([lt_dict_add],
[m4_define([$1($2)], [$3])])


# lt_dict_add_subkey(DICT, KEY, SUBKEY, VALUE)
# --------------------------------------------
# Store VALUE under the pair KEY and SUBKEY in DICT.  The entry is kept
# as an m4 macro whose name is DICT followed by KEY:SUBKEY in
# parentheses.
m4_define([lt_dict_add_subkey],
[m4_define([$1($2:$3)], [$4])])


# lt_dict_fetch(DICT, KEY, [SUBKEY])
# ----------------------------------
# Expand to the value stored in DICT under KEY, or under the pair KEY
# and SUBKEY when SUBKEY is not empty.  Expands to nothing if that
# entry has not been defined.
m4_define([lt_dict_fetch],
[m4_ifval([$3],
	m4_ifdef([$1($2:$3)], [m4_defn([$1($2:$3)])]),
    m4_ifdef([$1($2)], [m4_defn([$1($2)])]))])


# lt_if_dict_fetch(DICT, KEY, [SUBKEY], VALUE, IF-TRUE, [IF-FALSE])
# -----------------------------------------------------------------
# Look up KEY (and SUBKEY, if given) in DICT with lt_dict_fetch and
# expand IF-TRUE when the result equals VALUE, otherwise IF-FALSE.
m4_define([lt_if_dict_fetch],
[m4_if(lt_dict_fetch([$1], [$2], [$3]), [$4],
	[$5],
    [$6])])


# lt_dict_filter(DICT, [SUBKEY], VALUE, [SEPARATOR], KEY, [...])
# --------------------------------------------------------------
# Expand to those of the given KEYs whose entry in DICT (under SUBKEY,
# if given) equals VALUE, joined with SEPARATOR.  SEPARATOR defaults to
# a comma followed by a space.  Expands to nothing when no KEY is given.
m4_define([lt_dict_filter],
[m4_if([$5], [], [],
  [lt_join(m4_quote(m4_default([$4], [[, ]])),
           lt_unquote(m4_split(m4_normalize(m4_foreach(_Lt_key, lt_car([m4_shiftn(4, $@)]),
		      [lt_if_dict_fetch([$1], _Lt_key, [$2], [$3], [_Lt_key ])])))))])[]dnl
])


================================================
FILE: m4/lt~obsolete.m4
================================================
# lt~obsolete.m4 -- aclocal satisfying obsolete definitions.    -*-Autoconf-*-
#
#   Copyright (C) 2004, 2005, 2007, 2009 Free Software Foundation, Inc.
#   Written by Scott James Remnant, 2004.
#
# This file is free software; the Free Software Foundation gives
# unlimited permission to copy and/or distribute it, with or without
# modifications, as long as this notice is preserved.

# serial 5 lt~obsolete.m4

# These exist entirely to fool aclocal when bootstrapping libtool.
#
# In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN)
# which have later been changed to m4_define as they aren't part of the
# exported API, or moved to Autoconf or Automake where they belong.
#
# The trouble is, aclocal is a bit thick.  It'll see the old AC_DEFUN
# in /usr/share/aclocal/libtool.m4 and remember it, then when it sees us
# using a macro with the same name in our local m4/libtool.m4 it'll
# pull the old libtool.m4 in (it doesn't see our shiny new m4_define
# and doesn't know about Autoconf macros at all.)
#
# So we provide this file, which has a silly filename so it's always
# included after everything else.  This provides aclocal with the
# AC_DEFUNs it wants, but when m4 processes it, it doesn't do anything
# because those macros already exist, or will be overwritten later.
# We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6. 
#
# Anytime we withdraw an AC_DEFUN or AU_DEFUN, remember to add it here.
# Yes, that means every name once taken will need to remain here until
# we give up compatibility with versions before 1.7, at which point
# we need to keep only those names which we still refer to.

# This is to help aclocal find these macros, as it can't see m4_define.
AC_DEFUN([LTOBSOLETE_VERSION], [m4_if([1])])

m4_ifndef([AC_LIBTOOL_LINKER_OPTION],	[AC_DEFUN([AC_LIBTOOL_LINKER_OPTION])])
m4_ifndef([AC_PROG_EGREP],		[AC_DEFUN([AC_PROG_EGREP])])
m4_ifndef([_LT_AC_PROG_ECHO_BACKSLASH],	[AC_DEFUN([_LT_AC_PROG_ECHO_BACKSLASH])])
m4_ifndef([_LT_AC_SHELL_INIT],		[AC_DEFUN([_LT_AC_SHELL_INIT])])
m4_ifndef([_LT_AC_SYS_LIBPATH_AIX],	[AC_DEFUN([_LT_AC_SYS_LIBPATH_AIX])])
m4_ifndef([_LT_PROG_LTMAIN],		[AC_DEFUN([_LT_PROG_LTMAIN])])
m4_ifndef([_LT_AC_TAGVAR],		[AC_DEFUN([_LT_AC_TAGVAR])])
m4_ifndef([AC_LTDL_ENABLE_INSTALL],	[AC_DEFUN([AC_LTDL_ENABLE_INSTALL])])
m4_ifndef([AC_LTDL_PREOPEN],		[AC_DEFUN([AC_LTDL_PREOPEN])])
m4_ifndef([_LT_AC_SYS_COMPILER],	[AC_DEFUN([_LT_AC_SYS_COMPILER])])
m4_ifndef([_LT_AC_LOCK],		[AC_DEFUN([_LT_AC_LOCK])])
m4_ifndef([AC_LIBTOOL_SYS_OLD_ARCHIVE],	[AC_DEFUN([AC_LIBTOOL_SYS_OLD_ARCHIVE])])
m4_ifndef([_LT_AC_TRY_DLOPEN_SELF],	[AC_DEFUN([_LT_AC_TRY_DLOPEN_SELF])])
m4_ifndef([AC_LIBTOOL_PROG_CC_C_O],	[AC_DEFUN([AC_LIBTOOL_PROG_CC_C_O])])
m4_ifndef([AC_LIBTOOL_SYS_HARD_LINK_LOCKS], [AC_DEFUN([AC_LIBTOOL_SYS_HARD_LINK_LOCKS])])
m4_ifndef([AC_LIBTOOL_OBJDIR],		[AC_DEFUN([AC_LIBTOOL_OBJDIR])])
m4_ifndef([AC_LTDL_OBJDIR],		[AC_DEFUN([AC_LTDL_OBJDIR])])
m4_ifndef([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH], [AC_DEFUN([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH])])
m4_ifndef([AC_LIBTOOL_SYS_LIB_STRIP],	[AC_DEFUN([AC_LIBTOOL_SYS_LIB_STRIP])])
m4_ifndef([AC_PATH_MAGIC],		[AC_DEFUN([AC_PATH_MAGIC])])
m4_ifndef([AC_PROG_LD_GNU],		[AC_DEFUN([AC_PROG_LD_GNU])])
m4_ifndef([AC_PROG_LD_RELOAD_FLAG],	[AC_DEFUN([AC_PROG_LD_RELOAD_FLAG])])
m4_ifndef([AC_DEPLIBS_CHECK_METHOD],	[AC_DEFUN([AC_DEPLIBS_CHECK_METHOD])])
m4_ifndef([AC_LIBTOOL_PROG_COMPILER_NO_RTTI], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_NO_RTTI])])
m4_ifndef([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE], [AC_DEFUN([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE])])
m4_ifndef([AC_LIBTOOL_PROG_COMPILER_PIC], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_PIC])])
m4_ifndef([AC_LIBTOOL_PROG_LD_SHLIBS],	[AC_DEFUN([AC_LIBTOOL_PROG_LD_SHLIBS])])
m4_ifndef([AC_LIBTOOL_POSTDEP_PREDEP],	[AC_DEFUN([AC_LIBTOOL_POSTDEP_PREDEP])])
m4_ifndef([LT_AC_PROG_EGREP],		[AC_DEFUN([LT_AC_PROG_EGREP])])
m4_ifndef([LT_AC_PROG_SED],		[AC_DEFUN([LT_AC_PROG_SED])])
m4_ifndef([_LT_CC_BASENAME],		[AC_DEFUN([_LT_CC_BASENAME])])
m4_ifndef([_LT_COMPILER_BOILERPLATE],	[AC_DEFUN([_LT_COMPILER_BOILERPLATE])])
m4_ifndef([_LT_LINKER_BOILERPLATE],	[AC_DEFUN([_LT_LINKER_BOILERPLATE])])
m4_ifndef([_AC_PROG_LIBTOOL],		[AC_DEFUN([_AC_PROG_LIBTOOL])])
m4_ifndef([AC_LIBTOOL_SETUP],		[AC_DEFUN([AC_LIBTOOL_SETUP])])
m4_ifndef([_LT_AC_CHECK_DLFCN],		[AC_DEFUN([_LT_AC_CHECK_DLFCN])])
m4_ifndef([AC_LIBTOOL_SYS_DYNAMIC_LINKER],	[AC_DEFUN([AC_LIBTOOL_SYS_DYNAMIC_LINKER])])
m4_ifndef([_LT_AC_TAGCONFIG],		[AC_DEFUN([_LT_AC_TAGCONFIG])])
m4_ifndef([AC_DISABLE_FAST_INSTALL],	[AC_DEFUN([AC_DISABLE_FAST_INSTALL])])
m4_ifndef([_LT_AC_LANG_CXX],		[AC_DEFUN([_LT_AC_LANG_CXX])])
m4_ifndef([_LT_AC_LANG_F77],		[AC_DEFUN([_LT_AC_LANG_F77])])
m4_ifndef([_LT_AC_LANG_GCJ],		[AC_DEFUN([_LT_AC_LANG_GCJ])])
m4_ifndef([AC_LIBTOOL_LANG_C_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_C_CONFIG])])
m4_ifndef([_LT_AC_LANG_C_CONFIG],	[AC_DEFUN([_LT_AC_LANG_C_CONFIG])])
m4_ifndef([AC_LIBTOOL_LANG_CXX_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_CXX_CONFIG])])
m4_ifndef([_LT_AC_LANG_CXX_CONFIG],	[AC_DEFUN([_LT_AC_LANG_CXX_CONFIG])])
m4_ifndef([AC_LIBTOOL_LANG_F77_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_F77_CONFIG])])
m4_ifndef([_LT_AC_LANG_F77_CONFIG],	[AC_DEFUN([_LT_AC_LANG_F77_CONFIG])])
m4_ifndef([AC_LIBTOOL_LANG_GCJ_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_GCJ_CONFIG])])
m4_ifndef([_LT_AC_LANG_GCJ_CONFIG],	[AC_DEFUN([_LT_AC_LANG_GCJ_CONFIG])])
m4_ifndef([AC_LIBTOOL_LANG_RC_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_RC_CONFIG])])
m4_ifndef([_LT_AC_LANG_RC_CONFIG],	[AC_DEFUN([_LT_AC_LANG_RC_CONFIG])])
m4_ifndef([AC_LIBTOOL_CONFIG],		[AC_DEFUN([AC_LIBTOOL_CONFIG])])
m4_ifndef([_LT_AC_FILE_LTDLL_C],	[AC_DEFUN([_LT_AC_FILE_LTDLL_C])])
m4_ifndef([_LT_REQUIRED_DARWIN_CHECKS],	[AC_DEFUN([_LT_REQUIRED_DARWIN_CHECKS])])
m4_ifndef([_LT_AC_PROG_CXXCPP],		[AC_DEFUN([_LT_AC_PROG_CXXCPP])])
m4_ifndef([_LT_PREPARE_SED_QUOTE_VARS],	[AC_DEFUN([_LT_PREPARE_SED_QUOTE_VARS])])
m4_ifndef([_LT_PROG_ECHO_BACKSLASH],	[AC_DEFUN([_LT_PROG_ECHO_BACKSLASH])])
m4_ifndef([_LT_PROG_F77],		[AC_DEFUN([_LT_PROG_F77])])
m4_ifndef([_LT_PROG_FC],		[AC_DEFUN([_LT_PROG_FC])])
m4_ifndef([_LT_PROG_CXX],		[AC_DEFUN([_LT_PROG_CXX])])


================================================
FILE: makefile_include.in
================================================
# Template for configure to create makefile_include, which is included
# by test/Makefile and benchmark/Makefile

# Every value below is a placeholder that configure replaces with the
# setting it detected for this build

# Tools and their flags
AR = @AR@
CC = @CC@
CFLAGS = @CFLAGS@
CPP = @CPP@
CPPFLAGS = @CPPFLAGS@
CXX = @CXX@
CXXCPP = @CXXCPP@
# The OpenMP compiler flags are appended to the C++ compile flags...
CXXFLAGS = @CXXFLAGS@ @OPENMP_CXXFLAGS@
DEFS = @DEFS@
LD = @LD@
# ...and to the link flags as well
LDFLAGS = @LDFLAGS@ @OPENMP_CXXFLAGS@
LIBOBJS = @LIBOBJS@
LIBS = @LIBS@
SHELL = @SHELL@
LIBTOOL = @LIBTOOL@
# test/Makefile builds test_gsl_interface only when this is "yes"
USE_GSL = @USE_GSL@


================================================
FILE: test/Makefile
================================================
# Makefile for example programs that demonstrate different features of
# the Adept library
#
# Note that this Makefile is hand-coded rather than being generated by
# automake
#
# The -DADEPT_RECORDING_PAUSABLE option enables the pause_recording
# and continue_recording functionality and is used by test_adept,
# although it will run correctly (but slightly more slowly) without
# this flag

# The configure script writes the following file, which contains
# variables controlling the compilation
include ../makefile_include

# Uncomment the following to check what happens if thread safety is
# disabled
# ADEPT_FLAGS = -DADEPT_STACK_THREAD_UNSAFE

# Object files compiled from the sources in this directory
OBJECTS = \
	algorithm.o \
	algorithm_noad.o \
	test_checkpoint.o \
	test_adept.o \
	test_adept_with_and_without_ad.o \
	test_radiances.o \
	simulate_radiances.o \
	test_thread_safe.o \
	test_no_lib.o \
	test_misc.o \
	test_arrays.o \
	test_arrays_active.o \
	test_array_speed.o \
	test_arrays_active_pausable.o \
	test_fixed_arrays_active.o \
	test_radiances_array.o \
	test_fixed_arrays.o \
	test_constructors.o \
	test_derivatives.o \
	test_array_derivatives.o \
	test_thread_safe_arrays.o \
	test_complex_arrays.o \
	test_packet_operations.o \
	test_fastexp.o \
	test_reduce_active.o \
	test_minimizer.o \
	test_interp.o

# Objects and library needed only by the GSL interface test
GSL_OBJECTS = \
	test_gsl_interface.o \
	state.o \
	rosenbrock_banana_function.o

GSL_LIBS = -lgsl

# Flags for compiling a source file: the headers are taken from the
# source tree, and ADEPT_FLAGS carries any optional -D settings
COMPILE_FLAGS = $(CXXFLAGS) -I../include $(CPPFLAGS) $(ADEPT_FLAGS)

# The test programs are never installed and have to run even when
# Adept itself is not installed, so libtool is asked to link them
# statically against the library in the build tree
top_builddir = ..
CXXLINK = $(LIBTOOL) --tag=CXX --mode=link \
	$(CXX) $(CXXFLAGS) -static -no-install \
	-L../adept/.libs $(LDFLAGS) -ladept \
	-o $@

# Link command for a program that must not use the Adept library
CXXLINK_NOLIB = $(LIBTOOL) --tag=CXX --mode=link \
	$(CXX) $(CXXFLAGS) $(LDFLAGS) \
	-o $@

# Listing the static Adept library as a prerequisite makes a program
# relink whenever the library is rebuilt
LIBADEPT = ../adept/.libs/libadept.a

MYLIBS = $(LIBS)

# Programs built and run by "make check", in the order in which they
# are passed to the test script
PROGRAMS = \
	test_adept \
	test_adept_with_and_without_ad \
	test_radiances \
	test_gsl_interface \
	test_misc \
	test_checkpoint \
	test_thread_safe \
	test_array_speed \
	test_no_lib \
	test_radiances_array \
	test_constructors \
	test_arrays \
	test_arrays_active \
	test_arrays_active_pausable \
	test_fixed_arrays \
	test_fixed_arrays_active \
	test_derivatives \
	test_array_derivatives \
	test_thread_safe_arrays \
	test_complex_arrays \
	test_packet_operations \
	test_fastexp \
	test_reduce_active \
	test_minimizer \
	test_interp

# These targets do not correspond to files, so always run their recipes
.PHONY: all check run-tests

all:
	@echo "********************************************************"
	@echo "*** To compile test programs in test/ and benchmark/ ***"
	@echo "*** type \"make check\"                                ***"
	@echo "********************************************************"

# Compile all the test programs and then run them
check: $(PROGRAMS) run-tests

# The programs are prerequisites of run-tests itself (not only of
# "check") so that a parallel build ("make -j check") cannot start
# running the tests before they have all been linked
run-tests: $(PROGRAMS)
	./run_tests.sh $(PROGRAMS)

# Test program 1
test_adept: algorithm.o test_adept.o $(LIBADEPT)
	$(CXXLINK) algorithm.o test_adept.o $(MYLIBS)

# Test program 2
test_adept_with_and_without_ad: algorithm.o algorithm_noad.o test_adept_with_and_without_ad.o $(LIBADEPT)
	$(CXXLINK) algorithm.o algorithm_noad.o test_adept_with_and_without_ad.o $(MYLIBS)

# Test program 3
test_radiances: simulate_radiances.o test_radiances.o $(LIBADEPT)
	$(CXXLINK) simulate_radiances.o test_radiances.o $(MYLIBS)

ifeq "X$(USE_GSL)" "Xyes"
# Test program 4
test_gsl_interface: $(GSL_OBJECTS) $(LIBADEPT)
	$(CXXLINK) $(GSL_OBJECTS) $(GSL_LIBS) $(MYLIBS)
else
test_gsl_interface:
	@echo "The executable test_gsl_interface will not be created because GSL library was not found"
endif

# Test program 5
test_misc: test_misc.o algorithm.o $(LIBADEPT)
	$(CXXLINK) test_misc.o algorithm.o $(MYLIBS)

# Test program 6
test_checkpoint: test_checkpoint.o $(LIBADEPT)
	$(CXXLINK) test_checkpoint.o $(MYLIBS)

# Test program 7
test_thread_safe: test_thread_safe.o $(LIBADEPT)
	$(CXXLINK) test_thread_safe.o $(MYLIBS)

# Test program 8 (note that it is not linked against the Adept library)
test_no_lib: test_no_lib.o algorithm.o
	$(CXXLINK_NOLIB) test_no_lib.o algorithm.o $(MYLIBS)

# Test program 9a
test_arrays: test_arrays.o $(LIBADEPT)
	$(CXXLINK) test_arrays.o $(MYLIBS)

# Test program 9b
test_arrays_active: test_arrays_active.o $(LIBADEPT)
	$(CXXLINK) test_arrays_active.o $(MYLIBS)

# Test program 9c
test_arrays_active_pausable: test_arrays_active_pausable.o $(LIBADEPT)
	$(CXXLINK) test_arrays_active_pausable.o $(MYLIBS)

# Test program 9d
test_complex_arrays: test_complex_arrays.o $(LIBADEPT)
	$(CXXLINK) test_complex_arrays.o $(MYLIBS)

# Test program 10
test_array_speed: test_array_speed.o $(LIBADEPT)
	$(CXXLINK) test_array_speed.o $(MYLIBS)


# Test program 11
test_radiances_array: simulate_radiances.o test_radiances_array.o $(LIBADEPT)
	$(CXXLINK) simulate_radiances.o test_radiances_array.o $(MYLIBS)

# Test program 12a
test_fixed_arrays: test_fixed_arrays.o $(LIBADEPT)
	$(CXXLINK) test_fixed_arrays.o $(MYLIBS)

# Test program 12b
test_fixed_arrays_active: test_fixed_arrays_active.o $(LIBADEPT)
	$(CXXLINK) test_fixed_arrays_active.o $(MYLIBS)

# Test program 13
test_constructors: test_constructors.o $(LIBADEPT)
	$(CXXLINK) test_constructors.o $(MYLIBS)

# Test program 14
test_derivatives: test_derivatives.o $(LIBADEPT)
	$(CXXLINK) test_derivatives.o $(MYLIBS)

# Test program 15
test_array_derivatives: test_array_derivatives.o $(LIBADEPT)
	$(CXXLINK) test_array_derivatives.o $(MYLIBS)

# Test program 16
test_thread_safe_arrays: test_thread_safe_arrays.o $(LIBADEPT)
	$(CXXLINK) test_thread_safe_arrays.o $(MYLIBS)

# Test program 17
test_packet_operations: test_packet_operations.o $(LIBADEPT)
	$(CXXLINK) test_packet_operations.o $(MYLIBS)

# Test program 18
test_fastexp: test_fastexp.o $(LIBADEPT)
	$(CXXLINK) test_fastexp.o $(MYLIBS)

# Test program 19
test_reduce_active: test_reduce_active.o $(LIBADEPT)
	$(CXXLINK) test_reduce_active.o $(MYLIBS)

# Test program 20
test_minimizer: test_minimizer.o $(LIBADEPT)
	$(CXXLINK) test_minimizer.o $(MYLIBS)

# Test program 21
test_interp: test_interp.o $(LIBADEPT)
	$(CXXLINK) test_interp.o $(MYLIBS)

# The no-automatic-differentiation version of the algorithm: uses the
# -DADEPT_NO_AUTOMATIC_DIFFERENTIATION to produce a version of the
# algorithm that takes double rather than adouble arguments
algorithm_noad.o: algorithm.cpp *.h ../include/adept.h
	$(CXX) $(COMPILE_FLAGS) $(INCLUDES) -c algorithm.cpp -DADEPT_NO_AUTOMATIC_DIFFERENTIATION -o $@

# All other object files created by compiling the corresponding source
# file without this flag
%.o: %.cpp *.h ../include/*.h ../include/adept/*.h
	$(CXX) $(COMPILE_FLAGS) $(INCLUDES) -c $<

test_arrays_active.o: test_arrays.cpp  *.h ../include/*.h ../include/adept/*.h
	$(CXX) $(COMPILE_FLAGS) -DALL_ACTIVE $(INCLUDES) -c test_arrays.cpp -o test_arrays_active.o

# test_arrays.cpp compiled with all arrays active and with the
# pause_recording/continue_recording functionality enabled
test_arrays_active_pausable.o: test_arrays.cpp  *.h ../include/*.h ../include/adept/*.h
	$(CXX) $(COMPILE_FLAGS) -DADEPT_RECORDING_PAUSABLE -DALL_ACTIVE $(INCLUDES) -c test_arrays.cpp -o test_arrays_active_pausable.o

test_complex_arrays.o: test_arrays.cpp  *.h ../include/*.h ../include/adept/*.h
	$(CXX) $(COMPILE_FLAGS) -DALL_COMPLEX $(INCLUDES) -c test_arrays.cpp -o test_complex_arrays.o

test_fixed_arrays_active.o: test_fixed_arrays.cpp  *.h ../include/*.h ../include/adept/*.h
	$(CXX) $(COMPILE_FLAGS) -DALL_ACTIVE $(INCLUDES) -c test_fixed_arrays.cpp -o test_fixed_arrays_active.o

# Remove all object files and executables
clean:
	rm -f $(OBJECTS) $(GSL_OBJECTS) $(PROGRAMS) test_stderr.txt test_results.txt

mostlyclean: clean

# Null targets to satisfy autotools
EMPTY_AUTOMAKE_TARGETS = distdir install install-data install-exec uninstall \
	install-dvi install-html install-info install-ps install-pdf \
	installdirs installcheck distclean maintainer-clean \
	dvi pdf ps info html tags ctags
.PHONY: $(EMPTY_AUTOMAKE_TARGETS)
$(EMPTY_AUTOMAKE_TARGETS):


================================================
FILE: test/README
================================================
This directory contains examples to demonstrate various features of
Adept. Type "make check" from the directory above to compile
them. 

Note that unlike in the rest of this package, the Makefile in this
directory was not generated by automake; it is well commented and so
may assist in understanding how to build software that uses Adept.


TEST 1: BASIC FEATURES

Executable: test_adept

Source files: test_adept.cpp, algorithm.cpp, algorithm.h

Demonstrates: basic use of Adept, reverse-mode automatic
differentiation, computing the Jacobian matrix, printing diagnostic
information, verifying results by comparing to numerical calculations,
pausing and continuing recordings

Synopsis: This program demonstrates how to differentiate a simple
function (in algorithm.cpp), comparing the results from automatic
differentiation with numerical differentiation. The function used is
the contrived example from the Adept paper.


TEST 2: COMPILING SOURCE FILES TWICE, WITH AND WITHOUT AUTOMATIC
DIFFERENTIATION

Executable: test_adept_with_and_without_ad

Source files: test_adept_with_and_without_ad.cpp, algorithm.cpp,
algorithm.h, algorithm_with_and_without_ad.h

Demonstrates: most of the same features as Test 1, plus compiling a
source file twice

Synopsis: This program is the same as in Test 1, except that
algorithm.cpp is compiled twice, once with automatic differentiation
(producing the object file algorithm.o) and once without (producing
the object file algorithm_noad.o). This is achieved in the Makefile
using the -DADEPT_NO_AUTOMATIC_DIFFERENTIATION flag. This provides two
overloaded versions of the "algorithm" function, one that takes active
"adouble" arguments, and the other that takes inactive "double"
arguments. The two versions are declared in the
algorithm_with_and_without_ad.h header file.


TEST 3: RADIANCE SIMULATION

Executable: test_radiances

Source files: test_radiances.cpp, simulate_radiances.cpp,
simulate_radiances.h

Demonstrates: activation and deactivation of an Adept stack, using
more than one Adept stack in the same program (but not at the same
time), how to interface Adept with software that computes its own
Jacobian

Synopsis: The "main" function is in test_radiances.cpp, and
demonstrates how to interface Adept to an algorithm that does not have
an Adept interface, but which provides its own Jacobian. The algorithm
in this case is in simulate_radiances.cpp; while it does not have an
Adept interface, it does use Adept internally to compute the Jacobian
that it returns. It therefore needs to temporarily deactivate the
calling function's Adept stack (where derivative information is
stored) while using its own.  This example is from the Adept
documentation.


TEST 4: GSL MINIMIZATION INTERFACE

Executable: test_gsl_interface

Command-line arguments: optionally, the executable name can be
followed by an integer (which should be 2 or greater) expressing the
number of dimensions of the minimization problem.  The default is 2.

Source files: test_gsl_interface.cpp, rosenbrock_banana_function.cpp,
state.cpp, state.h

Pre-requisites: the GNU Scientific Library should be installed; on an
RPM-based system you want the "gsl" and "gsl-devel" packages. If this
is not available at the time the configure script is run, this
executable will not be built.

Demonstrates: interface with the multi-dimensional minimization
capability of the GNU Scientific Library, use of Adept to minimize a
real function, an object-oriented way to store Adept data for a
minimization problem

Synopsis: The "main" function is in test_gsl_interface.cpp and is
fairly self-explanatory. The state.cpp and state.h files show how
Adept data can be stored and accessed in an object-oriented way. The
function to be minimized is the N-dimensional Rosenbrock banana
function, given in rosenbrock_banana_function.cpp.


TEST 5: TRIVIAL EXAMPLE IN ADEPT PAPER

Executable: test_misc

Source files: test_misc.cpp, algorithm.cpp, algorithm.h

Demonstrates: basic use of Adept, reverse-mode automatic
differentiation

Synopsis: This program is simply the trivial example in the Adept
paper, using the same algorithm as in Test 1.


TEST 6: CHECKPOINTING

Executable: test_checkpoint

Source files: test_checkpoint.cpp

Demonstrates: checkpointing

Synopsis: Large algorithms, particularly those that involve
time-dependent simulations, can require a lot of memory when used with
an automatic-differentiation tool. Even if enough memory is available,
the speed may be sub-optimal.  This program demonstrates the
checkpointing technique, where a simulation using the "Toon" algorithm
in the Adept paper is first run with 10,000 timesteps, and then in 100
blocks of 100 timesteps (the checkpointed simulation), with the output
stored after each block so that the reverse pass of the automatic
differentiation needs 100 times less memory. The resulting gradients
are output to verify that the two versions produce the same results,
and the timings of the two are presented as well.


TEST 7: THREAD SAFETY

Executable: test_thread_safe

Source files: test_thread_safe.cpp

Demonstrates: use of Adept in multi-threaded applications, thread
safety, comparison of Jacobian matrices computed using the forward and
reverse methods

Synopsis: This program computes the 128x128 Jacobian matrix of an
algorithm 16 times with different inputs.  The Jacobian matrix is
actually computed twice, once with 128 forward passes through the
derivative statements and once with 128 reverse passes through the
derivative statements, and a check is performed to see that the
root-mean-squared difference is within some tolerance.
  The default behaviour (and if the "-parallel" command-line argument
is provided) is to use OpenMP to run the 16 computations in parallel.
In this instance the 128 passes required to compute the Jacobian
matrices will be computed using just a single thread. If the "-serial"
command-line argument is provided then the 16 computations are carried
out in series.  In this instance, the Adept library is able to run the
Jacobian-matrix calculation in parallel (this behaviour is automatic
if the program is compiled with the -fopenmp option).
  If the program is compiled with the ADEPT_STACK_THREAD_UNSAFE
preprocessor variable defined, or on platforms that don't support
thread-local variables (e.g. some Mac platforms), then the program
should abort in the "-parallel" case ONLY.


TEST 8: COMPILING WITHOUT EXTERNAL ADEPT LIBRARY

Executable: test_no_lib

Source files: test_no_lib.cpp algorithm.cpp algorithm.h

Demonstrates: use of adept_source.h to create an executable without
the need for the external Adept library

Synopsis: This is basically the same as test_misc.cpp, but one of the
source files includes adept_source.h (rather than adept.h), which
contains the source code for the Adept library. This means that no
linking to an external Adept library (via -ladept) is required. This
capability makes it easier to distribute a package that can be used on
the widest range of operating systems, particularly those like
Microsoft Windows that cannot natively run the configure shell script.


TEST 9a,b,c,d: ARRAY FUNCTIONALITY

Executables: (a) test_arrays, (b) test_arrays_active, (c)
test_arrays_active_pausable, (d) test_complex_arrays

Source files: test_arrays.cpp

Demonstrates: array functionality for (a) passive arrays, (b) active
arrays, (c) active arrays but with stack recording "paused", (d)
complex arrays.


TEST 10: ARRAY SPEED

Executable: test_array_speed

Source files: test_array_speed.cpp

Demonstrates: speed of arrays versus for loops


TEST 11: RADIANCE SIMULATION WITH ARRAYS

Executable: test_radiances_array

Source files: test_radiances_array.cpp, simulate_radiances.cpp,
simulate_radiances.h

Demonstrates: use of arrays with add/append_derivative_dependence


TEST 12a,b: FIXED-ARRAY FUNCTIONALITY

Executables: (a) test_fixed_arrays, (b) test_fixed_arrays_active

Source file: test_fixed_arrays.cpp

Demonstrates: functionality of fixed arrays, i.e. those whose
dimensions are set at compile time: (a) passive version, and (b)
active version.


TEST 13: ARRAY CONSTRUCTORS

Executable: test_constructors

Source file: test_constructors.cpp

Demonstrates: different ways of constructing, assigning and linking
arrays, and passing them to and from functions.


TEST 14: DERIVATIVES

Executable: test_derivatives

Source file: test_derivatives.cpp

Demonstrates: validity of the automatic differentiation of all
mathematical functions supported by Adept, via finite differencing.


TEST 15: ARRAY DERIVATIVES

Executable: test_array_derivatives

Source file: test_array_derivatives.cpp

Demonstrates: validity of the automatic differentiation of selected
array operations, on both Array types and FixedArray types.


TEST 16: THREAD-SAFE ARRAYS

Executable: test_thread_safe_arrays

Source file: test_thread_safe_arrays.cpp

Demonstrates: two ways to make accessing arrays thread safe: use the
soft_link() member function of Array and SpecialMatrix, OR compile
with ADEPT_STORAGE_THREAD_SAFE (C++11 only).


TEST 17: PACKET OPERATIONS

Executable: test_packet_operations

Source file: test_packet_operations.cpp

Demonstrates: Use of Intel or ARM intrinsics is mathematically
consistent regardless of whether code is compiled with SSE2, NEON,
AVX2 or AVX512.  You will need to recompile with (e.g. for g++)
-msse2, -mavx2 or -mavx512f (or simply -march=native to use the best
instruction set available) and check that the output is the same each
time.


TEST 18: FAST EXPONENTIAL OPERATIONS

Executable: test_fastexp

Source file: test_fastexp.cpp

Demonstrates: Correctness of Adept's fast exponential function.


TEST 19: ACTIVE REDUCE OPERATIONS

Executable: test_reduce_active

Source file: test_reduce_active.cpp

Demonstrates: differentiation of reduction operations (sum, product,
maxval etc).


TEST 20: MINIMIZER

Executable: test_minimizer

Source file: test_minimizer.cpp

Demonstrates: Adept's various minimization algorithms on the
N-dimensional Rosenbrock banana function, where the (optional)
arguments are:
 1. number of dimensions, default 2
 2. minimization algorithm string, default "Levenberg-Marquardt" (also
 available: Levenberg, L-BFGS, Conjugate-Gradient,
 Conjugate-Gradient-FR; additionally, "Newton-Levenberg-Marquardt"
 and "Newton-Levenberg" will use the exact Hessian, rather than an
 approximation)
 3. maximum number of iterations, default 100
 4. gradient-norm to indicate convergence, default 0.1

The cost function value and gradient norm are reported to standard
output each iteration. To standard error is written a table of
numbers, one line per call to the function being minimized.  The first
on each line is the number of the sub-iteration, usually the number of
the call to the line-search algorithm, starting at 0. Then follows the
N values of the state vector, followed by the value of the cost
function. This can be used to plot how each minimizer progresses to
the solution.


TEST 21: INTERPOLATION

Executable: test_interp

Source file: test_interp.cpp

Demonstrates: Adept's interpolation functions interp, interp2d and
interp3d.

================================================
FILE: test/algorithm.cpp
================================================
/* algorithm.cpp - A simple demonstration algorithm used in Tests 1 & 2 

  Copyright (C) 2012-2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.

*/


#include <cmath>

#include "algorithm.h"
using adept::adouble;

// A simple demonstration algorithm used in the Adept paper. Note that
// this algorithm can be compiled with
// -DADEPT_NO_AUTOMATIC_DIFFERENTIATION to create a version that takes
// double arguments and returns a double result.
//
// Given the two inputs x[0] and x[1], it forms
//   s = 2*x[0] + 3*x[1]*x[1]
// and returns (s + 3)*sin(s).
adouble algorithm(const adouble x[2]) {
  // The initial value of y is never used: it is overwritten below
  adouble y = 4.0;
  adouble s = 2.0*x[0] + 3.0*x[1]*x[1];
  // An inactive (plain double) constant mixed into the active
  // expression
  double b=3.0;
  y = s + b;
  // y now becomes (s + b)*sin(s)
  y *= sin(s);
  return y;
}
 

================================================
FILE: test/algorithm.h
================================================
/* algorithm.h - Header file for the simple example algorithm function

  Copyright (C) 2012-2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

#ifndef ALGORITHM_H
#define ALGORITHM_H 1

// This header file defines the interface of the simple demonstration
// function "algorithm".  It is included by both algorithm.cpp, which
// defines the body of the function, and test_adept.cpp, which calls
// algorithm.

#include "adept.h"

// Declare the function
adept::adouble algorithm(const adept::adouble x[2]);

#endif


================================================
FILE: test/algorithm_with_and_without_ad.h
================================================
/* algorithm_with_and_without_ad.h

  Copyright (C) 2012-2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

// This header file defines the interface of the simple demonstration
// function "algorithm", and is included by
// test_adept_with_and_without_ad.cpp. It demonstrates the use of a
// single source file that is compiled twice to produce two overloaded
// versions of a function. The "original" version takes
// double-precision arguments and returns a double-precision answer,
// while the automatic differentiation version takes adouble arguments
// and returns an adouble answer. The two versions are compiled from
// the same source file algorithm.cpp by compiling it twice, with and
// without the compiler option -DADEPT_NO_AUTOMATIC_DIFFERENTIATION.


#ifndef ALGORITHM_WITH_AND_WITHOUT_AD_H
#define ALGORITHM_WITH_AND_WITHOUT_AD_H 1

#include "adept.h"

// Declare the original version of the function
double algorithm(const double x[2]);

#ifndef ADEPT_NO_AUTOMATIC_DIFFERENTIATION
// Declare the automatic-differentiation version of the function
adept::adouble algorithm(const adept::adouble x[2]);
#endif

#endif


================================================
FILE: test/rosenbrock_banana_function.cpp
================================================
/* rosenbrock_banana_function.cpp - N-dimensional Rosenbrock function

  Copyright (C) 2012-2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

// This function is an N-dimensional extension of Rosenbrock's banana
// function; it is actually the "2nd De Jong function" - see the
// Wikipedia entry for Rosenbrock's function.

#include "state.h"

using adept::adouble;
// Return the N-dimensional Rosenbrock function of the nx() state
// variables pointed to by x, i.e. the sum over consecutive pairs
// (x[i], x[i+1]) of (1-x[i])^2 + 100*(x[i+1]-x[i]^2)^2.  If there are
// fewer than two state variables then there are no pairs and the
// result is zero.
adouble State::calc_function_value(const adouble* x) {
  adouble sum = 0.0;
  // The bound is written as i+1 < nx() rather than i < nx()-1 so that
  // an empty state cannot wrap the upper limit around to a huge value
  // should nx() return an unsigned type
  for (unsigned int i = 0; i+1 < nx(); i++) {
    adouble a = x[i+1]-x[i]*x[i];
    sum += (1.0-x[i])*(1.0-x[i]) + 100.0*a*a;
  }
  return sum;
}


================================================
FILE: test/run_tests.sh
================================================
#!/bin/sh

# Run each of the programs named on the command line in turn,
# appending their standard output to a log file, and report whether
# each one passed (zero exit status) or failed. The exit status of
# this script is the number of programs that failed.

LOG=test_results.txt
STDERR=test_stderr.txt

# Start with an empty log file
rm -f "$LOG"
touch "$LOG"

echo
echo "Writing output of test programs to $LOG"
echo

FAILURES=0

for TEST in "$@"
do
    if [ -x "$TEST" ]
    then
	rm -f "$STDERR"
	# Banner in the log identifying the program whose output follows
	{
	    echo
	    echo "########################################################"
	    echo "### $TEST"
	    echo "########################################################"
	    echo
	} >> "$LOG"
	# Print the program name without a trailing newline so that
	# the verdict appears on the same line; "printf" is used
	# because the treatment of "echo -n" is not portable, whether
	# by the shell built-in or by /bin/echo
	printf '%s... ' "$TEST"
	# Standard output is appended to the log; standard error is
	# captured separately so it can be shown if the program fails
	if "./$TEST" >> "$LOG" 2> "$STDERR"
	then
	    echo "PASSED"
	else
	    echo "*** FAILED ***"
	    cat "$STDERR"
	    FAILURES=`expr "$FAILURES" + 1`
	fi
    else
	# A missing program is reported but not counted as a failure
	echo "$TEST does not exist"
    fi
done

echo
if [ "$FAILURES" -gt "0" ]
then
    echo "$FAILURES programs failed in some way - see detailed output in $LOG"
else
    echo "All test programs ran successfully"
fi
echo

exit "$FAILURES"


================================================
FILE: test/simulate_radiances.cpp
================================================
/* simulate_radiances.cpp - provides a function taking inactive arguments that returns also Jacobian matrices

  Copyright (C) 2012-2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

#include "adept.h"
#include "simulate_radiances.h"

using adept::aReal;
using adept::Real;

// Compute one radiance (W sr-1 m-3) for the given wavelength (m) from
// a surface temperature (K) and an n-point temperature profile (K)
// with its matching emissivity profile; the profile points are
// spaced 1000 m apart.  The temperatures are active so that
// derivatives can be taken with respect to them.  This helper has
// internal linkage: the public entry point is simulate_radiances.
static
aReal
simulate_radiance_private(int n,
                          Real wavelength,
                          const Real* emissivity,
                          const aReal& surface_temperature,
                          const aReal* temperature)
{
  static const Real BOLTZMANN_CONSTANT = 1.380648813e-23;
  static const Real SPEED_OF_LIGHT = 299792458.0;

  // Brightness temperature (K), starting from the surface value
  aReal brightness_temperature = surface_temperature;

  // Work upwards through the atmosphere: each layer attenuates the
  // radiation from below and adds its own emission
  for (int layer = 0; layer < n; ++layer) {
    brightness_temperature
      = brightness_temperature*(1.0-emissivity[layer])
      + emissivity[layer]*temperature[layer];
  }

  // Rayleigh-Jeans approximation to convert brightness temperature
  // to radiance
  const Real wavelength4 = wavelength*wavelength*wavelength*wavelength;
  return 2.0*SPEED_OF_LIGHT*BOLTZMANN_CONSTANT*brightness_temperature
    /wavelength4;
}

// Simulate two radiances (W sr-1 m-3) given the surface temperature
// (K) and temperature profile (K), where the profile data are located
// at n points with spacing 1000 m. This function uses inactive
// arguments.
void
simulate_radiances(int n, // Size of temperature array
		   // Input variables:
		   Real surface_temperature, 
		   const Real* temperature,
		   // Output variables:
		   Real radiance[2],
		   // Output Jacobians:
		   Real dradiance_dsurface_temperature[2],
		   Real* dradiance_dtemperature)
{
  // First temporarily deactivate any existing Adept stack used by the
  // calling function
  adept::Stack* caller_stack = adept::active_stack();
  if (caller_stack != 0) {
    caller_stack->deactivate();
  }

  // Within the scope of these curly brackets, another Adept stack
  // will be used
  {
    // Ficticious oxygen channels around 60 GHz: wavelength in m
    static const Real wavelength[2] = {0.006, 0.0061}; 
    // Mass absorption coefficient of oxygen in m2 kg-1
    static const Real mass_abs_coefft[2] = {3.0e-5, 3.0e-3};
    // Layer thickness in m
    static const Real dz = 1000.0;

    // Density of oxygen in kg m-3
    std::vector<Real> density_oxygen(n);
    // Emissivity at a particular microwave wavelength
    std::vector<Real> emissivity(n);

    // Start a new stack
    adept::Stack s;

    // Create local active variables: surface temperature, temperature
    // and radiance
    aReal st = surface_temperature;
    std::vector<aReal> t(n);
    aReal r[2];

    // Initialize the oxygen density and temperature
    for (int i = 0; i < n; i++) {
      Real altitude = i*dz;
      // Oxygen density uses an assumed volume mixing ratio with air
      // of 21%, molecular mass of 16 (compared to 29 for air), a
      // surface air density of 1.2 kg m-3 and an atmospheric scale
      // height of 8000 m
      density_oxygen[i] = 1.2*0.21*(16.0/29.0)*exp(-altitude/8000.0);
      t[i] = temperature[i];
    }

    // Start recording derivative information
    s.new_recording();

    // Loop through the two channels
    for (int ichan = 0; ichan < 2; ichan++) {
      // Compute the emissivity profile
      for (int i = 0; i < n; i++) {
	emissivity[i] = 1.0-exp(-density_oxygen[i]*mass_abs_coefft[ichan]*dz);
      }
      // Simulate the radiance
      r[ichan] = simulate_radiance_private(n, wavelength[ichan], 
					   &emissivity[0], st, &t[0]);
      // Copy the aReal variable to the Real variable
      radiance[ichan] = r[ichan].value();
    }

    // Declare independent (x) and dependent (y) variables for
    // Jacobian matrix
    s.independent(st);
    s.independent(&t[0], n);
    s.dependent(r, 2);
    
    // Compute Jacobian matrix
    std::vector<Real> jacobian((n+1)*2);
    s.jacobian(&jacobian[0]);

    // Copy elements of Jacobian matrix into the calling arrays
    for (int ichan = 0; ichan < 2; ichan++) {
      dradiance_dsurface_temperature[ichan] = jacobian[ichan];
      for (int i = 0; i < n; i++) {
	dradiance_dtemperature[i*2+ichan] = jacobian[2+i*2+ichan];
      }
    }

    // At the following curly bracket, the local Adept stack will be
    // destructed
  }

  // Reactivate the Adept stack of the calling function
  if (caller_stack != 0) {
    caller_stack->activate();
  }
}


================================================
FILE: test/simulate_radiances.h
================================================
/* simulate_radiances.h - a function taking inactive arguments that returns also Jacobian matrices

  Copyright (C) 2012-2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

#ifndef SIMULATE_RADIANCES_H
#define SIMULATE_RADIANCES_H 1

#include <adept.h>

// Simulate two radiances from a surface temperature and an n-point
// temperature profile, returning both the radiances and their
// derivatives with respect to the inputs.  All arguments are
// inactive.  The derivative of radiance ichan with respect to
// temperature[i] is written to dradiance_dtemperature[i*2+ichan], so
// that array needs room for 2*n elements.
void simulate_radiances(int n, // Size of temperature array
			// Input variables:
			adept::Real surface_temperature, 
			const adept::Real* temperature,
			// Output variables:
			adept::Real radiance[2],
			// Output Jacobians:
			adept::Real dradiance_dsurface_temperature[2],
			adept::Real* dradiance_dtemperature);

#endif


================================================
FILE: test/state.cpp
================================================
/* state.cpp - An object-oriented interface to an Adept-based minimizer

  Copyright (C) 2012-2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

// Note that this implementation uses the GNU Scientific Library (GSL)
// to provide the quasi-Newton minimization capability

#include <iostream>
#include <gsl/gsl_multimin.h>

#include "state.h"

// C functions needed by GSL

// Return function value given a vector of state variables x
extern "C" 
double my_function_value(const gsl_vector* x, void* params) {
  State* state = reinterpret_cast<State*>(params);
  return state->calc_function_value(x->data);
}
// Return gradient of function with respect to each state variable x
extern "C"
void my_function_gradient(const gsl_vector* x, void* params, gsl_vector* gradJ) { 
  State* state = reinterpret_cast<State*>(params);
  state->calc_function_value_and_gradient(x->data, gradJ->data);
}
// Return both function and its gradient
extern "C"
void my_function_value_and_gradient(const gsl_vector* x, void* params,
				    double* J, gsl_vector* gradJ) { 
  State* state = reinterpret_cast<State*>(params);
  *J = state->calc_function_value_and_gradient(x->data, gradJ->data);
}

using adept::adouble;

// "State" member function returning the value of the function for
// the nx() inactive state variables pointed to by x.  The values are
// copied into the active state vector and passed to the
// calc_function_value(const adouble*) overload, which is defined in
// rosenbrock_banana_function.cpp.  No gradient is wanted here, so
// the recording of automatic differentiation is "paused" for the
// duration of the calculation and resumed before returning.
double State::calc_function_value(const double* x) {
  stack_.pause_recording();

  // Load the inactive inputs into the active state vector
  unsigned int i = 0;
  while (i < nx()) {
    active_x_[i] = x[i];
    ++i;
  }

  // Evaluate the active version and keep only its numerical value
  const double cost = value(calc_function_value(&active_x_[0]));

  stack_.continue_recording();
  return cost;
}

// Member function for returning both the value of the function and
// its gradient - here Adept is used to compute the gradient
double State::calc_function_value_and_gradient(const double* x, double* dJ_dx) {
  for (unsigned int i = 0; i < nx(); ++i) active_x_[i] = x[i];
  stack_.new_recording();
  adouble J = calc_function_value(&active_x_[0]);
  J.set_gradient(1.0);
  stack_.compute_adjoint();
  adept::get_gradients(&active_x_[0], nx(), dJ_dx);
  return value(J);
}

// Minimize the function with GSL's gradient-based multidimensional
// minimizer, printing the state and cost function to standard output
// as it goes.  Every state variable starts at -5.0.  Returns true if
// the gradient-norm convergence test was passed, false if an
// iteration reported an error or convergence was not reached within
// 1000 iterations.
bool State::minimize() {
  // Minimizer settings
  const double initial_step_size = 0.01;
  const double line_search_tolerance = 1.0e-4;
  const double converged_gradient_norm = 1.0e-3;
  // Use GSL's "vector_bfgs2" minimizer, which is its implementation
  // of the BFGS quasi-Newton algorithm
  const gsl_multimin_fdfminimizer_type* minimizer_type
    = gsl_multimin_fdfminimizer_vector_bfgs2;
  
  // Declare and populate structure containing function pointers; the
  // "params" member carries this object through to the C callbacks
  gsl_multimin_function_fdf my_function;
  my_function.n = nx();
  my_function.f = my_function_value;
  my_function.df = my_function_gradient;
  my_function.fdf = my_function_value_and_gradient;
  my_function.params = reinterpret_cast<void*>(this);
   
  // Set initial state variables using GSL's vector type: use -5.0 for
  // every value
  gsl_vector *x;
  x = gsl_vector_alloc(nx());
  for (unsigned int i = 0; i < nx(); ++i) gsl_vector_set(x, i, -5.0);

  // Configure the minimizer, and call function once
  gsl_multimin_fdfminimizer* minimizer
    = gsl_multimin_fdfminimizer_alloc(minimizer_type, nx());
  gsl_multimin_fdfminimizer_set(minimizer, &my_function, x,
				initial_step_size, line_search_tolerance);

  // Print out the result of the first function call with the initial
  // state (all state variables are printed here)
  std::cout << "Initial state: x = [";
  for (unsigned int i = 0; i < nx(); i++) {
    std::cout << active_x_[i].value() << " ";
  }
  std::cout << "], cost_function = " << minimizer->f << "\n";

  // Begin loop
  size_t iter = 0;
  int status;
  do {
    ++iter;
    // Perform one iteration
    status = gsl_multimin_fdfminimizer_iterate(minimizer);
    
    // Quit loop if iteration failed
    if (status != GSL_SUCCESS) break;
    
    // Test for convergence: status becomes GSL_SUCCESS if converged
    // and GSL_CONTINUE if further iterations are needed
    status = gsl_multimin_test_gradient(minimizer->gradient,
					converged_gradient_norm);
     
    // Print out limited number of state variables from this
    // iteration (at most the first six), and the corresponding cost
    // function.
    // NOTE(review): active_x_ holds the values from the most recent
    // function evaluation, which may not coincide with minimizer->x
    // if the last evaluation was a line-search trial point - confirm
    std::cout << "Iteration " << iter << ": x = [";
    for (unsigned int i = 0; i < nx(); i++) {
      std::cout << active_x_[i].value() << " ";
      if (i >= 5) {
	std::cout << "...";
	break;
      }
    }
    std::cout << "], cost_function = " << minimizer->f << "\n";
  }
  while (status == GSL_CONTINUE && iter < 1000);

  // Free memory
  gsl_multimin_fdfminimizer_free(minimizer);
  gsl_vector_free(x);

  // Return true if successfully minimized function, false otherwise;
  // in the failure case the GSL status is translated to text
  if (status == GSL_SUCCESS) {
    std::cout << "Minimum found after " << iter << " iterations\n";
    return true;
  }
  else {
    std::cout << "Minimizer failed after " << iter << " iterations: "
	      << gsl_strerror(status) << "\n";
    return false;
  }
}

// Copy the current values of the active state variables into x_out,
// which is resized to hold nx() elements; intended to be called
// after minimize() has been run.
void
State::x(std::vector<double>& x_out) const
{
  const unsigned int n_state = nx();
  x_out.resize(n_state);
  std::vector<double>::iterator out = x_out.begin();
  for (unsigned int ix = 0; ix < n_state; ++ix, ++out) {
    *out = active_x_[ix].value();
  }
}


================================================
FILE: test/state.h
================================================
/* state.h - An object-oriented interface to an Adept-based minimizer

  Copyright (C) 2012-2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

#ifndef STATE_H
#define STATE_H 1
#include <vector>
#include "adept.h"
// Holds an Adept stack together with a vector of active state
// variables, and provides member functions to evaluate a function of
// those variables, to compute its gradient, and to minimize it.
class State {
public:
  // Construct a state with n state variables
  State(int n) { active_x_.resize(n); }
  // Minimize the function, returning true if minimization
  // successful, false otherwise
  bool minimize();
  // Get copy of state variables after minimization; x_out is
  // resized to nx() elements
  void x(std::vector<double>& x_out) const;
  // For input state variables x (an array of nx() values), compute
  // the function J(x) and return it
  double calc_function_value(const double* x);
  // For input state variables x, compute function and put its
  // gradient in dJ_dx (an array with room for nx() values); the
  // function value is returned
  double calc_function_value_and_gradient(const double* x, double* dJ_dx);
  // Return the size of the state vector (the value returned by
  // std::vector::size() is converted to unsigned int)
  unsigned int nx() const { return active_x_.size(); }
protected:
  // Active version of the function: the algorithm is contained in
  // the definition of this function (in
  // rosenbrock_banana_function.cpp)
  adept::adouble calc_function_value(const adept::adouble* x);
  // DATA
  // Members are constructed in declaration order, so stack_ exists
  // before any element of active_x_ is created.
  // NOTE(review): presumably this order is required because adouble
  // objects need an existing stack - confirm before reordering
  adept::Stack stack_;                    // Adept stack object
  std::vector<adept::adouble> active_x_;  // Active state variables
};
#endif


================================================
FILE: test/test_adept.cpp
================================================
/* test_adept.cpp - Demonstration of basic features of Adept

  Copyright (C) 2012-2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

#include <iostream>

#include "adept.h"

// Provide function prototype for "algorithm"; see algorithm.cpp for
// the contents of the function
#include "algorithm.h"

// Demonstration program: compute the derivatives of "algorithm" with
// respect to its two inputs in three ways (finite differences,
// reverse-mode adjoint, and Jacobian) and print them side by side.
// The command-line arguments are not used.  Always returns 0.
int
main(int argc, char** argv)
{
  using adept::adouble;
  using adept::Real;

  // Start an Adept stack before the first adouble object is
  // constructed
  adept::Stack s;

  adouble x[2]; // Our independent variables
  adouble y;    // Our dependent variable

  // Set the values of x
  x[0] = 2.0;
  x[1] = 3.0;


  // PART 1: NUMERICAL ADJOINT
  std::cout << "*** Computing numerical adjoint ***\n\n";

  // We will provide an estimate of the adjoints by perturbing the
  // inputs by a small amount

  adouble x_perturbed[2]; // Perturbed independent variables

  // This version of the code uses the same algorithm function that
  // takes adouble arguments for doing the numerical adjoint, even
  // though we are not doing automatic differentiation. To make it
  // faster, we can turn off the recording of derivative information
  // using the pause_recording function.  This only works if all code
  // has been compiled with the -DADEPT_RECORDING_PAUSABLE flag;
  // otherwise it does nothing (so the program will still run
  // correctly, but will be less efficient). Note that another
  // approach if you want to call a function several times, sometimes
  // with automatic differentiation and sometimes without, is
  // demonstrated in test_adept_with_and_without_ad.cpp.
  s.pause_recording();

  // We will compare the Adept result to a numerically computed
  // adjoint, so define the perturbation size
  double dx = 1.0e-5;

  // Run the algorithm
  y = algorithm(x);

  // Now perturb x[0] and x[1] in turn and get a numerical estimate of
  // the gradient (one-sided finite difference)
  x_perturbed[0] = x[0]+dx;
  x_perturbed[1] = x[1];
  double dy_dx0 = adept::value((algorithm(x_perturbed)-y)/dx);
  x_perturbed[0] = x[0];
  x_perturbed[1] = x[1]+dx;
  double dy_dx1 = adept::value((algorithm(x_perturbed)-y)/dx);

  // Turn the recording of derivative information back on
  s.continue_recording();

  // Print information about the data held in the stack
  std::cout << "Stack status after numerical adjoint (if recording was successfully\n"
	    << "paused then the number of operations should be zero):\n" 
	    << s;
  // Print memory information
  std::cout << "Memory usage: " << s.memory() << " bytes\n\n";

  // PART 2: REVERSE-MODE AUTOMATIC DIFFERENTIATION

  // Now we use Adept to do the automatic differentiation
  std::cout << "*** Computing adjoint using automatic differentiation ***\n\n";

  // Start a new recording of derivative statements; note that this
  // must be done after the independent variables x[0] and x[1] are
  // defined and after they have been given their initial values
  s.new_recording();

  // Run the algorithm again
  y = algorithm(x);

  // Print information about the data held in the stack
  std::cout << "Stack status after algorithm run but adjoint not yet computed:\n"
	    << s;
  // Print memory information
  std::cout << "Memory usage: " << s.memory() << " bytes\n\n";

  // If we set the adjoint of the dependent variable to 1 then the
  // resulting adjoints of the independent variables after
  // reverse-mode automatic differentiation will be comparable to the
  // outputs of the numerical differentiation
  y.set_gradient(1.0);

  // Print out some diagnostic information
  std::cout << "List of derivative statements:\n";
  s.print_statements();
  std::cout << "\n";

  std::cout << "Initial list of gradients:\n";
  s.print_gradients();
  std::cout << "\n";

  // Run the adjoint algorithm (reverse-mode differentiation)
  s.reverse();

  // Some more diagnostic information
  std::cout << "Final list of gradients:\n";
  s.print_gradients();
  std::cout << "\n";
  
  // Extract the adjoints of the independent variables
  double x0_ad = 0, x1_ad = 0; 
  x[0].get_gradient(x0_ad);
  x[1].get_gradient(x1_ad);


  // PART 3: JACOBIAN COMPUTATION

  // Here we use the same recording to compute the Jacobian matrix
  std::cout << "*** Computing Jacobian matrix ***\n\n";

  s.independent(x, 2); // Declare independents
  s.dependent(y);      // Declare dependents
  Real jac[2];         // Where the Jacobian will be stored
  s.jacobian(jac);     // Compute Jacobian


  // PART 4: PRINT OUT RESULTS

  // Print information about the data held in the stack
  std::cout << "Stack status after adjoint and Jacobian computed:\n"
	    << s;
  // Print memory information
  std::cout << "Memory usage: " << s.memory() << " bytes\n\n";

  std::cout << "Result of forward algorithm:\n";
  std::cout << "  y = " << y.value() << "\n";
  
  // The three estimates of each derivative are printed together so
  // that they can be compared by eye
  std::cout << "Comparison of gradients:\n";
  std::cout << "  dy_dx0[numerical] = " << dy_dx0 << "\n";
  std::cout << "  dy_dx0[adjoint]   = " << x0_ad  << "\n";
  std::cout << "  dy_dx0[jacobian]  = " << jac[0] << "\n";
  std::cout << "  dy_dx1[numerical] = " << dy_dx1 << "\n";
  std::cout << "  dy_dx1[adjoint]   = " << x1_ad  << "\n";
  std::cout << "  dy_dx1[jacobian]  = " << jac[1] << "\n";

  std::cout << "\nNote that the numerical gradients are less accurate since they use\n"
	    << "a finite difference and are also succeptible to round-off error.\n";

  return 0;

}


================================================
FILE: test/test_adept_with_and_without_ad.cpp
================================================
/* test_adept_with_and_without_ad.cpp

  Copyright (C) 2012-2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

// Demonstration of the use of Adept with code (in this case,
// algorithm.cpp) that has been compiled twice, once with automatic
// differentiation enabled (the default) and once with it disabled
// (using -DADEPT_NO_AUTOMATIC_DIFFERENTIATION) to provide a faster
// version of a function that works with double rather than adouble
// objects.

#include <iostream>

#include "adept.h"

// Provide function prototypes for "algorithm"; see algorithm.cpp for
// the contents of the function
#include "algorithm_with_and_without_ad.h"

// Simple demonstration of automatic differentiation using Adept: the
// derivatives of "algorithm" with respect to its two inputs are
// computed by finite differences (using the overload taking double
// arguments), by reverse-mode adjoint and as a Jacobian (both using
// the overload taking adouble arguments), then printed side by side.
// The command-line arguments are not used.  Always returns 0.
int
main(int argc, char** argv)
{
  using adept::adouble;
  using adept::Real;

  // Start an Adept stack before the first adouble object is
  // constructed
  adept::Stack s;

  adouble x[2]; // Our independent variables
  adouble y;    // Our dependent variable

  // Set the values of x
  x[0] = 2.0;
  x[1] = 3.0;


  // PART 1: NUMERICAL ADJOINT
  std::cout << "*** Computing numerical adjoint ***\n\n";

  // We will compare the Adept result to a numerically computed
  // adjoint, so define the perturbation size
  double dx = 1.0e-5;

  // Initialize an inactive version of x as double rather than adouble
  // variables
  double x_r[2];
  x_r[0] = x[0].value();
  x_r[1] = x[1].value();

  // Run the original version of the algorithm that takes real
  // arguments; this was compiled from algorithm.cpp using the
  // -DADEPT_NO_AUTOMATIC_DIFFERENTIATION flag to produce the
  // algorithm_noad.o object file
  double y_real = algorithm(x_r);

  // Now perturb x[0] and x[1] in turn and get a numerical estimate of
  // the gradient (one-sided finite difference)
  x_r[0] = x[0].value()+dx;
  x_r[1] = x[1].value();
  double dy_dx0 = (algorithm(x_r)-y_real)/dx;
  x_r[0] = x[0].value();
  x_r[1] = x[1].value()+dx;
  double dy_dx1 = (algorithm(x_r)-y_real)/dx;

  // Print information about the data held in the stack
  std::cout << "Stack status after numerical adjoint (number of operations should be zero):\n" 
	    << s << "\n";


  // PART 2: REVERSE-MODE AUTOMATIC DIFFERENTIATION
  std::cout << "*** Computing adjoint using automatic differentiation ***\n\n";

  // Start a new recording of derivative statements (note that this
  // must be done after the independent variables x[0] and x[1] are
  // initialized)
  s.new_recording();

  // Now use Adept to do it - first run the algorithm overloaded for
  // adouble arguments
  y = algorithm(x);

  // Print information about the data held in the stack
  std::cout << "Stack status after algorithm run but adjoint not yet computed:\n"
	    << s << "\n";

  // If we set the adjoint of the dependent variable to 1 then the
  // resulting adjoints of the independent variables after
  // reverse-mode automatic differentiation will be comparable to the
  // outputs of the numerical differentiation
  y.set_gradient(1.0);

  // Print out some diagnostic information
  std::cout << "List of derivative statements:\n";
  s.print_statements();
  std::cout << "\n";

  std::cout << "Initial list of gradients:\n";
  s.print_gradients();
  std::cout << "\n";

  // Run the adjoint algorithm (reverse-mode differentiation)
  s.reverse();

  std::cout << "Final list of gradients:\n";
  s.print_gradients();
  std::cout << "\n";
  
  // Extract the adjoints of the independent variables
  double x0_ad = 0, x1_ad = 0; 
  x[0].get_gradient(x0_ad);
  x[1].get_gradient(x1_ad);


  // PART 3: JACOBIAN COMPUTATION

  // Here we use the same recording to compute the Jacobian matrix
  std::cout << "*** Computing Jacobian matrix ***\n\n";

  s.independent(x, 2); // Declare independents
  s.dependent(y);      // Declare dependents
  Real jac[2];         // Jacobian data must be of type "Real"
  s.jacobian(jac);     // Compute Jacobian


  // PART 4: PRINT OUT RESULT

  // Print information about the data held in the stack
  std::cout << "Stack status after adjoint and Jacobian computed:\n"
	    << s << "\n";

  // Print memory information
  std::cout << "Memory usage: " << s.memory() << " bytes\n\n";

  // The passive and active versions of the algorithm should report
  // the same function value
  std::cout << "Result of forward algorithm:\n";
  std::cout << "  y[from algorithm taking double arguments]  = " << y_real << "\n";
  std::cout << "  y[from algorithm taking adouble arguments] = " << y.value() << "\n\n";
  
  // The three estimates of each derivative are printed together so
  // that they can be compared by eye
  std::cout << "Comparison of gradients:\n";
  std::cout << "  dy_dx0[numerical] = " << dy_dx0 << "\n";
  std::cout << "  dy_dx0[adjoint]   = " << x0_ad  << "\n";
  std::cout << "  dy_dx0[jacobian]  = " << jac[0] << "\n";
  std::cout << "  dy_dx1[numerical] = " << dy_dx1 << "\n";
  std::cout << "  dy_dx1[adjoint]   = " << x1_ad  << "\n";
  std::cout << "  dy_dx1[jacobian]  = " << jac[1] << "\n";

  std::cout << "\nNote that the numerical gradients are less accurate since they use\n"
	    << "a finite difference and are also succeptible to round-off error.\n";

  return 0;

}


================================================
FILE: test/test_array_derivatives.cpp
================================================
/* test_array_derivatives.cpp - Test derivatives of array expressions

    Copyright (C) 2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.

*/

#include <adept_arrays.h>


// Arbitrary algorithm converting array of general type A to scalar of
// type S, which may be active or passive.  The input x is indexed
// with two subscripts, and main() calls this with 2x2 matrices.  The
// result is written to y; nothing is returned.
template <class A, class S>
void algorithm(const A& x, S& y) {
  using namespace adept;
  A tmp;
  // Index vector holding (1, 0), used to select rows of x in that
  // order
  intVector index(2);
  index << 1, 0;
  // One array expression combining element-wise functions (exp,
  // atan2), indexing of x with an integer vector, "spread" and
  // division by the scalar element x(0,0)
  tmp = atan2((exp(x) * x), spread<0>(x(index,1),2)) / x(0,0);
  // Reduce the array to a scalar
  y = sum(tmp);
}


// Test program: differentiate "algorithm" with respect to the four
// elements of a 2x2 input matrix by finite differences, and compare
// the result against Adept's reverse mode (with aArray and
// aFixedArray inputs) and forward mode (with aArray).  Returns 0 if
// every comparison is within MAX_FRAC_ERR, 1 otherwise.  The
// command-line arguments are not used.
int
main(int argc, const char** argv) {
  using namespace adept;

  Stack stack;

  // Matrix dimension
  static const int N = 2;
  static const Real MAX_FRAC_ERR = 1.0e-5;

  // Perturbation size for numerical calculation
  Real dx = 1.0e-6;

  if (sizeof(Real) < 8) {
    // Single precision only works with larger perturbations
    dx = 1.0e-4;
  }

  // Maximum fractional error
  Real max_frac_err;
  bool error_too_large = false;

  // Input data
  Matrix X(N,N);
  X << 2, 3, 5, 7;
  
  // Numerical calculation: one-sided finite difference for each
  // element of X in turn; this is the reference that the Adept
  // results are compared against
  std::cout << "NUMERICAL CALCULATION\n";
  Matrix dJ_dx_num(N,N);
  {
    Real J;
    algorithm(X, J);
    std::cout << "J = " << J << "\n";

    for (int i = 0; i < N; ++i) {
      for (int j = 0; j < N; ++j) {
        Matrix Xpert(N,N);
        Xpert = X;
        Xpert(i,j) += dx;
        Real Jpert;
        algorithm(Xpert, Jpert);
        dJ_dx_num(i,j) = (Jpert - J) / dx;
      }
    }
  }

  std::cout << "dJ_dx_num = " << dJ_dx_num << "\n";

  // Same finite-difference calculation but with the perturbed input
  // held in a fixed-size matrix; the result is only printed
  std::cout << "\nNUMERICAL CALCULATION WITH \"FixedArray\"\n";
  Matrix22 dJ_dx_num_FixedArray;
  {
    Real J;
    algorithm(X, J);
    std::cout << "J = " << J << "\n";

    for (int i = 0; i < N; ++i) {
      for (int j = 0; j < N; ++j) {
        Matrix22 Xpert = X;
        Xpert(i,j) += dx;
        Real Jpert;
        algorithm(Xpert, Jpert);
        dJ_dx_num_FixedArray(i,j) = (Jpert - J) / dx;
      }
    }
  }

  std::cout << "dJ_dx_num_FixedArray = " << dJ_dx_num_FixedArray << "\n";

  // Adept calculation with aArray
  std::cout << "\nADEPT CALCULATION WITH \"aArray\"\n";
  Matrix dJ_dx_adept_Array(N,N);
  {
    aMatrix aX = X;
    stack.new_recording();
    aReal aJ;
    algorithm(aX, aJ);
    std::cout << "J = " << aJ << "\n";
    aJ.set_gradient(1.0);
    stack.reverse();
   
    dJ_dx_adept_Array = aX.get_gradient();
  }

  std::cout << "dJ_dx_adept_Array = " << dJ_dx_adept_Array << "\n";

  // The fractional error is taken relative to the magnitude of the
  // reference derivative: dividing by the signed value would give a
  // negative ratio wherever the derivative is negative, and such an
  // element could then never exceed the threshold
  max_frac_err = maxval(abs(dJ_dx_adept_Array-dJ_dx_num)/abs(dJ_dx_num));
  if (max_frac_err <= MAX_FRAC_ERR) {
    std::cout << "max fractional error = " << max_frac_err
              << ": PASSED\n";
  }
  else {
    std::cout << "max fractional error = "
              << max_frac_err << ": FAILED\n";
    error_too_large = true;
  }
  // Adept calculation with aFixedArray
  std::cout << "\nADEPT CALCULATION WITH \"aFixedArray\"\n";
  Matrix dJ_dx_adept_FixedArray;
  {
    aMatrix22 aX = X;
    stack.new_recording();
    aReal aJ;
    algorithm(aX, aJ);
    std::cout << "J = " << aJ << "\n";
    aJ.set_gradient(1.0);
    stack.reverse();
    dJ_dx_adept_FixedArray = aX.get_gradient();

  }
  std::cout << "dJ_dx_adept_FixedArray = " << dJ_dx_adept_FixedArray << "\n";

  // Fractional error relative to the magnitude of the reference
  max_frac_err = maxval(abs(dJ_dx_adept_FixedArray-dJ_dx_num)/abs(dJ_dx_num));
  if (max_frac_err <= MAX_FRAC_ERR) {
    std::cout << "max fractional error = " << max_frac_err
              << ": PASSED\n";
  }
  else {
    std::cout << "max fractional error = "
              << max_frac_err << ": FAILED\n";
    error_too_large = true;
  }
  

  // Adept forward calculation with aArray: four (NxN) separate
  // calculations are required to compute the derivative with respect
  // to the four inputs.  Each one seeds a single element of the
  // input gradient with 1 and reads the resulting gradient of aJ.
  std::cout << "\nADEPT FORWARD CALCULATION WITH \"aArray\"\n";
  Matrix dJ_dx_adept_forward_Array(N,N);
  {
    aMatrix aX = X;
    stack.new_recording();
    aReal aJ;
    algorithm(aX, aJ);
    std::cout << "J = " << aJ << "\n";

    Matrix X_tl(N,N);

    X_tl=0.0;
    X_tl(0,0) = 1.0;
    aX.set_gradient(X_tl);
    stack.forward();
    dJ_dx_adept_forward_Array(0,0) = aJ.get_gradient();

    stack.clear_gradients();
    X_tl=0.0;
    X_tl(0,1) = 1.0;
    aX.set_gradient(X_tl);
    stack.forward();
    dJ_dx_adept_forward_Array(0,1) = aJ.get_gradient();
    
    stack.clear_gradients();
    X_tl=0.0;
    X_tl(1,0) = 1.0;
    aX.set_gradient(X_tl);
    stack.forward();
    dJ_dx_adept_forward_Array(1,0) = aJ.get_gradient();
    
    stack.clear_gradients();
    X_tl=0.0;
    X_tl(1,1) = 1.0;
    aX.set_gradient(X_tl);
    stack.forward();
    dJ_dx_adept_forward_Array(1,1) = aJ.get_gradient();
    
  }

  std::cout << "dJ_dx_adept_forward_Array = " << dJ_dx_adept_forward_Array << "\n";

  // Fractional error relative to the magnitude of the reference
  max_frac_err = maxval(abs(dJ_dx_adept_forward_Array-dJ_dx_num)/abs(dJ_dx_num));
  if (max_frac_err <= MAX_FRAC_ERR) {
    std::cout << "max fractional error = " << max_frac_err
              << ": PASSED\n";
  }
  else {
    std::cout << "max fractional error = "
              << max_frac_err << ": FAILED\n";
    error_too_large = true;
  }

  
  std::cout << "\n";

  // A non-zero exit status reports that at least one comparison
  // failed
  if (error_too_large) {
    std::cerr << "*** Error: fractional error in the derivatives of some configurations too large\n";

    if (sizeof(Real) < 8) {
      std::cerr << "*** (but you are using less than double precision so it is not surprising)\n";
    }

    return 1;
  }
  else {
    return 0;
  }


}


================================================
FILE: test/test_array_speed.cpp
================================================
#include <iostream>
#define ADEPT_NO_AUTOMATIC_DIFFERENTIATION
#define ADEPT_REAL_TYPE_SIZE 4
#include <adept_arrays.h>
#include "Timer.h"

#define ASSIGN   =
#define WARMUP_OPERATOR + exp
#define OPERATOR + fastexp
//#define SUFFIX_OP + 0.5
#define SUFFIX_OP

using namespace adept;

int main()
{
  Timer timer;
  timer.print_on_exit();
  int n = 128;

  static const int rep = 10000;
  //  static const int rep = 10;

  std::cout << "Packet<Real>::size = " << internal::Packet<Real>::size << "\n";

  Stack stack;

  aMatrix M(n,n), P(n,n), Q(n,n);
  //  Array<2,aReal,false> M(n,n), P(n,n), Q(n,n);
  aReal Mc[n][n], Pc[n][n], Qc[n][n];

  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < n; ++j) {
      P(i,j) = Pc[i][j] = 0.01 * (i-j);
      Q(i,j) = Qc[i][j] = 0.1 * (j+1);
      M(i,j) = Mc[i][j] = 0.0;
    }
  }

  int t_c_style_w = timer.new_activity("C-style for loops (warm-up)");
  int t_c_style = timer.new_activity("C-style for loops");
  int t_adept_w = timer.new_activity("Adept (warm-up)");
  int t_adept = timer.new_activity("Adept");
  int t_adept_container_w = timer.new_activity("Adept container only (warm-up)");
  int t_adept_container = timer.new_activity("Adept container only");
#ifndef ADEPT_NO_AUTOMATIC_DIFFERENTIATION
  int t_jacobian_w = timer.new_activity("Jacobian (warm-up)");
  int t_jacobian = timer.new_activity("Jacobian");
  int t_jacobian_array_w = timer.new_activity("Jacobian array-op (warm-up)");
  int t_jacobian_array = timer.new_activity("Jacobian array-op");
#endif

  stack.new_recording();
  timer.start(t_c_style_w);
  for (int irep = 0; irep < rep; ++irep) {
    for (int i = 0; i < n; ++i) {
      for (int j = 0; j < n; ++j) {
	Mc[i][j] ASSIGN Pc[i][j] WARMUP_OPERATOR (Qc[i][j] SUFFIX_OP);
      }
    }
  }
  timer.stop();

  if (n <= 10) {
    std::cout << "C-style M = \n";
    for (int i = 0; i < n; ++i) {
      for (int j = 0; j < n; ++j) {
	std::cout << " " << Mc[i][j];
      }
      std::cout << "\n";
    }
  }
  
  //  std::cout << stack;

  stack.new_recording();
  timer.start(t_c_style);
  for (int irep = 0; irep < rep; ++irep) {
    for (int i = 0; i < n; ++i) {
      for (int j = 0; j < n; ++j) {
	Mc[i][j] ASSIGN Pc[i][j] OPERATOR (Qc[i][j] SUFFIX_OP);
      }
    }
  }
  timer.stop();
  //  std::cout << stack;

#ifndef ADEPT_NO_AUTOMATIC_DIFFERENTIATION
  stack.independent(&Pc[0][0], n*n);
  stack.dependent(&Mc[0][0], n*n);

  timer.start(t_jacobian_w);
  Real* jac;
  jac = new Real[n*n*n*n];

  stack.jacobian_forward(jac);
  timer.stop();
  timer.start(t_jacobian);
  stack.jacobian_forward(jac);
  timer.stop();
#endif


  //  std::cout << Mc[0][0] << " " << Mc[10][10] << "\n";

  stack.new_recording();
  timer.start(t_adept_w);
  for (int irep = 0; irep < rep; ++irep) {
    //    M ASSIGN noalias(P WARMUP_OPERATOR (Q SUFFIX_OP));
    M ASSIGN P WARMUP_OPERATOR (Q SUFFIX_OP);
  }
  timer.stop();
  //  std::cout << stack;

  if (n <= 10) {
    std::cout << "Array-style M = \n";
    for (int i = 0; i < n; ++i) {
      for (int j = 0; j < n; ++j) {
	std::cout << " " << M(i,j);
      }
      std::cout << "\n";
    }
  }

  std::cout << "Alignment offset = " << (P OPERATOR (Q SUFFIX_OP)).alignment_offset() << "\n";


  stack.new_recording();
  timer.start(t_adept);
  for (int irep = 0; irep < rep; ++irep) {
    //    M += noalias(P OPERATOR (Q SUFFIX_OP));
    M ASSIGN P OPERATOR (Q SUFFIX_OP);
  }
  timer.stop();
  //  std::cout << stack;


#ifndef ADEPT_NO_AUTOMATIC_DIFFERENTIATION

  stack.clear_independents();
  stack.clear_dependents();
  stack.independent(P);
  stack.dependent(Q);
  //  stack.independent(P.data(), n*n);
  //  stack.dependent(M.data(), n*n);

  std::cout << stack;

  timer.start(t_jacobian_array_w);
  stack.jacobian_forward(jac);
  timer.stop();
  timer.start(t_jacobian_array);
  stack.jacobian_forward(jac);
  timer.stop();
#endif

  stack.new_recording();
  timer.start(t_adept_container_w);
  for (int irep = 0; irep < rep; ++irep) {
    for (int i = 0; i < n; ++i) {
      for (int j = 0; j < n; ++j) {
	M(i,j) ASSIGN P(i,j) WARMUP_OPERATOR (Q(i,j) SUFFIX_OP);
      }
    }
  }
  timer.stop();
  //  std::cout << stack;
  //  std::cout << M;

  stack.new_recording();
  timer.start(t_adept_container);
  for (int irep = 0; irep < rep; ++irep) {
    for (int i = 0; i < n; ++i) {
      for (int j = 0; j < n; ++j) {
	M(i,j) ASSIGN P(i,j) OPERATOR (Q(i,j) SUFFIX_OP);
      }
    }
  }
  timer.stop();
  //  std::cout << stack;
}


================================================
FILE: test/test_arrays.cpp
================================================
/* test_arrays.cpp - Test Adept's array functionality

    Copyright (C) 2016-2018 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.

  This program can be compiled to run in three ways: (1) normal
  compilation tests inactive arrays, (2) with -DALL_ACTIVE tests
  active arrays, and (3) "-DALL_ACTIVE -DADEPT_RECORDING_PAUSABLE"
  tests whether a "paused" recording correctly records nothing to the
  automatic-differentiation stack.

*/

#include <iostream>
#include <complex>

#define ADEPT_BOUNDS_CHECKING 1

#include <adept_arrays.h>

//#define TRAP_FLOATING_POINT_EXCEPTIONS 1
#ifdef TRAP_FLOATING_POINT_EXCEPTIONS
#include <fenv.h>
#endif


// The following controls whether to use active variables or not
//#define ALL_ACTIVE 1
//#define MARVEL_STYLE 1
//#define ALL_COMPLEX 1

using namespace adept;


int
main(int argc, const char** argv) {
  using namespace adept;

#ifdef TRAP_FLOATING_POINT_EXCEPTIONS
  feenableexcept(FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW);
#endif

#ifdef ALL_ACTIVE
#define IsActive true
  Stack stack;
#else
#define IsActive false
#endif
  
#define HEADING(MESSAGE)						\
  std::cout << "====================================================================\n" \
	    << "   TESTING " << MESSAGE << "\n"

#define COMMA ,


#define SIMPLE_EVAL(MESSAGE, TYPE, X, INIT, EXPR)			\
  std::cout << "--------------------------------------------------------------------\n" \
	    << "### " << MESSAGE << "\n### " << #EXPR << "\n";	\
  try {								\
    TYPE X;								\
    if (INIT) {								\
      X = test. X;							\
    }									\
    std::cout << "Evaluating " << #EXPR << "\n";			\
    std::cout.flush();						\
    EXPR;								\
    if (should_fail) { std::cout << "*** INCORRECT OUTCOME\n";	\
      anomalous_results++;						\
    }									\
  } catch (const adept::exception& e) {					\
    std::cout << "*** Failed with: " << e.what() << "\n";		\
    if (!should_fail) { std::cout << "*** INCORRECT OUTCOME\n";	\
      anomalous_results++;						\
    }									\
    else {								\
      std::cout << "*** Correct behaviour\n";				\
    }									\
  }

#define EVAL(MESSAGE, TYPE, X, INIT, EXPR)				\
  std::cout << "--------------------------------------------------------------------\n" \
	    << "### " << MESSAGE << "\n### " << #EXPR << "\n";	\
  try {								\
    TYPE X;								\
    if (INIT) {								\
      X = test. X;							\
      std::cout << #TYPE << " " << #X << " = " << X << "\n";		\
    }									\
    else {								\
      std::cout << #TYPE << " " << #X << " = " << X << "\n";		\
    }									\
    std::cout << "Evaluating " << #EXPR << "\n";			\
    std::cout.flush();						\
    EXPR;								\
    std::cout << "Result: " << #X << " = " << X << "\n";		\
    if (should_fail) { std::cout << "*** INCORRECT OUTCOME\n";	\
      anomalous_results++;						\
    }									\
  } catch (const adept::exception& e) {					\
    std::cout << "*** Failed with: " << e.what() << "\n";		\
    if (!should_fail) { std::cout << "*** INCORRECT OUTCOME\n";	\
      anomalous_results++;						\
    }									\
    else {								\
      std::cout << "*** Correct behaviour\n";				\
    }									\
  }

  
#ifdef ALL_ACTIVE
#define EVAL2(MESSAGE, TYPEX, X, INITX, TYPEY, Y, EXPR)			\
  std::cout << "--------------------------------------------------------------------\n" \
	    << "### " << MESSAGE << "\n### " << #EXPR << "\n";	\
  try {									\
    TYPEX X;								\
    if (INITX) {							\
      X = test. X;							\
      std::cout << #TYPEX << " " << #X << " = " << X << "\n";		\
    }									\
    else {								\
      std::cout << #TYPEX << " " << #X << " = " << X << "\n";		\
    }									\
    TYPEY Y; Y = test. Y;						\
    std::cout << #TYPEY << " " << #Y << " = " << Y << "\n";		\
    std::cout << "Evaluating " << #EXPR << "\n";			\
    std::cout.flush();							\
    int nop=stack.n_operations();					\
    EXPR;								\
    std::cout << "Result: " << #X << " = " << X << "\n";		\
    std::cout << "Differential operations: " << stack.n_operations()-nop << "\n";	\
    if (should_fail) { std::cout << "*** INCORRECT OUTCOME\n";	        \
      anomalous_results++;						\
    }									\
  } catch (const adept::exception& e) {					\
    std::cout << "*** Failed with: " << e.what() << "\n";		\
    if (!should_fail) { std::cout << "*** INCORRECT OUTCOME\n";		\
      anomalous_results++;						\
    }									\
    else {								\
      std::cout << "*** Correct behaviour\n";				\
    }									\
  }
#else
#define EVAL2(MESSAGE, TYPEX, X, INITX, TYPEY, Y, EXPR)			\
  std::cout << "--------------------------------------------------------------------\n" \
	    << "### " << MESSAGE << "\n### " << #EXPR << "\n";	\
  try {									\
    TYPEX X;								\
    if (INITX) {							\
      X = test. X;							\
      std::cout << #TYPEX << " " << #X << " = " << X << "\n";		\
    }									\
    else {								\
      std::cout << #TYPEX << " " << #X << " = " << X << "\n";		\
    }									\
    TYPEY Y; Y = test. Y;						\
    std::cout << #TYPEY << " " << #Y << " = " << Y << "\n";		\
    std::cout << "Evaluating " << #EXPR << "\n";			\
    std::cout.flush();							\
    EXPR;								\
    std::cout << "Result: " << #X << " = " << X << "\n";		\
    if (should_fail) { std::cout << "*** INCORRECT OUTCOME\n";	        \
      anomalous_results++;						\
    }									\
  } catch (const adept::exception& e) {					\
    std::cout << "*** Failed with: " << e.what() << "\n";		\
    if (!should_fail) { std::cout << "*** INCORRECT OUTCOME\n";		\
      anomalous_results++;						\
    }									\
    else {								\
      std::cout << "*** Correct behaviour\n";				\
    }									\
  }
#endif

// EVAL3: run EXPR on a target X of type TYPEX together with two
// operands Y and Z, trapping adept::exception.  X is assigned from
// "test" when INITX is true, whereas Y and Z are attached to the
// corresponding members of "test" with link() rather than assigned.
// The macro expects "test", "should_fail" and "anomalous_results" to
// be in scope where it is expanded; an outcome that disagrees with
// "should_fail" increments "anomalous_results".
#define EVAL3(MESSAGE, TYPEX, X, INITX, TYPEY, Y, TYPEZ, Z, EXPR)       \
  std::cout << "--------------------------------------------------------------------\n"; \
  std::cout << "### " << MESSAGE << "\n### " << #EXPR << "\n";          \
  try {                                                                 \
    TYPEX X;                                                            \
    if (INITX) {                                                        \
      X = test. X;                                                      \
    }                                                                   \
    std::cout << #TYPEX << " " << #X << " = " << X << "\n";             \
    TYPEY Y;                                                            \
    Y.link( test. Y );                                                  \
    TYPEZ Z;                                                            \
    Z.link( test. Z );                                                  \
    std::cout << #TYPEY << " " << #Y << " = " << Y << "\n";             \
    std::cout << #TYPEZ << " " << #Z << " = " << Z << "\n";             \
    std::cout << "Evaluating " << #EXPR << "\n" << std::flush;          \
    EXPR;                                                               \
    std::cout << "Result: " << #X << " = " << X << "\n";                \
    if (should_fail) {                                                  \
      std::cout << "*** INCORRECT OUTCOME\n";                           \
      ++anomalous_results;                                              \
    }                                                                   \
  } catch (const adept::exception& e) {                                 \
    std::cout << "*** Failed with: " << e.what() << "\n";               \
    if (should_fail) {                                                  \
      std::cout << "*** Correct behaviour\n";                           \
    }                                                                   \
    else {                                                              \
      std::cout << "*** INCORRECT OUTCOME\n";                           \
      ++anomalous_results;                                              \
    }                                                                   \
  }

// EVAL_NO_TRAP: run EXPR on a single object X of type TYPE without a
// try/catch, so any exception propagates to the caller.  X is assigned
// from the member of the same name in "test" when INIT is true.  The
// macro expects "test", "should_fail" and "anomalous_results" to be in
// scope; reaching the end while "should_fail" is set counts as an
// anomalous result.
#define EVAL_NO_TRAP(MESSAGE, TYPE, X, INIT, EXPR)                      \
  std::cout << "--------------------------------------------------------------------\n"; \
  std::cout << "### " << MESSAGE << "\n### " << #EXPR << "\n";          \
  {                                                                     \
    TYPE X;                                                             \
    if (INIT) {                                                         \
      X = test. X;                                                      \
    }                                                                   \
    std::cout << #TYPE << " " << #X << " = " << X << "\n";              \
    std::cout << "Evaluating " << #EXPR << "\n" << std::flush;          \
    EXPR;                                                               \
    std::cout << "Result: " << #X << " = " << X << "\n";                \
    if (should_fail) {                                                  \
      std::cout << "*** INCORRECT OUTCOME\n";                           \
      ++anomalous_results;                                              \
    }                                                                   \
  }

// EVAL2_NO_TRAP: run EXPR on a target X of type TYPEX together with one
// operand Y of type TYPEY without a try/catch, so any exception
// propagates to the caller.  X (when INITX is true) and Y are assigned
// from the members of the same name in "test".  The macro expects
// "test", "should_fail" and "anomalous_results" to be in scope;
// reaching the end while "should_fail" is set counts as an anomalous
// result.
#define EVAL2_NO_TRAP(MESSAGE, TYPEX, X, INITX, TYPEY, Y, EXPR)         \
  std::cout << "--------------------------------------------------------------------\n"; \
  std::cout << "### " << MESSAGE << "\n###  " << #EXPR << "\n";         \
  {                                                                     \
    TYPEX X;                                                            \
    if (INITX) {                                                        \
      X = test. X;                                                      \
    }                                                                   \
    std::cout << #TYPEX << " " << #X << " = " << X << "\n";             \
    TYPEY Y;                                                            \
    Y = test. Y;                                                        \
    std::cout << #TYPEY << " " << #Y << " = " << Y << "\n";             \
    std::cout << "Evaluating " << #EXPR << "\n" << std::flush;          \
    EXPR;                                                               \
    std::cout << "Result: " << #X << " = " << X << "\n";                \
    if (should_fail) {                                                  \
      std::cout << "*** INCORRECT OUTCOME\n";                           \
      ++anomalous_results;                                              \
    }                                                                   \
  }

// Select the scalar and container types exercised by the tests.  The
// "my*" aliases are chosen by three preprocessor switches:
//   ALL_COMPLEX  - use std::complex<Real> elements
//   ALL_ACTIVE   - use active (differentiable) types
//   MARVEL_STYLE - with ALL_ACTIVE, use inactive containers whose
//                  elements are aReal rather than active containers
// Every branch must define the full set of aliases used by the Test
// struct below, including myArray3D.
#ifndef ALL_COMPLEX

#ifdef ALL_ACTIVE
#ifndef MARVEL_STYLE
  typedef aReal myReal;
  typedef aMatrix myMatrix;
  typedef aVector myVector;
  typedef aSymmMatrix mySymmMatrix;
  //typedef aSquareMatrix mySymmMatrix;
  typedef aDiagMatrix myDiagMatrix;
  typedef aTridiagMatrix myTridiagMatrix;
  typedef aLowerMatrix myLowerMatrix;
  typedef aUpperMatrix myUpperMatrix;
  typedef SpecialMatrix<Real,internal::BandEngine<ROW_MAJOR,2,1>,true> myOddBandMatrix;
  typedef aArray3D myArray3D;
#else
  typedef aReal myReal;
  typedef Array<2,aReal,false> myMatrix;
  typedef Array<1,aReal,false> myVector;
  // Previously missing: without this alias the Test struct (which has
  // a myArray3D member) cannot be compiled in this configuration
  typedef Array<3,aReal,false> myArray3D;
  typedef SpecialMatrix<aReal,internal::SquareEngine<ROW_MAJOR>,false> mySymmMatrix;
  typedef SpecialMatrix<aReal,internal::BandEngine<ROW_MAJOR,0,0>,false> myDiagMatrix;
  typedef SpecialMatrix<aReal,internal::BandEngine<ROW_MAJOR,1,1>,false> myTridiagMatrix;
  typedef SpecialMatrix<aReal,internal::LowerEngine<ROW_MAJOR>, false> myLowerMatrix;
  typedef SpecialMatrix<aReal,internal::UpperEngine<ROW_MAJOR>, false> myUpperMatrix;
  typedef SpecialMatrix<aReal,internal::BandEngine<ROW_MAJOR,2,1>,false> myOddBandMatrix;

#endif
#else

  typedef Real   myReal;
  typedef Matrix myMatrix;
  typedef Vector myVector;
  typedef Array3D myArray3D;

  typedef SymmMatrix mySymmMatrix;
  //typedef SquareMatrix mySymmMatrix;
  typedef DiagMatrix myDiagMatrix;
  typedef TridiagMatrix myTridiagMatrix;
  typedef LowerMatrix myLowerMatrix;
  typedef UpperMatrix myUpperMatrix;
  typedef SpecialMatrix<Real,internal::BandEngine<ROW_MAJOR,2,1>,false> myOddBandMatrix;

  /*    
  typedef SpecialMatrix<Real,SymmEngine<ROW_UPPER_COL_LOWER>,false> mySymmMatrix;
  typedef SpecialMatrix<Real,internal::BandEngine<COL_MAJOR,0,0>,false> myDiagMatrix;
  typedef SpecialMatrix<Real,internal::BandEngine<COL_MAJOR,1,1>,false> myTridiagMatrix;
  typedef SpecialMatrix<Real,internal::BandEngine<COL_MAJOR,2,1>,false> myOddBandMatrix;
  */

#endif


#else
  // Complex-valued configuration; IsActive is taken from the
  // enclosing scope
  typedef std::complex<Real> myReal;
  typedef Array<1,std::complex<Real>,IsActive> myVector;
  typedef Array<2,std::complex<Real>,IsActive> myMatrix;
  typedef Array<3,std::complex<Real>,IsActive> myArray3D;
  typedef SpecialMatrix<std::complex<Real>,internal::SquareEngine<ROW_MAJOR>,IsActive> mySymmMatrix;
  typedef SpecialMatrix<std::complex<Real>,internal::BandEngine<ROW_MAJOR,0,0>,IsActive> myDiagMatrix;
  typedef SpecialMatrix<std::complex<Real>,internal::BandEngine<ROW_MAJOR,1,1>,IsActive> myTridiagMatrix;
  typedef SpecialMatrix<std::complex<Real>,internal::LowerEngine<ROW_MAJOR>, IsActive> myLowerMatrix;
  typedef SpecialMatrix<std::complex<Real>,internal::UpperEngine<ROW_MAJOR>, IsActive> myUpperMatrix;
  typedef SpecialMatrix<std::complex<Real>,internal::BandEngine<ROW_MAJOR,2,1>,IsActive> myOddBandMatrix;

#endif

  // Fixture holding the reference scalars and arrays used by the test
  // macros: each EVAL* macro creates a fresh local object and copies
  // (or links) the member of the same name from an instance of this
  // struct called "test".  The constructor sizes the arrays from the
  // DIM* constants and fills them with fixed values.
  struct Test {

    bool b;                   // Target for all/any reductions
    boolVector B;             // Target for 1-dimension all/any reductions
    int c;                    // Target for count reductions
    myReal x;                 // Scalar operand/target
    myVector v, w, vlong;     // Length DIM1, DIM1 and DIMLONG
    myMatrix M, N;            // DIM2 x DIM1 dense matrices
    myMatrix Mstrided;        // Strided view holding a copy of M
    myMatrix S;               // DIM1 x DIM1 square dense matrix
    mySymmMatrix O, P;        // P is set to 14.0 - O
    myDiagMatrix D, E;        // D assigned from S; E not set here
    myTridiagMatrix T, TT;    // T assigned from S; TT not set here
    myLowerMatrix L, LL;      // L assigned from S; LL not set here
    myUpperMatrix U, UU;      // U assigned from S; UU not set here
    myOddBandMatrix Q, R;     // Q filled per diagonal; R not set here
    intVector index;          // Index list of length DIM2
    myArray3D A;              // DIM2 x DIM1 x DIM2 array

    // MINI_TEST is defined unconditionally here, so the small
    // dimensions are always selected; the element-wise fills in the
    // constructor only set indices valid for the small dimensions.
    // NOTE(review): DIM* and I are not #undef'd within this struct, so
    // they remain defined for the code that follows.
#define MINI_TEST
#ifdef MINI_TEST
#define DIM1 3
#define DIM2 2
#define DIM3 5
#define DIMLONG 12
#else
#define DIM1 12
#define DIM2 10
#define DIM3 15
#define DIMLONG 20
#endif
    Test() {
      // "I" is the imaginary unit for complex tests and 0.0 otherwise,
      // so the "+ k*I" terms below vanish in the real-valued case
#ifdef ALL_COMPLEX
#define I std::complex<Real>(0.0,1.0)
#else
#define I 0.0
#endif
      b = false;
      B.resize(DIM1); B = false;
      c = 0;
      x = -2;
      v.resize(DIM1);
      vlong.resize(DIMLONG); vlong = linspace(1,DIMLONG,DIMLONG);
      w.resize(DIM1);
      M.resize(DIM2,DIM1);
      // Mstrided is linked to a strided selection (step 3 in the first
      // dimension, step 2 in the second) of a larger local matrix.
      // NOTE(review): this relies on link() keeping the underlying
      // data alive after Mtmp goes out of scope - confirm.
      myMatrix Mtmp(DIM2*3,DIM1*2);
      Mstrided.link(Mtmp(stride(0,end,3),stride(0,end,2)));
      N.resize(DIM2,DIM1);
      S.resize(DIM1,DIM1);
      O.resize(DIM1);
      Q.resize(DIM3);
      index.resize(DIM2);
      v(0) = 2.0 + 3.0*I; v(1) = 3; v(2) = 5;
      w(0) = 7.0 + 4.0*I; w(1) = 11; w(2) = 13;
      M(0,0) = 2.0 + 3.0*I; M(0,1) = 3; M(0,2) = 5;
      M(1,0) = 7; M(1,1) = 11; M(1,2) = 13;
      Mstrided = M;
      N(0,0) = 17.0+5.0*I; N(0,1) = 19; N(0,2) = 23;
      N(1,0) = 29; N(1,1) = 31; N(1,2) = 37;
      S(0,0) = 2.0+3.0*I; S(0,1) = 3; S(0,2) = 5;
      S(1,0) = 7.0+4.0*I; S(1,1) = 11; S(1,2) = 13;
      S(2,0) = 17; S(2,1) = 19; S(2,2) = 23;

      // Only the lower triangle of the symmetric matrix is assigned
      O(0,0) = 7.0+3.0*I;
      O(1,0) = 2; O(1,1) = 11;
      O(2,0) = 3; O(2,1) = 5; O(2,2) = 13;

      P = 14.0 - O;

      // Fill each stored diagonal of the band matrix with a constant
      Q.diag_vector(-2) = 1;
      Q.diag_vector(-1) = 2;
      Q.diag_vector(0)  = 3;
      Q.diag_vector(1)  = 4;

      // Initialize the remaining special matrices from the dense
      // square matrix S
      D = S;
      T = S;
      L = S;
      U = S;

      A.resize(DIM2,DIM1,DIM2);
      A << 2.0+3.0*I, 3, 5, 7, 11, 13,
	17, 19, 23, 29, 31,37;

      index << 1, 0;
    }
  };

#ifdef ALL_ACTIVE
#ifndef ADEPT_RECORDING_PAUSABLE
  stack.new_recording();
#else
  stack.pause_recording();
#endif
#endif

  Test test;

  bool should_fail=false;
  int anomalous_results=0;

  std::cout << adept::configuration();

#ifdef ALL_ACTIVE
  std::cout << "Testing ACTIVE arrays\n";
#else
  std::cout << "Testing INACTIVE arrays\n";
#endif
#ifdef ALL_COMPLEX
  std::cout << "Testing COMPLEX arrays\n";
#endif


  HEADING("ARRAY FUNCTIONALITY");
  EVAL("Array \"resize\" member function", myMatrix, M, true, M.resize(1,5));
  
  should_fail=true;
  EVAL("Array \"resize\" with invalid dimensions", myMatrix, M, true, M.resize(1));
  should_fail=false;
  EVAL("Array \"resize\" with \"dimensions\" function", myMatrix, M, true, M.resize(dimensions(4,2)));

  EVAL("Array \"clear\" member function", myMatrix, M, true, M.clear());

#ifdef ADEPT_CXX11_FEATURES
  HEADING("INITIALIZER LISTS (C++11 ONLY)");
  EVAL("Vector assignment to initializer list from empty", myVector, v,
       false, v = {1 COMMA 2});
  EVAL("Vector assignment to initializer list with underfill", myVector, v,
       true, v = {1.0 COMMA 2.0});
  should_fail = true;
  EVAL("Vector assignment to initializer list with overfill (SHOULD FAIL)", myVector, v,
    true, v = {1.0 COMMA 2.0 COMMA 3.0 COMMA 4.0});
  should_fail = false;
  EVAL("Matrix assignment to initializer list from empty", myMatrix, M,
    false, M = { {1 COMMA 2} COMMA {3 COMMA 4} });
  EVAL("Matrix assignment to initializer list with underfill", myMatrix, M,
    true, M = { {1.0 COMMA 2.0} COMMA {3.0 COMMA 4.0} });
  should_fail = true;
  EVAL("Matrix assignment to initializer list with overfill (SHOULD FAIL)", myMatrix, M,
    true, M = { {1.0 COMMA 2.0 COMMA 3.0 COMMA 4.0} });
  should_fail = false;
  EVAL("Initializer list in expression", myVector, v,
    true, v = v + Vector({1.0 COMMA 2.0 COMMA 3.0}));
  EVAL2("Indexed matrix assigned to initializer list", myMatrix, M, true, intVector, index, 
	M(index,index) = {{1 COMMA 2} COMMA {3 COMMA 4}});

#endif


  HEADING("BASIC EXPRESSIONS");
  EVAL2("Vector assignment to vector from empty", myVector, v, false, myVector, w, v = w);
  EVAL2("Vector assignment to expression from empty", myVector, v, false, myVector, w, v = log(w) + 1.0);

  /*
  should_fail=true;
  EVAL("Vector = operator from empty (SHOULD FAIL)", myVector, v, false, v = 1.0);
  EVAL("Vector += operator from empty (SHOULD FAIL)", myVector, v, false, v += 1.0);
  should_fail=false;
  */

  EVAL("Matrix *= operator", myMatrix, M, true, M *= 0.5);
  EVAL2("Matrix = scalar", myMatrix, M, true, myReal, x, M = x);
  EVAL2("Matrix = scalar expression", myMatrix, M, true, myReal, x, M = (10.0*x));
#ifndef ALL_COMPLEX
  HEADING("BASIC FUNCTIONS");
  EVAL2("max", myVector, v, true, myVector, w, v = max(v,w/3.0));
  EVAL2("min", myVector, v, true, myVector, w, v = min(v,w/3.0));
#endif

  HEADING("ARRAY SLICING");
  EVAL2("Array indexing rvalue", myReal, x, true, myMatrix, M, x = M(1,end-1));

  should_fail=true;
  EVAL2("Array indexing rvalue out of range (SHOULD FAIL)", myReal, x, true, myMatrix, M, x = M(1,3));
  should_fail=false;

  EVAL("Array indexing lvalue", myMatrix, M, true, M(1,end-1) *= -1.0);

  EVAL2("contiguous subarray rvalue", myVector, v, false, myMatrix, M, v = M(__,end));
  EVAL("contiguous subarray lvalue", myMatrix, M, true, M(end-1,__) /= 2.0);
  EVAL2("contiguous subarray rvalue and lvalue", myMatrix, M, true, myMatrix, N, M(__,1) = N(__,2));
  EVAL2("contiguous subarray rvalue using range", myVector, v, false, myMatrix, M, v = 2.0 * M(1,range(1,2)));
  EVAL2("contiguous subarray lvalue using range", myMatrix, M, true, myVector, v, M(end-1,range(0,1)) = log(v(range(1,2))));
  EVAL2("contiguous subarray rvalue using subset", myMatrix, M, false, myMatrix, N, M = 2.0 * N.subset(1,1,1,2));
  EVAL("contiguous subarray lvalue using subset", myVector, v, true, v.subset(end-1,end) *= 10.0);
  EVAL2("regular subarray rvalue", myVector, v, false, myVector, w, v = w(stride(end,0,-1)));
  EVAL2("regular subarray lvalue", myMatrix, M, true, myVector, w, M(0,stride(0,end,2)) *= w(stride(end,0,-2)));
#ifndef ALL_COMPLEX
  EVAL2("irregular subarray rvalue", myMatrix, M, false, myMatrix, N, M = N(stride(1,0,-1),find(N(0,__)>18)));
  EVAL("irregular subarray lvalue", myMatrix, M, true, M(stride(1,0,-1),find(M(0,__)>4)) = 0);
#endif
  EVAL("slice leading dimension", myMatrix, M, true, M[end] = 0);
  EVAL("slice two dimensions", myMatrix, M, true, M[end][0] = 0);
  EVAL2("diag_vector member function as rvalue", myVector, v, false, myMatrix, S, v = diag_vector(S,1));
  EVAL2("diag_vector member function as lvalue", myMatrix, S, true, myVector, v, S.diag_vector() += v);
  EVAL2("diag_matrix member function", myMatrix, S, false, myVector, v, S = v.diag_matrix());
  EVAL2("diag_matrix external function", myMatrix, S, false, myVector, v, S = diag_matrix(v));
  EVAL2("transpose as rvalue via T member function", myMatrix, N, false, myMatrix, M, N = 2.0 * M.T());
  EVAL2("transpose as rvalue via permute member function", myMatrix, N, false, myMatrix, M, N = 2.0 * M.permute(1,0));
  EVAL3("matrix indexing (scalar,non-contiguous)", myVector, v, false, myMatrix, N, intVector, index, v = N(1,index)); 
  EVAL3("matrix indexing (non-contiguous,scalar)", myVector, v, false, myMatrix, N, intVector, index, v = N(index,1)); 
  EVAL3("2D arbitrary index as rvalue", myMatrix, M, false, myMatrix, N, intVector, index, M = const_cast<const myMatrix&>(N)(index,index));
  EVAL3("2D arbitrary index as lvalue assigned to scalar expression", myMatrix, M, true, myMatrix, N, intVector, index, M(index,index) = 2.0*(myReal)(4.0));
  EVAL3("2D arbitrary index as lvalue", myMatrix, M, true, myMatrix, N, intVector, index, M(index,index) = N(__,range(1,2)));
  EVAL2("2D arbitrary index as lvalue with assign-multiply operator", myMatrix, M, true, intVector, index, M(index,index) *= 10.0);
  EVAL2("2D arbitrary index as lvalue with aliased right-hand-side", myMatrix, M, true, intVector, index, M(index,index) = M(__,range(0,1)));
  EVAL2("2D arbitrary index as lvalue with aliased right-hand-side and eval function", myMatrix, M, true, intVector, index, M(index,index) = eval(M(__,range(0,1))));
  EVAL2("reshape member function", myMatrix, M, false, myVector, vlong, M >>= vlong.reshape(3,4));
  should_fail=true;
  EVAL2("reshape member function with invalid dimensions", myMatrix, M, false, myVector, vlong, M >>= vlong.reshape(5,5));
  should_fail=false;
  EVAL("end/2 indexing", myVector, vlong, true, vlong(range(end/2,end)) = 0.0);
  EVAL("end/2 indexing", myVector, vlong, true, vlong(range(0,end/2)) = 0.0);
  EVAL("end/2 indexing", myVector, vlong, true, vlong.subset(end/2,end) = 0.0);

  HEADING("REDUCTION OPERATIONS"); 
  EVAL2("full sum", myReal, x, true, myMatrix, M, x = sum(M));
  EVAL2("full mean", myReal, x, true, myMatrix, M, x = mean(M));
  EVAL2("full product", myReal, x, true, myMatrix, M, x = product(M));
  EVAL2("full norm2", myReal, x, true, myMatrix, M, x = norm2(M));
#ifndef ALL_COMPLEX
  EVAL2("full maxval", myReal, x, true, myMatrix, M, x = maxval(M));
  EVAL2("full minval", myReal, x, true, myMatrix, M, x = minval(-M));
#endif
  
  EVAL2("1-dimension sum", myVector, v, true, myMatrix, M, v += sum(M,0));
  EVAL2("1-dimension mean", myVector, v, false, myMatrix, M, v = mean(M*M,1));
  EVAL2("1-dimension product", myVector, v, false, myMatrix, M, v = product(M,1));
  EVAL2("1-dimension norm2", myVector, v, false, myMatrix, M, v = norm2(M,1));
  //  EVAL2("1-dimension sum", myMatrix, M, false, myArray3D, A, M = sum(A,2));
#ifndef ALL_COMPLEX
  EVAL2("1-dimension maxval", myVector, v, false, myMatrix, M, v = maxval(M,1));
  EVAL2("1-dimension minval", myVector, v, false, myMatrix, M, v = minval(M,1));

  EVAL2("dot product", myReal, x, true, myVector, w, x = dot_product(w,w(stride(end,0,-1))));
  EVAL2("dot product on expressions", myReal, x, true, myVector, w, x = dot_product(2.0*w,w(stride(end,0,-1))+1.0));
  EVAL2("1D interpolation", myVector, v, true, myVector, w, v = interp(value(v), w, Vector(value(w)/2.0)));
  EVAL2("1D clamped interpolation", myVector, v, true, myVector, w, v = interp(value(v), w, value(w)/2.0, ADEPT_EXTRAPOLATE_CLAMP));
#ifndef ALL_ACTIVE
  EVAL2("1D interpolation of matrix", myMatrix, M, true, myVector, v, M = interp(v(range(0,1)), M, v(range(1,2))/2.0));
  EVAL2("1D clamped interpolation of matrix", myMatrix, M, true, myVector, v, M = interp(v(range(0,1)), M, v(range(1,2))/2.0, ADEPT_EXTRAPOLATE_CLAMP));
#endif
  EVAL2("all reduction", bool, b, true, myMatrix, M, b = all(M > 8.0));
  EVAL2("any reduction", bool, b, true, myMatrix, M, b = any(M > 8.0));
  EVAL2("count reduction", int, c, true, myMatrix, M, c = count(M > 8.0));
  EVAL2("1-dimension all reduction", boolVector, B, false, myMatrix, M, B = all(M > 8.0, 1));
  EVAL2("1-dimension any reduction", boolVector, B, false, myMatrix, M, B = any(M > 8.0, 1));
  EVAL2("1-dimension count reduction", intVector, index, false, myMatrix, M, index = count(M > 8.0, 1));
  HEADING("CONDITIONAL OPERATIONS");
  EVAL2("where construct, scalar right-hand-side", myMatrix, M, true, myMatrix, N, M.where(N > 20) = 0);
  EVAL2("where construct, expression right-hand-side", myMatrix, M, true, myMatrix, N, M.where(N > 20) = -N);
  EVAL2("where construct, scalar either-or right-hand-side", myMatrix, M, true, myMatrix, N, M.where(N > 20) = either_or(0,1));
  EVAL2("where construct, expression either-or right-hand-side", myMatrix, M, true, myMatrix, N, M.where(N > 20) = either_or(-N,N));
  EVAL_NO_TRAP("find construct, scalar right-hand-side", myVector, v, true, v(find(v > 3.5)) = 0);
  EVAL("find construct, expression right-hand-side", myVector, v, true, v(find(v > 3.5)) = -v(range(end,end)));
  EVAL("find construct, multiply-assign right-hand-side", myVector, v, true, v(find(v != 5.0)) *= 10.0);
#endif
  HEADING("SPECIAL SQUARE MATRICES");
  EVAL("SymmMatrix \"resize\" member function", mySymmMatrix, O, true, O.resize(5));

  should_fail = true;
  EVAL("SymmMatrix \"resize\" with invalid dimensions", mySymmMatrix, O, true, O.resize(4,5));
  should_fail = false;

  EVAL("SymmMatrix \"clear\" member function", mySymmMatrix, O, true, O.clear());
  EVAL2("SymmMatrix assign from dense matrix", mySymmMatrix, O, false, myMatrix, S, O = S);
  EVAL2("DiagMatrix assign from dense matrix", myDiagMatrix, D, false, myMatrix, S, D = S);
  EVAL2("TridiagMatrix assign from dense matrix", myTridiagMatrix, T, false, myMatrix, S, T = S);
  EVAL2("LowerMatrix assign from dense matrix", myLowerMatrix, L, false, myMatrix, S, L = S);
  EVAL2("UpperMatrix assign from dense matrix", myUpperMatrix, U, false, myMatrix, S, U = S);
  EVAL("SymmMatrix += operator", mySymmMatrix, O, true, O += 3.0);
  EVAL("DiagMatrix += operator", myDiagMatrix, D, true, D += 3.0);
  EVAL("TridiagMatrix += operator", myTridiagMatrix, T, true, T += 3.0);
  EVAL("LowerMatrix += operator", myLowerMatrix, L, true, L += 3.0);
  EVAL("UpperMatrix += operator", myUpperMatrix, U, true, U += 3.0);
  EVAL2("SymmMatrix as rvalue", myMatrix, M, false, mySymmMatrix, O, M = O);
  EVAL2("DiagMatrix as rvalue", myMatrix, M, false, myDiagMatrix, D, M = D);
  EVAL2("TridiagMatrix as rvalue", myMatrix, M, false, myTridiagMatrix, T, M = T);
  EVAL2("LowerMatrix as rvalue", myMatrix, M, false, myLowerMatrix, L, M = L);
  EVAL2("UpperMatrix as rvalue", myMatrix, M, false, myUpperMatrix, U, M = U);
  EVAL("SymmMatrix assign from scalar expression", mySymmMatrix, O, true, O = 2.0*(myReal)(4.0));
  EVAL("UpperMatrix assign from scalar expression", myUpperMatrix, U, true, U = 2.0*(myReal)(4.0));


  EVAL("SymmMatrix diag_vector member function as lvalue (upper)", mySymmMatrix, O, true, O.diag_vector(1) = 0);
  EVAL("SymmMatrix diag_vector member function as lvalue (lower)", mySymmMatrix, O, true, O.diag_vector(-2) += 10.0);
  EVAL("DiagMatrix diag_vector member function as lvalue", myDiagMatrix, D, true, D.diag_vector() = 0.0);

  should_fail = true;
  EVAL("DiagMatrix diag_vector member function incorrectly using offdiagonal", myDiagMatrix, D, true, D.diag_vector(1) = 0.0);
  should_fail = false;

  EVAL("TridiagMatrix diag_vector member function as lvalue (upper)", myTridiagMatrix, T, true, T.diag_vector(1) += 10.0);
  EVAL("TridiagMatrix diag_vector member function as lvalue (lower)", myTridiagMatrix, T, true, T.diag_vector(-1) = 0.0);
  EVAL("LowerMatrix diag_vector member function as lvalue (lower)", myLowerMatrix, L, true, L.diag_vector(-1) = 0.0);

  should_fail = true;
  EVAL("LowerMatrix diag_vector member function as lvalue (upper)", myLowerMatrix, L, true, L.diag_vector(1) = 0.0);
  EVAL("UpperMatrix diag_vector member function as lvalue (lower)", myUpperMatrix, U, true, U.diag_vector(-1) = 0.0);
  should_fail = false;

  EVAL("UpperMatrix diag_vector member function as lvalue (upper)", myUpperMatrix, U, true, U.diag_vector(1) = 0.0);
  EVAL("Odd band matrix \"diag_vector\" member function", myOddBandMatrix, Q, true, Q.diag_vector(1) = -1.0);
  EVAL("Odd band matrix \"diag_vector\" member function", myOddBandMatrix, Q, true, Q.diag_vector(0) = -1.0);
  EVAL("Odd band matrix \"diag_vector\" member function", myOddBandMatrix, Q, true, Q.diag_vector(-1) = -1.0);
  EVAL("Odd band matrix \"diag_vector\" member function", myOddBandMatrix, Q, true, Q.diag_vector(-2) = -1.0);

  EVAL2("Array submatrix_on_diagonal member function", myMatrix, M, false, myMatrix, S, M = S.submatrix_on_diagonal(1,2));
  EVAL("Array submatrix_on_diagonal member function as lvalue", myMatrix, S, true, S.submatrix_on_diagonal(0,1) = 0.0);

  should_fail = true;
  EVAL2("Array submatrix_on_diagonal member function to non-square matrix", myMatrix, M, false, myMatrix, N, M = N.submatrix_on_diagonal(1,2));
  should_fail = false;

  EVAL2("SymmMatrix submatrix_on_diagonal member function", mySymmMatrix, P, false, mySymmMatrix, O, P = O.submatrix_on_diagonal(1,2));
  EVAL2("DiagMatrix submatrix_on_diagonal member function", myDiagMatrix, E, false, myDiagMatrix, D, E = D.submatrix_on_diagonal(1,2));
  EVAL2("TridiagMatrix submatrix_on_diagonal member function", myTridiagMatrix, TT, false, myTridiagMatrix, T, TT = T.submatrix_on_diagonal(1,2));
  EVAL2("LowerMatrix submatrix_on_diagonal member function", myLowerMatrix, LL, false, myLowerMatrix, L, LL = L.submatrix_on_diagonal(1,2));
  EVAL2("UpperMatrix submatrix_on_diagonal member function", myUpperMatrix, UU, false, myUpperMatrix, U, UU = U.submatrix_on_diagonal(1,2));
  EVAL2("Odd band matrix submatrix_on_diagonal member function", myOddBandMatrix, R, false, myOddBandMatrix, Q, R = Q.submatrix_on_diagonal(1,3));
  EVAL("Odd band matrix submatrix_on_diagonal as lvalue", myOddBandMatrix, Q, true, Q.submatrix_on_diagonal(1,3) = -1);
  EVAL2("SymmMatrix transpose as rvalue via T member function", mySymmMatrix, P, false, mySymmMatrix, O, P = O.T());
  EVAL2("DiagMatrix transpose as rvalue via T member function", myDiagMatrix, E, false, myDiagMatrix, D, E = D.T());
  EVAL2("TridiagMatrix transpose as rvalue via T member function", myTridiagMatrix, TT, false, myTridiagMatrix, T, TT = T.T());
  EVAL2("LowerMatrix transpose as rvalue via T member function", myUpperMatrix, U, false, myLowerMatrix, L, U = L.T());
  EVAL2("UpperMatrix transpose as rvalue via T member function", myLowerMatrix, L, false, myUpperMatrix, U, L = U.T());

  HEADING("EXPANSION OPERATIONS");
  EVAL2("Outer product", myMatrix, M, false, myVector, v, M = outer_product(v,v));
  EVAL2("Outer product on indexed array", myMatrix, M, false, myVector, v, M = outer_product(v,v(stride(end,0,-1))));
  EVAL2("Outer product on expressions", myMatrix, M, false, myVector, v, M = outer_product(2.0*v,v-1.0));
  EVAL2("Vector spread of dimension 0", myMatrix, M, false, myVector, v, M = spread<0>(v,2));
  EVAL2("Vector spread of dimension 1", myMatrix, M, false, myVector, v, M = spread<1>(v,2));
  EVAL2("Vector spread with expression argument", myMatrix, M, false, myVector, v, M = spread<1>(v*2.0,2));
  EVAL2("Matrix spread of dimension 0", myArray3D, A, false, myMatrix, M, A = spread<0>(M,2));
  EVAL2("Matrix spread of dimension 1", myArray3D, A, false, myMatrix, M, A = spread<1>(M,2));
  EVAL2("Matrix spread of dimension 2", myArray3D, A, false, myMatrix, M, A = spread<2>(M,2));

#ifndef ALL_COMPLEX

#ifndef MARVEL_STYLE
  if (adept::have_matrix_multiplication()) {
    HEADING("MATRIX MULTIPLICATION");
    EVAL3("Matrix-Vector multiplication", myVector, w, false, myMatrix, M, myVector, v, w = M ** v);
    EVAL3("Matrix-Vector multiplication with strided matrix", myVector, w, false, myMatrix, Mstrided, myVector, v, w = Mstrided ** v);
    EVAL2("Matrix-Matrix multiplication", myMatrix, M, false, myMatrix, N, M = N.T() ** N);
    EVAL2("Matrix-Matrix multiplication with matmul", myMatrix, M, false, myMatrix, N, M = matmul(N.T(), N));
    
    should_fail = true;
    EVAL2("Matrix-Matrix multiplication with inner dimension mismatch", myMatrix, M, false, myMatrix, N, M = N ** N);
    should_fail = false;
    
    // TESTING!
    EVAL2("Matrix-Matrix-Vector multiplication", myVector, v, true, myMatrix, S, v = S ** S ** v);
    
    EVAL2("Matrix-Matrix-Vector multiplication", myVector, v, false, myMatrix, S, v = S ** log(S) ** S(0,__));
    EVAL2("Vector-Matrix multiplication", myVector, v, true, myMatrix, S, v = v ** S);
    EVAL2("Vector-Matrix multiplication with matmul", myVector, v, true, myMatrix, S, v = matmul(v, S));
    EVAL2("SymmMatrix-Vector multiplication", myVector, v, true, mySymmMatrix, O, v = O ** v);
    EVAL2("SymmMatrix-Matrix multiplication", myMatrix, S, true, mySymmMatrix, O, S = O ** S);
    EVAL2("Vector-SymmMatrix multiplication", myVector, v, true, mySymmMatrix, O, v = v ** O);
    EVAL2("Matrix-SymmMatrix multiplication", myMatrix, M, true, mySymmMatrix, O, M = M ** O);
    EVAL2("DiagMatrix-Vector multiplication", myVector, v, true, myDiagMatrix, D, v = D ** v);
    EVAL2("TridiagMatrix-Vector multiplication", myVector, v, true, myTridiagMatrix, T, v = T ** v);
    EVAL2("TridiagMatrix-Matrix multiplication", myMatrix, S, true, myTridiagMatrix, T, S = T ** S);
    
    EVAL2("LowerMatrix-Matrix multiplication", myMatrix, S, true, myLowerMatrix, L, S = L ** S);
    
    EVAL2("Vector-TridiagMatrix multiplication", myVector, v, true, myTridiagMatrix, T, v = v ** T);
    EVAL2("Matrix-TridiagMatrix multiplication", myMatrix, M, true, myTridiagMatrix, T, M = M ** T);
  }
  else {
    std::cout << "NO MATRIX MULTIPLICATION TESTS PERFORMED BECAUSE ADEPT COMPILED WITHOUT LAPACK\n";
  }
    
#ifndef ALL_ACTIVE
  if (adept::have_linear_algebra()) {
    HEADING("LINEAR ALGEBRA");
    EVAL2("Solving general linear equations Ax=b", myVector, v, true, myMatrix, S, v = solve(S,v));
    EVAL2("Solving general linear equations Ax=b with expression arguments", myVector, v, true, myMatrix, S, v = solve(S,2*v));
    
    EVAL2("Solving general linear equations AX=B", myMatrix, M, true, myMatrix, S, M.T() = solve(S,M.T()));
    EVAL2("Solving general linear equations AX=B with expression arguments", myMatrix, M, true, myMatrix, S, M.T() = solve(2.0 * S,2.0 * M.T()));
    EVAL2("Solving linear equations Ax=b with symmetric A", myVector, v, true, mySymmMatrix, O, v = solve(O,v));
    EVAL2("Solving linear equations AX=B with symmetric A", myMatrix, M, true, mySymmMatrix, O, M.T() = solve(O,M.T()));
    EVAL3("Solving linear equations AX=B with symmetric A and B", myMatrix, S, false, mySymmMatrix, O, mySymmMatrix, P, S = solve(O,P));
    EVAL2("Solving linear equations Ax=b with upper-triangular A", myVector, v, true, myUpperMatrix, U, v = solve(U,v));
    EVAL2("Invert general matrix", myMatrix, M, false, myMatrix, S, M = inv(S));
    EVAL2("Invert symmetric matrix", mySymmMatrix, P, false, mySymmMatrix, O, P = inv(O));
  }
  else {
    std::cout << "NO LINEAR ALGEBRA TESTS PERFORMED BECAUSE ADEPT COMPILED WITHOUT LAPACK\n";
  }
#else
  std::cout << "NO LINEAR ALGEBRA TESTS PERFORMED BECAUSE ACTIVE ARRAYS NOT YET SUPPORTED\n";
#endif
#else
  std::cout << "NO MATRIX TESTS PERFORMED BECAUSE USING MARVEL-STYLE ACTIVE ARRAYS\n";
#endif

#endif

  HEADING("FILLING ARRAYS");
  EVAL("Fill vector with \"<<\"", myVector, v, true, (v << 0.1, 0.2));

  should_fail = true;
  EVAL("Overfill vector with \"<<\"", myVector, v, true, (v << 0.1, 0.2, 0.3, 0.4));
  should_fail = false;

  EVAL("Underfill matrix with \"<<\"", myMatrix, M, true, (M << 0.1, 0.2, 0.3, 0.4, 0.5));
  EVAL("Fill matrix with \"<<\"", myMatrix, M, true, (M << 0.1, 0.2, 0.3, 0.4, 0.5, 0.6));

  should_fail = true;
  EVAL("Overfill matrix with \"<<\"", myMatrix, M, true, (M << 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0));
  should_fail = false;

  EVAL("Fill vector with vectors using \"<<\"", myVector, v, true, v << v(range(1,2)) << 0.1);
  EVAL2("Fill matrix with vector using \"<<\"", myMatrix, M, true, myVector, v, M << 0.1 << 0.2 << 0.3 << v);
  EVAL2("Fill matrix with vector using \"<<\"", myMatrix, S, true, myVector, v, S << v << v << v);
  EVAL("Assign array using range", myVector, v, false, v = range(3,6));

  HEADING("PRINTING WITH PLAIN STYLE");
  adept::set_array_print_style(PRINT_STYLE_PLAIN);
  SIMPLE_EVAL("Printing empty vector", myVector, v, false, std::cout << v << '\n');
  SIMPLE_EVAL("Printing vector", myVector, v, true, std::cout << v << '\n');
  SIMPLE_EVAL("Printing matrix", myMatrix, M, true, std::cout << M << '\n');
  SIMPLE_EVAL("Printing 3D array", myArray3D, A, true, std::cout << A << '\n');

  HEADING("PRINTING WITH CSV STYLE");
  adept::set_array_print_style(PRINT_STYLE_CSV);
  SIMPLE_EVAL("Printing empty vector", myVector, v, false, std::cout << v << '\n');
  SIMPLE_EVAL("Printing vector", myVector, v, true, std::cout << v << '\n');
  SIMPLE_EVAL("Printing matrix", myMatrix, M, true, std::cout << M << '\n');
  SIMPLE_EVAL("Printing 3D array", myArray3D, A, true, std::cout << A << '\n');

  HEADING("PRINTING WITH CURLY STYLE");
  adept::set_array_print_style(PRINT_STYLE_CURLY);
  SIMPLE_EVAL("Printing empty vector", myVector, v, false, std::cout << v << '\n');
  SIMPLE_EVAL("Printing vector", myVector, v, true, std::cout << v << '\n');
  SIMPLE_EVAL("Printing matrix", myMatrix, M, true, std::cout << M << '\n');
  SIMPLE_EVAL("Printing 3D array", myArray3D, A, true, std::cout << A << '\n');

  HEADING("PRINTING WITH MATLAB STYLE");
  adept::set_array_print_style(PRINT_STYLE_MATLAB);
  SIMPLE_EVAL("Printing empty vector", myVector, v, false, std::cout << v << '\n');
  SIMPLE_EVAL("Printing vector", myVector, v, true, std::cout << v << '\n');
  SIMPLE_EVAL("Printing matrix", myMatrix, M, true, std::cout << M << '\n');
  SIMPLE_EVAL("Printing 3D array", myArray3D, A, true, std::cout << A << '\n');
  adept::set_array_print_style(PRINT_STYLE_CURLY);

  HEADING("EXPRESSION PRINTING");
  EVAL("Send expression to standard output", myMatrix, M, true,
       std::cout << M(0,__) + M(1,__) << '\n');
  EVAL("Send scalar expression to standard output", myVector, v, true,
       std::cout << v(0) + v(1) << '\n');

#ifdef ADEPT_BOUNDS_CHECKING
  HEADING("BOUNDS CHECKING");
  should_fail = true;
  EVAL("Access vector out of bounds", myVector, v, true, v(0) = v(4));
  EVAL("Access vector out of bounds", myVector, v, true, v(0) = v(end-4));
  EVAL("Access matrix out of bounds", myMatrix, M, true, M(0,0) = M(0,-1));
  EVAL("Access matrix out of bounds", myMatrix, M, true, M(0,0) = M(end+1,1));
  should_fail = false;
#endif

  std::cout << "====================================================================\n";
#ifdef ALL_ACTIVE
  std::cout << stack;
  std::cout << "====================================================================\n";
#endif

  if (anomalous_results > 0) {
    std::cout << "*** In terms of run-time errors, there were " << anomalous_results << " incorrect results\n";
  }
  else {
    std::cout << "In terms of run-time errors, all tests were passed\n";
  }

#ifdef ALL_ACTIVE
#ifdef ADEPT_RECORDING_PAUSABLE
  if (stack.n_statements() > 1) {
    std::cout << "*** Stack contains " << stack.n_statements()-1
	      << " statements and " << stack.n_operations()
	      << " operations but both should be 0 because recording has been paused\n";
    return 1;
  }
#endif
#endif
  if (anomalous_results > 0) {
    return 1;
  }
  else {
    return 0;
  }
}


================================================
FILE: test/test_checkpoint.cpp
================================================
/* test_checkpoint.cpp - Test manual checkpointing of a simulation

  Copyright (C) 2012-2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

#include <iostream>
#include <cmath>
//#include <fenv.h>

#include "adept.h"
// This header file is in the same directory as adept.h in the Adept
// package
#include "Timer.h"

using adept::adouble;

// Number of points in spatial grid of simulation
#define NX 100

// "Toon" advection scheme applied to linear advection in a 1D
// periodic domain - see Adept paper for details
static
void
toon(int nt, double c, const adouble q_init[NX], adouble q[NX]) {
  // One flux value per interface between neighbouring boxes
  adouble flux[NX-1];

  // Start the integration from a copy of the initial field
  for (int ix = 0; ix < NX; ++ix) {
    q[ix] = q_init[ix];
  }

  // March forward nt timesteps
  for (int it = 0; it < nt; ++it) {
    // Flux across the interface between boxes ix and ix+1
    for (int ix = 0; ix < NX-1; ++ix) {
      flux[ix] = (exp(c*log(q[ix]/q[ix+1]))-1.0)
                 * q[ix]*q[ix+1] / (q[ix]-q[ix+1]);
    }
    // Interior boxes gain the flux entering from the left and lose
    // the flux leaving to the right
    for (int ix = 1; ix < NX-1; ++ix) {
      q[ix] += flux[ix-1]-flux[ix];
    }
    // Boundary conditions: the two end boxes copy the values from the
    // opposite end of the interior
    q[0] = q[NX-2];
    q[NX-1] = q[1];
  }
}

// Main program to test checkpointing
int
main(int argc, char** argv)
{
  // Overview: the same nt*nblocks-timestep simulation is differentiated
  // twice, first by recording the whole run in one go and then by
  // saving the field at nblocks checkpoints and differentiating one
  // block of nt timesteps at a time, working backwards.  Both parts
  // print J, the final field and dJ/dq so the outputs can be compared.
  // The exit status is 1 if any NaN appears in either set of gradients
  // and 0 otherwise.  argc and argv are not used.
  Timer timer;
  timer.print_on_exit(true);

  // Note that in single precision the derivative calculation causes a
  // floating-point error due to negative overflow
  //  feenableexcept(FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW);

  const double pi = 4.0*atan(1.0);

  // Edit these variables to change properties of simulation
  const int nblocks = 100;   // Number of checkpoints
  const int nt = 100;        // Number of timesteps between checkpoints
  const double dt = 0.125;   // Timestep (actually a Courant number)

  // Initial values of field as a double array
  double q_init_save[NX];

  // First initialize the field - note that the Toon function does not
  // like identical values next to each other
  for (int i = 0; i < NX; i++) {
    q_init_save[i] = (0.5+0.5*sin((i*2.0*pi)/(NX-1.5)))+0.0001;
  }

  // We perform the simulation twice, once without checkpointing and
  // once with
  int full_id = timer.new_activity("Non-checkpointed simulation");
  int checkpointed_id = timer.new_activity("Checkpointed simulation");

  // Set to true if any gradient printed by either part is NaN
  bool nan_appeared = false;


  // PART 1: NON-CHECKPOINTED SIMULATION
  timer.start(full_id);
  { 
    // Note that we run each test in a pair of curly brackets so that
    // the Adept stack goes out of scope and is destructed before the
    // next test is performed
    std::cout << "*** NON-CHECKPOINTED SIMULATION ***\n";

    adept::Stack stack;

    adouble q_init[NX];  // Initial values of field as adouble array
    adouble q[NX];       // Final values

    // Rate of change of cost function with respect to initial values
    // of the field
    double dJ_dq[NX];

    // Copy initial values
    for (int i = 0; i < NX; i++) {
      q_init[i] = q_init_save[i];
    }

    // Run a simulation with nt*nblocks timesteps
    stack.new_recording();
    toon(nt*nblocks, dt, q_init, q);

    // Define a "cost function" J that is the sum of squared
    // differences between the final field and the initial field
    adouble J = 0.0;
    for (int i = 0; i < NX; i++) {
      J += (q[i]-q_init_save[i])*(q[i]-q_init_save[i]);
    }

    // In order to get the gradients of the cost function with respect
    // to the initial field, we first set the seed gradient of the
    // cost function to unity
    J.set_gradient(1.0);

    // Perform adjoint calculation
    stack.reverse();

    // Extract the gradients
    adept::get_gradients(q_init, NX, dJ_dq);
  
    // Print out the results
    std::cout << "J=" << J << "\n";
    std::cout << "q_final=[";
    for (int i = 0; i < NX; i++) {
      std::cout << " " << q[i];
    }
    std::cout << "]\n";
    std::cout << "dJ_dq=[";
    for (int i = 0; i < NX; i++) {
      std::cout << " " << dJ_dq[i];
      nan_appeared = nan_appeared || std::isnan(dJ_dq[i]);
    }
    std::cout << "]\n";
    std::cout << stack;
  }


  // PART 2: CHECKPOINTED SIMULATION
  timer.start(checkpointed_id);
  {
    std::cout << "*** CHECKPOINTED SIMULATION ***\n";
    adept::Stack stack;

    // We save the field at each checkpoint, where 0 corresponds to
    // the initial values and nblocks-1 corresponds to the final
    // checkpoint (which is not the very final set of values of the
    // field).  nblocks is declared const above, so this is an
    // ordinary fixed-size array.  If nblocks were made non-const,
    // this declaration would become a C99-style variable-length
    // array, which is not standard C++ and is only accepted by
    // compilers that provide it as an extension (e.g. gcc).
    adouble q_save[nblocks][NX];

    // This will be the very final set of values of the field
    adouble q[NX];

    // Rate of change of cost function with respect to initial values
    // of the field
    double dJ_dq[NX];

    // Copy initial values
    for (int i = 0; i < NX; i++) {
      q_save[0][i] = q_init_save[i];
    }

    // Run simulation in a set of blocks, saving the results each
    // time. Note that this step does not need to be automatically
    // differentiated, hence the use of pause_recording and
    // continue_recording.
    for (int i = 0; i < nblocks-1; i++) {
      stack.pause_recording();
      toon(nt, dt, q_save[i], q_save[i+1]);
      stack.continue_recording();
    }

    // Now we rerun the simulations multiple times with automatic
    // differentiation, each time stepping back to the previous block.
    // The first simulation is treated separately since this is the
    // one in which the gradient of the cost function is computed.
    stack.new_recording();
    toon(nt, dt, q_save[nblocks-1], q);

    // Define a "cost function" J that is the sum of squared
    // differences between the final field "q" and the initial field
    adouble J = 0.0;
    for (int i = 0; i < NX; i++) {
      J += (q[i]-q_init_save[i])*(q[i]-q_init_save[i]);
    }

    // In order to get the gradients of the cost function with respect
    // to the initial field, we first set the seed gradient of the
    // cost function to unity
    J.set_gradient(1.0);

    // Perform adjoint calculation
    stack.reverse();

    // Extract the gradients of the cost function with respect to the
    // values at the final checkpoint
    adept::get_gradients(q_save[nblocks-1], NX, dJ_dq);

    // Print out the simulation results (not yet the gradients)
    std::cout << "J=" << J << "\n";
    std::cout << "q_final=[";
    for (int i = 0; i < NX; i++) {
      std::cout << " " << q[i];
    }
    std::cout << "]\n";

    // Now we repeat the simulation starting one checkpoint earlier
    // each time, with the final simulation being performed starting
    // at the initial values of the field
    for (int i = nblocks-2; i >= 0; i--) {
      stack.new_recording();
      toon(nt, dt, q_save[i], q);

      // This time we use the set of gradients output from the previous
      // simulation (which can be thought of as dJ/dq_save[i+1]) as
      // the input gradients for the next
      adept::set_gradients(q, NX, dJ_dq);

      // Perform adjoint calculation
      stack.reverse();

      // Extract the next set of gradients (which can be thought of as
      // dJ/dq_save[i]) and place in dJ_dq ready for the next
      // iteration
      adept::get_gradients(q_save[i], NX, dJ_dq);
    }

    // Print out the gradients
    std::cout << "dJ_dq=[";
    for (int i = 0; i < NX; i++) {
      std::cout << " " << dJ_dq[i];
      nan_appeared = nan_appeared || std::isnan(dJ_dq[i]);
    }
    std::cout << "]\n";
    std::cout << stack;
  }
  timer.stop();

  // Report failure through the exit status if either part produced a
  // NaN gradient
  if (nan_appeared) {
    std::cerr << "*** Error: some NaNs appeared\n";
    return 1;
  }
  else {
    return 0;
  }

}


================================================
FILE: test/test_constructors.cpp
================================================
/* test_constructors.cpp - Test Adept's selection of constructors in a range of scenarios

    Copyright (C) 2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

#include <iostream>

#define ADEPT_BOUNDS_CHECKING 1
#define ADEPT_VERBOSE_FUNCTIONS
#define ADEPT_NO_ALIAS_CHECKING

#include <adept_arrays.h>

using namespace adept;

// Return v*v as a new Vector, taking the argument by const reference.
// The "inside function" marker is printed so that any output produced
// before and after it (this file defines ADEPT_VERBOSE_FUNCTIONS) can
// be attributed to argument passing or to the return respectively.
Vector square(const Vector& v) {
  std::cout << "  inside function\n";
  return v*v;
}

// Multiply v by itself in place, taking the argument by non-const
// reference so that the caller's Vector is modified.  Prints the same
// "inside function" marker as the other helper functions.
void square_in_place(Vector& v) {
  std::cout << "  inside function\n";
  v *= v;
}

// Take a Vector by value, negate it in place and return v*v.  The
// in-place negation makes it visible in the printed output whether the
// by-value parameter shares data with the caller's Vector: main()
// prints the argument after the call, and its C++98 verdict for this
// case is "unexpected change of argument".
Vector square_copy(Vector v) {
  std::cout << "  inside function\n";
  v *= -1.0;
  return v*v;
}

// Expands to a comma, so that macro arguments that themselves contain
// commas (initializer lists, template argument lists) can be passed
// as a single argument to the EVAL macros below.
#define COMMA ,

// Print a separator, MSG and the text of COMMAND, then execute
// COMMAND and print the value of X.  X is not printed beforehand
// because it is expected to be declared by COMMAND.  Note that this
// macro (like EVAL and EVAL2) expands to several statements that are
// deliberately not enclosed in a block, so that a declaration in
// COMMAND stays in scope for the final print; it must therefore only
// be used where a sequence of statements is valid.
#define EVAL_CONSTRUCT(MSG,X,COMMAND) std::cout << "--------------------------------------------------------------------\n" \
  << MSG << "\n" \
  << #COMMAND << "\n"; \
  COMMAND; \
  std::cout << #X << " = " << X << "\n"

// Print a separator, MSG, the value of X and the text of COMMAND,
// then execute COMMAND and print X again.
#define EVAL(MSG,X,COMMAND) std::cout << "--------------------------------------------------------------------\n" \
  << MSG << "\n" \
  << #X << " = " << X << "\n" \
  << #COMMAND << "\n"; \
  COMMAND; \
  std::cout << #X << " = " << X << "\n"

// As EVAL, but after executing COMMAND print both X and Y.  Y is only
// printed after COMMAND, so it may be declared by COMMAND.
 #define EVAL2(MSG,X,COMMAND,Y) std::cout << "--------------------------------------------------------------------\n" \
  << MSG << "\n" \
  << #X << " = " << X << "\n" \
  << #COMMAND << "\n"; \
  COMMAND;					\
  std::cout << #X << " = " << X << "\n" \
            << #Y << " = " << Y << "\n"

// Placeholder for a case that does not compile: COMMAND is only
// printed as text, never executed, and X is unused.
#define EVAL_FAIL(MSG,X,COMMAND) std::cout << "--------------------------------------------------------------------\n" \
  << MSG << "\n" \
  << #COMMAND << "\n" \
  << "DOES NOT COMPILE (INCORRECT BEHAVIOUR)\n"

// Same as EVAL_FAIL but with the argument list of EVAL2; X and Y are
// unused.
#define EVAL2_FAIL(MSG,X,COMMAND,Y) std::cout << "--------------------------------------------------------------------\n" \
  << MSG << "\n" \
  << #COMMAND << "\n" \
  << "DOES NOT COMPILE (INCORRECT BEHAVIOUR)\n"

// Print the author's assessment of the behaviour just demonstrated,
// for C++98 and C++11 respectively.
#define VERDICT98(MSG) std::cout << "Verdict for C++98: " << MSG << "\n"
#define VERDICT11(MSG) std::cout << "Verdict for C++11: " << MSG << "\n"

// Print a section heading.
#define HEADING(MSG) std::cout << "####################################################################\n" \
  << MSG << "\n"


// Exercise construction, assignment, linking and function-argument
// passing of Vector objects, printing the operands before and after
// each command via the EVAL macros.  Nothing is checked
// programmatically: the program always returns 0 and the printed
// output is intended to be inspected.
int
main() {

  // v is the working Vector and is reset from v_data after tests that
  // modify it; v_const is a const Vector with different values
  Vector v(2), w(2), v_data(2), v_const_data(2);
  v_data << 2, 3;
  v_const_data << 5, 7;
  v = v_data;
  const Vector v_const = v_const_data;

  adept::Stack stack;
  stack.new_recording();

  {
  // These tests are enclosed in a block because the EVAL2 commands
  // declare new variables, one of which (v_const) has the same name
  // as the const Vector declared above
  HEADING("COPY CONSTRUCTORS");
  EVAL2("Passing Vector as argument to Vector copy constructor",
	v, const Vector v2(v), v2);
  VERDICT98("correct");
  VERDICT11("should perform deep copy");

  EVAL2("Passing Vector as argument to const Vector copy constructor",
	v, const Vector v_const(v), v_const);
  VERDICT98("correct");
  VERDICT11("should perform deep copy");

  EVAL2("Passing const Vector as argument to const Vector copy constructor",
	v_const, const Vector v_const2(v_const), v_const2);
  VERDICT98("correct");
  VERDICT11("should perform deep copy");

  EVAL2("Passing const Vector as argument to Vector copy constructor",
	v_const, Vector v3(v_const), v3);
  VERDICT98("should not compile");
  VERDICT11("should perform deep copy");
  }

#ifdef ADEPT_CXX11_FEATURES
  // COMMA is used in place of "," so that each initializer list is
  // passed to the macro as a single argument
  HEADING("INITIALIZER LISTS");
  EVAL_CONSTRUCT("Construct Vector from initializer list of ints",
	v1, Vector v1 = {1 COMMA 2 COMMA 3});
  EVAL_CONSTRUCT("Construct Vector from initializer list of doubles",
	v1d, Vector v1d = {1.0 COMMA 2.0 COMMA 3.0});
  EVAL_CONSTRUCT("Construct Matrix from initializer list",
		 M, Matrix M = { {1 COMMA 2} COMMA {3} } );
  EVAL_CONSTRUCT("Construct Array3D from initializer list",
		 A3, Array3D A3 = { { {1 COMMA 2} COMMA {3} } COMMA { { 4 } } } );
  EVAL_CONSTRUCT("Construct FixedVector from initializer list",
		 fv1, Vector3 fv1 = {1 COMMA 2});
  EVAL_CONSTRUCT("Construct FixedMatrix from initializer list",
		 fM, Matrix33 fM = { {1 COMMA 2} COMMA {3} } );
  EVAL_CONSTRUCT("Construct FixedArray3D from initializer list",
		 fA3, FixedArray<double COMMA false COMMA 3 COMMA 3 COMMA 3> fA3 = { { {1 COMMA 2} COMMA {3} } COMMA { { 4 } } } );
#endif

  HEADING("ASSIGNMENT OPERATOR");
  EVAL2("Passing Vector to assignment operator",
	v, w = v, w);
  EVAL2("Passing const Vector to assignment operator",
	v_const, w = v_const, w);
  EVAL2("Passing Vector rvalue to assignment operator",
	v, w = v(stride(1,0,-1)), w);
  EVAL2("Passing const-Vector rvalue to assignment operator",
	v_const, w = v_const(stride(1,0,-1)), w);
  EVAL2("Passing Expression to assignment operator",
	v, w = v+v, w);

  HEADING("PASSING Vector TO FUNCTIONS");
  EVAL2("Passing Vector as argument to function taking const Vector&",
       v, w = square(v), w);
  VERDICT98("too many copies");
  VERDICT11("could replace last copy with a move");
  EVAL("Passing Vector as argument to function taking Vector&",
       v, square_in_place(v));
  VERDICT98("correct");

  // Restore v, which was modified by square_in_place
  v = v_data;
  EVAL2("Passing Vector as argument to function taking Vector",
       v, w = square_copy(v), w);
  VERDICT98("too many copies, unexpected change of argument");
  VERDICT11("should do deep copy on input, replace last copy with a move");

  /*

    // Behaves same as passing non-const Vector, which is correct

  // Passing const Vector
  EVAL2("Passing const Vector as argument to function taking const Vector&",
       v_const, w = square(v_const), w);
  // The following should not compile:
  //  EVAL("Passing const Vector as argument to function taking Vector&",
  //       v_const, square_in_place(v_const));
  EVAL2("Passing const Vector as argument to function taking Vector",
       v_const, w = square_copy(v_const), w);

  */


  HEADING("LINKING");
  w.clear();
  EVAL2("Linking to Vector",
	v, w >>= v, w);

  /*
  w.clear();
  // This should not compile
  EVAL2("Linking to const Vector",
	v_const, w >>= v_const, w);
  */
  w.clear();
  EVAL2("Linking to Vector rvalue",
	v, w >>= v(stride(1,0,-1)), w);

  /*
  // This should not compile
  w.clear();
  EVAL2("Linking to const-Vector rvalue",
	v_const, w >>= v_const(stride(1,0,-1)), w);
  */
  /*
    // This should not compile
  w.clear();
  EVAL2("Linking to Expression",
	v, w >>= v+v, w);
  VERDICT98("this doesn't make much sense");
  */

  // NOTE(review): this section repeats the "PASSING Vector TO
  // FUNCTIONS" tests from before the LINKING section verbatim.  The
  // difference is that w has not been cleared since it was linked to
  // v(stride(1,0,-1)) above, so the results may differ - presumably
  // intentional, but confirm it is not an accidental duplicate.
  HEADING("PASSING Vector TO FUNCTIONS");
  EVAL2("Passing Vector as argument to function taking const Vector&",
       v, w = square(v), w);
  VERDICT98("too many copies");
  VERDICT11("could replace last copy with a move");
  EVAL("Passing Vector as argument to function taking Vector&",
       v, square_in_place(v));
  VERDICT98("correct");

  v = v_data;
  EVAL2("Passing Vector as argument to function taking Vector",
       v, w = square_copy(v), w);
  VERDICT98("too many copies, unexpected change of argument");
  VERDICT11("should do deep copy on input, replace last copy with a move");


  HEADING("PASSING Vector RVALUE TO FUNCTIONS");
  EVAL2("Passing Vector rvalue as argument to function taking const Vector&",
	v, w = square(v(stride(1,0,-1))), w);
  VERDICT98("correct");
  // EVAL_FAIL only prints the command: it is not compiled or executed
  EVAL_FAIL("Passing Vector rvalue as argument to function taking Vector&",
       v, square_in_place(v(stride(1,0,-1))));
  VERDICT98("Vector subset functions could return references?");

  v = v_data;
  EVAL2("Passing Vector rvalue as argument to function taking Vector",
	     v, w = square_copy(v(stride(1,0,-1))), w);
  VERDICT98("Vector subset functions could return references?");
  VERDICT11("Should use move function");

  HEADING("PASSING const Vector RVALUES TO FUNCTIONS");
  EVAL2("Passing const-Vector rvalue as argument to function taking const Vector&",
	v_const, w = square(v_const(stride(1,0,-1))), w);
  VERDICT98("correct");
  // This should not compile
  //  EVAL("Passing const-Vector rvalue as argument to function taking Vector&",
  //       v_const, square_in_place(v_const(stride(1,0,-1))));
  //  VERDICT98("Vector subset functions could return references?");
  EVAL2("Passing const-Vector rvalue as argument to function taking Vector",
	     v_const, w = square_copy(v_const(stride(1,0,-1))), w);
  VERDICT98("correct");
  //  VERDICT11("Should use move function");

  HEADING("PASSING Expression TO FUNCTIONS");
  EVAL2("Passing Expression as argument to function taking const Vector&",
       v, w = square(v+v), w);
  VERDICT98("Unclear why copy-assignment + constructor needed");
  // This should not compile:
  //  EVAL("Passing Expression as argument to function taking Vector&",
  //       v, square_in_place(v+v));
  v = v_data;
  EVAL2("Passing Expression as argument to function taking Vector",
       v, w = square_copy(v+v), w);
  VERDICT98("Unclear why copy-assignment + constructor needed");

  return 0;
}


================================================
FILE: test/test_derivatives.cpp
================================================
/* test_derivatives.cpp - Test derivatives of mathematical functions 

    Copyright (C) 2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.

*/

#include <adept_arrays.h>


// TEST_UNARY_FUNC(FUNC): check the derivative of y=FUNC(x) computed
// by Adept against a one-sided finite difference.
//
// The macro relies on the following names being in scope where it is
// expanded: stack, x_save, dx, N, MAX_FRAC_ERR and error_too_large.
// For each element i, the gradient of x[i] is set to 1, stack.forward()
// is called and the gradient of y[i] is stored as the Adept
// derivative.  The numerical derivative is
// (FUNC(x_save+dx)-FUNC(x_save))/dx.
//
// Exactly one verdict is printed per function:
//   - PASSED if the two derivatives agree exactly.  This case is
//     tested first and exclusively, because a derivative that is
//     exactly zero makes the fractional error 0/0;
//   - PASSED if the largest fractional error is no more than
//     MAX_FRAC_ERR;
//   - FAILED otherwise, in which case both derivatives are printed
//     and error_too_large is set to true.
// The fractional error is normalized by the magnitude of the Adept
// derivative; dividing by the signed derivative would make the error
// non-positive, and hence always acceptable, wherever the derivative
// is negative.
#define TEST_UNARY_FUNC(FUNC)					\
  {								\
    std::cout << "  Checking " << #FUNC << "... \t";		\
    aVector x = x_save;						\
    stack.new_recording();					\
    aVector y = FUNC(x);					\
    Vector dy_dx_num  = (FUNC(x_save+dx)-FUNC(x_save)) / dx;	\
    Vector dy_dx_adept(N);					\
    for (int i = 0; i < N; ++i) {				\
      x[i].set_gradient(1.0);					\
      stack.forward();						\
      y[i].get_gradient(dy_dx_adept[i]);			\
    }								\
    Real max_err						\
      = maxval(abs(dy_dx_adept-dy_dx_num));			\
    Real max_frac_err						\
      = maxval(abs(dy_dx_adept-dy_dx_num)/abs(dy_dx_adept));	\
    if (max_err == 0) {						\
      std::cout << "max error = 0: PASSED\n";			\
    }								\
    else if (max_frac_err <= MAX_FRAC_ERR) {			\
      std::cout << "max fractional error = " << max_frac_err	\
		<< ": PASSED\n";				\
    }								\
    else {							\
      std::cout << "max fractional error = "			\
		<< max_frac_err << ": FAILED\n";		\
      std::cout << "    Adept     dy/dx = "			\
		<< dy_dx_adept << "\n";				\
      std::cout << "    Numerical dy/dx = " << dy_dx_num << "\n";	\
      error_too_large = true;					\
    }								\
  }

// TEST_BINARY_FUNC(FUNC): check the derivatives of z=FUNC(x,y) with
// respect to x and y computed by Adept against one-sided finite
// differences.
//
// The macro relies on the following names being in scope where it is
// expanded: stack, x_save, y_save, dx, dy, N, MAX_FRAC_ERR and
// error_too_large.  For each element i, the gradient of z[i] is set
// to 1, stack.reverse() is called and the gradients of x[i] and y[i]
// are stored as the Adept derivatives.  The numerical derivatives are
// obtained by perturbing x_save by dx and y_save by dy in turn.
//
// Exactly one verdict is printed per function:
//   - PASSED if both pairs of derivatives agree exactly.  This case
//     is tested first and exclusively, because a derivative that is
//     exactly zero makes the fractional error 0/0;
//   - PASSED if the largest fractional error is no more than
//     MAX_FRAC_ERR;
//   - FAILED otherwise, in which case all derivatives are printed and
//     error_too_large is set to true.
// The fractional errors are normalized by the magnitude of the Adept
// derivatives; dividing by the signed derivatives would make the
// error non-positive, and hence always acceptable, wherever a
// derivative is negative.
#define TEST_BINARY_FUNC(FUNC)					\
  {								\
    std::cout << "  Checking " << #FUNC << "... \t";		\
    aVector x = x_save;						\
    aVector y = y_save;						\
    stack.new_recording();					\
    aVector z = FUNC(x,y);					\
    Vector dz_dx_num						\
      = (FUNC(x_save+dx,y_save)-FUNC(x_save,y_save)) / dx;	\
    Vector dz_dy_num						\
      = (FUNC(x_save,y_save+dy)-FUNC(x_save,y_save)) / dy;	\
    Vector dz_dx_adept(N);					\
    Vector dz_dy_adept(N);					\
    for (int i = 0; i < N; ++i) {				\
      z[i].set_gradient(1.0);					\
      stack.reverse();						\
      x[i].get_gradient(dz_dx_adept[i]);			\
      y[i].get_gradient(dz_dy_adept[i]);			\
    }								\
    Real max_err						\
      = std::max(maxval(abs(dz_dx_adept-dz_dx_num)),		\
		 maxval(abs(dz_dy_adept-dz_dy_num)));		\
    Real max_frac_err						\
      = std::max(maxval(abs(dz_dx_adept-dz_dx_num)/abs(dz_dx_adept)),	\
		 maxval(abs(dz_dy_adept-dz_dy_num)/abs(dz_dy_adept)));	\
    if (max_err == 0) {						\
      std::cout << "max error = 0: PASSED\n";			\
    }								\
    else if (max_frac_err <= MAX_FRAC_ERR) {			\
      std::cout << "max fractional error = " << max_frac_err	\
		<< ": PASSED\n";				\
    }								\
    else {							\
      std::cout << "max fractional error = "			\
		<< max_frac_err << ": FAILED\n";		\
      std::cout << "    Adept     dz/dx = " << dz_dx_adept << "\n";	\
      std::cout << "    Adept     dz/dy = " << dz_dy_adept << "\n";	\
      std::cout << "    Numerical dz/dx = " << dz_dx_num << "\n";	\
      std::cout << "    Numerical dz/dy = " << dz_dy_num << "\n";	\
      error_too_large = true;					\
    }								\
  }


int
main(int argc, const char** argv) {
  using namespace adept;

  Stack stack;

  static const int N             = 12;
  static const Real MAX_FRAC_ERR = 1.0e-5;

  Vector x_save(N);
  x_save = 0.2;
  x_save << 0.01, 0.4, 0.99;

  Vector y_save(N);
  y_save = 0.7;
  y_save << 0.9, 0.6, 0.1, -0.1;

  Real dx = 1.0e-8;

  if (sizeof(Real) < 8) {
    // Single precision only works with larger perturbations
    dx = 1.0e-5;
  }

  Real dy = dx;

  bool error_too_large = false;  

  std::cout << "EVALUATING UNARY FUNCTIONS\n";
  std::cout << "For functions of the form y=FUNC(x), where x=" << x_save << ",\n";
  std::cout << "checking that fractional difference between dy/dx computed using Adept\n";
  std::cout << "and numerically by perturbing x by " << dx << " is less than " << MAX_FRAC_ERR << ".\n";    

  
  TEST_UNARY_FUNC(-); // Unary minus
  TEST_UNARY_FUNC(+); // Unary plus
  TEST_UNARY_FUNC(log);
  TEST_UNARY_FUNC(log10);
  TEST_UNARY_FUNC(sin);
  TEST_UNARY_FUNC(cos);
  TEST_UNARY_FUNC(tan);
  TEST_UNARY_FUNC(asin);
  TEST_UNARY_FUNC(acos);
  TEST_UNARY_FUNC(atan);
  TEST_UNARY_FUNC(sinh);
  TEST_UNARY_FUNC(cosh);
  TEST_UNARY_FUNC(tanh);
  TEST_UNARY_FUNC(abs);
  TEST_UNARY_FUNC(fabs);
  TEST_UNARY_FUNC(exp);
  TEST_UNARY_FUNC(sqrt);
  TEST_UNARY_FUNC(ceil);
  TEST_UNARY_FUNC(floor);
  TEST_UNARY_FUNC(log2);
  TEST_UNARY_FUNC(expm1);
  TEST_UNARY_FUNC(exp2);
  TEST_UNARY_FUNC(log1p);
  TEST_UNARY_FUNC(asinh);
  TEST_UNARY_FUNC(acosh);
  TEST_UNARY_FUNC(atanh);
  TEST_UNARY_FUNC(erf);
  TEST_UNARY_FUNC(erfc);
  TEST_UNARY_FUNC(cbrt);
  TEST_UNARY_FUNC(round);
  TEST_UNARY_FUNC(trunc);
  TEST_UNARY_FUNC(rint);
  TEST_UNARY_FUNC(nearbyint);

  std::cout << "EVALUATING BINARY FUNCTIONS\n";
  std::cout << "For functions of the form z=FUNC(x,y), where x=" << x_save << ",\n";
  std::cout << "and y=" << y_save << ", checking that fractional difference between\n";
  std::cout << "dz/dx and dz/dy computed using Adept and numerically by perturbing\n";
  std::cout << "x and y by " << dx << " is less than " << MAX_FRAC_ERR << ".\n";    

  TEST_BINARY_FUNC(pow);
  TEST_BINARY_FUNC(atan2);
  TEST_BINARY_FUNC(max);
  TEST_BINARY_FUNC(min);
  TEST_BINARY_FUNC(fmax);
  TEST_BINARY_FUNC(fmin);
  TEST_BINARY_FUNC(copysign);


  if (error_too_large) {
    std::cerr << "*** Error: fractional error in the derivatives of some functions too large\n";

    if (sizeof(Real) < 8) {
      std::cerr << "*** (but you are using less than double precision so it is not surprising)\n";
    }

    return 1;
  }
  else {
    return 0;
  }
}


================================================
FILE: test/test_fastexp.cpp
================================================
/* test_fastexp.cpp - Test Adept's fast exponential for correctness 

  Copyright (C) 2020 European Centre for Medium-Range Weather Forecasts

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.

  This file tests Adept's fast exponential function "fastexp", which
  is vectorizable.
*/

#include <iostream>
#include <limits>
#include "adept_arrays.h"

using namespace adept;

int main(int argc, const char** argv)
{
  {
    std::cout << "DOUBLE PRECISION\n";
    std::cout << "Packet<double>::size = " << internal::Packet<double>::size << "\n";
    Vector x = linspace(-750.0,750.0,128);
    x(end) = std::numeric_limits<double>::quiet_NaN();
    Vector exponential = exp(x);
    Vector fast_exponential = fastexp(x);
    Vector fractional_error = (fast_exponential - exponential) / exponential;
    //    std::cout << fractional_error << "\n";
    Matrix M(128,4);
    M(__,0) = x;
    M(__,1) = exponential;
    M(__,2) = fast_exponential;
    M(__,3) = fractional_error;
    std::cout << "x  exp(x)  fastexp(x)  fractional-error";
    std::cout << M << "\n";
  }
  {
    std::cout << "\nSINGLE PRECISION\n";
    std::cout << "Packet<float>::size = " << internal::Packet<float>::size << "\n";
    floatVector x = linspace(-100.0,100.0,128);
    x(end) = std::numeric_limits<float>::quiet_NaN();
    floatVector exponential = exp(x);
    floatVector fast_exponential = fastexp(x);
    floatVector fractional_error = (fast_exponential - exponential) / exponential;
    floatMatrix M(128,4);
    M(__,0) = x;
    M(__,1) = exponential;
    M(__,2) = fast_exponential;
    M(__,3) = fractional_error;
    std::cout << "x  exp(x)  fastexp(x)  fractional-error";
    std::cout << M << "\n";
  }
  return 0;
}


================================================
FILE: test/test_fixed_arrays.cpp
================================================
/* test_fixed_arrays.cpp - Test Adept's fixed-size array functionality

    Copyright (C) 2016-2017 European Centre for Medium-Range Weather Forecasts

    Author: Robin Hogan <r.j.hogan@ecmwf.int>

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

#include <iostream>

#define ADEPT_BOUNDS_CHECKING 1

#include <adept_arrays.h>
#include <adept/FixedArray.h>

// The following controls whether to use active variables or not
//#define ALL_ACTIVE 1
//#define MARVEL_STYLE 1

using namespace adept;

int
main(int argc, const char** argv) {
  using namespace adept;
  Stack stack;
  
#define HEADING(MESSAGE) \
  std::cout << "====================================================================\n"	\
	    << "   TESTING " << MESSAGE << "\n"

#define EVAL(MESSAGE, TYPE, X, EXPR)					\
  std::cout << "--------------------------------------------------------------------\n" \
	    << "### " << MESSAGE << "\n### " << #EXPR << "\n";	\
  try {								\
    TYPE X;								\
    X = test. X;							\
    std::cout << #TYPE << " " << #X << " = " << X << "\n";		\
    std::cout << "Evaluating " << #EXPR << "\n";			\
    std::cout.flush();						\
    EXPR;								\
    std::cout << "Result: " << #X << " = " << X << "\n";		\
    if (should_fail) { std::cout << "*** INCORRECT OUTCOME\n";	\
      anomalous_results++;						\
    }									\
  } catch (const adept::exception& e) {					\
    std::cout << "*** Failed with: " << e.what() << "\n";		\
    if (!should_fail) { std::cout << "*** INCORRECT OUTCOME\n";		\
      anomalous_results++;						\
    }									\
    else {								\
      std::cout << "*** Correct behaviour\n";				\
    }									\
  }

#define EVAL2(MESSAGE, TYPEX, X, TYPEY, Y, EXPR)			\
  std::cout << "--------------------------------------------------------------------\n" \
	    << "### " << MESSAGE << "\n### " << #EXPR << "\n";	\
  try {									\
    TYPEX X;								\
    X = test. X;							\
    std::cout << #TYPEX << " " << #X << " = " << X << "\n";		\
    TYPEY Y; Y = test. Y;						\
    std::cout << #TYPEY << " " << #Y << " = " << Y << "\n";		\
    std::cout << "Evaluating " << #EXPR << "\n";			\
    std::cout.flush();							\
    EXPR;								\
    std::cout << "Result: " << #X << " = " << X << "\n";		\
    if (should_fail) { std::cout << "*** INCORRECT OUTCOME\n";	        \
      anomalous_results++;						\
    }									\
  } catch (const adept::exception& e) {					\
    std::cout << "*** Failed with: " << e.what() << "\n";		\
    if (!should_fail) { std::cout << "*** INCORRECT OUTCOME\n";		\
      anomalous_results++;						\
    }									\
    else {								\
      std::cout << "*** Correct behaviour\n";				\
    }									\
  }


#define EVAL3(MESSAGE, TYPEX, X, TYPEY, Y, TYPEZ, Z, EXPR)		\
  std::cout << "--------------------------------------------------------------------\n" \
	    << "### " << MESSAGE << "\n### " << #EXPR << "\n";	\
  try {									\
    TYPEX X;								\
    X = test. X;							\
    std::cout << #TYPEX << " " << #X << " = " << X << "\n";		\
    TYPEY Y; Y = test. Y;						\
    TYPEZ Z; Z = test. Z;						\
    std::cout << #TYPEY << " " << #Y << " = " << Y << "\n";		\
    std::cout << #TYPEZ << " " << #Z << " = " << Z << "\n";		\
    std::cout << "Evaluating " << #EXPR << "\n";			\
    std::cout.flush();							\
    EXPR;								\
    std::cout << "Result: " << #X << " = " << X << "\n";		\
    if (should_fail) { std::cout << "*** INCORRECT OUTCOME\n";	        \
      anomalous_results++;						\
    }									\
  } catch (const adept::exception& e) {					\
    std::cout << "*** Failed with: " << e.what() << "\n";		\
    if (!should_fail) { std::cout << "*** INCORRECT OUTCOME\n";		\
      anomalous_results++;						\
    }									\
    else {								\
      std::cout << "*** Correct behaviour\n";				\
    }									\
  }

#define EVAL_NO_TRAP(MESSAGE, TYPE, X, EXPR)				\
  std::cout << "--------------------------------------------------------------------\n" \
	    << "### " << MESSAGE << "\n### " << #EXPR << "\n";	\
  {									\
    TYPE X;								\
    X = test. X;							\
    std::cout << #TYPE << " " << #X << " = " << X << "\n";		\
    std::cout << "Evaluating " << #EXPR << "\n";			\
    std::cout.flush();						\
    EXPR;								\
    std::cout << "Result: " << #X << " = " << X << "\n";		\
    if (should_fail) { std::cout << "*** INCORRECT OUTCOME\n";	\
      anomalous_results++;						\
    }									\
  }  

#define EVAL2_NO_TRAP(MESSAGE, TYPEX, X, TYPEY, Y, EXPR)			\
  std::cout << "--------------------------------------------------------------------\n" \
	    << "### " << MESSAGE << "\n### " << #EXPR << "\n";	\
  {									\
    TYPEX X;								\
    X = test. X;							\
    std::cout << #TYPEX << " " << #X << " = " << X << "\n";		\
    TYPEY Y; Y = test. Y;						\
    std::cout << #TYPEY << " " << #Y << " = " << Y << "\n";		\
    std::cout << "Evaluating " << #EXPR << "\n";			\
    std::cout.flush();							\
    EXPR;								\
    std::cout << "Result: " << #X << " = " << X << "\n";		\
    if (should_fail) { std::cout << "*** INCORRECT OUTCOME\n";	\
      anomalous_results++;						\
    }									\
  }


#ifdef ALL_ACTIVE
#define IS_ACTIVE true
#else
#define IS_ACTIVE false
#endif

  typedef FixedArray<double,IS_ACTIVE,2> myVector2;
  typedef FixedArray<double,IS_ACTIVE,3> myVector3;
  typedef FixedArray<double,IS_ACTIVE,1,2> myMatrix12;
  typedef FixedArray<double,IS_ACTIVE,3,3> myMatrix33;
  typedef FixedArray<double,IS_ACTIVE,2,3> myMatrix23;
  typedef FixedArray<double,IS_ACTIVE,3,2> myMatrix32;
  typedef FixedArray<double,IS_ACTIVE,2,2> myMatrix22;

#ifndef ALL_ACTIVE
  typedef Real myReal;
  typedef SymmMatrix mySymmMatrix;
  typedef DiagMatrix myDiagMatrix;
  typedef TridiagMatrix myTridiagMatrix;
  typedef LowerMatrix myLowerMatrix;
  typedef UpperMatrix myUpperMatrix;
#else
  typedef aReal myReal;
  typedef aSymmMatrix mySymmMatrix;
  typedef aDiagMatrix myDiagMatrix;
  typedef aTridiagMatrix myTridiagMatrix;
  typedef aLowerMatrix myLowerMatrix;
  typedef aUpperMatrix myUpperMatrix;
#endif

  //  typedef SpecialMatrix<Real,SymmEngine<ROW_UPPER_COL_LOWER>,false> mySymmMatrix;
  //  typedef SpecialMatrix<Real,BandEngine<COL_MAJOR,0,0>,false> myDiagMatrix;
  //  typedef SpecialMatrix<Real,BandEngine<COL_MAJOR,1,1>,false> myTridiagMatrix;


  struct Test {

    myReal x;
    myVector2 z;
    myVector3 v, w;
    myMatrix12 K;
    myMatrix23 M, N;
    myMatrix33 S, C;
    myMatrix32 A;
    myMatrix22 B;

    mySymmMatrix O, P;
    myDiagMatrix D, E;
    myTridiagMatrix T, TT;
    myLowerMatrix L, LL;
    myUpperMatrix U, UU;

    intVector index;


    Test() {
      x = -2;

      O.resize(3);
      //      Q.resize(5);
      index.resize(2);
      v(0) = 2; v(1) = 3; v(2) = 5;
      w(0) = 7; w(1) = 11; w(2) = 13;
      M(0,0) = 2; M(0,1) = 3; M(0,2) = 5;
      M(1,0) = 7; M(1,1) = 11; M(1,2) = 13;
      N(0,0) = 17; N(0,1) = 19; N(0,2) = 23;
      N(1,0) = 29; N(1,1) = 31; N(1,2) = 37;
      S(0,0) = 2; S(0,1) = 3; S(0,2) = 5;
      S(1,0) = 7; S(1,1) = 11; S(1,2) = 13;
      S(2,0) = 17; S(2,1) = 19; S(2,2) = 23;

      K << 57, 59;
      z << 37, 47;

      A << 21,22,23,24,25,26;
      B << 31,32,33,34;

      //      O = -M.T();


      O(0,0) = 7;
      O(1,0) = 2; O(1,1) = 11;
      O(2,0) = 3; O(2,1) = 5; O(2,2) = 13;
      /*

      P = 14-O;

      Q.diag_vector(-2) = 1;
      Q.diag_vector(-1) = 2;
      Q.diag_vector(0)  = 3;
      Q.diag_vector(1)  = 4;
      */

      C = 0;
      D = S;
      T = S;
      L = S;
      U = S;
      index << 1, 0;
    }
  };

  stack.new_recording();

  Test test;

  bool should_fail=false;
  int anomalous_results=0;

#ifdef ALL_ACTIVE
  std::cout << "Testing ACTIVE arrays\n";
#else
  std::cout << "Testing INACTIVE arrays\n";
#endif


  HEADING("BASIC EXPRESSIONS");
  EVAL2("Vector assignment to vector", myVector3, v, myVector3, w, v = w);
  EVAL2("Vector assignment to expression", myVector3, v, myVector3, w, v = log(w) + 1.0);

  EVAL("Matrix *= operator", myMatrix23, M, M *= 0.5);
  EVAL2("Matrix = scalar", myMatrix23, M, myReal, x, M = x);

  EVAL2("Matrix = scalar expression", myMatrix23, M, myReal, x, M = (10.0*x));
  HEADING("BASIC FUNCTIONS");
  EVAL2("max", myVector3, v, myVector3, w, v = max(v,w/3.0));
  EVAL2("min", myVector3, v, myVector3, w, v = min(v,w/3.0));

  HEADING("ARRAY SLICING");
  EVAL2("Array indexing rvalue", myReal, x, myMatrix23, M, x = M(1,end-1));

  should_fail=true;
  EVAL2("Array indexing rvalue out of range (SHOULD FAIL)", myReal, x, myMatrix23, M, x = M(1,3));
  should_fail=false;

  EVAL("Array indexing lvalue", myMatrix23, M, M(1,end-1) *= -1.0);

  EVAL2("contiguous subarray rvalue", myVector3, v, myMatrix23, M, v = M(end,__));
  EVAL("contiguous subarray lvalue", myMatrix23, M, M(end-1,__) /= 2.0);
  EVAL2("contiguous subarray rvalue using range", myVector2, z, myMatrix23, M, z = 2.0 * M(1,range(1,2)));
  EVAL2("contiguous subarray lvalue using range", myMatrix23, M, myVector3, v, M(end-1,range(0,1)) = log(v(range(1,2))));
  EVAL2("contiguous subarray rvalue using subset", myMatrix12, K, myMatrix23, N, K = 2.0 * N.subset(1,1,1,2));
  EVAL("contiguous subarray lvalue using subset", myVector3, v, v.subset(end-1,end) *= 10.0);

  EVAL2("regular subarray rvalue", myVector3, v, myVector3, w, v = w(stride(end,0,-1)));
  EVAL2("regular subarray lvalue", myMatrix23, M, myVector3, w, M(0,stride(0,end,2)) *= w(stride(end,0,-2)));
  EVAL("irregular subarray rvalue", myMatrix23, M, M(stride(1,0,-1),find(M(0,__)>4)) = 0);
  EVAL("slice leading dimension", myMatrix23, M, M[end] = 0);
  EVAL("slice two dimensions", myMatrix23, M, M[end][0] = 0);
  EVAL2("diag_vector member function as rvalue", myVector2, z, myMatrix33, S, z = diag_vector(S,1));
  EVAL2("diag_vector member function as lvalue", myMatrix33, S, myVector3, v, S.diag_vector() += v);
  EVAL2("diag_matrix member function", myMatrix33, S, myVector3, v, S = v.diag_matrix());
  EVAL2("diag_matrix external function", myMatrix33, S, myVector3, v, S = diag_matrix(v));
  EVAL2("transpose as rvalue via T member function", myMatrix32, A, myMatrix23, M, A = 2 * M.T());
  EVAL2("transpose as rvalue via permute member function", myMatrix32, A, myMatrix23, M, A = 2 * M.permute(1,0));
  //  EVAL3("2D arbitrary index as rvalue", myMatrix22, B, myMatrix23, N, intVector, index, B = const_cast<const myMatrix23&>(N)(index,index));
  EVAL3("2D arbitrary index as rvalue", myMatrix22, B, myMatrix23, N, intVector, index, B = N(index,index));
  EVAL3("2D arbitrary index as lvalue", myMatrix23, M, myMatrix23, N, intVector, index, M(index,index) = N(__,range(1,2)));
  EVAL2("2D arbitrary index as lvalue with assign-multiply operator", myMatrix23, M, intVector, index, M(index,index) *= 10.0);
  EVAL2("2D arbitrary index as lvalue with aliased right-hand-side", myMatrix23, M, intVector, index, M(index,index) += M(__,range(1,2)));

  HEADING("REDUCTION OPERATIONS"); 
  EVAL2("full reduction", myReal, x, myMatrix23, M, x = sum(M));
  EVAL2("1-dimension reduction", myVector3, v, myMatrix23, M, v = 0.5 * mean(M,0));
  EVAL2("1-dimension reduction", myVector2, z, myMatrix23, M, z = norm2(M,1));
  EVAL2("maxval", myVector2, z, myMatrix23, M, z = maxval(M,1));
  EVAL2("minval", myVector2, z, myMatrix23, M, z = minval(M,1));
  EVAL2("dot product", myReal, x, myVector3, w, x = dot_product(w,w(stride(end,0,-1))));
  //  EVAL2("1D interpolation", myVector3, v, myVector3, w, (v = interp<double,double,true,double>(value(v), w, value(w)/3.0) ));
  EVAL2("1D interpolation", myVector3, v, myVector3, w, v = interp(value(v), w, value(w)/2.0));
  EVAL2("1D clamped interpolation", myVector3, v, myVector3, w, v = interp(value(v), w, value(w)/2.0, ADEPT_EXTRAPOLATE_CLAMP));
#ifndef ALL_ACTIVE
  EVAL2("1D interpolation of matrix", myMatrix23, M, myVector3, v, M = interp(v(range(0,1)), M, v(range(1,2))/2.0));
  EVAL2("1D clamped interpolation of matrix", myMatrix23, M, myVector3, v, M = interp(v(range(0,1)), M, v(range(1,2))/2.0, ADEPT_EXTRAPOLATE_CLAMP));
#endif
  HEADING("CONDITIONAL OPERATIONS");
  EVAL2("where construct, scalar right-hand-side", myMatrix23, M, myMatrix23, N, M.where(N > 20) = 0);
  EVAL2("where construct, expression right-hand-side", myMatrix23, M, myMatrix23, N, M.where(N > 20) = -N);
  EVAL2("where construct, scalar either-or right-hand-side", myMatrix23, M, myMatrix23, N, M.where(N > 20) = either_or(0,1));
  EVAL2("where construct, expression either-or right-hand-side", myMatrix23, M, myMatrix23, N, M.where(N > 20) = either_or(-N,N));
  EVAL("find construct, scalar right-hand-side", myVector3, v, v(find(v > 3.5)) = 0);
  EVAL("find construct, expression right-hand-side", myVector3, v, v(find(v > 3.5)) = -v(range(end,end)));
  EVAL("find construct, multiply-assign right-hand-side", myVector3, v, v(find(v != 5.0)) *= 10.0);

  HEADING("SPECIAL SQUARE MATRICES");
  EVAL2("SymmMatrix assign from fixed matrix", mySymmMatrix, O, myMatrix33, S, O = S);
  EVAL2("DiagMatrix assign from dense matrix", myDiagMatrix, D, myMatrix33, S, D = S);
  EVAL2("TridiagMatrix assign from dense matrix", myTridiagMatrix, T, myMatrix33, S, T = S);
  EVAL2("LowerMatrix assign from dense matrix", myLowerMatrix, L, myMatrix33, S, L = S);
  EVAL2("UpperMatrix assign from dense matrix", myUpperMatrix, U, myMatrix33, S, U = S);
  EVAL2("SymmMatrix as rvalue", myMatrix33, S, mySymmMatrix, O, S = O);
  EVAL2("DiagMatrix as rvalue", myMatrix33, S, myDiagMatrix, D, S = D);
  EVAL2("TridiagMatrix as rvalue", myMatrix33, S, myTridiagMatrix, T, S = T);
  EVAL2("LowerMatrix as rvalue", myMatrix33, S, myLowerMatrix, L, S = L);
  EVAL2("UpperMatrix as rvalue", myMatrix33, S, myUpperMatrix, U, S = U);

  EVAL2("Array submatrix_on_diagonal member function", myMatrix22, B, myMatrix33, S, B = S.submatrix_on_diagonal(1,2));
  EVAL("Array submatrix_on_diagonal member function as lvalue", myMatrix33, S, S.submatrix_on_diagonal(0,1) = 0);

  should_fail = true;
  EVAL2("Array submatrix_on_diagonal member function to non-square matrix", myMatrix22, B, myMatrix33, N, B = N.submatrix_on_diagonal(1,2));
  should_fail = false;

#ifndef MARVEL_STYLE
  if (adept::have_matrix_multiplication()) {
    HEADING("MATRIX MULTIPLICATION");
    EVAL2("Matrix-Matrix multiplication", myMatrix33, S, myMatrix23, M, S = M.T() ** M);
    EVAL2("Matrix-Matrix multiplication with matmul", myMatrix33, S, myMatrix23, M, S = matmul(M.T(), M));

    should_fail = true;
    EVAL2("Matrix-Matrix multiplication with inner dimension mismatch", myMatrix33, S, myMatrix23, M, S = M ** M);
    should_fail = false;
    
    // TESTING!
    EVAL2("Matrix-Matrix-Vector multiplication", myVector3, v, myMatrix33, S, v = S ** S ** v);
    
    EVAL2("Matrix-Matrix-Vector multiplication", myVector3, v, myMatrix33, S, v = S ** log(S) ** S(0,__));
    EVAL2("Vector-Matrix multiplication", myVector3, v, myMatrix33, S, v = v ** S);
    EVAL2("Vector-Matrix multiplication with matmul", myVector3, v, myMatrix33, S, v = matmul(v, S));
    EVAL2("SymmMatrix-Vector multiplication", myVector3, v, mySymmMatrix, O, v = O ** v);
    EVAL2("SymmMatrix-Matrix multiplication", myMatrix33, S, mySymmMatrix, O, S = O ** S);
    EVAL2("Vector-SymmMatrix multiplication", myVector3, v, mySymmMatrix, O, v = v ** O);
    EVAL2("Matrix-SymmMatrix multiplication", myMatrix23, M, mySymmMatrix, O, M = M ** O);
    EVAL2("DiagMatrix-Vector multiplication", myVector3, v, myDiagMatrix, D, v = D ** v);
    EVAL2("TridiagMatrix-Vector multiplication", myVector3, v, myTridiagMatrix, T, v = T ** v);
    EVAL2("TridiagMatrix-Matrix multiplication", myMatrix33, S, myTridiagMatrix, T, S = T ** S);
    EVAL2("Vector-TridiagMatrix multiplication", myVector3, v, myTridiagMatrix, T, v = v ** T);
    EVAL2("Matrix-TridiagMatrix multiplication", myMatrix23, M, myTridiagMatrix, T, M = M ** T);
  }
  else {
    std::cout << "NO MATRIX MULTIPLICATION TESTS PERFORMED BECAUSE ADEPT COMPILED WITHOUT LAPACK\n";
  }

#ifndef ALL_ACTIVE
  if (adept::have_linear_algebra()) {
    HEADING("LINEAR ALGEBRA");
    EVAL2("Solving general linear equations Ax=b", myVector3, v, myMatrix33, S, v = solve(S,v));
    
    EVAL2("Solving general linear equations AX=B", myMatrix23, M, myMatrix33, S, M.T() = solve(S,M.T()));
    EVAL2("Solving linear equations Ax=b with symmetric A", myVector3, v, mySymmMatrix, O, v = solve(O,v));
    EVAL2("Solving linear equations AX=B with symmetric A", myMatrix23, M, mySymmMatrix, O, M.T() = solve(O,M.T()));
    EVAL2("Invert general matrix", myMatrix33, C, myMatrix33, S, C = inv(S));
  }
  else {
    std::cout << "NO LINEAR ALGEBRA TESTS PERFORMED BECAUSE ADEPT COMPILED WITHOUT LAPACK\n";
  }    
#else
    std::cout << "NO LINEAR ALGEBRA TESTS PERFORMED BECAUSE ACTIVE ARRAYS NOT YET SUPPORTED\n";
#endif
#else
    std::cout << "NO MATRIX TESTS PERFORMED BECAUSE USING MARVEL-STYLE ACTIVE ARRAYS\n";
#endif


  HEADING("FILLING ARRAYS");
  EVAL("Fill vector with \"<<\"", myVector3, v, (v << 0.1, 0.2));

  should_fail = true;
  EVAL("Overfill vector with \"<<\"", myVector3, v, (v << 0.1, 0.2, 0.3, 0.4));
  should_fail = false;

  EVAL("Underfill matrix with \"<<\"", myMatrix23, M, (M << 0.1, 0.2, 0.3, 0.4, 0.5));
  EVAL("Fill matrix with \"<<\"", myMatrix23, M, (M << 0.1, 0.2, 0.3, 0.4, 0.5, 0.6));

  should_fail = true;
  EVAL("Overfill matrix with \"<<\"", myMatrix23, M, (M << 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0));
  should_fail = false;

  EVAL("Fill vector with vectors using \"<<\"", myVector3, v, v << v(range(1,2)) << 0.1);
  EVAL2("Fill matrix with vector using \"<<\"", myMatrix23, M, myVector3, v, M << 0.1 << 0.2 << 0.3 << v);
  EVAL2("Fill matrix with vector using \"<<\"", myMatrix33, S, myVector3, v, S << v << v << v);
  EVAL("Assign array using range", myVector3, v, v = range(3,5));

#ifdef ADEPT_BOUNDS_CHECKING
  HEADING("BOUNDS CHECKING");
  should_fail = true;
  EVAL("Access vector out of bounds", myVector3, v, v(0) = v(4));
  EVAL("Access vector out of bounds", myVector3, v, v(0) = v(end-4));
  EVAL("Access matrix out of bounds", myMatrix23, M, M(0,0) = M(0,-1));
  EVAL("Access matrix out of bounds", myMatrix23, M, M(0,0) = M(end+1,1));
  should_fail = false;
#endif


  std::cout << "====================================================================\n";
  if (anomalous_results > 0) {
    std::cout << "*** In terms of run-time errors, there were " << anomalous_results << " incorrect results\n";
    return 1;
  }
  else {
    std::cout << "In terms of run-time errors, all tests were passed\n";
    return 0;
  }
}


================================================
FILE: test/test_gsl_interface.cpp
================================================
/* test_gsl_interface.cpp - "main" function for Test 4

  Copyright (C) 2012-2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

// This program minimizes the N-dimensional Rosenbrock banana
// function, with the number of dimensions optionally provided on the
// command line

#include <iostream>
#include <vector>
#include <cstdlib>

#include "state.h"

int
main(int argc, char** argv)
{
  std::cout << "Testing Adept-GSL interface using N-dimensional Rosenbrock function\n";
  std::cout << "Usage: " << argv[0] << " [number_of_dimensions]\n";

  // Read number of dimensions from the command line (default 2)
  int nx = 2;
  if (argc > 1) {
    nx = std::atoi(argv[1]);
  }
   
  if (nx < 2) {
    std::cout << "Error: must have 2 or more dimensions, but "
	      << nx << " requested\n";
    return 1;
  }

  // Create minimization environment (see state.h) and then minimize
  // the function; note that initial values are set on construction.
  State state(nx);
  state.minimize();

  // Print out the result
  std::vector<double> x;
  state.x(x);
  std::cout << "Final state: x = [";
  for (int i = 0; i < nx; i++) {
    std::cout << " " << x[i];
  }
  std::cout << "]\n";
  
  return 0;
}


================================================
FILE: test/test_interp.cpp
================================================
/* test_interp.cpp

  Copyright (C) 2024- European Centre for Medium-Range Weather Forecasts

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.

  This file tests interpolation operations
*/

#include <iostream>
#include "adept_arrays.h"

using namespace adept;

// Print the source text of expression FUNC followed by " =", then the
// value of FUNC itself and a newline.  Used below for results that
// are matrices or 3D arrays, where the value comes after the label.
#define TEST_MULTI(FUNC)				\
  {						\
    std::cout << #FUNC << " =";		\
    std::cout << FUNC << "\n";			\
  }

// Print the value of expression FUNC first, then a separator
// containing a tab and "=", then the source text of FUNC, so that the
// results of successive calls can be compared line by line.
#define TEST(FUNC)					\
  {							\
    std::cout << FUNC << "     \t = " << #FUNC << "\n";	\
  }

// Exercise interp(), interp2d() and interp3d() on small hand-built
// data sets.  Every call is printed next to its own source text via
// the TEST and TEST_MULTI macros; no pass/fail checking is performed
// here and the function always returns 0.
int
main(int argc, const char** argv)
{
  // Select the PRINT_STYLE_MATLAB array print style for all output
  set_array_print_style(PRINT_STYLE_MATLAB);
  {
    // --- 1D: coordinate vector x, data m, target points xi ---
    std::cout << "*** 1D interpolation ***\n\n";
    Vector x = {1.0, 4.0, 9.0};
    Vector m = {2.0, 3.0, 5.0};
    Vector xi = {4.0, 4.8, 3.0, 0.5, 10.0};
    std::cout << "Coordinate vector and interpolation vector:\n";
    std::cout << "x  = " << x << "\n";
    std::cout << "m  = " << m << "\n";
    std::cout << "xi = " << xi << "\n";
    std::cout << "...which are:\n"
	      << "  (1) at a point in the interpolation vector,\n"
	      << "  (2) between points in the interpolation vector (closer to left),\n"
      	      << "  (3) between points in the interpolation vector (closer to right),\n"
	      << "  (4) off the left of the interpolation vector, and\n"
	      << "  (5) off the right of the interpolation vector.\n\n";
    // Default call followed by each extrapolation option in turn
    TEST(interp(x,m,xi));
    TEST(interp(x,m,xi,ADEPT_EXTRAPOLATE_LINEAR));
    TEST(interp(x,m,xi,ADEPT_EXTRAPOLATE_CLAMP));
    TEST(interp(x,m,xi,ADEPT_EXTRAPOLATE_CONSTANT));
    TEST(interp(x,m,xi,ADEPT_EXTRAPOLATE_CONSTANT,-10.0));
    // Same data indexed with stride(end,0,-1), i.e. presumably in
    // reverse order, and then passed as expressions (x+0.0 etc.)
    // rather than as plain arrays
    TEST(interp(x(stride(end,0,-1)),m(stride(end,0,-1)),xi,ADEPT_EXTRAPOLATE_LINEAR));
    TEST(interp(x+0.0,m+0.0,xi+0.0,ADEPT_EXTRAPOLATE_LINEAR));
    // Repeat with the ADEPT_INTERPOLATE_NEAREST option
    TEST(interp(x,m,xi,ADEPT_INTERPOLATE_NEAREST));
    TEST(interp(x,m,xi,ADEPT_INTERPOLATE_NEAREST|ADEPT_EXTRAPOLATE_CLAMP));
    TEST(interp(x,m,xi,ADEPT_INTERPOLATE_NEAREST|ADEPT_EXTRAPOLATE_CONSTANT));
    TEST(interp(x,m,xi,ADEPT_INTERPOLATE_NEAREST|ADEPT_EXTRAPOLATE_CONSTANT,-10.0));
    TEST(interp(x(stride(end,0,-1)),m(stride(end,0,-1)),xi,ADEPT_INTERPOLATE_NEAREST));

    // Matrix built from m with spread<1>, interpolated in a single call
    Matrix M = spread<1>(m,2);
    std::cout << "\n*** Multiple 1D linear interpolation ***\n";
    std::cout << "M = " << M << "\n";
    TEST_MULTI(interp(x,M,xi));
    TEST_MULTI(interp(x,M,xi,ADEPT_INTERPOLATE_NEAREST));
  }

  
  {
    // --- 2D: coordinate vectors y and x, data matrix M ---
    std::cout << "\n*** 2D linear interpolation ***\n\n";
    int nx = 4;
    int ny = 3;

    // y is the square of a linspace vector; x is the linspace vector
    // itself
    Vector y = pow(linspace(1.0,ny,ny),2.0);
    Vector x = linspace(1.0,nx,nx);
    Matrix M = {{2.0,3.0,5.0,7.0},
		{11.0,13.0,17.0,19.0},
		{23.0,29.0,31.0,37.0}};//outer_product(y,x);
    
    // Target points, described in the message printed below
    Vector yi = {4.0, 2.0, 6.5, 0.5};
    Vector xi = {2.0, 3.8, 0.5, 5.0};

    std::cout << "Coordinate vectors and interpolation matrix:\n";
    std::cout << "y = " << y << "\n";
    std::cout << "x = " << x << "\n";
    std::cout << "M = " << M << "\n";
    std::cout << "\nTo be interpolated to the following points:\n";
    std::cout << "yi = " << yi << "\n";
    std::cout << "xi = " << xi << "\n";
    std::cout << "...which are:\n"
	      << "  (1) at a point in the interpolation matrix,\n"
	      << "  (2) between points in the interpolation matrix,\n"
	      << "  (3) off the left of the matrix, and\n"
	      << "  (4) off the top-right of the matrix.\n\n";
  
    // Same sequence of options as in the 1D block above
    TEST(interp2d(y,x,M,yi,xi));
    TEST(interp2d(y,x,M,yi,xi,ADEPT_EXTRAPOLATE_LINEAR));
    TEST(interp2d(y,x,M,yi,xi,ADEPT_EXTRAPOLATE_CLAMP));
    TEST(interp2d(y,x,M,yi,xi,ADEPT_EXTRAPOLATE_CONSTANT));
    TEST(interp2d(y,x,M,yi,xi,ADEPT_EXTRAPOLATE_CONSTANT,-10.0));
    TEST(interp2d(y(stride(end,0,-1)),x,M(stride(end,0,-1),__),yi,xi));
    TEST(interp2d(y+0.0,x+0.0,M+0.0,yi+0.0,xi+0.0));
    TEST(interp2d(y,x,M,yi,xi,ADEPT_INTERPOLATE_NEAREST));
    TEST(interp2d(y,x,M,yi,xi,ADEPT_INTERPOLATE_NEAREST|ADEPT_EXTRAPOLATE_CLAMP));
    TEST(interp2d(y,x,M,yi,xi,ADEPT_INTERPOLATE_NEAREST|ADEPT_EXTRAPOLATE_CONSTANT));
    TEST(interp2d(y,x,M,yi,xi,ADEPT_INTERPOLATE_NEAREST|ADEPT_EXTRAPOLATE_CONSTANT,-10.0));
    TEST(interp2d(y(stride(end,0,-1)),x,M(stride(end,0,-1),__),yi,xi,ADEPT_INTERPOLATE_NEAREST));

    // 3D array built from M with spread<2>, interpolated in a single
    // call
    Array3D A = spread<2>(M,2);
    std::cout << "\n*** Multiple 2D linear interpolation ***\n";
    std::cout << "A = " << A << "\n";
    TEST_MULTI(interp2d(y,x,A,yi,xi));
  }

  {
    // --- 3D: coordinate vectors z, y and x, data array A ---
    std::cout << "\n*** 3D interpolation ***\n\n";
    int nx = 4;
    int ny = 3;
    int nz = 2;

    Vector z = linspace(1.0,nz,nz);
    Vector y = linspace(1.0,ny,ny);
    Vector x = pow(linspace(1.0,nx,nx),2.0);
    // The two slices along the first dimension are the outer product
    // of y and x, and the same plus one
    Array3D A(nz,ny,nx);
    A(0,__,__) = outer_product(y,x);
    A(1,__,__) = outer_product(y,x)+1.0;

    // Target points, described in the message printed below
    Vector zi = {2.0, 1.2, 1.5,  5.0};
    Vector yi = {2.0, 2.6, 0.5,  5.0};
    Vector xi = {4.0, 10.0,20.0, 0.5};

    std::cout << "Coordinate vectors and interpolation array:\n";
    std::cout << "z = " << z << "\n";
    std::cout << "y = " << y << "\n";
    std::cout << "x = " << x << "\n";
    std::cout << "A = " << A << "\n";
    std::cout << "\nTo be interpolated to the following points:\n";
    std::cout << "zi = " << zi << "\n";
    std::cout << "yi = " << yi << "\n";
    std::cout << "xi = " << xi << "\n";
    std::cout << "...which are:\n"
	      << "  (1) at a point in the interpolation array,\n"
	      << "  (2) between points in the interpolation array,\n"
	      << "  (3) off the array in two dimension but not the third, and\n"
	      << "  (4) off all dimensions of the array.\n\n";
  
    // Same sequence of options as in the 1D block above
    TEST(interp3d(z,y,x,A,zi,yi,xi));
    TEST(interp3d(z,y,x,A,zi,yi,xi,ADEPT_EXTRAPOLATE_LINEAR));
    TEST(interp3d(z,y,x,A,zi,yi,xi,ADEPT_EXTRAPOLATE_CLAMP));
    TEST(interp3d(z,y,x,A,zi,yi,xi,ADEPT_EXTRAPOLATE_CONSTANT));
    TEST(interp3d(z,y,x,A,zi,yi,xi,ADEPT_EXTRAPOLATE_CONSTANT,-10.0));
    TEST(interp3d(z,y(stride(end,0,-1)),x,A(__,stride(end,0,-1),__),zi,yi,xi,ADEPT_EXTRAPOLATE_LINEAR));
    TEST(interp3d(z+0.0,y+0.0,x+0.0,A+0.0,zi+0.0,yi+0.0,xi+0.0,ADEPT_EXTRAPOLATE_LINEAR));
    TEST(interp3d(z,y,x,A,zi,yi,xi,ADEPT_INTERPOLATE_NEAREST));
    TEST(interp3d(z,y,x,A,zi,yi,xi,ADEPT_INTERPOLATE_NEAREST|ADEPT_EXTRAPOLATE_CLAMP));
    TEST(interp3d(z,y,x,A,zi,yi,xi,ADEPT_INTERPOLATE_NEAREST|ADEPT_EXTRAPOLATE_CONSTANT));
    TEST(interp3d(z,y,x,A,zi,yi,xi,ADEPT_INTERPOLATE_NEAREST|ADEPT_EXTRAPOLATE_CONSTANT,-10.0));
    TEST(interp3d(z,y(stride(end,0,-1)),x,A(__,stride(end,0,-1),__),zi,yi,xi,ADEPT_INTERPOLATE_NEAREST));

  }
  
  return 0;
}


================================================
FILE: test/test_minimizer.cpp
================================================
/* test_minimizer.cpp - Test Adept minimizer with N-dimensional Rosenbrock function

  Copyright (C) 2020-2022 ECMWF

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

//#include <string> // for std::stoi in C++11
#include <cstdio>  // for std::sscanf in C++98
#include <iostream>
#include <adept_optimize.h>

// Set this to a large or small number to test if the minimization
// algorithms are immune to the absolute scaling of the cost function.
// The value enters the cost function through the factor
// sqrt(2.0 * COST_SCALING) applied in RosenbrockN::calc_y, and main()
// scales the default convergence criterion by it as well.
#define COST_SCALING 1.0

using namespace adept;

// N-dimensional Rosenbrock "banana" function, scaled by COST_SCALING,
// presented to the Adept minimizers through the Optimizable
// interface.  The cost function is 0.5*sum(y*y) with y(x) given by
// calc_y.  Every evaluation of the cost function also writes the
// state to standard error so that progress can be plotted.
class RosenbrockN : public Optimizable {
public:

  RosenbrockN() : ls_iteration_(0), exact_hessian_(false) {}

  int ls_iteration_;   // Line search iteration

  // Do we use the exact Hessian derived analytically, or the
  // approximate one from the Jacobian matrix and the Gauss-Newton
  // formula?
  bool exact_hessian_;

  // N-dimensional Rosenbrock function can be expressed as the sum of
  // the squared elements of vector y(x) defined as follows.  This
  // form facilitates the calculation of the approximate Hessian from
  // the Jacobian dy/dx.  It is templated so that can be called either
  // with a passive "Vector" or active "aVector" argument.
  template <bool IsActive>
  Array<1,Real,IsActive> calc_y(const Array<1,Real,IsActive>& x) {
    int nx = x.size();
    Array<1,Real,IsActive> y((nx-1)*2);
    for (int ix = 0; ix < nx-1; ++ix) {
      y(ix*2)   = 10.0 * (x(ix+1)-x(ix)*x(ix));
      y(ix*2+1) = 1.0 - x(ix);
    }
    // With this factor, 0.5*sum(y*y) equals COST_SCALING times the
    // sum over ix of 100*(x(ix+1)-x(ix)^2)^2 + (1-x(ix))^2
    y *= sqrt(2.0 * COST_SCALING);
    return y;
  }

  // Fill "hessian" with the analytic second derivatives of the cost
  // function at x.  Every term carries the factor COST_SCALING so
  // that the matrix is consistent with the cost function and gradient
  // derived from calc_y; with COST_SCALING equal to 1.0 the values
  // are unchanged from the unscaled formula.  The dimension is taken
  // from "hessian", which must therefore already be sized.
  void calc_exact_hessian(const adept::Vector& x, SymmMatrix& hessian) {
    const Real scaling = (COST_SCALING);
    hessian = 0.0;
    int nx = hessian.dimension();
    // Contributions of term ix to the diagonal element (ix,ix) and to
    // the off-diagonal element coupling x(ix) and x(ix+1)
    for (int ix = 0; ix < nx-1; ++ix) {
      hessian(ix,ix) = scaling * (1200.0*x(ix)*x(ix) - 400.0*x(ix+1) + 2.0);
      hessian(ix,ix+1) = scaling * (-400.0*x(ix));
    }
    // Each x(ix) with ix >= 1 also appears as the "x(ix+1)" of the
    // preceding term, which adds a constant to the diagonal
    for (int ix = 1; ix < nx; ++ix) {
      hessian(ix,ix) = hessian(ix,ix) + scaling * 200.0;
    }
  }

  // Print the iteration number, cost and gradient norm to standard
  // output, and restart the line-search counter used by
  // state_to_stderr
  virtual void report_progress(int niter, const adept::Vector& x,
			       Real cost, Real gnorm) {
    ls_iteration_ = 0;
    std::cout << "Iteration " << niter
	      << ": cost=" << cost << ", gnorm=" << gnorm << "\n";
  }

  // Write one line to standard error consisting of the line-search
  // counter, the elements of x and the cost, then increment the
  // counter
  void state_to_stderr(const adept::Vector& x, Real cost) {
    
    // For plotting progress, direct standard error to a text file
    std::cerr << ls_iteration_ << " ";
    for (int ix = 0; ix < x.size(); ++ix) {
      std::cerr << x(ix) << " ";
    }
    std::cerr << cost << "\n";
    ++ls_iteration_;
  }

  // As state_to_stderr, but the line is labelled with a counter value
  // of -1
  void final_state_to_stderr(const adept::Vector& x, Real cost) {
    ls_iteration_ = -1;
    state_to_stderr(x, cost);
  }

  // Derivatives of order 0 (the cost function), 1 (gradient) and 2
  // (Hessian) are available
  virtual bool provides_derivative(int order) {
    return order >= 0 && order <= 2;
  }

  // Return the cost function at x, computed with passive arrays
  virtual Real calc_cost_function(const Vector& x) {
    //std::cout << "  test x: " << x << "\n";
    Vector y = calc_y(x);
    Real cost = 0.5*sum(y*y);
    state_to_stderr(x,cost);
    return cost;
  }

  // Return the cost function at x and write its gradient, obtained by
  // reverse-mode differentiation, into "gradient".
  // NOTE(review): "gradient" is received by value; this presumably
  // relies on Adept arrays sharing their data when copied, as in the
  // Optimizable interface - confirm.
  virtual Real calc_cost_function_gradient(const Vector& x,
					   Vector gradient) {
    Stack stack;
    aVector xactive = x;
    stack.new_recording();
    aVector y = calc_y(xactive);
    aReal cost = 0.5*sum(y*y);
    cost.set_gradient(1.0);
    stack.reverse();
    gradient = xactive.get_gradient();
    state_to_stderr(x,value(cost));
    return value(cost);
  }

  // Return the cost function at x and write its gradient and Hessian
  // into the last two arguments.  The gradient is computed from the
  // Jacobian of y with respect to x; the Hessian is either the
  // analytic one or the Gauss-Newton approximation from the same
  // Jacobian, depending on exact_hessian_.
  virtual Real calc_cost_function_gradient_hessian(const Vector& x,
						   Vector gradient,
						   SymmMatrix& hessian) {
    Stack stack;
    aVector xactive = x;
    stack.new_recording();
    aVector y = calc_y(xactive);
    aReal cost = 0.5*sum(y*y);
    stack.independent(xactive);
    stack.dependent(y);
    Matrix jac = stack.jacobian();
    if (exact_hessian_) {
      calc_exact_hessian(x, hessian);
    }
    else {
      hessian  = jac.T() ** jac;
    }
    gradient = jac.T() ** value(y);
    state_to_stderr(x,value(cost));
    return value(cost);
  }

};

// Minimize the N-dimensional Rosenbrock function with one of Adept's
// minimization algorithms.  Optional command-line arguments, in
// order: number of dimensions, algorithm name (prefix "Newton-" to
// use the analytic Hessian), maximum number of iterations and
// converged gradient norm.  An argument that cannot be read as a
// number is reported and ignored, leaving the default in place.  The
// exit status is the minimizer status converted to an integer, or 0
// if Adept was compiled without linear-algebra support.
int
main(int argc, const char* argv[])
{

  if (!adept::have_linear_algebra()) {
    std::cout << "Adept compiled without linear-algebra support: minimizer not available\n";
    return 0;
  }

  RosenbrockN rosenbrock;
  Minimizer minimizer(MINIMIZER_ALGORITHM_LEVENBERG_MARQUARDT);
  // The convergence criterion should be changed in accordance with
  // the cost function scaling
  minimizer.set_converged_gradient_norm(0.1*COST_SCALING);
  int nx = 2;
  if (argc > 1) {
    // nx = std::stoi(argv[1]);
    int nx_arg = 0;
    if (std::sscanf(argv[1], "%d", &nx_arg) == 1) {
      nx = nx_arg;
    }
    else {
      std::cout << "Warning: could not read number of dimensions from \""
                << argv[1] << "\"; using " << nx << "\n";
    }
    if (argc > 2) {
      const char* algo_ptr = argv[2];
      const std::string algo(argv[2]);
      // If algorithm name is prefixed by "Newton-" then use the exact
      // Hessian matrix (analytically derived for this specific
      // function) rather than the Gauss-Newton approximation from the
      // Jacobian matrix
      const std::string newton_prefix("Newton-");
      if (algo.compare(0, newton_prefix.size(), newton_prefix) == 0) {
        // Skip the prefix so that the minimizer sees the plain name
        algo_ptr += newton_prefix.size();
        rosenbrock.exact_hessian_ = true;
      }
      minimizer.set_algorithm(algo_ptr);
      if (argc > 3) {
        // Only pass on a value that was actually parsed, never an
        // uninitialized one
        int max_it = 0;
        // max_it = std::stof(argv[3]);
        if (std::sscanf(argv[3], "%d", &max_it) == 1) {
          minimizer.set_max_iterations(max_it);
        }
        else {
          std::cout << "Warning: could not read maximum iterations from \""
                    << argv[3] << "\"; keeping default\n";
        }
        if (argc > 4) {
          double converged_grad_norm = 0.0;
          //converged_grad_norm = std::stof(argv[4]);
          if (std::sscanf(argv[4], "%lf", &converged_grad_norm) == 1) {
            minimizer.set_converged_gradient_norm(converged_grad_norm);
          }
          else {
            std::cout << "Warning: could not read converged gradient norm from \""
                      << argv[4] << "\"; keeping default\n";
          }
        }
      }
    }
  }
  else {
    std::cout << "Usage: " << argv[0] << " [nx] [Levenberg|Levenberg-Marquardt|Newton-Levenberg|Newton-Levenberg-Marquardt|L-BFGS|Conjugate-Gradient] [max_iterations] [converged_gradient_norm]\n";
  }

  minimizer.set_levenberg_damping_start(0.25);
  //minimizer.set_max_step_size(1.0);
  //  minimizer.set_levenberg_damping_multiplier(3.0, 5.0);
  minimizer.ensure_updated_state(2);

  std::cout << "Minimizing " << nx << "-dimensional Rosenbrock function\n";
  std::cout << "Algorithm: " << minimizer.algorithm_name() << "\n";
  std::cout << "Use exact Hessian: " << rosenbrock.exact_hessian_ << "\n";
  std::cout << "Maximum iterations: " << minimizer.max_iterations() << "\n";
  std::cout << "Converged gradient norm: " << minimizer.converged_gradient_norm() << "\n";

  // Initial state vector
  Vector x(nx);
  // Standard start
  x = -3.0;
  // Trickier start (other end of the banana)
  //x = -3.0; x(1) = 3.0;
  // Near other minima in higher dimensions
  //x = 1.0; x(0) = -1.0;

  // Change to true to exercise the bounded form of the minimization
  bool is_bounded = false;
  MinimizerStatus status;

  if (is_bounded) {
    //    x = -3.0; x(1) = 3.0;
    x = -0.75; x(1) = 3.0;
    Vector x_lower, x_upper;
    adept::minimizer_initialize_bounds(nx, x_lower, x_upper);
    // x_upper(1) = 2.0;   x_lower(1) = 0.2;
    x_lower(0) = -1;
    status = minimizer.minimize(rosenbrock, x, x_lower, x_upper);
  }
  else {
    status = minimizer.minimize(rosenbrock, x);
  }
  //rosenbrock.final_state_to_stderr(x, minimizer.cost_function());

  std::cout << "Status: " << minimizer_status_string(status) << "\n";
  std::cout << "Solution: x=" << x << "\n";
  std::cout << "Number of samples: " << minimizer.n_samples() << "\n";

  return static_cast<int>(status);
}


================================================
FILE: test/test_misc.cpp
================================================
/* test_misc.cpp

  Copyright (C) 2012-2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

#include "adept.h"
#include "algorithm.h"

// A straight implementation of the trivial example in Hogan (2014)

double algorithm_ad(const double x_val[2], // Input values
                    double* Y_ad,          // Input-output adjoint
                    double x_ad[2]) {      // Output adjoint
  using namespace adept;                   // Import Stack and adouble from adept
  Stack stack;                             // Where differential information is stored
  adouble x[2] = {x_val[0], x_val[1]};     // Initialize adouble inputs
  stack.new_recording();                   // Start recording derivatives
  adouble Y = algorithm(x);                // Version overloaded for adouble args
  Y.set_gradient(*Y_ad);                   // Load the input-output adjoint
  stack.reverse();                         // Run the adjoint algorithm
  x_ad[0] = x[0].get_gradient();           // Extract the output adjoint for x[0]
  x_ad[1] = x[1].get_gradient();           //   ...and x[1]
  *Y_ad   = Y.get_gradient();              // Input-output adjoint has changed too
  return Y.value();                        // Return result of simple computation
}   

// Run algorithm_ad for the inputs (2, 3) with a unit adjoint on the
// output, and print the inputs, the result and all adjoints.
int main()
{
  const double inputs[2] = {2.0, 3.0};
  double output_adjoint = 1.0;
  double input_adjoints[2];

  const double output = algorithm_ad(inputs, &output_adjoint, input_adjoints);

  std::cout << "x[0] = " << inputs[0] << "\n";
  std::cout << "x[1] = " << inputs[1] << "\n";
  std::cout << "y    = " << output << "\n";
  std::cout << "y_ad = " << output_adjoint << "\n";
  std::cout << "x_ad[0]=" << input_adjoints[0] << "\n";
  std::cout << "x_ad[1]=" << input_adjoints[1] << "\n";

  return 0;
}


================================================
FILE: test/test_no_lib.cpp
================================================
/* test_no_lib.cpp

  Copyright (C) 2012-2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

// This source file contains everything that would normally be
// compiled into a static or dynamic library; this means that the
// executable can be built without needing to link to the
// library. This is useful for non-Unix platforms where the configure
// script cannot be run. Note that only one source file should
// #include "adept_source.h"; all the others should #include "adept.h"
// as normal.
#include "adept_source.h"

#include "algorithm.h"

// A straight implementation of the trivial example in Hogan (2014)

double algorithm_ad(const double x_val[2], // Input values
                    double* Y_ad,          // Input-output adjoint
                    double x_ad[2]) {      // Output adjoint
  using namespace adept;                   // Import Stack and adouble from adept
  Stack stack;                             // Where differential information is stored
  adouble x[2] = {x_val[0], x_val[1]};     // Initialize adouble inputs
  stack.new_recording();                   // Start recording derivatives
  adouble Y = algorithm(x);                // Version overloaded for adouble args
  Y.set_gradient(*Y_ad);                   // Load the input-output adjoint
  stack.reverse();                         // Run the adjoint algorithm
  x_ad[0] = x[0].get_gradient();           // Extract the output adjoint for x[0]
  x_ad[1] = x[1].get_gradient();           //   ...and x[1]
  *Y_ad   = Y.get_gradient();              // Input-output adjoint has changed too
  return Y.value();                        // Return result of simple computation
}   

// Run algorithm_ad for the inputs (2, 3) with a unit adjoint on the
// output, and print the inputs, the result and all adjoints.
int main()
{
  const double inputs[2] = {2.0, 3.0};
  double output_adjoint = 1.0;
  double input_adjoints[2];

  const double output = algorithm_ad(inputs, &output_adjoint, input_adjoints);

  std::cout << "x[0] = " << inputs[0] << "\n";
  std::cout << "x[1] = " << inputs[1] << "\n";
  std::cout << "y    = " << output << "\n";
  std::cout << "y_ad = " << output_adjoint << "\n";
  std::cout << "x_ad[0]=" << input_adjoints[0] << "\n";
  std::cout << "x_ad[1]=" << input_adjoints[1] << "\n";

  return 0;
}


================================================
FILE: test/test_packet_operations.cpp
================================================
/* test_packet_operations.cpp

  Copyright (C) 2020 European Centre for Medium-Range Weather Forecasts

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.

  This file tests Adept's vectorization capabilities on Adept vectors
  of types float and double, and also on Packet<float> and
  Packet<double>, which encapsulate the underlying intrinsic SIMD
  vector types.
*/

#include <iostream>
#include "adept_arrays.h"

using namespace adept;

// Copy the contents of packet p into a newly created Adept vector of
// Packet<Type>::size elements, so that it can be printed like any
// other array.
template <typename Type>
Array<1,Type> p2v(internal::Packet<Type> p) {
  typedef internal::Packet<Type> packet_type;
  Array<1,Type> unpacked(packet_type::size);
  p.put(unpacked.data());
  return unpacked;
}

template <typename Type>
void test_packet_operations() {
  static const int N = internal::Packet<Type>::size;
  std::cout << "\nADEPT PACKET\n";
  std::cout << "Type: " << sizeof(Type) << "-byte floating point numbers\n";
  std::cout << "Packet size: " << N << "\n";
  Array<1,Type> v(N), w(N);
  v = range(1,N);
  w = 2.0;
  internal::Packet<Type> p(v.data());
  internal::Packet<Type> q(w.data());
  std::cout << "p = " << p2v(p) << "\n";
  std::cout << "q = " << p2v(q) << "\n";
  std::cout << "p+q = " << p2v(p+q) << "\n";
  std::cout << "p-q = " << p2v(p-q) << "\n";
  std::cout << "p*q = " << p2v(p*q) << "\n";
  std::cout << "p/q = " << p2v(p/q) << "\n";
  std::cout << "sqrt(p) = " << p2v(sqrt(p)) << "\n";
  std::cout << "fmin(p,q) = " << p2v(fmin(p,q)) << "\n";
  std::cout << "fmax(p,q) = " << p2v(fmax(p,q)) << "\n";
  std::cout << "hsum(p) = " << hsum(p) << "\n";
  std::cout << "hprod(p) = " << hprod(p) << "\n";
  std::cout << "hmin(p) = " << hmin(p) << "\n";
  std::cout << "hmax(p) = " << hmax(p) << "\n";
}
  

// Print the results of the same operations as in
// test_packet_operations, but applied to Adept vectors of length N
template <typename Type>
void test_vector_operations(int N) {
  // Header describing the type under test
  std::cout << "\nADEPT ARRAY\n"
            << "Type: " << sizeof(Type) << "-byte floating point numbers\n"
            << "Packet size: " << internal::Packet<Type>::size << "\n";

  // Source data: v = 1..N and w = 2 everywhere
  Array<1,Type> v(N), w(N);
  v = range(1,N);
  w = 2.0;

  // The operands
  std::cout << "v = " << v << "\n";
  std::cout << "w = " << w << "\n";

  // Element-wise arithmetic
  std::cout << "v+w = " << v+w << "\n";
  std::cout << "v-w = " << v-w << "\n";
  std::cout << "v*w = " << v*w << "\n";
  std::cout << "v/w = " << v/w << "\n";

  // Element-wise functions
  std::cout << "sqrt(v) = " << sqrt(v) << "\n";
  std::cout << "fmin(v,w) = " << fmin(v,w) << "\n";
  std::cout << "fmax(v,w) = " << fmax(v,w) << "\n";

  // Reductions over the whole vector
  std::cout << "sum(v) = " << sum(v) << "\n";
  std::cout << "product(v) = " << product(v) << "\n";
  std::cout << "minval(v) = " << minval(v) << "\n";
  std::cout << "maxval(v) = " << maxval(v) << "\n";
}

// Sum a sub-range of a vector that starts at element 1 rather than
// element 0, and print the result
template <typename Type>
void test_unaligned_reduce(int N) {
  // Header describing the type under test
  std::cout << "\nUNALIGNED REDUCE\n"
            << "Type: " << sizeof(Type) << "-byte floating point numbers\n"
            << "Packet size: " << internal::Packet<Type>::size << "\n";

  // Source data: v = 1..N
  Array<1,Type> v(N);
  v = range(1,N);
  std::cout << "v = " << v << "\n";

  // Reduce over everything except the first and last elements
  std::cout << "sum(v(range(1,end-1))) = ";
  std::cout << sum(v(range(1,end-1))) << "\n";
}

// Assign the sum of two vector sub-ranges to a sub-range of a third
// vector, where all sub-ranges start at element 1 rather than element
// 0, and print the vectors before and after
template <typename Type>
void test_unaligned_assign(int N) {
  // Header describing the type under test
  std::cout << "\nUNALIGNED ASSIGN\n"
            << "Type: " << sizeof(Type) << "-byte floating point numbers\n"
            << "Packet size: " << internal::Packet<Type>::size << "\n";

  // Source data: v = 1..N, w = 2 everywhere, and a zeroed target x
  Array<1,Type> v(N), w(N), x(N);
  v = range(1,N);
  w = 2.0;
  x = 0.0;

  // State before the assignment
  std::cout << "v = " << v << "\n";
  std::cout << "w = " << w << "\n";
  std::cout << "x = " << x << "\n";

  // Perform the assignment on all but the first and last elements
  std::cout << "x(range(1,end-1)) = v(range(1,end-1))+w(range(1,end-1)) ->\n";
  x(range(1,end-1)) = v(range(1,end-1))+w(range(1,end-1));

  // State after the assignment
  std::cout << "x = " << x << "\n";
}

int
main(int argc, const char** argv)
{
  // Vectorization is only carried out on arrays of length twice the
  // packet length or longer, so base the array length on the float
  // packet size
  const int vec_len = 2*internal::Packet<float>::size;
  const int long_vec_len = 2*vec_len;

  // Operations on individual packets
  test_packet_operations<float>();
  test_packet_operations<double>();

  // Fast exponential of a packet filled with the value 2
  Packet<double> d(2.0);
  Packet<double> e = fastexp(d);
  std::cout << "e=" << e << "\n";

  // Operations on whole arrays
  test_vector_operations<float>(vec_len);
  test_vector_operations<double>(vec_len);

  // Reduction over a sub-range starting at element 1
  test_unaligned_reduce<float>(long_vec_len);
  test_unaligned_reduce<double>(long_vec_len);

  // Assignment to a sub-range starting at element 1
  test_unaligned_assign<float>(long_vec_len);
  test_unaligned_assign<double>(long_vec_len);

  return 0;
}


================================================
FILE: test/test_radiances.cpp
================================================
/* test_radiances.cpp - "main" function for Test 3

  Copyright (C) 2012-2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

#include "adept.h"
#include "simulate_radiances.h"

using adept::Real;
using adept::aReal;

// This function provides an Adept interface to the simulate_radiances
// function
void simulate_radiances_wrapper(int n,
				const aReal& surface_temperature,
				const aReal* temperature,
				aReal radiance[2]) {
  // Create inactive (Real) versions of the active (aReal) inputs
  Real st = value(surface_temperature);
  std::vector<Real> t(n);
  for (int i = 0; i < n; ++i) t[i] = value(temperature[i]);
  
  // Declare variables to hold the inactive outputs and their Jacobians
  Real r[2];
  Real dr_dst[2];
  std::vector<Real> dr_dt(2*n);
  
   // Call the function with the non-Adept interface
  simulate_radiances(n, st, &t[0], &r[0], dr_dst, &dr_dt[0]);
  
  // Copy the results into the active variables, but use set_value in order
  // not to write any equivalent derivative statement to the Adept stack
  radiance[0].set_value(r[0]);
  radiance[1].set_value(r[1]);
  
  // Loop over the two radiances and add the derivative statements to
  // the Adept stack
  for (int i = 0; i < 2; ++i) {
    // Add the first term on the right-hand-side of Equation 1 in the text
    radiance[i].add_derivative_dependence(surface_temperature, dr_dst[i]);
    // Now append the second term on the right-hand-side of Equation
    // 1. The third argument "n" of the following function says that
    // there are n terms to be summed, and the fourth argument "2"
    // says to take only every second element of the Jacobian dr_dt,
    // since the derivatives with respect to the two radiances have
    // been interlaced.  If the fourth argument is omitted then
    // relevant Jacobian elements will be assumed to be contiguous in
    // memory.
    radiance[i].append_derivative_dependence(temperature, &dr_dt[i], n, 2);
  }

  for (int i = 0; i < 2; ++i) {
    std::cout << "Channel " << i << "\n";
    std::cout << "d[radiance]/d[surface_temperature] = " << dr_dst[i] << "\n";
    std::cout << "d[radiance]/d[temperature] =";
    for (int j = 0; j < n; ++j) {
      std::cout << " " << dr_dt[i+j*2];
    }
    std::cout << "\n\n";
  }

}


int
main(int argc, char** argv)
{
  // Temperature (K) at 1000-m intervals from the mid-latitude summer
  // standard atmosphere
  static const int N_POINTS = 25;
  static const Real temperature_profile[N_POINTS+1]
    = {294.0, 290.0, 285.0, 279.0, 273.0, 267.0, 261.0, 255.0,
       248.0, 242.0, 235.0, 229.0, 222.0, 216.0, 216.0, 216.0,
       216.0, 216.0, 216.0, 217.0, 218.0, 219.0, 220.0, 222.0,
       223.0, 224.0};

  // Start the Adept stack
  adept::Stack s;
  
  // Copy the temperature profile information into active variables
  aReal surface_temperature = temperature_profile[0];
  aReal temperature[N_POINTS];
  for (int i = 0; i < N_POINTS; i++) {
    temperature[i] = temperature_profile[i+1];
  }

  // The simulated radiances will be put here...
  aReal sim_radiance[2];

  // ...and compared to the observed radiances here with their 1-sigma
  // error
  Real obs_radiance[2] = {0.00189, 0.00140};
  Real radiance_error = 2.0e-5;

  // Start recording derivative information
  s.new_recording();

  // Simulate the radiances for the input surface temperature and
  // atmospheric temperature
  simulate_radiances_wrapper(N_POINTS, surface_temperature,
			     temperature, sim_radiance);

  std::cout << "Simulated radiances = "
	    << sim_radiance[0].value() << " "
	    << sim_radiance[1].value() << "\n";

  // Compute a "cost function" (or "penalty function") expressing the
  // sum of the squared number of error standard deviations the
  // simulated radiances are from the observed radiances
  aReal cost_function = 0.0;
  for (int ichan = 0; ichan < 2; ichan++) {
    cost_function
      += (sim_radiance[ichan] - obs_radiance[ichan])
       * (sim_radiance[ichan] - obs_radiance[ichan])
      / (radiance_error*radiance_error);
  }
  
  std::cout << "Cost function = " << cost_function << "\n";

  // We want the computed adjoints to be gradients of the cost
  // function with respect to the surface temperature or atmospheric
  // temperature
  cost_function.set_gradient(1.0);

  // Reverse-mode automatic differentiation
  s.reverse();

  // Extract the gradients  
  Real dcost_dsurface_temperature = 0;
  Real dcost_dtemperature[N_POINTS];
  surface_temperature.get_gradient(dcost_dsurface_temperature);
  adept::get_gradients(temperature, N_POINTS, dcost_dtemperature);


  std::cout << "d[cost_function]/d[surface_temperature] = "
	    << dcost_dsurface_temperature << "\n";
  std::cout << "d[cost_function]/d[temperature] =";
  for (int i = 0; i < N_POINTS; i++) {
    std::cout << " " << dcost_dtemperature[i];
  }
  std::cout << "\n";


}


================================================
FILE: test/test_radiances_array.cpp
================================================
/* test_radiances_array.cpp - "main" function for Test 3

  Copyright (C) 2012-2014 The University of Reading
  Copyright (C) 2016      European Centre for Medium Range Weather Forecasts

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.
*/

#include "adept_arrays.h"
#include "simulate_radiances.h"

using adept::Real;
using adept::aReal;
using adept::Vector;
using adept::aVector;
using adept::value;

// This function provides an Adept interface to the simulate_radiances
// function: the active inputs are converted to inactive values, the
// non-Adept function is called, and the returned Jacobians are used to
// write the derivative statements to the Adept stack by hand. The
// Jacobians are also printed to standard output.
//   n                   - number of atmospheric temperature values
//   surface_temperature - active surface temperature
//   temperature         - active vector of temperatures (n elements read)
//   radiance            - the two active radiances to be filled in
void simulate_radiances_wrapper(int n,
				const aReal& surface_temperature,
				const aVector& temperature,
				aReal radiance[2]) {
  // Create inactive (Real) versions of the active (aReal) inputs
  Real st = adept::value(surface_temperature);
  Vector t(n);
  for (int i = 0; i < n; i++) { 
    t(i) = adept::value(temperature(i));
  }
  
  // Declare variables to hold the inactive outputs and their Jacobians
  Real r[2];
  Real dr_dst[2];
  Vector dr_dt(2*n);
  
   // Call the function with the non-Adept interface
  simulate_radiances(n, st, &t[0], &r[0], dr_dst, &dr_dt[0]);
  
  // Copy the results into the active variables, but use set_value in order
  // not to write any equivalent derivative statement to the Adept stack
  radiance[0].set_value(r[0]);
  radiance[1].set_value(r[1]);
  
  // Loop over the two radiances and add the derivative statements to
  // the Adept stack
  for (int i = 0; i < 2; ++i) {
    // Add the first term on the right-hand-side of Equation 1 in the text
    radiance[i].add_derivative_dependence(surface_temperature, dr_dst[i]);
    // Now append the second term on the right-hand-side of Equation
    // 1, which is a sum of n terms.  Here each term is appended
    // individually: the derivatives with respect to the two radiances
    // are interlaced in dr_dt, so the element for radiance i and
    // temperature j is dr_dt(i+j*2).
    for (int j = 0; j < n; ++j) {
      radiance[i].append_derivative_dependence(temperature(j), dr_dt(i+j*2));
    }
  }

  // Print the Jacobians for each of the two channels
  for (int i = 0; i < 2; ++i) {
    std::cout << "Channel " << i << "\n";
    std::cout << "d[radiance]/d[surface_temperature] = " << dr_dst[i] << "\n";
    std::cout << "d[radiance]/d[temperature] =";
    for (int j = 0; j < n; ++j) {
      std::cout << " " << dr_dt[i+j*2];
    }
    std::cout << "\n\n";
  }

}


int
main(int argc, char** argv)
{
  // Temperature (K) at 1000-m intervals from the mid-latitude summer
  // standard atmosphere
  static const int N_POINTS = 25;
  static const Real temperature_profile[N_POINTS+1]
    = {294.0, 290.0, 285.0, 279.0, 273.0, 267.0, 261.0, 255.0,
       248.0, 242.0, 235.0, 229.0, 222.0, 216.0, 216.0, 216.0,
       216.0, 216.0, 216.0, 217.0, 218.0, 219.0, 220.0, 222.0,
       223.0, 224.0};

  // Start the Adept stack
  adept::Stack s;
  
  // Copy the temperature profile information into active variables
  aReal surface_temperature = temperature_profile[0];
  aVector temperature(N_POINTS);
  for (int i = 0; i < N_POINTS; i++) {
    temperature[i] = temperature_profile[i+1];
  }

  // The simulated radiances will be put here...
  aReal sim_radiance[2];

  // ...and compared to the observed radiances here with their 1-sigma
  // error
  Real obs_radiance[2] = {0.00189, 0.00140};
  Real radiance_error = 2.0e-5;

  // Start recording derivative information
  s.new_recording();

  // Simulate the radiances for the input surface temperature and
  // atmospheric temperature
  simulate_radiances_wrapper(N_POINTS, surface_temperature,
			     temperature, sim_radiance);

  std::cout << "Simulated radiances = "
	    << sim_radiance[0].value() << " "
	    << sim_radiance[1].value() << "\n";

  // Compute a "cost function" (or "penalty function") expressing the
  // sum of the squared number of error standard deviations the
  // simulated radiances are from the observed radiances
  aReal cost_function = 0.0;
  for (int ichan = 0; ichan < 2; ichan++) {
    cost_function
      += (sim_radiance[ichan] - obs_radiance[ichan])
       * (sim_radiance[ichan] - obs_radiance[ichan])
      / (radiance_error*radiance_error);
  }
  
  std::cout << "Cost function = " << cost_function << "\n";

  // We want the computed adjoints to be gradients of the cost
  // function with respect to the surface temperature or atmospheric
  // temperature
  cost_function.set_gradient(1.0);

  // Reverse-mode automatic differentiation
  s.reverse();

  // Extract the gradients  
  Real dcost_dsurface_temperature = 0;
  Vector dcost_dtemperature;
  surface_temperature.get_gradient(dcost_dsurface_temperature);
  adept::get_gradients(temperature, dcost_dtemperature);

  std::cout << "d[cost_function]/d[surface_temperature] = "
	    << dcost_dsurface_temperature << "\n";
  std::cout << "d[cost_function]/d[temperature] =";
  for (int i = 0; i < N_POINTS; i++) {
    std::cout << " " << dcost_dtemperature[i];
  }
  std::cout << "\n";


}


================================================
FILE: test/test_reduce_active.cpp
================================================
/* test_reduce_active.cpp

  Copyright (C) 2020 European Centre for Medium-Range Weather Forecasts

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.

  This file tests reduce operations on active vectors
*/

#include <iostream>
#include "adept_arrays.h"

using namespace adept;

// TEST_REDUCE(FUNC): apply the reduce function FUNC to the active
// vector "x" and to its inactive value, differentiate the active
// result in reverse mode, and print both results, the gradient and
// the statements recorded on the stack.  The counter "status" is
// incremented if the active and inactive results differ.  The macro
// expects the variables "stack", "x" and "status" to be in scope at
// the point of use; its body is enclosed in braces so the local
// variables J, Jp and dJdx do not leak into the calling scope.
#define TEST_REDUCE(FUNC)			\
  {						\
    std::cout << "\nTESTING REDUCE FUNCTION "	\
	      << #FUNC << "\n";			\
    stack.new_recording();			\
    aReal J = FUNC(x);				\
    Real Jp = FUNC(value(x));			\
    J.set_gradient(1.0);			\
    stack.reverse();				\
    Vector dJdx = x.get_gradient();		\
    std::cout << #FUNC << "(x) = "		\
	      << J << "\n";			\
    std::cout << #FUNC << "(value(x)) = "       \
	      << Jp << "\n";	\
    std::cout << "d(" << #FUNC << "(x))/dx = "	\
              << dJdx << "\n";			\
    if (J != Jp) { ++status; }		        \
    stack.print_statements();			\
  }


int
main(int argc, const char** argv)
{
  Stack stack;

  aVector x(5);
  x << -2.0, -3.0, -1.0, -50.0, 7.0;

  std::cout << "x = " << x << "\n";

  int status = 0;

  TEST_REDUCE(sum);
  TEST_REDUCE(mean);
  TEST_REDUCE(maxval);
  TEST_REDUCE(minval);
  TEST_REDUCE(product);
  TEST_REDUCE(norm2);

  // Test product by hand
  {
    std::cout << "\nTESTING MANUAL PRODUCT\n";
    stack.new_recording();
    //aReal J = x(0)*x(1)*x(2)*x(3)*x(4);
    aReal J = x(0)*x(1);
    J *= x(2);
    J *= x(3);
    J *= x(4);
    J.set_gradient(1.0);
    stack.reverse();
    Vector dJdx = x.get_gradient();
    std::cout << "manual_product(x) = " << J << "\n";
    std::cout << "d(manual_product(x))/x = " << dJdx << "\n";
    stack.print_statements();
  }

  // Test norm2 by hand
  {
    std::cout << "\nTESTING MANUAL NORM2\n";
    stack.new_recording();
    aReal J = sqrt(x(0)*x(0) + x(1)*x(1) + x(2)*x(2)
		   + x(3)*x(3) + x(4)*x(4));
    J.set_gradient(1.0);
    stack.reverse();
    Vector dJdx = x.get_gradient();
    std::cout << "manual_norm2(x) = " << J << "\n";
    std::cout << "d(manual_norm2(x))/x = " << dJdx << "\n";
    stack.print_statements();
  }

  if (status != 0) {
    std::cout << "Error: " << status << " of the active/passive reduce operations are different\n";
  }

  return status;
}


================================================
FILE: test/test_thread_safe.cpp
================================================
/* test_thread_safe.cpp - Tests that Adept is thread-safe

  Copyright (C) 2012-2014 The University of Reading

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.

  This program tests the thread-safety of the Adept library: compile
  with and without ADEPT_STACK_THREAD_UNSAFE defined, and run with
  -serial and -parallel command-line arguments.  It should crash only
  if ADEPT_STACK_THREAD_UNSAFE is defined AND -parallel is selected.
*/

#include <iostream>
#include <string>

#ifdef _OPENMP
#include <omp.h>
#endif

// Test what happens if thread safety is disabled by uncommenting the
// following
//#define ADEPT_STACK_THREAD_UNSAFE 1
#include "adept.h"

using adept::adouble;
using adept::Real;

// Number of points in spatial grid of simulation
#define NX 128

// "Toon" advection scheme applied to linear advection in a 1D
// periodic domain - see Adept paper for details.
//   nt     - number of timesteps to take
//   c      - exponent used in the flux calculation (the caller in
//            this file passes its Courant number here)
//   q_init - initial values of the field
//   q      - on return, the values of the field after nt timesteps
static
void
toon(int nt, double c, const adouble q_init[NX], adouble q[NX]) {
  // Fluxes between adjacent boxes
  adouble flux[NX-1];

  // Start from the initial field
  for (int ix = 0; ix < NX; ix++) {
    q[ix] = q_init[ix];
  }

  // Main loop in time
  for (int istep = 0; istep < nt; istep++) {
    // Flux across each interface between boxes ix and ix+1
    for (int ix = 0; ix < NX-1; ix++) {
      flux[ix] = (exp(c*log(q[ix]/q[ix+1]))-1.0)
                 * q[ix]*q[ix+1] / (q[ix]-q[ix+1]);
    }
    // Update the interior points from the flux difference
    for (int ix = 1; ix < NX-1; ix++) {
      q[ix] += flux[ix-1]-flux[ix];
    }
    // Treat boundary conditions: the end points copy the interior
    // points at the opposite end of the domain
    q[0] = q[NX-2];
    q[NX-1] = q[1];
  }
}

// Perform a simulation and compute the Jacobian two ways - this is to
// be run in parallel to test thread safety.
//   i           - index of this computation; it selects which element
//                 of the initial field is perturbed and is reported in
//                 the printed output
//   nt          - number of timesteps passed to toon()
//   dt          - passed to toon() as its "c" argument
//   q_init_save - initial values of the field (NX elements, read only)
// Returns true if the root-mean-square difference between the
// forward-mode and reverse-mode Jacobians exceeds 1.0e-5.
static
bool
compute(int i, int nt, double dt, double q_init_save[NX])
{
  bool error_occurred = false; // Return value

  // Start an Adept stack before the first adouble object is
  // constructed
  adept::Stack s;

  adouble q_init[NX];  // Initial values of field as adouble array
  adouble q[NX];       // Final values 
  
  // Copy initial values
  for (int j = 0; j < NX; j++) {
    q_init[j] = q_init_save[j];
  }

  // Do something to the data specific to the loop
  q_init[i+5] = q_init[i+5] + 1.0;

  // Start a new recording of derivative statements; note that this
  // is done after the independent variables (q_init) have been given
  // their initial values
  s.new_recording();

  // Run the simulation with nt timesteps
  toon(nt, dt, q_init, q);

  s.independent(q_init, NX); // Declare independents
  s.dependent(q, NX);        // Declare dependents
  Real jac_for[NX*NX];       // Where Jacobian will be stored from forward computation
  Real jac_rev[NX*NX];       // Where Jacobian will be stored from reverse computation
  // Compute Jacobian two ways
  s.jacobian_forward(jac_for);
  s.jacobian_reverse(jac_rev);
    
  // Root-mean-square difference between the two Jacobians
  double rmsd = 0.0;
  for (int j = 0; j < NX*NX; j++) {
    if (jac_for[j] != jac_rev[j]) {
      double diff = jac_for[j]-jac_rev[j];
      rmsd += diff*diff;
    }
    }
  rmsd = sqrt(rmsd / (NX*NX));
    
  // Only one thread at a time may execute the reporting section, so
  // the output from different computations is not interleaved
#pragma omp critical
  {
    std::cout.flush();
    
#ifdef _OPENMP
    std::cout << "*** Iteration " << i << " executed by thread " << omp_get_thread_num() 
	      << " (stack address " << adept::active_stack() << "):\n";
#else
    std::cout << "*** Iteration " << i 
	      << " (stack address " << adept::active_stack() << "):\n";
#endif
      
    std::cout << "Used maximum of " << s.max_jacobian_threads() << " thread(s) for Jacobian calculation\n";
    
    // The two Jacobians must agree to within the tolerance
    if (rmsd > 1.0e-5) {
      std::cout << "*** ERROR: Jacobian from forward and reverse computations disagree (RMSD = "
		<< rmsd << ")\n";
      error_occurred = true;
    }
    else {
      std::cout << "CORRECT BEHAVIOUR: Jacobians from forward and reverse computations agree within tolerance\n";
    }
    
    if (i == 0) {
      // Print information about the data held in the stack
	std::cout << "Stack status for iteration 0:\n"
		  << s;
	// Print memory information
	std::cout << "Memory usage: " << s.memory() << " bytes\n\n";
    }
    
    std::cout << "\n";
  }
  
  return error_occurred;
}


// Run "ncomputations" independent simulations, each computing its
// Jacobian in both forward and reverse mode, either in parallel
// (default, or with the "-parallel" argument) or one after another
// (with the "-serial" argument).  Returns 0 if all computations
// succeeded and 1 if any failed or the argument was not recognised.
int
main(int argc, char** argv)
{
  const double pi = 4.0*atan(1.0);

  // Edit these variables to change properties of simulation
  const int nt = 200;        // Number of timesteps
  const double dt = 0.125;   // Timestep (actually a Courant number)
  const int ncomputations = 8;

  // Initial values of field as a double array
  double q_init_save[NX];

  bool is_parallel = true;

  // Parse the optional command-line argument
  if (argc > 1) {
    if (std::string("-serial") == argv[1]) {
      is_parallel = false;
    }
    else if (std::string("-parallel") == argv[1]) {
      is_parallel = true;
    }
    else {
      std::cout << "Usage: " << argv[0] << " [-serial|-parallel]\n";
      return 1;
    }
  }


  std::cout << "Running " << argv[0] << "...\n";
  
#ifdef ADEPT_STACK_THREAD_UNSAFE
  std::cout << "  Compiled to be THREAD UNSAFE\n";
#else
  std::cout << "  Compiled to be THREAD SAFE\n";
#endif

#ifdef _OPENMP
  std::cout << "  " << omp_get_num_procs() << " processors available running maximum of "
	    << omp_get_max_threads() << " threads\n";
  if (is_parallel) {
    std::cout << "  Performing " << ncomputations << " parallel computations,\n";
    std::cout << "    within which Jacobian (" << NX << "x" << NX << " matrix) calculations will be serial\n";
#ifdef ADEPT_STACK_THREAD_UNSAFE
    if (omp_get_max_threads() > 1) {
      std::cout << "*** You should expect this program to crash now!\n";
    }
#endif
  }
  else {
    std::cout << "  Performing " << ncomputations << " serial computations,\n";
    std::cout << "    within which Jacobian (" << NX << "x" << NX << " matrix) calculations will be in parallel\n";
  }
#else
  std::cout << "  Compiled with no OpenMP support\n";
#endif

  std::cout << "\n";
  std::cout.flush();


  // Initialize the field
  for (int i = 0; i < NX; i++) {
    q_init_save[i] = (0.5+0.5*sin((i*2.0*pi)/(NX-1.5)))+0.0001;
  }

  // Number of computations that reported an error.  A reduction is
  // used in the parallel loop so that each thread counts privately
  // and the counts are combined at the end; writing to a single
  // shared flag from several threads would be a data race.
  int n_failures = 0;

  if (is_parallel) {
#pragma omp parallel for reduction(+:n_failures)
    for (int i = 0; i < ncomputations; i++) {
      if (compute(i, nt, dt, q_init_save)) {
	++n_failures;
      }
    }
  }
  else {
    for (int i = 0; i < ncomputations; i++) {
      if (compute(i, nt, dt, q_init_save)) {
	++n_failures;
      }
    }
  }

  const bool error_occurred = (n_failures > 0);

  if (error_occurred) {
    std::cout << "An error occurred\n";
  }

  return error_occurred;
    
}


================================================
FILE: test/test_thread_safe_arrays.cpp
================================================
/* test_thread_safe_arrays.cpp - Tests that Adept arrays are thread-safe

  Copyright (C) 2017 ECMWF

  Copying and distribution of this file, with or without modification,
  are permitted in any medium without royalty provided the copyright
  notice and this notice are preserved.  This file is offered as-is,
  without any warranty.

*/

#ifdef _OPENMP
#include <omp.h>
#endif

//#define ADEPT_STORAGE_THREAD_SAFE 1

#include <adept_arrays.h>

// Repeatedly subset and assign to a matrix inside an OpenMP parallel
// loop.  B and T are linked to the data of A and S before the loop,
// either via the shared Storage object (if ADEPT_STORAGE_THREAD_SAFE
// is defined) or via soft_link().  Returns 0 if the loop completes.
int main(int argc, const char** argv)
{
  using namespace adept;

  int N = 2;
  Matrix A(N,N);
  SymmMatrix S(N);

  Matrix B;
  SymmMatrix T;
#ifdef ADEPT_STORAGE_THREAD_SAFE
  std::cout << "Storage should be thread safe\n";
  // B shares the data and increases the reference counter of the
  // shared Storage object. If A goes out of scope, B will "steal" the
  // data.
  B >>= A;
  T >>= S;
#else
  std::cout << "Storage is not thread safe: using soft_link()\n";
  // B points to the data but does not have access to the Storage
  // object. If A goes out of scope, B will most likely point to an
  // inaccessible memory location.
  B >>= A.soft_link();
  T >>= S.soft_link();
#endif

  A = 1.0; // Also seen by B
  S = 2.0; // Also seen by T

  // Number of threads, used only to choose the final message
  int nthreads = 1;

#ifdef _OPENMP
  nthreads = omp_get_max_threads();
  std::cout << omp_get_num_procs() << " processors available running maximum of "
	    << nthreads << " threads\n";
#else
  std::cout << "Compiled without OpenMP support: 1 thread\n";
#endif

  // The following almost always causes a crash if the code is not
  // properly thread safe
#pragma omp parallel for
  for (int i = 0; i < N*1000; ++i) {

    for (int j = 0; j < N*1000; ++j) {
      B[j % N] = noalias(B(__, j % N)) + T.diag_vector();
    }

  }

  if (nthreads > 1) {
    std::cout << "Parallel subsetting of array zillions of times was successful\n";
  }
  else {
    std::cout << "Serial subsetting of array zillions of times was successful (unsurprisingly)\n";
  }

  return 0;

}