% Papers using MPI
%
% This is a partial list, begun in late October, 1997. It is intended to give
% an example of the range of applications that are known to be using
% MPI.
% Note that some author lists are incomplete; if you have a more
% complete reference, please send it to gropp mcs.anl.gov .



% Double issue: the month is built from the standard month macros so that the
% bibliography style, not the database, decides how it is abbreviated.
@Article{CooFinTseYor97:mpi-groups,
  author   = {G. Cooperman and L. Finkelstein and M. Tselman and B. York},
  title    = {Constructing permutation representations for matrix groups},
  journal  = {Journal of Symbolic Computation},
  year     = 1997,
  volume   = 24,
  number   = {3--4},
  month    = sep # "--" # oct,
  pages    = {471--488},
  abstract = {The theory has been successfully tested on a representation of the sporadic simple group Ly, discovered by Lyons (1972). With no a priori assumptions, we find a permutation representation of degree 9606125 on a conjugacy class of subgroups of order 3, find the order of the resulting permutation group, and verify simplicity. A Monte Carlo variation of the algorithm was used to achieve better space and time efficiency. The construction of the permutation representation required four CPU days on a SPARC-server 670MP with 64 MB. The permutation representation was used implicitly in the sense that the group element was stored as a matrix, and its permutation action on a ``point'' was determined using a pre-computed data structure. Thus, additional computations required little additional space. The algorithm has also been implemented using the MasPar MP-1 SIMD parallel computer and 8 SPARC-2's running under MPI. The results of those parallel experiments are briefly reviewed.}
}


@Article{AhuLon97:mpi-rk-scattering,
  author   = {V. Ahuja and L. N. Long},
  title    = {A parallel finite-volume {R}unge-{K}utta algorithm for electromagnetic scattering},
  journal  = {Journal of Computational Physics},
  volume   = {137},
  number   = {2},
  pages    = {299--320},
  month    = nov,
  year     = {1997},
  abstract = {A 3D explicit finite volume algorithm has been developed to simulate scattering from complex geometries on parallel computers using structured body conformal curvilinear grids. Most simulations for practical 3D geometries require a large number of grid points for adequate spatial resolution making them suitable to parallel computation. The simulations have been carried out using a multi-block/zonal approach in the message passing paradigm on the SP-2. Each zone is placed on a separate processor and interprocessor communication is carried out using the Message Passing Library/Interface (MPL/MPI). Integration of Maxwell's equations is performed using the four-stage Runge-Kutta time integration method on a dual grid. This method of integrating on a staggered grid gives enhanced dissipative and dispersive characteristics. A scattered field formulation has been used and the Liao boundary condition is used at the outer nonreflecting boundary. The far zone transformation has also been implemented efficiently, using specialized MPL functions to evaluate the far zone scattering results. Results show extremely good comparisons for scattering from the sphere and the ogive with the exact solution and standard FDTD type algorithms. Comparisons for nonaxisymmetric targets like the NASA almond with experimental data has also been found to be extremely good.},
}

% The acronyms in the title are brace-protected so that sentence-casing styles
% do not lowercase them.
@Article{GorBi98,
  author   = {S. Gorlatch and H. Bischof},
  title    = {A Generic {MPI} Implementation for a Data-Parallel Skeleton: Formal Derivation and Application to {FFT}},
  journal  = {Parallel Processing Letters},
  year     = 1998,
  volume   = 8,
  number   = 4,
  month    = dec,
  pages    = {447--458},
  abstract = {We derive a provably correct, architecture-independent family of parallel implementations for a class of data-parallel algorithms, called DH (distributable homomorphisms). The implementations are well-structured SPMD programs with group-wise personalized all-to-all exchange, directly realizable in MPI. As a case study, we systematically adjust the mathematical specification of the Fast Fourier Transform (FFT) to the DH format and, thereby, obtain a generic SPMD implementation for FFT. The target program includes FFT solutions used in practice -- the binary-exchange and the 2D- and 3D-transpose -- as special cases.}
}
% Multi-issue number: the range uses an en dash (--) and the month is built
% from the standard month macros instead of a hard-coded abbreviation.
@Article{YevCinZhu98:mpi-groundwatersim,
  author   = {G. Yevi and P. Cinnella and X. Zhuang},
  title    = {On parallelizing a groundwater pollution simulator},
  journal  = {Applied Mathematics and Computation},
  year     = 1998,
  volume   = 89,
  number   = {1--3},
  month    = jan # "--" # feb,
  pages    = {313--325},
  abstract = {Domain decomposition strategies and computational mesh reordering are discussed for finite difference parallel simulations of groundwater contaminants transport. The parallel performance of point iterative methods traditionally used in groundwater pollution modelling is studied. The algorithms were implemented with red-black and wavefront reordering of the computational mesh. A standard conservative transport equation defined on a two-dimensional grid with Dirichlet boundary conditions was used for the analysis. Completely portable multiple instructions multiple data (MIMD) implementations of the algorithm were performed using message-passing interface (MPI). The runtimes of the algorithms are presented as a function of grid refinement and number of processors, and the communication overhead of the parallel simulation process is investigated, showing that the red-black reordering technique yields the best performance results. The method also provides higher efficiency and scalability when applied to large-scale problems. Optimal parameters are suggested for parallel simulation of groundwater pollution using finite difference schemes.}
}



@Article{Ian97:mpi-reducescatter,
  author   = {G. Iannello},
  title    = {Efficient algorithms for the reduce-scatter operation in {LogGP}},
  journal  = {IEEE Transactions on Parallel and Distributed Systems},
  year     = 1997,
  volume   = 8,
  number   = 9,
  month    = sep,
  pages    = {970--982},
  abstract = {We consider the problem of efficiently performing a reduce-scatter operation in a message passing system. Reduce-scatter is the composition of an element-wise reduction on vectors of n elements initially held by n processors, with a scatter of the resulting vector among the processors. In this paper, we present two algorithms for the reduce-scatter operation, designed in LogGP. The first algorithm assumes an associative and commutative reduction operator and it is optimal in LogGP within a small constant factor. The second algorithm allows the reduction operator to be noncommutative, and it is asymptotically optimal when values to be combined are large arrays. To achieve these results, we developed a complete analysis of both algorithms in LogGP, including the derivation of lower bounds for the reduce-scatter operation, and the study of the m-item version of the problem, i.e., the case when the initial elements are vectors themselves. Reduce-scatter has been included as a collective operation in the MPI standard message passing library, and can be used, for instance, in parallel matrix-vector multiply when the matrix is decomposed by columns. To model a message passing system, we adopted the LogGP model, an extension of LogP that allows the modeling of messages of different length. While this choice makes the analysis somewhat more complex, it leads to more realistic results in the case of gather/scatter algorithms.}
}



@Article{YuaSalBalMel97:mpi-load-balancing,
  author   = {X. Yuan and G. Salisbury and D. Balsara and R. Melhem},
  title    = {A load balancing package on distributed memory systems and its application to particle-particle particle-mesh ({P3M}) methods},
  journal  = {Parallel Computing},
  year     = 1997,
  volume   = 23,
  number   = 10,
  month    = nov,
  pages    = {1525--1544},
  abstract = {We present a tool, Bisect, for balanced decomposition of spatial domains. In addition to applying a nested bisection algorithm to determine the boundaries of each subdomain, Bisect replicates a user specified zone along the boundaries of the subdomain in order to minimize future interactions between subdomains. Results of running the tool on the Cray T3D system using both shared memory operations and MPI communications are reported and discussed. In addition, Bisect is used in a parallel implementation of a particle-particle/particle-mesh (P3M) simulation program on the Cray T3D system. The performance of the P3M program with different load-balancing criteria is evaluated and compared. The results show that the use of the Bisect package balances the load efficiently and minimizes communication on the T3D massively parallel system.}
}


@Article{FosKohKriCho97:mpi-task-parallel,
  author   = {I. Foster and D. R. Kohr and R. Krishnaiyer and A. Choudhary},
  title    = {A library-based approach to task parallelism in a data-parallel language},
  journal  = {Journal of Parallel and Distributed Computing},
  year     = 1997,
  volume   = 45,
  number   = 2,
  month    = sep,
  pages    = {148--158},
  abstract = {Pure data-parallel languages such as High Performance Fortran version 1 (HPF) do not allow efficient expression of mixed task/data-parallel computations or the coupling of separately compiled data-parallel modules. In this paper, we show how these common parallel program structures can be represented, with only minor extensions to the HPF model, by using a coordination library based on the Message Passing Interface (MPI). This library allows data-parallel tasks to exchange distributed data structures using calls to simple communication functions. We present microbenchmark results that characterize the performance of this library and that quantify the impact of optimizations that allow reuse of communication schedules in common situations. In addition, results from two-dimensional FFT, convolution, and multiblock programs demonstrate that the HPF/MPI library can provide performance superior to that of pure HPF. We conclude that this synergistic combination of two parallel programming standards represents a useful approach to task parallelism in a data-parallel framework, increasing the range of problems addressable in HPF without requiring complex compiler technology.}
}


@Article{BruGehRei97:mpi-resource-mgmt,
  author   = {M. Brune and J. Gehring and A. Reinefeld},
  title    = {Heterogeneous message passing and a link to resource management},
  journal  = {Journal of Supercomputing},
  year     = 1997,
  volume   = 11,
  number   = 4,
  pages    = {355--369},
  abstract = {PLUS is a light-weight, extensible and efficient communication interface. With only four commands, PLUS is almost transparent to the application code. Our current implementation supports inter-process communication between PVM, MPI and PARIX, but it can be easily extended to other vendor-specific message passing libraries. As PLUS has been designed for wide area networks, much effort has been spent on portability and on optimizing the communication speed across internet and also intranet links.}
}


@Article{Hor97,
  author   = {K. Hori},
  title    = {Supercomputer {SX-4} multinode system},
  journal  = {NEC Research \& Development},
  volume   = {38},
  number   = {4},
  pages    = {461--473},
  year     = {1997},
  abstract = {The NEC supercomputer SX-4 multinode system series consists of two models, one being HIPPI (High Performance Parallel Interface)-connected model and the other IXS (Internode Crossbar Switch)-connected model. With the IXS, a proprietary high-speed crossbar switch, the HPC (High Performance Computing) up to 1 TFLOPS (Tera Flops) has been enabled by providing the most comprehensive environment for distributed parallel processing. This also means the world's first implementation of a clustered parallel processing. In this paper, we describe the functions of IXS hardware, the new operating system functions, MPI/SX the MPI (Message Passing Interface) processor and NQS/MPI which supports the close cooperation between NQS (Network Queuing System) batch processing system and MPI.},
}


@Article{Fac97:mpi-load-balance,
  author   = {A. Fachat and K. H. Hoffmann},
  title    = {Implementation of ensemble-based simulated annealing with dynamic load balancing under {MPI}},
  journal  = {Computer Physics Communications},
  volume   = {107},
  number   = {1--3},
  pages    = {49--53},
  month    = dec,
  year     = {1997},
  abstract = {This paper describes an implementation of Ensemble Based Simulated Annealing (EBSA) with dynamic load balancing. It is running under the MPI Message Passing Library allowing parallel execution on various types of computers. The load balancing is used to get maximum use of the available processing power, even on heterogeneous workstation clusters where the machines differ a lot in computing power.},
}


% PHOENIX is the program's name (spelled in capitals in the abstract), so it is
% capitalised and brace-protected in the title.
@Article{BarHau98:mpi-app,
  author   = {E. Baron and P. H. Hauschildt},
  title    = {Parallel implementation of the {PHOENIX} generalized stellar atmosphere program. {II}. Wavelength parallelization},
  journal  = {Astrophysical Journal},
  year     = 1998,
  volume   = 495,
  number   = {1 part 1},
  month    = mar,
  pages    = {370--376},
  abstract = {We describe an important addition to the parallel implementation of our generalized nonlocal thermodynamic equilibrium (NLTE) stellar atmosphere and radiative transfer computer program PHOENIX. In a previous paper in this series we described data and task parallel algorithms we have developed for radiative transfer, spectral line opacity, and NLTE opacity and rate calculations. These algorithms divided the work spatially or by spectral lines, that is, distributing the radial zones, individual spectral lines, or characteristic rays among different processors and employ, in addition, task parallelism for logically independent functions (such as atomic and molecular line opacities). For finite, monotonic velocity fields, the radiative transfer equation is an initial value problem in wavelength, and hence each wavelength point depends upon the previous one. However, for sophisticated NLTE models of both static and moving atmospheres needed to accurately describe, e.g., novae and supernovae, the number of wavelength points is very large (200,000-300,000) and hence parallelization over wavelength can lead both to considerable speedup in calculation time and the ability to make use of the aggregate memory available on massively parallel supercomputers. Here, we describe an implementation of a pipelined design for the wavelength parallelization of PHOENIX, where the necessary data from the processor working on a previous wavelength point is sent to the processor working on the succeeding wavelength point as soon as it is known. Our implementation uses a MIMD design based on a relatively small number of standard message passing interface (MPI) library calls and is fully portable between serial and parallel computers.}
}

@Article{Yas98:complex-flows,
  author   = {O. Yasar},
  title    = {A scalable model for complex flows},
  journal  = {Computers and Mathematics with Applications},
  year     = 1998,
  volume   = 35,
  number   = 7,
  month    = apr,
  pages    = {117--128},
  abstract = {We describe a scalable parallel algorithm for numerical simulations of turbulent, radiative, magnetized, and reactive fluid + particle systems on message-passing distributed-memory computers. Accurate simulation of such complex flows has applications in engine combustion, industrial pulverized coal burners, astrophysics, inertial confinement fusion, nuclear systems, and many other strategically and economically important areas. Our algorithm has been developed based on a widely-used combustion code KIVA-3, a plasma and radiation hydrodynamics code R-MHD, a classical particle dynamics code CMDT, and a discrete ordinates particle transport code TORT. The development is being done on the Intel Paragon with PVM and MPI extensions. We report high levels of parallel efficiency and scalability (up to 1024 nodes) for a baseline engine test case, using our current message-passing reactive and turbulent flow code. The three-dimensional extension of radiation magnetohydrodynamics component is still being worked at and we hope to report further progress in the future.}
}


@Article{LepSchHei98:reactive-flow,
  author   = {J. Lepper and U. Schnell and K. R. G. Hein},
  title    = {Parallelization of a simulation code for reactive flows on the {Intel} {Paragon}},
  journal  = {Computers and Mathematics with Applications},
  year     = 1998,
  volume   = 35,
  number   = 7,
  month    = apr,
  pages    = {101--109},
  abstract = {The paper shows the implementation of a 3D simulation code for turbulent flow and combustion processes in full-scale utility boilers on an Intel Paragon XP/S computer. For the portable parallelization, an explicit approach is chosen using a domain decomposition method for the static subdivision of the numerical grid together with the SPMD programming model. The measured speedup for the presented case using a coarse grid is good, although some numerical requirements restrict the implemented message passing to strongly synchronized communication. On the Paragon, the NX message passing library is used for the computations. Furthermore, MPI and PVM are applied and their pros and cons on this computer are described. In addition to the basic message passing techniques for local and global communication, other possibilities are investigated. Besides the applicability of the vectorizing capability of the compiler, the influence of the I/O performance during computations is demonstrated. The scalability of the parallel application is presented for a refined discretization.}
}


@Article{Gor98:fft,
  author  = {S. Gorlatch},
  title   = {Programming with divide-and-conquer skeletons: A case study of {FFT}},
  journal = {Journal of Supercomputing},
  year    = 1998,
  volume  = 12,
  number  = {1--2},
  pages   = {85--97},
}

@Article{Hio98:qcd,
  author   = {S. Hioki},
  title    = {{QCDMPI}---pure {QCD} {Monte Carlo} Simulation code with {MPI}},
  journal  = {Nuclear Physics B-Proceedings Supplements},
  year     = 1998,
  volume   = 63,
  month    = apr,
  pages    = {1000--1002},
  abstract = {In this paper, outline of QCDMPI is reported. Comparison of the performances on several parallel machines; AP1000, AP1000+, AP3000, Cenju-3, Paragon, SR2201 and Workstation Cluster, is also reported.}
}


% The author is Per Brinch Hansen; the surname is the two-word "Brinch Hansen",
% so the "Last, First" form is used to stop BibTeX from treating "Brinch" as a
% given name.
@Article{Han98:mpi-eval,
  author   = {Brinch Hansen, P.},
  title    = {An evaluation of the message-passing interface},
  journal  = {ACM SIGPLAN Notices},
  year     = 1998,
  volume   = 33,
  number   = 3,
  month    = mar,
  pages    = {65--72},
  abstract = {The Message-Passing Interface (MPI) is evaluated by rewriting message parallel programs for Householder reduction, matrix multiplication, and successive overrelaxation. The author concludes that MPI is a practical programming tool. It does, however, lack the elegance and security that can only be achieved by a parallel programming language.}
}


@Article{Iss98:cfd-precond,
  author   = {E. Issman},
  title    = {Non-overlapping preconditioners for a parallel implicit {Navier-Stokes} solver},
  journal  = {Future Generation Computer Systems},
  year     = 1998,
  volume   = 13,
  number   = {4--5},
  month    = mar,
  pages    = {303--313},
  abstract = {Parallel implicit iterative solution techniques are considered for application to a compressible hypersonic Navier-Stokes solver on unstructured meshes. The construction of parallel preconditioners with quasi-optimal convergence properties with respect to their serial counterpart is a key issue in the design of modern parallel implicit schemes. Two types of non-overlapping preconditioners are presented and compared. The first one is an additive Schwarz preconditioner requiring overlapping of the mesh and the second one is based on a Schur complement formulation. Both are using incomplete LU factorisation at the subdomain level but scale differently. Results are presented for computations on the Cray T3D under the message passing interface MPI.}
}


@Article{Bar98:migration,
  author   = {A. Barak},
  title    = {The {MOSIX} multicomputer operating system for high performance cluster computing},
  journal  = {Future Generation Computer Systems},
  year     = 1998,
  volume   = 13,
  number   = {4--5},
  month    = mar,
  pages    = {361--372},
  abstract = {The scalable computing cluster at Hebrew University consists of 88 Pentium II and Pentium-Pro servers that are connected by fast Ethernet and the Myrinet LANs. It is running the MOSIX operating system, an enhancement of BSD/OS with algorithms for adaptive resource sharing, that are geared for performance scalability in a scalable computing cluster. These algorithms use a preemptive process migration for load-balancing and memory ushering, in order to create a convenient multiuser time-sharing execution environment for HPC, particularly for applications that are written in PVM or MPI. This paper begins with a brief overview of MOSIX and its resource sharing algorithms. Then the paper presents the performance of these algorithms as well as the performance of several large-scale, parallel applications.}
}


@Article{Rei97:interop,
  author   = {A. Reinefeld},
  title    = {Communicating across parallel message-passing environments},
  journal  = {Journal of Systems Architecture},
  volume   = {44},
  number   = {3--4},
  pages    = {261--272},
  month    = dec,
  year     = {1997},
  abstract = {We present a small, extensible interface for the transparent communication between vendor-specific and standard message-passing environments. With only four new commands, existing parallel applications can make use of our PLUS communication interface, thereby allowing inter-process communication with other programming environments. Much effort has been spent in optimizing the communication speed across Internet and Intranet links. Our current implementation supports process communication between PVM, MPI, and PARIX. With only marginal additional effort, the interface can be adapted to support other message-passing environments as well.},
}

% NOTE(review): the key says "maxcup" although the paper is about maxcut; the
% key is kept as is because existing documents may cite it.
@Article{hom97:mpi-maxcup,
  author   = {S. Homer},
  title    = {Design and performance of parallel and distributed approximation algorithms for maxcut},
  journal  = {Journal of Parallel and Distributed Computing},
  year     = 1997,
  volume   = 41,
  number   = 1,
  month    = oct,
  pages    = {48--61},
  abstract = {We develop and experiment with a new parallel algorithm to approximate the maximum weight cut in a weighted undirected graph. Our implementation starts with the recent (serial) algorithm of Goemans and Williamson for this problem. We consider several different versions of this algorithm, varying the interior-point part of the algorithm in order to optimize the parallel efficiency of our method. Our work aims for an efficient, practical formulation of the algorithm with close-to-optimal parallelization. We analyze our parallel algorithm in the LogP model and predict linear speedup for a wide range of the parameters. We have implemented the algorithm using the message passing interface (MPI) and run it on several parallel machines. In particular, we present performance measurements on the IBM SP2, the Connection Machine CM5, and a cluster of workstations. We observe that the measured speedups are predicted well by our analysis in the LogP model. Finally, we test our implementation on several large graphs (up to 13,000 vertices), particularly on large instances of the Ising model.}
}

@Article{War:mpi-cluster,
  author   = {T. M. Warschko},
  title    = {{ParaStation}: Efficient parallel computing by clustering workstations: Design and evaluation},
  journal  = {Journal of Systems Architecture},
  year     = 1997,
  volume   = 44,
  number   = {3--4},
  month    = dec,
  pages    = {241--260},
  abstract = {ParaStation is a communications fabric for connecting off-the-shelf workstations into a supercomputer. The fabric employs technology used in massively parallel machines and scales up to 4096 nodes. ParaStation's user-level message passing software preserves the low latency of the fabric by taking the operating system out of the communication path, while still providing full protection in a multiprogramming environment. The programming interface presented by ParaStation consists of a UNIX socket emulation and widely used parallel programming environments such as PVM, P4, and MPI. Implementations of ParaStation using various platforms, such as Digital's AlphaGeneration workstations and Linux PCs, achieve end-to-end (process-to-process) latencies as low as 2~$\mu$s and a sustained bandwidth of up to 15 Mbyte/s per channel, even with small packets. Benchmarks using PVM on ParaStation demonstrate real application performance of 1 GFLOP on an 8-node cluster.}
}

@Article{War98:mpi-cluster,
  author   = {T. M. Warschko},
  title    = {The {ParaStation} project: Using workstations as building blocks for parallel computing},
  journal  = {Information Sciences},
  year     = 1998,
  volume   = 106,
  number   = {3--4},
  month    = may,
  pages    = {277--292},
  abstract = {The ParaStation communication fabric provides a high-speed communication network with user-level access to enable efficient parallel computing on workstation clusters. The architecture, implemented on off-the-shelf workstations coupled by the ParaStation communication hardware, removes the kernel and common network protocols from the communication path while still providing full protection in a multiuser, multiprogramming environment. The programming interface presented by ParaStation consists of a UNIX socket emulation and widely used parallel programming environments such as PVM, P4, and MPI. This allows porting a wide range of client/server and parallel applications to the ParaStation architecture. Implementations of ParaStation using various platforms, such as Digital's AlphaGeneration workstations and Linux PCs, achieve end-to-end (process-to-process) latencies as low as 2~$\mu$s and a sustained bandwidth of up to 15 Mbyte/s per channel with small packets. Benchmarks using PVM on ParaStation demonstrate real application performance of 1 GFLOP on an 8-node cluster.}
}

@Article{Dan98:mpi-scheduling,
  author   = {M. A. R. Dantas},
  title    = {Efficient scheduling of {MPI} applications on networks of workstations},
  journal  = {Future Generation Computer Systems},
  volume   = {13},
  number   = {6},
  pages    = {489--499},
  month    = may,
  year     = {1998},
  abstract = {The availability of a large number of workstations connected through a network can represent an attractive option for high-performance computing for many applications. The message-passing interface (MPI) software environment is an effort from many organisations to define a de facto message-passing standard. In other words, the original specification was not designed as a comprehensive parallel programming environment and some researchers agree that the standard should be preserved as simple and clean as possible. Nevertheless, a software environment such as MPI should have somehow a scheduling mechanism for the effective submission of parallel applications on network of workstations. This paper presents an alternative lightweight approach called Selective-MPI (S-MPI), which was designed to enhance the efficiency of the scheduling of applications on an MPI implementation environment.},
}

@Article{Cou98:mpi-c++,
  author   = {O. Coulaud},
  title    = {Para++: A high level {C++} interface for message passing},
  journal  = {Journal of Parallel and Distributed Computing},
  year     = 1998,
  volume   = 51,
  number   = 1,
  month    = may,
  pages    = {46--62},
  abstract = {This paper describes a high level C++ interface for message passing applications. Our interface is built on top of PVM and MPI. The two main contributions are to allow a quicker design of parallel applications without any important drop of performances. We introduce two levels of tasks and use C++ streams for communications. We also present a performance study over both PVM and MPI to show the overhead of our implementation. Finally, we detail two applications based on the heat equation to explain how Para++ can be used for SPMD and MPMD applications.}
}


@Article{Sal98:mpi-genetic,
  author   = {A. Salhi},
  title    = {Parallel implementation of a genetic-programming based tool for symbolic regression},
  journal  = {Information Processing Letters},
  year     = 1998,
  volume   = 66,
  number   = 6,
  month    = jun,
  pages    = {299--307},
  abstract = {We report on a parallel implementation of a tool for symbolic regression, the algorithmic mechanism of which is based on genetic programming, and communication is handled using MPI. The implementation relies on a random islands model (RIM), which combines both the conventional islands model where migration of individuals between islands occurs periodically and niching where no migration takes place. The system was designed so that the algorithm is synergistic with parallel/distributed architectures, and works to make use of processor time and minimum use of network bandwidth without complicating the sequential algorithm significantly. Results on an IBM SP2 are included.}
}

% NOTE(review): the page range 57--51 ends before it starts, so one of the two
% numbers is wrong. It is left as found; verify against the publisher record.
@Article{Har98:mpi-application,
  author   = {H. K. Harbury},
  title    = {Parallel computation for electronic waves in quantum corrals},
  journal  = {VLSI Design},
  year     = 1998,
  volume   = 6,
  number   = {1--4},
  pages    = {57--51},
  abstract = {Recent scanning tunneling microscopy (STM) studies on the (111) faces of noble metals have directly imaged electronic surface-confined states and dramatic standing-wave patterns have been observed [1,2]. We solve for the local density of electronic states in these ``leaky'' quantum corral confinement structures using a coherent elastic scattering theory. We seek solutions of the two-dimensional Schr{\"o}dinger equation compatible with non-reflecting boundary conditions which asymptotically satisfy the Sommerfeld radiation condition [11,14]. The large matrices generated by the discretization of realistic quantum corral structures require the use of sparse matrix methods. In addition, a parallel finite element solution was undertaken using the message passing interface standard (MPI) and the Portable, Extensible Toolkit for Scientific Computation (PETSc) [5] for an efficient computational solution on both distributed and shared memory architectures. Our calculations reveal excellent agreement with the reported experimental dI/dV STM data.}
}

@Article{Jak98:mpi-application,
  author   = {U. Jakobus},
  title    = {Analysis of electromagnetic scattering problems by an iterative combination of {MoM} with {GMT} using {MPI} for the communication},
  journal  = {Microwave and Optical Technology Letters},
  volume   = {19},
  number   = {1},
  pages    = {1--4},
  month    = sep,
  year     = {1998},
  abstract = {A hybrid method is proposed combining the method of moments (MoM) with the generalized multipole technique (GMT) for the efficient analysis of electromagnetic radiation and scattering problems involving metallic as well as dielectric bodies. An iterative coupling scheme is applied so that only some small changes to the MoM and GMT formulations are required, making it very attractive for the combination of already existing MoM and GMT codes. During the iteration, the MoM and GMT processes are executed in parallel, and communication is done using the message-passing interface (MPI).},
}

% The bare token APR-JUL is read by BibTeX as one undefined macro name, which
% yields an empty month; the range is built from the standard month macros.
@Article{Ril98:mpi-application,
  author   = {C. J. Riley},
  title    = {Distributed-memory computing with the {Langley} {Aerothermodynamic} {Upwind} {Relaxation} {Algorithm} ({LAURA})},
  journal  = {Advances in Engineering Software},
  year     = 1998,
  volume   = 29,
  number   = {3--6},
  pages    = {317--324},
  month    = apr # "--" # jul,
  abstract = {The Langley Aerothermodynamic Upwind Relaxation Algorithm (LAURA), a Navier-Stokes solver, has been modified for use in a parallel, distributed-memory environment using the Message-Passing Interface (MPI) standard. A standard domain decomposition strategy is used in which the computational domain is divided into subdomains with each subdomain assigned to a processor. Performance is examined on dedicated parallel machines and a network of desktop workstations. The effect of domain decomposition and frequency of boundary updates on performance and convergence is also examined for several realistic configurations and conditions typical of large-scale computational fluid dynamic analysis.}
}


% The issue spans two months; the range is concatenated from the standard month macros.
@Article{Wan98:mpi-application,
author = {P. Wang},
title = {Massively parallel finite volume computation of three-dimensional thermal convective flows},
journal = {Advances in Engineering Software},
year = 1998,
volume = 29,
number = {3--6},
pages = {307--315},
month = apr # "--" # jul,
abstract = {A parallel implementation of the finite volume method for three-dimensional, time-dependent, thermal convective flows is presented. The algebraic equations resulting from the finite volume discretization are solved by a parallel multigrid method. A flexible parallel code has been implemented on distributed-memory systems, by using domain decomposition techniques and the MPI communication software. The code uses one-, two- or three-dimensional partition according to different geometries. It currently runs on the Intel Paragon, the Cray T3D, T3E, the IBM SP2 and the Beowulf systems, which can be ported easily to other parallel systems. A comparison of the wallclock time of the code between these systems is made, and code performances with respect to different numbers of processors are presented.}
}

% The issue spans two months; the range is concatenated from the standard month macros.
@Article{Dan98:mpi-application,
author = {K. T. Danielson},
title = {Nonlinear dynamic finite element analysis on parallel computers using {FORTRAN} 90 and {MPI}},
journal = {Advances in Engineering Software},
year = 1998,
volume = 29,
number = {3--6},
pages = {179--186},
month = apr # "--" # jul,
abstract = {A nonlinear explicit dynamic finite element code for use on scalable computers is presented. The code was written entirely in FORTRAN 90, but uses MPI for all interprocessor communication. Although MPI is not formally a standard for FORTRAN 90, the code runs properly in parallel on CRAY T3E, IBM SP, and SGI ORIGIN 2000 computing systems. Issues regarding the installation, portability, and effectiveness of the FORTRAN 90-MPI combination on these machines are discussed. An algorithm that overlaps message passing and computations of the explicit finite element equations is also presented and evaluated. Several large-scale ground-shock analyses demonstrate the varying combined importance of load balance and interprocessor communication among the different computing platforms. The analyses were performed on only a few to hundreds of processors with excellent speedup and scalability.}
}


% The issue spans two months; the range is concatenated from the standard month macros.
% NOTE(review): this entry has no pages field -- add it once the page range is confirmed.
@Article{Vat98:mpi-application,
author = {V. N. Vatsa},
title = {Viscous flow computations for complex geometries on parallel computers},
journal = {Advances in Engineering Software},
year = 1998,
volume = 29,
number = {3--6},
month = apr # "--" # jul,
abstract = {A widely used computational fluid dynamics (CFD) code known as TLNS3D, which was developed for large, shared-memory computers, is ported to a distributed computing environment. An engineering approach is used here to parallelize this code so that minimal deviation from the original (non-parallel) code is incurred. A natural partitioning along grid blocks is adopted in which one or more blocks are distributed to each of the available processors. An automatic, static load-balancing strategy is employed for equitable distribution of computational work to specified processors. The message passing interface (MPI) protocols are incorporated for data communication. Both synchronous and asynchronous communication modes have been incorporated. As the number of processors is increased, the asynchronous communication mode shows much better scalability and clearly outperforms the synchronous mode of communication.}
}

@Article{Riv98:mpi-application,
author = {W. Rivera-Gallego},
title = {A genetic algorithm for circulant {Euclidean} distance matrices},
journal = {Applied Mathematics and Computation},
year = 1998,
volume = 97,
number = {2--3},
pages = {197--208},
month = DEC,
abstract = {This paper presents a fast genetic algorithm to determine three-dimensional configurations of points that generate circulant Euclidean Distance Matrices (EDMs). A parallel implementation is possible by using the message passing interface (MPI) standard. In addition, theoretical results about the polyhedral structure of both the cone of circulant symmetric positive semidefinite matrices and the cone of circulant EDMs are introduced.}
}

% Double volume and two-month issue; the month range is concatenated from the standard month macros.
@Article{Ada98:mpi-application,
author = {P. Adamidis},
title = {Steel strip production --- a pilot application for coupled simulation with several calculation systems},
journal = {Journal of Materials Processing Technology},
year = 1998,
volume = {80--81},
pages = {330--336},
month = aug # "--" # sep,
abstract = {For the simulation of technological and natural processes in specific application domains, efficient calculation software solving differential equation systems on grid-based computational models is available, especially in the area of computer-aided engineering (CAE). To handle a so-called 'multiphysics' problem, for example the fluid flow and metal forming process in a twin-roll casting arrangement for steel strip production, several calculation systems usually have to be employed in a high-performance computing environment, e.g. on parallel computers. The GRISSLi Coupling Interface is a software tool facilitating the coupled computation based on the message passing standard MPI.}
}


@article{Dow98:mpi-implementation,
  author   = {P. W. Dowd},
  title    = {{BLAST}: broadband lightweight {ATM} secure transport for high-performance distributed computing},
  journal  = {Computer Communications},
  volume   = {21},
  number   = {12},
  pages    = {1040--1057},
  month    = aug,
  year     = {1998},
  abstract = {This paper investigates the use of ATM for cluster-based computing. The need for a native ATM API is discussed as well as the performance of message passing libraries (MPL) that are written to use such an API to exploit the advantages of a high-speed network for cluster-based computing. The MPLs offer a standard interface, such as PVM or MPI, and interoperate with existing TCP/IP- and UDP/IP-based versions in addition to the ATM API environment. The interoperability extensions made to two MPLs, MPI and Prowess, which allow a hybrid environment of both ATM and TCP-based legacy network technology will be described. Shared object space (SOS), an extension to the MPLs, is described that helps support the geographically distributed computing (GDC) environment through latency hiding. It allows a user to develop applications in a shared memory type of environment. The native ATM API which supports cluster-based computing is described in this paper. This API provides a reliable transport interface to the MPL which has been optimized for an ATM environment. The transport protocol is a low-state design that optimizes the performance based on the available bandwidth, buffer constraints, propagation delay characteristics and security requirements of a particular connection.},
}

@article{Kac98:mpi-tool,
  author   = {P. Kacsuk},
  title    = {{GRADE}: A graphical programming environment for multicomputers},
  journal  = {Computers and Artificial Intelligence},
  volume   = {17},
  number   = {5},
  pages    = {417--427},
  year     = {1998},
  abstract = {To provide high-level graphical support for developing message passing programs, an integrated programming environment (GRADE) is being developed. GRADE currently provides tools to construct, execute, debug, monitor and visualise message-passing based parallel programs. GRADE offers the programmer an integrated graphical user interface during the whole life-cycle of program development and provides high-level graphical programming abstraction mechanisms to construct parallel applications. The current version of GRADE can generate C+PVM code but there is no theoretical obstacle to extend it for supporting MPI [9] and FORTRAN. Those new features of the GRADE graphical environment are described in the paper that enhanced GRADE towards a professional parallel programming environment.},
}

@Article{Ras98:mpi-application,
author = {J. Rasch},
title = {6-dimensional integrals and supercomputers},
journal = {Computer Physics Communications},
year = 1998,
volume = 114,
number = {1--3},
pages = {378--384},
month = NOV,
abstract = {Recently, a numerical method has been developed for the evaluation of general 6-dimensional integrals (6DIME), which has been successfully applied to the study of (e,2e) and (gamma,2e) processes. Details of the parallelization of that code are given using MPI and the scaling behaviour with respect to the number of nodes is presented. Almost full load balancing is obtained. The method is extended to include two centre scattering problems.}
}

@Article{Chu98:mpi-balancing,
author = {Y. Chung},
title = {An asynchronous algorithm for balancing unpredictable workload on distributed-memory machines},
journal = {ETRI Journal},
year = 1998,
volume = 20,
number = 4,
pages = {346--360},
month = DEC,
abstract = {It is challenging to parallelize problems with irregular computation and communication. In this paper, we propose an asynchronous algorithm for balancing unpredictable workload on distributed-memory machines. By using an initial workload estimate, we first partition the computations such that the workload is distributed evenly across the processors. In addition, we perform task migrations dynamically for adapting to the evolving workload. To demonstrate the usefulness of our load balancing strategy, we conducted experiments on an IBM SP2 and a Cray T3D. Experimental results show that our task migration strategy can balance unpredictable workload with little overhead. Our code using C and MPI is portable onto other distributed-memory machines.}
}

@Article{Ber99:mpi-tools,
author = {M. Bertozzi},
title = {Tools for code optimization and system evaluation of the image processing system {PAPRICA-3}},
journal = {Journal of Systems Architecture},
year = 1999,
volume = 45,
number = {6--7},
pages = {519--542},
month = JAN,
abstract = {This paper presents the complex environment that was built to ease the prototyping of real-time applications on the PAPRICA-3 massively parallel system. Applications are developed in C++ using high level data types and the corresponding Assembly code is automatically created by a code generator. A stochastic code optimizer takes the assembly code and improves it according to a genetic approach; due to the high computational power required by this approach, the stochastic code optimizer was implemented with MPI and runs in parallel on a cluster of workstations. The availability of this complex environment allowed to test the performance of the system and to tune it according to some target applications before the actual development of the hardware. For this purpose a system-level simulator was also built to determine the number of clock cycles required to run a specific segment of code. The whole environment has been used to validate possible solutions for the hardware system and to develop, test, and tune several real-time image processing applications. The hardware system is now completely defined.}
}

% The citation key keeps its historical spelling so that existing citations still resolve.
@Article{Lee99:mpi-applicatin,
author = {P. C. S. Lee},
title = {On the parallelization of a global climate-chemistry modeling system},
journal = {Atmospheric Environment},
year = 1999,
volume = 33,
number = 4,
pages = {675--681},
month = FEB,
abstract = {Coupled climate-chemistry simulations are computationally intensive owing to the spatial and temporal scope of the problem. In global chemistry models, the time integrations encountered in the chemistry and aerosol modules usually comprise the major CPU consumption. Parallelization of these segments of the code can contribute to multifold CPU speed-ups with minimal modification of the original serial code. This technical note presents a single program-multiple data (SPMD) strategy applied to the time-split chemistry modules of a coupled climate - global tropospheric chemistry model. Latitudinal domain decomposition is adopted along with a dynamic load-balancing technique that uses the previous time-step's load/latitude estimates for distributing the latitude bands amongst the processors. The coupled model is manually parallelized using the Message Passing Interface standard (MPI) on a distributed memory platform (IBM-SP2). Load-balancing efficiencies and the associated MPI overheads are discussed. Overall speed-ups and efficiencies are also calculated for a series of runs employing up to eight processors.}
}

@Article{May99:mpi-application,
author = {F. May},
title = {Mathematical modelling of glass melting furnace design with regard to {NOx} formation},
journal = {Glastechnische Berichte-Glass Science and Technology},
year = 1999,
volume = 72,
number = 1,
pages = {1--6},
month = JAN,
abstract = {A three-dimensional mathematical model for turbulent flow and combustion on the basis of turbulence/chemistry interactions and radiative heat transfer taking into account spectral effects of surrounding walls and combustion gases is described. For this the transport equation for radiative intensity was split into different wavelength ranges. A block-structured finite volume grid with local refinements was used to solve the governing equations. The calculation domain is subdivided into a number of subdomains which are linked within the solver based on the Message Passing Interface library. Computed distributions of velocity, temperature, and heat fluxes are given. Results of a parametric study in a producing horseshoe furnace by increasing the height of the furnace with regard to NOx concentration distributions are presented.}
}

% The issue spans two months; the range is concatenated from the standard month macros.
@Article{Reu99:mpi-application,
author = {J. Reuther},
title = {Aerodynamic shape optimization of supersonic aircraft configurations via an adjoint formulation on distributed memory parallel computers},
journal = {Computers and Fluids},
year = 1999,
volume = 28,
number = {4--5},
pages = {675--700},
month = may # "--" # jun,
abstract = {This work describes the application of a control theory-based aerodynamic shape optimization method to the problem of supersonic aircraft design. A high fidelity computational fluid dynamics (CFD) algorithm modelling the Euler equations is used to calculate the aerodynamic properties of complex three-dimensional aircraft configurations. The design process is greatly accelerated through the use of both control theory and parallel computing. Control theory is employed to derive the adjoint differential equations whose solution allows for the evaluation of design gradient information at a fraction of the computational cost required by previous design methods. The resulting problem is then implemented in parallel using a domain decomposition approach, an optimized communication schedule, and the Message Passing Interface (MPI) Standard for portability and efficiency. In our earlier studies, the serial implementation of this design method was shown to be effective for the optimization of airfoils, wings, wing-bodies, and complex aircraft configurations using both the potential equation and the Euler equations. In this work, our concern will be to extend the methodologies such that the combined capabilities of these new technologies can be used routinely and efficiently in an industrial design environment. The aerodynamic optimization of a supersonic transport configuration is presented as a demonstration test case of the capability. A particular difficulty of this test case is posed by the close coupling of the propulsion/airframe integration.}
}

% The issue spans two months; the range is concatenated from the standard month macros.
% Volume agrees with the other 1999 Computers and Fluids entry for issue 4--5 in this file.
@Article{Vat99:mpi-application,
author = {V. N. Vatsa},
title = {Parallelization of a multiblock flow code: an engineering implementation},
journal = {Computers and Fluids},
year = 1999,
volume = 28,
number = {4--5},
pages = {603--614},
month = may # "--" # jun,
abstract = {Current trends in computer hardware are dictating a gradual shift toward the use of clusters of relatively inexpensive but powerful workstations, or massively parallel processing (MPP) machines, for scientific computing. However, most computational fluid dynamics (CFD) codes in use today were developed for large, shared-memory machines and are not readily portable to the distributed computing environment. One major hurdle in porting CFD codes to distributed computing platforms is the difficulty encountered in partitioning the problem so that the computation-to-communication ratio for each compute node (process) is maximized and the idle time during which one node waits for other nodes to transfer data is minimized. In the present work, pertinent issues involved in the parallelization of a widely used multiblock Navier-Stokes code TLNS3D are discussed. An engineering approach is used here to parallelize this code so that minimal deviation from the original (nonparallel) code is incurred. A natural partitioning along grid blocks is adopted in which one or more blocks are distributed to each of the available nodes. An automatic, static load-balancing strategy is employed for equitable distribution of computational work to specified nodes. Both parallel virtual machine (PVM) and message passing interface (MPI) protocols are incorporated for data communication to allow maximum portability to a wide range of computer configurations. Results are presented that are comparable with a priori estimates of performance for distributed computing and that are competitive in terms of central processing unit (CPU) time and wall time usage with large, shared-memory supercomputers.}
}

@Article{Dzw99:mpi-application,
author = {W. Dzwinel},
title = {Method of particles in visual clustering of multi-dimensional and large data sets},
journal = {Future Generation Computer Systems},
year = 1999,
volume = 15,
number = 3,
pages = {365--379},
month = APR,
abstract = {A method dedicated for visual clustering of N-dimensional data sets is presented. It is based on the classical feature extraction technique - the Sammon's mapping. This technique empowered by a particle approach used in the Sammon's criterion minimization makes the method more reliable, general and efficient. To show its reliability, the results of tests are presented, which were made to exemplify the algorithm 'immunity' from data errors. The general character of the method is emphasized and its role in multicriterial analysis discussed. Due to inherent parallelism of the methods, which are based on the particle approach, the visual clustering technique can be implemented easily in parallel environment. It is shown that parallel realization of the mapping algorithm enables the visualization of data sets consisting of more than $10^4$ multi-dimensional data points. The method was tested in the PVM, MPI and data parallel environments on an HP/Convex SPP/1600. In this paper, the authors compare the parallel algorithm performance for these three interfaces. The approach to visual clustering, presented in the paper, can be used in visualization and analysis of large multi-dimensional data sets.}
}

@Article{Wan99:mpi-application,
author = {P. Wang},
title = {Parallel multigrid finite volume computation of three-dimensional thermal convection},
journal = {Computers and Mathematics with Applications},
year = 1999,
volume = 37,
number = 9,
pages = {49--60},
month = MAY,
abstract = {A parallel implementation of the finite volume method for three-dimensional, time-dependent, thermal convective flows is presented. The algebraic equations resulting from the finite volume discretization, including a pressure equation which consumes most of the computation time, are solved by a parallel multigrid method. A flexible parallel code has been implemented on the Intel Paragon, the Cray T3D, and the IBM SP2 by using domain decomposition techniques and the MPI communication software. The code can use 1D, 2D, or 3D partitions as required by different geometries, and is easily ported to other parallel systems. Numerical solutions for air (Prandtl number Pr = 0.733) with various Rayleigh numbers up to $10^7$ are discussed.}
}


@Article{Bar99:mpi-application,
author = {S. T. Barnard},
title = {An {MPI} implementation of the {SPAI} preconditioner on the {T3E}},
journal = {International Journal of High Performance Computing Applications},
year = 1999,
volume = 13,
number = 2,
pages = {107--123},
month = {Summer},
abstract = {The authors describe and test spai-1.1, a parallel MPI implementation of the sparse approximate inverse (SPAI) preconditioner. They show that SPAI can be very effective for solving a set of very large and difficult problems on a Cray T3E. The results clearly show the value of SPAI (and approximate inverse methods in general) as the viable alternative to ILU-type methods when facing very large and difficult problems. The authors strengthen this conclusion by showing that spai-1.1 also has very good scaling behavior.}
}

@Article{Ree99:mpi-application,
author = {J. S. Reeve},
title = {An efficient parallel version of the {Householder-QL} matrix diagonalisation algorithm},
journal = {Parallel Computing},
year = 1999,
volume = 25,
number = 3,
pages = {311--319},
month = MAR,
abstract = {In this paper we report an effective parallelisation of the Householder routine for the reduction of a real symmetric matrix to tri-diagonal form and the QL algorithm for the diagonalisation of the resulting matrix. The Householder algorithm scales like $\alpha N^3/P + \beta N^2 \log_2(P)$ and the QL algorithm like $\gamma N^2 + \delta N^3/P$ as the number of processors P is increased for fixed problem size. The constant parameters $\alpha$, $\beta$, $\gamma$ and $\delta$ are obtained empirically. When the eigenvalues only are required the Householder method scales as above while the QL algorithm remains sequential. The code is implemented in C in conjunction with the message passing interface (MPI) libraries and verified on a sixteen node IBM SP2 and for real matrices that occur in the simulation of properties of crystalline materials.}
}

@Article{Gen99:mpi-application,
author = {C. Gennaro},
title = {Parallelising the Mean Value Analysis algorithm},
journal = {Transactions of the Society for Computer Simulation International},
year = 1999,
volume = 16,
number = 1,
pages = {16--22},
month = MAR,
abstract = {The Mean Value Analysis (MVA) algorithm is one of the most popular for evaluating the performance of separable (or product-form) queueing networks. Although its complexity is modest when jobs are indistinguishable, the introduction of different customer classes rapidly increases its computational cost. The problems of parallelising the algorithm while retaining its conceptual simplicity are examined. In particular, a parallel implementation of MVA on a distributed memory machine is developed using the MPI library for communication.}
}

% The issue spans two months; the range is concatenated from the standard month macros.
@Article{Ble99:mpi-application,
author = {G. E. Blelloch},
title = {Design and implementation of a practical parallel {D}elaunay algorithm},
journal = {Algorithmica},
year = 1999,
volume = 24,
number = {3--4},
pages = {243--269},
month = jul # "--" # aug,
abstract = {Initial experiments using a variety of distributions showed that our parallel algorithm was within a factor of 2 in work from the best sequential algorithm. Based on these promising results, the algorithm was implemented using C and an MPI-based toolkit. Compared with previous work, the resulting implementation achieves significantly better speedups over good sequential code, does not assume a uniform distribution of points, and is widely portable due to its use of MPI as a communication mechanism. Results are presented for the IBM SP2, Cray T3D, SGI Power Challenge, and DEC AlphaCluster.}
}

@Article{Coe99:mpi-application,
author = {P. J. Coelho},
title = {Modelling of a utility boiler using parallel computing},
journal = {Journal of Supercomputing},
year = 1999,
volume = 13,
number = 2,
pages = {211--232},
month = MAR,
abstract = {A mathematical model for the simulation of the turbulent reactive flow and heat transfer in a power station boiler has been parallelized. The mathematical model is based on the numerical solution of the governing equations for mass, momentum, energy and transport equations for the scalar quantities. The k-epsilon model and the conserved scalar/prescribed probability density function formalism are employed. Radiative heat transfer is calculated using the discrete ordinates method. The code has been fully parallelized using the spatial domain decomposition approach and MPI. Calculations were performed using an IBM-SP2. It is shown that the computational requirements are reduced and the parallel efficiency increases if the mean temperature and density are calculated a priori, and stored. The role of the different parts of the code on the parallel performance is discussed. A speedup of 5.9 is achieved using 8 processors.}
}

@Article{Rus99:mpi-cluster,
author = {S. H. Russ},
title = {Using {Hector} to run {MPI} programs over networked workstations},
journal = {Concurrency: Practice and Experience},
year = 1999,
volume = 11,
number = 4,
pages = {189--204},
month = APR,
abstract = {Networked workstations represent an increasingly popular distributed platform for running large parallel programs. They can present a low-cost alternative to purchasing supercomputer time or additional usable computational capability. Several capabilities are desirable in order to harness workstations, including support for a widely accepted parallel programming environment, task migration, intelligent resource allocation, fault tolerance, and totally transparent support of these features. The Hector system is designed to provide these capabilities to MPI programs. The structure of the system and experiences using the system on loaded workstations to run scientific codes are described.}
}

@Article{Ros99:mpi-tool,
author = {T. Rossi},
title = {A parallel fast direct solver for block tridiagonal systems with separable matrices of arbitrary dimension},
journal = {SIAM Journal on Scientific Computing},
year = 1999,
volume = 20,
number = 5,
pages = {1778--1796},
month = MAY,
abstract = {A parallel fast direct solution method for linear systems with separable block tridiagonal matrices is considered. Such systems appear, for example, when discretizing the Poisson equation in a rectangular domain using the five-point finite difference scheme or the piecewise linear finite elements on a triangulated, possibly nonuniform rectangular mesh. The method under consideration has the arithmetical complexity O(N log N), and it is closely related to the cyclic reduction method, but instead of using the matrix polynomial factorization, the so-called partial solution technique is employed. Hence, in this paper, the method is called the partial solution variant of the cyclic reduction method (PSCR method). The method is presented and analyzed in a general radix-q framework and, based on this analysis, the radix-4 variant is chosen for parallel implementation using the MPI standard. The generalization of the method to the case of arbitrary block dimension is described. The numerical experiments show the sequential efficiency and numerical stability of the PSCR method compared to the well-known BLKTRI implementation of the generalized cyclic reduction method. The good scalability properties of the parallel PSCR method are demonstrated in a distributed-memory Cray T3E-750 computer.}
}

@Article{Bou99:mpi-algorithm,
author = {P. Boulet},
title = {Static tiling for heterogeneous computing platforms},
journal = {Parallel Computing},
year = 1999,
volume = 25,
number = 5,
pages = {547--568},
month = MAY,
abstract = {In the framework of fully permutable loops, tiling has been extensively studied as a source-to-source program transformation. However, little work has been devoted to the mapping and scheduling of the tiles on physical processors. Moreover, targeting heterogeneous computing platforms has, to the best of our knowledge, never been considered. In this paper we extend static tiling techniques to the context of limited computational resources with different-speed processors. In particular, we present efficient scheduling and mapping strategies that are asymptotically optimal. The practical usefulness of these strategies is fully demonstrated by MPI experiments on a heterogeneous network of workstations.}
}

@Article{Ros99:mpi-application,
author = {I. Rosenblum},
title = {Multi-processor molecular dynamics using the {Brenner} potential: Parallelization of an implicit multi-body potential},
journal = {International Journal of Modern Physics C},
year = 1999,
volume = 10,
number = 1,
pages = {189--203},
month = FEB,
abstract = {We present computational aspects of Molecular Dynamics calculations of thermal properties of diamond using the Brenner potential. Parallelization was essential in order to carry out these calculations on samples of suitable sizes. Our implementation uses MPI on a multi-processor machine such as the IBM SP2. Three aspects of parallelization of the Brenner potential are discussed in depth. These are its long-range nature, the need for different parallelization algorithms for forces and neighbors, and the relative expense of force calculations compared to that of data communication. The efficiency of parallelization is presented as a function of different approaches to these issues as well as of cell size and number of processors employed in the calculation. In the calculations presented here, information from almost half of the atoms were needed by each processor even when 16 processors were used. This made it worthwhile to avoid unnecessary complications by making data from all atoms available to all processors. Superlinear speedup was achieved for four processors (by avoiding paging) with 512 atom samples, and 5ps long trajectories were calculated (for 5120 atom samples) in 53 hours using 16 processors; 514 hours would have been needed to complete this calculation using a serial program. Finally, we discuss and make available a set of routines that enable MPI-based codes such as ours to be debugged on scalar machines.}
}

@article{Luo99:mpi-comparision,
  author   = {Y. Luo},
  title    = {Shared memory vs. message passing: the {COMOPS} benchmark experiment},
  journal  = {Journal of Supercomputing},
  volume   = {13},
  number   = {3},
  pages    = {283--301},
  month    = may,
  year     = {1999},
  abstract = {This paper presents the comparison of the COMOPS benchmark performance in MPI and shared memory on four different shared memory platforms: the DEC AlphaServer 8400/300, the SGI Power Challenge, the SGI Origin2000, and the HP-Convex Exemplar SPP1600. The paper also qualitatively analyzes the obtained performance data based on an understanding of the corresponding architecture and the MPI implementations. Some conclusions are made for the inter-processor communication performance on these four shared memory platforms.},
}


@Article{Hio99:mpi-application,
author = {S. Hioki},
title = {{QCDimMPI: MPI} code for {QCD} with an improved action},
journal = {Nuclear Physics B-Proceedings Supplements},
year = 1999,
volume = 73,
pages = {895--897},
month = MAR,
abstract = {QCDimMPI[1] is a simulation code for pure SU(3) gauge theory with an improved action consisting of 1 x 1 and 2 x 1 plaquettes. It uses Fortran77 and the Message Passing Interface Standard, MPI[2]. QCDimMPI is an extended version of QCDMPI. It is portable, allows simulations in any number of dimensions, on any number of processors, and with arbitrary dimensional partitioning. It requires a rather small working area, and yields excellent performance on single processor computers and a wide variety of parallel computers which support MPI. The program provides information on link update time and communications time. In this paper, an outline of QCDimMPI is given, and benchmark results on several parallel computers are reported.}
}


@Article{Gol99:mpi-application,
author = {A. Goller},
title = {Parallel processing strategies for large {SAR} image data sets in a distributed environment},
journal = {Computing},
year = 1999,
volume = 62,
number = 4,
pages = {277--291},
abstract = {Key algorithms like image matching and Shape-from-Shading were parallelized mainly using MPI, and ported onto suitable computer architectures. Our experiments showed that all algorithms perform well, and they further proved the concept of CDIP to be beneficial: Usability of all integrated algorithms was significantly improved, mainly due to less user-centered network traffic, simple access to supercomputers, the creation of method sequences, and easy-to-use and well maintained algorithms.}
}

@Article{Chi99:mpi-implementation,
author = {A. Chien},
title = {Design and evaluation of an {HPVM}-based {Windows NT} supercomputer},
journal = {International Journal of High Performance Computing Applications},
year = 1999,
volume = 13,
number = 3,
pages = {201--219},
month = {Fall},
abstract = {We describe the design and evaluation of a 192-processor Windows NT cluster for high performance computing based on the High Performance Virtual Machine (HPVM) communication suite. While other clusters have been described in the literature, building a 58 GFlop/s NT cluster to be used as a general-purpose production machine for NCSA required solving new problems. The HPVM software meets the challenges represented by the large number of processors, the peculiarities of the NT operating system, the need for a production-strength job submission facility and the requirement for mainstream programming interfaces. First, HPVM provides users with a collection of standard APIs like MPI, Shmem, Global Arrays with supercomputer class performance (13 $\mu$s minimum latency, 84 MB/s peak bandwidth for MPI), efficiently delivering Myrinet's hardware performance to application programs. Second, HPVM provides cluster management and scheduling (through integration with Platform Computing's LSF). Finally, HPVM addresses Windows NT's remote access problem, providing convenient remote access and job control (through a graphical Java-applet front-end). Given the production nature of the cluster, the performance characterization is largely based on a sample of the NCSA scientific applications the machine will be running. The side-by-side comparison with other present-generation NCSA supercomputers shows the cluster to be within a factor of 2 to 4 of the SGI Origin 2000 and Cray T3E performance at a fraction of the cost. The inherent scalability of the cluster design produces a comparable or better speedup than the Origin 2000 despite a limitation in the HPVM flow control mechanism.}
}


@Article{Ros99:mpi-tools,
author = {T. Rossi},
title = {Parallel fictitious domain method for a non-linear elliptic {Neumann} boundary value problem},
journal = {Numerical Linear Algebra with Applications},
year = 1999,
volume = 6,
number = 1,
pages = {51--60},
month = JAN # "--" # FEB,
abstract = {Parallelization of the algebraic fictitious domain method is considered for solving Neumann boundary value problems with variable coefficients. The resulting method is applied to the parallel solution of the subsonic full potential flow problem which is linearized by the Newton method. Good scalability of the method is demonstrated on a Cray T3E distributed memory parallel computer using MPI in communication.}
}


@Article{Zak99:mpi-tools,
author = {O. Zaki},
title = {Toward scalable performance visualization with {Jumpshot}},
journal = {International Journal of High Performance Computing Applications},
year = 1999,
volume = 13,
number = 3,
pages = {277--288},
month = {Fall},
abstract = {Jumpshot is a graphical tool for understanding the performance of parallel programs. It is in the tradition of the upshot tool but contains a number of extensions and enhancements that make it suitable for large-scale parallel computations. Jumpshot takes as input a new, more flexible logfile format and comes with a library for generating such logfiles. An MPI profiling library is also included, enabling the automatic generation of such logfiles from MPI programs. Jumpshot is written in Java and can easily be integrated as an applet into browser-based computing environments. The most novel feature of Jumpshot is its automatic detection of anomalous durations, drawing the user's attention to problem areas in a parallel execution. This capability is particularly useful in large-scale parallel computations containing many events.}
}

@Article{BegVin99:transport,
author = {S. Bergeron and A. Vincent},
title = {Implementation strategies for real-time particle transport solver},
journal = {Computer Physics Communications},
year = 1999,
volume = 120,
number = {2--3},
month = AUG,
pages = {177--184},
abstract = {Many problems in physics and engineering involve the transport of solid particles in a turbulent field. In some cases, it is desirable to study the transport of those particles in ``real time''. The prediction of erosion in the rotating part of hydraulic turbines is such a problem. This paper presents a semi-analytic predictor-corrector scheme adapted to the case of a rotating frame of reference. Simplification, related to the interpolation scheme required, is discussed as well as a parallel implementation using MPI on 10Base-T Ethernet interconnected workstations. The 3D solver is coupled with a high performance visualization software. Performance then shows a quasi-linear speedup.}
}



@Article{BruFagRes99:meta,
author = {M. A. Brune and G. E. Fagg and M. M. Resch},
title = {Message-passing environments for metacomputing},
journal = {Future Generation Computer Systems},
year = 1999,
volume = 15,
number = {5--6},
month = OCT,
pages = {699--712},
abstract = {The PACX-MPI approach offers a transparent interface for the communication between two or more MPI environments. PVAMPI allows the user spawning parallel processes under the MPI environment. The PLUS protocol bridges the gap between vendor-specific (e.g., MPL, NX, and PARIX) and vendor-independent message-passing environments (e.g., PVM and MPI). Moreover, it offers the ability to create and control processes at application runtime.}
}

@Article{ResRanSto99:meta,
author = {M. M. Resch and D. Rantzau and R. Stoy},
title = {Metacomputing experience in a transatlantic wide area application test-bed},
journal = {Future Generation Computer Systems},
year = 1999,
volume = 15,
number = {5--6},
month = OCT,
pages = {807--816},
abstract = {In the frame of a G7 initiative the High Performance Computing Center Stuttgart (HLRS) together with the Pittsburgh Supercomputing Center (PSC) and Sandia National Laboratories (SNL) has set up a transatlantic wide area application test-bed in 1997. A dedicated ATM-Link was installed that connected German research networks to vBNS and ESnet. During 1 year this test-bed was extensively used for metacomputing and collaborative working. Two applications - one from computational fluid dynamics and one from molecular dynamics - were adapted and run on the test-bed. For message-passing an MPI library was implemented that supports metacomputing. An already existing software for collaborative visualization was adapted for that scenario. This article describes the technical background of the cooperation, results that have been achieved for the two applications so far and lessons that have been learned. Special emphasis will be given to future work planned.}
}


@Article{Tho99:mpi-application,
author = {S. J. Thomas and M. Desgagne and R. Benoit},
title = {A real-time north American forecast at 10-km resolution with the {C}anadian {MC2 Meso-LAM}},
journal = {Journal of Atmospheric and Oceanic Technology},
year = 1999,
volume = 16,
number = 8,
pages = {1092-1101},
month = AUG,
abstract = {The next generation of high-performance computers will be based on clustersof shared-memory symmetric multip