% Papers using MPI % % This is a partial list, begun in late October, 1997. It is intended to give % an example of the range of applications that are known to be using % MPI. % Note that some author lists are incomplete; if you have a more % complete reference, please send it to gropp mcs.anl.gov . @String{apr-jul="April-July"} @String{aug-sep="August-September"} @String{may-jun="May-June"} @String{jul-aug="July-August"} @String{jan-feb="Jan.-Feb."} @String{spr="Spring"} @String{feb-mar="Feb.-March"} @String{nov-dec="Nov.-Dec."} @String{win="Winter"} @String{jun-jul="June-July"} @String{jul-oct="July-Oct."} @String{mar-apr="Mar.-April"} @String{sep-oct="Sept.-Oct."} @STRING{lncs="{L}ecture {N}otes in {C}omputer {S}cience"} @STRING{sv="{S}pringer"} @Article{CooFinTseYor97:mpi-groups, author = {G. Cooperman and L. Finkelstein and M. Tselman and B. York}, title = {Constructing permutation representations for matrix groups}, journal = {Journal of Symbolic Computation}, year = 1997, volume = 24, number = {3--4}, month = {Sept.-Oct.}, pages = {471--488}, abstract = {The theory has been successfully tested on a representation of the sporadic simple group Ly, discovered by Lyons (1972). With no a priori assumptions, we find a permutation representation of degree 9606125 on a conjugacy class of subgroups of order 3, find the order of the resulting permutation group, and verify simplicity A Monte Carlo variation of the algorithm was used to achieve better space and time efficiency. The construction of the permutation representation required four CPU days on a SPARC-server 670MP with 64 MB. The permutation representation was used implicitly in the sense that the group element was stored as a matrix, and its permutation action on a ''point'' was determined using a pre-computed data structure. Thus, additional computations required little additional space. The algorithm has also been implemented using the MasPar MP-1 SIMD parallel computer and 8 SPARC-2's running under MPI. The results of those parallel experiments are briefly reviewed.} } @Article{AhuLon97:mpi-rk-scattering, author = {V. Ahuja and L. N. Long}, title = {A parallel finite-volume {R}unge-{K}utta algorithm for electromagnetic scattering}, journal = {Journal of Computational Physics}, year = 1997, volume = 137, number = 2, month = NOV, pages = {299--320}, abstract = {A 3D explicit finite volume algorithm has been developed to simulate scattering from complex geometries on parallel computers using structured body conformal curvilinear grids. Most simulations for practical 3D geometries require a large number of grid points for adequate spatial resolution making them suitable to parallel computation. The simulations have been carried out using a multi-block/zonal approach in the message passing paradigm on the SP-2. Each zone is placed on a separate processor and interprocessor communication is carried out using the Message Passing Library/Interface (MPL/MPI). Integration of Maxwell's equations is performed using the four-stage Runge-Kutta time integration method on a dual grid. This method of integrating on a staggered grid gives enhanced dissipative and dispersive characteristics. A scattered field formulation has been used and the Liao boundary condition is used at the outer nonreflecting boundary. The far zone transformation has also been implemented efficiently, using specialized MPL functions to evaluate the far zone scattering results. Results show extremely good comparisons for scattering from the sphere and the ogive with the exact solution and standard FDTD type algorithms. Comparisons for nonaxisymmetric targets like the NASA almond with experimental data has also been found to be extremely good.} } @Article{GorBi98, author ="S. Gorlatch and H. Bischof", title = "A Generic MPI Implementation for a Data-Parallel Skeleton: Formal Derivation and Application to FFT", journal = "Parallel Processing Letters", volume=8, number=4, month=DEC, year=1998, pages={447--458}, abstract = "We derive a provably correct, architecture-independent family of parallel implementations for a class of data-parallel algorithms, called DH (distributable homomorphisms). The implementations are well-structured SPMD programs with group-wise personalized all-to-all exchange, directly realizable in MPI. As a case study, we systematically adjust the mathematical specification of the Fast Fourier Transform (FFT) to the DH format and, thereby, obtain a generic SPMD implementation for FFT. The target program includes FFT solutions used in practice -- the binary-exchange and the 2D- and 3D-transpose -- as special cases." } @Article{YevCinZhu98:mpi-groundwatersim, author = {G. Yevi and P. Cinnella and X. Zhuang}, title = {On parallelizing a groundwater pollution simulator}, journal = {Applied Mathematics and Computation}, year = 1998, volume = 89, number = {1-3}, month = {Jan.-Feb.}, pages = {313--325}, abstract = {Domain decomposition strategies and computational mesh reordering are discussed for finite difference parallel simulations of groundwater contaminants transport. The parallel performance of point iterative methods traditionally used in groundwater pollution modelling is studied. The algorithms were implemented with red-black and wavefront reordering of the computational mesh. A standard conservative transport equation defined on a two-dimensional grid with Dirichlet boundary conditions was used for the analysis. Completely portable multiple instructions multiple data (MIMD) implementations of the algorithm were performed using message-passing interface (MPI). The runtimes of the algorithms are presented as a function of grid refinement and number of processors, and the communication overhead of the parallel simulation process is investigated, showing that the red-black reordering technique yields the best performance results. The method also provides higher efficiency and scalability when applied to large-scale problems. Optimal parameters are suggested for parallel simulation of groundwater pollution using finite difference schemes.} } @Article{Ian97:mpi-reducescatter, author = {G. Iannello}, title = {Efficient algorithms for the reduce-scatter operation in {LogGP}}, journal = {IEEE Transactions on Parallel and Distributed Systesm}, year = 1997, volume = 8, number = 9, month = SEP, pages = {970--982}, abstract = {We consider the problem of efficiently performing a reduce-scatter operation in a message passing system. Reduce-scatter is the composition of an element-wise reduction on vectors of n elements initially held by n processors, with a scatter of the resulting vector among the processors. In this paper, we present two algorithms for the reduce-scatter operation, designed in LogGP. The first algorithm assumes an associative and commutative reduction operator and it is optimal in LogGP within a small constant factor. The second algorithm allows the reduction operator to be noncommutative, and it is asymptotically optimal when values to be combined are large arrays. To achieve these results, we developed a complete analysis of both algorithms in LogGP, including the derivation of lower bounds for the reduce-scatter operation, and the study of the m-item version of the problem, i.e., the case when the initial elements are vectors themselves. Reduce-scatter has been included as a collective operation in the MPI standard message passing library, and can be used, for instance, in parallel matrix-vector multiply when the matrix is decomposed by columns. To model a message passing system, we adopted the LogGP model, an extension of LogP that allows the modeling of messages of different length. While this choice makes the analysis somewhat more complex, it leads to more realistic results in the case of gather/scatter algorithms.} } @Article{YuaSalBalMel97:mpi-load-balancing, author = {X. Yuan and G. Salisbury and D. Balsara and R. Melhem}, title = {A load balancing package on distributed memory systems and its application to particle-particle particle-mesh ({P3M}) methods}, journal = {Parallel Computing}, year = 1997, volume = 23, number = 10, month = NOV, pages = {1525--1544}, abstract = {We present a tool, Bisect, for balanced decomposition of spatial domains. In addition to applying a nested bisection algorithm to determine the boundaries of each subdomain, Bisect replicates a user specified zone along the boundaries of the subdomain in order to minimize future interactions between subdomains, Results of running the tool on the Cray T3D system using both shared memory operations and MPI communications are reported and discussed. In addition, Bisect is used in a parallel implementation of a particle-particle/particle-mesh (P3M) simulation program on the Cray T3D system. The performance of the P3M program with different load-balancing criteria is evaluated and compared. The results show that the use of the Bisect package balances the load efficiently and minimizes communication on the T3D massively parallel system.} } @Article{FosKohKriCho97:mpi-task-parallel, author = {I. Foster and D. R. Kohr and R. Krishnaiyer and A. Choudhary}, title = {A library-based approach to task parallelism in a data-parallel language}, journal = {Journal of Parallel and Distributed Computing}, year = 1997, volume = 45, number = 2, month = SEP, pages = {148--158}, abstract = {Pure data-parallel languages such as High Performance Fortran version 1 (HPF) do not allow efficient expression of mixed task/data-parallel computations or the coupling of separately compiled data-parallel modules, In this paper, we show how these common parallel program structures can be represented, with only minor extensions to the HPF model, by using a coordination library based on the Message Passing Interface (MPI). This library allows data-parallel tasks to Exchange distributed data structures using falls to simple communication functions. We present microbenchmark results that characterize the performance of this library and that quantify the impact of optimizations that allow reuse of communication schedules in common situations, In addition, results from two-dimensional FFT, convolution, and multiblock programs demonstrate that the HPF/MPI library can provide performance superior to that of pore HPF, We conclude that this synergistic combination of two parallel programming standards represents a useful approach to task parallelism in a data-parallel framework, increasing the range of problems addressable in HPF without requiring complex compiler technology.} } @Article{BruGehRei97:mpi-resource-mgmt, author = {M. Brune and J. Gehring and A. Reinefeld}, title = {Heterogeneous message passing and a link to resource management}, journal = {Journal of Supercomputing}, year = 1997, volume = 11, number = 4, pages = {355--369}, abstract = {PLUS is a light-weight, extensible and efficient communication interface. with only four commands, PLUS is almost transparent to the application code. Our current implementation supports inter-process communication between PVM, MPI and PARIX, but it can be easily extended to other vendor-specific message passing Libraries. As PLUS has been designed for wide area networks, much effort has been spent on portability and on optimizing the communication speed across internet and also intranet links.} } @Article{Hor97, author = {K. Hori}, title = {Supercomputer {SX-4} multinode system}, journal = {NEC Research \& Development}, year = 1997, volume = 38, number = 4, pages = {461--473}, abstract = {The NEC supercomputer SX-4 multinode system series consists of two models, one being HIPPI (High Performance Parallel Interface)-connected model and the other IXS (Internode Crossbar Switch)-connected model. With the IXS, a proprietary high-speed crossbar switch, the HPC (High Performance Computing) up to 1 TFLOPS (Tera Flops) has been enabled by providing the most comprehensive environment for distributed parallel processing. This also means the world's first implementation of a clustered parallel processing. In this paper, we describe the functions of IXS hardware, the new operating system functions, MPI/SX the MPI (Message Passing Interface) processor and NQS/MPI which supports the close cooperation between NQS (Network Queuing System) batch processing system and MPI.} } @Article{Fac97:mpi-load-balance, author = {A. Fachat and K. H. Hoffmann}, title = {Implementation of ensemble-based simulated annealing with dynamic load balancing under {MPI}}, journal = {Computer Physics Communications}, year = 1997, volume = 107, number = {1--3}, month = DEC, pages = {49--53}, abstract = {This paper describes an implementation of Ensemble Based Simulated Annealing (EBSA) with dynamic load balancing. It is running under the MPI Message Passing Library allowing parallel execution on various types of computers. The load balancing is used to get maximum use of the available processing power, even on heterogeneous workstation clusters where the machines differ a lot in computing power.} } @Article{BarHau98:mpi-app, author = {E. Baron and P. H. Hauschildt}, title = {Parallel implementation of the phoenix generalized stellar atmosphere program. {II}. Wavelength parallelization}, journal = {Astrophysical Journal}, year = 1998, volume = 495, number = {1 part 1}, month = MAR, pages = {370--376}, abstract = {We describe an important addition to the parallel implementation of our generalized nonlocal thermodynamic equilibrium (NLTE) stellar atmosphere and radiative transfer computer program PHOENIX. In a previous paper in this series we described data and task parallel algorithms we have developed for radiative transfer, spectral line opacity, and NLTE opacity and rate calculations. These algorithms divided the work spatially or by spectral lines, that is, distributing the radial zones, individual spectral lines, or characteristic rays among different processors and employ, in addition, task parallelism for logically independent functions (such as atomic and molecular line opacities). For finite, monotonic velocity fields, the radiative transfer equation is an initial value problem in wavelength, and hence each wavelength point depends upon the previous one. However, for sophisticated NLTE models of both static and moving atmospheres needed to accurately describe, e.g., novae and supernovae, the number of wavelength points is very large (200,000-300,000) and hence parallelization over wavelength can lead both to considerable speedup in calculation time and the ability to make use of the aggregate memory available on massively parallel supercomputers. Here, we describe an implementation of a pipelined design for the wavelength parallelization of PHOENIX, where the necessary data from the processor working on a previous wavelength point is sent to the processor working on the succeeding wavelength point as soon as it is known. Our implementation uses a MIMD design based on a relatively small number of standard message passing interface (MPI) library calls and is fully portable between serial and parallel computers.} } @Article{Yas98:complex-flows, author = {O. Yasar}, title = {A scalable model for complex flows}, journal = {Computers and Mathematics with Applications}, year = 1998, volume = 35, number = 7, month = APR, pages = {117-128}, abstract = {We describe a scalable parallel algorithm for numerical simulations of turbulent, radiative, magnetized, and reactive fluid + particle systems on message-passing distributed-memory computers. Accurate simulation of such complex flows has applications in engine combustion, industrial pulverized coal burners, astrophysics, inertial confinement fusion, nuclear systems, and many other strategically and economically important areas. Our algorithm has been developed based on a widely-used combustion code KIVA-3, a plasma and radiation hydrodynamics code R-MHD, a classical particle dynamics code CMDT, and a discrete ordinates particle transport code TORT. The development is being done on the Intel Paragon with PVM and MPI extensions. We report high levels of parallel efficiency and scalability (up to 1024 nodes) for a baseline engine test case, using our current message-passing reactive and turbulent flow code. The three-dimensional extension of radiation magnetohydrodynamics component is still being worked at and we hope to report further progress in the future.} } @Article{LepSchHei98:reactive-flow, author = {J. Lepper and U. Schnell and K. R. G. Hein}, title = {Parallelization of a simulation code for reactive flows on the Intel Paragon}, journal = {Computers and Mathematics with Applications}, year = 1998, volume = 35, number = 7, month = APR, pages = {101-109}, abstract = {The paper shows the implementation of a 3D simulation code for turbulent how and combustion processes in full-scale utility boilers on an Intel Paragon XP/S computer. For the portable parallelization, an explicit approach is chosen using a domain decomposition method for the static subdivision of the numerical grid together with the SPMD programming model. The measured speedup for the presented case using a coarse grid is good, although some numerical requirements restrict the implemented message passing to strongly synchronized communication. On the Paragon, the NX message passing library is used for the computations. Furthermore, MPI and PVM are applied and their pros and cons on this computer are described. In addition to the basic message passing techniques for local and global communication, other possibilities are investigated. Besides the applicability of the vectorizing capability of the compiler, the influence of the I/O performance during computations is demonstrated. The scalability of the parallel application is presented for a refined discretization.} } @Article{Gor98:fft, author = {S. Gorlatch}, title = {Programming with divide-and-conquer skeletons: A case study of {FFT}}, journal = {Journal of Supercomputing}, year = 1998, volume = 12, number = {1-2}, pages = {85-97}, } @Article{Hio98:qcd, author = {S. Hioki}, title = {{QCDMPI}---pure {QCD} Monte Carlo Simulation code with MPI}, journal = {Nuclear Physics B-Proceedings Supplements}, year = 1998, volume = 63, month = APR, pages = {1000--1002}, abstract = {In this paper, outline of QCDMPI is reported. Comparison of the performances on several parallel machines; AP1000, AP1000+, AP3000, Cenju-3, Paragon, SR2201 and Workstation Cluster, is also reported.} } @Article{Han98:mpi-eval, author = {P. B. Hansen}, title = {An evaluation of the message-passing interface}, journal = {ACM Sigplan Notices}, year = 1998, volume = 33, number = 3, month = MAR, pages = {65--72}, abstract = {The Message-Passing Interface (MPI) is evaluated by rewriting message parallel programs for Householder reduction, matrix multiplication, and successive overrelaxation. The author concludes that MPI is a practical programming tool. It does, however, lack the elegance and security that can only be achieved by a parallel programming language.} } @Article{Iss98:cfd-precond, author = {E. Issman}, title = {Non-overlapping preconditioners for a parallel implicit Navier-Stokes solver}, journal = {Future Generation Computer Systems}, year = 1998, volume = 13, number = {4--5}, month = MAR, pages = {303-313}, abstract = {Parallel implicit iterative solution techniques are considered for application to a compressible hypersonic Navier-Stokes solver on unstructured meshes. The construction of parallel preconditioners with quasi-optimal convergence properties with respect to their serial counterpart is a key issue in the design of modern parallel implicit schemes, Two types of non-overlapping preconditioners are presented and compared. The first one is an additive Schwarz preconditioner requiring overlapping of the mesh and the second one is based on a Schur complement formulation. Both are using incomplete LU factorisation at the subdomain level but scale differently. Results are presented for computations on the Cray T3D under the message passing interface MPI. } } @Article{Bar98:migration, author = {A. Barak}, title = {The MOSIX multicomputer operating system for high performance cluster computing}, journal = {Future Generation Computer Systems}, year = 1998, volume = 13, number = {4--5}, month = MAR, pages = {361-372}, abstract = {The scalable computing cluster at Hebrew University consists of 88 Pentium II and Pentium-Pro servers that are connected by fast Ethernet and the Myrinet LANs. It is running the MOSIX operating system, an enhancement of BSD/OS with algorithms for adaptive resource sharing, that are geared for performance scalability in a scalable computing cluster. These algorithms use a preemptive process migration for load-balancing and memory ushering, in order to create a convenient multiuser time-sharing execution environment for HPC, particularly for applications that are written in PVM or MPI. This paper begins with a brief overview of MOSIX and its resource sharing algorithms. Then the paper presents the performance of these algorithms as well as the performance of several large-scale, parallel applications.} } @Article{Rei97:interop, author = {A. Reinefeld}, title = {Communicating across parallel message-passing environments}, journal = {Journal of Systems Architecture}, year = 1997, volume = 44, number = {3--4}, month = DEC, pages = {261--272}, abstract = {We present a small, extensible interface for the transparent communication between vendor-specific and standard message-passing environments. With only four new commands, existing parallel applications can make use of our PLUS communication interface, thereby allowing inter-process communication with other programming environments. Much effort has been spent in optimizing the communication speed across Internet and Intranet links. Our current implementation supports process communication between PVM, MPI, and PARIX. With only marginal additional effort, the interface can be adapted to support other message-passing environments as well.} } @Article{hom97:mpi-maxcup, author = {S. Homer}, title = {Design and performance of parallel and distributed approximation algorithms for maxcut}, journal = {Journal of Parallel and Distributed Computing}, year = 1997, volume = 41, number = 1, pages = {48--61}, month = OCT, abstract = { We develop and experiment with a new parallel algorithm to approximate the maximum weight cut in a weighted undirected graph, Our implementation starts with the recent (serial) algorithm of Goemans and Williamson for this problem, We consider several different versions of this algorithm, varying the interior-point part of the algorithm in order to optimize the parallel efficiency of our method, Our work aims for an efficient, practical formulation of the algorithm with close-to-optimal parallelization. We analyze our parallel algorithm in the LogP model and predict linear speedup for a wide range of the parameters, We have implemented the algorithm using the message passing interface (MPI) and run it on several parallel machines. In particular, we present performance measurements on the IBM SP2, the Connection Machine CM5, and a cluster of workstations, We observe that the measured speedups are predicted well by our analysis in the LogP model, Finally, we test our implementation on several large graphs (up to 13,000 vertices), particularly on large instances of the Ising model.} } @Article{War:mpi-cluster, author = {T. M. Warschko}, title = {ParaStation: Efficient parallel computing by clustering workstations: Design and evaluation}, journal = {Journal of Systems Architecture}, year = 1997, volume = 44, number = {3--4}, pages = {241--260}, month = DEC, abstract = {ParaStation is a communications fabric for connecting off-the-shelf workstations into a supercomputer. The fabric employs technology used in massively parallel machines and scales up to 4096 nodes, ParaStation's user-level message passing software preserves the low latency of the fabric by taking the operating system out of the communication path, while still providing full protection in a multiprogramming environment. The programming interface presented by ParaStation consists of a UNIX socket emulation and widely used parallel programming environments such as PVM, P4, and MPI. Implementations of ParaStation using various platforms, such as Digitals AlphaGeneration workstations and Linux PCs, achieve end-to-end (process-to-process) latencies as low as 2 mu s and a sustained bandwidth of up to 15 Mbyte/s per channel, even with small packets. Benchmarks using PVM on ParaStation demonstrate real application performance of 1 GFLOP on an 8-node cluster.} } @Article{War98:mpi-cluster, author = {T. M. Warschko}, title = {The {ParaStation} project: Using workstations as building blocks for parallel computing}, journal = {Information Sciences}, year = 1998, volume = 106, number = {3--4}, pages = {277--292}, month = MAY, abstract = {The ParaStation communication fabric provides a high-speed communication network with user-level access to enable efficient parallel computing on workstation clusters. The architecture, implemented on off-the-shelf workstations coupled by the ParaStation communication hardware, removes the kernel and common network protocols from the communication path while still providing full protection in a multiuser, multiprogramming environment. The programming interface presented by ParaStation consists of a UNIX socket emulation and widely used parallel programming environments such as PVM, P4, and MPI. This allows porting a wide range of client/server and parallel applications to the ParaStation architecture. Implementations of ParaStation using various platforms, such as Digital's AlphaGeneration workstations and Linux PCs, achieve end-to-end (process-to-process) latencies as low as 2 mu s and a sustained bandwidth of up to 15 Mbyte/s per channel with small packets. Benchmarks using PVM on ParaStation demonstrate real application performance of 1 GFLOP on an 8-node cluster. } } @Article{Dan98:mpi-scheduling, author = {M. A. R. Dantas}, title = {Efficient scheduling of {MPI} applications on networks of workstations}, journal = {Future Generation Computer Systems}, year = 1998, volume = 13, number = 6, pages = {489--499}, month = MAY, abstract = {The availability of a large number of workstations connected through a network can represent an attractive option for high-performance computing for many applications. The message-passing interface (MPI) software environment is an effort from many organisations to define a de facto message-passing standard. In other words, the original specification was not designed as a comprehensive parallel programming environment and some researchers agree that the standard should be preserved as simple and clean as possible. Nevertheless, a software environment such as MPI should have somehow a scheduling mechanism for the effective submission of parallel applications on network of workstations. This paper presents an alternative lightweight approach called Selective-MPI (S-MPI), which was designed to enhance the efficiency of the scheduling of applications on an MPI implementation environment.} } @Article{Cou98:mpi-c++, author = {O. Coulaud}, title = {Para++: A high level {C++} interface for message passing}, journal = {Journal of Parallel and Distributed Computing}, year = 1998, volume = 51, number = 1, pages = {46--62}, month = MAY, abstract = {This paper describes a high level C++ interface for message passing applications. Our interface is built on top of PVM and MPI. The two main contributions are to allow a quicker design of parallel applications without any important drop of performances. We introduce two levels of tasks and use C++ streams for communications. We also present a performance study over both PVM and MPI to show the overhead of our implementation. Finally, we detail two applications based on the heat equation to explain how lPara++ call be used for SPMD and MPMD applications.} } @Article{Sal98:mpi-genetic, author = {A. Salhi}, title = {Parallel implementation of a genetic-programming based tool for symbolic regression}, journal = {Information Processing Letters}, year = 1998, volume = 66, number = 6, pages = {299-307}, month = JUN, abstract = {We report on a parallel implementation of a tool for symbolic regression, the algorithmic mechanism of which is based on genetic programming, and communication is handled using MPI. The implementation relies on a random islands model (RIM), which combines both the conventional islands model where migration of individuals between islands occurs periodically and niching where no migration takes place. The system was designed so that the algorithm is synergistic with parallel/distributed architectures, and works to make use of processor time and minimum use of network bandwidth without complicating the sequential algorithm significantly. Results on an IBM SP2 are included. } } @Article{Har98:mpi-application, author = {H. K. Harbury}, title = {Parallel computation for electronic waves in quantum corrals}, journal = {VLSI Design}, year = 1998, volume = 6, number = {1--4}, pages = {57--51}, abstract = {Recent scanning tunneling microscopy (STM) studies on the (111) faces of noble metals have directly imaged electronic surface-confined states and dramatic standing-wave patterns have been observed [1,2]. We solve for the local density of electronic states in these ''leaky'' quantum corral confinement structures using a coherent elastic scattering theory. We seek solutions of the two-dimensional Schrodinger equation compatible with non-reflecting boundary conditions which asymptotically satisfy the Sommerfeld radiation condition [11,14]. The large matrices generated by the discretization of realistic quantum corral structures require the use of sparse matrix methods. In addition, a parallel finite element solution was undertaken using the message passing interface standard (MPI) and the Portable, Extensible, Toolkit for Scientific Computation (PETSc) [5] for an efficient computational solution on both distributed and shared memory architectures. Our calculations reveal excellent agreement with the reported experimental dI/dV STM data.} } @Article{Jak98:mpi-application, author = {U. Jakobus}, title = {Analysis of electromagnetic scattering problems by an iterative combination of {MoM} with {GMT} using {MPI} for the communication}, journal = {Microwave and Optical Technology Letters}, year = 1998, volume = 19, number = 1, pages = {1--4}, month = SEP, abstract = {A hybrid method is proposed combining the method of moments (MoM) with the generalized multipole technique (GMT) for the efficient analysis of electromagnetic radiation and scattering problems involving metallic as well as dielectric bodies. An iterative coupling scheme is applied so that only some small changes to the MoM and GMT formulations are required, making it very attractive for the combination of already existing MoM and GMT codes. During the iteration, the MoM and GMT processes are executed in parallel, and communication is done using the message-passing interface (MPI).} } @Article{Ril98:mpi-application, author = {C. J. Riley}, title = {Distributed-memory computing with the {L}angley {A}erothermodynamic {U}pwind {R}elaxation {A}lgorithm {(LAURA)}}, journal = {Advances in Engineering Software}, year = 1998, volume = 29, number = {3--6}, pages = {317--324}, month = APR-JUL, abstract = {The Langley Aerothermodynamic Upwind Relaxation Algorithm (LAURA), a Navier-Stokes solver, has been modified for use in a parallel, distributed-memory environment using the Message-Passing Interface (MPI) standard. A standard domain decomposition strategy is used in which the computational domain is divided into subdomains with each subdomain assigned to a processor. Performance is examined on dedicated parallel machines and a network of desktop workstations. The effect of domain decomposition and frequency of boundary updates on performance and convergence is also examined for several realistic configurations and conditions typical of large-scale computational fluid dynamic analysis.} } @Article{Wan98:mpi-application, author = {P. Wang}, title = {Massively parallel finite volume computation of three-dimensional thermal convective flows}, journal = {Advances in Engineering Software}, year = 1998, volume = 29, number = {3--6}, pages = {307--315}, month = APR-JUL, abstract = {A parallel implementation of the finite volume method for three-dimensional, time-dependent, thermal convective flows is presented. The algebraic equations resulting from the finite volume discretization are solved by a parallel multigrid method. A flexible parallel code has been implemented on distributed-memory systems, by using domain decomposition techniques and the MPI communication software. The code uses one-, two- or three-dimensional partition according to different geometries. It currently runs on the Intel Paragon, the Cray T3D, T3E, the IBM SP2 and the Beowulf systems, which can be ported easily to other parallel systems. A comparison of the wallclock time of the code between these systems is made, and code performances with respect to different numbers of processors are presented.} } @Article{Dan98:mpi-application, author = {K. T. Danielson}, title = {Nonlinear dynamic finite element analysis on parallel computers using {FORTRAN} 90 and {MPI}}, journal = {Advances in Engineering Software}, year = 1998, volume = 29, number = {3--6}, pages = {179--186}, month = APR-JUL, abstract = {A nonlinear explicit dynamic finite element code for use on scalable computers is presented. The code was written entirely in FORTRAN 90, but uses MPI for all interprocessor communication. Although MPI is not formally a standard for FORTRAN 90, the code runs properly in parallel on CRAY T3E, IBM SP, and SGI ORIGIN 2000 computing systems. Issues regarding the installation, portability, and effectiveness of the FORTRAN 90-MPI combination on these machines are discussed. An algorithm that overlaps message passing and computations of the explicit finite element equations is also presented and evaluated. Several large-scale ground-shock analyses demonstrate the varying combined importance of load balance and interprocessor communication among the different computing platforms. The analyses were performed on only a few to hundreds of processors with excellent speedup and scalability.} } @Article{Vat98:mpi-application, author = {V. N. Vatsa}, title = {Viscous pow computations for complex geometries on parallel computers}, journal = {Advances in Engineering Software}, year = 1998, volume = 29, number = {3--6}, month = APR-JUL, abstract = {A widely used computational fluid dynamics (CFD) code known as TLNS3D, which was developed for large, shared-memory computers, is ported to a distributed computing environment. An engineering approach is used here to parallelize this code so that minimal deviation from the original (non-parallel) code is incurred. A natural partitioning along grid blocks is adopted in which one or more blocks are distributed to each of the available processors. An automatic, static load-balancing strategy is employed for equitable distribution of computational work to specified processors. The message passing interface (MPI) protocols are incorporated for data communication. Both synchronous and asynchronous communication modes have been incorporated. As the number of processors is increased, the asynchronous communication mode shows much better scalability and clearly outperforms the synchronous mode of communication.} } @Article{Riv98:mpi-application, author = {W. RiveraGallego}, title = {A genetic algorithm for circulant Euclidean distance matrices}, journal = {Applied Mathematics and Computation}, year = 1998, volume = 97, number = {2--3}, pages = {197--208}, month = DEC, abstract = {This paper presents a fast genetic algorithm to determine three-dimensional configurations of points that generate circulant Euclidean Distance Matrices (EDMs). A parallel implementation is possible by using the message passing interface (MPI) standard. In addition, theoretical results about the polyhedral structure of both the cone of circulant symmetric positive semidefinite matrices and the cone of circulant EDMs are introduced.} } @Article{Ada98:mpi-application, author = {P. Adamidis}, title = {Steel strip production --- a pilot application for coupled simulation with several calculation systems}, journal = {Journal of Materials Processing Technology}, year = 1998, volume = {80-1}, pages = {330--336}, month = AUG-SEP, abstract = {For the simulation of technological and natural processes in specific application domains, efficient calculation software solving differential equation systems on grid-based computational models is available, especially in the area of computer-aided engineering (CAE). To handle a so-called 'multiphysics' problem, for example the fluid flow and metal forming process in a twin-roll casting arrangement for steel strip production, several calculation systems usually have to be employed in a high-performance computing environment, e.g. on parallel computers. The GRISSLi Coupling Interface is a software tool facilitating the coupled computation based on the message passing standard MPI.} } @Article{Dow98:mpi-implementation, author = {P. W. Dowd}, title = {{BLAST}: broadband lightweight {ATM} secure transport for high-performance distributed computing}, journal = {Computer Communications}, year = 1998, volume = 21, number = 12, pages = {1040--1057}, month = AUG, abstract = {This paper investigates the use of ATM for cluster-based computing. The need for a native ATM API is discussed as well as the performance of message passing libraries (MPL) that are written to use such an API to exploit the advantages of a high-speed network for cluster-based computing. The MPLs offer a standard interface, such as PVM or MPI, and interoperate with existing TCP/IP- and UDP/IP-based versions in addition to the ATM API environment. The interoperability extensions made to two MPLs, MPI and Prowess, which allow a hybrid environment of both ATM and TCP-based legacy network technology will be described. Shared object space (SOS), an extension to the MPLs, is described that helps support the geographically distributed computing (GDC) environment through latency hiding. It allows a user to develop applications in a shared memory type of environment. The native ATM API which supports cluster-based computing is described in this paper. This API provides a reliable transport interface to the MPL which has been optimized for an ATM environment. The transport protocol is a low-state design that optimizes the performance based on the available bandwidth, buffer constraints, propagation delay characteristics and security requirements of a particular connection.} } @Article{Kac98:mpi-tool, author = {P. Kacsuk}, title = {{GRADE}: A graphical programming environment for multicomputers}, journal = {Computers and Artificial Intelligence}, year = 1998, volume = 17, number = 5, pages = {417--427}, abstract = {To provide high-level graphical support for developing message passing programs, an integrated programming environment (GRADE) is being developed. GRADE currently provides tools to construct, execute, debug, monitor and visualise message-passing based parallel programs. GRADE offers the programmer an integrated graphical user interface during the whole life-cycle of program development and provides high-level graphical programming abstraction mechanisms to construct parallel applications. The current version of GRADE can generate C+PVM code but there is no theoretical obstacle to extend it for supporting MPI [9] and FORTRAN. Those new features of the GRADE graphical environment are described in the paper that enhanced GRADE towards a professional parallel programming environment.} } @Article{Ras98:mpi-application, author = {J. Rasch}, title = {6-dimensional integrals and supercomputers}, journal = {Computer Physics Communications}, year = 1998, volume = 114, number = {1--3}, pages = {378--384}, month = NOV, abstract = {Recently, a numerical method has been developed for the evaluation of general 6-dimensional integrals (6DIME), which has been successfully applied to the study of (e,2e) and (gamma,2e) processes. Details of the parallelization of that code are given using MPI and the scaling behaviour with respect to the number of nodes is presented. Almost full load balancing is obtained.The method is extended to include two centre scattering problems.} } @Article{Chu98:mpi-balancing, author = {Y. Chung}, title = {An asynchronous algorithm for balancing unpredictable workload on distributed-memory machines}, journal = {ETRI Journal}, year = 1998, volume = 20, number = 4, pages = {346--360}, month = DEC, abstract = {It is challenging to parallelize problems with irregular computation and communication. In this paper, we propose an asynchronous algorithm for balancing unpredictable workload on distributed-memory machines. By using an initial workload estimate, we first partition the computations such that the workload is distributed evenly across the processors. In addition, we performtask migrations dynamically for adapting to the evolving workload. To demonstrate the usefulness of our load balancing strategy, we conducted experiments on an IBM SP2 and a Cray T3D. Experimental results show that our task migration strategy can balance unpredictable workload with little overhead. Our code using C and MPI is portable onto other distributed-memory machines.} } @Article{Ber99:mpi-tools, author = {M. Bertozzi}, title = {Tools for code optimization and system evaluation of the image processing system {PAPRICA-3}}, journal = {Journal of Systems Architecture}, year = 1999, volume = 45, number = {6--7}, pages = {519--542}, month = JAN, abstract = {This paper presents the complex environment that was built to ease the prototyping of real-time applications on the PAPRICA-3 massively parallel system. Applications are developed in C++ using high level data types and the corresponding Assembly code is automatically created by a code generator. A stochastic code optimizer takes the assembly code and improves it according to a genetic approach; due to the high computational power required by thisapproach, the stochastic code optimizer was implemented with MPI and runs in parallel on a cluster of workstations. The availability of this complex environment allowed to test the performance of the system and to tune it according to some target applications before the actual development of the hardware. For this purpose a system-level simulator was also built to determine the number of clock cycles required to run a specific segment of code. The whole environment has been used to validate possible solutions for the hardware system and to develop, test, and tune several real-time image processing applications. The hardware system is now completely defined.} } @Article{Lee99:mpi-applicatin, author = {P. C. S. Lee}, title = {On the parallelization of a global climate-chemistry modeling system}, journal = {Atmospheric Environment}, year = 1999, volume = 33, number = 4, pages = {675--681}, month = FEB, abstract = {Coupled climate-chemistry simulations are computationally intensive owing to the spatial and temporal scope of the problem. In global chemistry models, the time integrations encountered in the chemistry and aerosol modules usually comprise the major CPU consumption. Parallelization of these segmentsof the code can contribute to multifold CPU speed-ups with minimal modification of the original serial code. This technical note presents a single program-multiple data (SPMD) strategy applied to the time-split chemistry modules of a coupled climate - global tropospheric chemistry model. Latitudinal domain decomposition is adopted along with a dynamic load-balancing technique that uses the previous time-step's load/latitude estimates for distributing the latitude bands amongst the processors. The coupled model is manually parallelized using the Message Passing Interface standard (MPI) on a distributed memory platform (IBM-SP2), Load-balancing efficiencies and the associated MPI overheads are discussed. Overall speed-ups and efficiencies are also calculated for a series of runs employing up to eight processors.} } @Article{May99:mpi-application, author = {F. May}, title = {Mathematical modelling of glass melting furnace design with regard to {NOx} formation}, journal = {Glastechnische Berichte-Glass Science and Technology}, year = 1999, volume = 72, number = 1, pages = {1--6}, month = JAN, abstract = {A three-dimensional mathematical model for turbulent flow and combustion onthe basis of turbulence/chemistry interactions and radiative heat transfertaking into account spectral effects of surrounding walls and combustion gases is described. For this the transport equation for radiative intensity was split into different wavelength ranges. A block-structured finite volume grid with local refinements was used to solve the governing equations. The calculation domain is subdivided into a number of subdomains which are linked within the solver based on the Message Passing Interface library. Computed distributions of velocity, temperature, and heat fluxes are given. Results of a parametric study in a producing horseshoe furnace by increasing the height of the furnace with regard to NOx concentration distributions are presented.} } @Article{Reu99:mpi-application, author = {J. Reuther}, title = {Aerodynamic shape optimization of supersonic aircraft configurations via anadjoint formulation on distributed memory parallel computers}, journal = {Computers and Fluids}, year = 1999, volume = 28, number = {4--5}, pages = {675--700}, month = MAY-JUN, abstract = {This work describes the application of a control theory-based aerodynamic shape optimization method to the problem of supersonic aircraft design. A high fidelity computational fluid dynamics (CFD) algorithm modelling the Euler equations is used to calculate the aerodynamic properties of complex three-dimensional aircraft configurations. The design process is greatly accelerated through the use of both control theory and parallel computing. Control theory is employed to derive the adjoint differential equations whose solution allows for the evaluation of design gradient information at a fraction of the computational cost required by previous design methods. The resulting problem is then implemented in parallel using a domain decomposition approach, an optimized communication schedule, and the Message Passing Interface (MPI) Standard for portability and efficiency. In our earlier studies, the serial implementation of this design method, was shown to be effective for the optimization of airfoils, wings, wing-bodies, and complex aircraft configurations using both the potential equation and the Euler equations. In this work, our concern will be to extend the methodologies such that the combined capabilities of these new technologies can be used routinely and efficiently in an industrial design environment. The aerodynamic optimization of a supersonic transport configuration is presented as a demonstration test case of the capability, A particular difficulty of this test case is posed by the close coupling of the propulsion/airframe integration.} } @Article{Vat99:mpi-application, author = {V. N. Vatsa}, title = {Parallelization of a multiblock flow code: an engineering implementation}, journal = {Computers and Fluids}, year = 1999, volume = 38, number = {4--5}, pages = {603--614}, month = MAY-JUN, abstract = {Current trends in computer hardware are dictating a gradual shift toward the use of clusters of relatively inexpensive but powerful workstations, or massively parallel processing (MPP) machines, for scientific computing. However, most computational fluid dynamics (CFD) codes in use today were developed for large, shared-memory machines and are not readily portable to the distributed computing environment. One major hurdle in porting CFD codes to distributed computing platforms is the difficulty encountered in partitioning the problem so that the computation-to-communication ratio for each compute node (process) is maximized and the idle time during which one node waits for other nodes to transfer data is minimized. In the present work, pertinent issues involved in the parallelization of a widely used multiblock Navier-Stokes code TLNS3D are discussed. An engineering; approach is used here to parallelize this code so that minimal deviation from the original (nonparallel) code is incurred. A natural partitioning along grid blocks is adopted in which one or more blocks are distributed to each of the available nodes. An automatic, static load-balancing strategy is employed for equitable distribution of computational work to specified nodes. Both parallel Virtual machine (PVM) and message passing interface (MPI) protocols are incorporated for data communication to allow maximum portability to a wide range of computer configurations. Results are presented that are comparable with apriori estimates of performance for distributed computing and that are competitive in terms of central processing unit (CPU) time and wall time usagewith large, shared-memory supercomputers.} } @Article{Dzw99:mpi-application, author = {W. Dzwinel}, title = {Method of particles in visual clustering of multi-dimensional and large data sets}, journal = {Future Generation Computer Systems}, year = 1999, volume = 15, number = 3, pages = {365--379}, month = APR, abstract = {A method dedicated for visual clustering of N-dimensional data sets is presented. It is based on the classical feature extraction technique - the Sammon's mapping. This technique empowered by a particle approach used in the Sammon's criterion minimization makes the method more reliable, general and efficient. To show its reliability, the results of tests are presented, which were made to exemplify the algorithm 'immunity' from data errors. The general character of the method is emphasized and its role in multicriterial analysis discussed. Due to inherent parallelism of the methods, which are based on the particle approach, the visual clustering technique can be implemented easily in parallel environment. It is shown that parallel realization of the mapping algorithm enables the visualization of data sets consisting of more than 10(4) multi-dimensional data points. The method was tested in the PVM, MPI and data parallel environments on an HP/Convex SPP/1600. In this paper, the authors compare the parallel algorithm performance for these three interfaces. The approach to visual clustering, presented in the paper, can be used in visualization and analysis of large multi-dimensional data sets. } } @Article{Wan99:mpi-application, author = {P. Wang}, title = {Parallel multigrid finite volume computation of three-dimensional thermal convection}, journal = {Computers and Mathematics with Applications}, year = 1999, volume = 37, number = 9, pages = {49-60}, month = MAY, abstract = {A parallel implementation of the finite volume method for three-dimensional, time-dependent, thermal convective flows is presented. The algebraic equations resulting from the finite volume discretization, including a pressureequation which consumes most of the computation time, are solved by a parallel multigrid method. A flexible parallel code has been implemented on theIntel Paragon, the Cray T3D, and the IBM SP2 by using domain decompositiontechniques and the MPI communication software. The code can use 1D, 2D, or3D partitions as required by different geometries, and is easily ported toother parallel systems. Numerical solutions for air (Prandtl number Pr = 0.733) with various Rayleigh numbers up to 10(7) are discussed.} } @Article{Bar99:mpi-application, author = {S. T. Barnard}, title = {An {MPI} implementation of the {SPAI} preconditioner on the {T3E}}, journal = {International Journal of High Performance Computing Applications}, year = 1999, volume = 13, number = 2, pages = {107--123}, month = {Summer}, abstract = {The authors describe and test spai-1.1, a parallel MPI implementation of the sparse approximate inverse (SPAI) preconditioner. They show that SPAI canbe very effective for solving a set of very large and difficult problems on a Cray T3E. The results clearly show the value of SPAI (and approximate inverse methods in general) as the Viable alternative to ILU-type methods when facing very large and difficult problems. The authors strengthen this conclusion by showing that spai-1.1 also has very good scaling behavior.} } @Article{Ree99:mpi-application, author = {J. S. Reeve}, title = {An efficient parallel version of the {Householder-QL} matrix diagonalisation algorithm}, journal = {Parallel Computing}, year = 1999, volume = 25, number = 3, pages = {311-319}, month = MAR, abstract = {In this paper we report an effective parallelisation of the Householder routine for the reduction of a real symmetric matrix to tri-diagonal form and the QL algorithm for the diagonalisation of the resulting matrix. The Householder algorithm scales like alpha N-3/P + beta N(2)log(2)(P) and the QL algorithm like gamma N-2 + delta N-3/P as the number of processors P is increased for fixed problem size. The constant parameters alpha, beta, gamma anddelta are obtained empirically. When the eigenvalues only are required theHouseholder method scales as above while the QL algorithm remains sequential. The code is implemented in c in conjunction with the message passing interface (MPI) libraries and verified on a sixteen node IBM SP2 and for realmatrices that occur in the simulation of properties of crystaline materials.} } @Article{Gen99:mpi-application, author = {C. Gennaro}, title = {Parallelising the Mean Value Analysis algorithm}, journal = {Transactions of the Society for Computer Simulation International}, year = 1999, volume = 16, number = 1, pages = {16--22}, month = MAR, abstract = {The Mean Value Analysis (MVA) algorithm is one of the most popular for evaluating the performance of separable (or product-form) queueing networks. Although its complexity is modest when jobs are indistinguishable, the introduction of different customer classes rapidly increases is computational cost. The problems of parallelising the algorithm while retaining its conceptual simplicity are examined. In particular, a parallel implementation of MVAon a distributed memory machine is developed using the MPI library for communication.} } @Article{Ble99:mpi-application, author = {G. E. Blelloch}, title = {Design and implementation of a practical parallel {D}elaunay algorithm}, journal = {Algorithmica}, year = 1999, volume = 24, number = {3--4}, pages = {243--269}, month = JUL-AUG, abstract = {Initial experiments using a variety of distributions showed that our parallel algorithm was within a factor of 2 in work from the best sequential algorithm. Based on these promising results, the algorithm was implemented using C and an MPI-based toolkit. Compared with previous work, the resulting implementation achieves significantly better speedups over good sequential code, does not assume a uniform distribution of points, and is widely portable due to its use of MPI as a communication mechanism. Results are presentedfor the IBM SP2, Cray T3D, SGI Power Challenge, and DEC AlphaCluster.} } @Article{Coe99:mpi-application, author = {P. J. Coelho}, title = {Modelling of a utility boiler using parallel computing}, journal = {Journal of Supercomputing}, year = 1999, volume = 13, number = 2, pages = {211-232}, month = MAR, abstract = {A mathematical model for the simulation of the turbulent reactive flow and heat transfer in a power station boiler has been parallelized. The mathematical model is based on the numerical solution of the governing equations for mass, momentum, energy and transport equations for the scalar quantities.The k-epsilon model and the conserved scalar/prescribed probability density function formalism are employed. Radiative heat transfer is calculated using the discrete ordinates method. The code has been fully parallelized using the spatial domain decomposition approach and MPI. Calculations were performed using an IBM-SP2. It is shown that the computational requirements are reduced and the parallel efficiency increases if the mean temperature anddensity are calculated a priori, and stored. The role of the different parts of the code on the parallel performance is discussed. A speedup of 5.9 is achieved using 8 processors.} } @Article{Rus99:mpi-cluster, author = {S. H. Russ}, title = {Using {Hector} to run {MPI} programs over networked workstations}, journal = {Concurrency Practice and Experience}, year = 1999, volume = 11, number = 4, pages = {189--204}, month = APR, abstract = {Networked workstations represent an increasingly popular distributed platform for running large parallel programs. They can present a low-cost alternative to purchasing supercomputer time or additional usable computational capability, Several capabilities are desirable in order to harness workstations, including support for a widely accepted parallel programming environment, task migration, intelligent resource allocation, fault tolerance, and totally transparent support of these features. The Hector system is designed to provide these capabilities to MPI programs. The structure of the system and experiences using the system on loaded workstations to run scientific codes are described.} } @Article{Ros99:mpi-tool, author = {T. Rossi}, title = {SIAM Journal on Scientific Computing}, journal = {A parallel fast direct solver for block tridiagonal systems with separable matrices of arbitrary dimension}, year = 1999, volume = 20, number = 5, pages = {1778-1796}, month = MAY, abstract = {A parallel fast direct solution method for linear systems with separable block tridiagonal matrices is considered. Such systems appear, for example, when discretizing the Poisson equation in a rectangular domain using the five-point finite difference scheme or the piecewise linear finite elements ona triangulated, possibly nonuniform rectangular mesh. The method under consideration has the arithmetical complexity O(N log N), and it is closely related to the cyclic reduction method, but instead of using the matrix polynomial factorization, the so-called partial solution technique is employed. Hence, in this paper, the method is called the partial solution variant of the cyclic reduction method (PSCR method). The method is presented and analyzed in a general radix-q framework and, based on this analysis, the radix-4 variant is chosen for parallel implementation using the MPI standard. Thegeneralization of the method to the case of arbitrary block dimension is described. The numerical experiments show the sequential efficiency and numerical stability of the PSCR method compared to the well-known BLKTRI implementation of the generalized cyclic reduction method. The good scalability properties of the parallel PSCR method are demonstrated in a distributed-memory Cray T3E-750 computer.} } @Article{Bou99:mpi-algorithm, author = {P. Boulet}, title = {Static tiling for heterogeneous computing platforms}, journal = {Parallel Computing}, year = 1999, volume = 25, number = 5, pages = {547--568}, month = MAY, abstract = {In the framework of fully permutable loops, tiling has been extensively studied as a source-to-source program transformation. However, little work hasbeen devoted to the mapping and scheduling of the tiles on physical processors. Moreover, targeting heterogeneous computing platforms has to the best of our knowledge, never been considered. In this paper we extend static tiling techniques to the context of limited computational resources with different-speed processors. In particular, we present efficient scheduling and mapping strategies that are asymptotically optimal. The practical usefulness of these strategies is fully demonstrated by MPI experiments on a heterogeneous network of workstations.} } @Article{Ros99:mpi-application, author = {I. Rosenblum}, title = {Multi-processor molecular dynamics using the {Brenner} potential: Parallelization of an implicit multi-body potential}, journal = {International Journal of Modern Physics C}, year = 1999, volume = 10, number = 1, pages = {189--203}, month = FEB, abstract = {We present computational aspects of Molecular Dynamics calculations of thermal properties of diamond using the Brenner potential. Parallelization was essential in order to carry out these calculations on samples of suitable sizes. Our implementation uses MPI on a multi-processor machine such as the IBM SP2. Three aspects of parallelization of the Brenner potential are discussed in depth. These are its long-range nature, the need for different parallelization algorithms for forces and neighbors, and the relative expense of force calculations compared to that of data communication. The efficiency of parallelization is presented as a function of different approaches to these issues as well as of cell size and number of processors employed in the calculation. In the calculations presented here, information from almosthalf of the atoms were needed by each processor even when 16 processors were used. This made it worthwhile to avoid unnecessary complications by making data from all atoms available to all processors. Superlinear speedup wasachieved for four processors (by avoiding paging) with 512 atom samples, and 5ps long trajectories were calculated (for 5120 atom samples) in 53 hours using 16 processors; 514 hours would have been needed to complete this calculation using a serial program. Finally, we discuss and make available a set of routines that enable MPI-based codes such as ours to be debugged on scalar machines.} } @Article{Luo99:mpi-comparision, author = {Y. Luo}, title = {Shared memory vs. message passing: the {COMOPS} benchmark experiment}, journal = {Journal of Supercomputing}, year = 1999, volume = 13, number = 3, pages = {283--301}, month = MAY, abstract = {This paper presents the comparison of the COMOPS benchmark performance in MPI and shared memory on four different shared memory platforms: the DEC AlphaServer 8400/300, the SGI Power Challenge, the SGI Origin2000, and the HP-Convex Exemplar SPP1600. The paper also qualitatively analyzes the obtained performance data based on an understanding of the corresponding architecture and the MPI implementations. Some conclusions are made for the inter-processor communication performance on these four shared memory platforms.} } @Article{Hio99:mpi-application, author = {S. Hioki}, title = {{QCDimMPI: MPI} code for {QCD} with an improved action}, journal = {Nuclear Physics B-Proceedings Supplements}, year = 1999, volume = 73, pages = {895--897}, month = MAR, abstract = {QCDimMPI[I] is a simulation code for pure SU(3) gauge theory with an improved action consisting of 1 x 1 and 2 x 1 plaquettes. It uses Fortran77 and the Message Passing Interface Standard, MPI[2]. QCDimMPI is an extended version of QCDMPI. It is portable, allows simulations in any number of dimensions, on any number of processors, and with arbitrary dimensional partitioning. It requires a rather small working area, and yields excellent performance on single processor computers and a wide variety of parallel computers which support MPI. The program provides information on link update time and communications time. In this paper, an outline of QCDimMPI is given, and benchmark results on several parallel computers are reported.} } @Article{Gol99:mpi-application, author = {A. Goller}, title = {Parallel processing strategies for large {SAR} image data sets in a distributed environment}, journal = {Computing}, year = 1999, volume = 62, number = 4, pages = {277-291}, abstract = {Key algorithms like image matching and Shape-from-Shading were parallelizedmainly using MPI, and ported onto suitable computer architectures. Our experiments showed that all algorithms perform well, and they further proved the concept of CDIP to be beneficial: Usability of all integrated algorithmswas significantly improved, mainly due to less user-centered network traffic, simple access to supercomputers, the creation of method sequences, and easy-to-use and well maintained algorithms.} } @Article{Chi99:mpi-implementation, author = {A. Chien}, title = {Design and evaluation of an {HPVM}-based windows {NT} supercomputer}, journal = {International Journal of High Performance Computing Applications}, year = 1999, volume = 13, number = 3, pages = {201--219}, month = {Fall}, abstract = {We describe the design and evaluation of a 192-processor Windows NT clusterfor high performance computing based on the High Performance Virtual Machine (HPVM) communication suite. While other clusters have been described in the literature, building a 58 GFlop/s NT cluster to be used as a general-purpose production machine for NCSA required solving new problems. The HPVM software meets the challenges represented by the large number of processors,the peculiarities of the NT operating system, the need for a production-strength job submission facility and the requirement for mainstream programming interfaces. First, HPVM provides users with a collection of standard APIs like MPI, Shmem, Global Arrays with supercomputer class performance (13 mu s minimum latency, 84 MB/s peak bandwidth for MPI), efficiently delivering Myrinet's hardware performance to application programs. Second, HPVM provides cluster management and scheduling (through integration with Platform Computing's LSF). Finally, HPVM addresses Windows NT's remote access problem, providing convenient remote access and job control (through a graphical Java-applet front-end). Given the production nature of the cluster, the performance characterization is largely based on a sample of the NCSA scientific applications the machine will be running. The side-by-side comparison with other present-generation NCSA supercomputers shows the cluster to be within a factor of 2 to 4 of the SGI Origin 2000 and Cray T3E performance at a fraction of the cost. The inherent scalability of the cluster design produces a comparable or better speedup than the Origin 2000 despite a limitationin the HPVM flow control mechanism.} } @Article{Ros99:mpi-tools, author = {T. Rossi}, title = {Parallel fictitious domain method for a non-linear elliptic {Neumann} boundary value problem}, journal = {Numerical Linear Algebra with Applications}, year = 1999, volume = 6, number = 1, pages = {51--60}, month = JAN-FEB, abstract = {Parallelization of the algebraic fictitious domain method is considered forsolving Neumann boundary value problems with variable coefficients. The resulting method is applied to the parallel solution of the subsonic full potential flow problem which is linearized by the Newton method. Good scalability of the method is demonstrated on a Cray T3E distributed memory parallel computer using MPI in communication.} } @Article{Zak99:mpi-tools, author = {O. Zaki}, title = {Toward scalable performance visualization with Jumpshot}, journal = {International Journal of High Performance Computing Applications}, year = 1999, volume = 13, number = 3, pages = {277-288}, month = {Fall}, abstract = {Jumpshot is a graphical tool for understanding the performance of parallel programs. It is in the tradition of the upshot tool but contains a number of extensions and enhancements that make it suitable for large-scale parallel computations. Jumpshot takes as input a new, more flexible logfile formatand comes with a library for generating such logfiles. An MPI profiling library is also included, enabling the automatic generation of such logfiles from MPI programs. Jumpshot is written in Java and can easily be integratedas an applet into browser-based computing environments. The most novel feature of Jumpshot is its automatic detection of anomalous durations, drawingthe user's attention to problem areas in a parallel execution. This capability is particularly useful in large-scale parallel computations containingmany events.} } @Article{BegVin99:transport, author = {S. Bergeron and A. Vincent}, title = {Implementation strategies for real-time particle transport solver}, journal = {Computer Physics Communications}, year = 1999, volume = 120, number = {2--3}, month = AUG, pages = {177-184}, abstract = {Many problems in physics and engineering involve the transport of solid particles in a turbulent field. In some cases, it is desirable to study the transport of those particles in "real time". The prediction of erosion in therotating part of hydraulic turbines is such a problem. This paper presentsa semi-analytic predictor-corrector scheme adapted to the case of a rotating frame of reference. Simplification, related to the interpolation scheme required, is discussed as well as a parallel implementation using MPI on 10Base-T Ethernet interconnected workstations. The 3D solver is coupled with a high performance visualization software. Performance then shows a quasi-linear speedup.} } @Article{BruFagRes99:meta, author = {M. A. Brune and G. E. Fagg and M. M. Resch}, title = {Message-passing environments for metacomputing}, journal = {Future Generation Computer Systems}, year = 1999, volume = 15, number = {5--6}, month = OCT, pages = {699-712}, abstract = {The PACX-MPI approach offers a transparent interface for the communication between two or more MPI environments. PVAMPI allows the user spawning parallel processes under the MPI environment. The PLUS protocol bridges the gap between vendor-specific (e.g., MPL, NX, and PARIX) and vendor-independent message-passing environments (e.g., PVM and MPI). Moreover, it offers the ability to create and control processes at application runtime.} } @Article{ResRanSto99:meta, author = {M. M. Resch and D. Rantzau and R. Stoy}, title = {Metacomputing experience in a transatlantic wide area application test-bed}, journal = {Future Generation Computer Systems}, year = 1999, volume = 15, number = {5--6}, month = OCT, pages = {807--816}, abstract = {In the frame of a G7 initiative the High Performance Computing Center Stuttgart (HLRS) together with the Pittsburgh Supercomputing Center (PSC) and Sandia National Laboratories (SNL) has set up a transatlantic wide area application test-bed in 1997. A dedicated ATM-Link was installed that connected German research networks to vBNS and ESnet. During 1 year this test-bed wasextensively used for metacomputing and collaborative working. Two applications - one from computational fluid dynamics and one from molecular dynamics - were adapted and run on the test-bed. For message-passing an MPI library was implemented that supports metacomputing. An already existing softwarefor collaborative visualization was adapted for that scenario. This article describes the technical background of the cooperation, results that have been achieved for the two applications so far and lessons that have been learned. Special emphasis will be given to future work planned.} } @Article{Tho99:mpi-application, author = {S. J. Thomas and M. Desgagne and R. Benoit}, title = {A real-time north American forecast at 10-km resolution with the {C}anadian {MC2 Meso-LAM}}, journal = {Journal of Atmospheric and Oceanic Technology}, year = 1999, volume = 16, number = 8, pages = {1092-1101}, month = AUG, abstract = {The next generation of high-performance computers will be based on clustersof shared-memory symmetric multiprocessor (SMP) nodes interconnected by a low-latency, high-bandwidth network. In this paper, the parallel performance of the nonhydrostatic Mesoscale Compressible Community (MC2) limited-areaatmospheric model on clusters of NEC SX-4 symmetric multiprocessor (SMP) nodes is presented. Several hybrid parallel-programming approaches are now possible with the SMP cluster SC-MC2 implementation based on internode MPI message-passing and intranode shared-memory tasking or threads. At total sustained execution rates of between 25 and 30 Gflop s(-1) on single-node or multinode clusters, it is now possible for the first time ever to generate a24-48-h real-time weather forecast over North America at 10-km resolution.} } @Article{Rod99:mpi-evals, author = {J. L. Roda and C. Rodriguez and D. G. Morales and E. Almeida}, title = {Predicting the execution time of message passing models}, journal = {Concurrency Practice and Experience}, year = 1999, volume = 11, number = 9, month = AUG, pages = {461--477}, abstract = {Recent publications prove that runtime systems oriented to the Bulk Synchronous Parallel Model usually achieve remarkable accuracy in their predictions, That accuracy can be seen in the capacity of the software for packing the messages generated during the superstep and their capability to find a rearrangement of the messages sent at the end of the superstep, Unfortunately, barrier synchronisation imposes some limits both in the range of available algorithms and in their performance, The asynchronous nature of many MPI/PVM programs makes their expression difficult or infeasible using a BSP oriented library. Through the generalisation of the concept of superstep we propose two extensions of the BSP model: the BSP Without Barriers (BSPWB) andthe Message Passing Machine (MPM) models, These new models are oriented toMPI/PVM parallel programming. The parameters of the models and their quality are evaluated on four standard parallel platforms, The use of these BSP extensions is illustrated using the Past Fourier Transform and the ParallelSorting by Regular Sampling algorithms.} } @Article{Lir99:mpi-apps, author = {I. Lirkov and S. Margenov}, title = {{MPI} parallel implementation of {CBF} preconditioning for {3D} elasticity problems}, journal = {Mathematics and Computers in Simulation}, year = 1999, volume = 50, number = {1--4}, month = NOV, pages = {247--254}, abstract = {New construction of a parallel algorithm for the discussed preconditioning method is proposed. The theoretical part of this study includes analysis ofthe execution time on various parallel architectures and asymptotic estimates of the parallel speedup and the parallel efficiency. The parallel performance estimates indicate that the proposed algorithm will be especially efficient on coarse-grain parallel systems, which is also confirmed by the numerical experiments. A portable MPI parallel code is developed. Numerical tests on three symmetric multiprocessor systems: SUN Enterprise 3000, SUN SPARCstation 10 and Origin 2000 are presented. The reported speedup and parallel efficiency illustrate well the features of the proposed method and its implementation. } } @Article{den99:mpi-app, author = {L. Deng and Z. S. Xie}, title = {Parallelization of {MCNP} Monte Carlo neutron and photon transport code in parallel virtual machine and message passing interface}, journal = {Journal of Nuclear Science and Technology}, year = 1999, volume = 36, number = 7, month = JUL, abstract = {The coupled neutron and photon transport Monte Carlo code MCNP (version 3B)has been parallelized in parallel virtual machine (PVM) and message passing interface (MPI) by modifying a previous serial code. The new code has been verified by serving sample problems. The speedup increases linearly with the number of processors and the average efficiency is up to 99\% for 12-processor.} } @Article{Arp99:mpi-app, author = {K. Arpe and E. Roechner}, title = {Simulation of the hydrological cycle over Europe: Model validation and impacts of increasing greenhouse gases}, journal = {Advances in Water Resources}, year = 1999, volume = 23, number = 2, month = OCT, pages = {105--119}, abstract = {Different methods of estimating precipitation area means, based on observations, are compared with each other to investigate their usefulness for model validation. For the applications relevant to this study the ECMWF reanalyses provide a good and comprehensive data set for validation. The uncertainties of precipitation analyses, based on observed precipitation or from numerical weather forecasting schemes, are generally in the range of 20\% but regionally much larger. The MPI atmospheric general circulation model is able to reproduce long term means of the main features of the hydrological cycle within the range of uncertainty of observational data, even for relatively small areas such as the Rhine river basin. Simulations with the MPI coupled general circulation model, assuming a further increase of anthropogenicgreenhouse gases, show clear trends in temperature and precipitation for the next century which would have significant implications for human activity, e.g. a further increase of the sea level of the Caspian Sea and less water in the Rhine and the Danube. We have gained confidence in these results because trends in the temperature and precipitation in the coupled model simulations up to the present are partly confirmed by an atmospheric model simulation forced with observed SSTs and by observational data. We gained further confidence because the simulations with the same coupled model but using constant greenhouse gases do not show such trends. However, doubts arisefrom the fact that these trends are strong where the systematic errors of the model are large.} } @Article{Yah99:mpi-app, author = {Y. Yahagi and M. Mori and Y. Yoshii}, title = {The forest method as a new parallel tree method with the sectional Voronoi tessellation}, journal = {Astrophysical Journal Supplement Series}, year = 1999, volume = 124, number = 1, month = SEP, pages = {1--9}, abstract = {We have developed a new parallel tree method which will be called the forest method hereafter. This new method uses the sectional Voronoi tessellation(SVT) for the domain decomposition. The SVT decomposes a whole space into polyhedra and allows their flat borders to move by assigning different weights. The forest method determines these weights based on the load balancingamong processors by means of the overload diffusion (OLD). Moreover, sinceall the borders are hat, before receiving the data from other processors, each processor can collect enough data to calculate the gravity force with precision. Both the SVT and the OLD are coded in a highly vectorizable manner to accommodate on vector parallel processors. The parallel code based onthe forest method with the Message Passing Interface is run on various platforms so that a wide portability is guaranteed. Extensive calculations with 15 processors of Fujitsu VPP300/16R indicate that the code can calculate the gravity force exerted on 10(5) particles in each second for some ideal dark halo. This code is found to enable an N-body simulation with 10(7) or more particles for a wide dynamic range and is therefore a very powerful tool for the study of galaxy formation and large-scale structure in the universe.} } @Article{tan99:mpi-impl, author = {H. Tang and K. Shen and T. Yang}, title = {Compile/run-time support for threaded {MPI} execution on multiprogrammed shared memory machines}, journal = {ACM SIGPLAN Notices}, year = 1999, volume = 34, number = 8, month = AUG, pages = {107--118}, abstract = {MPI is a message-passing standard widely used for developing high-performance parallel applications. Because of the restriction in the MPI computationmodel, conventional implementations on shared memory machines map each MPInode to an OS process, which suffers serious performance degradation in the presence of multiprogramming, especially when a space/time sharing policyis employed in OS job scheduling In this paper, we study compile-time and run-time support for MPI by using threads and demonstrate our optimization techniques for executing a large class of MPI programs written in C. The compile-time transformation adopts thread-specific data structures to eliminate the use of global and static variables in C code. The run-time support includes an efficient point-to-point communication protocol based on a novellock-free queue management scheme. Our experiments on an SGI Origin 2000 show that our MPI prototype called TMPI using the proposed techniques is competitive with SGI's native MPI implementation in a dedicated environment, and it has significant performance advantages with up to a 23-fold improvement in a multiprogrammed environment.} } @Article{kie99:mpi-collective, author = {T. Kielmann and R. F. H. Hofman and H. E. Bal and A. Plaat and R. A. F. Bhoedjang}, title = {{MAGPIE: MPI}'s collective communication operations for clustered wide area systems}, journal = {ACM SIGPLAN Notices}, year = 1999, volume = 34, number = 8, month = AUG, pages = {131-140}, abstract = {Writing parallel applications for computational grids is a challenging task. To achieve good performance, algorithms designed for local area networks must be adapted to the differences in link speeds. An important class of algorithms are collective operations, such as broadcast and reduce. We have developed MAGPIE, a library of collective communication operations optimizedfor wide area systems. MAGPIE's algorithms send the minimal amount of dataover the slow wide area links, and only incur a single wide area latency. Using our system, existing MPI applications can be run unmodified on geographically distributed systems. On moderate cluster sizes, using a wide area latency of 10 milliseconds and a bandwidth of 1 MByte/s, MAGPIE executes operations up to 10 times faster than MPICH, a widely used MPI implementation; application kernels improve by up to a factor of 4. Due to the structure of our algorithms, MAGPIE's advantage increases for higher wide area latencies.} } @Article{zhu99:mpi-app, author = {W. J. Zhu and L. Petzold}, title = {Parallel sensitivity analysis for {DAE}s with many parameters}, journal = {Concurrency-Practice and Experience}, year = 1999, volume = 11, number = 10, month = AUG, pages = {571--585}, abstract = {In this paper, we discuss the parallel computation of the sensitivity analysis of systems of differential-algebraic equations (DAEs) with a moderate number of state variables and a large number of sensitivity parameters, Several parallel implementations based on DASSLSO are explored and their performance when using the Message Passing Interface (MPI) on an SGI Origin 2000 is compared, } } @Article{Sun99:mpi-perf, author = {D. Sundaram-Stukel and M. K. Vernon}, title = {Predictive analysis of a wavefront application using {LogGP}}, journal = {ACM SIGPLAN Notices}, year = 1999, volume = 34, number = 8, month = AUG, pages = {141-150}, abstract = {This paper develops a highly accurate LogGP model of a complex wavefront application that uses MPI communication on the IBM SP/2. Key features of the model include: (1) elucidation of the principal wavefront synchronization structure, and (2) explicit high-fidelity models of the MPI-send and MPI-receive primitives. The MPI-send/receive models are used to derive L, o, and Gfrom simple two-node micro-benchmarks, Other model parameters are obtainedby measuring small application problem sizes on four SP nodes. Results show that the LogGP model predicts, in seconds and with a high degree of accuracy, measured application execution time for large problems running on 128 nodes. Detailed performance projections are provided for very large future processor configurations that are expected to be available to the application developers. These results indicate that scaling beyond one or two thousand nodes yields greatly diminished improvements in execution time, and thatsynchronization delays are a principal factor limiting the scalability of the application.} } @Article{kimura99:mpi-app, author = {T. Kimura and H. Takemiya}, title = {Distributed parallel computing for fluid structure coupled simulations on a heterogeneous parallel computer cluster}, journal = {International Journal of High Performance Computing Applications}, year = 1999, volume = 13, number = 4, pages = {320--333}, abstract = {Distributed parallel computing for a fluid-structure coupled simulation hasbeen performed on a heterogeneous parallel computer cluster. The fluid andthe structure dynamics are simulated on different parallel computers connected by a high-speed local network. These dynamics are coupled by a loose coupling method exchanging the boundary data between the fluid and the structure domains through the network. The data communication among parallel computers is realized by using the new communication library, Stampi, which has been developed to enable communication in a heterogeneous environment. The performance evaluation on a heterogeneous parallel computer cluster has shown that the distributed parallel computing for fluid-structure coupled simulations has the advantage of increasing the performance compared with theparallel computing on a single parallel computer.} } @Article{morrow99:mpi-app, author = {P. J. Morrow and D. Crookes and J. Brown and G. McAleese and D. Roantree and I. Spence}, title = {Efficient implementation of a portable parallel programming model for image processing}, journal = {Concurrency-Practice and Experience}, year = 1999, volume = 11, number = 11, month = SEP, pages = {671--685}, abstract = {This paper describes a domain specific programming model for execution on parallel and distributed architectures. The model has initially been targeted at the application area of image processing, though the techniques developed may be more generally applicable to other domains where an algebraic orlibrary-based approach is common. Efficiency is achieved by the concept ofa self-optimising class library of primitive image processing operations, which allows programs to be written in a high level, algebraic notation andwhich is automatically parallelised (using an application-specific data parallel approach). The class library is extended automatically with optimised operations, generated by a transformation system, giving improved execution performance. The parallel implementation of the model described here is based on MPI and has been tested on a C40 processor network, a quad-processor Unix workstation, and a network of PCs running Linux. Timings are included to indicate the impact of the automatic optimisation facility (rather than the effect of parallelisation). } } @Article{byrne:mpi-app, author = {G. D. Byrne and A. C. Hindmarsh}, title = {{PVODE}, an {ODE} solver for parallel computers}, journal = {International Journal of High Performance Computing Applications}, year = 1999, volume = 13, number = 4, pages = {354--365}, abstract = {PVODE is a general-purpose solver for ordinary differential equation (ODE) systems that implements methods for both stiff and nonstiff systems. The code is designed for single-program multiple-data environments. It is writtenin ANSI standard C, with a highly modular structure. The version being distributed uses the message-passing interface (MPI) system for communication.In the stiff case, PVODE uses a backward differentiation formula method combined with preconditioned GMRES iteration. Parallelism is achieved by distributing the ODE solution vector into user-specified segments and parallelizing a set of vector kernels accordingly. For PDE-based ODE systems, we provide a module that generates a band block-diagonal preconditioner for use with the GMRES iteration. We also provide a set of interfaces to accommodateFortran applications. The paper includes a stiff example problem and test results on a Cray-T3D with three different message-passing systems. PVODE is publicly available.} } @Article{Coelho:mpi-app, author = {P. J. Coelho}, title = {Parallel simulation of a utility boiler. Part {I}: Mathematical model and numerical solution method}, journal = {Communications in Numerical Methods in Engineering}, year = 1999, volume = 15, number = 10, month = OCT, pages = {717--726}, abstract = {A computer code for the modelling of turbulent reactive flows with heat transfer has been parallelized and applied to the simulation of a utility boiler. The code is based on the numerical solution of the density-weighted averaged form of the governing equations for mass, momentum and energy conservation, and transport equations for scalars associated with the turbulence and combustion models. The k-epsilon model and the chemical equilibrium approach are used. The turbulent fluctuations are accounted for in the calculation of the mean properties by means of a presumed joint probability densityfunction for the mixture fraction and the fraction of radiative heat loss.The discrete ordinates method is used for radiation modelling. The governing equations are solved using the finite volume method. The parallelizationis carried out using the domain decomposition approach and the message-passing MPI library. The paper is divided into two parts. This part is concerned with the description of the model and the parallel implementation, whilethe model evaluation and the analysis of the parallel performance are presented in Part II (pp. 727-736).} } @Article{Torres:mpi-app, author = {D. J. Torres and E. A. Coutsias}, title = {Pseudospectral solution of the two-dimensional {N}avier-{S}tokes equations in a disk}, journal = {SIAM Journal on Scientific Computing}, year = 1999, volume = 21, number = 1, month = SEP, pages = {378--403}, abstract = {An efficient and accurate algorithm for solving the two-dimensional(2D) incompressible Navier-Stokes equations on a disk with no-slip boundary conditions is described. The vorticity-stream function formulation of these equations is used, and spatially the vorticity and stream functions are expressedas Fourier-Chebyshev expansions. The Poisson and Helmholtz equations whicharise from the implicit-explicit time marching scheme are solved as bandedsystems using a post-conditioned spectral tau-method. The polar coordinatesingularity is handled by expanding fields radially over the entire diameter using a parity modified Chebyshev series and building partial regularityinto the vorticity. The no-slip boundary condition is enforced by transferring one of the two boundary conditions imposed on the stream function ontothe vorticity via a solvability constraint. Significant gains in run timeswere realized by parallelizing the code in message passage interface (MPI).} } @Article{Ann99:mpi-app, author = {V. Annamalai and C. S. Krishnamoorthy and V. Kamakoti}, title = {Adaptive finite element analysis on a parallel and distributed environment}, journal = {Parallel Computing}, year = 1999, volume = 25, number = 12, month = NOV, pages = {1413--1434}, abstract = {Industries in general and automotive industries in particular, use Finite Element Analysis (FEA) for better solutions to the engineering problems theyencounter. The reliability of the Finite Element method can be improved toa larger extent by Adaptive Finite Element Analysis (AFEA), As we look towards increasingly accurate solutions, the process becomes computationally intensive and requires parallel and economic high-performance scientific computing environments to solve them. In this paper we present a parallel implementation of AFEA on a cluster of workstations and illustrate its efficiency and scalability with examples. In this process, we have developed a user-friendly environment for Parallel Distributed computing which is portable on top of both Parallel Virtual Machine (PVM) and Message Passing Interface(MPI) message passing layers. We have addressed the issues of the several stages in AFEA from a parallel computing perspective that includes Domain decomposition, Parallel Mesh generation, Parallel Finite Element Analysis using a Substructuring technique and Load balancing.} } @Article{Nagar99:mpi-impl, author = {S. Nagar and A. Banerjee and A. Sivasubramaniam and C. R. Das}, title = {Alternatives to coscheduling a network of workstations}, journal = {Journal of Parallel and Distributed Computing}, year = 1999, volume = 59, number = 2, month = NOV, pages = {302--327}, abstract = {Efficient scheduling of processes on processors of a Network of Workstations (NOW) is essential for good system performance. However, the design of such schedulers is challenging because of the complex interaction between several system and workload parameters. Coscheduling, though desirable, is impractical for such a loosely coupled environment. Two operations, waiting for a message and arrival of a message, can be used to take remedial actions that can guide the behavior of the system toward coscheduling using local information. We present a taxonomy of three possibilities for each of these two operations. leading to a design space of 3x3 scheduling mechanisms. This paper presents an extensive implementation and evaluation exercise in studying these mechanisms. Adhering to the philosophy that scheduling and communication are intertwined and should be studied in conjunction, a complete communication substrate for UltraSPARC workstations, connected by Myrinet and running Solaris 2.5.1, has been developed. This platform provides the entire Message Passing Interface (MPI) to readily run off-the-shelf MPI applications by employing protected low-latency user-level messaging. Several applications can concurrently use this interface. This platform has been usedto design. implement, and uniformly evaluate nine scheduling strategies with a mixture of concurrent real applications with varying communication intensities. This includes five new schemes (Periodic Boost, Periodic Boost with Spin Block, Spin Yield, Periodic Boost with Spin Yield, Dynamic Coscheduling with Spin Yield) that are presented in this paper. In addition to our evaluations of the pms and cons of each mechanism in terms of throughput, response time, CPU utilization, and Fairness, it is shown that Periodic Boost is a promising approach for scheduling processes on a NOW.} } @Article{Lappa99:mpi-app, author = {M. Lappa and R. Savino}, title = {Parallel solution of three-dimensional {M}arangoni flow in liquid bridges}, journal = {International Journal for Numerical Methods in Fluids}, year = 1999, volume = 31, number = 6, month = NOV, pages = {911--935}, abstract = {This paper describes the implementation and performances of a parallel solver for the direct numerical simulation of the three-dimensional and time-dependent Navier-Stokes equations on distributed-memory, massively parallel computers. The feasibility of this approach to study Marangoni flow instability in half zone liquid bridges is examined. The results indicate that the incompressible, non-linear Navier-Stokes problem, governing the Marangoni flows behavior, can effectively be parallelized on a distributed memory parallel machine by remapping the distributed data structure. The numerical code is based on a three-dimensional Simplified Marker and Cell (SMAC) primitive variable method applied to a staggered finite difference grid. Using this method, the problem is split into two problems, one parabolic and the other elliptic A parallel algorithm, explicit in time, is utilized to solve the parabolic equations. A parallel multisplitting kernel is introduced for the solution of the pseudo pressure elliptic equation, representing the mosttime-consuming part of the algorithm. A grid-partition strategy is used inthe parallel implementations of both the parabolic equations and the multisplitting elliptic kernel. A Message Passing Interface (MPI) is coded for the boundary conditions; this protocol is portable to different systems supporting this interface for interprocessor communications. Numerical experiments illustrate good numerical properties and parallel efficiency. In particular, good scalability on a large number of processors can be achieved as long as the granularity of the parallel application is not too small. However, increasing the number of processors, the Speed-Up is ever smaller than the ideal linear Speed-Up. The communication timings indicate that complex practical calculations, such as the solutions of the Navier-Stokes equationsfor the numerical simulation of the instability of Marangoni flows, can beexpected to run on a massively parallel machine with good efficiency.} } @Article{hill99:mpi-app, author = {R. W. Hill and K. S. Ball}, title = {Parallel implementation of a {F}ourier-{C}hebyshev collocation method for incompressible fluid flow and heat transfer}, journal = {Numerical Heat Transfer Part B}, year = 1999, volume = 36, number = 3, month = {Oct-Nov}, pages = {309--329}, abstract = { A Fourier-Chebyshev collocation spectral method is parallelized to simulatethe three-dimensional unsteady flow and heat transfer inside a cylindricalenclosure. Two solution approaches using different techniques for determining the pressure field and enforcing mass conservation are presented for shared memory applications using Cray directives and for distributed memory applications using MPI and SHMEM message passing libraries. Matrix diagonalization is employed for solving the pressure Poisson equation and Helmholtz equations for the velocity components and temperature. The parallelization approach is described and scaling results are presented for both platform types.} } @Article{poggi:mpi-extension, author = {A. Poggi and G. Destri}, title = {{MPOOL}: an object-oriented library for task composition and co-ordination}, journal = {Concurrency-Practice and Experience}, year = 1999, volume = 11, number = 14, month = DEC, pages = {835--848}, abstract = { MPOOL is an object-oriented extension to the MPI library, based on three categories of objects, called units, groups and schemes. Units are active objects composed of data (state) and procedures (like traditional passive objects), but with the additional ability to store incoming messages in a queuewhile they are active and to send messages in parallel to other units; moreover, different units may be active simultaneously. Groups and schemes arepassive objects used for the composition of units and the co-ordination oftheir actions, Groups manage collective communications and synchronizationoperations such as barriers. Schemes compose units' actions through the use of a set of constructs derived by path expressions.} } @Article{sel99:mpi-app, author = {P. M. Selwood and M. Berzins}, title = {Parallel unstructured tetrahedral mesh adaptation: algorithms, implementation and scalability}, journal = {Concurrency-Practice and Experience}, year = 1999, volume = 11, number = 14, month = DEC, pages = {863--884}, abstract = { The use of unstructured adaptive tetrahedral meshes in the solution of transient flows poses a challenge for parallel computing due to the irregular and frequently changing nature of the data and its distribution. A parallel mesh adaptation algorithm, PTETRAD, for unstructured tetrahedral meshes (based on the serial code TETRAD) is described and analysed. The portable implementation of the parallel code in C with MPI is described and discussed, The scalability of the code is considered, analysed and illustrated by numerical experiments using a shock wave diffraction problem. } } @Article{meme:mpi-graphics-app, author = {D. Meneveaux and K. Bouatouch}, title = {Synchronisation and load balancing for parallel hierarchical radiosity of complex scenes on a heterogeneous computer network}, journal = {Computer Graphics Forum}, year = 1999, volume = 18, number = 4, month = DEC, pages = {201--212}, abstract = {In this paper ae propose a SPMD parallel hierarchical radiosity algorithm relying on a novel partitioning method which may apply, to any kind of archilectural scene. This algorithm is based on MPI (Message Passing Interface),a communication library which allows the use of either a heterogeneous setof concurrent computers or a parallel computer or both. The database is stored on a common directory and accessed by all the processors (through NFS in case of a network of computers). As the objective is to handle complex scenes such as building interiors, to cope with the problem of memory size, only a subset of the database resides in memory of each processor. This subset is determined with the help of a partitioning into 3D cells, clusteringand visibility calculations. A graph expressing visibility between the resulting clusters is determined partitioned (with a new method based on classification of K-means type) and distributed amongst all the processors. Eachprocessor is responsible for gathering energy (using the Gauss-Seidel method) only for its subset of clusters. In order to reduce the disk transfers due to downloading these subsets of clusters, we use an ordering strategy based on the traveling salesman algorithm. Dynamic load balancing relies on a task stealing approach while termination is detected by configuring the processors into a ring and moving a token around this ring. The parallel iterative resolution is of group iterative type. Its mathematical convergence is proven in the appendix.} } @Article{bova2000:mpi-app, author = {S. W. Bova and G. F. Carey}, title = {A distributed memory parallel element-by-element scheme for semiconductor device simulation}, journal = {Computer Methods in Applied Mechanics and Engineering}, year = 1999, volume = 181, number = 4, pages = {403--423}, abstract = { A domain decomposition and parallel element-by-element (EBE) scheme is developed for semiconductor device simulation modeled by the drift-diffusion (DD) equations. A classical Gummel iterative decoupling of the potential and carrier transport equations is applied on an unstructured triangulation. The distributed memory EBE scheme is formulated for a Galerkin finite elementapproximation of the nonlinear Poisson problem, and a modified Scharfetter-Gummel method is used for the carrier transport problem. The resulting sequences of symmetric and nonsymmetric linear systems are solved via preconditioned Krylov methods. Unstructured triangular grids are used to permit grading of the mesh, which is then partitioned to processor subdomains with appropriate data structures for message passing. Details of the parallel algorithm and data structure are provided. The scheme is implemented in Fortran90 with MPI and performance results are presented for a representative MOSFET on an IBM SP, a CRAY T3E, and an SGI/CRAY Origin2000.} } @Article{bova2000:mpi-openmp-app, author = {S. W. Bova and C. P. Breshears and C. E. Cuicchi and Z. Demirbilek and H. A. Gabb}, title = {Dual-level parallel analysis of harbor wave response using {MPI} and {OpenMP}}, journal = {International Journal of High Performance Computing Applications}, year = 2000, volume = 14, number = 1, pages = {49--64}, abstract = {The authors describe their experiences converting an existing serial production code to a parallel code combining both MPI and OpenMP. Such dual-levelparallel codes will be able to take full advantage of the emerging class of high performance computer architectures using small clusters of shared-memory processors connected via a message-passing network. While the focus isrestricted to a harbor response simulation code, the techniques presented herein are appropriate for a broad class of applications that explore a parameter space. The code modifications reduced the execution time of one testcase from 3100 minutes on a single CPU to just over 12 minutes on 256 CPUs. Results demonstrate that dual-level parallelism allows substantial increases in model resolution combined with improvements in simulation turnaroundtime but, contrary to conventional wisdom, requires very little source code alteration.} } @Article{park99:mpi-app, author = {N. Park and V. K. Prasanna and C. S. Raghavendra}, title = {Efficient algorithms for block-cyclic array redistribution between processor sets}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = 1999, volume = 10, number = 12, month = DEC, pages = {1217--1240}, abstract = {Run-time array redistribution is necessary to enhance the performance of parallel programs on distributed memory supercomputers. In this paper, we present an efficient algorithm for array redistribution from cyclic(x) on P processors to cyclic(Kx) on Q processors. The algorithm reduces the overall time for communication by considering the data transfer, communication schedule, and index computation costs. The proposed algorithm is based on a generalized circulant matrix formalism. Our algorithm generates a schedule thatminimizes the number of communication steps and eliminates node contentionin each communication step. The network bandwidth is fully utilized by ensuring that equal-sized messages are transferred in each communication step.Furthermore, the time to compute the schedule and the index sets is significantly smaller. It takes O(maz(P, Q)) time and is less than 1 percent of the data transfer time. In comparison, the schedule computation time using the state-of-the-art scheme (which is based on the bipartite matching scheme) is 10 to 50 percent of the data transfer time for similar problem sizes. Therefore, our proposed algorithm is suitable for run-time array redistribution. To evaluate the performance of our scheme, we have implemented the algorithm using C and MPI on an IBM SP2. Results show that our algorithm performs better than the previous algorithms with respect to the total redistribution time, which includes the time for data transfer. schedule, and indexcomputation.} } @Article{dan00:mpi-app, author = {K. T. Danielson and S. Hao and W. K. Liu and R. A. Uras and S. F. Li}, title = {Parallel computation of meshless methods for explicit dynamic analysis}, journal = {International Journal for Numerical Methods in Engineering}, year = 2000, volume = 47, number = 7, month = MAR, pages = {1323-1341}, abstract = {A parallel computational implementation of modern meshless methods is presented for explicit dynamic analysis. The procedures are demonstrated by application of the Reproducing Kernel Particle Method (RKPM). Aspects of a coarse grain parallel paradigm are detailed for a Lagrangian formulation using model partitioning. Integration points are uniquely defined on separate processors and particle definitions are duplicated, as necessary, so that all support particles for each point are defined locally on the corresponding processor. Several partitioning schemes are considered and a reduced graph-based procedure is presented. Partitioning issues are discussed and procedures to accommodate essential boundary conditions in parallel are presented. Explicit MPI message passing statements are used for all communications among partitions on different processors. The effectiveness of the procedure is demonstrated by highly deformable inelastic example problems.} } @Article{mar00:mpi-app, author = {N. Marco and S. Lanteri}, title = {A two-level parallelization strategy for Genetic Algorithms applied to optimum shape design}, journal = {Parallel Computing}, year = 2000, volume = 26, number = 4, month = MAR, pages = {377--397}, abstract = {This pager presents a two-level strategy for the parallelization of a Genetic Algorithm (GA) coupled to a compressible flow solver designed on unstructured triangular meshes. The parallel implementation is based on MPI and makes use of the process group features of this environment. The resulting algorithm is used for the optimum shape design of aerodynamic configurations.Numerical and performance results are presented for the optimization of two-dimensional airfoils for calculations performed on the following systems:an SGI Origin 2000 and an IBM SP-2 MIMD systems; an Pentium Pro (P6/200 MHz) cluster where the interconnection is realized through a FastEthernet (100 Mbits/s) switch. } } @Article{An00:mpi-app, author = {R. E. Ansorge and T. A. Carpenter and L. D. Hall and N. R. Shaw and G. B. Williams}, title = {Use of parallel supercomputing to design magnetic resonance systems}, journal = {IEEE Transactions on Applied Superconductivity}, year = 2000, volume = 10, number = 1, month = MAR, pages = {1368--1371}, abstract = {Historically analytical methods have been the preferred approach to designing magnets and gradient sets for magnetic resonance systems. Such methods are computationally efficient but are approximate, particularly away from the axis of symmetry. Alternative methods, which are much more computationally intensive, for example Genetic Algorithms, are now becoming practical, Such methods have the advantage that they can be used for unconventional designs and for the inclusion of nonanalytical design constraints such as real-word engineering and cost limitations. Gradient coil designs have been published previously [1]-[3]. Now with the availability of more powerful computers, more ambitious designs can be undertaken using parallel computing methods. The use of a Hitachi SR2201 supercomputer and clusters of Linux PCs (Beowulf) to develop a short whole body MRI magnet for clinical applications are reported on. An important feature of these computer codes is that they have been developed to run on parallel computing systems using the MPI message passing standard. MPI is an accepted industry standard, which means that these codes can readily be ported to different parallel computers. Previous success has been achieved in using MPI for a variety of other Medical Imaging problems [4].} } @InProceedings{cle95:mpi-debugging, author = {C. Cl\'emen\,con and J. Fritscher and M. J. Meehan and R. R\"uhl}, title = {An Implementation of Race Detection and Deterministic Replay with {MPI}}, booktitle = {Proceedings of Euro-Par'95}, number = 966, series = {LNCS}, year = 1995, publisher = {Springer-Verlag}, month = AUG, pages = {155-166}, meetingloc = {Stockholm, Sweden} } @Article{danad00:mpi-app, author = {K. T. Danielson and M. D. Adley}, title = {A meshless treatment of three-dimensional penetrator targets for parallel computation}, journal = {Computational Mechanics}, year = 2000, volume = 25, number = 3, month = MAR, pages = {267--273}, abstract = {A meshless modeling procedure of three-dimensional targets for penetration analysis on parallel computing systems is described. Buried structures are modeled by arbitrary layers of concrete and geologic materials, and the projectile is modeled by standard finite elements. Penetration resistance of the buried structure is provided by functions derived from principles of dynamic cavity expansion. The resistance functions are influenced by the target material properties and projectile kinematics. Additional capabilities accommodate the varying structural and geometrical characteristics of the target. Coupling between the finite elements and the meshless target model is made by applying resistance loads to elements on the outer surface of the projectile mesh. Penetration experiments verify the approach. In this manner, the target is effectively modeled and the strategy is well suited for parallel processing. The procedure is incorporated into an explicit transient dynamics code, using mesh partitioning for a coarse grain parallel processing paradigm. Message Passing Interface (MPI) is used for all interprocessorcommunication. Large detailed finite element analyses of projectiles are performed on up to several hundred processors with excellent scalability. The efficiency of the strategy is demonstrated by analyses executed on several types of scalable computing platforms.} } @Article{kim00:mpi-app, author = {S. Kim}, title = {Lattice {QCD} on a beowulf cluster}, journal = {Nuclear Physics B-Proceedings Supplements}, year = 2000, volume = 83, number = 4, month = APR, pages = {807--809}, abstract = { Using commodity component personal computers based on Alpha processor and commodity network devices and a switch, we built an 8-node parallel computer. GNU/Linux is chosen as an operating system and message passing libraries Such as PVM, LAM, and MPICH have been tested as a parallel programming environment. We discuss our lattice QCD project for a heavy quark system on this computer.} } @Article{wat00:mpi-app, author = {N. Watari and S. Ohnishi and H. Onishi and Y. Iwasawa}, title = {Total energy estimation for {Pd/Al} bimetallic surfaces by a parallel computation scheme}, journal = {Japanese Journal of Applied Physics Part 1---Regular Papers Short Notes \& Review Papers}, year = 2000, volume = 39, number = {3A}, month = MAR, pages = {1457--1461}, Abstract = { A numerical calculation scheme for the multicenter problem in large molecules and clusters is presented by applying the message-passing inter-face (MPI) in a massively parallel computer that uses the density functional method. The multicenter problem associated with the Coulomb singularity of an atom is efficiently treated by the parallel processors by allocating several atoms into each processor element (PE). The order N-2/P tuning is obtained for the Coulomb energy calculation by using the MPI which transfers Coulomb potential field between PE's. This method is applied to estimate the total energy of the reconstructed Al/Pd bimetallic surface. The energy estimationby the charge density of a superposition of isolated atomic charge fragments predict a stabilization caused by the reconstruction, being consistent with a self-consistent-field (SCF) cluster calculation of the bimetallic surface.} } @Article{rod00:mpi-model, author = {C. Rodriguez and J. L. Roda and F. Sande and D. G. Morales and F. Almeida}, title = {A new parallel model for the analysis of asynchronous algorithms}, journal = {Parallel Computing}, year = 2000, volume = 26, number = 6, month = MAY, pages = {753--767}, abstract = {The BSP model barrier synchronization imposes some limits both in the rangeof available algorithms and also in their performance. Although BSP programs can be translated to MPI/PVM programs, the counterpart is not true. The asynchronous nature of some MPI/PVM programs does not easily fit inside theBSP model. Through the suppression of barriers and the generalization of the concept of superstep we propose two new models, the BSP-like and the BSPwithout barriers (BSPWB) models. While the BSP-like extends the BSP* modelto programs written using collective operations, the more general BSPWB model admits the MPI/PVM parallel asynchronous programming style. The parameters of the models and their quality are evaluated on four standard parallelplatforms: the Cray T3E, the IBM SP2, the Origin 2000 and the Digital Alpha Server 8400. The study shows that the time spent in an h-relation is moreindependent on the number of processors than on the communication pattern.We illustrate the use of these BSP extensions through two problem-solving paradig