% Workshop paper: the venue is recorded in `booktitle`, the field that
% @inproceedings requires; `&` is escaped so the venue typesets under LaTeX.
% NOTE(review): `note` says "under Review" although page numbers are given --
% confirm the publication status and drop the note if the paper has appeared.
@inproceedings{414,
  author    = {Pan, Pan and Dubey, Abhishek and Piccoli, Luciano},
  title     = {Dynamic Workflow Management and Monitoring Using {DDS}},
  booktitle = {7th {IEEE} International Workshop on Engineering of Autonomic \& Autonomous Systems ({EASe})},
  pages     = {20--29},
  year      = {2010},
  note      = {under Review},
  abstract  = {Large scientific computing data-centers require a distributed dependability subsystem that can provide fault isolation and recovery and is capable of learning and predicting failures to improve the reliability of scientific workflows. This paper extends our previous work on the scientific workflow management systems by presenting a hierarchical dynamic workflow management system that tracks the state of job execution using timed state machines. Workflow monitoring is achieved using a reliable distributed monitoring framework, which employs publish-subscribe middleware built upon OMG Data Distribution Service Standard. Failure recovery is achieved by stopping and restarting the failed portions of workflow directed acyclic graph.},
}