@techreport{TR-IC-10-24,
  number      = {IC-10-24},
  author      = {André Oriani and Islene Calciolari Garcia and Rodrigo Schmidt},
  title       = {{The Search for a Highly-Available Hadoop Distributed Filesystem}},
  month       = {August},
  year        = {2010},
  institution = {Institute of Computing, University of Campinas},
  note        = {In English, 27 pages. \par\selectlanguage{english}\textbf{Abstract} Hadoop is becoming the standard framework for processing large amounts of data. It takes advantage of distributed computing to do so quickly and efficiently. However, this is only possible if data is served with high availability and consistency. Those goals are fulfilled by a core piece of Hadoop called the Hadoop Distributed Filesystem (HDFS). HDFS is a highly scalable distributed filesystem capable of storing petabytes of data and providing high-throughput access to it. It makes intensive use of replication and checksums to protect the system from data loss and corruption. Despite all those qualities, HDFS has a central component whose maintenance requires the entire system to be shut down. Furthermore, that component is also a single point of failure. These limitations make HDFS unsuitable for 24x7 applications. In this technical report we give a high-level introduction to HDFS and discuss attempts to solve the aforementioned problems.}
}