Reading Data from Ceph with a Hadoop Client

DPBD90
2 min readJul 24, 2019

--

In this article, I will demonstrate how to read data from Ceph via the S3A interface (Hadoop's connector to S3-compatible storage such as the Ceph RADOS Gateway).

Create maven pom:

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven module for a minimal Hadoop client that reads data from a Ceph
     cluster through its S3-compatible (S3A) interface. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>vn.tienbm90</groupId>
<artifactId>ceph-client</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<!-- NOTE(review): hadoop-aws 3.1.1 was built and tested against
     aws-java-sdk-bundle 1.11.271; pairing it with SDK 1.11.327 may cause
     runtime class conflicts - confirm this combination works. -->
<amazon.sdk.version>1.11.327</amazon.sdk.version>
<hadoop.version>3.1.1</hadoop.version>
</properties>
<dependencies>
<!-- NOTE(review): the full aws-java-sdk artifact already includes the S3
     module, so the explicit aws-java-sdk-s3 dependency below is redundant;
     consider keeping only one of the two. -->
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk</artifactId>
<version>${amazon.sdk.version}</version>
</dependency>
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-s3</artifactId>
<version>${amazon.sdk.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
<!-- NOTE(review): spark-core is not referenced by CephClient.java; remove it
     if no other code in this module uses Spark. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws -->
<!-- Provides the S3AFileSystem implementation behind the s3a:// scheme. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-aws</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>


</dependencies>

<build>
<plugins>
<!-- Compile for Java 8. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.2</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<!-- Attach a -sources jar on package. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>2.2.1</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Record default Implementation-* entries in the jar manifest. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>2.4</version>
<configuration>
<archive>
<manifest>
<addDefaultImplementationEntries>true</addDefaultImplementationEntries>
</manifest>
</archive>
</configuration>
</plugin>
</plugins>
</build>

</project>

Create a configuration file (e.g. s3-site.xml):

<configuration xmlns:xi="http://www.w3.org/2001/XInclude">

<!-- S3A connector configuration for reading from Ceph's S3-compatible
     gateway.
     SECURITY: never publish real credentials. The original article embedded
     real-looking access/secret keys; they are replaced with placeholders
     below and any leaked keys must be rotated. Prefer a Hadoop credential
     provider (jceks:// keystore via `hadoop credential create`) over
     plaintext keys in a site file. -->

<property>
<name>fs.s3a.access.key</name>
<value>YOUR_ACCESS_KEY_ID</value>
<description>AWS access key ID. Omit for Role-based authentication.</description>
</property>

<property>
<name>fs.s3a.secret.key</name>
<value>YOUR_SECRET_ACCESS_KEY</value>
<description>AWS secret key</description>
</property>

<property>
<name>fs.s3a.endpoint</name>
<value>10.240.183.4</value>
<!-- NOTE(review): the Ceph RADOS Gateway often listens on a non-default
     port, so an explicit host:port (e.g. 10.240.183.4:7480) may be
     required here - confirm against the gateway's configuration. -->
<description>AWS S3 endpoint to connect to. An up-to-date list is
provided in the AWS Documentation: regions and endpoints. Without this
property, the standard region (s3.amazonaws.com) is assumed.
</description>
</property>

<property>
<name>central.endpoint</name>
<value>10.240.183.4</value>
</property>

<!-- Static key-pair authentication using the two properties above. -->
<property>
<name>fs.s3a.aws.credentials.provider</name>
<value>org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider</value>
</property>
<!-- Plain HTTP to the internal gateway; enable SSL on untrusted networks. -->
<property>
<name>fs.s3a.connection.ssl.enabled</name>
<value>false</value>
<description>Enables or disables SSL connections to S3.</description>
</property>

</configuration>

Create CephClient.java

package vn.aic.ceph.client.main;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;
import java.net.URI;

public class CephClient {

    /**
     * Lists the objects stored under a prefix of an S3-compatible (Ceph)
     * bucket using the Hadoop S3A filesystem.
     *
     * <p>Usage: {@code CephClient <s3-site.xml path> <bucket> [prefix]}
     * The optional third argument generalizes the previously hard-coded
     * listing prefix {@code 2019/07/17/14}, which remains the default so
     * existing invocations behave identically.
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            System.err.println("Usage: CephClient <s3-site.xml path> <bucket> [prefix]");
            System.exit(1);
        }
        String s3Conf = args[0];
        String bucket = args[1];
        String prefix = args.length > 2 ? args[2] : "2019/07/17/14";

        String s3uri = "s3a://" + bucket;
        // println, not printf: the original passed a user-derived string as a
        // printf FORMAT string, so a '%' in the bucket name would throw
        // UnknownFormatConversionException.
        System.out.println(s3uri);

        Configuration conf = new Configuration();
        // Load the S3A credentials/endpoint settings supplied on the command line.
        conf.addResource(new Path("file:///" + s3Conf));

        // try-with-resources closes the FileSystem; the original leaked it.
        try (FileSystem s3fs = FileSystem.get(URI.create(s3uri), conf)) {
            FileStatus[] statuses = s3fs.listStatus(new Path(s3uri + "/" + prefix));
            for (FileStatus status : statuses) {
                System.out.println(status);
            }
        } catch (IOException e) {
            // Best-effort CLI tool: report and exit non-zero rather than
            // silently succeeding.
            e.printStackTrace();
            System.exit(1);
        }
    }
}

--

--

DPBD90
DPBD90

Written by DPBD90

I'm an engineer. I love to work on data and open-source systems.

No responses yet