Closed
Commits
45 commits
8708856
add getContentSummary and prelim test
sumangala17 Dec 11, 2020
01aba06
remove gfs call
sumangala17 Dec 14, 2020
2876a8b
add tests
sumangala17 Dec 15, 2020
a9960da
pr draft
sumangala17 Dec 15, 2020
30cf195
checkstyle fix
sumangala17 Dec 16, 2020
95d1396
linkedBlockingQ + junit test fix
sumangala17 Dec 17, 2020
03d342c
linkedBlockingQ + junit test fix (#5)
sumangala-patki Dec 17, 2020
bb55b14
using executors
sumangala17 Dec 22, 2020
a9e94a9
using executors
sumangala17 Dec 22, 2020
06609da
run()->call(), terminate condition, add invalid path test
sumangala17 Dec 24, 2020
1433c85
pr revw + checkstyle
sumangala17 Dec 24, 2020
27b6007
Merge branch 'trunk' into HADOOP-17428
sumangala17 Dec 24, 2020
d747f06
findbugs use future returned
sumangala17 Dec 24, 2020
be2daf0
completion service + temp concurrency tests
sumangala17 Jan 5, 2021
96cd2b9
pr revw + exec test
sumangala17 Jan 7, 2021
e3eaca7
clean up
sumangala17 Jan 8, 2021
94a95df
minor changes
sumangala17 Jan 8, 2021
48d0607
rm thread test
sumangala17 Jan 9, 2021
a10be00
checkstyle
sumangala17 Jan 10, 2021
636b434
Merge branch 'trunk' into HADOOP-17428
sumangala17 Jan 10, 2021
bc276b2
revw changes + doc
sumangala17 Jan 12, 2021
744f8c4
javadoc
sumangala17 Jan 12, 2021
9070413
trigger yetus
sumangala17 Jan 12, 2021
657d7ea
Merge branch 'trunk' into HADOOP-17428
sumangala17 Feb 7, 2021
041d9bc
use listingsupport to abstract store
sumangala17 Feb 7, 2021
9c92338
merge
sumangala17 Feb 22, 2021
4be7b19
checkstyle
sumangala17 Feb 22, 2021
7a2e218
Merge branch 'trunk' into HADOOP-17428
sumangala17 Feb 22, 2021
d21b58a
import order
sumangala17 Feb 22, 2021
9b2723b
Merge branch 'trunk' into HADOOP-17428
sumangala17 Mar 31, 2021
2378431
log ex
sumangala17 Apr 19, 2021
fe71af1
Merge branch 'trunk' into HADOOP-17428
sumangala17 May 11, 2021
fa34b57
rm abfs cs
sumangala17 May 11, 2021
aa48086
test fix
sumangala17 May 12, 2021
f320785
clean up
sumangala17 May 12, 2021
be0e94c
merge with tc
sumangala17 Jul 5, 2021
2104268
Merge branch 'trunk' into HADOOP-17428
sumangala17 Aug 23, 2021
a718cbd
address revw comments
sumangala17 Aug 25, 2021
c9d65aa
review comments part 2: move executor->abfsStore
sumangala17 Aug 26, 2021
4003aff
Merge branch 'trunk' into HADOOP-17428
sumangala17 Sep 8, 2021
3039f7f
use iterator + rejected-ex handler
sumangala17 Sep 10, 2021
8259a2e
undo extra formatting
sumangala17 Sep 13, 2021
b64b492
more formatting
sumangala17 Sep 13, 2021
16d9436
format
sumangala17 Sep 13, 2021
137627d
fix merge conflict
sumangala17 Jan 10, 2022
@@ -37,6 +37,9 @@
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.ContentSummary.Builder;
import org.apache.hadoop.fs.azurebfs.services.ContentSummaryProcessor;
import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.thirdparty.com.google.common.base.Preconditions;
import org.slf4j.Logger;
@@ -363,6 +366,16 @@ public boolean delete(final Path f, final boolean recursive) throws IOException

}

@Override
public ContentSummary getContentSummary(Path f) throws IOException {
org.apache.hadoop.fs.azurebfs.utils.ContentSummary contentSummary =
(new ContentSummaryProcessor(abfsStore)).getContentSummary(f);
return new Builder().length(contentSummary.getLength())
.directoryCount(contentSummary.getDirectoryCount())
.fileCount(contentSummary.getFileCount())
.spaceConsumed(contentSummary.getSpaceConsumed()).build();
}

@Override
public FileStatus[] listStatus(final Path f) throws IOException {
LOG.debug(
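For reference, a minimal client-side usage sketch of the getContentSummary override added above. This is illustrative only and not part of the patch; the account, container, and path names are made up.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical usage; the abfs URI below is a placeholder, not a real account.
public class GetContentSummaryExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path root = new Path("abfs://container@account.dfs.core.windows.net/data");
    FileSystem fs = root.getFileSystem(conf);
    // Aggregates counts over the whole subtree rooted at the given path
    ContentSummary summary = fs.getContentSummary(root);
    System.out.println("files=" + summary.getFileCount()
        + ", dirs=" + summary.getDirectoryCount()
        + ", bytes=" + summary.getLength());
  }
}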
@@ -0,0 +1,102 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.fs.azurebfs.services;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore;
import org.apache.hadoop.fs.azurebfs.utils.ContentSummary;

import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;

/**
 * Traverses a directory tree with multiple threads and accumulates the file
 * count, directory count and total bytes needed to build a ContentSummary.
 */
public class ContentSummaryProcessor {
Contributor:
nit: javadocs

Contributor Author:
added description

private final AtomicLong fileCount = new AtomicLong(0L);
private final AtomicLong directoryCount = new AtomicLong(0L);
private final AtomicLong totalBytes = new AtomicLong(0L);
private final ProcessingQueue<FileStatus> queue = new ProcessingQueue<>();
private final AzureBlobFileSystemStore abfsStore;
private static final int NUM_THREADS = 16;

public ContentSummaryProcessor(AzureBlobFileSystemStore abfsStore) {
this.abfsStore = abfsStore;
}

public ContentSummary getContentSummary(Path path) throws IOException {
processDirectoryTree(path);
Thread[] threads = new Thread[NUM_THREADS];

for (int i = 0; i < NUM_THREADS; ++i) {
threads[i] = new Thread(new ContentSummaryProcessor.ThreadProcessor());
threads[i].start();
}

for (Thread t : threads) {
try {
t.join();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
return new ContentSummary(totalBytes.get(), directoryCount.get(),
fileCount.get(), totalBytes.get());
}

private void processDirectoryTree(Path path) throws IOException {
FileStatus[] fileStatuses = abfsStore.listStatus(path);
for (FileStatus fileStatus : fileStatuses) {
Contributor:
If you supported paged results, you could start queuing subdirectory work while still iterating through the list.

In directories with many pages of results that include directories, this could speed up processing, as subdirectory scanning could start as soon as the first page of results had been retrieved.

Contributor Author:
Yes, we should be able to incorporate that using the list iterator. Thanks, will make the change.

Contributor Author (sumangala-patki, Sep 10, 2021):
Trying to confirm the advantage of processing page-wise listStatus results; I would like to know your opinion. I analyzed the time taken by a direct listStatus call versus the list iterator (queueing subdirectories while iterating), but the results are ambiguous.

The tests involved creating a directory tree and calling getContentSummary on the top folder, since the primary use of this API is likely to be on the root of an account.

Expt 1: Directory tree with 12 levels (tree height = 12), where each level comprises one directory and 1-2 files.
Expt 2: Same 12-level structure as Expt 1, with a branch (of 2 subdirectory levels) around the mid-level, i.e., two subdirectories at level 5, each having a subdirectory. All directories in the tree have ~15 files.
Expt 3: Same as Expt 2, but with each directory containing more than 5000 files (so listStatus results are fetched in multiple pages).

The analysis was done for both lexicographical positions of the directory with respect to files at the same level, since that determines whether the directory is fetched first. The time taken was measured as the interval between the first ListStatus REST call and the DeleteFileSystem call (after the last ListStatus), which eliminates differences in file/directory creation time.

Expt number | Dir after files | Dir before files
1           | LS (few ms)     | LS
2           | LS (0.5s)       | Itr (8.7s)
3           | LS (3s)         | Itr (4.5s)

LS(t)  -> the normal direct ListStatus call was faster by t
Itr(t) -> the ListIterator approach was faster by t

Using the iterator seems beneficial in some scenarios, so I will go ahead with it.
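For illustration only, a rough sketch of the page-wise approach discussed above, assuming a lazily paged listing such as FileSystem#listStatusIterator. The class and method names here are hypothetical and not part of this patch.

import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.azurebfs.services.ProcessingQueue;

// Illustrative sketch: consume listing results page by page, so worker
// threads can start on a subdirectory before the remaining pages of the
// current listing have been fetched.
class PagedTreeWalkSketch {
  private final AtomicLong fileCount = new AtomicLong();
  private final AtomicLong directoryCount = new AtomicLong();
  private final AtomicLong totalBytes = new AtomicLong();

  void processDirectoryTree(FileSystem fs, Path path,
      ProcessingQueue<FileStatus> queue) throws IOException {
    // listStatusIterator fetches further pages lazily as the iterator advances
    RemoteIterator<FileStatus> it = fs.listStatusIterator(path);
    while (it.hasNext()) {
      FileStatus status = it.next();
      if (status.isDirectory()) {
        directoryCount.incrementAndGet();
        queue.add(status); // visible to worker threads immediately
      } else {
        fileCount.incrementAndGet();
        totalBytes.addAndGet(status.getLen());
      }
    }
  }
}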

if (fileStatus.isDirectory()) {
this.processDirectory();
this.queue.add(fileStatus);
} else {
this.processFile(fileStatus);
}
}
}

private void processDirectory() {
this.directoryCount.incrementAndGet();
}

private void processFile(FileStatus fileStatus) {
this.fileCount.incrementAndGet();
this.totalBytes.addAndGet(fileStatus.getLen());
}

private final class ThreadProcessor implements Runnable {
private ThreadProcessor() {
}

public void run() {
try {
FileStatus fileStatus;
while ((fileStatus = ContentSummaryProcessor.this.queue.poll())
!= null) {
if (fileStatus.isDirectory()) {
ContentSummaryProcessor.this
.processDirectoryTree(fileStatus.getPath());
}
ContentSummaryProcessor.this.queue.unregister();
}
} catch (IOException e) {
throw new RuntimeException("IOException processing Directory tree", e);
}
}
}
}
@@ -0,0 +1,80 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.fs.azurebfs.services;

import java.util.LinkedList;
import java.util.Queue;

/**
 * Thread-safe queue that also tracks how many items are currently being
 * processed, so consumers can tell when no work is pending or in flight.
 */
public class ProcessingQueue<T> {

private final Queue<T> internalQueue = new LinkedList<>();
// count of items handed out by poll() that have not yet been unregister()-ed
private int processorCount = 0;

ProcessingQueue() {
}

public synchronized void add(T item) {
if (item == null) {
throw new IllegalArgumentException("Cannot put null into queue");
} else {
this.internalQueue.add(item);
this.notifyAll();
}
}

public synchronized T poll() {
while (true) {
try {
if (this.isQueueEmpty() && !this.done()) {
this.wait();
continue;
}
if (!this.isQueueEmpty()) {
++this.processorCount;
return this.internalQueue.poll();
}
return null;
} catch (InterruptedException var2) {
Thread.currentThread().interrupt();
}
return null;
}
}

public synchronized void unregister() {
--this.processorCount;
if (this.processorCount < 0) {
throw new IllegalStateException(
"too many unregister()'s. processorCount is now "
+ this.processorCount);
} else {
if (this.done()) {
this.notifyAll();
}
}
}

private boolean done() {
return this.processorCount == 0 && this.isQueueEmpty();
}

private boolean isQueueEmpty() {
return this.internalQueue.peek() == null;
}
}
@@ -0,0 +1,49 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.fs.azurebfs.utils;

public class ContentSummary {
private final long length;
private final long directoryCount;
private final long fileCount;
private final long spaceConsumed;

public ContentSummary(long length, long directoryCount, long fileCount,
long spaceConsumed) {
this.length = length;
this.directoryCount = directoryCount;
this.fileCount = fileCount;
this.spaceConsumed = spaceConsumed;
}

public long getLength() {
return length;
}

public long getDirectoryCount() {
return directoryCount;
}

public long getFileCount() {
return fileCount;
}

public long getSpaceConsumed() {
return spaceConsumed;
}
}