Add support for merges in ES93BloomFilterStoredFieldsFormat#137622
Add support for merges in ES93BloomFilterStoredFieldsFormat#137622fcofdez merged 10 commits intoelastic:mainfrom
Conversation
|
|
||
| private void addToBloomFilter(FieldInfo info, BytesRef value) { | ||
| private int getBloomFilterSizeInBits() { | ||
| var bloomFilterSizeInBits = defaultBloomFilterSizeInBitsSupplier.getAsInt(); |
There was a problem hiding this comment.
| var bloomFilterSizeInBits = defaultBloomFilterSizeInBitsSupplier.getAsInt(); | |
| int bloomFilterSizeInBits = defaultBloomFilterSizeInBitsSupplier.getAsInt(); |
| ); | ||
| } | ||
|
|
||
| private void addToBloomFilter(FieldInfo info, BytesRef value) { |
There was a problem hiding this comment.
nit: maybe just pass the value? We're checking beforehand the field info.
| this.bitSetSizeInBytes = bitsetSizeInBits / Byte.SIZE; | ||
| this.buffer = bigArrays.newByteArray(bitSetSizeInBytes, false); | ||
| this.hashes = new int[numHashFunctions]; | ||
| this.bloomFilterDataOut = directory.createOutput(bloomFilterFileName(segmentInfo, segmentSuffix), context); |
There was a problem hiding this comment.
nit: maybe ensure bloomFilterDataOut is closed if writing the header fails?
| buffer.set(pos, val); | ||
| } | ||
| maybeInitializeBloomFilterWriter(info, getBloomFilterSizeInBits()); | ||
| bloomFilterWriter.addToBloomFilter(info, value); |
There was a problem hiding this comment.
nit: I would rename to:
| bloomFilterWriter.addToBloomFilter(info, value); | |
| bloomFilterWriter.add(value); |
| @Override | ||
| public int merge(MergeState mergeState) throws IOException { | ||
| // Skip merging the bloom filter for now | ||
| if (canOrBloomFilters(mergeState)) { |
There was a problem hiding this comment.
nit:
| if (canOrBloomFilters(mergeState)) { | |
| if (useOptimizedMerge(mergeState)) { |
| } | ||
| } | ||
|
|
||
| private void mergeBloomFiltersWithOr(MergeState mergeState) throws IOException { |
There was a problem hiding this comment.
| private void mergeBloomFiltersWithOr(MergeState mergeState) throws IOException { | |
| private void mergeOptimized(MergeState mergeState) throws IOException { |
|
|
||
| for (int readerIdx = 0; readerIdx < mergeState.storedFieldsReaders.length; readerIdx++) { | ||
| StoredFieldsReader storedFieldsReader = mergeState.storedFieldsReaders[readerIdx]; | ||
| if (storedFieldsReader instanceof Reader reader) { |
| bloomFilterFieldReader.checkIntegrity(); | ||
| IndexInput bloomFilterData = bloomFilterFieldReader.bloomFilterData; | ||
|
|
||
| bloomFilterData.prefetch(0, bitSetSizeInBytes); |
| final FieldsProducer f = mergeState.fieldsProducers[readerIndex]; | ||
|
|
||
| final int maxDoc = mergeState.maxDocs[readerIndex]; | ||
| if (f != null) { |
There was a problem hiding this comment.
In theory yes, if the segment doesn't have terms stored.
| final int maxDoc = mergeState.maxDocs[readerIndex]; | ||
| if (f != null) { | ||
| // This can be somewhat expensive and we'll check the integrity again while merging the terms, should we skip it? | ||
| f.checkIntegrity(); |
There was a problem hiding this comment.
I think it's OK to keep it: it should always be used with the synthetic _id postings formats which does nothing in checkIntegrity
|
Pinging @elastic/es-storage-engine (Team:StorageEngine) |
|
Hi @fcofdez, I've created a changelog YAML for you. |
kkrik-es
left a comment
There was a problem hiding this comment.
Please wait for @martijnvg to approve.
|
@elasticmachine update branch |
|
@elasticmachine update branch |
|
@elasticmachine update branch |
|
@elasticmachine update branch |
No description provided.