Skip to main content
GET /api/documents/variants/{variant_id}/chunks
# Retrieve the first 50 chunks of variant-001.
# The URL is quoted on purpose: an unquoted "&" ends the command at that
# point, so curl would run in the background without the offset parameter
# and without the Authorization header.
curl "https://api.raptordata.dev/api/documents/variants/variant-001/chunks?limit=50&offset=0" \
  -H "Authorization: Bearer rd_live_xxx"
{
  "chunks": [
    {
      "id": "chunk-001",
      "text": "This Agreement is entered into as of January 1, 2024...",
      "page_number": 1,
      "page_range": [1],
      "section_hierarchy": ["Contract Terms", "Effective Date"],
      "tokens": 512,
      "chunk_index": 0,
      "metadata": {},
      "chunk_type": "text",
      "contains_table": false,
      "table_metadata": null,
      "chunking_strategy": "semantic",
      "section_number": "1.1",
      "quality_score": 0.92,
      "synthetic_context": null,
      "parent_chunk_id": null,
      "bounding_box": null,
      "dedup_strategy": "high_reuse",
      "dedup_confidence": 0.95,
      "is_reused": false,
      "dedup_source_chunk_id": "chunk-parent-001",
      "total_sentences": 8,
      "reused_sentences_count": 7,
      "new_sentences_count": 1,
      "content_reuse_ratio": 0.875,
      "embedding_recommendation": "consider_reuse",
      "recommendation_confidence": "high",
      "dedup_metadata": null
    }
  ],
  "total": 47,
  "limit": 50,
  "offset": 0
}

Variants API

Processing variants represent different chunking configurations of the same document version. This API lets you retrieve variant chunks and deduplication information.

Get Variant Chunks

Retrieve chunks from a specific processing variant with pagination.

Path Parameters

variant_id
string
required
Variant ID

Query Parameters

limit
integer
default:"100"
Maximum chunks to return (1-1000)
offset
integer
default:"0"
Offset for pagination
include_full_metadata
boolean
default:"false"
Include full deduplication metadata JSONB (can be large)

Response

chunks
array
Array of chunk objects
total
integer
Total number of chunks available
limit
integer
Limit used for this request
offset
integer
Offset used for this request

Chunk Object

id
string
Unique chunk ID
text
string
Chunk text content
page_number
integer
Page number (1-indexed, null if not applicable)
page_range
array
Array of page numbers this chunk spans
section_hierarchy
array
Nested section names (e.g., [“Chapter 1”, “Section 1.1”])
tokens
integer
Token count for this chunk
chunk_index
integer
Zero-indexed position in document
metadata
object
Additional metadata
chunk_type
string
Type of chunk: text, table, image, code
contains_table
boolean
Whether chunk contains a table
table_metadata
object
Table extraction metadata
chunking_strategy
string
Strategy used: semantic or recursive
section_number
string
Extracted section number (e.g., “1.2.3”)
quality_score
number
Quality score (0.0-1.0)
synthetic_context
string
AI-generated context for orphan tables
parent_chunk_id
string
Parent chunk ID if hierarchical
bounding_box
object
Bounding box coordinates for images

Deduplication Fields

dedup_strategy
string
Deduplication strategy: exact, high_reuse, partial_reuse, mixed_content, fuzzy, new
dedup_confidence
number
Similarity score (0.0-1.0)
is_reused
boolean
Whether chunk was exactly reused from parent version
dedup_source_chunk_id
string
Source chunk ID if deduplicated
total_sentences
integer
Total sentences in chunk
reused_sentences_count
integer
Number of sentences reused from parent
new_sentences_count
integer
Number of new sentences
content_reuse_ratio
number
Ratio of sentences reused (0.0-1.0)
embedding_recommendation
string
Recommendation: reuse, consider_reuse, regenerate
recommendation_confidence
string
Confidence level: high, medium, low
dedup_metadata
object
Full deduplication JSONB (only if include_full_metadata=true)
{
  "chunks": [
    {
      "id": "chunk-001",
      "text": "This Agreement is entered into as of January 1, 2024...",
      "page_number": 1,
      "page_range": [1],
      "section_hierarchy": ["Contract Terms", "Effective Date"],
      "tokens": 512,
      "chunk_index": 0,
      "metadata": {},
      "chunk_type": "text",
      "contains_table": false,
      "table_metadata": null,
      "chunking_strategy": "semantic",
      "section_number": "1.1",
      "quality_score": 0.92,
      "synthetic_context": null,
      "parent_chunk_id": null,
      "bounding_box": null,
      "dedup_strategy": "high_reuse",
      "dedup_confidence": 0.95,
      "is_reused": false,
      "dedup_source_chunk_id": "chunk-parent-001",
      "total_sentences": 8,
      "reused_sentences_count": 7,
      "new_sentences_count": 1,
      "content_reuse_ratio": 0.875,
      "embedding_recommendation": "consider_reuse",
      "recommendation_confidence": "high",
      "dedup_metadata": null
    }
  ],
  "total": 47,
  "limit": 50,
  "offset": 0
}
# Retrieve the first 50 chunks of variant-001.
# The URL is quoted on purpose: an unquoted "&" ends the command at that
# point, so curl would run in the background without the offset parameter
# and without the Authorization header.
curl "https://api.raptordata.dev/api/documents/variants/variant-001/chunks?limit=50&offset=0" \
  -H "Authorization: Bearer rd_live_xxx"

Get Deduplication Summary

Get aggregate deduplication statistics for a variant.

Path Parameters

variant_id
string
required
Variant ID

Response

variant_id
string
Variant ID
total_chunks
integer
Total number of chunks
chunk_breakdown
object
Breakdown by deduplication strategy
total_sentences
integer
Total sentences across all chunks
reused_sentences
integer
Number of sentences reused from parent
new_sentences
integer
Number of new sentences
sentence_reuse_ratio
number
Ratio of sentences reused (0.0-1.0)
embedding_recommendations
object
Breakdown of embedding recommendations
parent_version_id
string
Parent version ID (null if no parent)
has_parent
boolean
Whether variant has a parent version
{
  "variant_id": "variant-001",
  "total_chunks": 47,
  "chunk_breakdown": {
    "exact": 15,
    "high_reuse": 18,
    "partial_reuse": 8,
    "new": 6
  },
  "total_sentences": 376,
  "reused_sentences": 298,
  "new_sentences": 78,
  "sentence_reuse_ratio": 0.79,
  "embedding_recommendations": {
    "reuse": 15,
    "consider_reuse": 18,
    "regenerate": 14
  },
  "parent_version_id": "version-000",
  "has_parent": true
}
# Request the aggregate deduplication statistics for variant-001.
curl --header "Authorization: Bearer rd_live_xxx" \
  "https://api.raptordata.dev/api/documents/variants/variant-001/dedup-summary"

Pagination Example

Retrieve all chunks with pagination:
/**
 * Collect every chunk of a processing variant by requesting pages from
 * `raptor.getChunks` until the reported `total` has been reached.
 *
 * @param variantId - ID of the variant whose chunks are retrieved.
 * @returns All chunks, in the order the pages returned them.
 */
async function getAllChunks(variantId: string) {
  const allChunks = [];
  const limit = 100;
  let offset = 0;

  while (true) {
    const { chunks, total } = await raptor.getChunks(variantId, {
      limit,
      offset
    });

    allChunks.push(...chunks);

    // Advance by the number of chunks actually received, not by `limit`:
    // a page shorter than requested would otherwise leave a gap and
    // silently skip chunks.
    offset += chunks.length;

    // An empty page means there is nothing left to read even if `total`
    // has not been reached; stopping here keeps the loop from spinning.
    if (chunks.length === 0 || offset >= total) {
      break;
    }
  }

  return allChunks;
}

// Usage: load every chunk of the variant, then report how many came back.
const retrievedChunks = await getAllChunks('variant-001');
console.log(`Retrieved ${retrievedChunks.length} chunks`);

Filter by Quality

Get only high-quality chunks:
// Request a single page of up to 1000 chunks, without the full
// deduplication metadata.
const chunkPage = await raptor.getChunks('variant-001', {
  limit: 1000,
  includeFullMetadata: false
});
const { chunks } = chunkPage;

// A chunk qualifies when it has a quality score and that score is at
// least 0.8; chunks with a missing score are excluded.
const isHighQuality = ({ quality_score: score }) =>
  Boolean(score) && score >= 0.8;

const highQuality = chunks.filter(isHighQuality);

console.log(`${highQuality.length} high-quality chunks`);

Analyze Reuse

Analyze content reuse patterns:
// Load the aggregate deduplication statistics for the variant.
const summary = await raptor.getDedupSummary('variant-001');

// Per-strategy chunk counts, each shown with its share of all chunks.
console.log('Chunk-level reuse:');
for (const [strategy, count] of Object.entries(summary.chunk_breakdown)) {
  const share = count / summary.total_chunks * 100;
  const percent = share.toFixed(1);
  console.log(`  ${strategy}: ${count} chunks (${percent}%)`);
}

// Sentence totals and the overall reuse ratio expressed as a percentage.
const reusePercent = (summary.sentence_reuse_ratio * 100).toFixed(1);
console.log('\nSentence-level reuse:');
console.log(`  Reused: ${summary.reused_sentences} sentences`);
console.log(`  New: ${summary.new_sentences} sentences`);
console.log(`  Ratio: ${reusePercent}%`);

// How many chunks fall under each embedding recommendation.
console.log('\nEmbedding recommendations:');
for (const [rec, count] of Object.entries(summary.embedding_recommendations)) {
  console.log(`  ${rec}: ${count} chunks`);
}

Working with Tables

Extract table chunks:
// Request a single page of up to 1000 chunks for the variant.
const chunkPage = await raptor.getChunks('variant-001', {
  limit: 1000
});
const { chunks } = chunkPage;

// Keep only the chunks flagged as containing a table.
const tableChunks = chunks.filter(({ contains_table }) => contains_table);

for (const tableChunk of tableChunks) {
  const {
    page_number: pageNumber,
    table_metadata: tableMetadata,
    synthetic_context: syntheticContext,
    text
  } = tableChunk;

  console.log(`Table on page ${pageNumber}:`);

  // Table metadata may be null, so the dimensions are printed only when present.
  if (tableMetadata) {
    console.log(`  Rows: ${tableMetadata.rows}`);
    console.log(`  Columns: ${tableMetadata.columns}`);
  }

  // Synthetic context may be null, so it is printed only when present.
  if (syntheticContext) {
    console.log(`  Context: ${syntheticContext}`);
  }

  // Preview at most the first 200 characters of the chunk text.
  console.log(`  Text: ${text.slice(0, 200)}...`);
}

Error Responses

404
Not Found
Variant not found or doesn’t belong to user
400
Bad Request
Invalid pagination parameters (e.g., limit > 1000)
{
  "detail": "Variant not found",
  "status_code": 404
}