Resource: awsKendraDataSource
Terraform resource for managing an AWS Kendra Data Source.
Example Usage
Basic Usage
/*Provider bindings are generated by running cdktf get.
See https://cdk.tf/provider-generation for more details.*/
import * as aws from "./.gen/providers/aws";
// Minimal data source of type CUSTOM. No roleArn or configuration is passed:
// the argument reference states neither may be specified for this type.
const basicDataSourceProps = {
  description: "example",
  indexId: "${aws_kendra_index.example.id}",
  languageCode: "en",
  name: "example",
  tags: { hello: "world" },
  type: "CUSTOM",
};
new aws.kendraDataSource.KendraDataSource(
  this,
  "example",
  basicDataSourceProps
);
S3 Connector
With Schedule
/*Provider bindings are generated by running cdktf get.
See https://cdk.tf/provider-generation for more details.*/
import * as aws from "./.gen/providers/aws";
// S3 connector whose index sync runs on the given schedule expression.
const scheduledS3Configuration = {
  bucketName: "${aws_s3_bucket.example.id}",
};
new aws.kendraDataSource.KendraDataSource(this, "example", {
  configuration: { s3Configuration: scheduledS3Configuration },
  indexId: "${aws_kendra_index.example.id}",
  name: "example",
  roleArn: "${aws_iam_role.example.arn}",
  schedule: "cron(9 10 1 * ? *)",
  type: "S3",
});
With Access Control List
/*Provider bindings are generated by running cdktf get.
See https://cdk.tf/provider-generation for more details.*/
import * as aws from "./.gen/providers/aws";
// S3 connector with an access control list: keyPath points at the ACL files
// inside the same bucket that holds the documents.
const aclS3Configuration = {
  accessControlListConfiguration: {
    keyPath: "s3://${aws_s3_bucket.example.id}/path-1",
  },
  bucketName: "${aws_s3_bucket.example.id}",
};
new aws.kendraDataSource.KendraDataSource(this, "example", {
  configuration: { s3Configuration: aclS3Configuration },
  indexId: "${aws_kendra_index.example.id}",
  name: "example",
  roleArn: "${aws_iam_role.example.arn}",
  type: "S3",
});
With Documents Metadata Configuration
/*Provider bindings are generated by running cdktf get.
See https://cdk.tf/provider-generation for more details.*/
import * as aws from "./.gen/providers/aws";
// S3 connector with inclusion/exclusion filters and a metadata-file prefix.
new aws.kendraDataSource.KendraDataSource(this, "example", {
  configuration: {
    s3Configuration: {
      bucketName: "${aws_s3_bucket.example.id}",
      // s3Prefix is an argument of the documentsMetadataConfiguration block,
      // not of s3Configuration itself, so it must be nested here.
      documentsMetadataConfiguration: {
        s3Prefix: "example",
      },
      exclusionPatterns: ["example"],
      inclusionPatterns: ["hello"],
      inclusionPrefixes: ["world"],
    },
  },
  indexId: "${aws_kendra_index.example.id}",
  name: "example",
  roleArn: "${aws_iam_role.example.arn}",
  type: "S3",
});
Web Crawler Connector
With Seed URLs
/*Provider bindings are generated by running cdktf get.
See https://cdk.tf/provider-generation for more details.*/
import * as aws from "./.gen/providers/aws";
// Web crawler that starts from a list of seed URLs.
const seedUrlCrawlerConfiguration = {
  urls: {
    seedUrlConfiguration: { seedUrls: ["REPLACE_WITH_YOUR_URL"] },
  },
};
new aws.kendraDataSource.KendraDataSource(this, "example", {
  configuration: { webCrawlerConfiguration: seedUrlCrawlerConfiguration },
  indexId: "${aws_kendra_index.example.id}",
  name: "example",
  roleArn: "${aws_iam_role.example.arn}",
  type: "WEBCRAWLER",
});
With Site Maps
/*Provider bindings are generated by running cdktf get.
See https://cdk.tf/provider-generation for more details.*/
import * as aws from "./.gen/providers/aws";
// Web crawler driven by sitemap URLs instead of seed URLs.
const siteMapCrawlerConfiguration = {
  urls: {
    siteMapsConfiguration: { siteMaps: ["REPLACE_WITH_YOUR_URL"] },
  },
};
new aws.kendraDataSource.KendraDataSource(this, "example", {
  configuration: { webCrawlerConfiguration: siteMapCrawlerConfiguration },
  indexId: "${aws_kendra_index.example.id}",
  name: "example",
  roleArn: "${aws_iam_role.example.arn}",
  type: "WEBCRAWLER",
});
With Web Crawler Mode
/*Provider bindings are generated by running cdktf get.
See https://cdk.tf/provider-generation for more details.*/
import * as aws from "./.gen/providers/aws";
// Web crawler with an explicit crawl mode set on the seed URL configuration.
const subdomainSeedUrls = {
  seedUrls: ["REPLACE_WITH_YOUR_URL"],
  webCrawlerMode: "SUBDOMAINS",
};
new aws.kendraDataSource.KendraDataSource(this, "example", {
  configuration: {
    webCrawlerConfiguration: {
      urls: { seedUrlConfiguration: subdomainSeedUrls },
    },
  },
  indexId: "${aws_kendra_index.example.id}",
  name: "example",
  roleArn: "${aws_iam_role.example.arn}",
  type: "WEBCRAWLER",
});
With Authentication Configuration
/*Provider bindings are generated by running cdktf get.
See https://cdk.tf/provider-generation for more details.*/
import * as aws from "./.gen/providers/aws";
// Web crawler that authenticates against a host with basic authentication;
// the credentials live in an AWS Secrets Manager secret.
const authenticatedCrawler = new aws.kendraDataSource.KendraDataSource(
  this,
  "example",
  {
    configuration: {
      webCrawlerConfiguration: {
        authenticationConfiguration: {
          basicAuthentication: [
            {
              credentials: "${aws_secretsmanager_secret.example.arn}",
              host: "a.example.com",
              // The argument reference describes port as a port number.
              port: 443,
            },
          ],
        },
        urls: {
          seedUrlConfiguration: {
            seedUrls: ["REPLACE_WITH_YOUR_URL"],
          },
        },
      },
    },
    indexId: "${aws_kendra_index.example.id}",
    name: "example",
    roleArn: "${aws_iam_role.example.arn}",
    type: "WEBCRAWLER",
  }
);
// A snake_case `depends_on` key inside the config object is not a recognised
// property, so the dependency would be dropped. Because the secret version is
// referenced by address rather than as a construct, set the raw Terraform
// meta-argument through the escape hatch instead.
authenticatedCrawler.addOverride("depends_on", [
  "aws_secretsmanager_secret_version.example",
]);
With Crawl Depth
/*Provider bindings are generated by running cdktf get.
See https://cdk.tf/provider-generation for more details.*/
import * as aws from "./.gen/providers/aws";
// Web crawler limited to a fixed crawl depth from the seed URLs.
const depthLimitedCrawlerConfiguration = {
  crawlDepth: 3,
  urls: {
    seedUrlConfiguration: { seedUrls: ["REPLACE_WITH_YOUR_URL"] },
  },
};
new aws.kendraDataSource.KendraDataSource(this, "example", {
  configuration: { webCrawlerConfiguration: depthLimitedCrawlerConfiguration },
  indexId: "${aws_kendra_index.example.id}",
  name: "example",
  roleArn: "${aws_iam_role.example.arn}",
  type: "WEBCRAWLER",
});
With Max Links Per Page
/*Provider bindings are generated by running cdktf get.
See https://cdk.tf/provider-generation for more details.*/
import * as aws from "./.gen/providers/aws";
// Web crawler with a cap on how many links are followed per page.
const linkCappedCrawlerConfiguration = {
  maxLinksPerPage: 100,
  urls: {
    seedUrlConfiguration: { seedUrls: ["REPLACE_WITH_YOUR_URL"] },
  },
};
new aws.kendraDataSource.KendraDataSource(this, "example", {
  configuration: { webCrawlerConfiguration: linkCappedCrawlerConfiguration },
  indexId: "${aws_kendra_index.example.id}",
  name: "example",
  roleArn: "${aws_iam_role.example.arn}",
  type: "WEBCRAWLER",
});
With Max Urls Per Minute Crawl Rate
/*Provider bindings are generated by running cdktf get.
See https://cdk.tf/provider-generation for more details.*/
import * as aws from "./.gen/providers/aws";
// Web crawler with an explicit per-host, per-minute crawl rate.
const rateLimitedCrawlerConfiguration = {
  maxUrlsPerMinuteCrawlRate: 300,
  urls: {
    seedUrlConfiguration: { seedUrls: ["REPLACE_WITH_YOUR_URL"] },
  },
};
new aws.kendraDataSource.KendraDataSource(this, "example", {
  configuration: { webCrawlerConfiguration: rateLimitedCrawlerConfiguration },
  indexId: "${aws_kendra_index.example.id}",
  name: "example",
  roleArn: "${aws_iam_role.example.arn}",
  type: "WEBCRAWLER",
});
With Proxy Configuration
/*Provider bindings are generated by running cdktf get.
See https://cdk.tf/provider-generation for more details.*/
import * as aws from "./.gen/providers/aws";
// Web crawler that reaches its targets through a web proxy; the optional proxy
// credentials live in an AWS Secrets Manager secret.
const proxiedCrawler = new aws.kendraDataSource.KendraDataSource(
  this,
  "example",
  {
    configuration: {
      webCrawlerConfiguration: {
        proxyConfiguration: {
          credentials: "${aws_secretsmanager_secret.example.arn}",
          host: "a.example.com",
          // The argument reference describes port as a port number.
          port: 443,
        },
        urls: {
          seedUrlConfiguration: {
            seedUrls: ["REPLACE_WITH_YOUR_URL"],
          },
        },
      },
    },
    indexId: "${aws_kendra_index.example.id}",
    name: "example",
    roleArn: "${aws_iam_role.example.arn}",
    type: "WEBCRAWLER",
  }
);
// A snake_case `depends_on` key inside the config object is not a recognised
// property, so the dependency would be dropped. Because the secret version is
// referenced by address rather than as a construct, set the raw Terraform
// meta-argument through the escape hatch instead.
proxiedCrawler.addOverride("depends_on", [
  "aws_secretsmanager_secret_version.example",
]);
With URL Exclusion and Inclusion Patterns
/*Provider bindings are generated by running cdktf get.
See https://cdk.tf/provider-generation for more details.*/
import * as aws from "./.gen/providers/aws";
// Web crawler with URL filters. Per the argument reference, an exclusion
// match takes precedence when a URL matches both lists.
const filteredCrawlerConfiguration = {
  urlExclusionPatterns: ["example"],
  urlInclusionPatterns: ["hello"],
  urls: {
    seedUrlConfiguration: { seedUrls: ["REPLACE_WITH_YOUR_URL"] },
  },
};
new aws.kendraDataSource.KendraDataSource(this, "example", {
  configuration: { webCrawlerConfiguration: filteredCrawlerConfiguration },
  indexId: "${aws_kendra_index.example.id}",
  name: "example",
  roleArn: "${aws_iam_role.example.arn}",
  type: "WEBCRAWLER",
});
Argument Reference
The following arguments are required:
indexId
- (Required, Forces new resource) The identifier of the index for your Amazon Kendra data source.
name
- (Required) A name for your Data Source connector.
roleArn
- (Required, Optional in one scenario) The Amazon Resource Name (ARN) of a role with permission to access the data source connector. For more information, see IAM roles for Amazon Kendra. You can't specify the roleArn parameter when the type parameter is set to CUSTOM. The roleArn parameter is required for all other data sources.
type
- (Required, Forces new resource) The type of data source repository. For an updated list of values, refer to Valid Values for Type.
The following arguments are optional:
configuration
- (Optional) A block with the configuration information to connect to your Data Source repository. You can't specify the configuration argument when the type parameter is set to CUSTOM. Detailed below.
customDocumentEnrichmentConfiguration
- (Optional) A block with the configuration information for altering document metadata and content during the document ingestion process. For more information on how to create, modify and delete document metadata, or make other content alterations when you ingest documents into Amazon Kendra, see Customizing document metadata during the ingestion process. Detailed below.
description
- (Optional) A description for the Data Source connector.
languageCode
- (Optional) The code for a language. This allows you to support a language for all documents when creating the Data Source connector. English is supported by default. For more information on supported languages, including their codes, see Adding documents in languages other than English.
schedule
- (Optional) Sets the frequency for Amazon Kendra to check the documents in your Data Source repository and update the index. If you don't set a schedule, Amazon Kendra will not periodically update the index. You can call the StartDataSourceSyncJob API to update the index.
tags
- (Optional) Key-value map of resource tags. If configured with a provider defaultTags configuration block present, tags with matching keys will overwrite those defined at the provider-level.
configuration
The configuration
configuration block supports the following arguments:
s3Configuration
- (Required if type is set to S3) A block that provides the configuration information to connect to an Amazon S3 bucket as your data source. Detailed below.
webCrawlerConfiguration
- (Required if type is set to WEBCRAWLER) A block that provides the configuration information required for Amazon Kendra Web Crawler. Detailed below.
s3Configuration
The s3Configuration
configuration block supports the following arguments:
accessControlListConfiguration
- (Optional) A block that provides the path to the S3 bucket that contains the user context filtering files for the data source. For the format of the file, see Access control for S3 data sources. Detailed below.bucketName
- (Required) The name of the bucket that contains the documents.documentsMetadataConfiguration
- (Optional) A block that defines the Document metadata files that contain information such as the document access control information, source URI, document author, and custom attributes. Each metadata file contains metadata about a single document. Detailed below.exclusionPatterns
- (Optional) A list of glob patterns for documents that should not be indexed. If a document that matches an inclusion prefix or inclusion pattern also matches an exclusion pattern, the document is not indexed. Refer to Exclusion Patterns for more examples.inclusionPatterns
- (Optional) A list of glob patterns for documents that should be indexed. If a document that matches an inclusion pattern also matches an exclusion pattern, the document is not indexed. Refer to Inclusion Patterns for more examples.inclusionPrefixes
- (Optional) A list of S3 prefixes for the documents that should be included in the index.
accessControlListConfiguration
The accessControlListConfiguration
configuration block supports the following arguments:
keyPath
- (Optional) Path to the AWS S3 bucket that contains the ACL files.
documentsMetadataConfiguration
The documentsMetadataConfiguration
configuration block supports the following arguments:
s3Prefix
- (Optional) A prefix used to filter metadata configuration files in the AWS S3 bucket. The S3 bucket might contain multiple metadata files. Uses3Prefix
to include only the desired metadata files.
webCrawlerConfiguration
The webCrawlerConfiguration
configuration block supports the following arguments:
authenticationConfiguration
- (Optional) A block with the configuration information required to connect to websites using authentication. You can connect to websites using basic authentication of user name and password. You use a secret in AWS Secrets Manager to store your authentication credentials. You must provide the website host name and port number. For example, the host name of https://a.example.com/page1.html is "a.example.com" and the port is 443, the standard port for HTTPS. Detailed below.
crawlDepth
- (Optional) Specifies the number of levels in a website that you want to crawl. The first level begins from the website seed or starting point URL. For example, if a website has 3 levels – index level (i.e. seed in this example), sections level, and subsections level – and you are only interested in crawling information up to the sections level (i.e. levels 0-1), you can set your depth to 1. The default crawl depth is set to2
. Minimum value of0
. Maximum value of10
.maxContentSizePerPageInMegaBytes
- (Optional) The maximum size (in MB) of a webpage or attachment to crawl. Files larger than this size (in MB) are skipped/not crawled. The default maximum size of a webpage or attachment is set to 50 MB. Minimum value of 1.0e-06. Maximum value of 50.
maxLinksPerPage
- (Optional) The maximum number of URLs on a webpage to include when crawling a website. This number is per webpage. As a website’s webpages are crawled, any URLs the webpages link to are also crawled. URLs on a webpage are crawled in order of appearance. The default maximum links per page is100
. Minimum value of1
. Maximum value of1000
.maxUrlsPerMinuteCrawlRate
- (Optional) The maximum number of URLs crawled per website host per minute. The default maximum number of URLs crawled per website host per minute is300
. Minimum value of1
. Maximum value of300
.proxyConfiguration
- (Optional) Configuration information required to connect to your internal websites via a web proxy. You must provide the website host name and port number. For example, the host name of https://a.example.com/page1.html is "a.example.com" and the port is 443, the standard port for HTTPS. Web proxy credentials are optional and you can use them to connect to a web proxy server that requires basic authentication. To store web proxy credentials, you use a secret in AWS Secrets Manager. Detailed below.
urlExclusionPatterns
- (Optional) A list of regular expression patterns to exclude certain URLs to crawl. URLs that match the patterns are excluded from the index. URLs that don't match the patterns are included in the index. If a URL matches both an inclusion and exclusion pattern, the exclusion pattern takes precedence and the URL file isn't included in the index. Array Members: Minimum number of0
items. Maximum number of100
items. Length Constraints: Minimum length of1
. Maximum length of150
.urlInclusionPatterns
- (Optional) A list of regular expression patterns to include certain URLs to crawl. URLs that match the patterns are included in the index. URLs that don't match the patterns are excluded from the index. If a URL matches both an inclusion and exclusion pattern, the exclusion pattern takes precedence and the URL file isn't included in the index. Array Members: Minimum number of0
items. Maximum number of100
items. Length Constraints: Minimum length of1
. Maximum length of150
.urls
- (Required) A block that specifies the seed or starting point URLs of the websites or the sitemap URLs of the websites you want to crawl. You can include website subdomains. You can list up to100
seed URLs and up to3
sitemap URLs. You can only crawl websites that use the secure communication protocol, Hypertext Transfer Protocol Secure (HTTPS). If you receive an error when crawling a website, it could be that the website is blocked from crawling. When selecting websites to index, you must adhere to the Amazon Acceptable Use Policy and all other Amazon terms. Remember that you must only use Amazon Kendra Web Crawler to index your own webpages, or webpages that you have authorization to index. Detailed below.
authenticationConfiguration
The authenticationConfiguration
configuration block supports the following arguments:
basicAuthentication
- (Optional) The list of configuration information that's required to connect to and crawl a website host using basic authentication credentials. The list includes the name and port number of the website host. Detailed below.
The basicAuthentication
configuration block supports the following arguments:
credentials
- (Required) Your secret ARN, which you can create in AWS Secrets Manager. You use a secret if basic authentication credentials are required to connect to a website. The secret stores your credentials of user name and password.
host
- (Required) The name of the website host you want to connect to using authentication credentials. For example, the host name of https://a.example.com/page1.html is "a.example.com".
port
- (Required) The port number of the website host you want to connect to using authentication credentials. For example, the port for https://a.example.com/page1.html is 443, the standard port for HTTPS.
proxyConfiguration
The proxyConfiguration
configuration block supports the following arguments:
credentials
- (Optional) Your secret ARN, which you can create in AWS Secrets Manager. The credentials are optional. You use a secret if web proxy credentials are required to connect to a website host. Amazon Kendra currently supports basic authentication to connect to a web proxy server. The secret stores your credentials.
host
- (Required) The name of the website host you want to connect to via a web proxy server. For example, the host name of https://a.example.com/page1.html is "a.example.com".
port
- (Required) The port number of the website host you want to connect to via a web proxy server. For example, the port for https://a.example.com/page1.html is 443, the standard port for HTTPS.
urls
The urls
configuration block supports the following arguments:
seedUrlConfiguration
- (Optional) A block that specifies the configuration of the seed or starting point URLs of the websites you want to crawl. You can choose to crawl only the website host names, or the website host names with subdomains, or the website host names with subdomains and other domains that the webpages link to. You can list up to100
seed URLs. Detailed below.siteMapsConfiguration
- (Optional) A block that specifies the configuration of the sitemap URLs of the websites you want to crawl. Only URLs belonging to the same website host names are crawled. You can list up to3
sitemap URLs. Detailed below.
The seedUrlConfiguration
configuration block supports the following arguments:
seedUrls
- (Required) The list of seed or starting point URLs of the websites you want to crawl. The list can include a maximum of100
seed URLs. Array Members: Minimum number of0
items. Maximum number of100
items. Length Constraints: Minimum length of1
. Maximum length of2048
.webCrawlerMode
- (Optional) The default mode is set to HOST_ONLY. You can choose one of the following modes:
HOST_ONLY – crawl only the website host names. For example, if the seed URL is "abc.example.com", then only URLs with host name "abc.example.com" are crawled.
SUBDOMAINS – crawl the website host names with subdomains. For example, if the seed URL is "abc.example.com", then "a.abc.example.com" and "b.abc.example.com" are also crawled.
EVERYTHING – crawl the website host names with subdomains and other domains that the webpages link to.
The siteMapsConfiguration
configuration block supports the following arguments:
siteMaps
- (Required) The list of sitemap URLs of the websites you want to crawl. The list can include a maximum of3
sitemap URLs.
customDocumentEnrichmentConfiguration
The customDocumentEnrichmentConfiguration
configuration block supports the following arguments:
inlineConfigurations
- (Optional) Configuration information to alter document attributes or metadata fields and content when ingesting documents into Amazon Kendra. Minimum number of0
items. Maximum number of100
items. Detailed below.postExtractionHookConfiguration
- (Optional) A block that specifies the configuration information for invoking a Lambda function in AWS Lambda on the structured documents with their metadata and text extracted. You can use a Lambda function to apply advanced logic for creating, modifying, or deleting document metadata and content. For more information, see Advanced data manipulation. Detailed below.preExtractionHookConfiguration
- (Optional) Configuration information for invoking a Lambda function in AWS Lambda on the original or raw documents before extracting their metadata and text. You can use a Lambda function to apply advanced logic for creating, modifying, or deleting document metadata and content. For more information, see Advanced data manipulation. Detailed below.roleArn
- (Optional) The Amazon Resource Name (ARN) of a role with permission to runpreExtractionHookConfiguration
andpostExtractionHookConfiguration
for altering document metadata and content during the document ingestion process. For more information, see IAM roles for Amazon Kendra.
inlineConfigurations
The inlineConfigurations
configuration block supports the following arguments:
condition
- (Optional) Configuration of the condition used for the target document attribute or metadata field when ingesting documents into Amazon Kendra. See Document Attribute Condition.documentContentDeletion
- (Optional)true
to delete content if the condition used for the target attribute is met.target
- (Optional) Configuration of the target document attribute or metadata field when ingesting documents into Amazon Kendra. You can also include a value. Detailed below.
target
The target
configuration block supports the following arguments:
targetDocumentAttributeKey
- (Optional) The identifier of the target document attribute or metadata field. For example, 'Department' could be an identifier for the target attribute or metadata field that includes the department names associated with the documents.targetDocumentAttributeValue
- (Optional) The target value you want to create for the target attribute. For example, 'Finance' could be the target value for the target attribute key 'Department'. See Document Attribute Value.targetDocumentAttributeValueDeletion
- (Optional)true
to delete the existing target value for your specified target attribute key. You cannot create a target value and set this totrue
. To create a target value (targetDocumentAttributeValue
), set this tofalse
.
hookConfiguration
The hookConfiguration
configuration block supports the following arguments:
invocationCondition
- (Optional) A block that specifies the condition used for when a Lambda function should be invoked. For example, you can specify a condition that if there are empty date-time values, then Amazon Kendra should invoke a function that inserts the current date-time. See Document Attribute Condition.lambdaArn
- (Required) The Amazon Resource Name (ARN) of a Lambda Function that can manipulate your document metadata fields or attributes and content.s3Bucket
- (Required) Stores the original, raw documents or the structured, parsed documents before and after altering them. For more information, see Data contracts for Lambda functions.
Document Attribute Condition
The condition
and invocationCondition
configuration blocks support the following arguments:
conditionDocumentAttributeKey
- (Required) The identifier of the document attribute used for the condition. For example, _source_uri could be an identifier for the attribute or metadata field that contains source URIs associated with the documents. Amazon Kendra currently does not support _document_body as an attribute key used for the condition.
conditionOnValue
- (Optional) The value used by the operator. For example, you can specify the value 'financial' for strings in the _source_uri field that partially match or contain this value. See Document Attribute Value.
operator
- (Required) The condition operator. For example, you can use Contains to partially match a string. Valid Values: GreaterThan | GreaterThanOrEquals | LessThan | LessThanOrEquals | Equals | NotEquals | Contains | NotContains | Exists | NotExists | BeginsWith.
Document Attribute Value
The conditionOnValue
and targetDocumentAttributeValue
configuration blocks support the following arguments:
dateValue
- (Optional) A date expressed as an ISO 8601 string. It is important for the time zone to be included in the ISO 8601 date-time format. As of this writing only UTC is supported. For example, 2012-03-25T12:30:10+00:00.
longValue
- (Optional) A long integer value.
stringListValue
- (Optional) A list of strings.
stringValue
- (Optional) A string, such as "department".
Attributes Reference
In addition to all arguments above, the following attributes are exported:
arn
- ARN of the Data Source.createdAt
- The Unix timestamp of when the Data Source was created.dataSourceId
- The unique identifiers of the Data Source.errorMessage
- When the Status field value is FAILED, the ErrorMessage field contains a description of the error that caused the Data Source to fail.
id
- The unique identifiers of the Data Source and index separated by a slash (/).
status
- The current status of the Data Source. When the status is ACTIVE the Data Source is ready to use. When the status is FAILED, the errorMessage field contains the reason that the Data Source failed.
updatedAt
- The Unix timestamp of when the Data Source was last updated.tagsAll
- A map of tags assigned to the resource, including those inherited from the providerdefaultTags
configuration block.
Timeouts
create
- (Default 30m)
update
- (Default 30m)
delete
- (Default 30m)
Import
Kendra Data Source can be imported using the unique identifiers of the data_source and index separated by a slash (/
) e.g.,