Aggregate data automatically

Data pipelines automatically transform raw sensor readings into summaries and insights on a schedule that you choose. Viam stores the output of these pipelines in a separate, queryable database.

For example, you may often query the average temperature across multiple sensors for each hour of the day. To make these queries faster, you can use a data pipeline to pre-calculate the results, saving significant computational resources.

Prerequisites

While not a requirement, it is easier to test data pipelines if you have already enabled data capture from at least one component and begun syncing data with Viam before setting up a pipeline.

Only users with organization owner permissions can create a data pipeline.

Pipeline management

Create a pipeline

To define a data pipeline, specify a name, organization, schedule, data source type, and query:

Use datapipelines create to create your pipeline:

viam datapipelines create \
  --org-id=<org-id> \
  --name=sensor-counts \
  --schedule="0 * * * *" \
  --data-source-type="standard" \
  --mql='[{"$match": {"component_name": "sensor"}}, {"$group": {"_id": "$location_id", "avg_temp": {"$avg": "$data.readings.temperature"}, "count": {"$sum": 1}}}, {"$project": {"location": "$_id", "avg_temp": 1, "count": 1}}]'

To pass your query from a file instead of specifying it inline, use the --mql-path flag in place of --mql. To create a pipeline that reads data from the hot data store, specify --data-source-type=hotstorage.
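
For example, the following variant creates a pipeline that runs every 15 minutes against the hot data store, assuming your aggregation stages are saved in a local file named pipeline.json (a hypothetical filename):

viam datapipelines create \
  --org-id=<org-id> \
  --name=sensor-counts-hot \
  --schedule="*/15 * * * *" \
  --data-source-type=hotstorage \
  --mql-path=pipeline.json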

To define a new pipeline, call DataClient.CreateDataPipeline:

from viam import DataClient
import bson

data_client = DataClient.from_api_key(
    api_key="<api-key>",
    api_key_id="<api-key-id>"
)

pipeline_id = data_client.create_data_pipeline(
    organization_id="<org-id>",
    name="hourly-temp-average",
    mql_binary=[
        bson.encode({"$match": {"component_name": "temperature-sensor"}}),
        bson.encode({
            "$group": {
                "_id": "$location_id",
                "avg_temp": {"$avg": "$data.readings.temperature"},
                "count": {"$sum": 1}
            }
        }),
        bson.encode({
            "$project": {
                "location": "$_id",
                "avg_temp": 1,
                "count": 1
            }
        })
    ],
    schedule="0 * * * *"  # Run hourly
)
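
The call returns the new pipeline's ID; save it, since querying results and managing the pipeline later both reference it:

print(f"Created pipeline: {pipeline_id}")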

To create a pipeline that reads data from the hot data store, set the data_source_type parameter to TabularDataSourceType.TABULAR_DATA_SOURCE_TYPE_HOT_STORAGE.
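
A minimal sketch of that variant, assuming create_data_pipeline accepts a data_source_type keyword argument and that the enum is importable from the SDK's generated data protos; mql_stages stands in for the list of bson.encode(...) stages shown above:

from viam.proto.app.data import TabularDataSourceType

pipeline_id = data_client.create_data_pipeline(
    organization_id="<org-id>",
    name="hourly-temp-average-hot",
    mql_binary=mql_stages,  # the same BSON-encoded stages as above
    schedule="0 * * * *",
    # Assumed keyword: read from the hot data store instead of standard storage.
    data_source_type=TabularDataSourceType.TABULAR_DATA_SOURCE_TYPE_HOT_STORAGE,
)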

To define a new pipeline, call DataClient.CreateDataPipeline:

import (
    "context"
    "fmt"

    "go.mongodb.org/mongo-driver/bson"
    datapb "go.viam.com/api/app/data/v1"
    "go.viam.com/rdk/app/utils"
)

client, err := utils.NewViamClient(context.Background(), utils.WithAPIKey("<api-key>", "<api-key-id>"))
if err != nil {
    panic(err)
}
defer client.Close()

dataClient := client.DataClient()

// bson.Marshal returns (data, error), so wrap it in a small helper;
// panicking on a marshaling failure is acceptable in an example.
mustMarshal := func(doc bson.M) []byte {
    data, err := bson.Marshal(doc)
    if err != nil {
        panic(err)
    }
    return data
}

pipeline := [][]byte{
    mustMarshal(bson.M{"$match": bson.M{"component_name": "temperature-sensor"}}),
    mustMarshal(bson.M{
        "$group": bson.M{
            "_id":      "$location_id",
            "avg_temp": bson.M{"$avg": "$data.readings.temperature"},
            "count":    bson.M{"$sum": 1},
        },
    }),
    mustMarshal(bson.M{
        "$project": bson.M{
            "location": "$_id",
            "avg_temp": 1,
            "count":    1,
        },
    }),
}

resp, err := dataClient.CreateDataPipeline(context.Background(), &datapb.CreateDataPipelineRequest{
    OrganizationId: "<org-id>",
    Name:           "hourly-temp-average",
    MqlBinary:      pipeline,
    Schedule:       "0 * * * *", // Run hourly
})
if err != nil {
    panic(err)
}
fmt.Println("Created pipeline:", resp.GetId())

To create a pipeline that reads data from the hot data store, set the request's DataSourceType field to datapb.TabularDataSourceType_TABULAR_DATA_SOURCE_TYPE_HOT_STORAGE.

To define a new pipeline, call dataClient.CreateDataPipeline:

import { createViamClient } from "@viamrobotics/sdk";
import {
  TabularDataSource,
  TabularDataSourceType,
} from "@viamrobotics/sdk/dist/gen/app/data/v1/data_pb";
import { BSON } from "bson";

const apiKey = "<api-key>";
const apiKeyID = "<api-key-id>";

const client = await createViamClient({
  credential: {
    type: "api-key",
    payload: { key: apiKey, keyId: apiKeyID },
  },
});

const dataClient = client.dataClient;

const pipeline = [
  BSON.serialize({ $match: { component_name: "temperature-sensor" } }),
  BSON.serialize({
    $group: {
      _id: "$location_id",
      avg_temp: { $avg: "$data.readings.temperature" },
      count: { $sum: 1 },
    },
  }),
  BSON.serialize({
    $project: {
      location: "$_id",
      avg_temp: 1,
      count: 1,
    },
  }),
];

const response = await dataClient.createDataPipeline({
  organizationId: "<org-id>",
  name: "hourly-temp-average",
  mqlBinary: pipeline,
  schedule: "0 * * * *", // Run hourly
});

To create a pipeline that reads data from the hot data store, set the dataSourceType field to TabularDataSourceType.HOT_STORAGE.

Schedule format

To create a schedule for your pipeline, specify a cron expression in UTC. The schedule determines both the execution frequency and the range of time each execution queries. The following table lists some common schedules with their execution frequencies and query time ranges:

Schedule        Frequency           Query Time Range
0 * * * *       Hourly              Previous hour
0 0 * * *       Daily               Previous day
*/15 * * * *    Every 15 minutes    Previous 15 minutes
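
To make the query window concrete, the following plain-Python illustration (no SDK required) shows the window an hourly pipeline aggregates over, assuming a half-open window that ends at the run time:

from datetime import datetime, timedelta, timezone

# A pipeline scheduled with "0 * * * *" that fires at 14:00 UTC
# aggregates data captured during the previous hour: [13:00, 14:00).
run_time = datetime(2025, 1, 1, 14, 0, tzinfo=timezone.utc)
window_start = run_time - timedelta(hours=1)
print(f"query window: [{window_start:%H:%M}, {run_time:%H:%M})")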

Query limitations

Data pipeline queries only support a subset of MQL aggregation operators. For more information, see Supported aggregation operators.

Query pipeline results

To query the results of your data pipeline, call DataClient.TabularDataByMQL. Configure the data_source argument with the following fields:

  • type: TabularDataSourceType.TABULAR_DATA_SOURCE_TYPE_PIPELINE_SINK
  • pipeline_id: your pipeline ID
from viam import DataClient
from viam.proto.app.data import TabularDataSource, TabularDataSourceType
import bson

data_client = DataClient.from_api_key(
    api_key="<api-key>",
    api_key_id="<api-key-id>"
)

results = data_client.tabular_data_by_mql(
    organization_id="<org-id>",
    mql_binary=[
        bson.encode({"$match": {}}),  # an empty $match returns all documents
    ],
    data_source=TabularDataSource(
        type=TabularDataSourceType.TABULAR_DATA_SOURCE_TYPE_PIPELINE_SINK,
        pipeline_id="<pipeline-id>"
    )
)

for document in results:
    print(document)
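
Because mql_binary is an ordinary MQL aggregation pipeline, you can also post-process the pipeline's output in the same call using standard MongoDB stages, for example:

query = [
    bson.encode({"$match": {}}),         # all pipeline output
    bson.encode({"$sort": {"_id": 1}}),  # sort by group key
    bson.encode({"$limit": 10}),         # first 10 documents
]

Pass this list as the mql_binary argument in the call above.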

To query the results of your data pipeline, call DataClient.TabularDataByMQL. Configure the DataSource argument with the following fields:

  • Type: datapb.TabularDataSourceType_TABULAR_DATA_SOURCE_TYPE_PIPELINE_SINK
  • PipelineId: your pipeline’s ID
import (
    "context"
    "fmt"

    "go.mongodb.org/mongo-driver/bson"
    datapb "go.viam.com/api/app/data/v1"
    "go.viam.com/rdk/app/utils"
    "google.golang.org/protobuf/proto"
)

client, err := utils.NewViamClient(context.Background(), utils.WithAPIKey("<api-key>", "<api-key-id>"))
if err != nil {
    panic(err)
}
defer client.Close()

dataClient := client.DataClient()

// bson.Marshal returns (data, error); a $match on the empty document
// selects every document in the pipeline's output.
matchAllStage, err := bson.Marshal(bson.M{"$match": bson.M{}})
if err != nil {
    panic(err)
}

query := [][]byte{matchAllStage}

resp, err := dataClient.TabularDataByMQL(context.Background(), &datapb.TabularDataByMQLRequest{
    OrganizationId: "<org-id>",
    MqlBinary:      query,
    DataSource: &datapb.TabularDataSource{
        Type:       datapb.TabularDataSourceType_TABULAR_DATA_SOURCE_TYPE_PIPELINE_SINK,
        PipelineId: proto.String("<pipeline-id>"),
    },
})
if err != nil {
    panic(err)
}

for _, doc := range resp.Data {
    fmt.Println(doc)
}

To query the results of your data pipeline, call dataClient.tabularDataByMQL. Configure the dataSource argument with the following fields:

  • type: TabularDataSourceType.TABULAR_DATA_SOURCE_TYPE_PIPELINE_SINK
  • pipelineId: your pipeline’s ID
import { createViamClient } from "@viamrobotics/sdk";
import {
  TabularDataSource,
  TabularDataSourceType,
} from "@viamrobotics/sdk/dist/gen/app/data/v1/data_pb";
import { BSON } from "bson";

const apiKey = "<api-key>";
const apiKeyID = "<api-key-id>";

const client = await createViamClient({
  credential: {
    type: "api-key",
    payload: { key: apiKey, keyId: apiKeyID },
  },
});

const dataClient = client.dataClient;

const query = [BSON.serialize({ $match: {} })]; // an empty $match returns all documents

const response = await dataClient.tabularDataByMQL({
  organizationId: "<org-id>",
  mqlBinary: query,
  dataSource: new TabularDataSource({
    type: TabularDataSourceType.PIPELINE_SINK,
    pipelineId: "<pipeline-id>",
  }),
});

response.data.forEach((doc) => {
  console.log(BSON.deserialize(doc));
});

List pipelines

Use datapipelines list to fetch a list of pipeline configurations in an organization:

viam datapipelines list --org-id=<org-id>

Use DataClient.ListDataPipelines to fetch a list of pipeline configurations in an organization:

from viam import DataClient

data_client = DataClient.from_api_key(
    api_key="<api-key>",
    api_key_id="<api-key-id>"
)

pipelines = data_client.list_data_pipelines(organization_id="<org-id>")

for pipeline in pipelines:
    print(f"{pipeline.name} (id: {pipeline.id})")
    print(f"Schedule: {pipeline.schedule}")
    print(f"Enabled: {pipeline.enabled}")
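
Once listed, you can look up a specific pipeline client-side, for example to recover its ID by name. A short sketch using the fields printed above:

target = next((p for p in pipelines if p.name == "hourly-temp-average"), None)
if target is not None:
    print(f"Found pipeline ID: {target.id}")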

Use DataClient.ListDataPipelines to fetch a list of pipeline configurations in an organization:

import (
    "context"
    "fmt"

    datapb "go.viam.com/api/app/data/v1"
    "go.viam.com/rdk/app/utils"
)

client, err := utils.NewViamClient(context.Background(), utils.WithAPIKey("<api-key>", "<api-key-id>"))
if err != nil {
    panic(err)
}
defer client.Close()

dataClient := client.DataClient()

resp, err := dataClient.ListDataPipelines(context.Background(), &datapb.ListDataPipelinesRequest{
    OrganizationId: "<org-id>",
})
if err != nil {
    panic(err)
}

for _, pipeline := range resp.DataPipelines {
    fmt.Printf("%s (id: %s)\n", pipeline.Name, pipeline.Id)
    fmt.Printf("Schedule: %s\n", pipeline.Schedule)
    fmt.Printf("Enabled: %v\n", pipeline.Enabled)
}

Use dataClient.ListDataPipelines to fetch a list of pipeline configurations in an organization:

import { createViamClient } from "@viamrobotics/sdk";

const apiKey = "<api-key>";
const apiKeyID = "<api-key-id>";

const client = await createViamClient({
  credential: {
    type: "api-key",
    payload: { key: apiKey, keyId: apiKeyID },
  },
});

const dataClient = client.dataClient;

const response = await dataClient.listDataPipelines({
  organizationId: "<org-id>",
});

response.dataPipelines.forEach((pipeline) => {
  console.log(`${pipeline.name} (id: ${pipeline.id})`);
  console.log(`Schedule: ${pipeline.schedule}`);
  console.log(`Enabled: ${pipeline.enabled}`);
});

Update a pipeline

Use datapipelines update to alter an existing data pipeline:

viam datapipelines update \
  --org-id=<org-id> \
  --id=<pipeline-id> \
  --schedule="0 * * * *" \
  --name="updated-name" \
  --mql='[{"$match": {"component_name": "sensor"}}, {"$group": {"_id": "$part_id", "total": {"$sum": 1}}}, {"$project": {"part": "$_id", "total": 1}}]'

To pass your query from a file instead of inline MQL, use the --mql-path flag in place of --mql.
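
The Python SDK exposes a corresponding call; a minimal sketch, assuming DataClient.update_data_pipeline mirrors the create signature:

import bson

data_client.update_data_pipeline(
    id="<pipeline-id>",
    name="updated-name",
    mql_binary=[
        bson.encode({"$match": {"component_name": "sensor"}}),
        bson.encode({"$group": {"_id": "$part_id", "total": {"$sum": 1}}}),
        bson.encode({"$project": {"part": "$_id", "total": 1}}),
    ],
    schedule="0 * * * *",
)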

Use DataClient.UpdateDataPipeline to alter an existing data pipeline:

import (
    "context"

    "go.mongodb.org/mongo-driver/bson"
    datapb "go.viam.com/api/app/data/v1"
    "go.viam.com/rdk/app/utils"
)

client, err := utils.NewViamClient(context.Background(), utils.WithAPIKey("<api-key>", "<api-key-id>"))
if err != nil {
    panic(err)
}
defer client.Close()

dataClient := client.DataClient()

// bson.Marshal returns (data, error), so wrap it in a small helper;
// panicking on a marshaling failure is acceptable in an example.
mustMarshal := func(doc bson.M) []byte {
    data, err := bson.Marshal(doc)
    if err != nil {
        panic(err)
    }
    return data
}

pipeline := [][]byte{
    mustMarshal(bson.M{"$match": bson.M{"component_name": "sensor"}}),
    mustMarshal(bson.M{
        "$group": bson.M{
            "_id":   "$part_id",
            "total": bson.M{"$sum": 1},
        },
    }),
    mustMarshal(bson.M{
        "$project": bson.M{
            "part":  "$_id",
            "total": 1,
        },
    }),
}

_, err = dataClient.UpdateDataPipeline(context.Background(), &datapb.UpdateDataPipelineRequest{
    Id:        "<pipeline-id>",
    Name:      "updated-name",
    MqlBinary: pipeline,
    Schedule:  "0 * * * *",
})
if err != nil {
    panic(err)
}

Enable a pipeline

Use datapipelines enable to enable a disabled data pipeline:

viam datapipelines enable --id=<pipeline-id>

Use DataClient.EnableDataPipeline to enable a disabled data pipeline:

import (
    "context"

    datapb "go.viam.com/api/app/data/v1"
    "go.viam.com/rdk/app/utils"
)

client, err := utils.NewViamClient(context.Background(), utils.WithAPIKey("<api-key>", "<api-key-id>"))
if err != nil {
    panic(err)
}
defer client.Close()

dataClient := client.DataClient()

_, err = dataClient.EnableDataPipeline(context.Background(), &datapb.EnableDataPipelineRequest{
    Id: "<pipeline-id>",
})
if err != nil {
    panic(err)
}
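
The Python SDK has an equivalent call; a minimal sketch, assuming DataClient.enable_data_pipeline takes the pipeline's ID:

data_client.enable_data_pipeline(id="<pipeline-id>")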

Disable a pipeline

Disabling a data pipeline pauses execution without deleting the pipeline's configuration from your organization. The pipeline immediately stops aggregating data. You can re-enable the pipeline at any time to resume execution; Viam does not backfill the time windows missed while the pipeline was disabled.

Use datapipelines disable to disable a data pipeline:

viam datapipelines disable --id=<pipeline-id>

Use DataClient.DisableDataPipeline to disable a data pipeline:

import (
    "context"

    datapb "go.viam.com/api/app/data/v1"
    "go.viam.com/rdk/app/utils"
)

client, err := utils.NewViamClient(context.Background(), utils.WithAPIKey("<api-key>", "<api-key-id>"))
if err != nil {
    panic(err)
}
defer client.Close()

dataClient := client.DataClient()

_, err = dataClient.DisableDataPipeline(context.Background(), &datapb.DisableDataPipelineRequest{
    Id: "<pipeline-id>",
})
if err != nil {
    panic(err)
}
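
Likewise in Python, assuming DataClient.disable_data_pipeline takes the pipeline's ID:

data_client.disable_data_pipeline(id="<pipeline-id>")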

Delete a pipeline

Use datapipelines delete to delete a data pipeline, its execution history, and all output generated by that pipeline:

viam datapipelines delete --id=<pipeline-id>

Use DataClient.DeleteDataPipeline to delete a data pipeline:

from viam import DataClient

data_client = DataClient.from_api_key(
    api_key="<api-key>",
    api_key_id="<api-key-id>"
)

data_client.delete_data_pipeline(id="<pipeline-id>")

Use DataClient.DeleteDataPipeline to delete a data pipeline:

import (
    "context"

    datapb "go.viam.com/api/app/data/v1"
    "go.viam.com/rdk/app/utils"
)

client, err := utils.NewViamClient(context.Background(), utils.WithAPIKey("<api-key>", "<api-key-id>"))
if err != nil {
    panic(err)
}
defer client.Close()

dataClient := client.DataClient()

_, err = dataClient.DeleteDataPipeline(context.Background(), &datapb.DeleteDataPipelineRequest{
    Id: "<pipeline-id>",
})
if err != nil {
    panic(err)
}

Use dataClient.DeleteDataPipeline to delete a data pipeline:

import { createViamClient } from "@viamrobotics/sdk";

const apiKey = "<api-key>";
const apiKeyID = "<api-key-id>";

const client = await createViamClient({
  credential: {
    type: "api-key",
    payload: { key: apiKey, keyId: apiKeyID },
  },
});

const dataClient = client.dataClient;

await dataClient.deleteDataPipeline({ id: "<pipeline-id>" });

Check pipeline execution history

Data pipeline executions may have any of the following statuses:

  • SCHEDULED: pending execution
  • STARTED: currently processing
  • COMPLETED: successfully finished
  • FAILED: execution error

Use DataClient.ListDataPipelineRuns to view information about past and in-progress executions of a pipeline:

from viam import DataClient

data_client = DataClient.from_api_key(
    api_key="<api-key>",
    api_key_id="<api-key-id>"
)

runs_page = data_client.list_data_pipeline_runs(
    id="<pipeline-id>",
    page_size=10
)

for run in runs_page.runs:
    print(f"Run {run.id}: {run.status}")
    print(f"Data window: {run.data_start_time} to {run.data_end_time}")
    print(f"Started: {run.start_time}, Ended: {run.end_time}")
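
To act on run statuses programmatically, for example to surface failed executions, you can filter client-side. A short sketch, assuming status values compare equal to the status strings listed above:

failed_runs = [run for run in runs_page.runs if run.status == "FAILED"]
for run in failed_runs:
    print(f"Run {run.id} failed for window {run.data_start_time} to {run.data_end_time}")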

Use DataClient.ListDataPipelineRuns to view information about past executions of a pipeline:

import (
    "context"
    "fmt"

    datapb "go.viam.com/api/app/data/v1"
    "go.viam.com/rdk/app/utils"
)

client, err := utils.NewViamClient(context.Background(), utils.WithAPIKey("<api-key>", "<api-key-id>"))
if err != nil {
    panic(err)
}
defer client.Close()

dataClient := client.DataClient()

resp, err := dataClient.ListDataPipelineRuns(context.Background(), &datapb.ListDataPipelineRunsRequest{
    Id:       "<pipeline-id>",
    PageSize: 10,
})
if err != nil {
    panic(err)
}

for _, run := range resp.Runs {
    fmt.Printf("Run %s: %s\n", run.Id, run.Status)
    fmt.Printf("Data window: %s to %s\n", run.DataStartTime, run.DataEndTime)
    fmt.Printf("Started: %s, Ended: %s\n", run.StartTime, run.EndTime)
}

Use dataClient.ListDataPipelineRuns to view information about past executions of a pipeline:

import { createViamClient } from "@viamrobotics/sdk";

const apiKey = "<api-key>";
const apiKeyID = "<api-key-id>";

const client = await createViamClient({
  credential: {
    type: "api-key",
    payload: { key: apiKey, keyId: apiKeyID },
  },
});

const dataClient = client.dataClient;

const response = await dataClient.listDataPipelineRuns({
  id: "<pipeline-id>",
  pageSize: 10,
});

response.runs.forEach((run) => {
  console.log(`Run ${run.id}: ${run.status}`);
  console.log(`Data window: ${run.dataStartTime} to ${run.dataEndTime}`);
  console.log(`Started: ${run.startTime}, Ended: ${run.endTime}`);
});