Saltar al contenido principal
En esta guía mostraremos cómo integrar ClickHouse. Usaremos JupySQL para ejecutar consultas en ClickHouse. Una vez cargados los datos, los visualizaremos mediante gráficos en SQL. La integración entre JupySQL y ClickHouse es posible gracias al uso de la biblioteca clickhouse_sqlalchemy. Esta biblioteca facilita la comunicación entre ambos sistemas y le permite conectarse a ClickHouse y especificar el dialecto SQL. Una vez establecida la conexión, puede ejecutar consultas SQL directamente desde la UI nativa de ClickHouse o desde el Jupyter notebook.
# Instalar los paquetes necesarios
%pip install --quiet jupysql clickhouse_sqlalchemy
Nota: es posible que debas reiniciar el kernel para usar los paquetes actualizados.
import pandas as pd
from sklearn_evaluation import plot

# Importar la extensión de Jupyter jupysql para crear celdas SQL
%load_ext sql
%config SqlMagic.autocommit=False
Debes asegurarte de que tu ClickHouse esté en funcionamiento y sea accesible para los siguientes pasos. Puedes usar la versión local o la versión en la nube. Nota: tendrás que ajustar la cadena de conexión según el tipo de instancia al que intentes conectarte (URL, usuario, contraseña). En el ejemplo de abajo hemos usado una instancia local. Para obtener más información, consulta esta guía.
%sql clickhouse://default:@localhost:8123/default
%%sql
CREATE TABLE trips
(
    `trip_id` UInt32,
    `vendor_id` Enum8('1' = 1, '2' = 2, '3' = 3, '4' = 4, 'CMT' = 5, 'VTS' = 6, 'DDS' = 7, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14, '' = 15),
    `pickup_date` Date,
    `pickup_datetime` DateTime,
    `dropoff_date` Date,
    `dropoff_datetime` DateTime,
    `store_and_fwd_flag` UInt8,
    `rate_code_id` UInt8,
    `pickup_longitude` Float64,
    `pickup_latitude` Float64,
    `dropoff_longitude` Float64,
    `dropoff_latitude` Float64,
    `passenger_count` UInt8,
    `trip_distance` Float64,
    `fare_amount` Float32,
    `extra` Float32,
    `mta_tax` Float32,
    `tip_amount` Float32,
    `tolls_amount` Float32,
    `ehail_fee` Float32,
    `improvement_surcharge` Float32,
    `total_amount` Float32,
    `payment_type` Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4),
    `trip_type` UInt8,
    `pickup` FixedString(25),
    `dropoff` FixedString(25),
    `cab_type` Enum8('yellow' = 1, 'green' = 2, 'uber' = 3),
    `pickup_nyct2010_gid` Int8,
    `pickup_ctlabel` Float32,
    `pickup_borocode` Int8,
    `pickup_ct2010` String,
    `pickup_boroct2010` String,
    `pickup_cdeligibil` String,
    `pickup_ntacode` FixedString(4),
    `pickup_ntaname` String,
    `pickup_puma` UInt16,
    `dropoff_nyct2010_gid` UInt8,
    `dropoff_ctlabel` Float32,
    `dropoff_borocode` UInt8,
    `dropoff_ct2010` String,
    `dropoff_boroct2010` String,
    `dropoff_cdeligibil` String,
    `dropoff_ntacode` FixedString(4),
    `dropoff_ntaname` String,
    `dropoff_puma` UInt16
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(pickup_date)
ORDER BY pickup_datetime;
  • clickhouse://default:***@localhost:8123/default Hecho.
%%sql
INSERT INTO trips
SELECT * FROM s3(
    'https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/trips_{1..2}.gz',
    'TabSeparatedWithNames', "
    `trip_id` UInt32,
    `vendor_id` Enum8('1' = 1, '2' = 2, '3' = 3, '4' = 4, 'CMT' = 5, 'VTS' = 6, 'DDS' = 7, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14, '' = 15),
    `pickup_date` Date,
    `pickup_datetime` DateTime,
    `dropoff_date` Date,
    `dropoff_datetime` DateTime,
    `store_and_fwd_flag` UInt8,
    `rate_code_id` UInt8,
    `pickup_longitude` Float64,
    `pickup_latitude` Float64,
    `dropoff_longitude` Float64,
    `dropoff_latitude` Float64,
    `passenger_count` UInt8,
    `trip_distance` Float64,
    `fare_amount` Float32,
    `extra` Float32,
    `mta_tax` Float32,
    `tip_amount` Float32,
    `tolls_amount` Float32,
    `ehail_fee` Float32,
    `improvement_surcharge` Float32,
    `total_amount` Float32,
    `payment_type` Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4),
    `trip_type` UInt8,
    `pickup` FixedString(25),
    `dropoff` FixedString(25),
    `cab_type` Enum8('yellow' = 1, 'green' = 2, 'uber' = 3),
    `pickup_nyct2010_gid` Int8,
    `pickup_ctlabel` Float32,
    `pickup_borocode` Int8,
    `pickup_ct2010` String,
    `pickup_boroct2010` String,
    `pickup_cdeligibil` String,
    `pickup_ntacode` FixedString(4),
    `pickup_ntaname` String,
    `pickup_puma` UInt16,
    `dropoff_nyct2010_gid` UInt8,
    `dropoff_ctlabel` Float32,
    `dropoff_borocode` UInt8,
    `dropoff_ct2010` String,
    `dropoff_boroct2010` String,
    `dropoff_cdeligibil` String,
    `dropoff_ntacode` FixedString(4),
    `dropoff_ntaname` String,
    `dropoff_puma` UInt16
") SETTINGS input_format_try_infer_datetimes = 0
  • clickhouse://default:***@localhost:8123/default Hecho.
%sql SELECT count() FROM trips limit 5;
  • clickhouse://default:***@localhost:8123/default Hecho.
count()
1999657
%sql SELECT DISTINCT(pickup_ntaname) FROM trips limit 5;
  • clickhouse://default:***@localhost:8123/default Hecho.
pickup_ntaname
Morningside Heights
Hudson Yards-Chelsea-Flatiron-Union Square
Midtown-Midtown South
SoHo-Tribeca-Civic Center-Little Italy
Murray Hill-Kips Bay
%sql SELECT round(avg(tip_amount), 2) FROM trips
  • clickhouse://default:***@localhost:8123/default Listo.
round(avg(tip_amount), 2)
1.68
%%sql
SELECT
    passenger_count,
    ceil(avg(total_amount),2) AS average_total_amount
FROM trips
GROUP BY passenger_count
  • clickhouse://default:***@localhost:8123/default Listo.
passenger_countaverage_total_amount
022.69
115.97
217.15
316.76
417.33
516.35
616.04
759.8
836.41
99.81
%%sql
SELECT
    pickup_date,
    pickup_ntaname,
    SUM(1) AS number_of_trips
FROM trips
GROUP BY pickup_date, pickup_ntaname
ORDER BY pickup_date ASC
limit 5;
  • clickhouse://default:***@localhost:8123/default Listo.
pickup_datepickup_ntanamenumber_of_trips
2015-07-01Bushwick North2
2015-07-01Brighton Beach1
2015-07-01Briarwood-Jamaica Hills3
2015-07-01Williamsburg1
2015-07-01Queensbridge-Ravenswood-Long Island City9
# %sql DESCRIBE trips;
# %sql SELECT DISTINCT(trip_distance) FROM trips limit 50;
%%sql --save short-trips --no-execute
SELECT *
FROM trips
WHERE trip_distance < 6.3
  • clickhouse://default:***@localhost:8123/default Se omite la ejecución…
%sqlplot histogram --table short-trips --column trip_distance --bins 10 --with short-trips
<AxesSubplot: title={'center': "'trip_distance' from 'short-trips'"}, xlabel='trip_distance', ylabel='Count'>
ax = %sqlplot histogram --table short-trips --column trip_distance --bins 50 --with short-trips
ax.grid()
ax.set_title("Trip distance from trips < 6.3")
_ = ax.set_xlabel("Trip distance")
Última modificación el 10 de junio de 2026