Scrapix V2 #101

Open: wants to merge 36 commits into base: main
36 commits (changes from all commits):
e49c556 - change to cheerio for scraping, keep puppeteer for crawling. (qdequele, Sep 22, 2024)
5f461e7 - big update (qdequele, Oct 12, 2024)
aca8cb1 - merge the maximum code of the crawlers together (qdequele, Oct 12, 2024)
896e6c1 - another big commit. (qdequele, Oct 13, 2024)
45a203f - add markdown scraper (qdequele, Oct 13, 2024)
6195809 - add custom scraper (qdequele, Oct 13, 2024)
60f2416 - remove startCrawl; comment playwright (qdequele, Oct 19, 2024)
b3db1bb - fix #99 (qdequele, Nov 8, 2024)
550a4ab - update packages (qdequele, Nov 8, 2024)
6e17d28 - fix #56: Throw error when redis server is not answering (qdequele, Nov 8, 2024)
a79061b - fix #48: add the automatic detection of 404 pages to skip with the po… (qdequele, Nov 8, 2024)
536326a - By default use cheerio instead of Puppeteer #113 (qdequele, Nov 9, 2024)
ad970e6 - fix #112: Remove the useless headless option (qdequele, Nov 9, 2024)
49324fc - remove launcher_option and launcher (qdequele, Nov 9, 2024)
c2ba9bf - Update Documentation (qdequele, Nov 9, 2024)
a43ee93 - fix #103: Keep the previous settings (qdequele, Nov 10, 2024)
f5e9944 - fix #102: Load the sitemap as starter point for crawling. (qdequele, Nov 10, 2024)
8e1adef - add a new playground (qdequele, Nov 15, 2024)
62b8dce - extract sitemap (qdequele, Nov 15, 2024)
2105cc7 - add pdf scraper (qdequele, Nov 15, 2024)
81be6b4 - Update testing (qdequele, Nov 15, 2024)
27db8e4 - add the pdfs on the playground (qdequele, Nov 22, 2024)
abe215f - add a lot of pages (qdequele, Nov 22, 2024)
ec4dd44 - make docker works for playground, scrapix and meilisearch (qdequele, Nov 30, 2024)
a836dd8 - full working base with zod #35 (qdequele, Nov 30, 2024)
7837b39 - fix tests (qdequele, Nov 30, 2024)
1234229 - update github CI (qdequele, Nov 30, 2024)
713a6a6 - update test CI (qdequele, Nov 30, 2024)
55a3d0e - Update Node.js version in GitHub Actions workflow from 18 to 20 (qdequele, Nov 30, 2024)
8d73bbb - Make wait-for-it.sh executable in GitHub Actions workflow (qdequele, Nov 30, 2024)
36bde9e - Remove deprecated configuration files for previous tests (qdequele, Nov 30, 2024)
6ec776c - Refactor BaseTest and ScraperTestHelper to streamline index UID handling (qdequele, Nov 30, 2024)
f1a394b - use start_urls as crawling pages (qdequele, Jan 20, 2025)
9c1192f - remove unecessary pagination detection leading to avoid scraping inte… (qdequele, Jan 20, 2025)
d88def5 - update and simplify scraper (qdequele, Jan 20, 2025)
a90004f - Improve Meilisearch index settings handling and remove debug logging (qdequele, Feb 4, 2025)
1 change: 1 addition & 0 deletions .eslintrc.cjs
@@ -39,6 +39,7 @@ module.exports = {
    '@typescript-eslint/return-await': 'off',
    '@typescript-eslint/no-explicit-any': 'off',
    '@typescript-eslint/explicit-function-return-type': 'off',
    "@typescript-eslint/no-unsafe-assignment": "off",
    '@typescript-eslint/member-delimiter-style': [
      'error',
      {
40 changes: 0 additions & 40 deletions .github/scripts/scrapix_server_call_check.sh

This file was deleted.

49 changes: 49 additions & 0 deletions .github/scripts/wait-for-it.sh
@@ -0,0 +1,49 @@
#!/usr/bin/env bash
# Use: ./wait-for-it.sh host:port [-t timeout] [-- command args]
# From: https://github.com/vishnubob/wait-for-it

WAITFORIT_cmdname=${0##*/}

echoerr() { if [[ $WAITFORIT_QUIET -ne 1 ]]; then echo "$@" 1>&2; fi }

usage()
{
    cat << USAGE >&2
Usage:
    $WAITFORIT_cmdname host:port [-t timeout] [-- command args]
    -h HOST | --host=HOST            Host or IP under test
    -p PORT | --port=PORT            TCP port under test
    -t TIMEOUT | --timeout=TIMEOUT   Timeout in seconds, zero for no timeout
    -- COMMAND ARGS                  Execute command with args after the test finishes
USAGE
    exit 1
}

wait_for()
{
    if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then
        echoerr "$WAITFORIT_cmdname: waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT"
    else
        echoerr "$WAITFORIT_cmdname: waiting for $WAITFORIT_HOST:$WAITFORIT_PORT without a timeout"
    fi
    WAITFORIT_start_ts=$(date +%s)
    while :
    do
        if [[ $WAITFORIT_ISBUSY -eq 1 ]]; then
            nc -z $WAITFORIT_HOST $WAITFORIT_PORT
            WAITFORIT_result=$?
        else
            (echo -n > /dev/tcp/$WAITFORIT_HOST/$WAITFORIT_PORT) >/dev/null 2>&1
            WAITFORIT_result=$?
        fi
        if [[ $WAITFORIT_result -eq 0 ]]; then
            WAITFORIT_end_ts=$(date +%s)
            echoerr "$WAITFORIT_cmdname: $WAITFORIT_HOST:$WAITFORIT_PORT is available after $((WAITFORIT_end_ts - WAITFORIT_start_ts)) seconds"
            break
        fi
        sleep 1
    done
    return $WAITFORIT_result
}

# Rest of the script...
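
A minimal usage sketch of the script above, mirroring how the CI workflow below invokes it; the port and the trailing command are illustrative only.

# Wait up to 60 seconds for Meilisearch to accept TCP connections on localhost:7700,
# then run a follow-up command (a placeholder echo here).
./.github/scripts/wait-for-it.sh localhost:7700 -t 60 -- echo "Meilisearch is reachable"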
73 changes: 73 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,73 @@
name: Test

on:
  pull_request:
    branches: [main]
  push:
    branches: [main]

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3

      - name: Set up Node.js
        uses: actions/setup-node@v3
        with:
          node-version: "20"
          cache: "npm"

      - name: Install dependencies
        run: npm ci

      - name: Build
        run: npm run build

      - name: Install Docker Compose
        run: |
          sudo apt-get update
          sudo apt-get install -y docker-compose

      - name: Start test environment
        run: |
          docker-compose up -d
          docker ps -a

      - name: Make wait-for-it.sh executable
        run: chmod +x .github/scripts/wait-for-it.sh

      - name: Wait for services
        run: |
          .github/scripts/wait-for-it.sh localhost:7700 -t 60
          .github/scripts/wait-for-it.sh localhost:3000 -t 60
          .github/scripts/wait-for-it.sh localhost:8080 -t 60
          sleep 10 # Give services extra time to fully initialize

      - name: Debug service logs
        if: always()
        run: |
          echo "=== Meilisearch Logs ==="
          docker-compose logs meilisearch
          echo "=== Playground Logs ==="
          docker-compose logs playground
          echo "=== Scraper Logs ==="
          docker-compose logs scraper
          echo "=== Redis Logs ==="
          docker-compose logs redis

      - name: Run tests
        run: npm run test

      - name: Show test logs on failure
        if: failure()
        run: |
          echo "=== Service Status ==="
          docker-compose ps
          echo "=== Recent Logs ==="
          docker-compose logs --tail=100

      - name: Cleanup
        if: always()
        run: docker-compose down -v
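
A rough sketch of the same sequence for local debugging; it assumes the repository's docker-compose file defines the meilisearch, playground, scraper, and redis services referenced in the log steps, and the port-to-service mapping is an assumption.

# Approximate the CI flow on a developer machine (assumes docker-compose.yml at the repo root).
npm ci && npm run build
docker-compose up -d
chmod +x .github/scripts/wait-for-it.sh
.github/scripts/wait-for-it.sh localhost:7700 -t 60   # Meilisearch
.github/scripts/wait-for-it.sh localhost:3000 -t 60   # playground (assumed mapping)
.github/scripts/wait-for-it.sh localhost:8080 -t 60   # scraper server (assumed mapping)
npm run test
docker-compose down -v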
77 changes: 0 additions & 77 deletions .github/workflows/tests.yml

This file was deleted.

1 change: 1 addition & 0 deletions .gitignore
@@ -71,6 +71,7 @@ typings/
# dotenv environment variables file
.env
.env.test
.env.local

# parcel-bundler cache (https://parceljs.org/)
.cache
12 changes: 6 additions & 6 deletions Dockerfile
@@ -1,25 +1,25 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:18 AS builder
FROM apify/actor-node-puppeteer-chrome:20 AS builder

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN yarn install --production=false
RUN npm install --include=dev

# Next, copy the source files using the user set
# in the base image.
COPY --chown=myuser . ./

# Install all dependencies and build the project.
# Don't audit to speed up the installation.
RUN yarn run build
RUN npm run build

# Create final image
FROM apify/actor-node-puppeteer-chrome:18
FROM apify/actor-node-puppeteer-chrome:20

# Copy only built JS files from builder image
COPY --from=builder --chown=myuser /home/myuser/dist ./dist
@@ -31,7 +31,7 @@ COPY --chown=myuser package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN yarn install --production=false
RUN npm install

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
@@ -40,4 +40,4 @@ COPY --chown=myuser . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && yarn start:prod -- -c $CRAWLER_CONFIG -b /usr/bin/google-chrome --silent
CMD ./start_xvfb_and_run_cmd.sh && npm run start:server -- -c $CRAWLER_CONFIG -b /usr/bin/google-chrome --silent
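
As an illustration of the updated npm-based CMD, a hedged build-and-run sketch follows; the image tag, the Meilisearch URL, and the config contents are placeholders, and CRAWLER_CONFIG is passed as the JSON string the CMD expects.

# Build the image and launch a crawl with a placeholder configuration.
docker build -t scrapix .
docker run --rm \
  -e CRAWLER_CONFIG='{"start_urls":["https://example.com"],"meilisearch_url":"http://host.docker.internal:7700","meilisearch_api_key":"masterKey","meilisearch_index_uid":"example"}' \
  scrapix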
13 changes: 7 additions & 6 deletions README.md
@@ -33,8 +33,7 @@ data:
  "meilisearch_url": "http://localhost:7700",
  "meilisearch_api_key": "masterKey",
  "meilisearch_index_uid": "google",
  "strategy": "default", // docssearch, schema*, custom or default
  "headless": true, // Use headless browser for rendering javascript websites
  "strategy": "default", // docssearch, schema*, custom, markdown or default
  "batch_size": 1000, // pass null to send documents 1 at a time or specify a batch size
  "primary_key": null,
  "meilisearch_settings": {
@@ -52,6 +51,12 @@
    "filterableAttributes": ["urls_tags"],
    "distinctAttribute": "url"
  },
  "selectors": { // Only for custom
    "main_content": "main",
    "headings": "h1, h2, h3",
    "paragraphs": "p",
    "custom_field": ".custom-class",
  },
  "schema_settings": {
    "only_type": "Product", // Product, Article, etc...
    "convert_dates": true // default false
@@ -159,10 +164,6 @@ Name of the index on which the content is indexed.
default: `default`
Scraping strategy: - `default` Scrapes the content of webpages, it is suitable for most use cases. It indexes the content in this format (show example) - `docssearch` Scrapes the content of webpages, it suits most use cases. The difference with the default strategy is that it indexes the content in a format compatible with docs-search bar - `schema` Scraps the [`schema`](https://schema.org/) information of your web app.

`headless`
default: `true`
Wether or not the javascript should be loaded before scraping starts.

`primary_key`
The key name in your documents containing their unique identifier.

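To illustrate the custom strategy and the new selectors block documented in this README change, here is a hedged request sketch; the /crawl endpoint and port 8080 are assumptions (8080 is the port the CI workflow waits on), while the field names and selector values come from the README above.

# Hypothetical crawl request against a locally running scraper server.
curl -X POST 'http://localhost:8080/crawl' \
  -H 'Content-Type: application/json' \
  --data '{
    "start_urls": ["https://example.com"],
    "meilisearch_url": "http://localhost:7700",
    "meilisearch_api_key": "masterKey",
    "meilisearch_index_uid": "example",
    "strategy": "custom",
    "selectors": {
      "main_content": "main",
      "headings": "h1, h2, h3",
      "paragraphs": "p",
      "custom_field": ".custom-class"
    }
  }'
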
5 changes: 0 additions & 5 deletions config/nodemon:build.json

This file was deleted.

5 changes: 0 additions & 5 deletions config/nodemon:default-scrap.json

This file was deleted.

5 changes: 0 additions & 5 deletions config/nodemon:docsearch-scrap.json

This file was deleted.

27 changes: 0 additions & 27 deletions docker-compose.dev.yml

This file was deleted.
