Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
romazhan committed Dec 17, 2023
0 parents commit 5c3499f
Show file tree
Hide file tree
Showing 15 changed files with 1,389 additions and 0 deletions.
12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Byte-compiled
__pycache__/

# Environments
venv/

# Configs
category_dump.json
config.ini

# Logs
error.log
674 changes: 674 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

29 changes: 29 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# CM Parser

>🚬🚬🚬
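CM Parser extracts product attributes: given a product model and a category ID, it collects links for that model and builds a summary of the attributes defined for that category.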

## Quick Start

```bash
pip install -r requirements.txt
cp config.ini.example parser/config.ini
cp category_dump.json.example parser/category_dump.json
```

### Server

```bash
python parser/server.py
```

### CLI

```bash
python parser/cli.py
```

## Global Dependencies

- [Python](https://www.python.org/downloads) (tested on v3.11.4)
59 changes: 59 additions & 0 deletions category_dump.json.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
[
{
"id": "1",
"name": "Мониторы",
"attributes": [
{
"name": "Модель"
},
{
"name": "Диагональ экрана"
},
{
"name": "Частота обновления экрана"
},
{
"name": "Яркость"
},
{
"name": "Время отклика"
},
{
"name": "Максимальное разрешение"
},
{
"name": "Глубина цвета"
},
{
"name": "Контрастность"
},
{
"name": "Веб-камера"
},
{
"name": "Плотность пикселей"
},
{
"name": "Поддержка HDR"
},
{
"name": "Подсветка"
},
{
"name": "Покрытие экрана"
},
{
"name": "Соотношение сторон"
},
{
"name": "Тип подсветки матрицы"
},
{
"name": "Углы обзора"
},
{
"name": "Технология защиты зрения"
}
]
}
]
17 changes: 17 additions & 0 deletions config.ini.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
[DUCK]
region = ru-ru
fetch_timeout_sec = 13.5
link_count_limit = 150

[BRAIN]
fetch_timeout_sec = 9.1
kv_len_range = (2, 85)
k_threshold = 71

[SERVER]
port = 8123
debug = false
reload = true

[API]
secret =
68 changes: 68 additions & 0 deletions parser/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#-*- coding: utf-8 -*-
from configparser import ConfigParser

from src.utils.sugar import nonstop
from src.green import Green
from src.duck import Duck
from src.brain import Brain

import asyncio, sys, os


# Both paths are relative; the __main__ guard at the bottom of this script
# changes the working directory to the script's own directory before _main().
_CONFIG_FILE_PATH = './config.ini'
# JSON category dump handed to Green in _main().
_CATEGORY_DUMP_FILE_PATH = './category_dump.json'

@nonstop(2)
def _get_product_model() -> str:
    """Prompt on stdin for a product model and return it without outer whitespace.

    Raises:
        AssertionError: if the stripped input is empty (how the ``nonstop(2)``
            wrapper reacts to that is defined in ``src.utils.sugar``).
    """
    answer = input('[*] input product model: ')
    model = answer.strip()
    assert model, 'no product model'
    return model

@nonstop(2)
def _get_category_id() -> int:
    """Prompt on stdin for a category id and return it as an integer.

    Raises:
        ValueError: if the input cannot be parsed by ``int()``.
        AssertionError: if the parsed id is 0.
        (How the ``nonstop(2)`` wrapper reacts to these is defined in
        ``src.utils.sugar``.)
    """
    answer = input('[*] input category id: ')
    parsed_id = int(answer)
    assert parsed_id, 'no category id'
    return parsed_id

async def _main() -> None:
    """Run one interactive parse and print the resulting summary.

    Steps: read the config file, prompt for a product model and a category
    id, load the category dump, collect links for the model through ``Duck``
    and ask ``Brain`` for a summary of the category's attributes.

    Raises:
        FileNotFoundError: if the config file could not be read.
        LookupError: if the category yields no attribute names.
        RuntimeError: if no links were returned for the product model.
    """
    confdad = ConfigParser(converters={
        # Parses values such as "(2, 85)" into a tuple of ints.
        'tupleint': lambda raw: tuple(
            int(part.strip()) for part in raw.strip('()').split(',')
        )
    })
    # The read must not live inside an `assert`: with `python -O` asserts are
    # stripped, so the config would silently never be loaded.
    if not confdad.read(_CONFIG_FILE_PATH):
        raise FileNotFoundError(f'{_CONFIG_FILE_PATH} not found')

    product_model = _get_product_model()
    product_category_id = _get_category_id()

    green = Green(_CATEGORY_DUMP_FILE_PATH)
    await green.load_category_dump()

    attribute_names = green.get_attribute_fields('name', product_category_id)
    if not attribute_names:
        raise LookupError('no attributes found')

    duck = Duck(confdad.get('DUCK', 'region'))
    product_links = duck.get_links(
        product_model,
        timeout_sec=confdad.getfloat('DUCK', 'fetch_timeout_sec'),
        count_limit=confdad.getint('DUCK', 'link_count_limit'),
    )
    if not product_links:
        raise RuntimeError('no product links')

    summary = await Brain.get_product_summary(
        product_links,
        product_model=product_model,
        attribute_names=attribute_names,
        fetch_timeout_sec=confdad.getfloat('BRAIN', 'fetch_timeout_sec'),
        kv_len_range=confdad.gettupleint('BRAIN', 'kv_len_range'),
        k_threshold=confdad.getint('BRAIN', 'k_threshold')
    )
    print(f'\n[result]: {summary}')

if __name__ == '__main__':
    # Work from the script's own directory so the relative config and
    # category-dump paths resolve no matter where the script is launched from.
    ROOT = os.path.dirname(sys.argv[0])
    if ROOT:
        os.chdir(ROOT)

    asyncio.run(_main())
138 changes: 138 additions & 0 deletions parser/server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#-*- coding: utf-8 -*-
from configparser import ConfigParser

from fastapi import (
FastAPI, APIRouter, Depends, Request, HTTPException
)
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import uvicorn

from src.utils.sugar import datetime
from src.green import Green
from src.duck import Duck
from src.brain import (
Brain, LinkSummary
)

import random, time, sys, os


# Switch to the script's own directory (when argv[0] carries one) so the
# relative paths below resolve no matter where the server is launched from.
_ROOT = os.path.dirname(sys.argv[0])
if _ROOT:
    os.chdir(_ROOT)

_CONFIG_FILE_PATH = './config.ini'
_CATEGORY_DUMP_FILE_PATH = './category_dump.json'

class _APIface:
    """Namespace grouping the request and response schemas of the HTTP API."""

    # NOTE: the models below are documented with comments rather than
    # docstrings so that nothing extra ends up in the generated API schema.

    # Base for request bodies that carry the API secret.
    class _AuthorizableRequest(BaseModel):
        secret: str

    # Body accepted by POST /parse.
    class ParseRequest(_AuthorizableRequest):
        product_model: str
        category_id: int

    # Body returned by POST /parse.
    class ParseResponse(BaseModel):
        summary: list[LinkSummary]
        elapsed_time_sec: int
        parsers_used: int

def _register_routes(
    app_router: APIRouter,
    confdad: ConfigParser
) -> list[Depends]:
    """Attach the API routes to *app_router* and build the shared dependencies.

    Args:
        app_router: router that receives the ``POST /parse`` route.
        confdad: parsed configuration; the DUCK, BRAIN and API sections are
            read on every request.

    Returns:
        A one-element list with the secret-checking dependency, intended to
        be passed as ``dependencies=`` when the router is included in the app.
    """
    import hmac  # local import: only needed by the secret check below

    green = Green(_CATEGORY_DUMP_FILE_PATH)
    duck = Duck(confdad.get('DUCK', 'region'))

    # No docstring on the handler on purpose: FastAPI would publish it as the
    # route description in the generated API docs.
    @app_router.post('/parse')
    async def _(request: _APIface.ParseRequest) -> _APIface.ParseResponse:
        request.product_model = request.product_model.strip()
        if not request.product_model:
            raise HTTPException(400, detail='empty product model')

        start_time = time.time()

        # Load the dump on first use, and re-load it on a random ~1-in-13
        # share of requests (randint is inclusive on both ends).
        if not green.is_category_dump_loaded or random.randint(0, 12) == 0:
            await green.load_category_dump()

        attribute_names = green.get_attribute_fields('name', request.category_id)
        if not attribute_names:
            raise HTTPException(424, detail=f'no attributes found ({request.category_id})')

        product_links = duck.get_links(
            request.product_model,
            timeout_sec=confdad.getfloat('DUCK', 'fetch_timeout_sec'),
            count_limit=confdad.getint('DUCK', 'link_count_limit')
        )
        if not product_links:
            raise HTTPException(424, detail='failed to get links to sites')

        summary = await Brain.get_product_summary(
            product_links,
            product_model=request.product_model,
            attribute_names=attribute_names,
            fetch_timeout_sec=confdad.getfloat('BRAIN', 'fetch_timeout_sec'),
            kv_len_range=confdad.gettupleint('BRAIN', 'kv_len_range'),
            k_threshold=confdad.getint('BRAIN', 'k_threshold')
        ) or []

        return _APIface.ParseResponse(
            summary=summary,
            elapsed_time_sec=int(time.time() - start_time),
            parsers_used=1  # always reported as 1 here
        )

    async def assert_secret(request: Request) -> None:
        """Reject the request with 401 unless its JSON body carries the API secret.

        Raises:
            HTTPException: 401 when the body is not JSON, is not a JSON
                object, has no string ``secret`` or the secret does not match.
        """
        try:
            payload = await request.json()
        except ValueError:
            # Body is not decodable JSON (JSONDecodeError / UnicodeDecodeError).
            raise HTTPException(401) from None

        provided = payload.get('secret') if isinstance(payload, dict) else None
        if not isinstance(provided, str):
            raise HTTPException(401)

        expected = confdad.get('API', 'secret')
        # Constant-time comparison so response timing does not leak how much
        # of the secret matched. Compare bytes because compare_digest only
        # accepts ASCII-only str; 'surrogatepass' keeps odd JSON escapes from
        # raising during encoding.
        if not hmac.compare_digest(
            provided.encode('utf-8', 'surrogatepass'),
            expected.encode('utf-8')
        ):
            raise HTTPException(401)

    return [Depends(assert_secret)]

# Module-level wiring: load the configuration, then build the FastAPI app.
_confdad = ConfigParser(converters={
    # Parses values such as "(2, 85)" into a tuple of ints.
    'tupleint': lambda raw: tuple(
        int(part.strip()) for part in raw.strip('()').split(',')
    )
})
# The read must not live inside an `assert`: with `python -O` asserts are
# stripped, so the config would silently never be loaded.
if not _confdad.read(_CONFIG_FILE_PATH):
    raise FileNotFoundError(f'{_CONFIG_FILE_PATH} not found')

_app = FastAPI(
    title='CM Parser API',
    description='CM Parser API',
    version='1.0.0',
    debug=_confdad.getboolean('SERVER', 'debug'),
    redoc_url='/'
)
_app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_methods=['POST']
)

_app_router = APIRouter()
# _register_routes() is evaluated first (as an argument), so the routes are
# already on the router when it gets included together with the returned
# dependencies.
_app.include_router(_app_router,
    dependencies=_register_routes(_app_router, _confdad)
)

if __name__ == '__main__':
    # Top-level boundary: any failure while running the server is appended to
    # error.log, echoed to the console, and the process exits with status 1
    # after the user confirms.
    try:
        hot_reload = _confdad.getboolean('SERVER', 'reload')
        port = _confdad.getint('SERVER', 'port')

        if not hot_reload:
            uvicorn.run(_app, port=port)
        else:
            # Reload mode is given the app as an import string and also
            # watches config/data files.
            uvicorn.run(
                '__main__:_app',
                port=port,
                reload=True,
                reload_includes=['*.ini', '*.json']
            )
    except Exception as exc:
        message = f'[{datetime()}][unhandled]: {str(exc) or "@empty"}'

        with open('error.log', 'a', encoding='utf-8') as log_file:
            log_file.write(f'{message}\n')

        print(f'\n{message}')

        input('\n- Press Enter to exit...')
        sys.exit(1)
Loading

0 comments on commit 5c3499f

Please sign in to comment.